Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c     |  57
-rw-r--r--  mm/hugetlb.c         |  70
-rw-r--r--  mm/ksm.c             |   1
-rw-r--r--  mm/memory-failure.c  |  13
-rw-r--r--  mm/memory.c          |   3
-rw-r--r--  mm/mempolicy.c       |  48
-rw-r--r--  mm/migrate.c         |   7
-rw-r--r--  mm/msync.c           |   3
-rw-r--r--  mm/nommu.c           |   2
-rw-r--r--  mm/page_alloc.c      |  56
-rw-r--r--  mm/rmap.c            |  22
-rw-r--r--  mm/shmem.c           | 122
-rw-r--r--  mm/slab.c            |  90
-rw-r--r--  mm/slab_common.c     |   2
-rw-r--r--  mm/slub.c            |   6
-rw-r--r--  mm/truncate.c        |  11
16 files changed, 366 insertions(+), 147 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e60837dc785c..33514d88fef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,37 @@ unlock:
 	spin_unlock(ptl);
 }
 
+/*
+ * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
+ * during copy_user_huge_page()'s copy_page_rep(): in the case when
+ * the source page gets split and a tail freed before copy completes.
+ * Called under pmd_lock of checked pmd, so safe from splitting itself.
+ */
+static void get_user_huge_page(struct page *page)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		struct page *endpage = page + HPAGE_PMD_NR;
+
+		atomic_add(HPAGE_PMD_NR, &page->_count);
+		while (++page < endpage)
+			get_huge_page_tail(page);
+	} else {
+		get_page(page);
+	}
+}
+
+static void put_user_huge_page(struct page *page)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		struct page *endpage = page + HPAGE_PMD_NR;
+
+		while (page < endpage)
+			put_page(page++);
+	} else {
+		put_page(page);
+	}
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ret |= VM_FAULT_WRITE;
 		goto out_unlock;
 	}
-	get_page(page);
+	get_user_huge_page(page);
 	spin_unlock(ptl);
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
@@ -1095,7 +1126,7 @@ alloc:
 				split_huge_page(page);
 				ret |= VM_FAULT_FALLBACK;
 			}
-			put_page(page);
+			put_user_huge_page(page);
 		}
 		count_vm_event(THP_FAULT_FALLBACK);
 		goto out;
@@ -1105,7 +1136,7 @@ alloc:
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
-			put_page(page);
+			put_user_huge_page(page);
 		} else
 			split_huge_page_pmd(vma, address, pmd);
 		ret |= VM_FAULT_FALLBACK;
@@ -1127,7 +1158,7 @@ alloc:
 
 	spin_lock(ptl);
 	if (page)
-		put_page(page);
+		put_user_huge_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(ptl);
 		mem_cgroup_uncharge_page(new_page);
@@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd)
 		goto out;
-	if (pmd_trans_huge(*pmd))
-		goto out;
 
 	anon_vma_lock_write(vma->anon_vma);
 
@@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd)
 		goto out;
-	if (pmd_trans_huge(*pmd))
-		goto out;
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
+	pgd_t *pgd;
+	pud_t *pud;
 	pmd_t *pmd;
 
 	VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		return;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
 		return;
 	/*
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 226910cb7c9b..9221c02ed9e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, ptep);
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp))
+		return 1;
+	else
+		return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+		return 1;
+	else
+		return 0;
+}
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
@@ -2559,7 +2584,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
-		if (!huge_pte_none(huge_ptep_get(src_pte))) {
+		entry = huge_ptep_get(src_pte);
+		if (huge_pte_none(entry)) { /* skip none entry */
+			;
+		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
+				    is_hugetlb_entry_hwpoisoned(entry))) {
+			swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+			if (is_write_migration_entry(swp_entry) && cow) {
+				/*
+				 * COW mappings require pages in both
+				 * parent and child to be set to read.
+				 */
+				make_migration_entry_read(&swp_entry);
+				entry = swp_entry_to_pte(swp_entry);
+				set_huge_pte_at(src, addr, src_pte, entry);
+			}
+			set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 			entry = huge_ptep_get(src_pte);
@@ -2578,32 +2620,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	return ret;
 }
 
-static int is_hugetlb_entry_migration(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return 0;
-	swp = pte_to_swp_entry(pte);
-	if (non_swap_entry(swp) && is_migration_entry(swp))
-		return 1;
-	else
-		return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return 0;
-	swp = pte_to_swp_entry(pte);
-	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
-		return 1;
-	else
-		return 0;
-}
-
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			    unsigned long start, unsigned long end,
 			    struct page *ref_page)
diff --git a/mm/ksm.c b/mm/ksm.c
index 68710e80994a..346ddc9e4c0d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	pmd = mm_find_pmd(mm, addr);
 	if (!pmd)
 		goto out;
-	BUG_ON(pmd_trans_huge(*pmd));
 
 	mmun_start = addr;
 	mmun_end = addr + PAGE_SIZE;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index cd8989c1027e..7211a73ba14d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page_to_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
@@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	mutex_lock(&mapping->i_mmap_mutex);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	struct page *hpage = *hpagep;
 	struct page *ppage;
 
-	if (PageReserved(p) || PageSlab(p))
+	if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
 		return SWAP_SUCCESS;
 
 	/*
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 					action_result(pfn, "free buddy, 2nd try", DELAYED);
 				return 0;
 			}
-			action_result(pfn, "non LRU", IGNORED);
-			put_page(p);
-			return -EBUSY;
 		}
 	}
 
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
+	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+		goto identify_page_state;
+
 	/*
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		goto out;
 	}
 
+identify_page_state:
 	res = -EBUSY;
 	/*
 	 * The first check uses the current page flags which may not have any
diff --git a/mm/memory.c b/mm/memory.c
index d67fd9fcf1f2..7e8d8205b610 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2882,7 +2882,8 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * if page by the offset is not ready to be mapped (cold cache or
 	 * something).
 	 */
-	if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
+	if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
+	    fault_around_pages() > 1) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 284974230459..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  * @nodes and @flags,) it's isolated and queued to the pagelist which is
  * passed via @private.)
  */
-static struct vm_area_struct *
+static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags, void *private)
 {
-	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
+	int err = 0;
+	struct vm_area_struct *vma, *prev;
 
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
+	vma = find_vma(mm, start);
+	if (!vma)
+		return -EFAULT;
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unsigned long endvma = vma->vm_end;
 
 		if (endvma > end)
@@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 		}
 
 		if (flags & MPOL_MF_LAZY) {
@@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
 						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
+			if (err)
 				break;
-			}
 		}
 next:
 		prev = vma;
 	}
-	return first;
+	return err;
 }
 
 /*
@@ -1156,16 +1153,17 @@ out:
 
 /*
  * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
  * Search forward from there, if not. N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+	struct vm_area_struct *vma;
 	unsigned long uninitialized_var(address);
 
+	vma = find_vma(current->mm, start);
 	while (vma) {
 		address = page_address_in_vma(page, vma);
 		if (address != -EFAULT)
@@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
 	return NULL;
 }
@@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 			unsigned short mode, unsigned short mode_flags,
 			nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
@@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = queue_pages_range(mm, start, end, nmask,
+	err = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
+	if (!err)
 		err = mbind_range(mm, start, end, new);
 
 	if (!err) {
@@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					NULL, (unsigned long)vma,
-					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+			nr_failed = migrate_pages(&pagelist, new_page, NULL,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
 		}
@@ -2145,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	} else
 		*new = *old;
 
-	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		if (new->flags & MPOL_F_REBINDING)
@@ -2153,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 		else
 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
-	rcu_read_unlock();
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 63f0cd559999..be6dbf995c0c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 		pmd = mm_find_pmd(mm, addr);
 		if (!pmd)
 			goto out;
-		if (pmd_trans_huge(*pmd))
-			goto out;
 
 		ptep = pte_offset_map(pmd, addr);
 
@@ -990,9 +988,10 @@ out:
 	 * it. Otherwise, putback_lru_page() will drop the reference grabbed
 	 * during isolation.
 	 */
-	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+		ClearPageSwapBacked(newpage);
 		put_new_page(newpage, private);
-	else
+	} else
 		putback_lru_page(newpage);
 
 	if (result) {
diff --git a/mm/msync.c b/mm/msync.c
index a5c673669ca6..992a1673d488 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -78,7 +78,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 			goto out_unlock;
 		}
 		file = vma->vm_file;
-		fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+		fstart = (start - vma->vm_start) +
+			 ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 		fend = fstart + (min(end, vma->vm_end) - start) - 1;
 		start = vma->vm_end;
 		if ((flags & MS_SYNC) && file &&
diff --git a/mm/nommu.c b/mm/nommu.c
index b78e3a8f5ee7..4a852f6c5709 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 	for (i = 0; i < VMACACHE_SIZE; i++) {
 		/* if the vma is cached, invalidate the entire cache */
 		if (curr->vmacache[i] == vma) {
-			vmacache_invalidate(curr->mm);
+			vmacache_invalidate(mm);
 			break;
 		}
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa29eda8..0ea758b898fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -815,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
 		set_page_count(p, 0);
 	} while (++p, --i);
 
-	set_page_refcounted(page);
 	set_pageblock_migratetype(page, MIGRATE_CMA);
-	__free_pages(page, pageblock_order);
+
+	if (pageblock_order >= MAX_ORDER) {
+		i = pageblock_nr_pages;
+		p = page;
+		do {
+			set_page_refcounted(p);
+			__free_pages(p, MAX_ORDER - 1);
+			p += MAX_ORDER_NR_PAGES;
+		} while (i -= MAX_ORDER_NR_PAGES);
+	} else {
+		set_page_refcounted(page);
+		__free_pages(page, pageblock_order);
+	}
+
 	adjust_managed_page_count(page, pageblock_nr_pages);
 }
 #endif
@@ -4145,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -4261,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
 	pageset_update(&p->pcp, high, batch);
 }
 
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
-		struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+				       struct per_cpu_pageset *pcp)
 {
 	if (percpu_pagelist_fraction)
 		pageset_set_high(pcp,
@@ -5881,23 +5894,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
-	unsigned int cpu;
+	int old_percpu_pagelist_fraction;
 	int ret;
 
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (!write || (ret < 0))
-		return ret;
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_pagelist_fraction &&
+	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+		goto out;
 
-	mutex_lock(&pcp_batch_high_lock);
 	for_each_populated_zone(zone) {
-		unsigned long high;
-		high = zone->managed_pages / percpu_pagelist_fraction;
+		unsigned int cpu;
+
 		for_each_possible_cpu(cpu)
-			pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-					 high);
+			pageset_set_high_and_batch(zone,
+					per_cpu_ptr(zone->pageset, cpu));
 	}
+out:
 	mutex_unlock(&pcp_batch_high_lock);
-	return 0;
+	return ret;
 }
 
 int hashdist = HASHDIST_DEFAULT;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf05fc872ae8..22a4a7699cdb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -517,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 static inline unsigned long
 __vma_address(struct page *page, struct vm_area_struct *vma)
 {
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		pgoff = page->index << huge_page_order(page_hstate(page));
-
+	pgoff_t pgoff = page_to_pgoff(page);
 	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 }
 
@@ -569,6 +565,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
+	pmd_t pmde;
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -579,7 +576,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 		goto out;
 
 	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
+	/*
+	 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+	 * without holding anon_vma lock for write. So when looking for a
+	 * genuine pmde (in which to find pte), test present and !THP together.
+	 */
+	pmde = ACCESS_ONCE(*pmd);
+	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
 		pmd = NULL;
 out:
 	return pmd;
@@ -615,9 +618,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	if (!pmd)
 		return NULL;
 
-	if (pmd_trans_huge(*pmd))
-		return NULL;
-
 	pte = pte_offset_map(pmd, address);
 	/* Make a quick check before getting the lock */
 	if (!sync && !pte_present(*pte)) {
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1676,7 +1676,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << compound_order(page);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index f484c276e994..af68b15a8fc1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
 #define SHORT_SYMLINK_LEN 128
 
 /*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
 */
 struct shmem_falloc {
+	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
 	pgoff_t start;		/* start of range currently being fallocated */
 	pgoff_t next;		/* the next page offset to be fallocated */
 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
@@ -467,23 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		return;
 
 	index = start;
-	for ( ; ; ) {
+	while (index < end) {
 		cond_resched();
 
 		pvec.nr = find_get_entries(mapping, index,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 				pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start || unfalloc)
+			/* If all gone or hole-punch or unfalloc, we're done */
+			if (index == start || end != -1)
 				break;
+			/* But if truncating, restart to make sure all gone */
 			index = start;
 			continue;
 		}
-		if ((index == start || unfalloc) && indices[0] >= end) {
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
@@ -495,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (radix_tree_exceptional_entry(page)) {
 				if (unfalloc)
 					continue;
-				nr_swaps_freed += !shmem_free_swap(mapping,
-								index, page);
+				if (shmem_free_swap(mapping, index, page)) {
+					/* Swap was replaced by page: retry */
+					index--;
+					break;
+				}
+				nr_swaps_freed++;
 				continue;
 			}
 
@@ -505,6 +507,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (page->mapping == mapping) {
 				VM_BUG_ON_PAGE(PageWriteback(page), page);
 				truncate_inode_page(mapping, page);
+			} else {
+				/* Page was replaced by swap: retry */
+				unlock_page(page);
+				index--;
+				break;
 			}
 			}
 			unlock_page(page);
@@ -759,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		spin_lock(&inode->i_lock);
 		shmem_falloc = inode->i_private;
 		if (shmem_falloc &&
+		    !shmem_falloc->waitq &&
 		    index >= shmem_falloc->start &&
 		    index < shmem_falloc->next)
 			shmem_falloc->nr_unswapped++;
@@ -1027,6 +1035,9 @@ repeat:
 		goto failed;
 	}
 
+	if (page && sgp == SGP_WRITE)
+		mark_page_accessed(page);
+
 	/* fallocated page? */
 	if (page && !PageUptodate(page)) {
 		if (sgp != SGP_READ)
@@ -1108,6 +1119,9 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
 
+		if (sgp == SGP_WRITE)
+			mark_page_accessed(page);
+
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		swap_free(swap);
@@ -1134,6 +1148,9 @@ repeat:
 
 		__SetPageSwapBacked(page);
 		__set_page_locked(page);
+		if (sgp == SGP_WRITE)
+			init_page_accessed(page);
+
 		error = mem_cgroup_charge_file(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
 		if (error)
@@ -1233,6 +1250,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 	int ret = VM_FAULT_LOCKED;
 
+	/*
+	 * Trinity finds that probing a hole which tmpfs is punching can
+	 * prevent the hole-punch from ever completing: which in turn
+	 * locks writers out with its hold on i_mutex. So refrain from
+	 * faulting pages into the hole while it's being punched. Although
+	 * shmem_undo_range() does remove the additions, it may be unable to
+	 * keep up, as each new page needs its own unmap_mapping_range() call,
+	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
+	 *
+	 * It does not matter if we sometimes reach this check just before the
+	 * hole-punch begins, so that one fault then races with the punch:
+	 * we just need to make racing faults a rare case.
+	 *
+	 * The implementation below would be much simpler if we just used a
+	 * standard mutex or completion: but we cannot take i_mutex in fault,
+	 * and bloating every shmem inode for this unlikely case would be sad.
+	 */
+	if (unlikely(inode->i_private)) {
+		struct shmem_falloc *shmem_falloc;
+
+		spin_lock(&inode->i_lock);
+		shmem_falloc = inode->i_private;
+		if (shmem_falloc &&
+		    shmem_falloc->waitq &&
+		    vmf->pgoff >= shmem_falloc->start &&
+		    vmf->pgoff < shmem_falloc->next) {
+			wait_queue_head_t *shmem_falloc_waitq;
+			DEFINE_WAIT(shmem_fault_wait);
+
+			ret = VM_FAULT_NOPAGE;
+			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+				/* It's polite to up mmap_sem if we can */
+				up_read(&vma->vm_mm->mmap_sem);
+				ret = VM_FAULT_RETRY;
+			}
+
+			shmem_falloc_waitq = shmem_falloc->waitq;
+			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock(&inode->i_lock);
+			schedule();
+
+			/*
+			 * shmem_falloc_waitq points into the shmem_fallocate()
+			 * stack of the hole-punching task: shmem_falloc_waitq
+			 * is usually invalid by the time we reach here, but
+			 * finish_wait() does not dereference it in that case;
+			 * though i_lock needed lest racing with wake_up_all().
+			 */
+			spin_lock(&inode->i_lock);
+			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+			spin_unlock(&inode->i_lock);
+			return ret;
+		}
+		spin_unlock(&inode->i_lock);
+	}
+
 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
 	if (error)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1372,13 +1447,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	int ret;
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
-	if (ret == 0 && *pagep)
-		init_page_accessed(*pagep);
-	return ret;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static int
@@ -1724,18 +1795,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	pgoff_t start, index, end;
 	int error;
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	mutex_lock(&inode->i_mutex);
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		struct address_space *mapping = file->f_mapping;
 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+		shmem_falloc.waitq = &shmem_falloc_waitq;
+		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+		spin_lock(&inode->i_lock);
+		inode->i_private = &shmem_falloc;
+		spin_unlock(&inode->i_lock);
 
 		if ((u64)unmap_end > (u64)unmap_start)
 			unmap_mapping_range(mapping, unmap_start,
 					    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+
+		spin_lock(&inode->i_lock);
+		inode->i_private = NULL;
+		wake_up_all(&shmem_falloc_waitq);
+		spin_unlock(&inode->i_lock);
 		error = 0;
 		goto out;
 	}
@@ -1753,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	shmem_falloc.waitq = NULL;
 	shmem_falloc.start = start;
 	shmem_falloc.next = start;
 	shmem_falloc.nr_falloced = 0;
diff --git a/mm/slab.c b/mm/slab.c
index 9ca3b87edabc..3070b929a1bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+	status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+
+	return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
+static size_t calculate_freelist_size(int nr_objs, size_t align)
+{
+	size_t freelist_size;
+
+	freelist_size = nr_objs * sizeof(freelist_idx_t);
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		freelist_size += nr_objs * sizeof(char);
+
+	if (align)
+		freelist_size = ALIGN(freelist_size, align);
+
+	return freelist_size;
+}
+
 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 				size_t idx_size, size_t align)
 {
 	int nr_objs;
+	size_t remained_size;
 	size_t freelist_size;
+	int extra_space = 0;
 
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		extra_space = sizeof(char);
 	/*
 	 * Ignore padding for the initial guess. The padding
 	 * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 	 * into the memory allocation when taking the padding
 	 * into account.
 	 */
-	nr_objs = slab_size / (buffer_size + idx_size);
+	nr_objs = slab_size / (buffer_size + idx_size + extra_space);
 
 	/*
 	 * This calculated number will be either the right
 	 * amount, or one greater than what we want.
 	 */
-	freelist_size = slab_size - nr_objs * buffer_size;
-	if (freelist_size < ALIGN(nr_objs * idx_size, align))
+	remained_size = slab_size - nr_objs * buffer_size;
+	freelist_size = calculate_freelist_size(nr_objs, align);
+	if (remained_size < freelist_size)
 		nr_objs--;
 
 	return nr_objs;
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 	} else {
 		nr_objs = calculate_nr_objs(slab_size, buffer_size,
 					sizeof(freelist_idx_t), align);
-		mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
+		mgmt_size = calculate_freelist_size(nr_objs, align);
 	}
 	*num = nr_objs;
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			break;
 
 		if (flags & CFLGS_OFF_SLAB) {
+			size_t freelist_size_per_obj = sizeof(freelist_idx_t);
 			/*
 			 * Max number of objs-per-slab for caches which
 			 * use off-slab slabs. Needed to avoid a possible
 			 * looping condition in cache_grow().
 			 */
+			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+				freelist_size_per_obj += sizeof(char);
 			offslab_limit = size;
-			offslab_limit /= sizeof(freelist_idx_t);
+			offslab_limit /= freelist_size_per_obj;
 
 			if (num > offslab_limit)
 				break;
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	if (!cachep->num)
 		return -E2BIG;
 
-	freelist_size =
-		ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
+	freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		freelist_size = cachep->num * sizeof(freelist_idx_t);
+		freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
 		/* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		if (cachep->ctor)
 			cachep->ctor(objp);
 #endif
+		set_obj_status(page, i, OBJECT_FREE);
 		set_free_obj(page, i, i);
 	}
 }
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	BUG_ON(objnr >= cachep->num);
 	BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+	set_obj_status(page, objnr, OBJECT_FREE);
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 				gfp_t flags, void *objp, unsigned long caller)
 {
+	struct page *page;
+
 	if (!objp)
 		return objp;
 	if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
+
+	page = virt_to_head_page(objp);
+	set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
 						struct page *page)
 {
 	void *p;
-	int i, j;
+	int i;
 
 	if (n[0] == n[1])
 		return;
 	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-		bool active = true;
-
-		for (j = page->active; j < c->num; j++) {
-			/* Skip freed item */
-			if (get_free_obj(page, j) == i) {
-				active = false;
-				break;
-			}
-		}
-		if (!active)
+		if (get_obj_status(page, i) != OBJECT_ACTIVE)
 			continue;
 
 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 735e01a0db6f..d31c4bacc6a2 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -55,7 +55,7 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
 			continue;
 		}
 
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
+#if !defined(CONFIG_SLUB)
 		if (!strcmp(s->name, name)) {
 			pr_err("%s (%s): Cache name already exists.\n",
 			       __func__, name);
diff --git a/mm/slub.c b/mm/slub.c
index b2b047327d76..73004808537e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1881,7 +1881,7 @@ redo:
 
 	new.frozen = 0;
 
-	if (!new.inuse && n->nr_partial > s->min_partial)
+	if (!new.inuse && n->nr_partial >= s->min_partial)
 		m = M_FREE;
 	else if (new.freelist) {
 		m = M_PARTIAL;
@@ -1992,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			new.freelist, new.counters,
 			"unfreezing slab"));
 
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
 			page->next = discard_page;
 			discard_page = page;
 		} else {
@@ -2620,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		return;
 	}
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
 		goto slab_empty;
 
 	/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 6a78c814bebf..eda247307164 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -355,14 +355,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE),
-			indices)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
+			/* Otherwise restart to make sure all gone */
 			index = start;
 			continue;
 		}
 		if (index == start && indices[0] >= end) {
+			/* All gone out of hole to be punched, we're done */
 			pagevec_remove_exceptionals(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -373,8 +375,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index >= end)
+			if (index >= end) {
+				/* Restart punch to make sure all gone */
+				index = start - 1;
 				break;
+			}
 
 			if (radix_tree_exceptional_entry(page)) {
 				clear_exceptional_entry(mapping, index, page);