diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 57 | ||||
-rw-r--r-- | mm/hugetlb.c | 71 | ||||
-rw-r--r-- | mm/ksm.c | 1 | ||||
-rw-r--r-- | mm/memory-failure.c | 9 | ||||
-rw-r--r-- | mm/mempolicy.c | 48 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/msync.c | 3 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 56 | ||||
-rw-r--r-- | mm/rmap.c | 12 | ||||
-rw-r--r-- | mm/shmem.c | 74 | ||||
-rw-r--r-- | mm/slab.c | 90 | ||||
-rw-r--r-- | mm/slub.c | 6 |
13 files changed, 307 insertions, 124 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e60837dc785c..33514d88fef9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -941,6 +941,37 @@ unlock: | |||
941 | spin_unlock(ptl); | 941 | spin_unlock(ptl); |
942 | } | 942 | } |
943 | 943 | ||
944 | /* | ||
945 | * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages | ||
946 | * during copy_user_huge_page()'s copy_page_rep(): in the case when | ||
947 | * the source page gets split and a tail freed before copy completes. | ||
948 | * Called under pmd_lock of checked pmd, so safe from splitting itself. | ||
949 | */ | ||
950 | static void get_user_huge_page(struct page *page) | ||
951 | { | ||
952 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { | ||
953 | struct page *endpage = page + HPAGE_PMD_NR; | ||
954 | |||
955 | atomic_add(HPAGE_PMD_NR, &page->_count); | ||
956 | while (++page < endpage) | ||
957 | get_huge_page_tail(page); | ||
958 | } else { | ||
959 | get_page(page); | ||
960 | } | ||
961 | } | ||
962 | |||
963 | static void put_user_huge_page(struct page *page) | ||
964 | { | ||
965 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { | ||
966 | struct page *endpage = page + HPAGE_PMD_NR; | ||
967 | |||
968 | while (page < endpage) | ||
969 | put_page(page++); | ||
970 | } else { | ||
971 | put_page(page); | ||
972 | } | ||
973 | } | ||
974 | |||
944 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 975 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
945 | struct vm_area_struct *vma, | 976 | struct vm_area_struct *vma, |
946 | unsigned long address, | 977 | unsigned long address, |
@@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1074 | ret |= VM_FAULT_WRITE; | 1105 | ret |= VM_FAULT_WRITE; |
1075 | goto out_unlock; | 1106 | goto out_unlock; |
1076 | } | 1107 | } |
1077 | get_page(page); | 1108 | get_user_huge_page(page); |
1078 | spin_unlock(ptl); | 1109 | spin_unlock(ptl); |
1079 | alloc: | 1110 | alloc: |
1080 | if (transparent_hugepage_enabled(vma) && | 1111 | if (transparent_hugepage_enabled(vma) && |
@@ -1095,7 +1126,7 @@ alloc: | |||
1095 | split_huge_page(page); | 1126 | split_huge_page(page); |
1096 | ret |= VM_FAULT_FALLBACK; | 1127 | ret |= VM_FAULT_FALLBACK; |
1097 | } | 1128 | } |
1098 | put_page(page); | 1129 | put_user_huge_page(page); |
1099 | } | 1130 | } |
1100 | count_vm_event(THP_FAULT_FALLBACK); | 1131 | count_vm_event(THP_FAULT_FALLBACK); |
1101 | goto out; | 1132 | goto out; |
@@ -1105,7 +1136,7 @@ alloc: | |||
1105 | put_page(new_page); | 1136 | put_page(new_page); |
1106 | if (page) { | 1137 | if (page) { |
1107 | split_huge_page(page); | 1138 | split_huge_page(page); |
1108 | put_page(page); | 1139 | put_user_huge_page(page); |
1109 | } else | 1140 | } else |
1110 | split_huge_page_pmd(vma, address, pmd); | 1141 | split_huge_page_pmd(vma, address, pmd); |
1111 | ret |= VM_FAULT_FALLBACK; | 1142 | ret |= VM_FAULT_FALLBACK; |
@@ -1127,7 +1158,7 @@ alloc: | |||
1127 | 1158 | ||
1128 | spin_lock(ptl); | 1159 | spin_lock(ptl); |
1129 | if (page) | 1160 | if (page) |
1130 | put_page(page); | 1161 | put_user_huge_page(page); |
1131 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1162 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
1132 | spin_unlock(ptl); | 1163 | spin_unlock(ptl); |
1133 | mem_cgroup_uncharge_page(new_page); | 1164 | mem_cgroup_uncharge_page(new_page); |
@@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2392 | pmd = mm_find_pmd(mm, address); | 2423 | pmd = mm_find_pmd(mm, address); |
2393 | if (!pmd) | 2424 | if (!pmd) |
2394 | goto out; | 2425 | goto out; |
2395 | if (pmd_trans_huge(*pmd)) | ||
2396 | goto out; | ||
2397 | 2426 | ||
2398 | anon_vma_lock_write(vma->anon_vma); | 2427 | anon_vma_lock_write(vma->anon_vma); |
2399 | 2428 | ||
@@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2492 | pmd = mm_find_pmd(mm, address); | 2521 | pmd = mm_find_pmd(mm, address); |
2493 | if (!pmd) | 2522 | if (!pmd) |
2494 | goto out; | 2523 | goto out; |
2495 | if (pmd_trans_huge(*pmd)) | ||
2496 | goto out; | ||
2497 | 2524 | ||
2498 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | 2525 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); |
2499 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2526 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | |||
2846 | static void split_huge_page_address(struct mm_struct *mm, | 2873 | static void split_huge_page_address(struct mm_struct *mm, |
2847 | unsigned long address) | 2874 | unsigned long address) |
2848 | { | 2875 | { |
2876 | pgd_t *pgd; | ||
2877 | pud_t *pud; | ||
2849 | pmd_t *pmd; | 2878 | pmd_t *pmd; |
2850 | 2879 | ||
2851 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2880 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2852 | 2881 | ||
2853 | pmd = mm_find_pmd(mm, address); | 2882 | pgd = pgd_offset(mm, address); |
2854 | if (!pmd) | 2883 | if (!pgd_present(*pgd)) |
2884 | return; | ||
2885 | |||
2886 | pud = pud_offset(pgd, address); | ||
2887 | if (!pud_present(*pud)) | ||
2888 | return; | ||
2889 | |||
2890 | pmd = pmd_offset(pud, address); | ||
2891 | if (!pmd_present(*pmd)) | ||
2855 | return; | 2892 | return; |
2856 | /* | 2893 | /* |
2857 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2894 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 226910cb7c9b..2024bbd573d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2520 | update_mmu_cache(vma, address, ptep); | 2520 | update_mmu_cache(vma, address, ptep); |
2521 | } | 2521 | } |
2522 | 2522 | ||
2523 | static int is_hugetlb_entry_migration(pte_t pte) | ||
2524 | { | ||
2525 | swp_entry_t swp; | ||
2526 | |||
2527 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2528 | return 0; | ||
2529 | swp = pte_to_swp_entry(pte); | ||
2530 | if (non_swap_entry(swp) && is_migration_entry(swp)) | ||
2531 | return 1; | ||
2532 | else | ||
2533 | return 0; | ||
2534 | } | ||
2535 | |||
2536 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | ||
2537 | { | ||
2538 | swp_entry_t swp; | ||
2539 | |||
2540 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2541 | return 0; | ||
2542 | swp = pte_to_swp_entry(pte); | ||
2543 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) | ||
2544 | return 1; | ||
2545 | else | ||
2546 | return 0; | ||
2547 | } | ||
2523 | 2548 | ||
2524 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | 2549 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, |
2525 | struct vm_area_struct *vma) | 2550 | struct vm_area_struct *vma) |
@@ -2559,10 +2584,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2559 | dst_ptl = huge_pte_lock(h, dst, dst_pte); | 2584 | dst_ptl = huge_pte_lock(h, dst, dst_pte); |
2560 | src_ptl = huge_pte_lockptr(h, src, src_pte); | 2585 | src_ptl = huge_pte_lockptr(h, src, src_pte); |
2561 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 2586 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
2562 | if (!huge_pte_none(huge_ptep_get(src_pte))) { | 2587 | entry = huge_ptep_get(src_pte); |
2588 | if (huge_pte_none(entry)) { /* skip none entry */ | ||
2589 | ; | ||
2590 | } else if (unlikely(is_hugetlb_entry_migration(entry) || | ||
2591 | is_hugetlb_entry_hwpoisoned(entry))) { | ||
2592 | swp_entry_t swp_entry = pte_to_swp_entry(entry); | ||
2593 | |||
2594 | if (is_write_migration_entry(swp_entry) && cow) { | ||
2595 | /* | ||
2596 | * COW mappings require pages in both | ||
2597 | * parent and child to be set to read. | ||
2598 | */ | ||
2599 | make_migration_entry_read(&swp_entry); | ||
2600 | entry = swp_entry_to_pte(swp_entry); | ||
2601 | set_huge_pte_at(src, addr, src_pte, entry); | ||
2602 | } | ||
2603 | set_huge_pte_at(dst, addr, dst_pte, entry); | ||
2604 | } else { | ||
2563 | if (cow) | 2605 | if (cow) |
2564 | huge_ptep_set_wrprotect(src, addr, src_pte); | 2606 | huge_ptep_set_wrprotect(src, addr, src_pte); |
2565 | entry = huge_ptep_get(src_pte); | ||
2566 | ptepage = pte_page(entry); | 2607 | ptepage = pte_page(entry); |
2567 | get_page(ptepage); | 2608 | get_page(ptepage); |
2568 | page_dup_rmap(ptepage); | 2609 | page_dup_rmap(ptepage); |
@@ -2578,32 +2619,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2578 | return ret; | 2619 | return ret; |
2579 | } | 2620 | } |
2580 | 2621 | ||
2581 | static int is_hugetlb_entry_migration(pte_t pte) | ||
2582 | { | ||
2583 | swp_entry_t swp; | ||
2584 | |||
2585 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2586 | return 0; | ||
2587 | swp = pte_to_swp_entry(pte); | ||
2588 | if (non_swap_entry(swp) && is_migration_entry(swp)) | ||
2589 | return 1; | ||
2590 | else | ||
2591 | return 0; | ||
2592 | } | ||
2593 | |||
2594 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | ||
2595 | { | ||
2596 | swp_entry_t swp; | ||
2597 | |||
2598 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2599 | return 0; | ||
2600 | swp = pte_to_swp_entry(pte); | ||
2601 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) | ||
2602 | return 1; | ||
2603 | else | ||
2604 | return 0; | ||
2605 | } | ||
2606 | |||
2607 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | 2622 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2608 | unsigned long start, unsigned long end, | 2623 | unsigned long start, unsigned long end, |
2609 | struct page *ref_page) | 2624 | struct page *ref_page) |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
945 | pmd = mm_find_pmd(mm, addr); | 945 | pmd = mm_find_pmd(mm, addr); |
946 | if (!pmd) | 946 | if (!pmd) |
947 | goto out; | 947 | goto out; |
948 | BUG_ON(pmd_trans_huge(*pmd)); | ||
949 | 948 | ||
950 | mmun_start = addr; | 949 | mmun_start = addr; |
951 | mmun_end = addr + PAGE_SIZE; | 950 | mmun_end = addr + PAGE_SIZE; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cd8989c1027e..c6399e328931 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
895 | struct page *hpage = *hpagep; | 895 | struct page *hpage = *hpagep; |
896 | struct page *ppage; | 896 | struct page *ppage; |
897 | 897 | ||
898 | if (PageReserved(p) || PageSlab(p)) | 898 | if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) |
899 | return SWAP_SUCCESS; | 899 | return SWAP_SUCCESS; |
900 | 900 | ||
901 | /* | 901 | /* |
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1159 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1159 | action_result(pfn, "free buddy, 2nd try", DELAYED); |
1160 | return 0; | 1160 | return 0; |
1161 | } | 1161 | } |
1162 | action_result(pfn, "non LRU", IGNORED); | ||
1163 | put_page(p); | ||
1164 | return -EBUSY; | ||
1165 | } | 1162 | } |
1166 | } | 1163 | } |
1167 | 1164 | ||
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1194 | return 0; | 1191 | return 0; |
1195 | } | 1192 | } |
1196 | 1193 | ||
1194 | if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) | ||
1195 | goto identify_page_state; | ||
1196 | |||
1197 | /* | 1197 | /* |
1198 | * For error on the tail page, we should set PG_hwpoison | 1198 | * For error on the tail page, we should set PG_hwpoison |
1199 | * on the head page to show that the hugepage is hwpoisoned | 1199 | * on the head page to show that the hugepage is hwpoisoned |
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1243 | goto out; | 1243 | goto out; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | identify_page_state: | ||
1246 | res = -EBUSY; | 1247 | res = -EBUSY; |
1247 | /* | 1248 | /* |
1248 | * The first check uses the current page flags which may not have any | 1249 | * The first check uses the current page flags which may not have any |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 284974230459..8f5330d74f47 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
656 | * @nodes and @flags,) it's isolated and queued to the pagelist which is | 656 | * @nodes and @flags,) it's isolated and queued to the pagelist which is |
657 | * passed via @private.) | 657 | * passed via @private.) |
658 | */ | 658 | */ |
659 | static struct vm_area_struct * | 659 | static int |
660 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 660 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
661 | const nodemask_t *nodes, unsigned long flags, void *private) | 661 | const nodemask_t *nodes, unsigned long flags, void *private) |
662 | { | 662 | { |
663 | int err; | 663 | int err = 0; |
664 | struct vm_area_struct *first, *vma, *prev; | 664 | struct vm_area_struct *vma, *prev; |
665 | |||
666 | 665 | ||
667 | first = find_vma(mm, start); | 666 | vma = find_vma(mm, start); |
668 | if (!first) | 667 | if (!vma) |
669 | return ERR_PTR(-EFAULT); | 668 | return -EFAULT; |
670 | prev = NULL; | 669 | prev = NULL; |
671 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 670 | for (; vma && vma->vm_start < end; vma = vma->vm_next) { |
672 | unsigned long endvma = vma->vm_end; | 671 | unsigned long endvma = vma->vm_end; |
673 | 672 | ||
674 | if (endvma > end) | 673 | if (endvma > end) |
@@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
678 | 677 | ||
679 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 678 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
680 | if (!vma->vm_next && vma->vm_end < end) | 679 | if (!vma->vm_next && vma->vm_end < end) |
681 | return ERR_PTR(-EFAULT); | 680 | return -EFAULT; |
682 | if (prev && prev->vm_end < vma->vm_start) | 681 | if (prev && prev->vm_end < vma->vm_start) |
683 | return ERR_PTR(-EFAULT); | 682 | return -EFAULT; |
684 | } | 683 | } |
685 | 684 | ||
686 | if (flags & MPOL_MF_LAZY) { | 685 | if (flags & MPOL_MF_LAZY) { |
@@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
694 | 693 | ||
695 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | 694 | err = queue_pages_pgd_range(vma, start, endvma, nodes, |
696 | flags, private); | 695 | flags, private); |
697 | if (err) { | 696 | if (err) |
698 | first = ERR_PTR(err); | ||
699 | break; | 697 | break; |
700 | } | ||
701 | } | 698 | } |
702 | next: | 699 | next: |
703 | prev = vma; | 700 | prev = vma; |
704 | } | 701 | } |
705 | return first; | 702 | return err; |
706 | } | 703 | } |
707 | 704 | ||
708 | /* | 705 | /* |
@@ -1156,16 +1153,17 @@ out: | |||
1156 | 1153 | ||
1157 | /* | 1154 | /* |
1158 | * Allocate a new page for page migration based on vma policy. | 1155 | * Allocate a new page for page migration based on vma policy. |
1159 | * Start assuming that page is mapped by vma pointed to by @private. | 1156 | * Start by assuming the page is mapped by the same vma as contains @start. |
1160 | * Search forward from there, if not. N.B., this assumes that the | 1157 | * Search forward from there, if not. N.B., this assumes that the |
1161 | * list of pages handed to migrate_pages()--which is how we get here-- | 1158 | * list of pages handed to migrate_pages()--which is how we get here-- |
1162 | * is in virtual address order. | 1159 | * is in virtual address order. |
1163 | */ | 1160 | */ |
1164 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | 1161 | static struct page *new_page(struct page *page, unsigned long start, int **x) |
1165 | { | 1162 | { |
1166 | struct vm_area_struct *vma = (struct vm_area_struct *)private; | 1163 | struct vm_area_struct *vma; |
1167 | unsigned long uninitialized_var(address); | 1164 | unsigned long uninitialized_var(address); |
1168 | 1165 | ||
1166 | vma = find_vma(current->mm, start); | ||
1169 | while (vma) { | 1167 | while (vma) { |
1170 | address = page_address_in_vma(page, vma); | 1168 | address = page_address_in_vma(page, vma); |
1171 | if (address != -EFAULT) | 1169 | if (address != -EFAULT) |
@@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | |||
1195 | return -ENOSYS; | 1193 | return -ENOSYS; |
1196 | } | 1194 | } |
1197 | 1195 | ||
1198 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | 1196 | static struct page *new_page(struct page *page, unsigned long start, int **x) |
1199 | { | 1197 | { |
1200 | return NULL; | 1198 | return NULL; |
1201 | } | 1199 | } |
@@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1205 | unsigned short mode, unsigned short mode_flags, | 1203 | unsigned short mode, unsigned short mode_flags, |
1206 | nodemask_t *nmask, unsigned long flags) | 1204 | nodemask_t *nmask, unsigned long flags) |
1207 | { | 1205 | { |
1208 | struct vm_area_struct *vma; | ||
1209 | struct mm_struct *mm = current->mm; | 1206 | struct mm_struct *mm = current->mm; |
1210 | struct mempolicy *new; | 1207 | struct mempolicy *new; |
1211 | unsigned long end; | 1208 | unsigned long end; |
@@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1271 | if (err) | 1268 | if (err) |
1272 | goto mpol_out; | 1269 | goto mpol_out; |
1273 | 1270 | ||
1274 | vma = queue_pages_range(mm, start, end, nmask, | 1271 | err = queue_pages_range(mm, start, end, nmask, |
1275 | flags | MPOL_MF_INVERT, &pagelist); | 1272 | flags | MPOL_MF_INVERT, &pagelist); |
1276 | 1273 | if (!err) | |
1277 | err = PTR_ERR(vma); /* maybe ... */ | ||
1278 | if (!IS_ERR(vma)) | ||
1279 | err = mbind_range(mm, start, end, new); | 1274 | err = mbind_range(mm, start, end, new); |
1280 | 1275 | ||
1281 | if (!err) { | 1276 | if (!err) { |
@@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1283 | 1278 | ||
1284 | if (!list_empty(&pagelist)) { | 1279 | if (!list_empty(&pagelist)) { |
1285 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1280 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1286 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1281 | nr_failed = migrate_pages(&pagelist, new_page, NULL, |
1287 | NULL, (unsigned long)vma, | 1282 | start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1288 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | ||
1289 | if (nr_failed) | 1283 | if (nr_failed) |
1290 | putback_movable_pages(&pagelist); | 1284 | putback_movable_pages(&pagelist); |
1291 | } | 1285 | } |
@@ -2145,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
2145 | } else | 2139 | } else |
2146 | *new = *old; | 2140 | *new = *old; |
2147 | 2141 | ||
2148 | rcu_read_lock(); | ||
2149 | if (current_cpuset_is_being_rebound()) { | 2142 | if (current_cpuset_is_being_rebound()) { |
2150 | nodemask_t mems = cpuset_mems_allowed(current); | 2143 | nodemask_t mems = cpuset_mems_allowed(current); |
2151 | if (new->flags & MPOL_F_REBINDING) | 2144 | if (new->flags & MPOL_F_REBINDING) |
@@ -2153,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
2153 | else | 2146 | else |
2154 | mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); | 2147 | mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); |
2155 | } | 2148 | } |
2156 | rcu_read_unlock(); | ||
2157 | atomic_set(&new->refcnt, 1); | 2149 | atomic_set(&new->refcnt, 1); |
2158 | return new; | 2150 | return new; |
2159 | } | 2151 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 63f0cd559999..9e0beaa91845 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
120 | pmd = mm_find_pmd(mm, addr); | 120 | pmd = mm_find_pmd(mm, addr); |
121 | if (!pmd) | 121 | if (!pmd) |
122 | goto out; | 122 | goto out; |
123 | if (pmd_trans_huge(*pmd)) | ||
124 | goto out; | ||
125 | 123 | ||
126 | ptep = pte_offset_map(pmd, addr); | 124 | ptep = pte_offset_map(pmd, addr); |
127 | 125 | ||
diff --git a/mm/msync.c b/mm/msync.c index a5c673669ca6..992a1673d488 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -78,7 +78,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
78 | goto out_unlock; | 78 | goto out_unlock; |
79 | } | 79 | } |
80 | file = vma->vm_file; | 80 | file = vma->vm_file; |
81 | fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 81 | fstart = (start - vma->vm_start) + |
82 | ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
82 | fend = fstart + (min(end, vma->vm_end) - start) - 1; | 83 | fend = fstart + (min(end, vma->vm_end) - start) - 1; |
83 | start = vma->vm_end; | 84 | start = vma->vm_end; |
84 | if ((flags & MS_SYNC) && file && | 85 | if ((flags & MS_SYNC) && file && |
diff --git a/mm/nommu.c b/mm/nommu.c index b78e3a8f5ee7..4a852f6c5709 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
786 | for (i = 0; i < VMACACHE_SIZE; i++) { | 786 | for (i = 0; i < VMACACHE_SIZE; i++) { |
787 | /* if the vma is cached, invalidate the entire cache */ | 787 | /* if the vma is cached, invalidate the entire cache */ |
788 | if (curr->vmacache[i] == vma) { | 788 | if (curr->vmacache[i] == vma) { |
789 | vmacache_invalidate(curr->mm); | 789 | vmacache_invalidate(mm); |
790 | break; | 790 | break; |
791 | } | 791 | } |
792 | } | 792 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa29eda8..0ea758b898fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -69,6 +69,7 @@ | |||
69 | 69 | ||
70 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ | 70 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
71 | static DEFINE_MUTEX(pcp_batch_high_lock); | 71 | static DEFINE_MUTEX(pcp_batch_high_lock); |
72 | #define MIN_PERCPU_PAGELIST_FRACTION (8) | ||
72 | 73 | ||
73 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | 74 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
74 | DEFINE_PER_CPU(int, numa_node); | 75 | DEFINE_PER_CPU(int, numa_node); |
@@ -815,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
815 | set_page_count(p, 0); | 816 | set_page_count(p, 0); |
816 | } while (++p, --i); | 817 | } while (++p, --i); |
817 | 818 | ||
818 | set_page_refcounted(page); | ||
819 | set_pageblock_migratetype(page, MIGRATE_CMA); | 819 | set_pageblock_migratetype(page, MIGRATE_CMA); |
820 | __free_pages(page, pageblock_order); | 820 | |
821 | if (pageblock_order >= MAX_ORDER) { | ||
822 | i = pageblock_nr_pages; | ||
823 | p = page; | ||
824 | do { | ||
825 | set_page_refcounted(p); | ||
826 | __free_pages(p, MAX_ORDER - 1); | ||
827 | p += MAX_ORDER_NR_PAGES; | ||
828 | } while (i -= MAX_ORDER_NR_PAGES); | ||
829 | } else { | ||
830 | set_page_refcounted(page); | ||
831 | __free_pages(page, pageblock_order); | ||
832 | } | ||
833 | |||
821 | adjust_managed_page_count(page, pageblock_nr_pages); | 834 | adjust_managed_page_count(page, pageblock_nr_pages); |
822 | } | 835 | } |
823 | #endif | 836 | #endif |
@@ -4145,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
4145 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 4158 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
4146 | #endif | 4159 | #endif |
4147 | 4160 | ||
4148 | static int __meminit zone_batchsize(struct zone *zone) | 4161 | static int zone_batchsize(struct zone *zone) |
4149 | { | 4162 | { |
4150 | #ifdef CONFIG_MMU | 4163 | #ifdef CONFIG_MMU |
4151 | int batch; | 4164 | int batch; |
@@ -4261,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, | |||
4261 | pageset_update(&p->pcp, high, batch); | 4274 | pageset_update(&p->pcp, high, batch); |
4262 | } | 4275 | } |
4263 | 4276 | ||
4264 | static void __meminit pageset_set_high_and_batch(struct zone *zone, | 4277 | static void pageset_set_high_and_batch(struct zone *zone, |
4265 | struct per_cpu_pageset *pcp) | 4278 | struct per_cpu_pageset *pcp) |
4266 | { | 4279 | { |
4267 | if (percpu_pagelist_fraction) | 4280 | if (percpu_pagelist_fraction) |
4268 | pageset_set_high(pcp, | 4281 | pageset_set_high(pcp, |
@@ -5881,23 +5894,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, | |||
5881 | void __user *buffer, size_t *length, loff_t *ppos) | 5894 | void __user *buffer, size_t *length, loff_t *ppos) |
5882 | { | 5895 | { |
5883 | struct zone *zone; | 5896 | struct zone *zone; |
5884 | unsigned int cpu; | 5897 | int old_percpu_pagelist_fraction; |
5885 | int ret; | 5898 | int ret; |
5886 | 5899 | ||
5900 | mutex_lock(&pcp_batch_high_lock); | ||
5901 | old_percpu_pagelist_fraction = percpu_pagelist_fraction; | ||
5902 | |||
5887 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5903 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5888 | if (!write || (ret < 0)) | 5904 | if (!write || ret < 0) |
5889 | return ret; | 5905 | goto out; |
5906 | |||
5907 | /* Sanity checking to avoid pcp imbalance */ | ||
5908 | if (percpu_pagelist_fraction && | ||
5909 | percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { | ||
5910 | percpu_pagelist_fraction = old_percpu_pagelist_fraction; | ||
5911 | ret = -EINVAL; | ||
5912 | goto out; | ||
5913 | } | ||
5914 | |||
5915 | /* No change? */ | ||
5916 | if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) | ||
5917 | goto out; | ||
5890 | 5918 | ||
5891 | mutex_lock(&pcp_batch_high_lock); | ||
5892 | for_each_populated_zone(zone) { | 5919 | for_each_populated_zone(zone) { |
5893 | unsigned long high; | 5920 | unsigned int cpu; |
5894 | high = zone->managed_pages / percpu_pagelist_fraction; | 5921 | |
5895 | for_each_possible_cpu(cpu) | 5922 | for_each_possible_cpu(cpu) |
5896 | pageset_set_high(per_cpu_ptr(zone->pageset, cpu), | 5923 | pageset_set_high_and_batch(zone, |
5897 | high); | 5924 | per_cpu_ptr(zone->pageset, cpu)); |
5898 | } | 5925 | } |
5926 | out: | ||
5899 | mutex_unlock(&pcp_batch_high_lock); | 5927 | mutex_unlock(&pcp_batch_high_lock); |
5900 | return 0; | 5928 | return ret; |
5901 | } | 5929 | } |
5902 | 5930 | ||
5903 | int hashdist = HASHDIST_DEFAULT; | 5931 | int hashdist = HASHDIST_DEFAULT; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | |||
569 | pgd_t *pgd; | 569 | pgd_t *pgd; |
570 | pud_t *pud; | 570 | pud_t *pud; |
571 | pmd_t *pmd = NULL; | 571 | pmd_t *pmd = NULL; |
572 | pmd_t pmde; | ||
572 | 573 | ||
573 | pgd = pgd_offset(mm, address); | 574 | pgd = pgd_offset(mm, address); |
574 | if (!pgd_present(*pgd)) | 575 | if (!pgd_present(*pgd)) |
@@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | |||
579 | goto out; | 580 | goto out; |
580 | 581 | ||
581 | pmd = pmd_offset(pud, address); | 582 | pmd = pmd_offset(pud, address); |
582 | if (!pmd_present(*pmd)) | 583 | /* |
584 | * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() | ||
585 | * without holding anon_vma lock for write. So when looking for a | ||
586 | * genuine pmde (in which to find pte), test present and !THP together. | ||
587 | */ | ||
588 | pmde = ACCESS_ONCE(*pmd); | ||
589 | if (!pmd_present(pmde) || pmd_trans_huge(pmde)) | ||
583 | pmd = NULL; | 590 | pmd = NULL; |
584 | out: | 591 | out: |
585 | return pmd; | 592 | return pmd; |
@@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
615 | if (!pmd) | 622 | if (!pmd) |
616 | return NULL; | 623 | return NULL; |
617 | 624 | ||
618 | if (pmd_trans_huge(*pmd)) | ||
619 | return NULL; | ||
620 | |||
621 | pte = pte_offset_map(pmd, address); | 625 | pte = pte_offset_map(pmd, address); |
622 | /* Make a quick check before getting the lock */ | 626 | /* Make a quick check before getting the lock */ |
623 | if (!sync && !pte_present(*pte)) { | 627 | if (!sync && !pte_present(*pte)) { |
diff --git a/mm/shmem.c b/mm/shmem.c index f484c276e994..1140f49b6ded 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; | |||
80 | #define SHORT_SYMLINK_LEN 128 | 80 | #define SHORT_SYMLINK_LEN 128 |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | 83 | * shmem_fallocate communicates with shmem_fault or shmem_writepage via |
84 | * (with i_mutex making sure that it has only one user at a time): | 84 | * inode->i_private (with i_mutex making sure that it has only one user at |
85 | * we would prefer not to enlarge the shmem inode just for that. | 85 | * a time): we would prefer not to enlarge the shmem inode just for that. |
86 | */ | 86 | */ |
87 | struct shmem_falloc { | 87 | struct shmem_falloc { |
88 | int mode; /* FALLOC_FL mode currently operating */ | ||
88 | pgoff_t start; /* start of range currently being fallocated */ | 89 | pgoff_t start; /* start of range currently being fallocated */ |
89 | pgoff_t next; /* the next page offset to be fallocated */ | 90 | pgoff_t next; /* the next page offset to be fallocated */ |
90 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | 91 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ |
@@ -759,6 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
759 | spin_lock(&inode->i_lock); | 760 | spin_lock(&inode->i_lock); |
760 | shmem_falloc = inode->i_private; | 761 | shmem_falloc = inode->i_private; |
761 | if (shmem_falloc && | 762 | if (shmem_falloc && |
763 | !shmem_falloc->mode && | ||
762 | index >= shmem_falloc->start && | 764 | index >= shmem_falloc->start && |
763 | index < shmem_falloc->next) | 765 | index < shmem_falloc->next) |
764 | shmem_falloc->nr_unswapped++; | 766 | shmem_falloc->nr_unswapped++; |
@@ -1027,6 +1029,9 @@ repeat: | |||
1027 | goto failed; | 1029 | goto failed; |
1028 | } | 1030 | } |
1029 | 1031 | ||
1032 | if (page && sgp == SGP_WRITE) | ||
1033 | mark_page_accessed(page); | ||
1034 | |||
1030 | /* fallocated page? */ | 1035 | /* fallocated page? */ |
1031 | if (page && !PageUptodate(page)) { | 1036 | if (page && !PageUptodate(page)) { |
1032 | if (sgp != SGP_READ) | 1037 | if (sgp != SGP_READ) |
@@ -1108,6 +1113,9 @@ repeat: | |||
1108 | shmem_recalc_inode(inode); | 1113 | shmem_recalc_inode(inode); |
1109 | spin_unlock(&info->lock); | 1114 | spin_unlock(&info->lock); |
1110 | 1115 | ||
1116 | if (sgp == SGP_WRITE) | ||
1117 | mark_page_accessed(page); | ||
1118 | |||
1111 | delete_from_swap_cache(page); | 1119 | delete_from_swap_cache(page); |
1112 | set_page_dirty(page); | 1120 | set_page_dirty(page); |
1113 | swap_free(swap); | 1121 | swap_free(swap); |
@@ -1134,6 +1142,9 @@ repeat: | |||
1134 | 1142 | ||
1135 | __SetPageSwapBacked(page); | 1143 | __SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1144 | __set_page_locked(page); |
1145 | if (sgp == SGP_WRITE) | ||
1146 | init_page_accessed(page); | ||
1147 | |||
1137 | error = mem_cgroup_charge_file(page, current->mm, | 1148 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1149 | gfp & GFP_RECLAIM_MASK); |
1139 | if (error) | 1150 | if (error) |
@@ -1233,6 +1244,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1233 | int error; | 1244 | int error; |
1234 | int ret = VM_FAULT_LOCKED; | 1245 | int ret = VM_FAULT_LOCKED; |
1235 | 1246 | ||
1247 | /* | ||
1248 | * Trinity finds that probing a hole which tmpfs is punching can | ||
1249 | * prevent the hole-punch from ever completing: which in turn | ||
1250 | * locks writers out with its hold on i_mutex. So refrain from | ||
1251 | * faulting pages into the hole while it's being punched, and | ||
1252 | * wait on i_mutex to be released if vmf->flags permits. | ||
1253 | */ | ||
1254 | if (unlikely(inode->i_private)) { | ||
1255 | struct shmem_falloc *shmem_falloc; | ||
1256 | |||
1257 | spin_lock(&inode->i_lock); | ||
1258 | shmem_falloc = inode->i_private; | ||
1259 | if (!shmem_falloc || | ||
1260 | shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || | ||
1261 | vmf->pgoff < shmem_falloc->start || | ||
1262 | vmf->pgoff >= shmem_falloc->next) | ||
1263 | shmem_falloc = NULL; | ||
1264 | spin_unlock(&inode->i_lock); | ||
1265 | /* | ||
1266 | * i_lock has protected us from taking shmem_falloc seriously | ||
1267 | * once return from shmem_fallocate() went back up that stack. | ||
1268 | * i_lock does not serialize with i_mutex at all, but it does | ||
1269 | * not matter if sometimes we wait unnecessarily, or sometimes | ||
1270 | * miss out on waiting: we just need to make those cases rare. | ||
1271 | */ | ||
1272 | if (shmem_falloc) { | ||
1273 | if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && | ||
1274 | !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { | ||
1275 | up_read(&vma->vm_mm->mmap_sem); | ||
1276 | mutex_lock(&inode->i_mutex); | ||
1277 | mutex_unlock(&inode->i_mutex); | ||
1278 | return VM_FAULT_RETRY; | ||
1279 | } | ||
1280 | /* cond_resched? Leave that to GUP or return to user */ | ||
1281 | return VM_FAULT_NOPAGE; | ||
1282 | } | ||
1283 | } | ||
1284 | |||
1236 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1285 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1237 | if (error) | 1286 | if (error) |
1238 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1287 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
@@ -1372,13 +1421,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1372 | loff_t pos, unsigned len, unsigned flags, | 1421 | loff_t pos, unsigned len, unsigned flags, |
1373 | struct page **pagep, void **fsdata) | 1422 | struct page **pagep, void **fsdata) |
1374 | { | 1423 | { |
1375 | int ret; | ||
1376 | struct inode *inode = mapping->host; | 1424 | struct inode *inode = mapping->host; |
1377 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1425 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1378 | ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1426 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1379 | if (ret == 0 && *pagep) | ||
1380 | init_page_accessed(*pagep); | ||
1381 | return ret; | ||
1382 | } | 1427 | } |
1383 | 1428 | ||
1384 | static int | 1429 | static int |
@@ -1724,20 +1769,31 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1724 | pgoff_t start, index, end; | 1769 | pgoff_t start, index, end; |
1725 | int error; | 1770 | int error; |
1726 | 1771 | ||
1772 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) | ||
1773 | return -EOPNOTSUPP; | ||
1774 | |||
1727 | mutex_lock(&inode->i_mutex); | 1775 | mutex_lock(&inode->i_mutex); |
1728 | 1776 | ||
1777 | shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; | ||
1778 | |||
1729 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 1779 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
1730 | struct address_space *mapping = file->f_mapping; | 1780 | struct address_space *mapping = file->f_mapping; |
1731 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | 1781 | loff_t unmap_start = round_up(offset, PAGE_SIZE); |
1732 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | 1782 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; |
1733 | 1783 | ||
1784 | shmem_falloc.start = unmap_start >> PAGE_SHIFT; | ||
1785 | shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; | ||
1786 | spin_lock(&inode->i_lock); | ||
1787 | inode->i_private = &shmem_falloc; | ||
1788 | spin_unlock(&inode->i_lock); | ||
1789 | |||
1734 | if ((u64)unmap_end > (u64)unmap_start) | 1790 | if ((u64)unmap_end > (u64)unmap_start) |
1735 | unmap_mapping_range(mapping, unmap_start, | 1791 | unmap_mapping_range(mapping, unmap_start, |
1736 | 1 + unmap_end - unmap_start, 0); | 1792 | 1 + unmap_end - unmap_start, 0); |
1737 | shmem_truncate_range(inode, offset, offset + len - 1); | 1793 | shmem_truncate_range(inode, offset, offset + len - 1); |
1738 | /* No need to unmap again: hole-punching leaves COWed pages */ | 1794 | /* No need to unmap again: hole-punching leaves COWed pages */ |
1739 | error = 0; | 1795 | error = 0; |
1740 | goto out; | 1796 | goto undone; |
1741 | } | 1797 | } |
1742 | 1798 | ||
1743 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | 1799 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ |
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
386 | 386 | ||
387 | #endif | 387 | #endif |
388 | 388 | ||
389 | #define OBJECT_FREE (0) | ||
390 | #define OBJECT_ACTIVE (1) | ||
391 | |||
392 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
393 | |||
394 | static void set_obj_status(struct page *page, int idx, int val) | ||
395 | { | ||
396 | int freelist_size; | ||
397 | char *status; | ||
398 | struct kmem_cache *cachep = page->slab_cache; | ||
399 | |||
400 | freelist_size = cachep->num * sizeof(freelist_idx_t); | ||
401 | status = (char *)page->freelist + freelist_size; | ||
402 | status[idx] = val; | ||
403 | } | ||
404 | |||
405 | static inline unsigned int get_obj_status(struct page *page, int idx) | ||
406 | { | ||
407 | int freelist_size; | ||
408 | char *status; | ||
409 | struct kmem_cache *cachep = page->slab_cache; | ||
410 | |||
411 | freelist_size = cachep->num * sizeof(freelist_idx_t); | ||
412 | status = (char *)page->freelist + freelist_size; | ||
413 | |||
414 | return status[idx]; | ||
415 | } | ||
416 | |||
417 | #else | ||
418 | static inline void set_obj_status(struct page *page, int idx, int val) {} | ||
419 | |||
420 | #endif | ||
421 | |||
389 | /* | 422 | /* |
390 | * Do not go above this order unless 0 objects fit into the slab or | 423 | * Do not go above this order unless 0 objects fit into the slab or |
391 | * overridden on the command line. | 424 | * overridden on the command line. |
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
576 | return cachep->array[smp_processor_id()]; | 609 | return cachep->array[smp_processor_id()]; |
577 | } | 610 | } |
578 | 611 | ||
612 | static size_t calculate_freelist_size(int nr_objs, size_t align) | ||
613 | { | ||
614 | size_t freelist_size; | ||
615 | |||
616 | freelist_size = nr_objs * sizeof(freelist_idx_t); | ||
617 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | ||
618 | freelist_size += nr_objs * sizeof(char); | ||
619 | |||
620 | if (align) | ||
621 | freelist_size = ALIGN(freelist_size, align); | ||
622 | |||
623 | return freelist_size; | ||
624 | } | ||
625 | |||
579 | static int calculate_nr_objs(size_t slab_size, size_t buffer_size, | 626 | static int calculate_nr_objs(size_t slab_size, size_t buffer_size, |
580 | size_t idx_size, size_t align) | 627 | size_t idx_size, size_t align) |
581 | { | 628 | { |
582 | int nr_objs; | 629 | int nr_objs; |
630 | size_t remained_size; | ||
583 | size_t freelist_size; | 631 | size_t freelist_size; |
632 | int extra_space = 0; | ||
584 | 633 | ||
634 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | ||
635 | extra_space = sizeof(char); | ||
585 | /* | 636 | /* |
586 | * Ignore padding for the initial guess. The padding | 637 | * Ignore padding for the initial guess. The padding |
587 | * is at most @align-1 bytes, and @buffer_size is at | 638 | * is at most @align-1 bytes, and @buffer_size is at |
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, | |||
590 | * into the memory allocation when taking the padding | 641 | * into the memory allocation when taking the padding |
591 | * into account. | 642 | * into account. |
592 | */ | 643 | */ |
593 | nr_objs = slab_size / (buffer_size + idx_size); | 644 | nr_objs = slab_size / (buffer_size + idx_size + extra_space); |
594 | 645 | ||
595 | /* | 646 | /* |
596 | * This calculated number will be either the right | 647 | * This calculated number will be either the right |
597 | * amount, or one greater than what we want. | 648 | * amount, or one greater than what we want. |
598 | */ | 649 | */ |
599 | freelist_size = slab_size - nr_objs * buffer_size; | 650 | remained_size = slab_size - nr_objs * buffer_size; |
600 | if (freelist_size < ALIGN(nr_objs * idx_size, align)) | 651 | freelist_size = calculate_freelist_size(nr_objs, align); |
652 | if (remained_size < freelist_size) | ||
601 | nr_objs--; | 653 | nr_objs--; |
602 | 654 | ||
603 | return nr_objs; | 655 | return nr_objs; |
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
635 | } else { | 687 | } else { |
636 | nr_objs = calculate_nr_objs(slab_size, buffer_size, | 688 | nr_objs = calculate_nr_objs(slab_size, buffer_size, |
637 | sizeof(freelist_idx_t), align); | 689 | sizeof(freelist_idx_t), align); |
638 | mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); | 690 | mgmt_size = calculate_freelist_size(nr_objs, align); |
639 | } | 691 | } |
640 | *num = nr_objs; | 692 | *num = nr_objs; |
641 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; | 693 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; |
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2041 | break; | 2093 | break; |
2042 | 2094 | ||
2043 | if (flags & CFLGS_OFF_SLAB) { | 2095 | if (flags & CFLGS_OFF_SLAB) { |
2096 | size_t freelist_size_per_obj = sizeof(freelist_idx_t); | ||
2044 | /* | 2097 | /* |
2045 | * Max number of objs-per-slab for caches which | 2098 | * Max number of objs-per-slab for caches which |
2046 | * use off-slab slabs. Needed to avoid a possible | 2099 | * use off-slab slabs. Needed to avoid a possible |
2047 | * looping condition in cache_grow(). | 2100 | * looping condition in cache_grow(). |
2048 | */ | 2101 | */ |
2102 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | ||
2103 | freelist_size_per_obj += sizeof(char); | ||
2049 | offslab_limit = size; | 2104 | offslab_limit = size; |
2050 | offslab_limit /= sizeof(freelist_idx_t); | 2105 | offslab_limit /= freelist_size_per_obj; |
2051 | 2106 | ||
2052 | if (num > offslab_limit) | 2107 | if (num > offslab_limit) |
2053 | break; | 2108 | break; |
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2294 | if (!cachep->num) | 2349 | if (!cachep->num) |
2295 | return -E2BIG; | 2350 | return -E2BIG; |
2296 | 2351 | ||
2297 | freelist_size = | 2352 | freelist_size = calculate_freelist_size(cachep->num, cachep->align); |
2298 | ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); | ||
2299 | 2353 | ||
2300 | /* | 2354 | /* |
2301 | * If the slab has been placed off-slab, and we have enough space then | 2355 | * If the slab has been placed off-slab, and we have enough space then |
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2308 | 2362 | ||
2309 | if (flags & CFLGS_OFF_SLAB) { | 2363 | if (flags & CFLGS_OFF_SLAB) { |
2310 | /* really off slab. No need for manual alignment */ | 2364 | /* really off slab. No need for manual alignment */ |
2311 | freelist_size = cachep->num * sizeof(freelist_idx_t); | 2365 | freelist_size = calculate_freelist_size(cachep->num, 0); |
2312 | 2366 | ||
2313 | #ifdef CONFIG_PAGE_POISONING | 2367 | #ifdef CONFIG_PAGE_POISONING |
2314 | /* If we're going to use the generic kernel_map_pages() | 2368 | /* If we're going to use the generic kernel_map_pages() |
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2612 | if (cachep->ctor) | 2666 | if (cachep->ctor) |
2613 | cachep->ctor(objp); | 2667 | cachep->ctor(objp); |
2614 | #endif | 2668 | #endif |
2669 | set_obj_status(page, i, OBJECT_FREE); | ||
2615 | set_free_obj(page, i, i); | 2670 | set_free_obj(page, i, i); |
2616 | } | 2671 | } |
2617 | } | 2672 | } |
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2820 | BUG_ON(objnr >= cachep->num); | 2875 | BUG_ON(objnr >= cachep->num); |
2821 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); | 2876 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); |
2822 | 2877 | ||
2878 | set_obj_status(page, objnr, OBJECT_FREE); | ||
2823 | if (cachep->flags & SLAB_POISON) { | 2879 | if (cachep->flags & SLAB_POISON) { |
2824 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2880 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2825 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 2881 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | |||
2953 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | 3009 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
2954 | gfp_t flags, void *objp, unsigned long caller) | 3010 | gfp_t flags, void *objp, unsigned long caller) |
2955 | { | 3011 | { |
3012 | struct page *page; | ||
3013 | |||
2956 | if (!objp) | 3014 | if (!objp) |
2957 | return objp; | 3015 | return objp; |
2958 | if (cachep->flags & SLAB_POISON) { | 3016 | if (cachep->flags & SLAB_POISON) { |
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
2983 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 3041 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2984 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 3042 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2985 | } | 3043 | } |
3044 | |||
3045 | page = virt_to_head_page(objp); | ||
3046 | set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); | ||
2986 | objp += obj_offset(cachep); | 3047 | objp += obj_offset(cachep); |
2987 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3048 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
2988 | cachep->ctor(objp); | 3049 | cachep->ctor(objp); |
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, | |||
4219 | struct page *page) | 4280 | struct page *page) |
4220 | { | 4281 | { |
4221 | void *p; | 4282 | void *p; |
4222 | int i, j; | 4283 | int i; |
4223 | 4284 | ||
4224 | if (n[0] == n[1]) | 4285 | if (n[0] == n[1]) |
4225 | return; | 4286 | return; |
4226 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { | 4287 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { |
4227 | bool active = true; | 4288 | if (get_obj_status(page, i) != OBJECT_ACTIVE) |
4228 | |||
4229 | for (j = page->active; j < c->num; j++) { | ||
4230 | /* Skip freed item */ | ||
4231 | if (get_free_obj(page, j) == i) { | ||
4232 | active = false; | ||
4233 | break; | ||
4234 | } | ||
4235 | } | ||
4236 | if (!active) | ||
4237 | continue; | 4289 | continue; |
4238 | 4290 | ||
4239 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4291 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) |
@@ -1881,7 +1881,7 @@ redo: | |||
1881 | 1881 | ||
1882 | new.frozen = 0; | 1882 | new.frozen = 0; |
1883 | 1883 | ||
1884 | if (!new.inuse && n->nr_partial > s->min_partial) | 1884 | if (!new.inuse && n->nr_partial >= s->min_partial) |
1885 | m = M_FREE; | 1885 | m = M_FREE; |
1886 | else if (new.freelist) { | 1886 | else if (new.freelist) { |
1887 | m = M_PARTIAL; | 1887 | m = M_PARTIAL; |
@@ -1992,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s, | |||
1992 | new.freelist, new.counters, | 1992 | new.freelist, new.counters, |
1993 | "unfreezing slab")); | 1993 | "unfreezing slab")); |
1994 | 1994 | ||
1995 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { | 1995 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { |
1996 | page->next = discard_page; | 1996 | page->next = discard_page; |
1997 | discard_page = page; | 1997 | discard_page = page; |
1998 | } else { | 1998 | } else { |
@@ -2620,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2620 | return; | 2620 | return; |
2621 | } | 2621 | } |
2622 | 2622 | ||
2623 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) | 2623 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) |
2624 | goto slab_empty; | 2624 | goto slab_empty; |
2625 | 2625 | ||
2626 | /* | 2626 | /* |