Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c       1
-rw-r--r--  mm/debug.c             4
-rw-r--r--  mm/gup.c               3
-rw-r--r--  mm/hugetlb.c          84
-rw-r--r--  mm/kasan/Makefile      3
-rw-r--r--  mm/kasan/common.c     82
-rw-r--r--  mm/kasan/tags.c        2
-rw-r--r--  mm/kmemleak.c         10
-rw-r--r--  mm/memblock.c         11
-rw-r--r--  mm/memory-failure.c   19
-rw-r--r--  mm/memory.c           26
-rw-r--r--  mm/memory_hotplug.c   85
-rw-r--r--  mm/mempolicy.c         6
-rw-r--r--  mm/migrate.c          25
-rw-r--r--  mm/mincore.c          94
-rw-r--r--  mm/oom_kill.c         12
-rw-r--r--  mm/page_alloc.c       40
-rw-r--r--  mm/page_ext.c          4
-rw-r--r--  mm/rmap.c              8
-rw-r--r--  mm/shmem.c            10
-rw-r--r--  mm/slab.c             21
-rw-r--r--  mm/slab.h              7
-rw-r--r--  mm/slab_common.c       3
-rw-r--r--  mm/slub.c             61
-rw-r--r--  mm/swap.c             17
-rw-r--r--  mm/usercopy.c          9
-rw-r--r--  mm/userfaultfd.c      11
-rw-r--r--  mm/util.c              4
-rw-r--r--  mm/vmscan.c           10
29 files changed, 387 insertions, 285 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..72e6d0c55cfa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
 	mutex_init(&bdi->cgwb_release_mutex);
+	init_rwsem(&bdi->wb_switch_rwsem);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
diff --git a/mm/debug.c b/mm/debug.c
index 0abb987dad9b..1611cf00a137 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
-	struct address_space *mapping = page_mapping(page);
+	struct address_space *mapping;
 	bool page_poisoned = PagePoisoned(page);
 	int mapcount;
 
@@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason)
 		goto hex_only;
 	}
 
+	mapping = page_mapping(page);
+
 	/*
 	 * Avoid VM_BUG_ON() in page_mapcount().
 	 * page->_mapcount space in struct page is used by sl[aou]b pages to
diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..75029649baca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1674,7 +1674,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (!pmd_present(pmd))
 			return 0;
 
-		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+			     pmd_devmap(pmd))) {
 			/*
 			 * NUMA hinting faults need to be handled in the GUP
 			 * slowpath for accounting purposes and so that they
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..afef61656c1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
-	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_range_init(&range, src, vma->vm_start,
 					vma->vm_end);
 		mmu_notifier_invalidate_range_start(&range);
-	} else {
-		/*
-		 * For shared mappings i_mmap_rwsem must be held to call
-		 * huge_pte_alloc, otherwise the returned ptep could go
-		 * away if part of a shared pmd and another thread calls
-		 * huge_pmd_unshare.
-		 */
-		i_mmap_lock_read(mapping);
 	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
-
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
-
 		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 	if (cow)
 		mmu_notifier_invalidate_range_end(&range);
-	else
-		i_mmap_unlock_read(mapping);
 
 	return ret;
 }
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto out;
-
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		if (idx >= size)
+			goto out;
+
 		/*
 		 * Check for page in userfault range
 		 */
@@ -3784,18 +3771,14 @@ retry:
 			};
 
 			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault. Reacquire
-			 * after handling fault to make calling code simpler.
+			 * hugetlb_fault_mutex must be dropped before
+			 * handling userfault. Reacquire after handling
+			 * fault to make calling code simpler.
 			 */
 			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
 							idx, haddr);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
-			i_mmap_lock_read(mapping);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
 		}
@@ -3854,6 +3837,9 @@ retry:
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
-		/*
-		 * Since we hold no locks, ptep could be stale. That is
-		 * OK as we are only making decisions based on content and
-		 * not actually modifying content here.
-		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
+	} else {
+		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+		if (!ptep)
+			return VM_FAULT_OOM;
 	}
 
-	/*
-	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep. This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
-	 *
-	 * ptep could have already be assigned via huge_pte_offset. That
-	 * is OK, as huge_pte_alloc will return the same value unless
-	 * something changed.
-	 */
 	mapping = vma->vm_file->f_mapping;
-	i_mmap_lock_read(mapping);
-	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-	if (!ptep) {
-		i_mmap_unlock_read(mapping);
-		return VM_FAULT_OOM;
-	}
+	idx = vma_hugecache_offset(h, vma, haddr);
 
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -4066,7 +4034,6 @@ out_ptl:
 	}
 out_mutex:
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-	i_mmap_unlock_read(mapping);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
 	 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4301,7 +4268,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 			if (ret & VM_FAULT_RETRY) {
-				if (nonblocking)
+				if (nonblocking &&
+				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 					*nonblocking = 0;
 				*nr_pages = 0;
 				/*
@@ -4671,12 +4639,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
@@ -4693,6 +4659,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -4722,6 +4689,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
@@ -4732,7 +4700,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 0a14fcff70ed..5d1065efbd47 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,7 +5,10 @@ UBSAN_SANITIZE_generic.o := n
 UBSAN_SANITIZE_tags.o := n
 KCOV_INSTRUMENT := n
 
+CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_generic.o = -pg
+CFLAGS_REMOVE_tags.o = -pg
+
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
 
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..09b534fbba17 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
 		return;
 	}
 
-	cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
 	*flags |= SLAB_KASAN;
 }
 
@@ -349,28 +347,48 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 }
 
 /*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ *    object somewhere (e.g. in the object itself). We preassign a tag for
+ *    each object in caches with constructors during slab creation and reuse
+ *    the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ *    accessed after being freed. We preassign tags for objects in these
+ *    caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ *    is stored as an array of indexes instead of a linked list. Assign tags
+ *    based on objects indexes, so that objects that are next to each other
+ *    get different tags.
  */
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+			bool init, bool keep_tag)
 {
+	/*
+	 * 1. When an object is kmalloc()'ed, two hooks are called:
+	 *    kasan_slab_alloc() and kasan_kmalloc(). We assign the
+	 *    tag only in the first one.
+	 * 2. We reuse the same tag for krealloc'ed objects.
+	 */
+	if (keep_tag)
+		return get_tag(object);
+
+	/*
+	 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+	 * set, assign a tag when the object is being allocated (init == false).
+	 */
 	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
-		return new ? KASAN_TAG_KERNEL : random_tag();
+		return init ? KASAN_TAG_KERNEL : random_tag();
 
+	/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
 #ifdef CONFIG_SLAB
+	/* For SLAB assign tags based on the object index in the freelist. */
 	return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
 #else
-	return new ? random_tag() : get_tag(object);
+	/*
+	 * For SLUB assign a random tag during slab creation, otherwise reuse
+	 * the already assigned tag.
+	 */
+	return init ? random_tag() : get_tag(object);
 #endif
 }
 
@@ -386,17 +404,12 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
 	__memset(alloc_info, 0, sizeof(*alloc_info));
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		object = set_tag(object, assign_tag(cache, object, true));
+		object = set_tag(object,
+				assign_tag(cache, object, true, false));
 
 	return (void *)object;
 }
 
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
-					gfp_t flags)
-{
-	return kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
 static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
 {
 	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -452,8 +465,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 	return __kasan_slab_free(cache, object, ip, true);
 }
 
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags, bool keep_tag)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
@@ -471,7 +484,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_SHADOW_SCALE_SIZE);
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		tag = assign_tag(cache, object, false);
+		tag = assign_tag(cache, object, false, keep_tag);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
 	kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +496,18 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 
 	return set_tag(object, tag);
 }
+
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+					gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, size, flags, true);
+}
 EXPORT_SYMBOL(kasan_kmalloc);
 
 void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +547,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
 	if (unlikely(!PageSlab(page)))
 		return kasan_kmalloc_large(object, size, flags);
 	else
-		return kasan_kmalloc(page->slab_cache, object, size, flags);
+		return __kasan_kmalloc(page->slab_cache, object, size,
+					flags, true);
 }
 
 void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0777649e07c4..63fca3172659 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -46,7 +46,7 @@ void kasan_init_tags(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		per_cpu(prng_state, cpu) = get_random_u32();
+		per_cpu(prng_state, cpu) = (u32)get_cycles();
 }
 
 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f9d9dc250428..707fa5579f66 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	unsigned long flags;
 	struct kmemleak_object *object, *parent;
 	struct rb_node **link, *rb_parent;
+	unsigned long untagged_ptr;
 
 	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
@@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 
 	write_lock_irqsave(&kmemleak_lock, flags);
 
-	min_addr = min(min_addr, ptr);
-	max_addr = max(max_addr, ptr + size);
+	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+	min_addr = min(min_addr, untagged_ptr);
+	max_addr = max(max_addr, untagged_ptr + size);
 	link = &object_tree_root.rb_node;
 	rb_parent = NULL;
 	while (*link) {
@@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end,
 	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
 	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
 	unsigned long flags;
+	unsigned long untagged_ptr;
 
 	read_lock_irqsave(&kmemleak_lock, flags);
 	for (ptr = start; ptr < end; ptr++) {
@@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end,
 		pointer = *ptr;
 		kasan_enable_current();
 
-		if (pointer < min_addr || pointer >= max_addr)
+		untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+		if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
 			continue;
 
 		/*
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cbb3618..ea31045ba704 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -26,6 +26,13 @@
 
 #include "internal.h"
 
+#define INIT_MEMBLOCK_REGIONS			128
+#define INIT_PHYSMEM_REGIONS			4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS		INIT_MEMBLOCK_REGIONS
+#endif
+
 /**
  * DOC: memblock overview
  *
@@ -92,7 +99,7 @@ unsigned long max_pfn;
 unsigned long long max_possible_pfn;
 
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
 #endif
@@ -105,7 +112,7 @@ struct memblock memblock __initdata_memblock = {
 
 	.reserved.regions	= memblock_reserved_init_regions,
 	.reserved.cnt		= 1,	/* empty dummy entry */
-	.reserved.max		= INIT_MEMBLOCK_REGIONS,
+	.reserved.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
 	.reserved.name		= "reserved",
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..831be5ff5f4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
 		if (fail || tk->addr_valid == 0) {
 			pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 			       pfn, tk->tsk->comm, tk->tsk->pid);
-			force_sig(SIGKILL, tk->tsk);
+			do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+					 tk->tsk, PIDTYPE_PID);
 		}
 
 		/*
@@ -966,7 +967,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	bool unmap_success = true;
+	bool unmap_success;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
 	bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1029,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	if (!PageHuge(hpage)) {
-		unmap_success = try_to_unmap(hpage, ttu);
-	} else if (mapping) {
-		/*
-		 * For hugetlb pages, try_to_unmap could potentially call
-		 * huge_pmd_unshare. Because of this, take semaphore in
-		 * write mode here and set TTU_RMAP_LOCKED to indicate we
-		 * have taken the lock at this higer level.
-		 */
-		i_mmap_lock_write(mapping);
-		unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
-	}
+	unmap_success = try_to_unmap(hpage, ttu);
 	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret;
 
+	/*
+	 * Preallocate pte before we take page_lock because this might lead to
+	 * deadlocks for memcg reclaim which waits for pages under writeback:
+	 *				lock_page(A)
+	 *				SetPageWriteback(A)
+	 *				unlock_page(A)
+	 * lock_page(B)
+	 *				lock_page(B)
+	 * pte_alloc_pne
+	 *   shrink_page_list
+	 *     wait_on_page_writeback(A)
+	 *				SetPageWriteback(B)
+	 *				unlock_page(B)
+	 *				# flush A, B to clear the writeback
+	 */
+	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
 			    VM_FAULT_DONE_COW)))
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	if (range) {
-		range->start = address & PAGE_MASK;
-		range->end = range->start + PAGE_SIZE;
+		mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+					(address & PAGE_MASK) + PAGE_SIZE);
 		mmu_notifier_invalidate_range_start(range);
 	}
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d36c55..1ad28323fb9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1188,11 +1188,13 @@ static inline int pageblock_free(struct page *page)
 	return PageBuddy(page) && page_order(page) >= pageblock_order;
 }
 
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
+
 	/* Ensure the starting page is pageblock-aligned */
-	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+	BUG_ON(pfn & (pageblock_nr_pages - 1));
 
 	/* If the entire pageblock is free, move to the end of free page */
 	if (pageblock_free(page)) {
@@ -1200,16 +1202,16 @@ static struct page *next_active_pageblock(struct page *page)
 		/* be careful. we don't have locks, page_order can be changed.*/
 		order = page_order(page);
 		if ((order < MAX_ORDER) && (order >= pageblock_order))
-			return page + (1 << order);
+			return pfn + (1 << order);
 	}
 
-	return page + pageblock_nr_pages;
+	return pfn + pageblock_nr_pages;
 }
 
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
 	struct zone *zone;
-	unsigned long pfn;
 
 	/*
 	 * We have to be careful here because we are iterating over memory
@@ -1232,12 +1234,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
 /* Checks if this range of memory is likely to be hot-removable. */
 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
-	struct page *page = pfn_to_page(start_pfn);
-	struct page *end_page = page + nr_pages;
+	unsigned long end_pfn, pfn;
+
+	end_pfn = min(start_pfn + nr_pages,
+		      zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
 
 	/* Check the starting page of each pageblock within the range */
-	for (; page < end_page; page = next_active_pageblock(page)) {
-		if (!is_pageblock_removable_nolock(page))
+	for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+		if (!is_pageblock_removable_nolock(pfn))
 			return false;
 		cond_resched();
 	}
@@ -1273,6 +1277,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 		i++;
 		if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
 			continue;
+		/* Check if we got outside of the zone */
+		if (zone && !zone_spans_pfn(zone, pfn + i))
+			return 0;
 		page = pfn_to_page(pfn + i);
 		if (zone && page_zone(page) != zone)
 			return 0;
@@ -1301,23 +1308,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
-	struct page *page;
+
 	for (pfn = start; pfn < end; pfn++) {
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (PageLRU(page))
-				return pfn;
-			if (__PageMovable(page))
-				return pfn;
-			if (PageHuge(page)) {
-				if (hugepage_migration_supported(page_hstate(page)) &&
-				    page_huge_active(page))
-					return pfn;
-				else
-					pfn = round_up(pfn + 1,
-						1 << compound_order(page)) - 1;
-			}
-		}
+		struct page *page, *head;
+		unsigned long skip;
+
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		if (PageLRU(page))
+			return pfn;
+		if (__PageMovable(page))
+			return pfn;
+
+		if (!PageHuge(page))
+			continue;
+		head = compound_head(page);
+		if (hugepage_migration_supported(page_hstate(head)) &&
+		    page_huge_active(head))
+			return pfn;
+		skip = (1 << compound_order(head)) - (page - head);
+		pfn += skip - 1;
 	}
 	return 0;
 }
@@ -1344,7 +1355,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
 	struct page *page;
-	int not_managed = 0;
 	int ret = 0;
 	LIST_HEAD(source);
 
@@ -1392,7 +1402,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		else
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
-			put_page(page);
 			list_add_tail(&page->lru, &source);
 			if (!__PageMovable(page))
 				inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1401,22 +1410,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		} else {
 			pr_warn("failed to isolate pfn %lx\n", pfn);
 			dump_page(page, "isolation failed");
-			put_page(page);
-			/* Because we don't have big zone->lock. we should
-			   check this again here. */
-			if (page_count(page)) {
-				not_managed++;
-				ret = -EBUSY;
-				break;
-			}
 		}
+		put_page(page);
 	}
 	if (!list_empty(&source)) {
-		if (not_managed) {
-			putback_movable_pages(&source);
-			goto out;
-		}
-
 		/* Allocate a new page from the nearest neighbor node */
 		ret = migrate_pages(&source, new_node_page, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
@@ -1429,7 +1426,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			putback_movable_pages(&source);
 		}
 	}
-out:
+
 	return ret;
 }
 
@@ -1576,7 +1573,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	   we assume this for now. .*/
 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
 				  &valid_end)) {
-		mem_hotplug_done();
 		ret = -EINVAL;
 		reason = "multizone range";
 		goto failed_removal;
@@ -1591,7 +1587,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 				       MIGRATE_MOVABLE,
 				       SKIP_HWPOISON | REPORT_FAILURE);
 	if (ret) {
-		mem_hotplug_done();
 		reason = "failure to isolate range";
 		goto failed_removal;
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..ee2bce59d2bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 			      nodemask_t *nodes)
 {
 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
 
 	if (copy > nbytes) {
 		if (copy > PAGE_SIZE)
@@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy,
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
+	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 	unsigned long nr_bits, alloc_size;
 	DECLARE_BITMAP(bm, MAX_NUMNODES);
 
-	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask)
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..d4fd680be3b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 	/* Simple case, sync compaction */
 	if (mode != MIGRATE_ASYNC) {
 		do {
-			get_bh(bh);
 			lock_buffer(bh);
 			bh = bh->b_this_page;
 
@@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 
 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
 	do {
-		get_bh(bh);
 		if (!trylock_buffer(bh)) {
 			/*
 			 * We failed to lock the buffer and cannot stall in
 			 * async migration. Release the taken locks
 			 */
 			struct buffer_head *failed_bh = bh;
-			put_bh(failed_bh);
 			bh = head;
 			while (bh != failed_bh) {
 				unlock_buffer(bh);
-				put_bh(bh);
 				bh = bh->b_this_page;
 			}
 			return false;
@@ -818,7 +814,6 @@ unlock_buffers:
 	bh = head;
 	do {
 		unlock_buffer(bh);
-		put_bh(bh);
 		bh = bh->b_this_page;
 
 	} while (bh != head);
@@ -1135,10 +1130,13 @@ out:
 	 * If migration is successful, decrease refcount of the newpage
 	 * which will not free the page because new page owner increased
 	 * refcounter. As well, if it is LRU page, add the page to LRU
-	 * list in here.
+	 * list in here. Use the old state of the isolated source page to
+	 * determine if we migrated a LRU page. newpage was already unlocked
+	 * and possibly modified by its owner - don't rely on the page
+	 * state.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (unlikely(__PageMovable(newpage)))
+		if (unlikely(!is_lru))
 			put_page(newpage);
 		else
 			putback_lru_page(newpage);
@@ -1324,19 +1322,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 		goto put_anon;
 
 	if (page_mapped(hpage)) {
-		struct address_space *mapping = page_mapping(hpage);
-
-		/*
-		 * try_to_unmap could potentially call huge_pmd_unshare.
-		 * Because of this, take semaphore in write mode here and
-		 * set TTU_RMAP_LOCKED to let lower levels know we have
-		 * taken the lock.
-		 */
-		i_mmap_lock_write(mapping);
 		try_to_unmap(hpage,
-			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-			TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 		page_was_mapped = 1;
 	}
 
diff --git a/mm/mincore.c b/mm/mincore.c
index f0f91461a9f4..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -42,14 +42,72 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 	return 0;
 }
 
-static int mincore_unmapped_range(unsigned long addr, unsigned long end,
-				   struct mm_walk *walk)
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+	unsigned char present = 0;
+	struct page *page;
+
+	/*
+	 * When tmpfs swaps out a page from a file, any process mapping that
+	 * file will not get a swp_entry_t in its pte, but rather it is like
+	 * any other file mapping (ie. marked !present and faulted in with
+	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+	 */
+#ifdef CONFIG_SWAP
+	if (shmem_mapping(mapping)) {
+		page = find_get_entry(mapping, pgoff);
+		/*
+		 * shmem/tmpfs may return swap: account for swapcache
+		 * page too.
+		 */
+		if (xa_is_value(page)) {
+			swp_entry_t swp = radix_to_swp_entry(page);
+			page = find_get_page(swap_address_space(swp),
+					     swp_offset(swp));
+		}
+	} else
+		page = find_get_page(mapping, pgoff);
+#else
+	page = find_get_page(mapping, pgoff);
+#endif
+	if (page) {
+		present = PageUptodate(page);
+		put_page(page);
+	}
+
+	return present;
+}
+
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   struct vm_area_struct *vma, unsigned char *vec)
 {
-	unsigned char *vec = walk->private;
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	int i;
 
-	memset(vec, 0, nr);
-	walk->private += nr;
+	if (vma->vm_file) {
+		pgoff_t pgoff;
+
+		pgoff = linear_page_index(vma, addr);
+		for (i = 0; i < nr; i++, pgoff++)
+			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+	} else {
+		for (i = 0; i < nr; i++)
+			vec[i] = 0;
+	}
+	return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   struct mm_walk *walk)
+{
+	walk->private += __mincore_unmapped_range(addr, end,
+						  walk->vma, walk->private);
 	return 0;
 }
 
@@ -69,9 +127,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	/* We'll consider a THP page under construction to be there */
 	if (pmd_trans_unstable(pmd)) {
-		memset(vec, 1, nr);
+		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
 
@@ -80,17 +137,28 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		pte_t pte = *ptep;
 
 		if (pte_none(pte))
-			*vec = 0;
+			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
+						 vma, vec);
 		else if (pte_present(pte))
 			*vec = 1;
 		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			/*
-			 * migration or hwpoison entries are always
-			 * uptodate
-			 */
-			*vec = !!non_swap_entry(entry);
+			if (non_swap_entry(entry)) {
+				/*
+				 * migration or hwpoison entries are always
+				 * uptodate
+				 */
+				*vec = 1;
+			} else {
+#ifdef CONFIG_SWAP
+				*vec = mincore_page(swap_address_space(entry),
+						    swp_offset(entry));
+#else
+				WARN_ON(1);
+				*vec = 1;
+#endif
+			}
 		}
 		vec++;
 	}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9edb1a..26ea8636758f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
 
 static void wake_oom_reaper(struct task_struct *tsk)
 {
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	/* mm is already queued? */
+	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;
 
 	get_task_struct(tsk);
@@ -975,6 +975,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 * still freeing memory.
 	 */
 	read_lock(&tasklist_lock);
+
+	/*
+	 * The task 'p' might have already exited before reaching here. The
+	 * put_task_struct() will free task_struct 'p' while the loop still try
+	 * to access the field of 'p', so, get an extra reference.
+	 */
+	get_task_struct(p);
 	for_each_thread(p, t) {
 		list_for_each_entry(child, &t->children, sibling) {
 			unsigned int child_points;
@@ -994,6 +1001,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 			}
 		}
 	}
+	put_task_struct(p);
 	read_unlock(&tasklist_lock);
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..0b9f577b1a2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2170,6 +2170,18 @@ static inline void boost_watermark(struct zone *zone)
 
 	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
 			watermark_boost_factor, 10000);
+
+	/*
+	 * high watermark may be uninitialised if fragmentation occurs
+	 * very early in boot so do not boost. We do not fall
+	 * through and boost by pageblock_nr_pages as failing
+	 * allocations that early means that reclaim is not going
+	 * to help and it may even be impossible to reclaim the
+	 * boosted watermark resulting in a hang.
+	 */
+	if (!max_boost)
+		return;
+
 	max_boost = max(pageblock_nr_pages, max_boost);
 
 	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
@@ -2214,7 +2226,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	 */
 	boost_watermark(zone);
 	if (alloc_flags & ALLOC_KSWAPD)
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
@@ -3102,6 +3114,12 @@ struct page *rmqueue(struct zone *preferred_zone,
 	local_irq_restore(flags);
 
 out:
+	/* Separate test+clear to avoid unnecessary atomics */
+	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+	}
+
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 
@@ -4669,11 +4687,11 @@ refill:
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		page_ref_add(page, size - 1);
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->offset = size;
 	}
 
@@ -4689,10 +4707,10 @@ refill:
 		size = nc->size;
 #endif
 		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, size);
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		offset = size - fragsz;
 	}
 
@@ -5695,18 +5713,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			cond_resched();
 		}
 	}
-#ifdef CONFIG_SPARSEMEM
-	/*
-	 * If the zone does not span the rest of the section then
-	 * we should at least initialize those pages. Otherwise we
-	 * could blow up on a poisoned page in some paths which depend
-	 * on full sections being initialized (e.g. memory hotplug).
-	 */
-	while (end_pfn % PAGES_PER_SECTION) {
-		__init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
-		end_pfn++;
-	}
-#endif
 }
 
 #ifdef CONFIG_ZONE_DEVICE
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ae44f7adbe07..8c78b8d45117 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -398,10 +398,8 @@ void __init page_ext_init(void)
 		 * We know some arch can have a nodes layout such as
 		 * -------------pfn-------------->
 		 * N0 | N1 | N2 | N0 | N1 | N2|....
-		 *
-		 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 		 */
-		if (early_pfn_to_nid(pfn) != nid)
+		if (pfn_to_nid(pfn) != nid)
 			continue;
 		if (init_section_page_ext(pfn, nid))
 			goto oom;
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..0454ecc29537 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,7 +25,6 @@
  * page->flags PG_locked (lock_page)
  * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
  * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  * anon_vma->rwsem
  * mm->page_table_lock or pte_lock
  * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * Note that the page can not be free in this function as call of
 	 * try_to_unmap() must hold a reference on the page.
 	 */
-	mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
-				min(vma->vm_end, vma->vm_start +
+	mmu_notifier_range_init(&range, vma->vm_mm, address,
+				min(vma->vm_end, address +
 				    (PAGE_SIZE << compound_order(page))));
 	if (PageHuge(page)) {
 		/*
 		 * If sharing is possible, start and end will be adjusted
 		 * accordingly.
-		 *
-		 * If called for a huge page, caller must hold i_mmap_rwsem
-		 * in write mode as it is possible to call huge_pmd_unshare.
 		 */
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						      &range.end);
diff --git a/mm/shmem.c b/mm/shmem.c
index 6ece1e2fe76e..0905215fb016 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2854,10 +2854,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
2854 * No ordinary (disk based) filesystem counts links as inodes; 2854 * No ordinary (disk based) filesystem counts links as inodes;
2855 * but each new link needs a new dentry, pinning lowmem, and 2855 * but each new link needs a new dentry, pinning lowmem, and
2856 * tmpfs dentries cannot be pruned until they are unlinked. 2856 * tmpfs dentries cannot be pruned until they are unlinked.
2857 * But if an O_TMPFILE file is linked into the tmpfs, the
2858 * first link must skip that, to get the accounting right.
2857 */ 2859 */
2858 ret = shmem_reserve_inode(inode->i_sb); 2860 if (inode->i_nlink) {
2859 if (ret) 2861 ret = shmem_reserve_inode(inode->i_sb);
2860 goto out; 2862 if (ret)
2863 goto out;
2864 }
2861 2865
2862 dir->i_size += BOGO_DIRENT_SIZE; 2866 dir->i_size += BOGO_DIRENT_SIZE;
2863 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2867 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
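The new i_nlink check skips the inode reservation for the first link of an O_TMPFILE, which already consumed its reservation at creation time. A hedged userspace sketch of the same decision, with toy types in place of the shmem structures:

	#include <stdio.h>

	struct toy_inode {
		unsigned int i_nlink;
	};

	/* Stand-in for shmem_reserve_inode(): pretend the reservation succeeds. */
	static int reserve_inode(void)
	{
		puts("reserved one inode");
		return 0;
	}

	static int toy_shmem_link(struct toy_inode *inode)
	{
		int ret;

		/*
		 * An O_TMPFILE arrives here with i_nlink == 0 and was already
		 * accounted when it was created, so only charge a reservation
		 * for ordinary links.
		 */
		if (inode->i_nlink) {
			ret = reserve_inode();
			if (ret)
				return ret;
		}
		inode->i_nlink++;
		return 0;
	}

	int main(void)
	{
		struct toy_inode tmpfile = { .i_nlink = 0 };
		struct toy_inode regular = { .i_nlink = 1 };

		toy_shmem_link(&tmpfile);	/* no extra reservation */
		toy_shmem_link(&regular);	/* reserves one inode */
		return 0;
	}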
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..91c1863df93d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
666 struct alien_cache *alc = NULL; 666 struct alien_cache *alc = NULL;
667 667
668 alc = kmalloc_node(memsize, gfp, node); 668 alc = kmalloc_node(memsize, gfp, node);
669 init_arraycache(&alc->ac, entries, batch); 669 if (alc) {
670 spin_lock_init(&alc->lock); 670 init_arraycache(&alc->ac, entries, batch);
671 spin_lock_init(&alc->lock);
672 }
671 return alc; 673 return alc;
672} 674}
673 675
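The hunk above guards the array-cache initialization behind the allocation result. A trivial standalone sketch of the same pattern, with toy_alien_cache as a made-up type:

	#include <stdio.h>
	#include <stdlib.h>

	struct toy_alien_cache {
		int entries;
		int lock;	/* placeholder for the real spinlock */
	};

	static struct toy_alien_cache *alloc_alien_cache(int entries)
	{
		struct toy_alien_cache *alc = malloc(sizeof(*alc));

		/* Initialize only when the allocation actually succeeded. */
		if (alc) {
			alc->entries = entries;
			alc->lock = 0;
		}
		return alc;	/* the caller must still handle NULL */
	}

	int main(void)
	{
		struct toy_alien_cache *alc = alloc_alien_cache(8);

		if (!alc)
			return 1;
		printf("entries = %d\n", alc->entries);
		free(alc);
		return 0;
	}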
@@ -2357,7 +2359,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2357 void *freelist; 2359 void *freelist;
2358 void *addr = page_address(page); 2360 void *addr = page_address(page);
2359 2361
2360 page->s_mem = kasan_reset_tag(addr) + colour_off; 2362 page->s_mem = addr + colour_off;
2361 page->active = 0; 2363 page->active = 0;
2362 2364
2363 if (OBJFREELIST_SLAB(cachep)) 2365 if (OBJFREELIST_SLAB(cachep))
@@ -2366,6 +2368,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2366 /* Slab management obj is off-slab. */ 2368 /* Slab management obj is off-slab. */
2367 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2369 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2368 local_flags, nodeid); 2370 local_flags, nodeid);
2371 freelist = kasan_reset_tag(freelist);
2369 if (!freelist) 2372 if (!freelist)
2370 return NULL; 2373 return NULL;
2371 } else { 2374 } else {
@@ -2679,6 +2682,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
2679 2682
2680 offset *= cachep->colour_off; 2683 offset *= cachep->colour_off;
2681 2684
2685 /*
2686 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
2687 * page_address() in the latter returns a non-tagged pointer,
2688 * as it should be for slab pages.
2689 */
2690 kasan_poison_slab(page);
2691
2682 /* Get slab management. */ 2692 /* Get slab management. */
2683 freelist = alloc_slabmgmt(cachep, page, offset, 2693 freelist = alloc_slabmgmt(cachep, page, offset,
2684 local_flags & ~GFP_CONSTRAINT_MASK, page_node); 2694 local_flags & ~GFP_CONSTRAINT_MASK, page_node);
@@ -2687,7 +2697,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
2687 2697
2688 slab_map_pages(cachep, page, freelist); 2698 slab_map_pages(cachep, page, freelist);
2689 2699
2690 kasan_poison_slab(page);
2691 cache_init_objs(cachep, page); 2700 cache_init_objs(cachep, page);
2692 2701
2693 if (gfpflags_allow_blocking(local_flags)) 2702 if (gfpflags_allow_blocking(local_flags))
@@ -3538,7 +3547,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3538{ 3547{
3539 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3548 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3540 3549
3541 ret = kasan_slab_alloc(cachep, ret, flags);
3542 trace_kmem_cache_alloc(_RET_IP_, ret, 3550 trace_kmem_cache_alloc(_RET_IP_, ret,
3543 cachep->object_size, cachep->size, flags); 3551 cachep->object_size, cachep->size, flags);
3544 3552
@@ -3628,7 +3636,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3628{ 3636{
3629 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3637 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3630 3638
3631 ret = kasan_slab_alloc(cachep, ret, flags);
3632 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3639 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3633 cachep->object_size, cachep->size, 3640 cachep->object_size, cachep->size,
3634 flags, nodeid); 3641 flags, nodeid);
@@ -4406,6 +4413,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
4406 unsigned int objnr; 4413 unsigned int objnr;
4407 unsigned long offset; 4414 unsigned long offset;
4408 4415
4416 ptr = kasan_reset_tag(ptr);
4417
4409 /* Find and validate object. */ 4418 /* Find and validate object. */
4410 cachep = page->slab_cache; 4419 cachep = page->slab_cache;
4411 objnr = obj_to_index(cachep, page, (void *)ptr); 4420 objnr = obj_to_index(cachep, page, (void *)ptr);
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..384105318779 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -437,11 +437,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
437 437
438 flags &= gfp_allowed_mask; 438 flags &= gfp_allowed_mask;
439 for (i = 0; i < size; i++) { 439 for (i = 0; i < size; i++) {
440 void *object = p[i]; 440 p[i] = kasan_slab_alloc(s, p[i], flags);
441 441 /* As p[i] might get tagged, call kmemleak hook after KASAN. */
442 kmemleak_alloc_recursive(object, s->object_size, 1, 442 kmemleak_alloc_recursive(p[i], s->object_size, 1,
443 s->flags, flags); 443 s->flags, flags);
444 p[i] = kasan_slab_alloc(s, object, flags);
445 } 444 }
446 445
447 if (memcg_kmem_enabled()) 446 if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 81732d05e74a..f9d89c1b5977 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1228,8 +1228,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1228 flags |= __GFP_COMP; 1228 flags |= __GFP_COMP;
1229 page = alloc_pages(flags, order); 1229 page = alloc_pages(flags, order);
1230 ret = page ? page_address(page) : NULL; 1230 ret = page ? page_address(page) : NULL;
1231 kmemleak_alloc(ret, size, 1, flags);
1232 ret = kasan_kmalloc_large(ret, size, flags); 1231 ret = kasan_kmalloc_large(ret, size, flags);
1232 /* As ret might get tagged, call kmemleak hook after KASAN. */
1233 kmemleak_alloc(ret, size, 1, flags);
1233 return ret; 1234 return ret;
1234} 1235}
1235EXPORT_SYMBOL(kmalloc_order); 1236EXPORT_SYMBOL(kmalloc_order);
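Both this hunk and the slab.h change above move the kmemleak call after the KASAN hook, because the hook may hand back a re-tagged pointer and the tracker should record the value callers will actually use. A standalone sketch of why the ordering matters, assuming a 64-bit platform where a toy tag can live in the unused top byte; tag_pointer() and track_allocation() are stand-ins, not kernel APIs:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Toy "KASAN": return a re-tagged representation of the pointer. */
	static void *tag_pointer(void *p)
	{
		return (void *)((uint64_t)(uintptr_t)p | (0xb8ULL << 56));
	}

	/* Toy "kmemleak": remember the pointer it was handed. */
	static void *tracked;
	static void track_allocation(void *p)
	{
		tracked = p;
	}

	int main(void)
	{
		void *raw = malloc(16);		/* keep the untagged pointer for free() */
		void *obj;

		if (!raw)
			return 1;

		/* Tag first, then track, so the tracker records the tagged pointer. */
		obj = tag_pointer(raw);
		track_allocation(obj);

		printf("tracked %p, live pointer %p -> %s\n", tracked, obj,
		       tracked == obj ? "match" : "mismatch");
		free(raw);
		return 0;
	}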
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..dc777761b6b7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
249 unsigned long ptr_addr) 249 unsigned long ptr_addr)
250{ 250{
251#ifdef CONFIG_SLAB_FREELIST_HARDENED 251#ifdef CONFIG_SLAB_FREELIST_HARDENED
252 return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); 252 /*
253 * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
254 * Normally, this doesn't cause any issues, as both set_freepointer()
255 * and get_freepointer() are called with a pointer with the same tag.
256 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
257 * example, when __free_slub() iterates over objects in a cache, it
258 * passes untagged pointers to check_object(). check_object() in turn
259 * calls get_freepointer() with an untagged pointer, which causes the
260 * freepointer to be restored incorrectly.
261 */
262 return (void *)((unsigned long)ptr ^ s->random ^
263 (unsigned long)kasan_reset_tag((void *)ptr_addr));
253#else 264#else
254 return ptr; 265 return ptr;
255#endif 266#endif
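The new comment in freelist_ptr() explains that the storage address must have its tag reset before being folded into the XOR mask, so that tagged and untagged callers decode the same value. A standalone sketch of that obfuscation, assuming 64-bit values with a toy tag in the top byte; reset_tag() stands in for kasan_reset_tag():

	#include <stdint.h>
	#include <stdio.h>

	#define TAG_MASK (0xffULL << 56)

	/* Toy version of kasan_reset_tag(): drop the tag byte from the address. */
	static uint64_t reset_tag(uint64_t addr)
	{
		return addr & ~TAG_MASK;
	}

	/* XOR obfuscation of a freelist pointer, keyed by a secret and the slot address. */
	static uint64_t toy_freelist_ptr(uint64_t ptr, uint64_t secret, uint64_t slot_addr)
	{
		return ptr ^ secret ^ reset_tag(slot_addr);
	}

	int main(void)
	{
		uint64_t secret        = 0x5eedf00ddeadbeefULL;
		uint64_t next_obj      = 0x0000700000001040ULL;	/* value being stored */
		uint64_t slot_tagged   = 0xb800700000001000ULL;	/* tagged slot address */
		uint64_t slot_untagged = reset_tag(slot_tagged);	/* same slot, no tag */

		/* Encode via the tagged address, decode via the untagged one. */
		uint64_t stored  = toy_freelist_ptr(next_obj, secret, slot_tagged);
		uint64_t decoded = toy_freelist_ptr(stored, secret, slot_untagged);

		printf("decoded %#llx, expected %#llx -> %s\n",
		       (unsigned long long)decoded, (unsigned long long)next_obj,
		       decoded == next_obj ? "round-trip ok" : "corrupted");
		return 0;
	}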
@@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
303 __p < (__addr) + (__objects) * (__s)->size; \ 314 __p < (__addr) + (__objects) * (__s)->size; \
304 __p += (__s)->size) 315 __p += (__s)->size)
305 316
306#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
307 for (__p = fixup_red_left(__s, __addr), __idx = 1; \
308 __idx <= __objects; \
309 __p += (__s)->size, __idx++)
310
311/* Determine object index from a given position */ 317/* Determine object index from a given position */
312static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) 318static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
313{ 319{
314 return (p - addr) / s->size; 320 return (kasan_reset_tag(p) - addr) / s->size;
315} 321}
316 322
317static inline unsigned int order_objects(unsigned int order, unsigned int size) 323static inline unsigned int order_objects(unsigned int order, unsigned int size)
@@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
507 return 1; 513 return 1;
508 514
509 base = page_address(page); 515 base = page_address(page);
516 object = kasan_reset_tag(object);
510 object = restore_red_left(s, object); 517 object = restore_red_left(s, object);
511 if (object < base || object >= base + page->objects * s->size || 518 if (object < base || object >= base + page->objects * s->size ||
512 (object - base) % s->size) { 519 (object - base) % s->size) {
@@ -1075,6 +1082,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1075 init_tracking(s, object); 1082 init_tracking(s, object);
1076} 1083}
1077 1084
1085static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
1086{
1087 if (!(s->flags & SLAB_POISON))
1088 return;
1089
1090 metadata_access_enable();
1091 memset(addr, POISON_INUSE, PAGE_SIZE << order);
1092 metadata_access_disable();
1093}
1094
1078static inline int alloc_consistency_checks(struct kmem_cache *s, 1095static inline int alloc_consistency_checks(struct kmem_cache *s,
1079 struct page *page, 1096 struct page *page,
1080 void *object, unsigned long addr) 1097 void *object, unsigned long addr)
@@ -1330,6 +1347,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
1330#else /* !CONFIG_SLUB_DEBUG */ 1347#else /* !CONFIG_SLUB_DEBUG */
1331static inline void setup_object_debug(struct kmem_cache *s, 1348static inline void setup_object_debug(struct kmem_cache *s,
1332 struct page *page, void *object) {} 1349 struct page *page, void *object) {}
1350static inline void setup_page_debug(struct kmem_cache *s,
1351 void *addr, int order) {}
1333 1352
1334static inline int alloc_debug_processing(struct kmem_cache *s, 1353static inline int alloc_debug_processing(struct kmem_cache *s,
1335 struct page *page, void *object, unsigned long addr) { return 0; } 1354 struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1374,8 +1393,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1374 */ 1393 */
1375static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1394static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1376{ 1395{
1396 ptr = kasan_kmalloc_large(ptr, size, flags);
1397 /* As ptr might get tagged, call kmemleak hook after KASAN. */
1377 kmemleak_alloc(ptr, size, 1, flags); 1398 kmemleak_alloc(ptr, size, 1, flags);
1378 return kasan_kmalloc_large(ptr, size, flags); 1399 return ptr;
1379} 1400}
1380 1401
1381static __always_inline void kfree_hook(void *x) 1402static __always_inline void kfree_hook(void *x)
@@ -1641,27 +1662,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1641 if (page_is_pfmemalloc(page)) 1662 if (page_is_pfmemalloc(page))
1642 SetPageSlabPfmemalloc(page); 1663 SetPageSlabPfmemalloc(page);
1643 1664
1644 start = page_address(page); 1665 kasan_poison_slab(page);
1645 1666
1646 if (unlikely(s->flags & SLAB_POISON)) 1667 start = page_address(page);
1647 memset(start, POISON_INUSE, PAGE_SIZE << order);
1648 1668
1649 kasan_poison_slab(page); 1669 setup_page_debug(s, start, order);
1650 1670
1651 shuffle = shuffle_freelist(s, page); 1671 shuffle = shuffle_freelist(s, page);
1652 1672
1653 if (!shuffle) { 1673 if (!shuffle) {
1654 for_each_object_idx(p, idx, s, start, page->objects) {
1655 if (likely(idx < page->objects)) {
1656 next = p + s->size;
1657 next = setup_object(s, page, next);
1658 set_freepointer(s, p, next);
1659 } else
1660 set_freepointer(s, p, NULL);
1661 }
1662 start = fixup_red_left(s, start); 1674 start = fixup_red_left(s, start);
1663 start = setup_object(s, page, start); 1675 start = setup_object(s, page, start);
1664 page->freelist = start; 1676 page->freelist = start;
1677 for (idx = 0, p = start; idx < page->objects - 1; idx++) {
1678 next = p + s->size;
1679 next = setup_object(s, page, next);
1680 set_freepointer(s, p, next);
1681 p = next;
1682 }
1683 set_freepointer(s, p, NULL);
1665 } 1684 }
1666 1685
1667 page->inuse = page->objects; 1686 page->inuse = page->objects;
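The rewritten non-shuffled path above links each object to the next and terminates the last one with NULL. A standalone sketch of the same chaining over a flat buffer, where object size and count are arbitrary and memcpy() stands in for set_freepointer()/get_freepointer():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define OBJ_SIZE 32
	#define NR_OBJS  4

	int main(void)
	{
		char *slab = calloc(NR_OBJS, OBJ_SIZE);
		char *p, *next, *start;
		int idx;

		if (!slab)
			return 1;

		/* Link object idx to object idx + 1; the last object ends the list. */
		start = slab;
		for (idx = 0, p = start; idx < NR_OBJS - 1; idx++) {
			next = p + OBJ_SIZE;
			memcpy(p, &next, sizeof(next));	/* set_freepointer() stand-in */
			p = next;
		}
		next = NULL;
		memcpy(p, &next, sizeof(next));

		/* Walk the chain to show every object is reachable exactly once. */
		for (p = start; p; memcpy(&p, p, sizeof(p)))
			printf("object at offset %ld\n", (long)(p - slab));

		free(slab);
		return 0;
	}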
@@ -3846,6 +3865,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3846 unsigned int offset; 3865 unsigned int offset;
3847 size_t object_size; 3866 size_t object_size;
3848 3867
3868 ptr = kasan_reset_tag(ptr);
3869
3849 /* Find object and usable object size. */ 3870 /* Find object and usable object size. */
3850 s = page->slab_cache; 3871 s = page->slab_cache;
3851 3872
diff --git a/mm/swap.c b/mm/swap.c
index 4929bc1be60e..4d7d37eb3c40 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu)
320{ 320{
321} 321}
322 322
323static bool need_activate_page_drain(int cpu)
324{
325 return false;
326}
327
328void activate_page(struct page *page) 323void activate_page(struct page *page)
329{ 324{
330 struct zone *zone = page_zone(page); 325 struct zone *zone = page_zone(page);
@@ -653,13 +648,15 @@ void lru_add_drain(void)
653 put_cpu(); 648 put_cpu();
654} 649}
655 650
651#ifdef CONFIG_SMP
652
653static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
654
656static void lru_add_drain_per_cpu(struct work_struct *dummy) 655static void lru_add_drain_per_cpu(struct work_struct *dummy)
657{ 656{
658 lru_add_drain(); 657 lru_add_drain();
659} 658}
660 659
661static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
662
663/* 660/*
664 * Doesn't need any cpu hotplug locking because we do rely on per-cpu 661 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
665 * kworkers being shut down before our page_alloc_cpu_dead callback is 662 * kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -702,6 +699,12 @@ void lru_add_drain_all(void)
702 699
703 mutex_unlock(&lock); 700 mutex_unlock(&lock);
704} 701}
702#else
703void lru_add_drain_all(void)
704{
705 lru_add_drain();
706}
707#endif
705 708
706/** 709/**
707 * release_pages - batched put_page() 710 * release_pages - batched put_page()
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
247/* 247/*
248 * Validates that the given object is: 248 * Validates that the given object is:
249 * - not bogus address 249 * - not bogus address
250 * - known-safe heap or stack object 250 * - fully contained by stack (or stack frame, when available)
251 * - fully within SLAB object (or object whitelist area, when available)
251 * - not in kernel text 252 * - not in kernel text
252 */ 253 */
253void __check_object_size(const void *ptr, unsigned long n, bool to_user) 254void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
262 /* Check for invalid addresses. */ 263 /* Check for invalid addresses. */
263 check_bogus_address((const unsigned long)ptr, n, to_user); 264 check_bogus_address((const unsigned long)ptr, n, to_user);
264 265
265 /* Check for bad heap object. */
266 check_heap_object(ptr, n, to_user);
267
268 /* Check for bad stack object. */ 266 /* Check for bad stack object. */
269 switch (check_stack_object(ptr, n)) { 267 switch (check_stack_object(ptr, n)) {
270 case NOT_STACK: 268 case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
282 usercopy_abort("process stack", NULL, to_user, 0, n); 280 usercopy_abort("process stack", NULL, to_user, 0, n);
283 } 281 }
284 282
283 /* Check for bad heap object. */
284 check_heap_object(ptr, n, to_user);
285
285 /* Check for object in kernel to avoid text exposure. */ 286 /* Check for object in kernel to avoid text exposure. */
286 check_kernel_text_object((const unsigned long)ptr, n, to_user); 287 check_kernel_text_object((const unsigned long)ptr, n, to_user);
287} 288}
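The reordered checks above run the stack validation before the heap one, so a pointer proven to sit inside a stack frame never reaches the slab check. A toy sketch of that pipeline: all four checks are reduced to stubs, only the ordering is taken from the hunk, and the early return on a validated stack object is an assumption based on the surrounding switch:

	#include <stdbool.h>
	#include <stdio.h>

	/* Stubs standing in for the real checks; they only report that they ran. */
	static void check_bogus(void)		{ puts("bogus-address check"); }
	static bool object_is_on_stack(void)	{ puts("stack check"); return false; }
	static void check_heap(void)		{ puts("heap (slab) check"); }
	static void check_text(void)		{ puts("kernel-text check"); }

	static void toy_check_object_size(void)
	{
		check_bogus();

		/* Stack objects are validated first and return early ... */
		if (object_is_on_stack())
			return;

		/* ... so only non-stack pointers fall through to the slab check. */
		check_heap();
		check_text();
	}

	int main(void)
	{
		toy_check_object_size();
		return 0;
	}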
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
267 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 267 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
268 268
269 /* 269 /*
270 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. 270 * Serialize via hugetlb_fault_mutex
271 * i_mmap_rwsem ensures the dst_pte remains valid even
272 * in the case of shared pmds. fault mutex prevents
273 * races with other faulting threads.
274 */ 271 */
275 mapping = dst_vma->vm_file->f_mapping;
276 i_mmap_lock_read(mapping);
277 idx = linear_page_index(dst_vma, dst_addr); 272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping;
278 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
279 idx, dst_addr); 275 idx, dst_addr);
280 mutex_lock(&hugetlb_fault_mutex_table[hash]); 276 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
283 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 279 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
284 if (!dst_pte) { 280 if (!dst_pte) {
285 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 281 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
286 i_mmap_unlock_read(mapping);
287 goto out_unlock; 282 goto out_unlock;
288 } 283 }
289 284
@@ -291,7 +286,6 @@ retry:
291 dst_pteval = huge_ptep_get(dst_pte); 286 dst_pteval = huge_ptep_get(dst_pte);
292 if (!huge_pte_none(dst_pteval)) { 287 if (!huge_pte_none(dst_pteval)) {
293 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 288 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
294 i_mmap_unlock_read(mapping);
295 goto out_unlock; 289 goto out_unlock;
296 } 290 }
297 291
@@ -299,7 +293,6 @@ retry:
299 dst_addr, src_addr, &page); 293 dst_addr, src_addr, &page);
300 294
301 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 295 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
302 i_mmap_unlock_read(mapping);
303 vm_alloc_shared = vm_shared; 296 vm_alloc_shared = vm_shared;
304 297
305 cond_resched(); 298 cond_resched();
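The shortened comment still relies on hugetlb_fault_mutex for serialization: hash the (mapping, index) pair, take the corresponding mutex, do the work, drop it. A minimal pthread sketch of that hashed-mutex pattern; the hash and table size are made up and unrelated to hugetlb_fault_mutex_hash():

	#include <pthread.h>
	#include <stdio.h>

	#define FAULT_MUTEX_SLOTS 8

	static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_SLOTS];

	/* Toy hash over the (mapping, index) pair; the real kernel hash differs. */
	static unsigned int fault_mutex_hash(unsigned long mapping, unsigned long idx)
	{
		return (unsigned int)((mapping ^ (idx * 2654435761UL)) % FAULT_MUTEX_SLOTS);
	}

	static void handle_fault(unsigned long mapping, unsigned long idx)
	{
		unsigned int hash = fault_mutex_hash(mapping, idx);

		pthread_mutex_lock(&fault_mutex_table[hash]);
		/* ... look up or allocate the PTE and copy the page here ... */
		printf("page %lu of mapping %#lx handled under mutex %u\n",
		       idx, mapping, hash);
		pthread_mutex_unlock(&fault_mutex_table[hash]);
	}

	int main(void)
	{
		unsigned int i;

		for (i = 0; i < FAULT_MUTEX_SLOTS; i++)
			pthread_mutex_init(&fault_mutex_table[i], NULL);

		handle_fault(0xabcd000UL, 0);
		handle_fault(0xabcd000UL, 1);
		return 0;
	}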
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..379319b1bcfd 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -150,7 +150,7 @@ void *memdup_user(const void __user *src, size_t len)
150{ 150{
151 void *p; 151 void *p;
152 152
153 p = kmalloc_track_caller(len, GFP_USER); 153 p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
154 if (!p) 154 if (!p)
155 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
156 156
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
478 return true; 478 return true;
479 if (PageHuge(page)) 479 if (PageHuge(page))
480 return false; 480 return false;
481 for (i = 0; i < hpage_nr_pages(page); i++) { 481 for (i = 0; i < (1 << compound_order(page)); i++) {
482 if (atomic_read(&page[i]._mapcount) >= 0) 482 if (atomic_read(&page[i]._mapcount) >= 0)
483 return true; 483 return true;
484 } 484 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a714c4f800e9..e979705bbf32 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
491 delta = freeable / 2; 491 delta = freeable / 2;
492 } 492 }
493 493
494 /*
495 * Make sure we apply some minimal pressure on default priority
496 * even on small cgroups. Stale objects are not only consuming memory
497 * by themselves, but can also hold a reference to a dying cgroup,
498 * preventing it from being reclaimed. A dying cgroup with all
499 * corresponding structures like per-cpu stats and kmem caches
500 * can be really big, so it may lead to a significant waste of memory.
501 */
502 delta = max_t(unsigned long long, delta, min(freeable, batch_size));
503
504 total_scan += delta; 494 total_scan += delta;
505 if (total_scan < 0) { 495 if (total_scan < 0) {
506 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 496 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",