author		Linus Torvalds <torvalds@linux-foundation.org>	2018-12-28 19:55:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-12-28 19:55:46 -0500
commit		f346b0becb1bc62e45495f9cdbae3eef35d0b635 (patch)
tree		ae79f3dfb8e031da51d38f0f095f89d7d23f3643 /mm
parent		00d59fde8532b2d42e80909d2e58678755e04da9 (diff)
parent		0f4991e8fd48987ae476a92cdee6bfec4aff31b8 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:

 - large KASAN update to use arm's "software tag-based mode"
 - a few misc things
 - sh updates
 - ocfs2 updates
 - just about all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (167 commits)
  kernel/fork.c: mark 'stack_vm_area' with __maybe_unused
  memcg, oom: notify on oom killer invocation from the charge path
  mm, swap: fix swapoff with KSM pages
  include/linux/gfp.h: fix typo
  mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm
  hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
  hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
  memory_hotplug: add missing newlines to debugging output
  mm: remove __hugepage_set_anon_rmap()
  include/linux/vmstat.h: remove unused page state adjustment macro
  mm/page_alloc.c: allow error injection
  mm: migrate: drop unused argument of migrate_page_move_mapping()
  blkdev: avoid migration stalls for blkdev pages
  mm: migrate: provide buffer_migrate_page_norefs()
  mm: migrate: move migrate_page_lock_buffers()
  mm: migrate: lock buffers before migrate_page_move_mapping()
  mm: migration: factor out code to compute expected number of page references
  mm, page_alloc: enable pcpu_drain with zone capability
  kmemleak: add config to select auto scan
  mm/page_alloc.c: don't call kasan_free_pages() at deferred mem init
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/cma.c | 11
-rw-r--r--  mm/compaction.c | 2
-rw-r--r--  mm/debug.c | 27
-rw-r--r--  mm/filemap.c | 96
-rw-r--r--  mm/highmem.c | 5
-rw-r--r--  mm/hmm.c | 331
-rw-r--r--  mm/huge_memory.c | 74
-rw-r--r--  mm/hugetlb.c | 133
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/kasan/Makefile | 15
-rw-r--r--  mm/kasan/common.c (renamed from mm/kasan/kasan.c) | 656
-rw-r--r--  mm/kasan/generic.c | 344
-rw-r--r--  mm/kasan/generic_report.c | 153
-rw-r--r--  mm/kasan/init.c (renamed from mm/kasan/kasan_init.c) | 71
-rw-r--r--  mm/kasan/kasan.h | 59
-rw-r--r--  mm/kasan/quarantine.c | 3
-rw-r--r--  mm/kasan/report.c | 272
-rw-r--r--  mm/kasan/tags.c | 161
-rw-r--r--  mm/kasan/tags_report.c | 58
-rw-r--r--  mm/khugepaged.c | 10
-rw-r--r--  mm/kmemleak.c | 19
-rw-r--r--  mm/ksm.c | 35
-rw-r--r--  mm/madvise.c | 21
-rw-r--r--  mm/memblock.c | 52
-rw-r--r--  mm/memcontrol.c | 53
-rw-r--r--  mm/memory-failure.c | 16
-rw-r--r--  mm/memory.c | 103
-rw-r--r--  mm/memory_hotplug.c | 172
-rw-r--r--  mm/migrate.c | 264
-rw-r--r--  mm/mm_init.c | 2
-rw-r--r--  mm/mmap.c | 16
-rw-r--r--  mm/mmu_notifier.c | 31
-rw-r--r--  mm/mprotect.c | 15
-rw-r--r--  mm/mremap.c | 10
-rw-r--r--  mm/oom_kill.c | 51
-rw-r--r--  mm/page-writeback.c | 35
-rw-r--r--  mm/page_alloc.c | 404
-rw-r--r--  mm/page_isolation.c | 10
-rw-r--r--  mm/page_owner.c | 1
-rw-r--r--  mm/readahead.c | 12
-rw-r--r--  mm/rmap.c | 59
-rw-r--r--  mm/shmem.c | 8
-rw-r--r--  mm/slab.c | 31
-rw-r--r--  mm/slab.h | 2
-rw-r--r--  mm/slab_common.c | 10
-rw-r--r--  mm/slub.c | 82
-rw-r--r--  mm/sparse.c | 26
-rw-r--r--  mm/swap.c | 2
-rw-r--r--  mm/swapfile.c | 6
-rw-r--r--  mm/userfaultfd.c | 11
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 143
-rw-r--r--  mm/vmstat.c | 4
-rw-r--r--  mm/workingset.c | 2
-rw-r--r--  mm/zswap.c | 4
57 files changed, 2454 insertions(+), 1770 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..25c71eb8a7db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -291,6 +291,7 @@ config MMU_NOTIFIER
291config KSM 291config KSM
292 bool "Enable KSM for page merging" 292 bool "Enable KSM for page merging"
293 depends on MMU 293 depends on MMU
294 select XXHASH
294 help 295 help
295 Enable Kernel Samepage Merging: KSM periodically scans those areas 296 Enable Kernel Samepage Merging: KSM periodically scans those areas
296 of an application's address space that an app has advised may be 297 of an application's address space that an app has advised may be
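
The new "select XXHASH" line above makes the xxhash library available whenever CONFIG_KSM is set; the accompanying mm/ksm.c change (35 lines in the diffstat, not shown here) switches KSM's page checksumming over to it. A hedged, hypothetical illustration of that kind of helper (simplified, kernel context assumed, not the actual ksm.c code):

	#include <linux/mm.h>		/* PAGE_SIZE */
	#include <linux/xxhash.h>

	/* hypothetical, simplified page-fingerprint helper */
	static u32 page_checksum(const void *addr)
	{
		/* xxhash(input, length, seed) from lib/xxhash */
		return (u32)xxhash(addr, PAGE_SIZE, 0);
	}
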
diff --git a/mm/cma.c b/mm/cma.c
index 4cb76121a3ab..c7b39dd3b4f6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -407,6 +407,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
407 unsigned long pfn = -1; 407 unsigned long pfn = -1;
408 unsigned long start = 0; 408 unsigned long start = 0;
409 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 409 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
410 size_t i;
410 struct page *page = NULL; 411 struct page *page = NULL;
411 int ret = -ENOMEM; 412 int ret = -ENOMEM;
412 413
@@ -466,6 +467,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
466 467
467 trace_cma_alloc(pfn, page, count, align); 468 trace_cma_alloc(pfn, page, count, align);
468 469
470 /*
471 * CMA can allocate multiple page blocks, which results in different
472 * blocks being marked with different tags. Reset the tags to ignore
473 * those page blocks.
474 */
475 if (page) {
476 for (i = 0; i < count; i++)
477 page_kasan_tag_reset(page + i);
478 }
479
469 if (ret && !no_warn) { 480 if (ret && !no_warn) {
470 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", 481 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
471 __func__, count, ret); 482 __func__, count, ret);
diff --git a/mm/compaction.c b/mm/compaction.c
index 7c607479de4a..ef29490b0f46 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1431,7 +1431,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
1431 if (is_via_compact_memory(order)) 1431 if (is_via_compact_memory(order))
1432 return COMPACT_CONTINUE; 1432 return COMPACT_CONTINUE;
1433 1433
1434 watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1434 watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
1435 /* 1435 /*
1436 * If watermarks for high-order allocation are already met, there 1436 * If watermarks for high-order allocation are already met, there
1437 * should be no need for compaction at all. 1437 * should be no need for compaction at all.
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..0abb987dad9b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -17,7 +17,7 @@
17 17
18#include "internal.h" 18#include "internal.h"
19 19
20char *migrate_reason_names[MR_TYPES] = { 20const char *migrate_reason_names[MR_TYPES] = {
21 "compaction", 21 "compaction",
22 "memory_failure", 22 "memory_failure",
23 "memory_hotplug", 23 "memory_hotplug",
@@ -44,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
44 44
45void __dump_page(struct page *page, const char *reason) 45void __dump_page(struct page *page, const char *reason)
46{ 46{
47 struct address_space *mapping = page_mapping(page);
47 bool page_poisoned = PagePoisoned(page); 48 bool page_poisoned = PagePoisoned(page);
48 int mapcount; 49 int mapcount;
49 50
@@ -53,7 +54,7 @@ void __dump_page(struct page *page, const char *reason)
53 * dump_page() when detected. 54 * dump_page() when detected.
54 */ 55 */
55 if (page_poisoned) { 56 if (page_poisoned) {
56 pr_emerg("page:%px is uninitialized and poisoned", page); 57 pr_warn("page:%px is uninitialized and poisoned", page);
57 goto hex_only; 58 goto hex_only;
58 } 59 }
59 60
@@ -64,27 +65,39 @@ void __dump_page(struct page *page, const char *reason)
64 */ 65 */
65 mapcount = PageSlab(page) ? 0 : page_mapcount(page); 66 mapcount = PageSlab(page) ? 0 : page_mapcount(page);
66 67
67 pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", 68 pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
68 page, page_ref_count(page), mapcount, 69 page, page_ref_count(page), mapcount,
69 page->mapping, page_to_pgoff(page)); 70 page->mapping, page_to_pgoff(page));
70 if (PageCompound(page)) 71 if (PageCompound(page))
71 pr_cont(" compound_mapcount: %d", compound_mapcount(page)); 72 pr_cont(" compound_mapcount: %d", compound_mapcount(page));
72 pr_cont("\n"); 73 pr_cont("\n");
74 if (PageAnon(page))
75 pr_warn("anon ");
76 else if (PageKsm(page))
77 pr_warn("ksm ");
78 else if (mapping) {
79 pr_warn("%ps ", mapping->a_ops);
80 if (mapping->host->i_dentry.first) {
81 struct dentry *dentry;
82 dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
83 pr_warn("name:\"%pd\" ", dentry);
84 }
85 }
73 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); 86 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
74 87
75 pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); 88 pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
76 89
77hex_only: 90hex_only:
78 print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, 91 print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
79 sizeof(unsigned long), page, 92 sizeof(unsigned long), page,
80 sizeof(struct page), false); 93 sizeof(struct page), false);
81 94
82 if (reason) 95 if (reason)
83 pr_alert("page dumped because: %s\n", reason); 96 pr_warn("page dumped because: %s\n", reason);
84 97
85#ifdef CONFIG_MEMCG 98#ifdef CONFIG_MEMCG
86 if (!page_poisoned && page->mem_cgroup) 99 if (!page_poisoned && page->mem_cgroup)
87 pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); 100 pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
88#endif 101#endif
89} 102}
90 103
diff --git a/mm/filemap.c b/mm/filemap.c
index 81adec8ee02c..29655fb47a2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -981,7 +981,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
981 if (wait_page->bit_nr != key->bit_nr) 981 if (wait_page->bit_nr != key->bit_nr)
982 return 0; 982 return 0;
983 983
984 /* Stop walking if it's locked */ 984 /*
985 * Stop walking if it's locked.
986 * Is this safe if put_and_wait_on_page_locked() is in use?
987 * Yes: the waker must hold a reference to this page, and if PG_locked
988 * has now already been set by another task, that task must also hold
989 * a reference to the *same usage* of this page; so there is no need
990 * to walk on to wake even the put_and_wait_on_page_locked() callers.
991 */
985 if (test_bit(key->bit_nr, &key->page->flags)) 992 if (test_bit(key->bit_nr, &key->page->flags))
986 return -1; 993 return -1;
987 994
@@ -1049,25 +1056,44 @@ static void wake_up_page(struct page *page, int bit)
1049 wake_up_page_bit(page, bit); 1056 wake_up_page_bit(page, bit);
1050} 1057}
1051 1058
1059/*
1060 * A choice of three behaviors for wait_on_page_bit_common():
1061 */
1062enum behavior {
1063 EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
1064 * __lock_page() waiting on then setting PG_locked.
1065 */
1066 SHARED, /* Hold ref to page and check the bit when woken, like
1067 * wait_on_page_writeback() waiting on PG_writeback.
1068 */
1069 DROP, /* Drop ref to page before wait, no check when woken,
1070 * like put_and_wait_on_page_locked() on PG_locked.
1071 */
1072};
1073
1052static inline int wait_on_page_bit_common(wait_queue_head_t *q, 1074static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1053 struct page *page, int bit_nr, int state, bool lock) 1075 struct page *page, int bit_nr, int state, enum behavior behavior)
1054{ 1076{
1055 struct wait_page_queue wait_page; 1077 struct wait_page_queue wait_page;
1056 wait_queue_entry_t *wait = &wait_page.wait; 1078 wait_queue_entry_t *wait = &wait_page.wait;
1079 bool bit_is_set;
1057 bool thrashing = false; 1080 bool thrashing = false;
1081 bool delayacct = false;
1058 unsigned long pflags; 1082 unsigned long pflags;
1059 int ret = 0; 1083 int ret = 0;
1060 1084
1061 if (bit_nr == PG_locked && 1085 if (bit_nr == PG_locked &&
1062 !PageUptodate(page) && PageWorkingset(page)) { 1086 !PageUptodate(page) && PageWorkingset(page)) {
1063 if (!PageSwapBacked(page)) 1087 if (!PageSwapBacked(page)) {
1064 delayacct_thrashing_start(); 1088 delayacct_thrashing_start();
1089 delayacct = true;
1090 }
1065 psi_memstall_enter(&pflags); 1091 psi_memstall_enter(&pflags);
1066 thrashing = true; 1092 thrashing = true;
1067 } 1093 }
1068 1094
1069 init_wait(wait); 1095 init_wait(wait);
1070 wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; 1096 wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
1071 wait->func = wake_page_function; 1097 wait->func = wake_page_function;
1072 wait_page.page = page; 1098 wait_page.page = page;
1073 wait_page.bit_nr = bit_nr; 1099 wait_page.bit_nr = bit_nr;
@@ -1084,14 +1110,17 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1084 1110
1085 spin_unlock_irq(&q->lock); 1111 spin_unlock_irq(&q->lock);
1086 1112
1087 if (likely(test_bit(bit_nr, &page->flags))) { 1113 bit_is_set = test_bit(bit_nr, &page->flags);
1114 if (behavior == DROP)
1115 put_page(page);
1116
1117 if (likely(bit_is_set))
1088 io_schedule(); 1118 io_schedule();
1089 }
1090 1119
1091 if (lock) { 1120 if (behavior == EXCLUSIVE) {
1092 if (!test_and_set_bit_lock(bit_nr, &page->flags)) 1121 if (!test_and_set_bit_lock(bit_nr, &page->flags))
1093 break; 1122 break;
1094 } else { 1123 } else if (behavior == SHARED) {
1095 if (!test_bit(bit_nr, &page->flags)) 1124 if (!test_bit(bit_nr, &page->flags))
1096 break; 1125 break;
1097 } 1126 }
@@ -1100,12 +1129,23 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1100 ret = -EINTR; 1129 ret = -EINTR;
1101 break; 1130 break;
1102 } 1131 }
1132
1133 if (behavior == DROP) {
1134 /*
1135 * We can no longer safely access page->flags:
1136 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
1137 * there is a risk of waiting forever on a page reused
1138 * for something that keeps it locked indefinitely.
1139 * But best check for -EINTR above before breaking.
1140 */
1141 break;
1142 }
1103 } 1143 }
1104 1144
1105 finish_wait(q, wait); 1145 finish_wait(q, wait);
1106 1146
1107 if (thrashing) { 1147 if (thrashing) {
1108 if (!PageSwapBacked(page)) 1148 if (delayacct)
1109 delayacct_thrashing_end(); 1149 delayacct_thrashing_end();
1110 psi_memstall_leave(&pflags); 1150 psi_memstall_leave(&pflags);
1111 } 1151 }
@@ -1124,18 +1164,37 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1124void wait_on_page_bit(struct page *page, int bit_nr) 1164void wait_on_page_bit(struct page *page, int bit_nr)
1125{ 1165{
1126 wait_queue_head_t *q = page_waitqueue(page); 1166 wait_queue_head_t *q = page_waitqueue(page);
1127 wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); 1167 wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
1128} 1168}
1129EXPORT_SYMBOL(wait_on_page_bit); 1169EXPORT_SYMBOL(wait_on_page_bit);
1130 1170
1131int wait_on_page_bit_killable(struct page *page, int bit_nr) 1171int wait_on_page_bit_killable(struct page *page, int bit_nr)
1132{ 1172{
1133 wait_queue_head_t *q = page_waitqueue(page); 1173 wait_queue_head_t *q = page_waitqueue(page);
1134 return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); 1174 return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
1135} 1175}
1136EXPORT_SYMBOL(wait_on_page_bit_killable); 1176EXPORT_SYMBOL(wait_on_page_bit_killable);
1137 1177
1138/** 1178/**
1179 * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
1180 * @page: The page to wait for.
1181 *
1182 * The caller should hold a reference on @page. They expect the page to
1183 * become unlocked relatively soon, but do not wish to hold up migration
1184 * (for example) by holding the reference while waiting for the page to
1185 * come unlocked. After this function returns, the caller should not
1186 * dereference @page.
1187 */
1188void put_and_wait_on_page_locked(struct page *page)
1189{
1190 wait_queue_head_t *q;
1191
1192 page = compound_head(page);
1193 q = page_waitqueue(page);
1194 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
1195}
1196
1197/**
1139 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 1198 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
1140 * @page: Page defining the wait queue of interest 1199 * @page: Page defining the wait queue of interest
1141 * @waiter: Waiter to add to the queue 1200 * @waiter: Waiter to add to the queue
@@ -1264,7 +1323,8 @@ void __lock_page(struct page *__page)
1264{ 1323{
1265 struct page *page = compound_head(__page); 1324 struct page *page = compound_head(__page);
1266 wait_queue_head_t *q = page_waitqueue(page); 1325 wait_queue_head_t *q = page_waitqueue(page);
1267 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); 1326 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
1327 EXCLUSIVE);
1268} 1328}
1269EXPORT_SYMBOL(__lock_page); 1329EXPORT_SYMBOL(__lock_page);
1270 1330
@@ -1272,7 +1332,8 @@ int __lock_page_killable(struct page *__page)
1272{ 1332{
1273 struct page *page = compound_head(__page); 1333 struct page *page = compound_head(__page);
1274 wait_queue_head_t *q = page_waitqueue(page); 1334 wait_queue_head_t *q = page_waitqueue(page);
1275 return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); 1335 return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
1336 EXCLUSIVE);
1276} 1337}
1277EXPORT_SYMBOL_GPL(__lock_page_killable); 1338EXPORT_SYMBOL_GPL(__lock_page_killable);
1278 1339
@@ -1540,7 +1601,7 @@ repeat:
1540 VM_BUG_ON_PAGE(page->index != offset, page); 1601 VM_BUG_ON_PAGE(page->index != offset, page);
1541 } 1602 }
1542 1603
1543 if (page && (fgp_flags & FGP_ACCESSED)) 1604 if (fgp_flags & FGP_ACCESSED)
1544 mark_page_accessed(page); 1605 mark_page_accessed(page);
1545 1606
1546no_page: 1607no_page:
@@ -2553,6 +2614,13 @@ void filemap_map_pages(struct vm_fault *vmf,
2553 goto next; 2614 goto next;
2554 2615
2555 head = compound_head(page); 2616 head = compound_head(page);
2617
2618 /*
2619 * Check for a locked page first, as a speculative
2620 * reference may adversely influence page migration.
2621 */
2622 if (PageLocked(head))
2623 goto next;
2556 if (!page_cache_get_speculative(head)) 2624 if (!page_cache_get_speculative(head))
2557 goto next; 2625 goto next;
2558 2626
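
The put_and_wait_on_page_locked() helper introduced above is for callers that only want to wait for PG_locked to clear and have no further use for the page: it drops the caller's reference before sleeping (the DROP behavior), so the page is not pinned against migration while the caller waits. The huge_memory.c hunks below show the conversion; reduced to a sketch of the calling pattern (kernel context assumed, surrounding code elided):

	/* old pattern: reference held across the whole wait */
	if (!get_page_unless_zero(page))
		goto out_unlock;
	spin_unlock(vmf->ptl);
	wait_on_page_locked(page);
	put_page(page);

	/* new pattern: the helper puts the page, then waits */
	if (!get_page_unless_zero(page))
		goto out_unlock;
	spin_unlock(vmf->ptl);
	put_and_wait_on_page_locked(page);
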
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..107b10f9878e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
105} 105}
106#endif 106#endif
107 107
108unsigned long totalhigh_pages __read_mostly; 108atomic_long_t _totalhigh_pages __read_mostly;
109EXPORT_SYMBOL(totalhigh_pages); 109EXPORT_SYMBOL(_totalhigh_pages);
110
111 110
112EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); 111EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
113 112
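
In the highmem.c hunk above, the plain "unsigned long totalhigh_pages" counter becomes an atomic_long_t named _totalhigh_pages, mirroring the totalram_pages conversion visible elsewhere in this series (see the totalram_pages() call in the huge_memory.c hunk below). Readers are expected to go through a small accessor rather than the variable itself; a plausible shape for it, stated only as an assumption since the include/linux/highmem.h side is not part of this diff:

	/* assumed accessor, normally supplied by include/linux/highmem.h */
	static inline unsigned long totalhigh_pages(void)
	{
		return (unsigned long)atomic_long_read(&_totalhigh_pages);
	}
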
diff --git a/mm/hmm.c b/mm/hmm.c
index 90c34f3d1243..a04e4b810610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -189,35 +189,30 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
189} 189}
190 190
191static int hmm_invalidate_range_start(struct mmu_notifier *mn, 191static int hmm_invalidate_range_start(struct mmu_notifier *mn,
192 struct mm_struct *mm, 192 const struct mmu_notifier_range *range)
193 unsigned long start,
194 unsigned long end,
195 bool blockable)
196{ 193{
197 struct hmm_update update; 194 struct hmm_update update;
198 struct hmm *hmm = mm->hmm; 195 struct hmm *hmm = range->mm->hmm;
199 196
200 VM_BUG_ON(!hmm); 197 VM_BUG_ON(!hmm);
201 198
202 update.start = start; 199 update.start = range->start;
203 update.end = end; 200 update.end = range->end;
204 update.event = HMM_UPDATE_INVALIDATE; 201 update.event = HMM_UPDATE_INVALIDATE;
205 update.blockable = blockable; 202 update.blockable = range->blockable;
206 return hmm_invalidate_range(hmm, true, &update); 203 return hmm_invalidate_range(hmm, true, &update);
207} 204}
208 205
209static void hmm_invalidate_range_end(struct mmu_notifier *mn, 206static void hmm_invalidate_range_end(struct mmu_notifier *mn,
210 struct mm_struct *mm, 207 const struct mmu_notifier_range *range)
211 unsigned long start,
212 unsigned long end)
213{ 208{
214 struct hmm_update update; 209 struct hmm_update update;
215 struct hmm *hmm = mm->hmm; 210 struct hmm *hmm = range->mm->hmm;
216 211
217 VM_BUG_ON(!hmm); 212 VM_BUG_ON(!hmm);
218 213
219 update.start = start; 214 update.start = range->start;
220 update.end = end; 215 update.end = range->end;
221 update.event = HMM_UPDATE_INVALIDATE; 216 update.event = HMM_UPDATE_INVALIDATE;
222 update.blockable = true; 217 update.blockable = true;
223 hmm_invalidate_range(hmm, false, &update); 218 hmm_invalidate_range(hmm, false, &update);
@@ -986,19 +981,13 @@ static void hmm_devmem_ref_exit(void *data)
986 struct hmm_devmem *devmem; 981 struct hmm_devmem *devmem;
987 982
988 devmem = container_of(ref, struct hmm_devmem, ref); 983 devmem = container_of(ref, struct hmm_devmem, ref);
984 wait_for_completion(&devmem->completion);
989 percpu_ref_exit(ref); 985 percpu_ref_exit(ref);
990 devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
991} 986}
992 987
993static void hmm_devmem_ref_kill(void *data) 988static void hmm_devmem_ref_kill(struct percpu_ref *ref)
994{ 989{
995 struct percpu_ref *ref = data;
996 struct hmm_devmem *devmem;
997
998 devmem = container_of(ref, struct hmm_devmem, ref);
999 percpu_ref_kill(ref); 990 percpu_ref_kill(ref);
1000 wait_for_completion(&devmem->completion);
1001 devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
1002} 991}
1003 992
1004static int hmm_devmem_fault(struct vm_area_struct *vma, 993static int hmm_devmem_fault(struct vm_area_struct *vma,
@@ -1021,172 +1010,6 @@ static void hmm_devmem_free(struct page *page, void *data)
1021 devmem->ops->free(devmem, page); 1010 devmem->ops->free(devmem, page);
1022} 1011}
1023 1012
1024static DEFINE_MUTEX(hmm_devmem_lock);
1025static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
1026
1027static void hmm_devmem_radix_release(struct resource *resource)
1028{
1029 resource_size_t key;
1030
1031 mutex_lock(&hmm_devmem_lock);
1032 for (key = resource->start;
1033 key <= resource->end;
1034 key += PA_SECTION_SIZE)
1035 radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
1036 mutex_unlock(&hmm_devmem_lock);
1037}
1038
1039static void hmm_devmem_release(struct device *dev, void *data)
1040{
1041 struct hmm_devmem *devmem = data;
1042 struct resource *resource = devmem->resource;
1043 unsigned long start_pfn, npages;
1044 struct zone *zone;
1045 struct page *page;
1046
1047 if (percpu_ref_tryget_live(&devmem->ref)) {
1048 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
1049 percpu_ref_put(&devmem->ref);
1050 }
1051
1052 /* pages are dead and unused, undo the arch mapping */
1053 start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
1054 npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
1055
1056 page = pfn_to_page(start_pfn);
1057 zone = page_zone(page);
1058
1059 mem_hotplug_begin();
1060 if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
1061 __remove_pages(zone, start_pfn, npages, NULL);
1062 else
1063 arch_remove_memory(start_pfn << PAGE_SHIFT,
1064 npages << PAGE_SHIFT, NULL);
1065 mem_hotplug_done();
1066
1067 hmm_devmem_radix_release(resource);
1068}
1069
1070static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
1071{
1072 resource_size_t key, align_start, align_size, align_end;
1073 struct device *device = devmem->device;
1074 int ret, nid, is_ram;
1075
1076 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
1077 align_size = ALIGN(devmem->resource->start +
1078 resource_size(devmem->resource),
1079 PA_SECTION_SIZE) - align_start;
1080
1081 is_ram = region_intersects(align_start, align_size,
1082 IORESOURCE_SYSTEM_RAM,
1083 IORES_DESC_NONE);
1084 if (is_ram == REGION_MIXED) {
1085 WARN_ONCE(1, "%s attempted on mixed region %pr\n",
1086 __func__, devmem->resource);
1087 return -ENXIO;
1088 }
1089 if (is_ram == REGION_INTERSECTS)
1090 return -ENXIO;
1091
1092 if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
1093 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
1094 else
1095 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
1096
1097 devmem->pagemap.res = *devmem->resource;
1098 devmem->pagemap.page_fault = hmm_devmem_fault;
1099 devmem->pagemap.page_free = hmm_devmem_free;
1100 devmem->pagemap.dev = devmem->device;
1101 devmem->pagemap.ref = &devmem->ref;
1102 devmem->pagemap.data = devmem;
1103
1104 mutex_lock(&hmm_devmem_lock);
1105 align_end = align_start + align_size - 1;
1106 for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
1107 struct hmm_devmem *dup;
1108
1109 dup = radix_tree_lookup(&hmm_devmem_radix,
1110 key >> PA_SECTION_SHIFT);
1111 if (dup) {
1112 dev_err(device, "%s: collides with mapping for %s\n",
1113 __func__, dev_name(dup->device));
1114 mutex_unlock(&hmm_devmem_lock);
1115 ret = -EBUSY;
1116 goto error;
1117 }
1118 ret = radix_tree_insert(&hmm_devmem_radix,
1119 key >> PA_SECTION_SHIFT,
1120 devmem);
1121 if (ret) {
1122 dev_err(device, "%s: failed: %d\n", __func__, ret);
1123 mutex_unlock(&hmm_devmem_lock);
1124 goto error_radix;
1125 }
1126 }
1127 mutex_unlock(&hmm_devmem_lock);
1128
1129 nid = dev_to_node(device);
1130 if (nid < 0)
1131 nid = numa_mem_id();
1132
1133 mem_hotplug_begin();
1134 /*
1135 * For device private memory we call add_pages() as we only need to
1136 * allocate and initialize struct page for the device memory. More-
1137 * over the device memory is un-accessible thus we do not want to
1138 * create a linear mapping for the memory like arch_add_memory()
1139 * would do.
1140 *
1141 * For device public memory, which is accesible by the CPU, we do
1142 * want the linear mapping and thus use arch_add_memory().
1143 */
1144 if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
1145 ret = arch_add_memory(nid, align_start, align_size, NULL,
1146 false);
1147 else
1148 ret = add_pages(nid, align_start >> PAGE_SHIFT,
1149 align_size >> PAGE_SHIFT, NULL, false);
1150 if (ret) {
1151 mem_hotplug_done();
1152 goto error_add_memory;
1153 }
1154 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1155 align_start >> PAGE_SHIFT,
1156 align_size >> PAGE_SHIFT, NULL);
1157 mem_hotplug_done();
1158
1159 /*
1160 * Initialization of the pages has been deferred until now in order
1161 * to allow us to do the work while not holding the hotplug lock.
1162 */
1163 memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1164 align_start >> PAGE_SHIFT,
1165 align_size >> PAGE_SHIFT, &devmem->pagemap);
1166
1167 return 0;
1168
1169error_add_memory:
1170 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
1171error_radix:
1172 hmm_devmem_radix_release(devmem->resource);
1173error:
1174 return ret;
1175}
1176
1177static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
1178{
1179 struct hmm_devmem *devmem = data;
1180
1181 return devmem->resource == match_data;
1182}
1183
1184static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
1185{
1186 devres_release(devmem->device, &hmm_devmem_release,
1187 &hmm_devmem_match, devmem->resource);
1188}
1189
1190/* 1013/*
1191 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory 1014 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
1192 * 1015 *
@@ -1210,12 +1033,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1210{ 1033{
1211 struct hmm_devmem *devmem; 1034 struct hmm_devmem *devmem;
1212 resource_size_t addr; 1035 resource_size_t addr;
1036 void *result;
1213 int ret; 1037 int ret;
1214 1038
1215 dev_pagemap_get_ops(); 1039 dev_pagemap_get_ops();
1216 1040
1217 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1041 devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1218 GFP_KERNEL, dev_to_node(device));
1219 if (!devmem) 1042 if (!devmem)
1220 return ERR_PTR(-ENOMEM); 1043 return ERR_PTR(-ENOMEM);
1221 1044
@@ -1229,11 +1052,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1229 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1052 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1230 0, GFP_KERNEL); 1053 0, GFP_KERNEL);
1231 if (ret) 1054 if (ret)
1232 goto error_percpu_ref; 1055 return ERR_PTR(ret);
1233 1056
1234 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1057 ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
1235 if (ret) 1058 if (ret)
1236 goto error_devm_add_action; 1059 return ERR_PTR(ret);
1237 1060
1238 size = ALIGN(size, PA_SECTION_SIZE); 1061 size = ALIGN(size, PA_SECTION_SIZE);
1239 addr = min((unsigned long)iomem_resource.end, 1062 addr = min((unsigned long)iomem_resource.end,
@@ -1253,54 +1076,40 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1253 1076
1254 devmem->resource = devm_request_mem_region(device, addr, size, 1077 devmem->resource = devm_request_mem_region(device, addr, size,
1255 dev_name(device)); 1078 dev_name(device));
1256 if (!devmem->resource) { 1079 if (!devmem->resource)
1257 ret = -ENOMEM; 1080 return ERR_PTR(-ENOMEM);
1258 goto error_no_resource;
1259 }
1260 break; 1081 break;
1261 } 1082 }
1262 if (!devmem->resource) { 1083 if (!devmem->resource)
1263 ret = -ERANGE; 1084 return ERR_PTR(-ERANGE);
1264 goto error_no_resource;
1265 }
1266 1085
1267 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1086 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
1268 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1087 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1269 devmem->pfn_last = devmem->pfn_first + 1088 devmem->pfn_last = devmem->pfn_first +
1270 (resource_size(devmem->resource) >> PAGE_SHIFT); 1089 (resource_size(devmem->resource) >> PAGE_SHIFT);
1090 devmem->page_fault = hmm_devmem_fault;
1271 1091
1272 ret = hmm_devmem_pages_create(devmem); 1092 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
1273 if (ret) 1093 devmem->pagemap.res = *devmem->resource;
1274 goto error_pages; 1094 devmem->pagemap.page_free = hmm_devmem_free;
1275 1095 devmem->pagemap.altmap_valid = false;
1276 devres_add(device, devmem); 1096 devmem->pagemap.ref = &devmem->ref;
1277 1097 devmem->pagemap.data = devmem;
1278 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1098 devmem->pagemap.kill = hmm_devmem_ref_kill;
1279 if (ret) {
1280 hmm_devmem_remove(devmem);
1281 return ERR_PTR(ret);
1282 }
1283 1099
1100 result = devm_memremap_pages(devmem->device, &devmem->pagemap);
1101 if (IS_ERR(result))
1102 return result;
1284 return devmem; 1103 return devmem;
1285
1286error_pages:
1287 devm_release_mem_region(device, devmem->resource->start,
1288 resource_size(devmem->resource));
1289error_no_resource:
1290error_devm_add_action:
1291 hmm_devmem_ref_kill(&devmem->ref);
1292 hmm_devmem_ref_exit(&devmem->ref);
1293error_percpu_ref:
1294 devres_free(devmem);
1295 return ERR_PTR(ret);
1296} 1104}
1297EXPORT_SYMBOL(hmm_devmem_add); 1105EXPORT_SYMBOL_GPL(hmm_devmem_add);
1298 1106
1299struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, 1107struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1300 struct device *device, 1108 struct device *device,
1301 struct resource *res) 1109 struct resource *res)
1302{ 1110{
1303 struct hmm_devmem *devmem; 1111 struct hmm_devmem *devmem;
1112 void *result;
1304 int ret; 1113 int ret;
1305 1114
1306 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) 1115 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
@@ -1308,8 +1117,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1308 1117
1309 dev_pagemap_get_ops(); 1118 dev_pagemap_get_ops();
1310 1119
1311 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1120 devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1312 GFP_KERNEL, dev_to_node(device));
1313 if (!devmem) 1121 if (!devmem)
1314 return ERR_PTR(-ENOMEM); 1122 return ERR_PTR(-ENOMEM);
1315 1123
@@ -1323,71 +1131,32 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1323 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1131 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1324 0, GFP_KERNEL); 1132 0, GFP_KERNEL);
1325 if (ret) 1133 if (ret)
1326 goto error_percpu_ref; 1134 return ERR_PTR(ret);
1327 1135
1328 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1136 ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
1137 &devmem->ref);
1329 if (ret) 1138 if (ret)
1330 goto error_devm_add_action; 1139 return ERR_PTR(ret);
1331
1332 1140
1333 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1141 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1334 devmem->pfn_last = devmem->pfn_first + 1142 devmem->pfn_last = devmem->pfn_first +
1335 (resource_size(devmem->resource) >> PAGE_SHIFT); 1143 (resource_size(devmem->resource) >> PAGE_SHIFT);
1144 devmem->page_fault = hmm_devmem_fault;
1336 1145
1337 ret = hmm_devmem_pages_create(devmem); 1146 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
1338 if (ret) 1147 devmem->pagemap.res = *devmem->resource;
1339 goto error_devm_add_action; 1148 devmem->pagemap.page_free = hmm_devmem_free;
1340 1149 devmem->pagemap.altmap_valid = false;
1341 devres_add(device, devmem); 1150 devmem->pagemap.ref = &devmem->ref;
1342 1151 devmem->pagemap.data = devmem;
1343 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1152 devmem->pagemap.kill = hmm_devmem_ref_kill;
1344 if (ret) {
1345 hmm_devmem_remove(devmem);
1346 return ERR_PTR(ret);
1347 }
1348 1153
1154 result = devm_memremap_pages(devmem->device, &devmem->pagemap);
1155 if (IS_ERR(result))
1156 return result;
1349 return devmem; 1157 return devmem;
1350
1351error_devm_add_action:
1352 hmm_devmem_ref_kill(&devmem->ref);
1353 hmm_devmem_ref_exit(&devmem->ref);
1354error_percpu_ref:
1355 devres_free(devmem);
1356 return ERR_PTR(ret);
1357}
1358EXPORT_SYMBOL(hmm_devmem_add_resource);
1359
1360/*
1361 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
1362 *
1363 * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
1364 *
1365 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
1366 * of the device driver. It will free struct page and remove the resource that
1367 * reserved the physical address range for this device memory.
1368 */
1369void hmm_devmem_remove(struct hmm_devmem *devmem)
1370{
1371 resource_size_t start, size;
1372 struct device *device;
1373 bool cdm = false;
1374
1375 if (!devmem)
1376 return;
1377
1378 device = devmem->device;
1379 start = devmem->resource->start;
1380 size = resource_size(devmem->resource);
1381
1382 cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
1383 hmm_devmem_ref_kill(&devmem->ref);
1384 hmm_devmem_ref_exit(&devmem->ref);
1385 hmm_devmem_pages_remove(devmem);
1386
1387 if (!cdm)
1388 devm_release_mem_region(device, start, size);
1389} 1158}
1390EXPORT_SYMBOL(hmm_devmem_remove); 1159EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
1391 1160
1392/* 1161/*
1393 * A device driver that wants to handle multiple devices memory through a 1162 * A device driver that wants to handle multiple devices memory through a
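
The overall effect of the hmm.c rework above: hmm_devmem_add() and hmm_devmem_add_resource() no longer open-code section hotplug through hmm_devmem_pages_create(); they fill in devmem->pagemap (including the new ->kill callback) and hand it to devm_memremap_pages(), and the separate hmm_devmem_remove() entry point is deleted because teardown is now device-managed. For a driver, the calling pattern shrinks roughly as follows (sketch only, error handling elided):

	/* before: explicit pairing was required */
	devmem = hmm_devmem_add(ops, device, size);
	/* ... use the ZONE_DEVICE pages ... */
	hmm_devmem_remove(devmem);

	/* after: no remove call; devres tears everything down when the
	 * device is unbound, via devmem->pagemap.kill and the
	 * devm_memremap_pages() machinery */
	devmem = hmm_devmem_add(ops, device, size);
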
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e84a10b0d310..cbd977b1d60d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker;
62static atomic_t huge_zero_refcount; 62static atomic_t huge_zero_refcount;
63struct page *huge_zero_page __read_mostly; 63struct page *huge_zero_page __read_mostly;
64 64
65bool transparent_hugepage_enabled(struct vm_area_struct *vma)
66{
67 if (vma_is_anonymous(vma))
68 return __transparent_hugepage_enabled(vma);
69 if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
70 return __transparent_hugepage_enabled(vma);
71
72 return false;
73}
74
65static struct page *get_huge_zero_page(void) 75static struct page *get_huge_zero_page(void)
66{ 76{
67 struct page *zero_page; 77 struct page *zero_page;
@@ -420,7 +430,7 @@ static int __init hugepage_init(void)
420 * where the extra memory used could hurt more than TLB overhead 430 * where the extra memory used could hurt more than TLB overhead
421 * is likely to save. The admin can still enable it through /sys. 431 * is likely to save. The admin can still enable it through /sys.
422 */ 432 */
423 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { 433 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
424 transparent_hugepage_flags = 0; 434 transparent_hugepage_flags = 0;
425 return 0; 435 return 0;
426 } 436 }
@@ -1134,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1134 int i; 1144 int i;
1135 vm_fault_t ret = 0; 1145 vm_fault_t ret = 0;
1136 struct page **pages; 1146 struct page **pages;
1137 unsigned long mmun_start; /* For mmu_notifiers */ 1147 struct mmu_notifier_range range;
1138 unsigned long mmun_end; /* For mmu_notifiers */
1139 1148
1140 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), 1149 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
1141 GFP_KERNEL); 1150 GFP_KERNEL);
@@ -1173,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1173 cond_resched(); 1182 cond_resched();
1174 } 1183 }
1175 1184
1176 mmun_start = haddr; 1185 mmu_notifier_range_init(&range, vma->vm_mm, haddr,
1177 mmun_end = haddr + HPAGE_PMD_SIZE; 1186 haddr + HPAGE_PMD_SIZE);
1178 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1187 mmu_notifier_invalidate_range_start(&range);
1179 1188
1180 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1189 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1181 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1190 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1220,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1220 * No need to double call mmu_notifier->invalidate_range() callback as 1229 * No need to double call mmu_notifier->invalidate_range() callback as
1221 * the above pmdp_huge_clear_flush_notify() did already call it. 1230 * the above pmdp_huge_clear_flush_notify() did already call it.
1222 */ 1231 */
1223 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, 1232 mmu_notifier_invalidate_range_only_end(&range);
1224 mmun_end);
1225 1233
1226 ret |= VM_FAULT_WRITE; 1234 ret |= VM_FAULT_WRITE;
1227 put_page(page); 1235 put_page(page);
@@ -1231,7 +1239,7 @@ out:
1231 1239
1232out_free_pages: 1240out_free_pages:
1233 spin_unlock(vmf->ptl); 1241 spin_unlock(vmf->ptl);
1234 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1242 mmu_notifier_invalidate_range_end(&range);
1235 for (i = 0; i < HPAGE_PMD_NR; i++) { 1243 for (i = 0; i < HPAGE_PMD_NR; i++) {
1236 memcg = (void *)page_private(pages[i]); 1244 memcg = (void *)page_private(pages[i]);
1237 set_page_private(pages[i], 0); 1245 set_page_private(pages[i], 0);
@@ -1248,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1248 struct page *page = NULL, *new_page; 1256 struct page *page = NULL, *new_page;
1249 struct mem_cgroup *memcg; 1257 struct mem_cgroup *memcg;
1250 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1258 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1251 unsigned long mmun_start; /* For mmu_notifiers */ 1259 struct mmu_notifier_range range;
1252 unsigned long mmun_end; /* For mmu_notifiers */
1253 gfp_t huge_gfp; /* for allocation and charge */ 1260 gfp_t huge_gfp; /* for allocation and charge */
1254 vm_fault_t ret = 0; 1261 vm_fault_t ret = 0;
1255 1262
@@ -1293,7 +1300,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1293 get_page(page); 1300 get_page(page);
1294 spin_unlock(vmf->ptl); 1301 spin_unlock(vmf->ptl);
1295alloc: 1302alloc:
1296 if (transparent_hugepage_enabled(vma) && 1303 if (__transparent_hugepage_enabled(vma) &&
1297 !transparent_hugepage_debug_cow()) { 1304 !transparent_hugepage_debug_cow()) {
1298 huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1305 huge_gfp = alloc_hugepage_direct_gfpmask(vma);
1299 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1306 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -1338,9 +1345,9 @@ alloc:
1338 vma, HPAGE_PMD_NR); 1345 vma, HPAGE_PMD_NR);
1339 __SetPageUptodate(new_page); 1346 __SetPageUptodate(new_page);
1340 1347
1341 mmun_start = haddr; 1348 mmu_notifier_range_init(&range, vma->vm_mm, haddr,
1342 mmun_end = haddr + HPAGE_PMD_SIZE; 1349 haddr + HPAGE_PMD_SIZE);
1343 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1350 mmu_notifier_invalidate_range_start(&range);
1344 1351
1345 spin_lock(vmf->ptl); 1352 spin_lock(vmf->ptl);
1346 if (page) 1353 if (page)
@@ -1375,8 +1382,7 @@ out_mn:
1375 * No need to double call mmu_notifier->invalidate_range() callback as 1382 * No need to double call mmu_notifier->invalidate_range() callback as
1376 * the above pmdp_huge_clear_flush_notify() did already call it. 1383 * the above pmdp_huge_clear_flush_notify() did already call it.
1377 */ 1384 */
1378 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, 1385 mmu_notifier_invalidate_range_only_end(&range);
1379 mmun_end);
1380out: 1386out:
1381 return ret; 1387 return ret;
1382out_unlock: 1388out_unlock:
@@ -1490,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1490 if (!get_page_unless_zero(page)) 1496 if (!get_page_unless_zero(page))
1491 goto out_unlock; 1497 goto out_unlock;
1492 spin_unlock(vmf->ptl); 1498 spin_unlock(vmf->ptl);
1493 wait_on_page_locked(page); 1499 put_and_wait_on_page_locked(page);
1494 put_page(page);
1495 goto out; 1500 goto out;
1496 } 1501 }
1497 1502
@@ -1527,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1527 if (!get_page_unless_zero(page)) 1532 if (!get_page_unless_zero(page))
1528 goto out_unlock; 1533 goto out_unlock;
1529 spin_unlock(vmf->ptl); 1534 spin_unlock(vmf->ptl);
1530 wait_on_page_locked(page); 1535 put_and_wait_on_page_locked(page);
1531 put_page(page);
1532 goto out; 1536 goto out;
1533 } 1537 }
1534 1538
@@ -2017,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2017 unsigned long address) 2021 unsigned long address)
2018{ 2022{
2019 spinlock_t *ptl; 2023 spinlock_t *ptl;
2020 struct mm_struct *mm = vma->vm_mm; 2024 struct mmu_notifier_range range;
2021 unsigned long haddr = address & HPAGE_PUD_MASK;
2022 2025
2023 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); 2026 mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
2024 ptl = pud_lock(mm, pud); 2027 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2028 mmu_notifier_invalidate_range_start(&range);
2029 ptl = pud_lock(vma->vm_mm, pud);
2025 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2030 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2026 goto out; 2031 goto out;
2027 __split_huge_pud_locked(vma, pud, haddr); 2032 __split_huge_pud_locked(vma, pud, range.start);
2028 2033
2029out: 2034out:
2030 spin_unlock(ptl); 2035 spin_unlock(ptl);
@@ -2032,8 +2037,7 @@ out:
2032 * No need to double call mmu_notifier->invalidate_range() callback as 2037 * No need to double call mmu_notifier->invalidate_range() callback as
2033 * the above pudp_huge_clear_flush_notify() did already call it. 2038 * the above pudp_huge_clear_flush_notify() did already call it.
2034 */ 2039 */
2035 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + 2040 mmu_notifier_invalidate_range_only_end(&range);
2036 HPAGE_PUD_SIZE);
2037} 2041}
2038#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2042#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2039 2043
@@ -2235,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2235 unsigned long address, bool freeze, struct page *page) 2239 unsigned long address, bool freeze, struct page *page)
2236{ 2240{
2237 spinlock_t *ptl; 2241 spinlock_t *ptl;
2238 struct mm_struct *mm = vma->vm_mm; 2242 struct mmu_notifier_range range;
2239 unsigned long haddr = address & HPAGE_PMD_MASK;
2240 2243
2241 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); 2244 mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
2242 ptl = pmd_lock(mm, pmd); 2245 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2246 mmu_notifier_invalidate_range_start(&range);
2247 ptl = pmd_lock(vma->vm_mm, pmd);
2243 2248
2244 /* 2249 /*
2245 * If caller asks to setup a migration entries, we need a page to check 2250 * If caller asks to setup a migration entries, we need a page to check
@@ -2255,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2255 clear_page_mlock(page); 2260 clear_page_mlock(page);
2256 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) 2261 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
2257 goto out; 2262 goto out;
2258 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 2263 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2259out: 2264out:
2260 spin_unlock(ptl); 2265 spin_unlock(ptl);
2261 /* 2266 /*
@@ -2271,8 +2276,7 @@ out:
2271 * any further changes to individual pte will notify. So no need 2276 * any further changes to individual pte will notify. So no need
2272 * to call mmu_notifier->invalidate_range() 2277 * to call mmu_notifier->invalidate_range()
2273 */ 2278 */
2274 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + 2279 mmu_notifier_invalidate_range_only_end(&range);
2275 HPAGE_PMD_SIZE);
2276} 2280}
2277 2281
2278void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2282void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
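
A pattern that repeats through the huge_memory.c hunks above and the hugetlb.c hunks below: the pair of local mmun_start/mmun_end variables passed to every mmu notifier call is replaced by a struct mmu_notifier_range initialized once. Reduced to a before/after sketch (kernel context assumed):

	/* old */
	unsigned long mmun_start = haddr;
	unsigned long mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
	/* ... */
	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	/* new */
	struct mmu_notifier_range range;
	mmu_notifier_range_init(&range, vma->vm_mm, haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	/* ... */
	mmu_notifier_invalidate_range_end(&range);
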
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a80832487981..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,24 +3238,35 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3238 struct page *ptepage; 3238 struct page *ptepage;
3239 unsigned long addr; 3239 unsigned long addr;
3240 int cow; 3240 int cow;
3241 struct address_space *mapping = vma->vm_file->f_mapping;
3241 struct hstate *h = hstate_vma(vma); 3242 struct hstate *h = hstate_vma(vma);
3242 unsigned long sz = huge_page_size(h); 3243 unsigned long sz = huge_page_size(h);
3243 unsigned long mmun_start; /* For mmu_notifiers */ 3244 struct mmu_notifier_range range;
3244 unsigned long mmun_end; /* For mmu_notifiers */
3245 int ret = 0; 3245 int ret = 0;
3246 3246
3247 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3247 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
3248 3248
3249 mmun_start = vma->vm_start; 3249 if (cow) {
3250 mmun_end = vma->vm_end; 3250 mmu_notifier_range_init(&range, src, vma->vm_start,
3251 if (cow) 3251 vma->vm_end);
3252 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 3252 mmu_notifier_invalidate_range_start(&range);
3253 } else {
3254 /*
3255 * For shared mappings i_mmap_rwsem must be held to call
3256 * huge_pte_alloc, otherwise the returned ptep could go
3257 * away if part of a shared pmd and another thread calls
3258 * huge_pmd_unshare.
3259 */
3260 i_mmap_lock_read(mapping);
3261 }
3253 3262
3254 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3263 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3255 spinlock_t *src_ptl, *dst_ptl; 3264 spinlock_t *src_ptl, *dst_ptl;
3265
3256 src_pte = huge_pte_offset(src, addr, sz); 3266 src_pte = huge_pte_offset(src, addr, sz);
3257 if (!src_pte) 3267 if (!src_pte)
3258 continue; 3268 continue;
3269
3259 dst_pte = huge_pte_alloc(dst, addr, sz); 3270 dst_pte = huge_pte_alloc(dst, addr, sz);
3260 if (!dst_pte) { 3271 if (!dst_pte) {
3261 ret = -ENOMEM; 3272 ret = -ENOMEM;
@@ -3325,7 +3336,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3325 } 3336 }
3326 3337
3327 if (cow) 3338 if (cow)
3328 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 3339 mmu_notifier_invalidate_range_end(&range);
3340 else
3341 i_mmap_unlock_read(mapping);
3329 3342
3330 return ret; 3343 return ret;
3331} 3344}
@@ -3342,8 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3342 struct page *page; 3355 struct page *page;
3343 struct hstate *h = hstate_vma(vma); 3356 struct hstate *h = hstate_vma(vma);
3344 unsigned long sz = huge_page_size(h); 3357 unsigned long sz = huge_page_size(h);
3345 unsigned long mmun_start = start; /* For mmu_notifiers */ 3358 struct mmu_notifier_range range;
3346 unsigned long mmun_end = end; /* For mmu_notifiers */
3347 3359
3348 WARN_ON(!is_vm_hugetlb_page(vma)); 3360 WARN_ON(!is_vm_hugetlb_page(vma));
3349 BUG_ON(start & ~huge_page_mask(h)); 3361 BUG_ON(start & ~huge_page_mask(h));
@@ -3359,8 +3371,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3359 /* 3371 /*
3360 * If sharing possible, alert mmu notifiers of worst case. 3372 * If sharing possible, alert mmu notifiers of worst case.
3361 */ 3373 */
3362 adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); 3374 mmu_notifier_range_init(&range, mm, start, end);
3363 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3375 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3376 mmu_notifier_invalidate_range_start(&range);
3364 address = start; 3377 address = start;
3365 for (; address < end; address += sz) { 3378 for (; address < end; address += sz) {
3366 ptep = huge_pte_offset(mm, address, sz); 3379 ptep = huge_pte_offset(mm, address, sz);
@@ -3428,7 +3441,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3428 if (ref_page) 3441 if (ref_page)
3429 break; 3442 break;
3430 } 3443 }
3431 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3444 mmu_notifier_invalidate_range_end(&range);
3432 tlb_end_vma(tlb, vma); 3445 tlb_end_vma(tlb, vma);
3433} 3446}
3434 3447
@@ -3546,9 +3559,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
3546 struct page *old_page, *new_page; 3559 struct page *old_page, *new_page;
3547 int outside_reserve = 0; 3560 int outside_reserve = 0;
3548 vm_fault_t ret = 0; 3561 vm_fault_t ret = 0;
3549 unsigned long mmun_start; /* For mmu_notifiers */
3550 unsigned long mmun_end; /* For mmu_notifiers */
3551 unsigned long haddr = address & huge_page_mask(h); 3562 unsigned long haddr = address & huge_page_mask(h);
3563 struct mmu_notifier_range range;
3552 3564
3553 pte = huge_ptep_get(ptep); 3565 pte = huge_ptep_get(ptep);
3554 old_page = pte_page(pte); 3566 old_page = pte_page(pte);
@@ -3627,9 +3639,8 @@ retry_avoidcopy:
3627 __SetPageUptodate(new_page); 3639 __SetPageUptodate(new_page);
3628 set_page_huge_active(new_page); 3640 set_page_huge_active(new_page);
3629 3641
3630 mmun_start = haddr; 3642 mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
3631 mmun_end = mmun_start + huge_page_size(h); 3643 mmu_notifier_invalidate_range_start(&range);
3632 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3633 3644
3634 /* 3645 /*
3635 * Retake the page table lock to check for racing updates 3646 * Retake the page table lock to check for racing updates
@@ -3642,7 +3653,7 @@ retry_avoidcopy:
3642 3653
3643 /* Break COW */ 3654 /* Break COW */
3644 huge_ptep_clear_flush(vma, haddr, ptep); 3655 huge_ptep_clear_flush(vma, haddr, ptep);
3645 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3656 mmu_notifier_invalidate_range(mm, range.start, range.end);
3646 set_huge_pte_at(mm, haddr, ptep, 3657 set_huge_pte_at(mm, haddr, ptep,
3647 make_huge_pte(vma, new_page, 1)); 3658 make_huge_pte(vma, new_page, 1));
3648 page_remove_rmap(old_page, true); 3659 page_remove_rmap(old_page, true);
@@ -3651,7 +3662,7 @@ retry_avoidcopy:
3651 new_page = old_page; 3662 new_page = old_page;
3652 } 3663 }
3653 spin_unlock(ptl); 3664 spin_unlock(ptl);
3654 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3665 mmu_notifier_invalidate_range_end(&range);
3655out_release_all: 3666out_release_all:
3656 restore_reserve_on_error(h, vma, haddr, new_page); 3667 restore_reserve_on_error(h, vma, haddr, new_page);
3657 put_page(new_page); 3668 put_page(new_page);
@@ -3744,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
3744 } 3755 }
3745 3756
3746 /* 3757 /*
3747 * Use page lock to guard against racing truncation 3758 * We can not race with truncation due to holding i_mmap_rwsem.
3748 * before we get page_table_lock. 3759 * Check once here for faults beyond end of file.
3749 */ 3760 */
3761 size = i_size_read(mapping->host) >> huge_page_shift(h);
3762 if (idx >= size)
3763 goto out;
3764
3750retry: 3765retry:
3751 page = find_lock_page(mapping, idx); 3766 page = find_lock_page(mapping, idx);
3752 if (!page) { 3767 if (!page) {
3753 size = i_size_read(mapping->host) >> huge_page_shift(h);
3754 if (idx >= size)
3755 goto out;
3756
3757 /* 3768 /*
3758 * Check for page in userfault range 3769 * Check for page in userfault range
3759 */ 3770 */
@@ -3773,14 +3784,18 @@ retry:
3773 }; 3784 };
3774 3785
3775 /* 3786 /*
3776 * hugetlb_fault_mutex must be dropped before 3787 * hugetlb_fault_mutex and i_mmap_rwsem must be
3777 * handling userfault. Reacquire after handling 3788 * dropped before handling userfault. Reacquire
3778 * fault to make calling code simpler. 3789 * after handling fault to make calling code simpler.
3779 */ 3790 */
3780 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3791 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
3781 idx, haddr); 3792 idx, haddr);
3782 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3793 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3794 i_mmap_unlock_read(mapping);
3795
3783 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3796 ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3797
3798 i_mmap_lock_read(mapping);
3784 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3799 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3785 goto out; 3800 goto out;
3786 } 3801 }
@@ -3839,9 +3854,6 @@ retry:
3839 } 3854 }
3840 3855
3841 ptl = huge_pte_lock(h, mm, ptep); 3856 ptl = huge_pte_lock(h, mm, ptep);
3842 size = i_size_read(mapping->host) >> huge_page_shift(h);
3843 if (idx >= size)
3844 goto backout;
3845 3857
3846 ret = 0; 3858 ret = 0;
3847 if (!huge_pte_none(huge_ptep_get(ptep))) 3859 if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3928,6 +3940,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3928 3940
3929 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 3941 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3930 if (ptep) { 3942 if (ptep) {
3943 /*
3944 * Since we hold no locks, ptep could be stale. That is
3945 * OK as we are only making decisions based on content and
3946 * not actually modifying content here.
3947 */
3931 entry = huge_ptep_get(ptep); 3948 entry = huge_ptep_get(ptep);
3932 if (unlikely(is_hugetlb_entry_migration(entry))) { 3949 if (unlikely(is_hugetlb_entry_migration(entry))) {
3933 migration_entry_wait_huge(vma, mm, ptep); 3950 migration_entry_wait_huge(vma, mm, ptep);
@@ -3935,20 +3952,33 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3935 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3952 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3936 return VM_FAULT_HWPOISON_LARGE | 3953 return VM_FAULT_HWPOISON_LARGE |
3937 VM_FAULT_SET_HINDEX(hstate_index(h)); 3954 VM_FAULT_SET_HINDEX(hstate_index(h));
3938 } else {
3939 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3940 if (!ptep)
3941 return VM_FAULT_OOM;
3942 } 3955 }
3943 3956
3957 /*
3958 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
3959 * until finished with ptep. This serves two purposes:
3960 * 1) It prevents huge_pmd_unshare from being called elsewhere
3961 * and making the ptep no longer valid.
3962 * 2) It synchronizes us with file truncation.
3963 *
3964 * ptep could have already be assigned via huge_pte_offset. That
3965 * is OK, as huge_pte_alloc will return the same value unless
3966 * something changed.
3967 */
3944 mapping = vma->vm_file->f_mapping; 3968 mapping = vma->vm_file->f_mapping;
3945 idx = vma_hugecache_offset(h, vma, haddr); 3969 i_mmap_lock_read(mapping);
3970 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3971 if (!ptep) {
3972 i_mmap_unlock_read(mapping);
3973 return VM_FAULT_OOM;
3974 }
3946 3975
3947 /* 3976 /*
3948 * Serialize hugepage allocation and instantiation, so that we don't 3977 * Serialize hugepage allocation and instantiation, so that we don't
3949 * get spurious allocation failures if two CPUs race to instantiate 3978 * get spurious allocation failures if two CPUs race to instantiate
3950 * the same page in the page cache. 3979 * the same page in the page cache.
3951 */ 3980 */
3981 idx = vma_hugecache_offset(h, vma, haddr);
3952 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); 3982 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
3953 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3983 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3954 3984
@@ -4036,6 +4066,7 @@ out_ptl:
4036 } 4066 }
4037out_mutex: 4067out_mutex:
4038 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4068 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4069 i_mmap_unlock_read(mapping);
4039 /* 4070 /*
4040 * Generally it's safe to hold refcount during waiting page lock. But 4071 * Generally it's safe to hold refcount during waiting page lock. But
4041 * here we just wait to defer the next page fault to avoid busy loop and 4072 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4340,21 +4371,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4340 pte_t pte; 4371 pte_t pte;
4341 struct hstate *h = hstate_vma(vma); 4372 struct hstate *h = hstate_vma(vma);
4342 unsigned long pages = 0; 4373 unsigned long pages = 0;
4343 unsigned long f_start = start;
4344 unsigned long f_end = end;
4345 bool shared_pmd = false; 4374 bool shared_pmd = false;
4375 struct mmu_notifier_range range;
4346 4376
4347 /* 4377 /*
4348 * In the case of shared PMDs, the area to flush could be beyond 4378 * In the case of shared PMDs, the area to flush could be beyond
4349 * start/end. Set f_start/f_end to cover the maximum possible 4379 * start/end. Set range.start/range.end to cover the maximum possible
4350 * range if PMD sharing is possible. 4380 * range if PMD sharing is possible.
4351 */ 4381 */
4352 adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); 4382 mmu_notifier_range_init(&range, mm, start, end);
4383 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4353 4384
4354 BUG_ON(address >= end); 4385 BUG_ON(address >= end);
4355 flush_cache_range(vma, f_start, f_end); 4386 flush_cache_range(vma, range.start, range.end);
4356 4387
4357 mmu_notifier_invalidate_range_start(mm, f_start, f_end); 4388 mmu_notifier_invalidate_range_start(&range);
4358 i_mmap_lock_write(vma->vm_file->f_mapping); 4389 i_mmap_lock_write(vma->vm_file->f_mapping);
4359 for (; address < end; address += huge_page_size(h)) { 4390 for (; address < end; address += huge_page_size(h)) {
4360 spinlock_t *ptl; 4391 spinlock_t *ptl;
@@ -4405,7 +4436,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4405 * did unshare a page of pmds, flush the range corresponding to the pud. 4436 * did unshare a page of pmds, flush the range corresponding to the pud.
4406 */ 4437 */
4407 if (shared_pmd) 4438 if (shared_pmd)
4408 flush_hugetlb_tlb_range(vma, f_start, f_end); 4439 flush_hugetlb_tlb_range(vma, range.start, range.end);
4409 else 4440 else
4410 flush_hugetlb_tlb_range(vma, start, end); 4441 flush_hugetlb_tlb_range(vma, start, end);
4411 /* 4442 /*
@@ -4415,7 +4446,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4415 * See Documentation/vm/mmu_notifier.rst 4446 * See Documentation/vm/mmu_notifier.rst
4416 */ 4447 */
4417 i_mmap_unlock_write(vma->vm_file->f_mapping); 4448 i_mmap_unlock_write(vma->vm_file->f_mapping);
4418 mmu_notifier_invalidate_range_end(mm, f_start, f_end); 4449 mmu_notifier_invalidate_range_end(&range);
4419 4450
4420 return pages << h->order; 4451 return pages << h->order;
4421} 4452}
@@ -4640,10 +4671,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
4640 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 4671 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
4641 * and returns the corresponding pte. While this is not necessary for the 4672 * and returns the corresponding pte. While this is not necessary for the
4642 * !shared pmd case because we can allocate the pmd later as well, it makes the 4673 * !shared pmd case because we can allocate the pmd later as well, it makes the
4643 * code much cleaner. pmd allocation is essential for the shared case because 4674 * code much cleaner.
4644 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 4675 *
4645 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 4676 * This routine must be called with i_mmap_rwsem held in at least read mode.
4646 * bad pmd for sharing. 4677 * For hugetlbfs, this prevents removal of any page table entries associated
4678 * with the address space. This is important as we are setting up sharing
4679 * based on existing page table entries (mappings).
4647 */ 4680 */
4648pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 4681pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4649{ 4682{
@@ -4660,7 +4693,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4660 if (!vma_shareable(vma, addr)) 4693 if (!vma_shareable(vma, addr))
4661 return (pte_t *)pmd_alloc(mm, pud, addr); 4694 return (pte_t *)pmd_alloc(mm, pud, addr);
4662 4695
4663 i_mmap_lock_write(mapping);
4664 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 4696 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
4665 if (svma == vma) 4697 if (svma == vma)
4666 continue; 4698 continue;
@@ -4690,7 +4722,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4690 spin_unlock(ptl); 4722 spin_unlock(ptl);
4691out: 4723out:
4692 pte = (pte_t *)pmd_alloc(mm, pud, addr); 4724 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4693 i_mmap_unlock_write(mapping);
4694 return pte; 4725 return pte;
4695} 4726}
4696 4727
@@ -4701,7 +4732,7 @@ out:
4701 * indicated by page_count > 1, unmap is achieved by clearing pud and 4732 * indicated by page_count > 1, unmap is achieved by clearing pud and
4702 * decrementing the ref count. If count == 1, the pte page is not shared. 4733 * decrementing the ref count. If count == 1, the pte page is not shared.
4703 * 4734 *
4704 * called with page table lock held. 4735 * Called with page table lock held and i_mmap_rwsem held in write mode.
4705 * 4736 *
4706 * returns: 1 successfully unmapped a shared pte page 4737 * returns: 1 successfully unmapped a shared pte page
4707 * 0 the underlying pte page is not shared, or it is the last user 4738 * 0 the underlying pte page is not shared, or it is the last user
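The hugetlb.c hunks above establish a lock ordering for the fault path: i_mmap_rwsem is taken in read mode before the per-index fault mutex, and both are dropped, in reverse order, around handle_userfault() and then retaken. Below is a minimal userspace sketch of that ordering only; the pthread locks, hugetlb_fault_model() and need_userfault are illustrative stand-ins, not the kernel primitives.

/* Model of the i_mmap_rwsem / fault-mutex ordering added above. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;

static void handle_userfault(void)
{
	/* Runs with neither fault-path lock held, as in the diff above. */
	puts("userfault handled with both locks dropped");
}

static void hugetlb_fault_model(int need_userfault)
{
	pthread_rwlock_rdlock(&i_mmap_rwsem);	/* 1: keeps page tables stable */
	pthread_mutex_lock(&fault_mutex);	/* 2: serializes instantiation */

	if (need_userfault) {
		/* Drop in reverse order before calling out, then retake. */
		pthread_mutex_unlock(&fault_mutex);
		pthread_rwlock_unlock(&i_mmap_rwsem);
		handle_userfault();
		pthread_rwlock_rdlock(&i_mmap_rwsem);
		pthread_mutex_lock(&fault_mutex);
	}

	pthread_mutex_unlock(&fault_mutex);
	pthread_rwlock_unlock(&i_mmap_rwsem);
}

int main(void)
{
	hugetlb_fault_model(1);
	return 0;
}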
diff --git a/mm/internal.h b/mm/internal.h
index 291eb2b6d1d8..f4a7bb02decf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,6 +444,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
444#define NODE_RECLAIM_SOME 0 444#define NODE_RECLAIM_SOME 0
445#define NODE_RECLAIM_SUCCESS 1 445#define NODE_RECLAIM_SUCCESS 1
446 446
447#ifdef CONFIG_NUMA
448extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
449#else
450static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
451 unsigned int order)
452{
453 return NODE_RECLAIM_NOSCAN;
454}
455#endif
456
447extern int hwpoison_filter(struct page *p); 457extern int hwpoison_filter(struct page *p);
448 458
449extern u32 hwpoison_filter_dev_major; 459extern u32 hwpoison_filter_dev_major;
@@ -480,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
480#define ALLOC_OOM ALLOC_NO_WATERMARKS 490#define ALLOC_OOM ALLOC_NO_WATERMARKS
481#endif 491#endif
482 492
483#define ALLOC_HARDER 0x10 /* try to alloc harder */ 493#define ALLOC_HARDER 0x10 /* try to alloc harder */
484#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 494#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
485#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 495#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
486#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 496#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
497#ifdef CONFIG_ZONE_DMA32
498#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
499#else
500#define ALLOC_NOFRAGMENT 0x0
501#endif
502#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
487 503
488enum ttu_flags; 504enum ttu_flags;
489struct tlbflush_unmap_batch; 505struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 3289db38bc87..0a14fcff70ed 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,11 +1,18 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2KASAN_SANITIZE := n 2KASAN_SANITIZE := n
3UBSAN_SANITIZE_kasan.o := n 3UBSAN_SANITIZE_common.o := n
4UBSAN_SANITIZE_generic.o := n
5UBSAN_SANITIZE_tags.o := n
4KCOV_INSTRUMENT := n 6KCOV_INSTRUMENT := n
5 7
6CFLAGS_REMOVE_kasan.o = -pg 8CFLAGS_REMOVE_generic.o = -pg
7# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 9# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
8# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 10# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
9CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
10 11
11obj-y := kasan.o report.o kasan_init.o quarantine.o 12CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
13CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
14CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
15
16obj-$(CONFIG_KASAN) := common.o init.o report.o
17obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
18obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c
index c3bd5209da38..03d5d1374ca7 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/common.c
@@ -1,5 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains shadow memory manipulation code. 3 * This file contains common generic and tag-based KASAN code.
3 * 4 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -13,9 +14,6 @@
13 * 14 *
14 */ 15 */
15 16
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17#define DISABLE_BRANCH_PROFILING
18
19#include <linux/export.h> 17#include <linux/export.h>
20#include <linux/interrupt.h> 18#include <linux/interrupt.h>
21#include <linux/init.h> 19#include <linux/init.h>
@@ -40,6 +38,53 @@
40#include "kasan.h" 38#include "kasan.h"
41#include "../slab.h" 39#include "../slab.h"
42 40
41static inline int in_irqentry_text(unsigned long ptr)
42{
43 return (ptr >= (unsigned long)&__irqentry_text_start &&
44 ptr < (unsigned long)&__irqentry_text_end) ||
45 (ptr >= (unsigned long)&__softirqentry_text_start &&
46 ptr < (unsigned long)&__softirqentry_text_end);
47}
48
49static inline void filter_irq_stacks(struct stack_trace *trace)
50{
51 int i;
52
53 if (!trace->nr_entries)
54 return;
55 for (i = 0; i < trace->nr_entries; i++)
56 if (in_irqentry_text(trace->entries[i])) {
57 /* Include the irqentry function into the stack. */
58 trace->nr_entries = i + 1;
59 break;
60 }
61}
62
63static inline depot_stack_handle_t save_stack(gfp_t flags)
64{
65 unsigned long entries[KASAN_STACK_DEPTH];
66 struct stack_trace trace = {
67 .nr_entries = 0,
68 .entries = entries,
69 .max_entries = KASAN_STACK_DEPTH,
70 .skip = 0
71 };
72
73 save_stack_trace(&trace);
74 filter_irq_stacks(&trace);
75 if (trace.nr_entries != 0 &&
76 trace.entries[trace.nr_entries-1] == ULONG_MAX)
77 trace.nr_entries--;
78
79 return depot_save_stack(&trace, flags);
80}
81
82static inline void set_track(struct kasan_track *track, gfp_t flags)
83{
84 track->pid = current->pid;
85 track->stack = save_stack(flags);
86}
87
43void kasan_enable_current(void) 88void kasan_enable_current(void)
44{ 89{
45 current->kasan_depth++; 90 current->kasan_depth++;
@@ -50,27 +95,85 @@ void kasan_disable_current(void)
50 current->kasan_depth--; 95 current->kasan_depth--;
51} 96}
52 97
98void kasan_check_read(const volatile void *p, unsigned int size)
99{
100 check_memory_region((unsigned long)p, size, false, _RET_IP_);
101}
102EXPORT_SYMBOL(kasan_check_read);
103
104void kasan_check_write(const volatile void *p, unsigned int size)
105{
106 check_memory_region((unsigned long)p, size, true, _RET_IP_);
107}
108EXPORT_SYMBOL(kasan_check_write);
109
110#undef memset
111void *memset(void *addr, int c, size_t len)
112{
113 check_memory_region((unsigned long)addr, len, true, _RET_IP_);
114
115 return __memset(addr, c, len);
116}
117
118#undef memmove
119void *memmove(void *dest, const void *src, size_t len)
120{
121 check_memory_region((unsigned long)src, len, false, _RET_IP_);
122 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
123
124 return __memmove(dest, src, len);
125}
126
127#undef memcpy
128void *memcpy(void *dest, const void *src, size_t len)
129{
130 check_memory_region((unsigned long)src, len, false, _RET_IP_);
131 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
132
133 return __memcpy(dest, src, len);
134}
135
53/* 136/*
54 * Poisons the shadow memory for 'size' bytes starting from 'addr'. 137 * Poisons the shadow memory for 'size' bytes starting from 'addr'.
55 * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 138 * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
56 */ 139 */
57static void kasan_poison_shadow(const void *address, size_t size, u8 value) 140void kasan_poison_shadow(const void *address, size_t size, u8 value)
58{ 141{
59 void *shadow_start, *shadow_end; 142 void *shadow_start, *shadow_end;
60 143
144 /*
145 * Perform shadow offset calculation based on untagged address, as
146 * some of the callers (e.g. kasan_poison_object_data) pass tagged
147 * addresses to this function.
148 */
149 address = reset_tag(address);
150
61 shadow_start = kasan_mem_to_shadow(address); 151 shadow_start = kasan_mem_to_shadow(address);
62 shadow_end = kasan_mem_to_shadow(address + size); 152 shadow_end = kasan_mem_to_shadow(address + size);
63 153
64 memset(shadow_start, value, shadow_end - shadow_start); 154 __memset(shadow_start, value, shadow_end - shadow_start);
65} 155}
66 156
67void kasan_unpoison_shadow(const void *address, size_t size) 157void kasan_unpoison_shadow(const void *address, size_t size)
68{ 158{
69 kasan_poison_shadow(address, size, 0); 159 u8 tag = get_tag(address);
160
161 /*
162 * Perform shadow offset calculation based on untagged address, as
163 * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
164 * addresses to this function.
165 */
166 address = reset_tag(address);
167
168 kasan_poison_shadow(address, size, tag);
70 169
71 if (size & KASAN_SHADOW_MASK) { 170 if (size & KASAN_SHADOW_MASK) {
72 u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); 171 u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
73 *shadow = size & KASAN_SHADOW_MASK; 172
173 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
174 *shadow = tag;
175 else
176 *shadow = size & KASAN_SHADOW_MASK;
74 } 177 }
75} 178}
76 179
@@ -116,199 +219,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark)
116 kasan_unpoison_shadow(sp, size); 219 kasan_unpoison_shadow(sp, size);
117} 220}
118 221
119/* 222void kasan_alloc_pages(struct page *page, unsigned int order)
120 * All functions below always inlined so compiler could
121 * perform better optimizations in each of __asan_loadX/__assn_storeX
122 * depending on memory access size X.
123 */
124
125static __always_inline bool memory_is_poisoned_1(unsigned long addr)
126{
127 s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
128
129 if (unlikely(shadow_value)) {
130 s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
131 return unlikely(last_accessible_byte >= shadow_value);
132 }
133
134 return false;
135}
136
137static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
138 unsigned long size)
139{
140 u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
141
142 /*
143 * Access crosses 8(shadow size)-byte boundary. Such access maps
144 * into 2 shadow bytes, so we need to check them both.
145 */
146 if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
147 return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
148
149 return memory_is_poisoned_1(addr + size - 1);
150}
151
152static __always_inline bool memory_is_poisoned_16(unsigned long addr)
153{
154 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
155
156 /* Unaligned 16-bytes access maps into 3 shadow bytes. */
157 if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
158 return *shadow_addr || memory_is_poisoned_1(addr + 15);
159
160 return *shadow_addr;
161}
162
163static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
164 size_t size)
165{
166 while (size) {
167 if (unlikely(*start))
168 return (unsigned long)start;
169 start++;
170 size--;
171 }
172
173 return 0;
174}
175
176static __always_inline unsigned long memory_is_nonzero(const void *start,
177 const void *end)
178{
179 unsigned int words;
180 unsigned long ret;
181 unsigned int prefix = (unsigned long)start % 8;
182
183 if (end - start <= 16)
184 return bytes_is_nonzero(start, end - start);
185
186 if (prefix) {
187 prefix = 8 - prefix;
188 ret = bytes_is_nonzero(start, prefix);
189 if (unlikely(ret))
190 return ret;
191 start += prefix;
192 }
193
194 words = (end - start) / 8;
195 while (words) {
196 if (unlikely(*(u64 *)start))
197 return bytes_is_nonzero(start, 8);
198 start += 8;
199 words--;
200 }
201
202 return bytes_is_nonzero(start, (end - start) % 8);
203}
204
205static __always_inline bool memory_is_poisoned_n(unsigned long addr,
206 size_t size)
207{
208 unsigned long ret;
209
210 ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
211 kasan_mem_to_shadow((void *)addr + size - 1) + 1);
212
213 if (unlikely(ret)) {
214 unsigned long last_byte = addr + size - 1;
215 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
216
217 if (unlikely(ret != (unsigned long)last_shadow ||
218 ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
219 return true;
220 }
221 return false;
222}
223
224static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
225{
226 if (__builtin_constant_p(size)) {
227 switch (size) {
228 case 1:
229 return memory_is_poisoned_1(addr);
230 case 2:
231 case 4:
232 case 8:
233 return memory_is_poisoned_2_4_8(addr, size);
234 case 16:
235 return memory_is_poisoned_16(addr);
236 default:
237 BUILD_BUG();
238 }
239 }
240
241 return memory_is_poisoned_n(addr, size);
242}
243
244static __always_inline void check_memory_region_inline(unsigned long addr,
245 size_t size, bool write,
246 unsigned long ret_ip)
247{ 223{
248 if (unlikely(size == 0)) 224 u8 tag;
249 return; 225 unsigned long i;
250 226
251 if (unlikely((void *)addr < 227 if (unlikely(PageHighMem(page)))
252 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
253 kasan_report(addr, size, write, ret_ip);
254 return; 228 return;
255 }
256 229
257 if (likely(!memory_is_poisoned(addr, size))) 230 tag = random_tag();
258 return; 231 for (i = 0; i < (1 << order); i++)
259 232 page_kasan_tag_set(page + i, tag);
260 kasan_report(addr, size, write, ret_ip); 233 kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
261}
262
263static void check_memory_region(unsigned long addr,
264 size_t size, bool write,
265 unsigned long ret_ip)
266{
267 check_memory_region_inline(addr, size, write, ret_ip);
268}
269
270void kasan_check_read(const volatile void *p, unsigned int size)
271{
272 check_memory_region((unsigned long)p, size, false, _RET_IP_);
273}
274EXPORT_SYMBOL(kasan_check_read);
275
276void kasan_check_write(const volatile void *p, unsigned int size)
277{
278 check_memory_region((unsigned long)p, size, true, _RET_IP_);
279}
280EXPORT_SYMBOL(kasan_check_write);
281
282#undef memset
283void *memset(void *addr, int c, size_t len)
284{
285 check_memory_region((unsigned long)addr, len, true, _RET_IP_);
286
287 return __memset(addr, c, len);
288}
289
290#undef memmove
291void *memmove(void *dest, const void *src, size_t len)
292{
293 check_memory_region((unsigned long)src, len, false, _RET_IP_);
294 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
295
296 return __memmove(dest, src, len);
297}
298
299#undef memcpy
300void *memcpy(void *dest, const void *src, size_t len)
301{
302 check_memory_region((unsigned long)src, len, false, _RET_IP_);
303 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
304
305 return __memcpy(dest, src, len);
306}
307
308void kasan_alloc_pages(struct page *page, unsigned int order)
309{
310 if (likely(!PageHighMem(page)))
311 kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
312} 234}
313 235
314void kasan_free_pages(struct page *page, unsigned int order) 236void kasan_free_pages(struct page *page, unsigned int order)
@@ -323,8 +245,11 @@ void kasan_free_pages(struct page *page, unsigned int order)
323 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. 245 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
324 * For larger allocations larger redzones are used. 246 * For larger allocations larger redzones are used.
325 */ 247 */
326static unsigned int optimal_redzone(unsigned int object_size) 248static inline unsigned int optimal_redzone(unsigned int object_size)
327{ 249{
250 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
251 return 0;
252
328 return 253 return
329 object_size <= 64 - 16 ? 16 : 254 object_size <= 64 - 16 ? 16 :
330 object_size <= 128 - 32 ? 32 : 255 object_size <= 128 - 32 ? 32 :
@@ -339,6 +264,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
339 slab_flags_t *flags) 264 slab_flags_t *flags)
340{ 265{
341 unsigned int orig_size = *size; 266 unsigned int orig_size = *size;
267 unsigned int redzone_size;
342 int redzone_adjust; 268 int redzone_adjust;
343 269
344 /* Add alloc meta. */ 270 /* Add alloc meta. */
@@ -346,20 +272,20 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
346 *size += sizeof(struct kasan_alloc_meta); 272 *size += sizeof(struct kasan_alloc_meta);
347 273
348 /* Add free meta. */ 274 /* Add free meta. */
349 if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || 275 if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
350 cache->object_size < sizeof(struct kasan_free_meta)) { 276 (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
277 cache->object_size < sizeof(struct kasan_free_meta))) {
351 cache->kasan_info.free_meta_offset = *size; 278 cache->kasan_info.free_meta_offset = *size;
352 *size += sizeof(struct kasan_free_meta); 279 *size += sizeof(struct kasan_free_meta);
353 } 280 }
354 redzone_adjust = optimal_redzone(cache->object_size) -
355 (*size - cache->object_size);
356 281
282 redzone_size = optimal_redzone(cache->object_size);
283 redzone_adjust = redzone_size - (*size - cache->object_size);
357 if (redzone_adjust > 0) 284 if (redzone_adjust > 0)
358 *size += redzone_adjust; 285 *size += redzone_adjust;
359 286
360 *size = min_t(unsigned int, KMALLOC_MAX_SIZE, 287 *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
361 max(*size, cache->object_size + 288 max(*size, cache->object_size + redzone_size));
362 optimal_redzone(cache->object_size)));
363 289
364 /* 290 /*
365 * If the metadata doesn't fit, don't enable KASAN at all. 291 * If the metadata doesn't fit, don't enable KASAN at all.
@@ -372,30 +298,39 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
372 return; 298 return;
373 } 299 }
374 300
301 cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
302
375 *flags |= SLAB_KASAN; 303 *flags |= SLAB_KASAN;
376} 304}
377 305
378void kasan_cache_shrink(struct kmem_cache *cache) 306size_t kasan_metadata_size(struct kmem_cache *cache)
379{ 307{
380 quarantine_remove_cache(cache); 308 return (cache->kasan_info.alloc_meta_offset ?
309 sizeof(struct kasan_alloc_meta) : 0) +
310 (cache->kasan_info.free_meta_offset ?
311 sizeof(struct kasan_free_meta) : 0);
381} 312}
382 313
383void kasan_cache_shutdown(struct kmem_cache *cache) 314struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
315 const void *object)
384{ 316{
385 if (!__kmem_cache_empty(cache)) 317 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
386 quarantine_remove_cache(cache); 318 return (void *)object + cache->kasan_info.alloc_meta_offset;
387} 319}
388 320
389size_t kasan_metadata_size(struct kmem_cache *cache) 321struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
322 const void *object)
390{ 323{
391 return (cache->kasan_info.alloc_meta_offset ? 324 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
392 sizeof(struct kasan_alloc_meta) : 0) + 325 return (void *)object + cache->kasan_info.free_meta_offset;
393 (cache->kasan_info.free_meta_offset ?
394 sizeof(struct kasan_free_meta) : 0);
395} 326}
396 327
397void kasan_poison_slab(struct page *page) 328void kasan_poison_slab(struct page *page)
398{ 329{
330 unsigned long i;
331
332 for (i = 0; i < (1 << compound_order(page)); i++)
333 page_kasan_tag_reset(page + i);
399 kasan_poison_shadow(page_address(page), 334 kasan_poison_shadow(page_address(page),
400 PAGE_SIZE << compound_order(page), 335 PAGE_SIZE << compound_order(page),
401 KASAN_KMALLOC_REDZONE); 336 KASAN_KMALLOC_REDZONE);
@@ -413,92 +348,79 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
413 KASAN_KMALLOC_REDZONE); 348 KASAN_KMALLOC_REDZONE);
414} 349}
415 350
416static inline int in_irqentry_text(unsigned long ptr) 351/*
 417{ 352 * Since it's desirable to only call object constructors once during slab
418 return (ptr >= (unsigned long)&__irqentry_text_start && 353 * allocation, we preassign tags to all such objects. Also preassign tags for
419 ptr < (unsigned long)&__irqentry_text_end) || 354 * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
420 (ptr >= (unsigned long)&__softirqentry_text_start && 355 * For SLAB allocator we can't preassign tags randomly since the freelist is
421 ptr < (unsigned long)&__softirqentry_text_end); 356 * stored as an array of indexes instead of a linked list. Assign tags based
422} 357 * on objects indexes, so that objects that are next to each other get
423 358 * different tags.
424static inline void filter_irq_stacks(struct stack_trace *trace) 359 * After a tag is assigned, the object always gets allocated with the same tag.
360 * The reason is that we can't change tags for objects with constructors on
361 * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
362 * code can save the pointer to the object somewhere (e.g. in the object
363 * itself). Then if we retag it, the old saved pointer will become invalid.
364 */
365static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
425{ 366{
426 int i; 367 if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
368 return new ? KASAN_TAG_KERNEL : random_tag();
427 369
428 if (!trace->nr_entries) 370#ifdef CONFIG_SLAB
429 return; 371 return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
430 for (i = 0; i < trace->nr_entries; i++) 372#else
431 if (in_irqentry_text(trace->entries[i])) { 373 return new ? random_tag() : get_tag(object);
432 /* Include the irqentry function into the stack. */ 374#endif
433 trace->nr_entries = i + 1;
434 break;
435 }
436} 375}
437 376
438static inline depot_stack_handle_t save_stack(gfp_t flags) 377void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
378 const void *object)
439{ 379{
440 unsigned long entries[KASAN_STACK_DEPTH]; 380 struct kasan_alloc_meta *alloc_info;
441 struct stack_trace trace = {
442 .nr_entries = 0,
443 .entries = entries,
444 .max_entries = KASAN_STACK_DEPTH,
445 .skip = 0
446 };
447
448 save_stack_trace(&trace);
449 filter_irq_stacks(&trace);
450 if (trace.nr_entries != 0 &&
451 trace.entries[trace.nr_entries-1] == ULONG_MAX)
452 trace.nr_entries--;
453 381
454 return depot_save_stack(&trace, flags); 382 if (!(cache->flags & SLAB_KASAN))
455} 383 return (void *)object;
456 384
457static inline void set_track(struct kasan_track *track, gfp_t flags) 385 alloc_info = get_alloc_info(cache, object);
458{ 386 __memset(alloc_info, 0, sizeof(*alloc_info));
459 track->pid = current->pid;
460 track->stack = save_stack(flags);
461}
462 387
463struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 388 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
464 const void *object) 389 object = set_tag(object, assign_tag(cache, object, true));
465{
466 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
467 return (void *)object + cache->kasan_info.alloc_meta_offset;
468}
469 390
470struct kasan_free_meta *get_free_info(struct kmem_cache *cache, 391 return (void *)object;
471 const void *object)
472{
473 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
474 return (void *)object + cache->kasan_info.free_meta_offset;
475} 392}
476 393
477void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) 394void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
395 gfp_t flags)
478{ 396{
479 struct kasan_alloc_meta *alloc_info; 397 return kasan_kmalloc(cache, object, cache->object_size, flags);
480
481 if (!(cache->flags & SLAB_KASAN))
482 return;
483
484 alloc_info = get_alloc_info(cache, object);
485 __memset(alloc_info, 0, sizeof(*alloc_info));
486} 398}
487 399
488void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) 400static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
489{ 401{
490 kasan_kmalloc(cache, object, cache->object_size, flags); 402 if (IS_ENABLED(CONFIG_KASAN_GENERIC))
403 return shadow_byte < 0 ||
404 shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
405 else
406 return tag != (u8)shadow_byte;
491} 407}
492 408
493static bool __kasan_slab_free(struct kmem_cache *cache, void *object, 409static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
494 unsigned long ip, bool quarantine) 410 unsigned long ip, bool quarantine)
495{ 411{
496 s8 shadow_byte; 412 s8 shadow_byte;
413 u8 tag;
414 void *tagged_object;
497 unsigned long rounded_up_size; 415 unsigned long rounded_up_size;
498 416
417 tag = get_tag(object);
418 tagged_object = object;
419 object = reset_tag(object);
420
499 if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != 421 if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
500 object)) { 422 object)) {
501 kasan_report_invalid_free(object, ip); 423 kasan_report_invalid_free(tagged_object, ip);
502 return true; 424 return true;
503 } 425 }
504 426
@@ -507,20 +429,22 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
507 return false; 429 return false;
508 430
509 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); 431 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
510 if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { 432 if (shadow_invalid(tag, shadow_byte)) {
511 kasan_report_invalid_free(object, ip); 433 kasan_report_invalid_free(tagged_object, ip);
512 return true; 434 return true;
513 } 435 }
514 436
515 rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); 437 rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
516 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 438 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
517 439
518 if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN))) 440 if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
441 unlikely(!(cache->flags & SLAB_KASAN)))
519 return false; 442 return false;
520 443
521 set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); 444 set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
522 quarantine_put(get_free_info(cache, object), cache); 445 quarantine_put(get_free_info(cache, object), cache);
523 return true; 446
447 return IS_ENABLED(CONFIG_KASAN_GENERIC);
524} 448}
525 449
526bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) 450bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
@@ -528,33 +452,41 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
528 return __kasan_slab_free(cache, object, ip, true); 452 return __kasan_slab_free(cache, object, ip, true);
529} 453}
530 454
531void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, 455void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
532 gfp_t flags) 456 size_t size, gfp_t flags)
533{ 457{
534 unsigned long redzone_start; 458 unsigned long redzone_start;
535 unsigned long redzone_end; 459 unsigned long redzone_end;
460 u8 tag;
536 461
537 if (gfpflags_allow_blocking(flags)) 462 if (gfpflags_allow_blocking(flags))
538 quarantine_reduce(); 463 quarantine_reduce();
539 464
540 if (unlikely(object == NULL)) 465 if (unlikely(object == NULL))
541 return; 466 return NULL;
542 467
543 redzone_start = round_up((unsigned long)(object + size), 468 redzone_start = round_up((unsigned long)(object + size),
544 KASAN_SHADOW_SCALE_SIZE); 469 KASAN_SHADOW_SCALE_SIZE);
545 redzone_end = round_up((unsigned long)object + cache->object_size, 470 redzone_end = round_up((unsigned long)object + cache->object_size,
546 KASAN_SHADOW_SCALE_SIZE); 471 KASAN_SHADOW_SCALE_SIZE);
547 472
548 kasan_unpoison_shadow(object, size); 473 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
474 tag = assign_tag(cache, object, false);
475
476 /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
477 kasan_unpoison_shadow(set_tag(object, tag), size);
549 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 478 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
550 KASAN_KMALLOC_REDZONE); 479 KASAN_KMALLOC_REDZONE);
551 480
552 if (cache->flags & SLAB_KASAN) 481 if (cache->flags & SLAB_KASAN)
553 set_track(&get_alloc_info(cache, object)->alloc_track, flags); 482 set_track(&get_alloc_info(cache, object)->alloc_track, flags);
483
484 return set_tag(object, tag);
554} 485}
555EXPORT_SYMBOL(kasan_kmalloc); 486EXPORT_SYMBOL(kasan_kmalloc);
556 487
557void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) 488void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
489 gfp_t flags)
558{ 490{
559 struct page *page; 491 struct page *page;
560 unsigned long redzone_start; 492 unsigned long redzone_start;
@@ -564,7 +496,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
564 quarantine_reduce(); 496 quarantine_reduce();
565 497
566 if (unlikely(ptr == NULL)) 498 if (unlikely(ptr == NULL))
567 return; 499 return NULL;
568 500
569 page = virt_to_page(ptr); 501 page = virt_to_page(ptr);
570 redzone_start = round_up((unsigned long)(ptr + size), 502 redzone_start = round_up((unsigned long)(ptr + size),
@@ -574,21 +506,23 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
574 kasan_unpoison_shadow(ptr, size); 506 kasan_unpoison_shadow(ptr, size);
575 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 507 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
576 KASAN_PAGE_REDZONE); 508 KASAN_PAGE_REDZONE);
509
510 return (void *)ptr;
577} 511}
578 512
579void kasan_krealloc(const void *object, size_t size, gfp_t flags) 513void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
580{ 514{
581 struct page *page; 515 struct page *page;
582 516
583 if (unlikely(object == ZERO_SIZE_PTR)) 517 if (unlikely(object == ZERO_SIZE_PTR))
584 return; 518 return (void *)object;
585 519
586 page = virt_to_head_page(object); 520 page = virt_to_head_page(object);
587 521
588 if (unlikely(!PageSlab(page))) 522 if (unlikely(!PageSlab(page)))
589 kasan_kmalloc_large(object, size, flags); 523 return kasan_kmalloc_large(object, size, flags);
590 else 524 else
591 kasan_kmalloc(page->slab_cache, object, size, flags); 525 return kasan_kmalloc(page->slab_cache, object, size, flags);
592} 526}
593 527
594void kasan_poison_kfree(void *ptr, unsigned long ip) 528void kasan_poison_kfree(void *ptr, unsigned long ip)
@@ -632,11 +566,12 @@ int kasan_module_alloc(void *addr, size_t size)
632 566
633 ret = __vmalloc_node_range(shadow_size, 1, shadow_start, 567 ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
634 shadow_start + shadow_size, 568 shadow_start + shadow_size,
635 GFP_KERNEL | __GFP_ZERO, 569 GFP_KERNEL,
636 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, 570 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
637 __builtin_return_address(0)); 571 __builtin_return_address(0));
638 572
639 if (ret) { 573 if (ret) {
574 __memset(ret, KASAN_SHADOW_INIT, shadow_size);
640 find_vm_area(addr)->flags |= VM_KASAN; 575 find_vm_area(addr)->flags |= VM_KASAN;
641 kmemleak_ignore(ret); 576 kmemleak_ignore(ret);
642 return 0; 577 return 0;
@@ -651,147 +586,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
651 vfree(kasan_mem_to_shadow(vm->addr)); 586 vfree(kasan_mem_to_shadow(vm->addr));
652} 587}
653 588
654static void register_global(struct kasan_global *global)
655{
656 size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
657
658 kasan_unpoison_shadow(global->beg, global->size);
659
660 kasan_poison_shadow(global->beg + aligned_size,
661 global->size_with_redzone - aligned_size,
662 KASAN_GLOBAL_REDZONE);
663}
664
665void __asan_register_globals(struct kasan_global *globals, size_t size)
666{
667 int i;
668
669 for (i = 0; i < size; i++)
670 register_global(&globals[i]);
671}
672EXPORT_SYMBOL(__asan_register_globals);
673
674void __asan_unregister_globals(struct kasan_global *globals, size_t size)
675{
676}
677EXPORT_SYMBOL(__asan_unregister_globals);
678
679#define DEFINE_ASAN_LOAD_STORE(size) \
680 void __asan_load##size(unsigned long addr) \
681 { \
682 check_memory_region_inline(addr, size, false, _RET_IP_);\
683 } \
684 EXPORT_SYMBOL(__asan_load##size); \
685 __alias(__asan_load##size) \
686 void __asan_load##size##_noabort(unsigned long); \
687 EXPORT_SYMBOL(__asan_load##size##_noabort); \
688 void __asan_store##size(unsigned long addr) \
689 { \
690 check_memory_region_inline(addr, size, true, _RET_IP_); \
691 } \
692 EXPORT_SYMBOL(__asan_store##size); \
693 __alias(__asan_store##size) \
694 void __asan_store##size##_noabort(unsigned long); \
695 EXPORT_SYMBOL(__asan_store##size##_noabort)
696
697DEFINE_ASAN_LOAD_STORE(1);
698DEFINE_ASAN_LOAD_STORE(2);
699DEFINE_ASAN_LOAD_STORE(4);
700DEFINE_ASAN_LOAD_STORE(8);
701DEFINE_ASAN_LOAD_STORE(16);
702
703void __asan_loadN(unsigned long addr, size_t size)
704{
705 check_memory_region(addr, size, false, _RET_IP_);
706}
707EXPORT_SYMBOL(__asan_loadN);
708
709__alias(__asan_loadN)
710void __asan_loadN_noabort(unsigned long, size_t);
711EXPORT_SYMBOL(__asan_loadN_noabort);
712
713void __asan_storeN(unsigned long addr, size_t size)
714{
715 check_memory_region(addr, size, true, _RET_IP_);
716}
717EXPORT_SYMBOL(__asan_storeN);
718
719__alias(__asan_storeN)
720void __asan_storeN_noabort(unsigned long, size_t);
721EXPORT_SYMBOL(__asan_storeN_noabort);
722
723/* to shut up compiler complaints */
724void __asan_handle_no_return(void) {}
725EXPORT_SYMBOL(__asan_handle_no_return);
726
727/* Emitted by compiler to poison large objects when they go out of scope. */
728void __asan_poison_stack_memory(const void *addr, size_t size)
729{
730 /*
731 * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
732 * by redzones, so we simply round up size to simplify logic.
733 */
734 kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
735 KASAN_USE_AFTER_SCOPE);
736}
737EXPORT_SYMBOL(__asan_poison_stack_memory);
738
739/* Emitted by compiler to unpoison large objects when they go into scope. */
740void __asan_unpoison_stack_memory(const void *addr, size_t size)
741{
742 kasan_unpoison_shadow(addr, size);
743}
744EXPORT_SYMBOL(__asan_unpoison_stack_memory);
745
746/* Emitted by compiler to poison alloca()ed objects. */
747void __asan_alloca_poison(unsigned long addr, size_t size)
748{
749 size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
750 size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
751 rounded_up_size;
752 size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
753
754 const void *left_redzone = (const void *)(addr -
755 KASAN_ALLOCA_REDZONE_SIZE);
756 const void *right_redzone = (const void *)(addr + rounded_up_size);
757
758 WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
759
760 kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
761 size - rounded_down_size);
762 kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
763 KASAN_ALLOCA_LEFT);
764 kasan_poison_shadow(right_redzone,
765 padding_size + KASAN_ALLOCA_REDZONE_SIZE,
766 KASAN_ALLOCA_RIGHT);
767}
768EXPORT_SYMBOL(__asan_alloca_poison);
769
770/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
771void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
772{
773 if (unlikely(!stack_top || stack_top > stack_bottom))
774 return;
775
776 kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
777}
778EXPORT_SYMBOL(__asan_allocas_unpoison);
779
780/* Emitted by the compiler to [un]poison local variables. */
781#define DEFINE_ASAN_SET_SHADOW(byte) \
782 void __asan_set_shadow_##byte(const void *addr, size_t size) \
783 { \
784 __memset((void *)addr, 0x##byte, size); \
785 } \
786 EXPORT_SYMBOL(__asan_set_shadow_##byte)
787
788DEFINE_ASAN_SET_SHADOW(00);
789DEFINE_ASAN_SET_SHADOW(f1);
790DEFINE_ASAN_SET_SHADOW(f2);
791DEFINE_ASAN_SET_SHADOW(f3);
792DEFINE_ASAN_SET_SHADOW(f5);
793DEFINE_ASAN_SET_SHADOW(f8);
794
795#ifdef CONFIG_MEMORY_HOTPLUG 589#ifdef CONFIG_MEMORY_HOTPLUG
796static bool shadow_mapped(unsigned long addr) 590static bool shadow_mapped(unsigned long addr)
797{ 591{
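The common.c changes above make the free-time shadow check mode-aware: generic KASAN treats negative shadow bytes and bytes >= KASAN_SHADOW_SCALE_SIZE as an invalid free, while software tag-based KASAN compares the shadow byte against the pointer tag. A standalone sketch of that shadow_invalid() logic follows; KASAN_SHADOW_SCALE_SIZE is assumed to be 8 and the sw_tags argument stands in for IS_ENABLED(CONFIG_KASAN_SW_TAGS).

/* Userspace model of the shadow_invalid() check introduced above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SIZE 8

static bool shadow_invalid(bool sw_tags, uint8_t tag, int8_t shadow_byte)
{
	if (!sw_tags)
		/* Generic mode: negative bytes are poison markers and bytes
		 * >= 8 are not valid partial sizes, so both reject the free. */
		return shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
	/* Tag-based mode: the shadow byte must match the pointer's tag. */
	return tag != (uint8_t)shadow_byte;
}

int main(void)
{
	printf("%d\n", shadow_invalid(false, 0, -1));		  /* 1: poisoned, generic */
	printf("%d\n", shadow_invalid(true, 0xab, (int8_t)0xab)); /* 0: tag matches */
	printf("%d\n", shadow_invalid(true, 0xab, (int8_t)0xcd)); /* 1: tag mismatch */
	return 0;
}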
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
new file mode 100644
index 000000000000..ccb6207276e3
--- /dev/null
+++ b/mm/kasan/generic.c
@@ -0,0 +1,344 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains core generic KASAN code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18#define DISABLE_BRANCH_PROFILING
19
20#include <linux/export.h>
21#include <linux/interrupt.h>
22#include <linux/init.h>
23#include <linux/kasan.h>
24#include <linux/kernel.h>
25#include <linux/kmemleak.h>
26#include <linux/linkage.h>
27#include <linux/memblock.h>
28#include <linux/memory.h>
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/printk.h>
32#include <linux/sched.h>
33#include <linux/sched/task_stack.h>
34#include <linux/slab.h>
35#include <linux/stacktrace.h>
36#include <linux/string.h>
37#include <linux/types.h>
38#include <linux/vmalloc.h>
39#include <linux/bug.h>
40
41#include "kasan.h"
42#include "../slab.h"
43
44/*
45 * All functions below always inlined so compiler could
 46 * perform better optimizations in each of __asan_loadX/__asan_storeX
47 * depending on memory access size X.
48 */
49
50static __always_inline bool memory_is_poisoned_1(unsigned long addr)
51{
52 s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
53
54 if (unlikely(shadow_value)) {
55 s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
56 return unlikely(last_accessible_byte >= shadow_value);
57 }
58
59 return false;
60}
61
62static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
63 unsigned long size)
64{
65 u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
66
67 /*
68 * Access crosses 8(shadow size)-byte boundary. Such access maps
69 * into 2 shadow bytes, so we need to check them both.
70 */
71 if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
72 return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
73
74 return memory_is_poisoned_1(addr + size - 1);
75}
76
77static __always_inline bool memory_is_poisoned_16(unsigned long addr)
78{
79 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
80
81 /* Unaligned 16-bytes access maps into 3 shadow bytes. */
82 if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
83 return *shadow_addr || memory_is_poisoned_1(addr + 15);
84
85 return *shadow_addr;
86}
87
88static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
89 size_t size)
90{
91 while (size) {
92 if (unlikely(*start))
93 return (unsigned long)start;
94 start++;
95 size--;
96 }
97
98 return 0;
99}
100
101static __always_inline unsigned long memory_is_nonzero(const void *start,
102 const void *end)
103{
104 unsigned int words;
105 unsigned long ret;
106 unsigned int prefix = (unsigned long)start % 8;
107
108 if (end - start <= 16)
109 return bytes_is_nonzero(start, end - start);
110
111 if (prefix) {
112 prefix = 8 - prefix;
113 ret = bytes_is_nonzero(start, prefix);
114 if (unlikely(ret))
115 return ret;
116 start += prefix;
117 }
118
119 words = (end - start) / 8;
120 while (words) {
121 if (unlikely(*(u64 *)start))
122 return bytes_is_nonzero(start, 8);
123 start += 8;
124 words--;
125 }
126
127 return bytes_is_nonzero(start, (end - start) % 8);
128}
129
130static __always_inline bool memory_is_poisoned_n(unsigned long addr,
131 size_t size)
132{
133 unsigned long ret;
134
135 ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
136 kasan_mem_to_shadow((void *)addr + size - 1) + 1);
137
138 if (unlikely(ret)) {
139 unsigned long last_byte = addr + size - 1;
140 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
141
142 if (unlikely(ret != (unsigned long)last_shadow ||
143 ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
144 return true;
145 }
146 return false;
147}
148
149static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
150{
151 if (__builtin_constant_p(size)) {
152 switch (size) {
153 case 1:
154 return memory_is_poisoned_1(addr);
155 case 2:
156 case 4:
157 case 8:
158 return memory_is_poisoned_2_4_8(addr, size);
159 case 16:
160 return memory_is_poisoned_16(addr);
161 default:
162 BUILD_BUG();
163 }
164 }
165
166 return memory_is_poisoned_n(addr, size);
167}
168
169static __always_inline void check_memory_region_inline(unsigned long addr,
170 size_t size, bool write,
171 unsigned long ret_ip)
172{
173 if (unlikely(size == 0))
174 return;
175
176 if (unlikely((void *)addr <
177 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
178 kasan_report(addr, size, write, ret_ip);
179 return;
180 }
181
182 if (likely(!memory_is_poisoned(addr, size)))
183 return;
184
185 kasan_report(addr, size, write, ret_ip);
186}
187
188void check_memory_region(unsigned long addr, size_t size, bool write,
189 unsigned long ret_ip)
190{
191 check_memory_region_inline(addr, size, write, ret_ip);
192}
193
194void kasan_cache_shrink(struct kmem_cache *cache)
195{
196 quarantine_remove_cache(cache);
197}
198
199void kasan_cache_shutdown(struct kmem_cache *cache)
200{
201 if (!__kmem_cache_empty(cache))
202 quarantine_remove_cache(cache);
203}
204
205static void register_global(struct kasan_global *global)
206{
207 size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
208
209 kasan_unpoison_shadow(global->beg, global->size);
210
211 kasan_poison_shadow(global->beg + aligned_size,
212 global->size_with_redzone - aligned_size,
213 KASAN_GLOBAL_REDZONE);
214}
215
216void __asan_register_globals(struct kasan_global *globals, size_t size)
217{
218 int i;
219
220 for (i = 0; i < size; i++)
221 register_global(&globals[i]);
222}
223EXPORT_SYMBOL(__asan_register_globals);
224
225void __asan_unregister_globals(struct kasan_global *globals, size_t size)
226{
227}
228EXPORT_SYMBOL(__asan_unregister_globals);
229
230#define DEFINE_ASAN_LOAD_STORE(size) \
231 void __asan_load##size(unsigned long addr) \
232 { \
233 check_memory_region_inline(addr, size, false, _RET_IP_);\
234 } \
235 EXPORT_SYMBOL(__asan_load##size); \
236 __alias(__asan_load##size) \
237 void __asan_load##size##_noabort(unsigned long); \
238 EXPORT_SYMBOL(__asan_load##size##_noabort); \
239 void __asan_store##size(unsigned long addr) \
240 { \
241 check_memory_region_inline(addr, size, true, _RET_IP_); \
242 } \
243 EXPORT_SYMBOL(__asan_store##size); \
244 __alias(__asan_store##size) \
245 void __asan_store##size##_noabort(unsigned long); \
246 EXPORT_SYMBOL(__asan_store##size##_noabort)
247
248DEFINE_ASAN_LOAD_STORE(1);
249DEFINE_ASAN_LOAD_STORE(2);
250DEFINE_ASAN_LOAD_STORE(4);
251DEFINE_ASAN_LOAD_STORE(8);
252DEFINE_ASAN_LOAD_STORE(16);
253
254void __asan_loadN(unsigned long addr, size_t size)
255{
256 check_memory_region(addr, size, false, _RET_IP_);
257}
258EXPORT_SYMBOL(__asan_loadN);
259
260__alias(__asan_loadN)
261void __asan_loadN_noabort(unsigned long, size_t);
262EXPORT_SYMBOL(__asan_loadN_noabort);
263
264void __asan_storeN(unsigned long addr, size_t size)
265{
266 check_memory_region(addr, size, true, _RET_IP_);
267}
268EXPORT_SYMBOL(__asan_storeN);
269
270__alias(__asan_storeN)
271void __asan_storeN_noabort(unsigned long, size_t);
272EXPORT_SYMBOL(__asan_storeN_noabort);
273
274/* to shut up compiler complaints */
275void __asan_handle_no_return(void) {}
276EXPORT_SYMBOL(__asan_handle_no_return);
277
278/* Emitted by compiler to poison large objects when they go out of scope. */
279void __asan_poison_stack_memory(const void *addr, size_t size)
280{
281 /*
282 * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
283 * by redzones, so we simply round up size to simplify logic.
284 */
285 kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
286 KASAN_USE_AFTER_SCOPE);
287}
288EXPORT_SYMBOL(__asan_poison_stack_memory);
289
290/* Emitted by compiler to unpoison large objects when they go into scope. */
291void __asan_unpoison_stack_memory(const void *addr, size_t size)
292{
293 kasan_unpoison_shadow(addr, size);
294}
295EXPORT_SYMBOL(__asan_unpoison_stack_memory);
296
297/* Emitted by compiler to poison alloca()ed objects. */
298void __asan_alloca_poison(unsigned long addr, size_t size)
299{
300 size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
301 size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
302 rounded_up_size;
303 size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
304
305 const void *left_redzone = (const void *)(addr -
306 KASAN_ALLOCA_REDZONE_SIZE);
307 const void *right_redzone = (const void *)(addr + rounded_up_size);
308
309 WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
310
311 kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
312 size - rounded_down_size);
313 kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
314 KASAN_ALLOCA_LEFT);
315 kasan_poison_shadow(right_redzone,
316 padding_size + KASAN_ALLOCA_REDZONE_SIZE,
317 KASAN_ALLOCA_RIGHT);
318}
319EXPORT_SYMBOL(__asan_alloca_poison);
320
321/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
322void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
323{
324 if (unlikely(!stack_top || stack_top > stack_bottom))
325 return;
326
327 kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
328}
329EXPORT_SYMBOL(__asan_allocas_unpoison);
330
331/* Emitted by the compiler to [un]poison local variables. */
332#define DEFINE_ASAN_SET_SHADOW(byte) \
333 void __asan_set_shadow_##byte(const void *addr, size_t size) \
334 { \
335 __memset((void *)addr, 0x##byte, size); \
336 } \
337 EXPORT_SYMBOL(__asan_set_shadow_##byte)
338
339DEFINE_ASAN_SET_SHADOW(00);
340DEFINE_ASAN_SET_SHADOW(f1);
341DEFINE_ASAN_SET_SHADOW(f2);
342DEFINE_ASAN_SET_SHADOW(f3);
343DEFINE_ASAN_SET_SHADOW(f5);
344DEFINE_ASAN_SET_SHADOW(f8);
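In the generic mode kept in generic.c above, each shadow byte covers an 8-byte granule: 0 means fully addressable, 1..7 means only the first N bytes are valid, and negative values are poison. A compilable userspace model of the 1-byte check (memory_is_poisoned_1) follows; the shadow byte is passed in directly instead of being looked up, and KASAN_SHADOW_MASK is assumed to be 0x7.

/* Model of the generic-mode 1-byte shadow check from the file above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_MASK 0x7UL

static bool memory_is_poisoned_1(unsigned long addr, int8_t shadow_value)
{
	if (shadow_value) {
		/* Offset of the accessed byte within its 8-byte granule. */
		int8_t last_accessible_byte = addr & KASAN_SHADOW_MASK;

		/* Positive shadow N: only bytes 0..N-1 are addressable.
		 * Negative shadow: the comparison is always true (poisoned). */
		return last_accessible_byte >= shadow_value;
	}
	return false;	/* 0: the whole granule is addressable */
}

int main(void)
{
	printf("%d\n", memory_is_poisoned_1(0x1000 + 3, 4));	/* 0: inside a 4-byte object */
	printf("%d\n", memory_is_poisoned_1(0x1000 + 5, 4));	/* 1: past the object's end */
	printf("%d\n", memory_is_poisoned_1(0x1000 + 0, -1));	/* 1: fully poisoned granule */
	return 0;
}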
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
new file mode 100644
index 000000000000..5e12035888f2
--- /dev/null
+++ b/mm/kasan/generic_report.c
@@ -0,0 +1,153 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains generic KASAN specific error reporting code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#include <linux/bitops.h>
18#include <linux/ftrace.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/printk.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/stackdepot.h>
26#include <linux/stacktrace.h>
27#include <linux/string.h>
28#include <linux/types.h>
29#include <linux/kasan.h>
30#include <linux/module.h>
31
32#include <asm/sections.h>
33
34#include "kasan.h"
35#include "../slab.h"
36
37void *find_first_bad_addr(void *addr, size_t size)
38{
39 void *p = addr;
40
41 while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
42 p += KASAN_SHADOW_SCALE_SIZE;
43 return p;
44}
45
46static const char *get_shadow_bug_type(struct kasan_access_info *info)
47{
48 const char *bug_type = "unknown-crash";
49 u8 *shadow_addr;
50
51 shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
52
53 /*
54 * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
55 * at the next shadow byte to determine the type of the bad access.
56 */
57 if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
58 shadow_addr++;
59
60 switch (*shadow_addr) {
61 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
62 /*
63 * In theory it's still possible to see these shadow values
64 * due to a data race in the kernel code.
65 */
66 bug_type = "out-of-bounds";
67 break;
68 case KASAN_PAGE_REDZONE:
69 case KASAN_KMALLOC_REDZONE:
70 bug_type = "slab-out-of-bounds";
71 break;
72 case KASAN_GLOBAL_REDZONE:
73 bug_type = "global-out-of-bounds";
74 break;
75 case KASAN_STACK_LEFT:
76 case KASAN_STACK_MID:
77 case KASAN_STACK_RIGHT:
78 case KASAN_STACK_PARTIAL:
79 bug_type = "stack-out-of-bounds";
80 break;
81 case KASAN_FREE_PAGE:
82 case KASAN_KMALLOC_FREE:
83 bug_type = "use-after-free";
84 break;
85 case KASAN_USE_AFTER_SCOPE:
86 bug_type = "use-after-scope";
87 break;
88 case KASAN_ALLOCA_LEFT:
89 case KASAN_ALLOCA_RIGHT:
90 bug_type = "alloca-out-of-bounds";
91 break;
92 }
93
94 return bug_type;
95}
96
97static const char *get_wild_bug_type(struct kasan_access_info *info)
98{
99 const char *bug_type = "unknown-crash";
100
101 if ((unsigned long)info->access_addr < PAGE_SIZE)
102 bug_type = "null-ptr-deref";
103 else if ((unsigned long)info->access_addr < TASK_SIZE)
104 bug_type = "user-memory-access";
105 else
106 bug_type = "wild-memory-access";
107
108 return bug_type;
109}
110
111const char *get_bug_type(struct kasan_access_info *info)
112{
113 if (addr_has_shadow(info->access_addr))
114 return get_shadow_bug_type(info);
115 return get_wild_bug_type(info);
116}
117
118#define DEFINE_ASAN_REPORT_LOAD(size) \
119void __asan_report_load##size##_noabort(unsigned long addr) \
120{ \
121 kasan_report(addr, size, false, _RET_IP_); \
122} \
123EXPORT_SYMBOL(__asan_report_load##size##_noabort)
124
125#define DEFINE_ASAN_REPORT_STORE(size) \
126void __asan_report_store##size##_noabort(unsigned long addr) \
127{ \
128 kasan_report(addr, size, true, _RET_IP_); \
129} \
130EXPORT_SYMBOL(__asan_report_store##size##_noabort)
131
132DEFINE_ASAN_REPORT_LOAD(1);
133DEFINE_ASAN_REPORT_LOAD(2);
134DEFINE_ASAN_REPORT_LOAD(4);
135DEFINE_ASAN_REPORT_LOAD(8);
136DEFINE_ASAN_REPORT_LOAD(16);
137DEFINE_ASAN_REPORT_STORE(1);
138DEFINE_ASAN_REPORT_STORE(2);
139DEFINE_ASAN_REPORT_STORE(4);
140DEFINE_ASAN_REPORT_STORE(8);
141DEFINE_ASAN_REPORT_STORE(16);
142
143void __asan_report_load_n_noabort(unsigned long addr, size_t size)
144{
145 kasan_report(addr, size, false, _RET_IP_);
146}
147EXPORT_SYMBOL(__asan_report_load_n_noabort);
148
149void __asan_report_store_n_noabort(unsigned long addr, size_t size)
150{
151 kasan_report(addr, size, true, _RET_IP_);
152}
153EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c
index c7550eb65922..34afad56497b 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/init.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains some kasan initialization code. 3 * This file contains some kasan initialization code.
3 * 4 *
@@ -30,13 +31,13 @@
 30 * - Later it is reused as zero shadow to cover large ranges of memory 31 * - Later it is reused as zero shadow to cover large ranges of memory
 31 * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...). 32 * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...).
32 */ 33 */
33unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; 34unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss;
34 35
35#if CONFIG_PGTABLE_LEVELS > 4 36#if CONFIG_PGTABLE_LEVELS > 4
36p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; 37p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
37static inline bool kasan_p4d_table(pgd_t pgd) 38static inline bool kasan_p4d_table(pgd_t pgd)
38{ 39{
39 return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); 40 return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d));
40} 41}
41#else 42#else
42static inline bool kasan_p4d_table(pgd_t pgd) 43static inline bool kasan_p4d_table(pgd_t pgd)
@@ -45,10 +46,10 @@ static inline bool kasan_p4d_table(pgd_t pgd)
45} 46}
46#endif 47#endif
47#if CONFIG_PGTABLE_LEVELS > 3 48#if CONFIG_PGTABLE_LEVELS > 3
48pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; 49pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
49static inline bool kasan_pud_table(p4d_t p4d) 50static inline bool kasan_pud_table(p4d_t p4d)
50{ 51{
51 return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); 52 return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
52} 53}
53#else 54#else
54static inline bool kasan_pud_table(p4d_t p4d) 55static inline bool kasan_pud_table(p4d_t p4d)
@@ -57,10 +58,10 @@ static inline bool kasan_pud_table(p4d_t p4d)
57} 58}
58#endif 59#endif
59#if CONFIG_PGTABLE_LEVELS > 2 60#if CONFIG_PGTABLE_LEVELS > 2
60pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; 61pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
61static inline bool kasan_pmd_table(pud_t pud) 62static inline bool kasan_pmd_table(pud_t pud)
62{ 63{
63 return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); 64 return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
64} 65}
65#else 66#else
66static inline bool kasan_pmd_table(pud_t pud) 67static inline bool kasan_pmd_table(pud_t pud)
@@ -68,16 +69,16 @@ static inline bool kasan_pmd_table(pud_t pud)
68 return 0; 69 return 0;
69} 70}
70#endif 71#endif
71pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; 72pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
72 73
73static inline bool kasan_pte_table(pmd_t pmd) 74static inline bool kasan_pte_table(pmd_t pmd)
74{ 75{
75 return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); 76 return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
76} 77}
77 78
78static inline bool kasan_zero_page_entry(pte_t pte) 79static inline bool kasan_early_shadow_page_entry(pte_t pte)
79{ 80{
80 return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); 81 return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page));
81} 82}
82 83
83static __init void *early_alloc(size_t size, int node) 84static __init void *early_alloc(size_t size, int node)
@@ -92,7 +93,8 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
92 pte_t *pte = pte_offset_kernel(pmd, addr); 93 pte_t *pte = pte_offset_kernel(pmd, addr);
93 pte_t zero_pte; 94 pte_t zero_pte;
94 95
95 zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); 96 zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)),
97 PAGE_KERNEL);
96 zero_pte = pte_wrprotect(zero_pte); 98 zero_pte = pte_wrprotect(zero_pte);
97 99
98 while (addr + PAGE_SIZE <= end) { 100 while (addr + PAGE_SIZE <= end) {
@@ -112,7 +114,8 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
112 next = pmd_addr_end(addr, end); 114 next = pmd_addr_end(addr, end);
113 115
114 if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { 116 if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
115 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 117 pmd_populate_kernel(&init_mm, pmd,
118 lm_alias(kasan_early_shadow_pte));
116 continue; 119 continue;
117 } 120 }
118 121
@@ -145,9 +148,11 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
145 if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { 148 if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
146 pmd_t *pmd; 149 pmd_t *pmd;
147 150
148 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 151 pud_populate(&init_mm, pud,
152 lm_alias(kasan_early_shadow_pmd));
149 pmd = pmd_offset(pud, addr); 153 pmd = pmd_offset(pud, addr);
150 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 154 pmd_populate_kernel(&init_mm, pmd,
155 lm_alias(kasan_early_shadow_pte));
151 continue; 156 continue;
152 } 157 }
153 158
@@ -181,12 +186,14 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
181 pud_t *pud; 186 pud_t *pud;
182 pmd_t *pmd; 187 pmd_t *pmd;
183 188
184 p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); 189 p4d_populate(&init_mm, p4d,
190 lm_alias(kasan_early_shadow_pud));
185 pud = pud_offset(p4d, addr); 191 pud = pud_offset(p4d, addr);
186 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 192 pud_populate(&init_mm, pud,
193 lm_alias(kasan_early_shadow_pmd));
187 pmd = pmd_offset(pud, addr); 194 pmd = pmd_offset(pud, addr);
188 pmd_populate_kernel(&init_mm, pmd, 195 pmd_populate_kernel(&init_mm, pmd,
189 lm_alias(kasan_zero_pte)); 196 lm_alias(kasan_early_shadow_pte));
190 continue; 197 continue;
191 } 198 }
192 199
@@ -209,13 +216,13 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
209} 216}
210 217
211/** 218/**
212 * kasan_populate_zero_shadow - populate shadow memory region with 219 * kasan_populate_early_shadow - populate shadow memory region with
213 * kasan_zero_page 220 * kasan_early_shadow_page
214 * @shadow_start - start of the memory range to populate 221 * @shadow_start - start of the memory range to populate
215 * @shadow_end - end of the memory range to populate 222 * @shadow_end - end of the memory range to populate
216 */ 223 */
217int __ref kasan_populate_zero_shadow(const void *shadow_start, 224int __ref kasan_populate_early_shadow(const void *shadow_start,
218 const void *shadow_end) 225 const void *shadow_end)
219{ 226{
220 unsigned long addr = (unsigned long)shadow_start; 227 unsigned long addr = (unsigned long)shadow_start;
221 unsigned long end = (unsigned long)shadow_end; 228 unsigned long end = (unsigned long)shadow_end;
@@ -231,7 +238,7 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
231 pmd_t *pmd; 238 pmd_t *pmd;
232 239
233 /* 240 /*
234 * kasan_zero_pud should be populated with pmds 241 * kasan_early_shadow_pud should be populated with pmds
235 * at this moment. 242 * at this moment.
236 * [pud,pmd]_populate*() below needed only for 243 * [pud,pmd]_populate*() below needed only for
237 * 3,2 - level page tables where we don't have 244 * 3,2 - level page tables where we don't have
@@ -241,21 +248,25 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
241 * The ifndef is required to avoid build breakage. 248 * The ifndef is required to avoid build breakage.
242 * 249 *
243 * With 5level-fixup.h, pgd_populate() is not nop and 250 * With 5level-fixup.h, pgd_populate() is not nop and
244 * we reference kasan_zero_p4d. It's not defined 251 * we reference kasan_early_shadow_p4d. It's not defined
245 * unless 5-level paging enabled. 252 * unless 5-level paging enabled.
246 * 253 *
247 * The ifndef can be dropped once all KASAN-enabled 254 * The ifndef can be dropped once all KASAN-enabled
248 * architectures will switch to pgtable-nop4d.h. 255 * architectures will switch to pgtable-nop4d.h.
249 */ 256 */
250#ifndef __ARCH_HAS_5LEVEL_HACK 257#ifndef __ARCH_HAS_5LEVEL_HACK
251 pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); 258 pgd_populate(&init_mm, pgd,
259 lm_alias(kasan_early_shadow_p4d));
252#endif 260#endif
253 p4d = p4d_offset(pgd, addr); 261 p4d = p4d_offset(pgd, addr);
254 p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); 262 p4d_populate(&init_mm, p4d,
263 lm_alias(kasan_early_shadow_pud));
255 pud = pud_offset(p4d, addr); 264 pud = pud_offset(p4d, addr);
256 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 265 pud_populate(&init_mm, pud,
266 lm_alias(kasan_early_shadow_pmd));
257 pmd = pmd_offset(pud, addr); 267 pmd = pmd_offset(pud, addr);
258 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 268 pmd_populate_kernel(&init_mm, pmd,
269 lm_alias(kasan_early_shadow_pte));
259 continue; 270 continue;
260 } 271 }
261 272
@@ -350,7 +361,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
350 if (!pte_present(*pte)) 361 if (!pte_present(*pte))
351 continue; 362 continue;
352 363
353 if (WARN_ON(!kasan_zero_page_entry(*pte))) 364 if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
354 continue; 365 continue;
355 pte_clear(&init_mm, addr, pte); 366 pte_clear(&init_mm, addr, pte);
356 } 367 }
@@ -480,7 +491,7 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
480 WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) 491 WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
481 return -EINVAL; 492 return -EINVAL;
482 493
483 ret = kasan_populate_zero_shadow(shadow_start, shadow_end); 494 ret = kasan_populate_early_shadow(shadow_start, shadow_end);
484 if (ret) 495 if (ret)
485 kasan_remove_zero_shadow(shadow_start, 496 kasan_remove_zero_shadow(shadow_start,
486 size >> KASAN_SHADOW_SCALE_SHIFT); 497 size >> KASAN_SHADOW_SCALE_SHIFT);
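
Note on the hunk above: the kasan_zero_* to kasan_early_shadow_* rename does not change the mechanism; one zeroed, write-protected page is aliased across the whole shadow range so untracked memory reads as accessible without allocating backing shadow. A toy userspace sketch of that aliasing idea, with invented names and a flat array standing in for the page tables:

/* Toy model of the early (zero) shadow: every "page table entry" in a
 * flat array aliases one shared zeroed page, so any address in the
 * covered range reads back shadow value 0 (fully accessible). */
#include <stdio.h>

#define PAGE_SIZE	4096
#define NR_ENTRIES	8

static unsigned char early_shadow_page[PAGE_SIZE];	/* stays all zero */
static unsigned char *pte[NR_ENTRIES];			/* toy page table */

static void populate_early_shadow(void)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		pte[i] = early_shadow_page;	/* alias the shared page */
}

int main(void)
{
	populate_early_shadow();
	printf("entries 0 and 7 share one page: %d\n", pte[0] == pte[7]);
	printf("shadow byte read through entry 5: %d\n", pte[5][123]);
	return 0;
}

Both prints show the point of the scheme: many shadow entries share a single page whose bytes are all zero.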
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c12dcfde2ebd..ea51b2d898ec 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -8,10 +8,22 @@
8#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) 8#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
9#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) 9#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
10 10
11#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
12#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
13#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
14
15#ifdef CONFIG_KASAN_GENERIC
11#define KASAN_FREE_PAGE 0xFF /* page was freed */ 16#define KASAN_FREE_PAGE 0xFF /* page was freed */
12#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ 17#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
13#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ 18#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
14#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ 19#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
20#else
21#define KASAN_FREE_PAGE KASAN_TAG_INVALID
22#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
23#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
24#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
25#endif
26
15#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ 27#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
16 28
17/* 29/*
@@ -105,11 +117,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
105 << KASAN_SHADOW_SCALE_SHIFT); 117 << KASAN_SHADOW_SCALE_SHIFT);
106} 118}
107 119
120static inline bool addr_has_shadow(const void *addr)
121{
122 return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
123}
124
125void kasan_poison_shadow(const void *address, size_t size, u8 value);
126
127void check_memory_region(unsigned long addr, size_t size, bool write,
128 unsigned long ret_ip);
129
130void *find_first_bad_addr(void *addr, size_t size);
131const char *get_bug_type(struct kasan_access_info *info);
132
108void kasan_report(unsigned long addr, size_t size, 133void kasan_report(unsigned long addr, size_t size,
109 bool is_write, unsigned long ip); 134 bool is_write, unsigned long ip);
110void kasan_report_invalid_free(void *object, unsigned long ip); 135void kasan_report_invalid_free(void *object, unsigned long ip);
111 136
112#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) 137#if defined(CONFIG_KASAN_GENERIC) && \
138 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
113void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); 139void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
114void quarantine_reduce(void); 140void quarantine_reduce(void);
115void quarantine_remove_cache(struct kmem_cache *cache); 141void quarantine_remove_cache(struct kmem_cache *cache);
@@ -120,6 +146,37 @@ static inline void quarantine_reduce(void) { }
120static inline void quarantine_remove_cache(struct kmem_cache *cache) { } 146static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
121#endif 147#endif
122 148
149#ifdef CONFIG_KASAN_SW_TAGS
150
151void print_tags(u8 addr_tag, const void *addr);
152
153u8 random_tag(void);
154
155#else
156
157static inline void print_tags(u8 addr_tag, const void *addr) { }
158
159static inline u8 random_tag(void)
160{
161 return 0;
162}
163
164#endif
165
166#ifndef arch_kasan_set_tag
167#define arch_kasan_set_tag(addr, tag) ((void *)(addr))
168#endif
169#ifndef arch_kasan_reset_tag
170#define arch_kasan_reset_tag(addr) ((void *)(addr))
171#endif
172#ifndef arch_kasan_get_tag
173#define arch_kasan_get_tag(addr) 0
174#endif
175
176#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
177#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
178#define get_tag(addr) arch_kasan_get_tag(addr)
179
123/* 180/*
124 * Exported functions for interfaces called from assembly or from generated 181 * Exported functions for interfaces called from assembly or from generated
125 * code. Declarations here to avoid warning about missing declarations. 182 * code. Declarations here to avoid warning about missing declarations.
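
Note on the hunk above: the new arch_kasan_set_tag()/reset_tag()/get_tag() hooks assume the tag lives in the top byte of a 64-bit pointer (arm64 Top Byte Ignore). A minimal userspace sketch of what a default 64-bit implementation could look like; the shift, mask and example address are assumptions for illustration, not taken from an arch header:

/* Sketch of top-byte pointer tagging behind the set_tag/reset_tag/get_tag
 * macros above. Assumes a 64-bit address with the tag in bits 63..56. */
#include <stdio.h>
#include <stdint.h>

#define TAG_SHIFT	56
#define TAG_KERNEL	0xFFu	/* native kernel pointers, never checked */

static void *set_tag(const void *addr, uint8_t tag)
{
	uintptr_t a = (uintptr_t)addr;

	a = (a & ~((uintptr_t)0xFF << TAG_SHIFT)) | ((uintptr_t)tag << TAG_SHIFT);
	return (void *)a;
}

static uint8_t get_tag(const void *addr)
{
	return (uint8_t)((uintptr_t)addr >> TAG_SHIFT);
}

static void *reset_tag(const void *addr)
{
	return set_tag(addr, TAG_KERNEL);
}

int main(void)
{
	void *p = (void *)(uintptr_t)0xffff000012345678ull;
	void *q = set_tag(p, 0x2a);

	printf("tag=%02x untagged=%p\n", get_tag(q), reset_tag(q));
	return 0;
}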
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b209dbaefde8..978bc4a3eb51 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * KASAN quarantine. 3 * KASAN quarantine.
3 * 4 *
@@ -236,7 +237,7 @@ void quarantine_reduce(void)
236 * Update quarantine size in case of hotplug. Allocate a fraction of 237 * Update quarantine size in case of hotplug. Allocate a fraction of
237 * the installed memory to quarantine minus per-cpu queue limits. 238 * the installed memory to quarantine minus per-cpu queue limits.
238 */ 239 */
239 total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / 240 total_size = (totalram_pages() << PAGE_SHIFT) /
240 QUARANTINE_FRACTION; 241 QUARANTINE_FRACTION;
241 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); 242 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
242 new_quarantine_size = (total_size < percpu_quarantines) ? 243 new_quarantine_size = (total_size < percpu_quarantines) ?
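
Note on the hunk above: the one-line change follows the totalram_pages conversion elsewhere in this series; the global page counter is now read through an accessor instead of READ_ONCE on the raw variable. A hedged userspace equivalent of that accessor pattern, with C11 atomics standing in for the kernel's atomic_long_t helpers:

/* Userspace stand-in for the totalram_pages() accessor used above. */
#include <stdio.h>
#include <stdatomic.h>

static atomic_long _totalram_pages;

static long totalram_pages(void)
{
	return atomic_load(&_totalram_pages);
}

static void totalram_pages_add(long count)
{
	atomic_fetch_add(&_totalram_pages, count);
}

int main(void)
{
	totalram_pages_add(1L << 18);	/* pretend 1 GiB of 4 KiB pages */
	printf("quarantine budget: %ld pages\n",
	       totalram_pages() / 32);	/* illustrative fraction */
	return 0;
}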
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5c169aa688fd..ca9418fe9232 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,5 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains error reporting code. 3 * This file contains common generic and tag-based KASAN error reporting code.
3 * 4 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -39,129 +40,43 @@
39#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) 40#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
40#define SHADOW_ROWS_AROUND_ADDR 2 41#define SHADOW_ROWS_AROUND_ADDR 2
41 42
42static const void *find_first_bad_addr(const void *addr, size_t size) 43static unsigned long kasan_flags;
43{
44 u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
45 const void *first_bad_addr = addr;
46
47 while (!shadow_val && first_bad_addr < addr + size) {
48 first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
49 shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
50 }
51 return first_bad_addr;
52}
53 44
54static bool addr_has_shadow(struct kasan_access_info *info) 45#define KASAN_BIT_REPORTED 0
55{ 46#define KASAN_BIT_MULTI_SHOT 1
56 return (info->access_addr >=
57 kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
58}
59 47
60static const char *get_shadow_bug_type(struct kasan_access_info *info) 48bool kasan_save_enable_multi_shot(void)
61{ 49{
62 const char *bug_type = "unknown-crash"; 50 return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
63 u8 *shadow_addr;
64
65 info->first_bad_addr = find_first_bad_addr(info->access_addr,
66 info->access_size);
67
68 shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
69
70 /*
71 * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
72 * at the next shadow byte to determine the type of the bad access.
73 */
74 if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
75 shadow_addr++;
76
77 switch (*shadow_addr) {
78 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
79 /*
80 * In theory it's still possible to see these shadow values
81 * due to a data race in the kernel code.
82 */
83 bug_type = "out-of-bounds";
84 break;
85 case KASAN_PAGE_REDZONE:
86 case KASAN_KMALLOC_REDZONE:
87 bug_type = "slab-out-of-bounds";
88 break;
89 case KASAN_GLOBAL_REDZONE:
90 bug_type = "global-out-of-bounds";
91 break;
92 case KASAN_STACK_LEFT:
93 case KASAN_STACK_MID:
94 case KASAN_STACK_RIGHT:
95 case KASAN_STACK_PARTIAL:
96 bug_type = "stack-out-of-bounds";
97 break;
98 case KASAN_FREE_PAGE:
99 case KASAN_KMALLOC_FREE:
100 bug_type = "use-after-free";
101 break;
102 case KASAN_USE_AFTER_SCOPE:
103 bug_type = "use-after-scope";
104 break;
105 case KASAN_ALLOCA_LEFT:
106 case KASAN_ALLOCA_RIGHT:
107 bug_type = "alloca-out-of-bounds";
108 break;
109 }
110
111 return bug_type;
112} 51}
52EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
113 53
114static const char *get_wild_bug_type(struct kasan_access_info *info) 54void kasan_restore_multi_shot(bool enabled)
115{ 55{
116 const char *bug_type = "unknown-crash"; 56 if (!enabled)
117 57 clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
118 if ((unsigned long)info->access_addr < PAGE_SIZE)
119 bug_type = "null-ptr-deref";
120 else if ((unsigned long)info->access_addr < TASK_SIZE)
121 bug_type = "user-memory-access";
122 else
123 bug_type = "wild-memory-access";
124
125 return bug_type;
126} 58}
59EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
127 60
128static const char *get_bug_type(struct kasan_access_info *info) 61static int __init kasan_set_multi_shot(char *str)
129{ 62{
130 if (addr_has_shadow(info)) 63 set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
131 return get_shadow_bug_type(info); 64 return 1;
132 return get_wild_bug_type(info);
133} 65}
66__setup("kasan_multi_shot", kasan_set_multi_shot);
134 67
135static void print_error_description(struct kasan_access_info *info) 68static void print_error_description(struct kasan_access_info *info)
136{ 69{
137 const char *bug_type = get_bug_type(info);
138
139 pr_err("BUG: KASAN: %s in %pS\n", 70 pr_err("BUG: KASAN: %s in %pS\n",
140 bug_type, (void *)info->ip); 71 get_bug_type(info), (void *)info->ip);
141 pr_err("%s of size %zu at addr %px by task %s/%d\n", 72 pr_err("%s of size %zu at addr %px by task %s/%d\n",
142 info->is_write ? "Write" : "Read", info->access_size, 73 info->is_write ? "Write" : "Read", info->access_size,
143 info->access_addr, current->comm, task_pid_nr(current)); 74 info->access_addr, current->comm, task_pid_nr(current));
144} 75}
145 76
146static inline bool kernel_or_module_addr(const void *addr)
147{
148 if (addr >= (void *)_stext && addr < (void *)_end)
149 return true;
150 if (is_module_address((unsigned long)addr))
151 return true;
152 return false;
153}
154
155static inline bool init_task_stack_addr(const void *addr)
156{
157 return addr >= (void *)&init_thread_union.stack &&
158 (addr <= (void *)&init_thread_union.stack +
159 sizeof(init_thread_union.stack));
160}
161
162static DEFINE_SPINLOCK(report_lock); 77static DEFINE_SPINLOCK(report_lock);
163 78
164static void kasan_start_report(unsigned long *flags) 79static void start_report(unsigned long *flags)
165{ 80{
166 /* 81 /*
167 * Make sure we don't end up in loop. 82 * Make sure we don't end up in loop.
@@ -171,7 +86,7 @@ static void kasan_start_report(unsigned long *flags)
171 pr_err("==================================================================\n"); 86 pr_err("==================================================================\n");
172} 87}
173 88
174static void kasan_end_report(unsigned long *flags) 89static void end_report(unsigned long *flags)
175{ 90{
176 pr_err("==================================================================\n"); 91 pr_err("==================================================================\n");
177 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 92 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -249,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object,
249 describe_object_addr(cache, object, addr); 164 describe_object_addr(cache, object, addr);
250} 165}
251 166
167static inline bool kernel_or_module_addr(const void *addr)
168{
169 if (addr >= (void *)_stext && addr < (void *)_end)
170 return true;
171 if (is_module_address((unsigned long)addr))
172 return true;
173 return false;
174}
175
176static inline bool init_task_stack_addr(const void *addr)
177{
178 return addr >= (void *)&init_thread_union.stack &&
179 (addr <= (void *)&init_thread_union.stack +
180 sizeof(init_thread_union.stack));
181}
182
252static void print_address_description(void *addr) 183static void print_address_description(void *addr)
253{ 184{
254 struct page *page = addr_to_page(addr); 185 struct page *page = addr_to_page(addr);
@@ -326,126 +257,69 @@ static void print_shadow_for_address(const void *addr)
326 } 257 }
327} 258}
328 259
260static bool report_enabled(void)
261{
262 if (current->kasan_depth)
263 return false;
264 if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
265 return true;
266 return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
267}
268
329void kasan_report_invalid_free(void *object, unsigned long ip) 269void kasan_report_invalid_free(void *object, unsigned long ip)
330{ 270{
331 unsigned long flags; 271 unsigned long flags;
332 272
333 kasan_start_report(&flags); 273 start_report(&flags);
334 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); 274 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
275 print_tags(get_tag(object), reset_tag(object));
276 object = reset_tag(object);
335 pr_err("\n"); 277 pr_err("\n");
336 print_address_description(object); 278 print_address_description(object);
337 pr_err("\n"); 279 pr_err("\n");
338 print_shadow_for_address(object); 280 print_shadow_for_address(object);
339 kasan_end_report(&flags); 281 end_report(&flags);
340}
341
342static void kasan_report_error(struct kasan_access_info *info)
343{
344 unsigned long flags;
345
346 kasan_start_report(&flags);
347
348 print_error_description(info);
349 pr_err("\n");
350
351 if (!addr_has_shadow(info)) {
352 dump_stack();
353 } else {
354 print_address_description((void *)info->access_addr);
355 pr_err("\n");
356 print_shadow_for_address(info->first_bad_addr);
357 }
358
359 kasan_end_report(&flags);
360}
361
362static unsigned long kasan_flags;
363
364#define KASAN_BIT_REPORTED 0
365#define KASAN_BIT_MULTI_SHOT 1
366
367bool kasan_save_enable_multi_shot(void)
368{
369 return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
370}
371EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
372
373void kasan_restore_multi_shot(bool enabled)
374{
375 if (!enabled)
376 clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
377}
378EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
379
380static int __init kasan_set_multi_shot(char *str)
381{
382 set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
383 return 1;
384}
385__setup("kasan_multi_shot", kasan_set_multi_shot);
386
387static inline bool kasan_report_enabled(void)
388{
389 if (current->kasan_depth)
390 return false;
391 if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
392 return true;
393 return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
394} 282}
395 283
396void kasan_report(unsigned long addr, size_t size, 284void kasan_report(unsigned long addr, size_t size,
397 bool is_write, unsigned long ip) 285 bool is_write, unsigned long ip)
398{ 286{
399 struct kasan_access_info info; 287 struct kasan_access_info info;
288 void *tagged_addr;
289 void *untagged_addr;
290 unsigned long flags;
400 291
401 if (likely(!kasan_report_enabled())) 292 if (likely(!report_enabled()))
402 return; 293 return;
403 294
404 disable_trace_on_warning(); 295 disable_trace_on_warning();
405 296
406 info.access_addr = (void *)addr; 297 tagged_addr = (void *)addr;
407 info.first_bad_addr = (void *)addr; 298 untagged_addr = reset_tag(tagged_addr);
299
300 info.access_addr = tagged_addr;
301 if (addr_has_shadow(untagged_addr))
302 info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
303 else
304 info.first_bad_addr = untagged_addr;
408 info.access_size = size; 305 info.access_size = size;
409 info.is_write = is_write; 306 info.is_write = is_write;
410 info.ip = ip; 307 info.ip = ip;
411 308
412 kasan_report_error(&info); 309 start_report(&flags);
413}
414 310
311 print_error_description(&info);
312 if (addr_has_shadow(untagged_addr))
313 print_tags(get_tag(tagged_addr), info.first_bad_addr);
314 pr_err("\n");
415 315
416#define DEFINE_ASAN_REPORT_LOAD(size) \ 316 if (addr_has_shadow(untagged_addr)) {
417void __asan_report_load##size##_noabort(unsigned long addr) \ 317 print_address_description(untagged_addr);
418{ \ 318 pr_err("\n");
419 kasan_report(addr, size, false, _RET_IP_); \ 319 print_shadow_for_address(info.first_bad_addr);
420} \ 320 } else {
421EXPORT_SYMBOL(__asan_report_load##size##_noabort) 321 dump_stack();
422 322 }
423#define DEFINE_ASAN_REPORT_STORE(size) \
424void __asan_report_store##size##_noabort(unsigned long addr) \
425{ \
426 kasan_report(addr, size, true, _RET_IP_); \
427} \
428EXPORT_SYMBOL(__asan_report_store##size##_noabort)
429
430DEFINE_ASAN_REPORT_LOAD(1);
431DEFINE_ASAN_REPORT_LOAD(2);
432DEFINE_ASAN_REPORT_LOAD(4);
433DEFINE_ASAN_REPORT_LOAD(8);
434DEFINE_ASAN_REPORT_LOAD(16);
435DEFINE_ASAN_REPORT_STORE(1);
436DEFINE_ASAN_REPORT_STORE(2);
437DEFINE_ASAN_REPORT_STORE(4);
438DEFINE_ASAN_REPORT_STORE(8);
439DEFINE_ASAN_REPORT_STORE(16);
440
441void __asan_report_load_n_noabort(unsigned long addr, size_t size)
442{
443 kasan_report(addr, size, false, _RET_IP_);
444}
445EXPORT_SYMBOL(__asan_report_load_n_noabort);
446 323
447void __asan_report_store_n_noabort(unsigned long addr, size_t size) 324 end_report(&flags);
448{
449 kasan_report(addr, size, true, _RET_IP_);
450} 325}
451EXPORT_SYMBOL(__asan_report_store_n_noabort);
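
Note on the hunk above: report.c now keeps only the code shared by both KASAN modes, including the once-only/multi-shot gating that previously sat further down the file. A small userspace model of report_enabled(), with plain C11 atomics standing in for the kernel bitops:

/* Userspace model of report_enabled(): one bit remembers that a report
 * was already printed, another turns on multi-shot mode. */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

#define BIT_REPORTED	(1u << 0)
#define BIT_MULTI_SHOT	(1u << 1)

static atomic_uint kasan_flags;

static bool report_enabled(void)
{
	if (atomic_load(&kasan_flags) & BIT_MULTI_SHOT)
		return true;
	/* Only the caller that flips the bit from 0 to 1 may report. */
	return !(atomic_fetch_or(&kasan_flags, BIT_REPORTED) & BIT_REPORTED);
}

int main(void)
{
	printf("first report:  %d\n", report_enabled());	/* 1 */
	printf("second report: %d\n", report_enabled());	/* 0 */
	atomic_fetch_or(&kasan_flags, BIT_MULTI_SHOT);
	printf("multi-shot:    %d\n", report_enabled());	/* 1 */
	return 0;
}

The fetch-or mirrors test_and_set_bit(): only the caller that sets KASAN_BIT_REPORTED first gets to print, unless multi-shot is enabled.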
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 000000000000..0777649e07c4
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,161 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains core tag-based KASAN code.
4 *
5 * Copyright (c) 2018 Google, Inc.
6 * Author: Andrey Konovalov <andreyknvl@google.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 */
13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#define DISABLE_BRANCH_PROFILING
16
17#include <linux/export.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/kasan.h>
21#include <linux/kernel.h>
22#include <linux/kmemleak.h>
23#include <linux/linkage.h>
24#include <linux/memblock.h>
25#include <linux/memory.h>
26#include <linux/mm.h>
27#include <linux/module.h>
28#include <linux/printk.h>
29#include <linux/random.h>
30#include <linux/sched.h>
31#include <linux/sched/task_stack.h>
32#include <linux/slab.h>
33#include <linux/stacktrace.h>
34#include <linux/string.h>
35#include <linux/types.h>
36#include <linux/vmalloc.h>
37#include <linux/bug.h>
38
39#include "kasan.h"
40#include "../slab.h"
41
42static DEFINE_PER_CPU(u32, prng_state);
43
44void kasan_init_tags(void)
45{
46 int cpu;
47
48 for_each_possible_cpu(cpu)
49 per_cpu(prng_state, cpu) = get_random_u32();
50}
51
52/*
53 * If a preemption happens between this_cpu_read and this_cpu_write, the only
 54 * side effect is that we'll give a few objects allocated in different contexts
 55 * the same tag. Since tag-based KASAN is meant to be used as a probabilistic
56 * bug-detection debug feature, this doesn't have significant negative impact.
57 *
58 * Ideally the tags use strong randomness to prevent any attempts to predict
59 * them during explicit exploit attempts. But strong randomness is expensive,
60 * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
 61 * sequence has in fact a positive effect, since interrupts that randomly skew
 62 * the PRNG at unpredictable points do only good.
63 */
64u8 random_tag(void)
65{
66 u32 state = this_cpu_read(prng_state);
67
68 state = 1664525 * state + 1013904223;
69 this_cpu_write(prng_state, state);
70
71 return (u8)(state % (KASAN_TAG_MAX + 1));
72}
73
74void *kasan_reset_tag(const void *addr)
75{
76 return reset_tag(addr);
77}
78
79void check_memory_region(unsigned long addr, size_t size, bool write,
80 unsigned long ret_ip)
81{
82 u8 tag;
83 u8 *shadow_first, *shadow_last, *shadow;
84 void *untagged_addr;
85
86 if (unlikely(size == 0))
87 return;
88
89 tag = get_tag((const void *)addr);
90
91 /*
92 * Ignore accesses for pointers tagged with 0xff (native kernel
93 * pointer tag) to suppress false positives caused by kmap.
94 *
95 * Some kernel code was written to account for archs that don't keep
96 * high memory mapped all the time, but rather map and unmap particular
97 * pages when needed. Instead of storing a pointer to the kernel memory,
98 * this code saves the address of the page structure and offset within
99 * that page for later use. Those pages are then mapped and unmapped
100 * with kmap/kunmap when necessary and virt_to_page is used to get the
101 * virtual address of the page. For arm64 (that keeps the high memory
102 * mapped all the time), kmap is turned into a page_address call.
103
104 * The issue is that with use of the page_address + virt_to_page
105 * sequence the top byte value of the original pointer gets lost (gets
106 * set to KASAN_TAG_KERNEL (0xFF)).
107 */
108 if (tag == KASAN_TAG_KERNEL)
109 return;
110
111 untagged_addr = reset_tag((const void *)addr);
112 if (unlikely(untagged_addr <
113 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
114 kasan_report(addr, size, write, ret_ip);
115 return;
116 }
117 shadow_first = kasan_mem_to_shadow(untagged_addr);
118 shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
119 for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
120 if (*shadow != tag) {
121 kasan_report(addr, size, write, ret_ip);
122 return;
123 }
124 }
125}
126
127#define DEFINE_HWASAN_LOAD_STORE(size) \
128 void __hwasan_load##size##_noabort(unsigned long addr) \
129 { \
130 check_memory_region(addr, size, false, _RET_IP_); \
131 } \
132 EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
133 void __hwasan_store##size##_noabort(unsigned long addr) \
134 { \
135 check_memory_region(addr, size, true, _RET_IP_); \
136 } \
137 EXPORT_SYMBOL(__hwasan_store##size##_noabort)
138
139DEFINE_HWASAN_LOAD_STORE(1);
140DEFINE_HWASAN_LOAD_STORE(2);
141DEFINE_HWASAN_LOAD_STORE(4);
142DEFINE_HWASAN_LOAD_STORE(8);
143DEFINE_HWASAN_LOAD_STORE(16);
144
145void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
146{
147 check_memory_region(addr, size, false, _RET_IP_);
148}
149EXPORT_SYMBOL(__hwasan_loadN_noabort);
150
151void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
152{
153 check_memory_region(addr, size, true, _RET_IP_);
154}
155EXPORT_SYMBOL(__hwasan_storeN_noabort);
156
157void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
158{
159 kasan_poison_shadow((void *)addr, size, tag);
160}
161EXPORT_SYMBOL(__hwasan_tag_memory);
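
Note on the hunk above: random_tag() is a per-CPU linear congruential generator reduced to the usable tag range, with 0xFE and 0xFF reserved for KASAN_TAG_INVALID and KASAN_TAG_KERNEL. A standalone sketch of the same generator, with a single state variable replacing the per-CPU one:

/* Standalone version of the tag PRNG above: the same linear congruential
 * step, reduced to 0x00..0xFD so the reserved values never come out. */
#include <stdio.h>
#include <stdint.h>

#define TAG_MAX	0xFD

static uint32_t prng_state = 42;	/* any seed will do here */

static uint8_t random_tag(void)
{
	prng_state = 1664525 * prng_state + 1013904223;
	return (uint8_t)(prng_state % (TAG_MAX + 1));
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%02x ", random_tag());
	printf("\n");
	return 0;
}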
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
new file mode 100644
index 000000000000..8eaf5f722271
--- /dev/null
+++ b/mm/kasan/tags_report.c
@@ -0,0 +1,58 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains tag-based KASAN specific error reporting code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#include <linux/bitops.h>
18#include <linux/ftrace.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/printk.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/stackdepot.h>
26#include <linux/stacktrace.h>
27#include <linux/string.h>
28#include <linux/types.h>
29#include <linux/kasan.h>
30#include <linux/module.h>
31
32#include <asm/sections.h>
33
34#include "kasan.h"
35#include "../slab.h"
36
37const char *get_bug_type(struct kasan_access_info *info)
38{
39 return "invalid-access";
40}
41
42void *find_first_bad_addr(void *addr, size_t size)
43{
44 u8 tag = get_tag(addr);
45 void *p = reset_tag(addr);
46 void *end = p + size;
47
48 while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
49 p += KASAN_SHADOW_SCALE_SIZE;
50 return p;
51}
52
53void print_tags(u8 addr_tag, const void *addr)
54{
55 u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
56
57 pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
58}
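
Note on the hunk above: the tag-based find_first_bad_addr() advances one shadow granule at a time until the memory tag stops matching the pointer tag. A userspace sketch of that walk, with the shadow represented as a plain byte array and a 16-byte granule assumed purely for illustration:

/* Sketch of the tag-based first-bad-address walk above. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define GRANULE	16

static size_t first_bad_offset(uint8_t ptr_tag, const uint8_t *shadow,
			       size_t size)
{
	size_t off = 0;

	while (off < size && shadow[off / GRANULE] == ptr_tag)
		off += GRANULE;
	return off;	/* offset of the first mismatching granule */
}

int main(void)
{
	uint8_t shadow[] = { 0x2a, 0x2a, 0x5c };	/* third granule re-tagged */

	printf("bad access starts at +%zu\n",
	       first_bad_offset(0x2a, shadow, 48));	/* +32 */
	return 0;
}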
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43ce2f4d2551..4f017339ddb2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm,
944 int isolated = 0, result = 0; 944 int isolated = 0, result = 0;
945 struct mem_cgroup *memcg; 945 struct mem_cgroup *memcg;
946 struct vm_area_struct *vma; 946 struct vm_area_struct *vma;
947 unsigned long mmun_start; /* For mmu_notifiers */ 947 struct mmu_notifier_range range;
948 unsigned long mmun_end; /* For mmu_notifiers */
949 gfp_t gfp; 948 gfp_t gfp;
950 949
951 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 950 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1017 pte = pte_offset_map(pmd, address); 1016 pte = pte_offset_map(pmd, address);
1018 pte_ptl = pte_lockptr(mm, pmd); 1017 pte_ptl = pte_lockptr(mm, pmd);
1019 1018
1020 mmun_start = address; 1019 mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
1021 mmun_end = address + HPAGE_PMD_SIZE; 1020 mmu_notifier_invalidate_range_start(&range);
1022 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1023 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1021 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1024 /* 1022 /*
1025 * After this gup_fast can't run anymore. This also removes 1023 * After this gup_fast can't run anymore. This also removes
@@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1029 */ 1027 */
1030 _pmd = pmdp_collapse_flush(vma, address, pmd); 1028 _pmd = pmdp_collapse_flush(vma, address, pmd);
1031 spin_unlock(pmd_ptl); 1029 spin_unlock(pmd_ptl);
1032 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1030 mmu_notifier_invalidate_range_end(&range);
1033 1031
1034 spin_lock(pte_ptl); 1032 spin_lock(pte_ptl);
1035 isolated = __collapse_huge_page_isolate(vma, address, pte); 1033 isolated = __collapse_huge_page_isolate(vma, address, pte);
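
Note on the hunk above: this is the first of several hunks in the section that switch mmu notifier callers from passing (mm, start, end) to initialising a struct mmu_notifier_range once and handing that around. A stripped-down sketch of the pattern, with an opaque mm type and only the fields this series relies on (the real struct carries more):

/* Stripped-down sketch of the mmu_notifier_range conversion. */
#include <stdio.h>

struct mm_struct;	/* opaque stand-in */

struct mmu_notifier_range {
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
};

static void mmu_notifier_range_init(struct mmu_notifier_range *range,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	range->mm = mm;
	range->start = start;
	range->end = end;
}

int main(void)
{
	struct mmu_notifier_range range;

	/* khugepaged-style usage: one PMD-sized invalidation window. */
	mmu_notifier_range_init(&range, NULL, 0x200000UL, 0x200000UL + (2UL << 20));
	printf("invalidate [%#lx, %#lx)\n", range.start, range.end);
	return 0;
}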
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 877de4fa0720..f9d9dc250428 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1547,11 +1547,14 @@ static void kmemleak_scan(void)
1547 unsigned long pfn; 1547 unsigned long pfn;
1548 1548
1549 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1549 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1550 struct page *page; 1550 struct page *page = pfn_to_online_page(pfn);
1551 1551
1552 if (!pfn_valid(pfn)) 1552 if (!page)
1553 continue;
1554
1555 /* only scan pages belonging to this node */
1556 if (page_to_nid(page) != i)
1553 continue; 1557 continue;
1554 page = pfn_to_page(pfn);
1555 /* only scan if page is in use */ 1558 /* only scan if page is in use */
1556 if (page_count(page) == 0) 1559 if (page_count(page) == 0)
1557 continue; 1560 continue;
@@ -1647,7 +1650,7 @@ static void kmemleak_scan(void)
1647 */ 1650 */
1648static int kmemleak_scan_thread(void *arg) 1651static int kmemleak_scan_thread(void *arg)
1649{ 1652{
1650 static int first_run = 1; 1653 static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
1651 1654
1652 pr_info("Automatic memory scanning thread started\n"); 1655 pr_info("Automatic memory scanning thread started\n");
1653 set_user_nice(current, 10); 1656 set_user_nice(current, 10);
@@ -2141,9 +2144,11 @@ static int __init kmemleak_late_init(void)
2141 return -ENOMEM; 2144 return -ENOMEM;
2142 } 2145 }
2143 2146
2144 mutex_lock(&scan_mutex); 2147 if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) {
2145 start_scan_thread(); 2148 mutex_lock(&scan_mutex);
2146 mutex_unlock(&scan_mutex); 2149 start_scan_thread();
2150 mutex_unlock(&scan_mutex);
2151 }
2147 2152
2148 pr_info("Kernel memory leak detector initialized\n"); 2153 pr_info("Kernel memory leak detector initialized\n");
2149 2154
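
Note on the hunk above: kmemleak gates the automatic scan thread behind CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN and uses pfn_to_online_page() plus a node check when walking memory. A tiny sketch of the config gating, with the option modelled as a plain preprocessor constant and everything else a stand-in:

/* Sketch of the auto-scan gating added above. */
#include <stdio.h>

#ifndef CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN
#define CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN 0	/* build-time choice */
#endif

static void start_scan_thread(void)
{
	printf("automatic memory scanning thread started\n");
}

int main(void)
{
	if (CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)
		start_scan_thread();
	else
		printf("auto scan off; scans must be triggered manually\n");
	return 0;
}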
diff --git a/mm/ksm.c b/mm/ksm.c
index 5b0894b45ee5..6c48ad13b4c9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -25,7 +25,7 @@
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/rmap.h> 26#include <linux/rmap.h>
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/jhash.h> 28#include <linux/xxhash.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
@@ -296,6 +296,7 @@ static unsigned long ksm_run = KSM_RUN_STOP;
296static void wait_while_offlining(void); 296static void wait_while_offlining(void);
297 297
298static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 298static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
299static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
299static DEFINE_MUTEX(ksm_thread_mutex); 300static DEFINE_MUTEX(ksm_thread_mutex);
300static DEFINE_SPINLOCK(ksm_mmlist_lock); 301static DEFINE_SPINLOCK(ksm_mmlist_lock);
301 302
@@ -1009,7 +1010,7 @@ static u32 calc_checksum(struct page *page)
1009{ 1010{
1010 u32 checksum; 1011 u32 checksum;
1011 void *addr = kmap_atomic(page); 1012 void *addr = kmap_atomic(page);
1012 checksum = jhash2(addr, PAGE_SIZE / 4, 17); 1013 checksum = xxhash(addr, PAGE_SIZE, 0);
1013 kunmap_atomic(addr); 1014 kunmap_atomic(addr);
1014 return checksum; 1015 return checksum;
1015} 1016}
@@ -1042,8 +1043,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1042 }; 1043 };
1043 int swapped; 1044 int swapped;
1044 int err = -EFAULT; 1045 int err = -EFAULT;
1045 unsigned long mmun_start; /* For mmu_notifiers */ 1046 struct mmu_notifier_range range;
1046 unsigned long mmun_end; /* For mmu_notifiers */
1047 1047
1048 pvmw.address = page_address_in_vma(page, vma); 1048 pvmw.address = page_address_in_vma(page, vma);
1049 if (pvmw.address == -EFAULT) 1049 if (pvmw.address == -EFAULT)
@@ -1051,9 +1051,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1051 1051
1052 BUG_ON(PageTransCompound(page)); 1052 BUG_ON(PageTransCompound(page));
1053 1053
1054 mmun_start = pvmw.address; 1054 mmu_notifier_range_init(&range, mm, pvmw.address,
1055 mmun_end = pvmw.address + PAGE_SIZE; 1055 pvmw.address + PAGE_SIZE);
1056 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1056 mmu_notifier_invalidate_range_start(&range);
1057 1057
1058 if (!page_vma_mapped_walk(&pvmw)) 1058 if (!page_vma_mapped_walk(&pvmw))
1059 goto out_mn; 1059 goto out_mn;
@@ -1105,7 +1105,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1105out_unlock: 1105out_unlock:
1106 page_vma_mapped_walk_done(&pvmw); 1106 page_vma_mapped_walk_done(&pvmw);
1107out_mn: 1107out_mn:
1108 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1108 mmu_notifier_invalidate_range_end(&range);
1109out: 1109out:
1110 return err; 1110 return err;
1111} 1111}
@@ -1129,8 +1129,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1129 spinlock_t *ptl; 1129 spinlock_t *ptl;
1130 unsigned long addr; 1130 unsigned long addr;
1131 int err = -EFAULT; 1131 int err = -EFAULT;
1132 unsigned long mmun_start; /* For mmu_notifiers */ 1132 struct mmu_notifier_range range;
1133 unsigned long mmun_end; /* For mmu_notifiers */
1134 1133
1135 addr = page_address_in_vma(page, vma); 1134 addr = page_address_in_vma(page, vma);
1136 if (addr == -EFAULT) 1135 if (addr == -EFAULT)
@@ -1140,9 +1139,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1140 if (!pmd) 1139 if (!pmd)
1141 goto out; 1140 goto out;
1142 1141
1143 mmun_start = addr; 1142 mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
1144 mmun_end = addr + PAGE_SIZE; 1143 mmu_notifier_invalidate_range_start(&range);
1145 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1146 1144
1147 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 1145 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1148 if (!pte_same(*ptep, orig_pte)) { 1146 if (!pte_same(*ptep, orig_pte)) {
@@ -1188,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1188 pte_unmap_unlock(ptep, ptl); 1186 pte_unmap_unlock(ptep, ptl);
1189 err = 0; 1187 err = 0;
1190out_mn: 1188out_mn:
1191 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1189 mmu_notifier_invalidate_range_end(&range);
1192out: 1190out:
1193 return err; 1191 return err;
1194} 1192}
@@ -2391,6 +2389,8 @@ static int ksmd_should_run(void)
2391 2389
2392static int ksm_scan_thread(void *nothing) 2390static int ksm_scan_thread(void *nothing)
2393{ 2391{
2392 unsigned int sleep_ms;
2393
2394 set_freezable(); 2394 set_freezable();
2395 set_user_nice(current, 5); 2395 set_user_nice(current, 5);
2396 2396
@@ -2404,8 +2404,10 @@ static int ksm_scan_thread(void *nothing)
2404 try_to_freeze(); 2404 try_to_freeze();
2405 2405
2406 if (ksmd_should_run()) { 2406 if (ksmd_should_run()) {
2407 schedule_timeout_interruptible( 2407 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2408 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 2408 wait_event_interruptible_timeout(ksm_iter_wait,
2409 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2410 msecs_to_jiffies(sleep_ms));
2409 } else { 2411 } else {
2410 wait_event_freezable(ksm_thread_wait, 2412 wait_event_freezable(ksm_thread_wait,
2411 ksmd_should_run() || kthread_should_stop()); 2413 ksmd_should_run() || kthread_should_stop());
@@ -2824,6 +2826,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2824 return -EINVAL; 2826 return -EINVAL;
2825 2827
2826 ksm_thread_sleep_millisecs = msecs; 2828 ksm_thread_sleep_millisecs = msecs;
2829 wake_up_interruptible(&ksm_iter_wait);
2827 2830
2828 return count; 2831 return count;
2829} 2832}
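
Note on the hunks above: besides swapping jhash for xxhash in calc_checksum(), the ksm.c changes make ksmd sleep on a wait queue so a new sleep_millisecs value written through sysfs takes effect immediately instead of after the old interval expires. A userspace analogue of that wake-on-store pattern using a condition variable (compile with -lpthread); the function names mirror the kernel ones but the types are stand-ins:

/* Userspace analogue of the ksmd sleep change: wait with a timeout, but
 * wake early when the interval is updated. */
#include <stdio.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_wait = PTHREAD_COND_INITIALIZER;
static unsigned int sleep_ms = 200;

static void ksmd_sleep(void)
{
	struct timespec deadline;
	unsigned int seen;

	pthread_mutex_lock(&lock);
	seen = sleep_ms;
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += seen / 1000;
	deadline.tv_nsec += (long)(seen % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}
	/* Stop waiting on timeout or as soon as sleep_ms changes. */
	while (sleep_ms == seen &&
	       pthread_cond_timedwait(&iter_wait, &lock, &deadline) == 0)
		;
	pthread_mutex_unlock(&lock);
}

static void sleep_millisecs_store(unsigned int msecs)
{
	pthread_mutex_lock(&lock);
	sleep_ms = msecs;
	pthread_cond_broadcast(&iter_wait);	/* wake_up_interruptible() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	sleep_millisecs_store(20);
	ksmd_sleep();	/* waits ~20 ms, the freshly stored interval */
	printf("iteration done, sleep_ms=%u\n", sleep_ms);
	return 0;
}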
diff --git a/mm/madvise.c b/mm/madvise.c
index 6cb1ca93e290..21a7881a2db4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
458static int madvise_free_single_vma(struct vm_area_struct *vma, 458static int madvise_free_single_vma(struct vm_area_struct *vma,
459 unsigned long start_addr, unsigned long end_addr) 459 unsigned long start_addr, unsigned long end_addr)
460{ 460{
461 unsigned long start, end;
462 struct mm_struct *mm = vma->vm_mm; 461 struct mm_struct *mm = vma->vm_mm;
462 struct mmu_notifier_range range;
463 struct mmu_gather tlb; 463 struct mmu_gather tlb;
464 464
465 /* MADV_FREE works for only anon vma at the moment */ 465 /* MADV_FREE works for only anon vma at the moment */
466 if (!vma_is_anonymous(vma)) 466 if (!vma_is_anonymous(vma))
467 return -EINVAL; 467 return -EINVAL;
468 468
469 start = max(vma->vm_start, start_addr); 469 range.start = max(vma->vm_start, start_addr);
470 if (start >= vma->vm_end) 470 if (range.start >= vma->vm_end)
471 return -EINVAL; 471 return -EINVAL;
472 end = min(vma->vm_end, end_addr); 472 range.end = min(vma->vm_end, end_addr);
473 if (end <= vma->vm_start) 473 if (range.end <= vma->vm_start)
474 return -EINVAL; 474 return -EINVAL;
475 mmu_notifier_range_init(&range, mm, range.start, range.end);
475 476
476 lru_add_drain(); 477 lru_add_drain();
477 tlb_gather_mmu(&tlb, mm, start, end); 478 tlb_gather_mmu(&tlb, mm, range.start, range.end);
478 update_hiwater_rss(mm); 479 update_hiwater_rss(mm);
479 480
480 mmu_notifier_invalidate_range_start(mm, start, end); 481 mmu_notifier_invalidate_range_start(&range);
481 madvise_free_page_range(&tlb, vma, start, end); 482 madvise_free_page_range(&tlb, vma, range.start, range.end);
482 mmu_notifier_invalidate_range_end(mm, start, end); 483 mmu_notifier_invalidate_range_end(&range);
483 tlb_finish_mmu(&tlb, start, end); 484 tlb_finish_mmu(&tlb, range.start, range.end);
484 485
485 return 0; 486 return 0;
486} 487}
diff --git a/mm/memblock.c b/mm/memblock.c
index 81ae63ca78d0..022d4cbb3618 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -262,7 +262,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
262 phys_addr_t kernel_end, ret; 262 phys_addr_t kernel_end, ret;
263 263
264 /* pump up @end */ 264 /* pump up @end */
265 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 265 if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
266 end == MEMBLOCK_ALLOC_KASAN)
266 end = memblock.current_limit; 267 end = memblock.current_limit;
267 268
268 /* avoid allocating the first page */ 269 /* avoid allocating the first page */
@@ -800,7 +801,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
800 return memblock_remove_range(&memblock.memory, base, size); 801 return memblock_remove_range(&memblock.memory, base, size);
801} 802}
802 803
803 804/**
805 * memblock_free - free boot memory block
806 * @base: phys starting address of the boot memory block
807 * @size: size of the boot memory block in bytes
808 *
809 * Free boot memory block previously allocated by memblock_alloc_xx() API.
 810 * The freed memory will not be released to the buddy allocator.
811 */
804int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) 812int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
805{ 813{
806 phys_addr_t end = base + size - 1; 814 phys_addr_t end = base + size - 1;
@@ -1412,13 +1420,15 @@ again:
1412done: 1420done:
1413 ptr = phys_to_virt(alloc); 1421 ptr = phys_to_virt(alloc);
1414 1422
1415 /* 1423 /* Skip kmemleak for kasan_init() due to high volume. */
1416 * The min_count is set to 0 so that bootmem allocated blocks 1424 if (max_addr != MEMBLOCK_ALLOC_KASAN)
1417 * are never reported as leaks. This is because many of these blocks 1425 /*
1418 * are only referred via the physical address which is not 1426 * The min_count is set to 0 so that bootmem allocated
1419 * looked up by kmemleak. 1427 * blocks are never reported as leaks. This is because many
1420 */ 1428 * of these blocks are only referred via the physical
1421 kmemleak_alloc(ptr, size, 0, 0); 1429 * address which is not looked up by kmemleak.
1430 */
1431 kmemleak_alloc(ptr, size, 0, 0);
1422 1432
1423 return ptr; 1433 return ptr;
1424} 1434}
@@ -1537,24 +1547,6 @@ void * __init memblock_alloc_try_nid(
1537} 1547}
1538 1548
1539/** 1549/**
1540 * __memblock_free_early - free boot memory block
1541 * @base: phys starting address of the boot memory block
1542 * @size: size of the boot memory block in bytes
1543 *
1544 * Free boot memory block previously allocated by memblock_alloc_xx() API.
1545 * The freeing memory will not be released to the buddy allocator.
1546 */
1547void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1548{
1549 phys_addr_t end = base + size - 1;
1550
1551 memblock_dbg("%s: [%pa-%pa] %pF\n",
1552 __func__, &base, &end, (void *)_RET_IP_);
1553 kmemleak_free_part_phys(base, size);
1554 memblock_remove_range(&memblock.reserved, base, size);
1555}
1556
1557/**
1558 * __memblock_free_late - free bootmem block pages directly to buddy allocator 1550 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1559 * @base: phys starting address of the boot memory block 1551 * @base: phys starting address of the boot memory block
1560 * @size: size of the boot memory block in bytes 1552 * @size: size of the boot memory block in bytes
@@ -1576,7 +1568,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1576 1568
1577 for (; cursor < end; cursor++) { 1569 for (; cursor < end; cursor++) {
1578 memblock_free_pages(pfn_to_page(cursor), cursor, 0); 1570 memblock_free_pages(pfn_to_page(cursor), cursor, 0);
1579 totalram_pages++; 1571 totalram_pages_inc();
1580 } 1572 }
1581} 1573}
1582 1574
@@ -1950,7 +1942,7 @@ void reset_node_managed_pages(pg_data_t *pgdat)
1950 struct zone *z; 1942 struct zone *z;
1951 1943
1952 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1944 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1953 z->managed_pages = 0; 1945 atomic_long_set(&z->managed_pages, 0);
1954} 1946}
1955 1947
1956void __init reset_all_zones_managed_pages(void) 1948void __init reset_all_zones_managed_pages(void)
@@ -1978,7 +1970,7 @@ unsigned long __init memblock_free_all(void)
1978 reset_all_zones_managed_pages(); 1970 reset_all_zones_managed_pages();
1979 1971
1980 pages = free_low_memory_core_early(); 1972 pages = free_low_memory_core_early();
1981 totalram_pages += pages; 1973 totalram_pages_add(pages);
1982 1974
1983 return pages; 1975 return pages;
1984} 1976}
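
Note on the hunks above: memblock gains a MEMBLOCK_ALLOC_KASAN sentinel so the huge early shadow allocations resolve against the current limit while skipping kmemleak tracking. A toy sketch of the sentinel handling; the numeric values are placeholders, not the kernel's definitions:

/* Toy sketch of the MEMBLOCK_ALLOC_KASAN sentinel handling. */
#include <stdio.h>

#define ALLOC_ACCESSIBLE	0ULL
#define ALLOC_KASAN		1ULL

static unsigned long long current_limit = 1ULL << 32;

static unsigned long long clamp_end(unsigned long long end)
{
	if (end == ALLOC_ACCESSIBLE || end == ALLOC_KASAN)
		return current_limit;	/* "pump up @end" */
	return end;
}

int main(void)
{
	unsigned long long end = ALLOC_KASAN;
	int track_with_kmemleak = (end != ALLOC_KASAN);

	printf("end=%#llx kmemleak=%d\n", clamp_end(end), track_with_kmemleak);
	return 0;
}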
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e1469b80cb7..af7f18b32389 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1293,32 +1293,39 @@ static const char *const memcg1_stat_names[] = {
1293 1293
1294#define K(x) ((x) << (PAGE_SHIFT-10)) 1294#define K(x) ((x) << (PAGE_SHIFT-10))
1295/** 1295/**
1296 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1296 * mem_cgroup_print_oom_context: Print OOM information relevant to
1297 * memory controller.
1297 * @memcg: The memory cgroup that went over limit 1298 * @memcg: The memory cgroup that went over limit
1298 * @p: Task that is going to be killed 1299 * @p: Task that is going to be killed
1299 * 1300 *
1300 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1301 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1301 * enabled 1302 * enabled
1302 */ 1303 */
1303void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1304void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1304{ 1305{
1305 struct mem_cgroup *iter;
1306 unsigned int i;
1307
1308 rcu_read_lock(); 1306 rcu_read_lock();
1309 1307
1308 if (memcg) {
1309 pr_cont(",oom_memcg=");
1310 pr_cont_cgroup_path(memcg->css.cgroup);
1311 } else
1312 pr_cont(",global_oom");
1310 if (p) { 1313 if (p) {
1311 pr_info("Task in "); 1314 pr_cont(",task_memcg=");
1312 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1315 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1313 pr_cont(" killed as a result of limit of ");
1314 } else {
1315 pr_info("Memory limit reached of cgroup ");
1316 } 1316 }
1317
1318 pr_cont_cgroup_path(memcg->css.cgroup);
1319 pr_cont("\n");
1320
1321 rcu_read_unlock(); 1317 rcu_read_unlock();
1318}
1319
1320/**
1321 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1322 * memory controller.
1323 * @memcg: The memory cgroup that went over limit
1324 */
1325void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1326{
1327 struct mem_cgroup *iter;
1328 unsigned int i;
1322 1329
1323 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1330 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1324 K((u64)page_counter_read(&memcg->memory)), 1331 K((u64)page_counter_read(&memcg->memory)),
@@ -1666,6 +1673,9 @@ enum oom_status {
1666 1673
1667static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1674static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1668{ 1675{
1676 enum oom_status ret;
1677 bool locked;
1678
1669 if (order > PAGE_ALLOC_COSTLY_ORDER) 1679 if (order > PAGE_ALLOC_COSTLY_ORDER)
1670 return OOM_SKIPPED; 1680 return OOM_SKIPPED;
1671 1681
@@ -1700,10 +1710,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
1700 return OOM_ASYNC; 1710 return OOM_ASYNC;
1701 } 1711 }
1702 1712
1713 mem_cgroup_mark_under_oom(memcg);
1714
1715 locked = mem_cgroup_oom_trylock(memcg);
1716
1717 if (locked)
1718 mem_cgroup_oom_notify(memcg);
1719
1720 mem_cgroup_unmark_under_oom(memcg);
1703 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1721 if (mem_cgroup_out_of_memory(memcg, mask, order))
1704 return OOM_SUCCESS; 1722 ret = OOM_SUCCESS;
1723 else
1724 ret = OOM_FAILED;
1705 1725
1706 return OOM_FAILED; 1726 if (locked)
1727 mem_cgroup_oom_unlock(memcg);
1728
1729 return ret;
1707} 1730}
1708 1731
1709/** 1732/**
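
Note on the hunks above: the second memcontrol change reorders the charge-path OOM handling so eventfd listeners are notified while the oom lock is held, before the kill, and the lock is dropped on every exit path. A compressed sketch of that control flow, with printable no-op stand-ins for the locking and notification helpers:

/* Compressed sketch of the reordered charge-path OOM flow above. */
#include <stdio.h>
#include <stdbool.h>

enum oom_status { OOM_SUCCESS, OOM_FAILED };

static void mark_under_oom(void)	{ }
static void unmark_under_oom(void)	{ }
static bool oom_trylock(void)		{ return true; }
static void oom_unlock(void)		{ }
static void oom_notify(void)		{ printf("notify oom eventfd listeners\n"); }
static bool out_of_memory(void)		{ return true; }	/* pretend a kill worked */

static enum oom_status memcg_oom(void)
{
	enum oom_status ret;
	bool locked;

	mark_under_oom();
	locked = oom_trylock();
	if (locked)
		oom_notify();		/* listeners hear about it before the kill */
	unmark_under_oom();

	ret = out_of_memory() ? OOM_SUCCESS : OOM_FAILED;

	if (locked)
		oom_unlock();
	return ret;
}

int main(void)
{
	printf("oom status: %d\n", memcg_oom());
	return 0;
}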
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7c72f2a95785..6379fff1a5ff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
967 struct address_space *mapping; 967 struct address_space *mapping;
968 LIST_HEAD(tokill); 968 LIST_HEAD(tokill);
969 bool unmap_success; 969 bool unmap_success = true;
970 int kill = 1, forcekill; 970 int kill = 1, forcekill;
971 struct page *hpage = *hpagep; 971 struct page *hpage = *hpagep;
972 bool mlocked = PageMlocked(hpage); 972 bool mlocked = PageMlocked(hpage);
@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1028 if (kill) 1028 if (kill)
1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); 1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1030 1030
1031 unmap_success = try_to_unmap(hpage, ttu); 1031 if (!PageHuge(hpage)) {
1032 unmap_success = try_to_unmap(hpage, ttu);
1033 } else if (mapping) {
1034 /*
1035 * For hugetlb pages, try_to_unmap could potentially call
1036 * huge_pmd_unshare. Because of this, take semaphore in
1037 * write mode here and set TTU_RMAP_LOCKED to indicate we
 1038 * have taken the lock at this higher level.
1039 */
1040 i_mmap_lock_write(mapping);
1041 unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
1042 i_mmap_unlock_write(mapping);
1043 }
1032 if (!unmap_success) 1044 if (!unmap_success)
1033 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", 1045 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
1034 pfn, page_mapcount(hpage)); 1046 pfn, page_mapcount(hpage));
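
Note on the hunk above: for hugetlb pages the caller now takes i_mmap_rwsem in write mode itself and passes TTU_RMAP_LOCKED so the rmap walk does not try to take it again. A schematic sketch of that caller-locks-and-flags pattern, with invented flag values and printf stand-ins for the locking primitives:

/* Schematic of the hugetlb unmap change above. */
#include <stdio.h>
#include <stdbool.h>

#define TTU_BASE_FLAGS	0x1u
#define TTU_RMAP_LOCKED	0x2u

static void i_mmap_lock_write(void)	{ printf("i_mmap_rwsem: write lock\n"); }
static void i_mmap_unlock_write(void)	{ printf("i_mmap_rwsem: unlock\n"); }

static bool try_to_unmap(unsigned int flags)
{
	printf("unmap, rmap lock held by caller: %d\n",
	       !!(flags & TTU_RMAP_LOCKED));
	return true;
}

int main(void)
{
	bool is_huge = true;
	bool unmap_success;

	if (!is_huge) {
		unmap_success = try_to_unmap(TTU_BASE_FLAGS);
	} else {
		i_mmap_lock_write();
		unmap_success = try_to_unmap(TTU_BASE_FLAGS | TTU_RMAP_LOCKED);
		i_mmap_unlock_write();
	}
	printf("unmap_success=%d\n", unmap_success);
	return 0;
}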
diff --git a/mm/memory.c b/mm/memory.c
index 4ad2d293ddc2..2dd2f9ab57f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
973 unsigned long next; 973 unsigned long next;
974 unsigned long addr = vma->vm_start; 974 unsigned long addr = vma->vm_start;
975 unsigned long end = vma->vm_end; 975 unsigned long end = vma->vm_end;
976 unsigned long mmun_start; /* For mmu_notifiers */ 976 struct mmu_notifier_range range;
977 unsigned long mmun_end; /* For mmu_notifiers */
978 bool is_cow; 977 bool is_cow;
979 int ret; 978 int ret;
980 979
@@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1008 * is_cow_mapping() returns true. 1007 * is_cow_mapping() returns true.
1009 */ 1008 */
1010 is_cow = is_cow_mapping(vma->vm_flags); 1009 is_cow = is_cow_mapping(vma->vm_flags);
1011 mmun_start = addr; 1010
1012 mmun_end = end; 1011 if (is_cow) {
1013 if (is_cow) 1012 mmu_notifier_range_init(&range, src_mm, addr, end);
1014 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1013 mmu_notifier_invalidate_range_start(&range);
1015 mmun_end); 1014 }
1016 1015
1017 ret = 0; 1016 ret = 0;
1018 dst_pgd = pgd_offset(dst_mm, addr); 1017 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1029 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1028 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1030 1029
1031 if (is_cow) 1030 if (is_cow)
1032 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1031 mmu_notifier_invalidate_range_end(&range);
1033 return ret; 1032 return ret;
1034} 1033}
1035 1034
@@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb,
1332 struct vm_area_struct *vma, unsigned long start_addr, 1331 struct vm_area_struct *vma, unsigned long start_addr,
1333 unsigned long end_addr) 1332 unsigned long end_addr)
1334{ 1333{
1335 struct mm_struct *mm = vma->vm_mm; 1334 struct mmu_notifier_range range;
1336 1335
1337 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1336 mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
1337 mmu_notifier_invalidate_range_start(&range);
1338 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1338 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1339 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1339 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1340 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1340 mmu_notifier_invalidate_range_end(&range);
1341} 1341}
1342 1342
1343/** 1343/**
@@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb,
1351void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1351void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1352 unsigned long size) 1352 unsigned long size)
1353{ 1353{
1354 struct mm_struct *mm = vma->vm_mm; 1354 struct mmu_notifier_range range;
1355 struct mmu_gather tlb; 1355 struct mmu_gather tlb;
1356 unsigned long end = start + size;
1357 1356
1358 lru_add_drain(); 1357 lru_add_drain();
1359 tlb_gather_mmu(&tlb, mm, start, end); 1358 mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
1360 update_hiwater_rss(mm); 1359 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1361 mmu_notifier_invalidate_range_start(mm, start, end); 1360 update_hiwater_rss(vma->vm_mm);
1362 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1361 mmu_notifier_invalidate_range_start(&range);
1363 unmap_single_vma(&tlb, vma, start, end, NULL); 1362 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1364 mmu_notifier_invalidate_range_end(mm, start, end); 1363 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1365 tlb_finish_mmu(&tlb, start, end); 1364 mmu_notifier_invalidate_range_end(&range);
1365 tlb_finish_mmu(&tlb, start, range.end);
1366} 1366}
1367 1367
1368/** 1368/**
@@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1377static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1377static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1378 unsigned long size, struct zap_details *details) 1378 unsigned long size, struct zap_details *details)
1379{ 1379{
1380 struct mm_struct *mm = vma->vm_mm; 1380 struct mmu_notifier_range range;
1381 struct mmu_gather tlb; 1381 struct mmu_gather tlb;
1382 unsigned long end = address + size;
1383 1382
1384 lru_add_drain(); 1383 lru_add_drain();
1385 tlb_gather_mmu(&tlb, mm, address, end); 1384 mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
1386 update_hiwater_rss(mm); 1385 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1387 mmu_notifier_invalidate_range_start(mm, address, end); 1386 update_hiwater_rss(vma->vm_mm);
1388 unmap_single_vma(&tlb, vma, address, end, details); 1387 mmu_notifier_invalidate_range_start(&range);
1389 mmu_notifier_invalidate_range_end(mm, address, end); 1388 unmap_single_vma(&tlb, vma, address, range.end, details);
1390 tlb_finish_mmu(&tlb, address, end); 1389 mmu_notifier_invalidate_range_end(&range);
1390 tlb_finish_mmu(&tlb, address, range.end);
1391} 1391}
1392 1392
1393/** 1393/**
@@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2247 struct page *new_page = NULL; 2247 struct page *new_page = NULL;
2248 pte_t entry; 2248 pte_t entry;
2249 int page_copied = 0; 2249 int page_copied = 0;
2250 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2251 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2252 struct mem_cgroup *memcg; 2250 struct mem_cgroup *memcg;
2251 struct mmu_notifier_range range;
2253 2252
2254 if (unlikely(anon_vma_prepare(vma))) 2253 if (unlikely(anon_vma_prepare(vma)))
2255 goto oom; 2254 goto oom;
@@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2272 2271
2273 __SetPageUptodate(new_page); 2272 __SetPageUptodate(new_page);
2274 2273
2275 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2274 mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
2275 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2276 mmu_notifier_invalidate_range_start(&range);
2276 2277
2277 /* 2278 /*
2278 * Re-check the pte - we dropped the lock 2279 * Re-check the pte - we dropped the lock
@@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2349 * No need to double call mmu_notifier->invalidate_range() callback as 2350 * No need to double call mmu_notifier->invalidate_range() callback as
2350 * the above ptep_clear_flush_notify() did already call it. 2351 * the above ptep_clear_flush_notify() did already call it.
2351 */ 2352 */
2352 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); 2353 mmu_notifier_invalidate_range_only_end(&range);
2353 if (old_page) { 2354 if (old_page) {
2354 /* 2355 /*
2355 * Don't let another task, with possibly unlocked vma, 2356 * Don't let another task, with possibly unlocked vma,
@@ -3830,7 +3831,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3830 vmf.pud = pud_alloc(mm, p4d, address); 3831 vmf.pud = pud_alloc(mm, p4d, address);
3831 if (!vmf.pud) 3832 if (!vmf.pud)
3832 return VM_FAULT_OOM; 3833 return VM_FAULT_OOM;
3833 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { 3834 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3834 ret = create_huge_pud(&vmf); 3835 ret = create_huge_pud(&vmf);
3835 if (!(ret & VM_FAULT_FALLBACK)) 3836 if (!(ret & VM_FAULT_FALLBACK))
3836 return ret; 3837 return ret;
@@ -3856,7 +3857,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3856 vmf.pmd = pmd_alloc(mm, vmf.pud, address); 3857 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3857 if (!vmf.pmd) 3858 if (!vmf.pmd)
3858 return VM_FAULT_OOM; 3859 return VM_FAULT_OOM;
3859 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { 3860 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3860 ret = create_huge_pmd(&vmf); 3861 ret = create_huge_pmd(&vmf);
3861 if (!(ret & VM_FAULT_FALLBACK)) 3862 if (!(ret & VM_FAULT_FALLBACK))
3862 return ret; 3863 return ret;
@@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4030#endif /* __PAGETABLE_PMD_FOLDED */ 4031#endif /* __PAGETABLE_PMD_FOLDED */
4031 4032
4032static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, 4033static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4033 unsigned long *start, unsigned long *end, 4034 struct mmu_notifier_range *range,
4034 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) 4035 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4035{ 4036{
4036 pgd_t *pgd; 4037 pgd_t *pgd;
@@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4058 if (!pmdpp) 4059 if (!pmdpp)
4059 goto out; 4060 goto out;
4060 4061
4061 if (start && end) { 4062 if (range) {
4062 *start = address & PMD_MASK; 4063 mmu_notifier_range_init(range, mm, address & PMD_MASK,
4063 *end = *start + PMD_SIZE; 4064 (address & PMD_MASK) + PMD_SIZE);
4064 mmu_notifier_invalidate_range_start(mm, *start, *end); 4065 mmu_notifier_invalidate_range_start(range);
4065 } 4066 }
4066 *ptlp = pmd_lock(mm, pmd); 4067 *ptlp = pmd_lock(mm, pmd);
4067 if (pmd_huge(*pmd)) { 4068 if (pmd_huge(*pmd)) {
@@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4069 return 0; 4070 return 0;
4070 } 4071 }
4071 spin_unlock(*ptlp); 4072 spin_unlock(*ptlp);
4072 if (start && end) 4073 if (range)
4073 mmu_notifier_invalidate_range_end(mm, *start, *end); 4074 mmu_notifier_invalidate_range_end(range);
4074 } 4075 }
4075 4076
4076 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 4077 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4077 goto out; 4078 goto out;
4078 4079
4079 if (start && end) { 4080 if (range) {
4080 *start = address & PAGE_MASK; 4081 range->start = address & PAGE_MASK;
4081 *end = *start + PAGE_SIZE; 4082 range->end = range->start + PAGE_SIZE;
4082 mmu_notifier_invalidate_range_start(mm, *start, *end); 4083 mmu_notifier_invalidate_range_start(range);
4083 } 4084 }
4084 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 4085 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4085 if (!pte_present(*ptep)) 4086 if (!pte_present(*ptep))
@@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4088 return 0; 4089 return 0;
4089unlock: 4090unlock:
4090 pte_unmap_unlock(ptep, *ptlp); 4091 pte_unmap_unlock(ptep, *ptlp);
4091 if (start && end) 4092 if (range)
4092 mmu_notifier_invalidate_range_end(mm, *start, *end); 4093 mmu_notifier_invalidate_range_end(range);
4093out: 4094out:
4094 return -EINVAL; 4095 return -EINVAL;
4095} 4096}
@@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4101 4102
4102 /* (void) is needed to make gcc happy */ 4103 /* (void) is needed to make gcc happy */
4103 (void) __cond_lock(*ptlp, 4104 (void) __cond_lock(*ptlp,
4104 !(res = __follow_pte_pmd(mm, address, NULL, NULL, 4105 !(res = __follow_pte_pmd(mm, address, NULL,
4105 ptepp, NULL, ptlp))); 4106 ptepp, NULL, ptlp)));
4106 return res; 4107 return res;
4107} 4108}
4108 4109
4109int follow_pte_pmd(struct mm_struct *mm, unsigned long address, 4110int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4110 unsigned long *start, unsigned long *end, 4111 struct mmu_notifier_range *range,
4111 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) 4112 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4112{ 4113{
4113 int res; 4114 int res;
4114 4115
4115 /* (void) is needed to make gcc happy */ 4116 /* (void) is needed to make gcc happy */
4116 (void) __cond_lock(*ptlp, 4117 (void) __cond_lock(*ptlp,
4117 !(res = __follow_pte_pmd(mm, address, start, end, 4118 !(res = __follow_pte_pmd(mm, address, range,
4118 ptepp, pmdpp, ptlp))); 4119 ptepp, pmdpp, ptlp)));
4119 return res; 4120 return res;
4120} 4121}
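
All of the mm/memory.c hunks above follow the same conversion: the (mm, start, end) triple that used to be passed to every mmu notifier call is bundled once into a struct mmu_notifier_range, and the helpers take only that object. Below is a minimal userspace sketch of that calling convention; the struct and helpers are simplified stand-ins with printf bodies, not the real kernel API.

#include <stdio.h>

struct mm_struct { int dummy; };

/* Simplified stand-in for the kernel's struct mmu_notifier_range. */
struct mmu_notifier_range {
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
};

static void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                    struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        range->mm = mm;
        range->start = start;
        range->end = end;
}

/* The real helpers walk the registered notifiers; these just log. */
static void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r)
{
        printf("invalidate_range_start [%#lx, %#lx)\n", r->start, r->end);
}

static void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r)
{
        printf("invalidate_range_end   [%#lx, %#lx)\n", r->start, r->end);
}

int main(void)
{
        struct mm_struct mm = { 0 };
        struct mmu_notifier_range range;

        /* One init replaces the old mmun_start/mmun_end local pair. */
        mmu_notifier_range_init(&range, &mm, 0x1000, 0x2000);
        mmu_notifier_invalidate_range_start(&range);
        /* ... copy or unmap page tables for [range.start, range.end) ... */
        mmu_notifier_invalidate_range_end(&range);
        return 0;
}
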
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2b2b3ccbbfb5..b9a667d36c55 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
34#include <linux/hugetlb.h> 34#include <linux/hugetlb.h>
35#include <linux/memblock.h> 35#include <linux/memblock.h>
36#include <linux/compaction.h> 36#include <linux/compaction.h>
37#include <linux/rmap.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39 40
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
253 if (pfn_valid(phys_start_pfn)) 254 if (pfn_valid(phys_start_pfn))
254 return -EEXIST; 255 return -EEXIST;
255 256
256 ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap); 257 ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
257 if (ret < 0) 258 if (ret < 0)
258 return ret; 259 return ret;
259 260
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
743 int nid = pgdat->node_id; 744 int nid = pgdat->node_id;
744 unsigned long flags; 745 unsigned long flags;
745 746
746 if (zone_is_empty(zone))
747 init_currently_empty_zone(zone, start_pfn, nr_pages);
748
749 clear_zone_contiguous(zone); 747 clear_zone_contiguous(zone);
750 748
751 /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ 749 /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
752 pgdat_resize_lock(pgdat, &flags); 750 pgdat_resize_lock(pgdat, &flags);
753 zone_span_writelock(zone); 751 zone_span_writelock(zone);
752 if (zone_is_empty(zone))
753 init_currently_empty_zone(zone, start_pfn, nr_pages);
754 resize_zone_range(zone, start_pfn, nr_pages); 754 resize_zone_range(zone, start_pfn, nr_pages);
755 zone_span_writeunlock(zone); 755 zone_span_writeunlock(zone);
756 resize_pgdat_range(pgdat, start_pfn, nr_pages); 756 resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
1078 * 1078 *
1079 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG 1079 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
1080 */ 1080 */
1081int __ref add_memory_resource(int nid, struct resource *res, bool online) 1081int __ref add_memory_resource(int nid, struct resource *res)
1082{ 1082{
1083 u64 start, size; 1083 u64 start, size;
1084 bool new_node = false; 1084 bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
1133 mem_hotplug_done(); 1133 mem_hotplug_done();
1134 1134
1135 /* online pages if requested */ 1135 /* online pages if requested */
1136 if (online) 1136 if (memhp_auto_online)
1137 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1137 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1138 NULL, online_memory_block); 1138 NULL, online_memory_block);
1139 1139
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
1157 if (IS_ERR(res)) 1157 if (IS_ERR(res))
1158 return PTR_ERR(res); 1158 return PTR_ERR(res);
1159 1159
1160 ret = add_memory_resource(nid, res, memhp_auto_online); 1160 ret = add_memory_resource(nid, res);
1161 if (ret < 0) 1161 if (ret < 0)
1162 release_memory_resource(res); 1162 release_memory_resource(res);
1163 return ret; 1163 return ret;
@@ -1226,7 +1226,7 @@ static bool is_pageblock_removable_nolock(struct page *page)
1226 if (!zone_spans_pfn(zone, pfn)) 1226 if (!zone_spans_pfn(zone, pfn))
1227 return false; 1227 return false;
1228 1228
1229 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); 1229 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
1230} 1230}
1231 1231
1232/* Checks if this range of memory is likely to be hot-removable. */ 1232/* Checks if this range of memory is likely to be hot-removable. */
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
1339 return new_page_nodemask(page, nid, &nmask); 1339 return new_page_nodemask(page, nid, &nmask);
1340} 1340}
1341 1341
1342#define NR_OFFLINE_AT_ONCE_PAGES (256)
1343static int 1342static int
1344do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1343do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1345{ 1344{
1346 unsigned long pfn; 1345 unsigned long pfn;
1347 struct page *page; 1346 struct page *page;
1348 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1349 int not_managed = 0; 1347 int not_managed = 0;
1350 int ret = 0; 1348 int ret = 0;
1351 LIST_HEAD(source); 1349 LIST_HEAD(source);
1352 1350
1353 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1351 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1354 if (!pfn_valid(pfn)) 1352 if (!pfn_valid(pfn))
1355 continue; 1353 continue;
1356 page = pfn_to_page(pfn); 1354 page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1362 ret = -EBUSY; 1360 ret = -EBUSY;
1363 break; 1361 break;
1364 } 1362 }
1365 if (isolate_huge_page(page, &source)) 1363 isolate_huge_page(page, &source);
1366 move_pages -= 1 << compound_order(head);
1367 continue; 1364 continue;
1368 } else if (PageTransHuge(page)) 1365 } else if (PageTransHuge(page))
1369 pfn = page_to_pfn(compound_head(page)) 1366 pfn = page_to_pfn(compound_head(page))
1370 + hpage_nr_pages(page) - 1; 1367 + hpage_nr_pages(page) - 1;
1371 1368
1369 /*
1370 * HWPoison pages have elevated reference counts so the migration would
1371 * fail on them. It also doesn't make any sense to migrate them in the
1372 * first place. Still try to unmap such a page in case it is still mapped
 1373 * (e.g. the current hwpoison implementation doesn't unmap KSM pages, so keep
 1374 * the unmap as a catch-all safety net).
1375 */
1376 if (PageHWPoison(page)) {
1377 if (WARN_ON(PageLRU(page)))
1378 isolate_lru_page(page);
1379 if (page_mapped(page))
1380 try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
1381 continue;
1382 }
1383
1372 if (!get_page_unless_zero(page)) 1384 if (!get_page_unless_zero(page))
1373 continue; 1385 continue;
1374 /* 1386 /*
@@ -1382,16 +1394,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1382 if (!ret) { /* Success */ 1394 if (!ret) { /* Success */
1383 put_page(page); 1395 put_page(page);
1384 list_add_tail(&page->lru, &source); 1396 list_add_tail(&page->lru, &source);
1385 move_pages--;
1386 if (!__PageMovable(page)) 1397 if (!__PageMovable(page))
1387 inc_node_page_state(page, NR_ISOLATED_ANON + 1398 inc_node_page_state(page, NR_ISOLATED_ANON +
1388 page_is_file_cache(page)); 1399 page_is_file_cache(page));
1389 1400
1390 } else { 1401 } else {
1391#ifdef CONFIG_DEBUG_VM 1402 pr_warn("failed to isolate pfn %lx\n", pfn);
1392 pr_alert("failed to isolate pfn %lx\n", pfn);
1393 dump_page(page, "isolation failed"); 1403 dump_page(page, "isolation failed");
1394#endif
1395 put_page(page); 1404 put_page(page);
1396 /* Because we don't have big zone->lock. we should 1405 /* Because we don't have big zone->lock. we should
1397 check this again here. */ 1406 check this again here. */
@@ -1411,8 +1420,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1411 /* Allocate a new page from the nearest neighbor node */ 1420 /* Allocate a new page from the nearest neighbor node */
1412 ret = migrate_pages(&source, new_node_page, NULL, 0, 1421 ret = migrate_pages(&source, new_node_page, NULL, 0,
1413 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1422 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1414 if (ret) 1423 if (ret) {
1424 list_for_each_entry(page, &source, lru) {
1425 pr_warn("migrating pfn %lx failed ret:%d ",
1426 page_to_pfn(page), ret);
1427 dump_page(page, "migration failure");
1428 }
1415 putback_movable_pages(&source); 1429 putback_movable_pages(&source);
1430 }
1416 } 1431 }
1417out: 1432out:
1418 return ret; 1433 return ret;
@@ -1553,12 +1568,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1553 unsigned long valid_start, valid_end; 1568 unsigned long valid_start, valid_end;
1554 struct zone *zone; 1569 struct zone *zone;
1555 struct memory_notify arg; 1570 struct memory_notify arg;
1556 1571 char *reason;
1557 /* at least, alignment against pageblock is necessary */
1558 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1559 return -EINVAL;
1560 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1561 return -EINVAL;
1562 1572
1563 mem_hotplug_begin(); 1573 mem_hotplug_begin();
1564 1574
@@ -1567,7 +1577,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
1567 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, 1577 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
1568 &valid_end)) { 1578 &valid_end)) {
1569 mem_hotplug_done(); 1579 mem_hotplug_done();
1570 return -EINVAL; 1580 ret = -EINVAL;
1581 reason = "multizone range";
1582 goto failed_removal;
1571 } 1583 }
1572 1584
1573 zone = page_zone(pfn_to_page(valid_start)); 1585 zone = page_zone(pfn_to_page(valid_start));
@@ -1576,10 +1588,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
1576 1588
1577 /* set above range as isolated */ 1589 /* set above range as isolated */
1578 ret = start_isolate_page_range(start_pfn, end_pfn, 1590 ret = start_isolate_page_range(start_pfn, end_pfn,
1579 MIGRATE_MOVABLE, true); 1591 MIGRATE_MOVABLE,
1592 SKIP_HWPOISON | REPORT_FAILURE);
1580 if (ret) { 1593 if (ret) {
1581 mem_hotplug_done(); 1594 mem_hotplug_done();
1582 return ret; 1595 reason = "failure to isolate range";
1596 goto failed_removal;
1583 } 1597 }
1584 1598
1585 arg.start_pfn = start_pfn; 1599 arg.start_pfn = start_pfn;
@@ -1588,37 +1602,47 @@ static int __ref __offline_pages(unsigned long start_pfn,
1588 1602
1589 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1603 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1590 ret = notifier_to_errno(ret); 1604 ret = notifier_to_errno(ret);
1591 if (ret) 1605 if (ret) {
1592 goto failed_removal; 1606 reason = "notifier failure";
1607 goto failed_removal_isolated;
1608 }
1593 1609
1594 pfn = start_pfn; 1610 do {
1595repeat: 1611 for (pfn = start_pfn; pfn;) {
1596 /* start memory hot removal */ 1612 if (signal_pending(current)) {
1597 ret = -EINTR; 1613 ret = -EINTR;
1598 if (signal_pending(current)) 1614 reason = "signal backoff";
1599 goto failed_removal; 1615 goto failed_removal_isolated;
1616 }
1600 1617
1601 cond_resched(); 1618 cond_resched();
1602 lru_add_drain_all(); 1619 lru_add_drain_all();
1603 drain_all_pages(zone); 1620 drain_all_pages(zone);
1621
1622 pfn = scan_movable_pages(pfn, end_pfn);
1623 if (pfn) {
1624 /*
1625 * TODO: fatal migration failures should bail
1626 * out
1627 */
1628 do_migrate_range(pfn, end_pfn);
1629 }
1630 }
1604 1631
1605 pfn = scan_movable_pages(start_pfn, end_pfn); 1632 /*
1606 if (pfn) { /* We have movable pages */ 1633 * Dissolve free hugepages in the memory block before doing
1607 ret = do_migrate_range(pfn, end_pfn); 1634 * offlining actually in order to make hugetlbfs's object
1608 goto repeat; 1635 * counting consistent.
1609 } 1636 */
1637 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1638 if (ret) {
1639 reason = "failure to dissolve huge pages";
1640 goto failed_removal_isolated;
1641 }
1642 /* check again */
1643 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1644 } while (offlined_pages < 0);
1610 1645
1611 /*
1612 * dissolve free hugepages in the memory block before doing offlining
1613 * actually in order to make hugetlbfs's object counting consistent.
1614 */
1615 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1616 if (ret)
1617 goto failed_removal;
1618 /* check again */
1619 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1620 if (offlined_pages < 0)
1621 goto repeat;
1622 pr_info("Offlined Pages %ld\n", offlined_pages); 1646 pr_info("Offlined Pages %ld\n", offlined_pages);
1623 /* Ok, all of our target is isolated. 1647 /* Ok, all of our target is isolated.
1624 We cannot do rollback at this point. */ 1648 We cannot do rollback at this point. */
@@ -1654,13 +1678,15 @@ repeat:
1654 mem_hotplug_done(); 1678 mem_hotplug_done();
1655 return 0; 1679 return 0;
1656 1680
1681failed_removal_isolated:
1682 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1657failed_removal: 1683failed_removal:
1658 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", 1684 pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
1659 (unsigned long long) start_pfn << PAGE_SHIFT, 1685 (unsigned long long) start_pfn << PAGE_SHIFT,
1660 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1686 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
1687 reason);
1661 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1688 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1662 /* pushback to free area */ 1689 /* pushback to free area */
1663 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1664 mem_hotplug_done(); 1690 mem_hotplug_done();
1665 return ret; 1691 return ret;
1666} 1692}
@@ -1753,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
1753 return 0; 1779 return 0;
1754} 1780}
1755 1781
1756static void unmap_cpu_on_node(pg_data_t *pgdat)
1757{
1758#ifdef CONFIG_ACPI_NUMA
1759 int cpu;
1760
1761 for_each_possible_cpu(cpu)
1762 if (cpu_to_node(cpu) == pgdat->node_id)
1763 numa_clear_node(cpu);
1764#endif
1765}
1766
1767static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1768{
1769 int ret;
1770
1771 ret = check_cpu_on_node(pgdat);
1772 if (ret)
1773 return ret;
1774
1775 /*
1776 * the node will be offlined when we come here, so we can clear
1777 * the cpu_to_node() now.
1778 */
1779
1780 unmap_cpu_on_node(pgdat);
1781 return 0;
1782}
1783
1784/** 1782/**
1785 * try_offline_node 1783 * try_offline_node
1786 * @nid: the node ID 1784 * @nid: the node ID
@@ -1813,7 +1811,7 @@ void try_offline_node(int nid)
1813 return; 1811 return;
1814 } 1812 }
1815 1813
1816 if (check_and_unmap_cpu_on_node(pgdat)) 1814 if (check_cpu_on_node(pgdat))
1817 return; 1815 return;
1818 1816
1819 /* 1817 /*
@@ -1858,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
1858 memblock_free(start, size); 1856 memblock_free(start, size);
1859 memblock_remove(start, size); 1857 memblock_remove(start, size);
1860 1858
1861 arch_remove_memory(start, size, NULL); 1859 arch_remove_memory(nid, start, size, NULL);
1862 1860
1863 try_offline_node(nid); 1861 try_offline_node(nid);
1864 1862
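
The __offline_pages() rework above replaces the old repeat: label with an inner scan/migrate loop nested in a do/while that re-checks isolation. Below is a small userspace model of just that control flow, with the mm helpers mocked so the retry behaviour is visible; the pfn range, pass counts, and return values are made up.

#include <stdio.h>

/* Pretend movable pages are found on the first two scans only. */
static unsigned long scan_movable_pages_mock(unsigned long pfn,
                                             unsigned long end_pfn)
{
        static int passes;

        return (pfn < end_pfn && passes++ < 2) ? pfn : 0;
}

static void do_migrate_range_mock(unsigned long pfn, unsigned long end_pfn)
{
        printf("migrating [%#lx, %#lx)\n", pfn, end_pfn);
}

/* First check reports a failure (<0), the second one succeeds. */
static long check_pages_isolated_mock(unsigned long start, unsigned long end)
{
        static int checks;

        return checks++ ? (long)(end - start) : -1;
}

int main(void)
{
        unsigned long start_pfn = 0x1000, end_pfn = 0x1200, pfn;
        long offlined_pages;

        do {
                for (pfn = start_pfn; pfn;) {
                        /* the kernel also checks signal_pending() and
                         * drains the LRU/pcp lists on every pass */
                        pfn = scan_movable_pages_mock(pfn, end_pfn);
                        if (pfn)
                                do_migrate_range_mock(pfn, end_pfn);
                }
                /* dissolve_free_huge_pages() and its error path omitted */
                offlined_pages = check_pages_isolated_mock(start_pfn, end_pfn);
        } while (offlined_pages < 0);

        printf("Offlined Pages %ld\n", offlined_pages);
        return 0;
}
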
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..5d1839a9148d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -327,16 +327,13 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
327 327
328 /* 328 /*
329 * Once page cache replacement of page migration started, page_count 329 * Once page cache replacement of page migration started, page_count
330 * *must* be zero. And, we don't want to call wait_on_page_locked() 330 * is zero; but we must not call put_and_wait_on_page_locked() without
331 * against a page without get_page(). 331 * a ref. Use get_page_unless_zero(), and just fault again if it fails.
332 * So, we use get_page_unless_zero(), here. Even failed, page fault
333 * will occur again.
334 */ 332 */
335 if (!get_page_unless_zero(page)) 333 if (!get_page_unless_zero(page))
336 goto out; 334 goto out;
337 pte_unmap_unlock(ptep, ptl); 335 pte_unmap_unlock(ptep, ptl);
338 wait_on_page_locked(page); 336 put_and_wait_on_page_locked(page);
339 put_page(page);
340 return; 337 return;
341out: 338out:
342 pte_unmap_unlock(ptep, ptl); 339 pte_unmap_unlock(ptep, ptl);
@@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
370 if (!get_page_unless_zero(page)) 367 if (!get_page_unless_zero(page))
371 goto unlock; 368 goto unlock;
372 spin_unlock(ptl); 369 spin_unlock(ptl);
373 wait_on_page_locked(page); 370 put_and_wait_on_page_locked(page);
374 put_page(page);
375 return; 371 return;
376unlock: 372unlock:
377 spin_unlock(ptl); 373 spin_unlock(ptl);
378} 374}
379#endif 375#endif
380 376
381#ifdef CONFIG_BLOCK 377static int expected_page_refs(struct page *page)
382/* Returns true if all buffers are successfully locked */
383static bool buffer_migrate_lock_buffers(struct buffer_head *head,
384 enum migrate_mode mode)
385{ 378{
386 struct buffer_head *bh = head; 379 int expected_count = 1;
387
388 /* Simple case, sync compaction */
389 if (mode != MIGRATE_ASYNC) {
390 do {
391 get_bh(bh);
392 lock_buffer(bh);
393 bh = bh->b_this_page;
394
395 } while (bh != head);
396 380
397 return true; 381 /*
398 } 382 * Device public or private pages have an extra refcount as they are
399 383 * ZONE_DEVICE pages.
400 /* async case, we cannot block on lock_buffer so use trylock_buffer */ 384 */
401 do { 385 expected_count += is_device_private_page(page);
402 get_bh(bh); 386 expected_count += is_device_public_page(page);
403 if (!trylock_buffer(bh)) { 387 if (page_mapping(page))
404 /* 388 expected_count += hpage_nr_pages(page) + page_has_private(page);
405 * We failed to lock the buffer and cannot stall in
406 * async migration. Release the taken locks
407 */
408 struct buffer_head *failed_bh = bh;
409 put_bh(failed_bh);
410 bh = head;
411 while (bh != failed_bh) {
412 unlock_buffer(bh);
413 put_bh(bh);
414 bh = bh->b_this_page;
415 }
416 return false;
417 }
418 389
419 bh = bh->b_this_page; 390 return expected_count;
420 } while (bh != head);
421 return true;
422}
423#else
424static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
425 enum migrate_mode mode)
426{
427 return true;
428} 391}
429#endif /* CONFIG_BLOCK */
430 392
431/* 393/*
432 * Replace the page in the mapping. 394 * Replace the page in the mapping.
@@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
437 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 399 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
438 */ 400 */
439int migrate_page_move_mapping(struct address_space *mapping, 401int migrate_page_move_mapping(struct address_space *mapping,
440 struct page *newpage, struct page *page, 402 struct page *newpage, struct page *page, enum migrate_mode mode,
441 struct buffer_head *head, enum migrate_mode mode,
442 int extra_count) 403 int extra_count)
443{ 404{
444 XA_STATE(xas, &mapping->i_pages, page_index(page)); 405 XA_STATE(xas, &mapping->i_pages, page_index(page));
445 struct zone *oldzone, *newzone; 406 struct zone *oldzone, *newzone;
446 int dirty; 407 int dirty;
447 int expected_count = 1 + extra_count; 408 int expected_count = expected_page_refs(page) + extra_count;
448
449 /*
450 * Device public or private pages have an extra refcount as they are
451 * ZONE_DEVICE pages.
452 */
453 expected_count += is_device_private_page(page);
454 expected_count += is_device_public_page(page);
455 409
456 if (!mapping) { 410 if (!mapping) {
457 /* Anonymous page without mapping */ 411 /* Anonymous page without mapping */
@@ -471,8 +425,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
471 newzone = page_zone(newpage); 425 newzone = page_zone(newpage);
472 426
473 xas_lock_irq(&xas); 427 xas_lock_irq(&xas);
474
475 expected_count += hpage_nr_pages(page) + page_has_private(page);
476 if (page_count(page) != expected_count || xas_load(&xas) != page) { 428 if (page_count(page) != expected_count || xas_load(&xas) != page) {
477 xas_unlock_irq(&xas); 429 xas_unlock_irq(&xas);
478 return -EAGAIN; 430 return -EAGAIN;
@@ -484,20 +436,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
484 } 436 }
485 437
486 /* 438 /*
487 * In the async migration case of moving a page with buffers, lock the
488 * buffers using trylock before the mapping is moved. If the mapping
489 * was moved, we later failed to lock the buffers and could not move
490 * the mapping back due to an elevated page count, we would have to
491 * block waiting on other references to be dropped.
492 */
493 if (mode == MIGRATE_ASYNC && head &&
494 !buffer_migrate_lock_buffers(head, mode)) {
495 page_ref_unfreeze(page, expected_count);
496 xas_unlock_irq(&xas);
497 return -EAGAIN;
498 }
499
500 /*
501 * Now we know that no one else is looking at the page: 439 * Now we know that no one else is looking at the page:
502 * no turning back from here. 440 * no turning back from here.
503 */ 441 */
@@ -748,7 +686,7 @@ int migrate_page(struct address_space *mapping,
748 686
749 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
750 688
751 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); 689 rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
752 690
753 if (rc != MIGRATEPAGE_SUCCESS) 691 if (rc != MIGRATEPAGE_SUCCESS)
754 return rc; 692 return rc;
@@ -762,34 +700,98 @@ int migrate_page(struct address_space *mapping,
762EXPORT_SYMBOL(migrate_page); 700EXPORT_SYMBOL(migrate_page);
763 701
764#ifdef CONFIG_BLOCK 702#ifdef CONFIG_BLOCK
765/* 703/* Returns true if all buffers are successfully locked */
766 * Migration function for pages with buffers. This function can only be used 704static bool buffer_migrate_lock_buffers(struct buffer_head *head,
767 * if the underlying filesystem guarantees that no other references to "page" 705 enum migrate_mode mode)
768 * exist. 706{
769 */ 707 struct buffer_head *bh = head;
770int buffer_migrate_page(struct address_space *mapping, 708
771 struct page *newpage, struct page *page, enum migrate_mode mode) 709 /* Simple case, sync compaction */
710 if (mode != MIGRATE_ASYNC) {
711 do {
712 get_bh(bh);
713 lock_buffer(bh);
714 bh = bh->b_this_page;
715
716 } while (bh != head);
717
718 return true;
719 }
720
721 /* async case, we cannot block on lock_buffer so use trylock_buffer */
722 do {
723 get_bh(bh);
724 if (!trylock_buffer(bh)) {
725 /*
726 * We failed to lock the buffer and cannot stall in
727 * async migration. Release the taken locks
728 */
729 struct buffer_head *failed_bh = bh;
730 put_bh(failed_bh);
731 bh = head;
732 while (bh != failed_bh) {
733 unlock_buffer(bh);
734 put_bh(bh);
735 bh = bh->b_this_page;
736 }
737 return false;
738 }
739
740 bh = bh->b_this_page;
741 } while (bh != head);
742 return true;
743}
744
745static int __buffer_migrate_page(struct address_space *mapping,
746 struct page *newpage, struct page *page, enum migrate_mode mode,
747 bool check_refs)
772{ 748{
773 struct buffer_head *bh, *head; 749 struct buffer_head *bh, *head;
774 int rc; 750 int rc;
751 int expected_count;
775 752
776 if (!page_has_buffers(page)) 753 if (!page_has_buffers(page))
777 return migrate_page(mapping, newpage, page, mode); 754 return migrate_page(mapping, newpage, page, mode);
778 755
756 /* Check whether page does not have extra refs before we do more work */
757 expected_count = expected_page_refs(page);
758 if (page_count(page) != expected_count)
759 return -EAGAIN;
760
779 head = page_buffers(page); 761 head = page_buffers(page);
762 if (!buffer_migrate_lock_buffers(head, mode))
763 return -EAGAIN;
780 764
781 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); 765 if (check_refs) {
766 bool busy;
767 bool invalidated = false;
782 768
783 if (rc != MIGRATEPAGE_SUCCESS) 769recheck_buffers:
784 return rc; 770 busy = false;
771 spin_lock(&mapping->private_lock);
772 bh = head;
773 do {
774 if (atomic_read(&bh->b_count)) {
775 busy = true;
776 break;
777 }
778 bh = bh->b_this_page;
779 } while (bh != head);
780 spin_unlock(&mapping->private_lock);
781 if (busy) {
782 if (invalidated) {
783 rc = -EAGAIN;
784 goto unlock_buffers;
785 }
786 invalidate_bh_lrus();
787 invalidated = true;
788 goto recheck_buffers;
789 }
790 }
785 791
786 /* 792 rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
787 * In the async case, migrate_page_move_mapping locked the buffers 793 if (rc != MIGRATEPAGE_SUCCESS)
788 * with an IRQ-safe spinlock held. In the sync case, the buffers 794 goto unlock_buffers;
789 * need to be locked now
790 */
791 if (mode != MIGRATE_ASYNC)
792 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
793 795
794 ClearPagePrivate(page); 796 ClearPagePrivate(page);
795 set_page_private(newpage, page_private(page)); 797 set_page_private(newpage, page_private(page));
@@ -811,6 +813,8 @@ int buffer_migrate_page(struct address_space *mapping,
811 else 813 else
812 migrate_page_states(newpage, page); 814 migrate_page_states(newpage, page);
813 815
816 rc = MIGRATEPAGE_SUCCESS;
817unlock_buffers:
814 bh = head; 818 bh = head;
815 do { 819 do {
816 unlock_buffer(bh); 820 unlock_buffer(bh);
@@ -819,9 +823,32 @@ int buffer_migrate_page(struct address_space *mapping,
819 823
820 } while (bh != head); 824 } while (bh != head);
821 825
822 return MIGRATEPAGE_SUCCESS; 826 return rc;
827}
828
829/*
830 * Migration function for pages with buffers. This function can only be used
831 * if the underlying filesystem guarantees that no other references to "page"
832 * exist. For example attached buffer heads are accessed only under page lock.
833 */
834int buffer_migrate_page(struct address_space *mapping,
835 struct page *newpage, struct page *page, enum migrate_mode mode)
836{
837 return __buffer_migrate_page(mapping, newpage, page, mode, false);
823} 838}
824EXPORT_SYMBOL(buffer_migrate_page); 839EXPORT_SYMBOL(buffer_migrate_page);
840
841/*
842 * Same as above except that this variant is more careful and checks that there
843 * are also no buffer head references. This function is the right one for
844 * mappings where buffer heads are directly looked up and referenced (such as
845 * block device mappings).
846 */
847int buffer_migrate_page_norefs(struct address_space *mapping,
848 struct page *newpage, struct page *page, enum migrate_mode mode)
849{
850 return __buffer_migrate_page(mapping, newpage, page, mode, true);
851}
825#endif 852#endif
826 853
827/* 854/*
@@ -1297,8 +1324,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1297 goto put_anon; 1324 goto put_anon;
1298 1325
1299 if (page_mapped(hpage)) { 1326 if (page_mapped(hpage)) {
1327 struct address_space *mapping = page_mapping(hpage);
1328
1329 /*
1330 * try_to_unmap could potentially call huge_pmd_unshare.
1331 * Because of this, take semaphore in write mode here and
1332 * set TTU_RMAP_LOCKED to let lower levels know we have
1333 * taken the lock.
1334 */
1335 i_mmap_lock_write(mapping);
1300 try_to_unmap(hpage, 1336 try_to_unmap(hpage,
1301 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1337 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
1338 TTU_RMAP_LOCKED);
1339 i_mmap_unlock_write(mapping);
1302 page_was_mapped = 1; 1340 page_was_mapped = 1;
1303 } 1341 }
1304 1342
@@ -2303,6 +2341,7 @@ next:
2303 */ 2341 */
2304static void migrate_vma_collect(struct migrate_vma *migrate) 2342static void migrate_vma_collect(struct migrate_vma *migrate)
2305{ 2343{
2344 struct mmu_notifier_range range;
2306 struct mm_walk mm_walk; 2345 struct mm_walk mm_walk;
2307 2346
2308 mm_walk.pmd_entry = migrate_vma_collect_pmd; 2347 mm_walk.pmd_entry = migrate_vma_collect_pmd;
@@ -2314,13 +2353,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
2314 mm_walk.mm = migrate->vma->vm_mm; 2353 mm_walk.mm = migrate->vma->vm_mm;
2315 mm_walk.private = migrate; 2354 mm_walk.private = migrate;
2316 2355
2317 mmu_notifier_invalidate_range_start(mm_walk.mm, 2356 mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
2318 migrate->start, 2357 migrate->end);
2319 migrate->end); 2358 mmu_notifier_invalidate_range_start(&range);
2320 walk_page_range(migrate->start, migrate->end, &mm_walk); 2359 walk_page_range(migrate->start, migrate->end, &mm_walk);
2321 mmu_notifier_invalidate_range_end(mm_walk.mm, 2360 mmu_notifier_invalidate_range_end(&range);
2322 migrate->start,
2323 migrate->end);
2324 2361
2325 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2362 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2326} 2363}
@@ -2701,9 +2738,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2701{ 2738{
2702 const unsigned long npages = migrate->npages; 2739 const unsigned long npages = migrate->npages;
2703 const unsigned long start = migrate->start; 2740 const unsigned long start = migrate->start;
2704 struct vm_area_struct *vma = migrate->vma; 2741 struct mmu_notifier_range range;
2705 struct mm_struct *mm = vma->vm_mm; 2742 unsigned long addr, i;
2706 unsigned long addr, i, mmu_start;
2707 bool notified = false; 2743 bool notified = false;
2708 2744
2709 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2745 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2722,11 +2758,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2722 continue; 2758 continue;
2723 } 2759 }
2724 if (!notified) { 2760 if (!notified) {
2725 mmu_start = addr;
2726 notified = true; 2761 notified = true;
2727 mmu_notifier_invalidate_range_start(mm, 2762
2728 mmu_start, 2763 mmu_notifier_range_init(&range,
2729 migrate->end); 2764 migrate->vma->vm_mm,
2765 addr, migrate->end);
2766 mmu_notifier_invalidate_range_start(&range);
2730 } 2767 }
2731 migrate_vma_insert_page(migrate, addr, newpage, 2768 migrate_vma_insert_page(migrate, addr, newpage,
2732 &migrate->src[i], 2769 &migrate->src[i],
@@ -2767,8 +2804,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2767 * did already call it. 2804 * did already call it.
2768 */ 2805 */
2769 if (notified) 2806 if (notified)
2770 mmu_notifier_invalidate_range_only_end(mm, mmu_start, 2807 mmu_notifier_invalidate_range_only_end(&range);
2771 migrate->end);
2772} 2808}
2773 2809
2774/* 2810/*
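
The refactoring above centralizes the reference-count expectation in expected_page_refs() and lets __buffer_migrate_page() bail out with -EAGAIN before locking anything. Below is a userspace model of that early check, with struct page replaced by a mock carrying only the fields the calculation reads; the field names and values are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Mock page carrying only the fields the calculation reads. */
struct page_model {
        int refcount;           /* page_count(page)           */
        bool has_mapping;       /* page_mapping(page) != NULL */
        bool has_private;       /* page_has_private(page)     */
        int nr_subpages;        /* hpage_nr_pages(page)       */
        bool is_zone_device;    /* device private/public page */
};

static int expected_page_refs(const struct page_model *p)
{
        int expected = 1;               /* the migration caller's reference */

        expected += p->is_zone_device;  /* ZONE_DEVICE pages hold one more  */
        if (p->has_mapping)
                expected += p->nr_subpages + p->has_private;
        return expected;
}

int main(void)
{
        struct page_model page = {
                .refcount = 3, .has_mapping = true,
                .has_private = true, .nr_subpages = 1,
        };

        /* Mirrors the early bail-out in __buffer_migrate_page(). */
        if (page.refcount != expected_page_refs(&page))
                puts("extra references: would return -EAGAIN");
        else
                puts("refcount matches: safe to keep going");
        return 0;
}
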
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a530789b..33917105a3a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void)
146 s32 batch = max_t(s32, nr*2, 32); 146 s32 batch = max_t(s32, nr*2, 32);
147 147
148 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ 148 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
149 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); 149 memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
150 150
151 vm_committed_as_batch = max_t(s32, memsized_batch, batch); 151 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
152} 152}
diff --git a/mm/mmap.c b/mm/mmap.c
index 7bb64381e77c..f901065c4c64 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2973,16 +2973,6 @@ out:
2973 return ret; 2973 return ret;
2974} 2974}
2975 2975
2976static inline void verify_mm_writelocked(struct mm_struct *mm)
2977{
2978#ifdef CONFIG_DEBUG_VM
2979 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2980 WARN_ON(1);
2981 up_read(&mm->mmap_sem);
2982 }
2983#endif
2984}
2985
2986/* 2976/*
2987 * this is really a simplified "do_mmap". it only handles 2977 * this is really a simplified "do_mmap". it only handles
2988 * anonymous maps. eventually we may be able to do some 2978 * anonymous maps. eventually we may be able to do some
@@ -3010,12 +3000,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
3010 return error; 3000 return error;
3011 3001
3012 /* 3002 /*
3013 * mm->mmap_sem is required to protect against another thread
3014 * changing the mappings in case we sleep.
3015 */
3016 verify_mm_writelocked(mm);
3017
3018 /*
3019 * Clear old maps. this also does some error checking for us 3003 * Clear old maps. this also does some error checking for us
3020 */ 3004 */
3021 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 3005 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5119ff846769..9c884abc7850 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu,
35} 35}
36EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); 36EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
37 37
38void mmu_notifier_synchronize(void)
39{
40 /* Wait for any running method to finish. */
41 srcu_barrier(&srcu);
42}
43EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
44
45/* 38/*
46 * This function can't run concurrently against mmu_notifier_register 39 * This function can't run concurrently against mmu_notifier_register
47 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 40 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -174,22 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
174 srcu_read_unlock(&srcu, id); 167 srcu_read_unlock(&srcu, id);
175} 168}
176 169
177int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 170int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
178 unsigned long start, unsigned long end,
179 bool blockable)
180{ 171{
181 struct mmu_notifier *mn; 172 struct mmu_notifier *mn;
182 int ret = 0; 173 int ret = 0;
183 int id; 174 int id;
184 175
185 id = srcu_read_lock(&srcu); 176 id = srcu_read_lock(&srcu);
186 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 177 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
187 if (mn->ops->invalidate_range_start) { 178 if (mn->ops->invalidate_range_start) {
188 int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); 179 int _ret = mn->ops->invalidate_range_start(mn, range);
189 if (_ret) { 180 if (_ret) {
190 pr_info("%pS callback failed with %d in %sblockable context.\n", 181 pr_info("%pS callback failed with %d in %sblockable context.\n",
191 mn->ops->invalidate_range_start, _ret, 182 mn->ops->invalidate_range_start, _ret,
192 !blockable ? "non-" : ""); 183 !range->blockable ? "non-" : "");
193 ret = _ret; 184 ret = _ret;
194 } 185 }
195 } 186 }
@@ -200,16 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
200} 191}
201EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); 192EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
202 193
203void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 194void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
204 unsigned long start,
205 unsigned long end,
206 bool only_end) 195 bool only_end)
207{ 196{
208 struct mmu_notifier *mn; 197 struct mmu_notifier *mn;
209 int id; 198 int id;
210 199
211 id = srcu_read_lock(&srcu); 200 id = srcu_read_lock(&srcu);
212 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 201 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
213 /* 202 /*
214 * Call invalidate_range here too to avoid the need for the 203 * Call invalidate_range here too to avoid the need for the
215 * subsystem of having to register an invalidate_range_end 204 * subsystem of having to register an invalidate_range_end
@@ -224,9 +213,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
224 * already happen under page table lock. 213 * already happen under page table lock.
225 */ 214 */
226 if (!only_end && mn->ops->invalidate_range) 215 if (!only_end && mn->ops->invalidate_range)
227 mn->ops->invalidate_range(mn, mm, start, end); 216 mn->ops->invalidate_range(mn, range->mm,
217 range->start,
218 range->end);
228 if (mn->ops->invalidate_range_end) 219 if (mn->ops->invalidate_range_end)
229 mn->ops->invalidate_range_end(mn, mm, start, end); 220 mn->ops->invalidate_range_end(mn, range);
230 } 221 }
231 srcu_read_unlock(&srcu, id); 222 srcu_read_unlock(&srcu, id);
232} 223}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d331620b9e5..36cb358db170 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
167 pgprot_t newprot, int dirty_accountable, int prot_numa) 167 pgprot_t newprot, int dirty_accountable, int prot_numa)
168{ 168{
169 pmd_t *pmd; 169 pmd_t *pmd;
170 struct mm_struct *mm = vma->vm_mm;
171 unsigned long next; 170 unsigned long next;
172 unsigned long pages = 0; 171 unsigned long pages = 0;
173 unsigned long nr_huge_updates = 0; 172 unsigned long nr_huge_updates = 0;
174 unsigned long mni_start = 0; 173 struct mmu_notifier_range range;
174
175 range.start = 0;
175 176
176 pmd = pmd_offset(pud, addr); 177 pmd = pmd_offset(pud, addr);
177 do { 178 do {
@@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
183 goto next; 184 goto next;
184 185
185 /* invoke the mmu notifier if the pmd is populated */ 186 /* invoke the mmu notifier if the pmd is populated */
186 if (!mni_start) { 187 if (!range.start) {
187 mni_start = addr; 188 mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
188 mmu_notifier_invalidate_range_start(mm, mni_start, end); 189 mmu_notifier_invalidate_range_start(&range);
189 } 190 }
190 191
191 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { 192 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -214,8 +215,8 @@ next:
214 cond_resched(); 215 cond_resched();
215 } while (pmd++, addr = next, addr != end); 216 } while (pmd++, addr = next, addr != end);
216 217
217 if (mni_start) 218 if (range.start)
218 mmu_notifier_invalidate_range_end(mm, mni_start, end); 219 mmu_notifier_invalidate_range_end(&range);
219 220
220 if (nr_huge_updates) 221 if (nr_huge_updates)
221 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); 222 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
diff --git a/mm/mremap.c b/mm/mremap.c
index 7f9f9180e401..def01d86e36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
197 bool need_rmap_locks) 197 bool need_rmap_locks)
198{ 198{
199 unsigned long extent, next, old_end; 199 unsigned long extent, next, old_end;
200 struct mmu_notifier_range range;
200 pmd_t *old_pmd, *new_pmd; 201 pmd_t *old_pmd, *new_pmd;
201 unsigned long mmun_start; /* For mmu_notifiers */
202 unsigned long mmun_end; /* For mmu_notifiers */
203 202
204 old_end = old_addr + len; 203 old_end = old_addr + len;
205 flush_cache_range(vma, old_addr, old_end); 204 flush_cache_range(vma, old_addr, old_end);
206 205
207 mmun_start = old_addr; 206 mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
208 mmun_end = old_end; 207 mmu_notifier_invalidate_range_start(&range);
209 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
210 208
211 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 209 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
212 cond_resched(); 210 cond_resched();
@@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
247 new_pmd, new_addr, need_rmap_locks); 245 new_pmd, new_addr, need_rmap_locks);
248 } 246 }
249 247
250 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 248 mmu_notifier_invalidate_range_end(&range);
251 249
252 return len + old_addr - old_end; /* how much done */ 250 return len + old_addr - old_end; /* how much done */
253} 251}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6589f60d5018..f0e8cd9edb1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
245 return points > 0 ? points : 1; 245 return points > 0 ? points : 1;
246} 246}
247 247
248enum oom_constraint { 248static const char * const oom_constraint_text[] = {
249 CONSTRAINT_NONE, 249 [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
250 CONSTRAINT_CPUSET, 250 [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
251 CONSTRAINT_MEMORY_POLICY, 251 [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
252 CONSTRAINT_MEMCG, 252 [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
253}; 253};
254 254
255/* 255/*
@@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
269 } 269 }
270 270
271 /* Default to all available memory */ 271 /* Default to all available memory */
272 oc->totalpages = totalram_pages + total_swap_pages; 272 oc->totalpages = totalram_pages() + total_swap_pages;
273 273
274 if (!IS_ENABLED(CONFIG_NUMA)) 274 if (!IS_ENABLED(CONFIG_NUMA))
275 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
@@ -428,19 +428,29 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
428 rcu_read_unlock(); 428 rcu_read_unlock();
429} 429}
430 430
431static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
432{
433 /* one line summary of the oom killer context. */
434 pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
435 oom_constraint_text[oc->constraint],
436 nodemask_pr_args(oc->nodemask));
437 cpuset_print_current_mems_allowed();
438 mem_cgroup_print_oom_context(oc->memcg, victim);
439 pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
440 from_kuid(&init_user_ns, task_uid(victim)));
441}
442
431static void dump_header(struct oom_control *oc, struct task_struct *p) 443static void dump_header(struct oom_control *oc, struct task_struct *p)
432{ 444{
433 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", 445 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
434 current->comm, oc->gfp_mask, &oc->gfp_mask, 446 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
435 nodemask_pr_args(oc->nodemask), oc->order,
436 current->signal->oom_score_adj); 447 current->signal->oom_score_adj);
437 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 448 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
438 pr_warn("COMPACTION is disabled!!!\n"); 449 pr_warn("COMPACTION is disabled!!!\n");
439 450
440 cpuset_print_current_mems_allowed();
441 dump_stack(); 451 dump_stack();
442 if (is_memcg_oom(oc)) 452 if (is_memcg_oom(oc))
443 mem_cgroup_print_oom_info(oc->memcg, p); 453 mem_cgroup_print_oom_meminfo(oc->memcg);
444 else { 454 else {
445 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); 455 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
446 if (is_dump_unreclaim_slabs()) 456 if (is_dump_unreclaim_slabs())
@@ -448,6 +458,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
448 } 458 }
449 if (sysctl_oom_dump_tasks) 459 if (sysctl_oom_dump_tasks)
450 dump_tasks(oc->memcg, oc->nodemask); 460 dump_tasks(oc->memcg, oc->nodemask);
461 if (p)
462 dump_oom_summary(oc, p);
451} 463}
452 464
453/* 465/*
@@ -516,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
516 * count elevated without a good reason. 528 * count elevated without a good reason.
517 */ 529 */
518 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { 530 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
519 const unsigned long start = vma->vm_start; 531 struct mmu_notifier_range range;
520 const unsigned long end = vma->vm_end;
521 struct mmu_gather tlb; 532 struct mmu_gather tlb;
522 533
523 tlb_gather_mmu(&tlb, mm, start, end); 534 mmu_notifier_range_init(&range, mm, vma->vm_start,
524 if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { 535 vma->vm_end);
525 tlb_finish_mmu(&tlb, start, end); 536 tlb_gather_mmu(&tlb, mm, range.start, range.end);
537 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
538 tlb_finish_mmu(&tlb, range.start, range.end);
526 ret = false; 539 ret = false;
527 continue; 540 continue;
528 } 541 }
529 unmap_page_range(&tlb, vma, start, end, NULL); 542 unmap_page_range(&tlb, vma, range.start, range.end, NULL);
530 mmu_notifier_invalidate_range_end(mm, start, end); 543 mmu_notifier_invalidate_range_end(&range);
531 tlb_finish_mmu(&tlb, start, end); 544 tlb_finish_mmu(&tlb, range.start, range.end);
532 } 545 }
533 } 546 }
534 547
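
oom_kill.c now keeps only a constraint-name lookup table (the enum itself is presumably moved to a shared header, which is outside this hunk) and prints a one-line summary per kill. Below is a small standalone example of the designated-initializer table and the rough shape of the summary line, with a local copy of the enum and made-up task values.

#include <stdio.h>

/* Local copy of the enum, for the sketch only. */
enum oom_constraint {
        CONSTRAINT_NONE,
        CONSTRAINT_CPUSET,
        CONSTRAINT_MEMORY_POLICY,
        CONSTRAINT_MEMCG,
};

/* String table indexed by the enum via designated initializers. */
static const char * const oom_constraint_text[] = {
        [CONSTRAINT_NONE]          = "CONSTRAINT_NONE",
        [CONSTRAINT_CPUSET]        = "CONSTRAINT_CPUSET",
        [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
        [CONSTRAINT_MEMCG]         = "CONSTRAINT_MEMCG",
};

int main(void)
{
        enum oom_constraint c = CONSTRAINT_MEMCG;

        /* Roughly the shape of the one-line summary; values are made up. */
        printf("oom-kill:constraint=%s,task=%s,pid=%d,uid=%d\n",
               oom_constraint_text[c], "example", 1234, 1000);
        return 0;
}
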
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f690bae6b78..7d1010453fb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2154,6 +2154,7 @@ int write_cache_pages(struct address_space *mapping,
2154{ 2154{
2155 int ret = 0; 2155 int ret = 0;
2156 int done = 0; 2156 int done = 0;
2157 int error;
2157 struct pagevec pvec; 2158 struct pagevec pvec;
2158 int nr_pages; 2159 int nr_pages;
2159 pgoff_t uninitialized_var(writeback_index); 2160 pgoff_t uninitialized_var(writeback_index);
@@ -2227,25 +2228,31 @@ continue_unlock:
2227 goto continue_unlock; 2228 goto continue_unlock;
2228 2229
2229 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); 2230 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
2230 ret = (*writepage)(page, wbc, data); 2231 error = (*writepage)(page, wbc, data);
2231 if (unlikely(ret)) { 2232 if (unlikely(error)) {
2232 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2233 /*
2234 * Handle errors according to the type of
2235 * writeback. There's no need to continue for
2236 * background writeback. Just push done_index
2237 * past this page so media errors won't choke
2238 * writeout for the entire file. For integrity
2239 * writeback, we must process the entire dirty
2240 * set regardless of errors because the fs may
2241 * still have state to clear for each page. In
2242 * that case we continue processing and return
2243 * the first error.
2244 */
2245 if (error == AOP_WRITEPAGE_ACTIVATE) {
2233 unlock_page(page); 2246 unlock_page(page);
2234 ret = 0; 2247 error = 0;
2235 } else { 2248 } else if (wbc->sync_mode != WB_SYNC_ALL) {
2236 /* 2249 ret = error;
2237 * done_index is set past this page,
2238 * so media errors will not choke
2239 * background writeout for the entire
2240 * file. This has consequences for
2241 * range_cyclic semantics (ie. it may
2242 * not be suitable for data integrity
2243 * writeout).
2244 */
2245 done_index = page->index + 1; 2250 done_index = page->index + 1;
2246 done = 1; 2251 done = 1;
2247 break; 2252 break;
2248 } 2253 }
2254 if (!ret)
2255 ret = error;
2249 } 2256 }
2250 2257
2251 /* 2258 /*
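
The write_cache_pages() change above separates the per-page error from the function's return value: background writeback stops at the first error, while integrity writeback keeps processing the dirty set and only reports the first error at the end. Below is a userspace model of that policy, with ->writepage mocked to fail on one page; the AOP_WRITEPAGE_ACTIVATE special case is left out.

#include <stdio.h>

#define WB_SYNC_NONE 0  /* background writeback */
#define WB_SYNC_ALL  1  /* integrity writeback  */

/* Mock ->writepage that fails on page index 1. */
static int writepage_mock(int index)
{
        return index == 1 ? -5 : 0;
}

static int write_pages_model(int sync_mode, int nr_pages, int *processed)
{
        int ret = 0;

        *processed = 0;
        for (int i = 0; i < nr_pages; i++) {
                int error = writepage_mock(i);

                if (error) {
                        if (sync_mode != WB_SYNC_ALL) {
                                /* background: remember the error and stop */
                                ret = error;
                                break;
                        }
                        /* integrity: keep going, report the first error */
                        if (!ret)
                                ret = error;
                }
                (*processed)++;
        }
        return ret;
}

int main(void)
{
        int n, ret;

        ret = write_pages_model(WB_SYNC_NONE, 4, &n);
        printf("background: ret=%d, pages processed=%d\n", ret, n);

        ret = write_pages_model(WB_SYNC_ALL, 4, &n);
        printf("integrity:  ret=%d, pages processed=%d\n", ret, n);
        return 0;
}
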
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
16 16
17#include <linux/stddef.h> 17#include <linux/stddef.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h>
19#include <linux/swap.h> 20#include <linux/swap.h>
20#include <linux/interrupt.h> 21#include <linux/interrupt.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
96#endif 97#endif
97 98
98/* work_structs for global per-cpu drains */ 99/* work_structs for global per-cpu drains */
100struct pcpu_drain {
101 struct zone *zone;
102 struct work_struct work;
103};
99DEFINE_MUTEX(pcpu_drain_mutex); 104DEFINE_MUTEX(pcpu_drain_mutex);
100DEFINE_PER_CPU(struct work_struct, pcpu_drain); 105DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
101 106
102#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 107#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
103volatile unsigned long latent_entropy __latent_entropy; 108volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
121}; 126};
122EXPORT_SYMBOL(node_states); 127EXPORT_SYMBOL(node_states);
123 128
124/* Protect totalram_pages and zone->managed_pages */ 129atomic_long_t _totalram_pages __read_mostly;
125static DEFINE_SPINLOCK(managed_page_count_lock); 130EXPORT_SYMBOL(_totalram_pages);
126
127unsigned long totalram_pages __read_mostly;
128unsigned long totalreserve_pages __read_mostly; 131unsigned long totalreserve_pages __read_mostly;
129unsigned long totalcma_pages __read_mostly; 132unsigned long totalcma_pages __read_mostly;
130 133
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
237#endif 240#endif
238}; 241};
239 242
240char * const migratetype_names[MIGRATE_TYPES] = { 243const char * const migratetype_names[MIGRATE_TYPES] = {
241 "Unmovable", 244 "Unmovable",
242 "Movable", 245 "Movable",
243 "Reclaimable", 246 "Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
263 266
264int min_free_kbytes = 1024; 267int min_free_kbytes = 1024;
265int user_min_free_kbytes = -1; 268int user_min_free_kbytes = -1;
269int watermark_boost_factor __read_mostly = 15000;
266int watermark_scale_factor = 10; 270int watermark_scale_factor = 10;
267 271
268static unsigned long nr_kernel_pages __meminitdata; 272static unsigned long nr_kernel_pages __initdata;
269static unsigned long nr_all_pages __meminitdata; 273static unsigned long nr_all_pages __initdata;
270static unsigned long dma_reserve __meminitdata; 274static unsigned long dma_reserve __initdata;
271 275
272#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 276#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
273static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; 277static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
274static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; 278static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
275static unsigned long required_kernelcore __initdata; 279static unsigned long required_kernelcore __initdata;
276static unsigned long required_kernelcore_percent __initdata; 280static unsigned long required_kernelcore_percent __initdata;
277static unsigned long required_movablecore __initdata; 281static unsigned long required_movablecore __initdata;
278static unsigned long required_movablecore_percent __initdata; 282static unsigned long required_movablecore_percent __initdata;
279static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; 283static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
280static bool mirrored_kernelcore __meminitdata; 284static bool mirrored_kernelcore __meminitdata;
281 285
282/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 286/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
294int page_group_by_mobility_disabled __read_mostly; 298int page_group_by_mobility_disabled __read_mostly;
295 299
296#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 300#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
301/*
302 * During boot we initialize deferred pages on-demand, as needed, but once
303 * page_alloc_init_late() has finished, the deferred pages are all initialized,
304 * and we can permanently disable that path.
305 */
306static DEFINE_STATIC_KEY_TRUE(deferred_pages);
307
308/*
309 * Call kasan_free_pages() only after deferred memory initialization
310 * has completed. Poisoning pages during deferred memory init will greatly
311 * lengthen the process and cause problems on large memory systems, as
312 * deferred page initialization is done with interrupts disabled.
313 *
314 * Assuming that there will be no reference to those newly initialized
315 * pages before they are ever allocated, this should have no effect on
316 * KASAN memory tracking as the poison will be properly inserted at page
317 * allocation time. The only corner case is when pages are allocated by
318 * on-demand allocation and then freed again before the deferred pages
319 * initialization is done, but this is not likely to happen.
320 */
321static inline void kasan_free_nondeferred_pages(struct page *page, int order)
322{
323 if (!static_branch_unlikely(&deferred_pages))
324 kasan_free_pages(page, order);
325}
326
297/* Returns true if the struct page for the pfn is uninitialised */ 327/* Returns true if the struct page for the pfn is uninitialised */
298static inline bool __meminit early_page_uninitialised(unsigned long pfn) 328static inline bool __meminit early_page_uninitialised(unsigned long pfn)
299{ 329{
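
The hunk above hoists the deferred_pages static key ahead of free_pages_prepare() and adds kasan_free_nondeferred_pages(), so KASAN poisoning of freed pages is skipped until deferred struct page initialization has finished. A rough userspace analogue of that gating, using an ordinary boolean where the kernel uses a self-patching static key; struct page, kasan_poison() and the pfn field are invented stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct page { unsigned long pfn; };

/* In the kernel this is a static key that defaults to true and is
 * permanently switched off once page_alloc_init_late() finishes. */
static bool deferred_pages = true;

static void kasan_poison(struct page *page, int order)
{
        printf("poisoning pfn %lu, order %d\n", page->pfn, order);
}

/* Skip the expensive poisoning while deferred init is still running. */
static void kasan_free_nondeferred_pages(struct page *page, int order)
{
        if (!deferred_pages)
                kasan_poison(page, order);
}

int main(void)
{
        struct page p = { .pfn = 42 };

        kasan_free_nondeferred_pages(&p, 0);    /* early boot: skipped      */
        deferred_pages = false;                 /* deferred init completed  */
        kasan_free_nondeferred_pages(&p, 0);    /* now poisons as usual     */
        return 0;
}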
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
326 /* Always populate low zones for address-constrained allocations */ 356 /* Always populate low zones for address-constrained allocations */
327 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) 357 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
328 return false; 358 return false;
359
360 /*
361 * We start with only one section of pages; more pages are added as
362 * needed until the rest of the deferred pages are initialized.
363 */
329 nr_initialised++; 364 nr_initialised++;
330 if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && 365 if ((nr_initialised > PAGES_PER_SECTION) &&
331 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 366 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
332 NODE_DATA(nid)->first_deferred_pfn = pfn; 367 NODE_DATA(nid)->first_deferred_pfn = pfn;
333 return true; 368 return true;
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
335 return false; 370 return false;
336} 371}
337#else 372#else
373#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
374
338static inline bool early_page_uninitialised(unsigned long pfn) 375static inline bool early_page_uninitialised(unsigned long pfn)
339{ 376{
340 return false; 377 return false;
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
426 unsigned long old_word, word; 463 unsigned long old_word, word;
427 464
428 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 465 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
466 BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
429 467
430 bitmap = get_pageblock_bitmap(page, pfn); 468 bitmap = get_pageblock_bitmap(page, pfn);
431 bitidx = pfn_to_bitidx(page, pfn); 469 bitidx = pfn_to_bitidx(page, pfn);
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
1037 arch_free_page(page, order); 1075 arch_free_page(page, order);
1038 kernel_poison_pages(page, 1 << order, 0); 1076 kernel_poison_pages(page, 1 << order, 0);
1039 kernel_map_pages(page, 1 << order, 0); 1077 kernel_map_pages(page, 1 << order, 0);
1040 kasan_free_pages(page, order); 1078 kasan_free_nondeferred_pages(page, order);
1041 1079
1042 return true; 1080 return true;
1043} 1081}
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1183 init_page_count(page); 1221 init_page_count(page);
1184 page_mapcount_reset(page); 1222 page_mapcount_reset(page);
1185 page_cpupid_reset_last(page); 1223 page_cpupid_reset_last(page);
1224 page_kasan_tag_reset(page);
1186 1225
1187 INIT_LIST_HEAD(&page->lru); 1226 INIT_LIST_HEAD(&page->lru);
1188#ifdef WANT_PAGE_VIRTUAL 1227#ifdef WANT_PAGE_VIRTUAL
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1279 __ClearPageReserved(p); 1318 __ClearPageReserved(p);
1280 set_page_count(p, 0); 1319 set_page_count(p, 0);
1281 1320
1282 page_zone(page)->managed_pages += nr_pages; 1321 atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1283 set_page_refcounted(page); 1322 set_page_refcounted(page);
1284 __free_pages(page, order); 1323 __free_pages(page, order);
1285} 1324}
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data)
1606} 1645}
1607 1646
1608/* 1647/*
1609 * During boot we initialize deferred pages on-demand, as needed, but once
1610 * page_alloc_init_late() has finished, the deferred pages are all initialized,
1611 * and we can permanently disable that path.
1612 */
1613static DEFINE_STATIC_KEY_TRUE(deferred_pages);
1614
1615/*
1616 * If this zone has deferred pages, try to grow it by initializing enough 1648 * If this zone has deferred pages, try to grow it by initializing enough
1617 * deferred pages to satisfy the allocation specified by order, rounded up to 1649 * deferred pages to satisfy the allocation specified by order, rounded up to
1618 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 1650 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1981 */ 2013 */
1982static int fallbacks[MIGRATE_TYPES][4] = { 2014static int fallbacks[MIGRATE_TYPES][4] = {
1983 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2015 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1984 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1985 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 2016 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2017 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1986#ifdef CONFIG_CMA 2018#ifdef CONFIG_CMA
1987 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 2019 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
1988#endif 2020#endif
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
2129 return false; 2161 return false;
2130} 2162}
2131 2163
2164static inline void boost_watermark(struct zone *zone)
2165{
2166 unsigned long max_boost;
2167
2168 if (!watermark_boost_factor)
2169 return;
2170
2171 max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2172 watermark_boost_factor, 10000);
2173 max_boost = max(pageblock_nr_pages, max_boost);
2174
2175 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2176 max_boost);
2177}
2178
2132/* 2179/*
2133 * This function implements actual steal behaviour. If order is large enough, 2180 * This function implements actual steal behaviour. If order is large enough,
2134 * we can steal whole pageblock. If not, we first move freepages in this 2181 * we can steal whole pageblock. If not, we first move freepages in this
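
boost_watermark() in the hunk above bumps zone->watermark_boost by one pageblock per fragmenting fallback, capped at watermark_boost_factor (default 15000, i.e. 150%) of the high watermark and never less than one pageblock. The arithmetic in isolation, as a standalone sketch with made-up zone numbers and a re-derived mult_frac():

#include <stdio.h>

/* Multiply-then-scale without intermediate overflow, in the spirit of
 * the kernel's mult_frac(). */
#define mult_frac(x, num, den) (((x) / (den)) * (num) + ((x) % (den)) * (num) / (den))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long wmark_high = 65536;               /* pages, illustrative     */
        unsigned long pageblock_nr_pages = 512;         /* 2MB blocks on x86-64    */
        unsigned long watermark_boost_factor = 15000;   /* default from the hunk   */
        unsigned long boost = 0;

        unsigned long max_boost = mult_frac(wmark_high, watermark_boost_factor, 10000);
        max_boost = max(pageblock_nr_pages, max_boost);

        /* Each fragmenting fallback bumps the boost by one pageblock. */
        for (int event = 0; event < 250; event++)
                boost = min(boost + pageblock_nr_pages, max_boost);

        printf("max_boost = %lu pages, boost after 250 events = %lu\n",
               max_boost, boost);      /* caps at 1.5 * high watermark = 98304 */
        return 0;
}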
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
2138 * itself, so pages freed in the future will be put on the correct free list. 2185 * itself, so pages freed in the future will be put on the correct free list.
2139 */ 2186 */
2140static void steal_suitable_fallback(struct zone *zone, struct page *page, 2187static void steal_suitable_fallback(struct zone *zone, struct page *page,
2141 int start_type, bool whole_block) 2188 unsigned int alloc_flags, int start_type, bool whole_block)
2142{ 2189{
2143 unsigned int current_order = page_order(page); 2190 unsigned int current_order = page_order(page);
2144 struct free_area *area; 2191 struct free_area *area;
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
2160 goto single_page; 2207 goto single_page;
2161 } 2208 }
2162 2209
2210 /*
2211 * Boost watermarks to increase reclaim pressure to reduce the
2212 * likelihood of future fallbacks. Wake kswapd now as the node
2213 * may be balanced overall and kswapd will not wake naturally.
2214 */
2215 boost_watermark(zone);
2216 if (alloc_flags & ALLOC_KSWAPD)
2217 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
2218
2163 /* We are not allowed to try stealing from the whole block */ 2219 /* We are not allowed to try stealing from the whole block */
2164 if (!whole_block) 2220 if (!whole_block)
2165 goto single_page; 2221 goto single_page;
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2258 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2314 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2259 * Check is race-prone but harmless. 2315 * Check is race-prone but harmless.
2260 */ 2316 */
2261 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; 2317 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2262 if (zone->nr_reserved_highatomic >= max_managed) 2318 if (zone->nr_reserved_highatomic >= max_managed)
2263 return; 2319 return;
2264 2320
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2375 * condition simpler. 2431 * condition simpler.
2376 */ 2432 */
2377static __always_inline bool 2433static __always_inline bool
2378__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2434__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2435 unsigned int alloc_flags)
2379{ 2436{
2380 struct free_area *area; 2437 struct free_area *area;
2381 int current_order; 2438 int current_order;
2439 int min_order = order;
2382 struct page *page; 2440 struct page *page;
2383 int fallback_mt; 2441 int fallback_mt;
2384 bool can_steal; 2442 bool can_steal;
2385 2443
2386 /* 2444 /*
2445 * Do not steal pages from freelists belonging to other pageblocks
2446 * i.e. orders < pageblock_order. If there are no local zones free,
2447 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2448 */
2449 if (alloc_flags & ALLOC_NOFRAGMENT)
2450 min_order = pageblock_order;
2451
2452 /*
2387 * Find the largest available free page in the other list. This roughly 2453 * Find the largest available free page in the other list. This roughly
2388 * approximates finding the pageblock with the most free pages, which 2454 * approximates finding the pageblock with the most free pages, which
2389 * would be too costly to do exactly. 2455 * would be too costly to do exactly.
2390 */ 2456 */
2391 for (current_order = MAX_ORDER - 1; current_order >= order; 2457 for (current_order = MAX_ORDER - 1; current_order >= min_order;
2392 --current_order) { 2458 --current_order) {
2393 area = &(zone->free_area[current_order]); 2459 area = &(zone->free_area[current_order]);
2394 fallback_mt = find_suitable_fallback(area, current_order, 2460 fallback_mt = find_suitable_fallback(area, current_order,
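
With ALLOC_NOFRAGMENT set, the fallback scan above clamps min_order to pageblock_order, so nothing smaller than a whole pageblock is stolen from another migratetype; the caller retries without the flag if that fails. A toy version of the clamped search; the orders reported by area_has_fallback() and the flag value are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER        11
#define PAGEBLOCK_ORDER  9
#define ALLOC_NOFRAGMENT 0x1    /* illustrative flag value */

/* Pretend free_area[order] has a stealable block when this returns true. */
static bool area_has_fallback(int order)
{
        return order == 3 || order == 5;   /* only small blocks available */
}

static int find_fallback_order(int order, unsigned int alloc_flags)
{
        int min_order = order;

        /* Don't steal from freelists below pageblock_order when avoiding
         * fragmentation; the caller retries without the flag on failure. */
        if (alloc_flags & ALLOC_NOFRAGMENT)
                min_order = PAGEBLOCK_ORDER;

        for (int current_order = MAX_ORDER - 1; current_order >= min_order; current_order--)
                if (area_has_fallback(current_order))
                        return current_order;
        return -1;
}

int main(void)
{
        printf("order-2, nofragment: %d\n", find_fallback_order(2, ALLOC_NOFRAGMENT)); /* -1 */
        printf("order-2, fallback:   %d\n", find_fallback_order(2, 0));                /*  5 */
        return 0;
}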
@@ -2433,7 +2499,8 @@ do_steal:
2433 page = list_first_entry(&area->free_list[fallback_mt], 2499 page = list_first_entry(&area->free_list[fallback_mt],
2434 struct page, lru); 2500 struct page, lru);
2435 2501
2436 steal_suitable_fallback(zone, page, start_migratetype, can_steal); 2502 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2503 can_steal);
2437 2504
2438 trace_mm_page_alloc_extfrag(page, order, current_order, 2505 trace_mm_page_alloc_extfrag(page, order, current_order,
2439 start_migratetype, fallback_mt); 2506 start_migratetype, fallback_mt);
@@ -2447,7 +2514,8 @@ do_steal:
2447 * Call me with the zone->lock already held. 2514 * Call me with the zone->lock already held.
2448 */ 2515 */
2449static __always_inline struct page * 2516static __always_inline struct page *
2450__rmqueue(struct zone *zone, unsigned int order, int migratetype) 2517__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2518 unsigned int alloc_flags)
2451{ 2519{
2452 struct page *page; 2520 struct page *page;
2453 2521
@@ -2457,7 +2525,8 @@ retry:
2457 if (migratetype == MIGRATE_MOVABLE) 2525 if (migratetype == MIGRATE_MOVABLE)
2458 page = __rmqueue_cma_fallback(zone, order); 2526 page = __rmqueue_cma_fallback(zone, order);
2459 2527
2460 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2528 if (!page && __rmqueue_fallback(zone, order, migratetype,
2529 alloc_flags))
2461 goto retry; 2530 goto retry;
2462 } 2531 }
2463 2532
@@ -2472,13 +2541,14 @@ retry:
2472 */ 2541 */
2473static int rmqueue_bulk(struct zone *zone, unsigned int order, 2542static int rmqueue_bulk(struct zone *zone, unsigned int order,
2474 unsigned long count, struct list_head *list, 2543 unsigned long count, struct list_head *list,
2475 int migratetype) 2544 int migratetype, unsigned int alloc_flags)
2476{ 2545{
2477 int i, alloced = 0; 2546 int i, alloced = 0;
2478 2547
2479 spin_lock(&zone->lock); 2548 spin_lock(&zone->lock);
2480 for (i = 0; i < count; ++i) { 2549 for (i = 0; i < count; ++i) {
2481 struct page *page = __rmqueue(zone, order, migratetype); 2550 struct page *page = __rmqueue(zone, order, migratetype,
2551 alloc_flags);
2482 if (unlikely(page == NULL)) 2552 if (unlikely(page == NULL))
2483 break; 2553 break;
2484 2554
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone)
2592 2662
2593static void drain_local_pages_wq(struct work_struct *work) 2663static void drain_local_pages_wq(struct work_struct *work)
2594{ 2664{
2665 struct pcpu_drain *drain;
2666
2667 drain = container_of(work, struct pcpu_drain, work);
2668
2595 /* 2669 /*
2596 * drain_all_pages doesn't use proper cpu hotplug protection so 2670 * drain_all_pages doesn't use proper cpu hotplug protection so
2597 * we can race with cpu offline when the WQ can move this from 2671 * we can race with cpu offline when the WQ can move this from
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work)
2600 * a different one. 2674 * a different one.
2601 */ 2675 */
2602 preempt_disable(); 2676 preempt_disable();
2603 drain_local_pages(NULL); 2677 drain_local_pages(drain->zone);
2604 preempt_enable(); 2678 preempt_enable();
2605} 2679}
2606 2680
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone)
2671 } 2745 }
2672 2746
2673 for_each_cpu(cpu, &cpus_with_pcps) { 2747 for_each_cpu(cpu, &cpus_with_pcps) {
2674 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2748 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
2675 INIT_WORK(work, drain_local_pages_wq); 2749
2676 queue_work_on(cpu, mm_percpu_wq, work); 2750 drain->zone = zone;
2751 INIT_WORK(&drain->work, drain_local_pages_wq);
2752 queue_work_on(cpu, mm_percpu_wq, &drain->work);
2677 } 2753 }
2678 for_each_cpu(cpu, &cpus_with_pcps) 2754 for_each_cpu(cpu, &cpus_with_pcps)
2679 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2755 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
2680 2756
2681 mutex_unlock(&pcpu_drain_mutex); 2757 mutex_unlock(&pcpu_drain_mutex);
2682} 2758}
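
The drain rework above embeds the work_struct in a per-CPU struct pcpu_drain so that drain_local_pages_wq() can recover the requested zone via container_of() instead of always draining every zone. The same embed-and-recover idiom in plain C; the zone string, the hand-rolled work_struct and the container_of macro are simplified stand-ins for the kernel versions:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
        void (*func)(struct work_struct *work);
};

struct pcpu_drain {
        const char *zone;               /* stands in for struct zone *  */
        struct work_struct work;        /* embedded, as in the hunk     */
};

static void drain_local_pages_wq(struct work_struct *work)
{
        /* Recover the enclosing pcpu_drain from the embedded member. */
        struct pcpu_drain *drain = container_of(work, struct pcpu_drain, work);

        printf("draining zone %s\n", drain->zone);
}

int main(void)
{
        struct pcpu_drain drain = { .zone = "Normal" };

        drain.work.func = drain_local_pages_wq;
        drain.work.func(&drain.work);   /* what the queued work would end up calling */
        return 0;
}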
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2934 3010
2935/* Remove page from the per-cpu list, caller must protect the list */ 3011/* Remove page from the per-cpu list, caller must protect the list */
2936static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 3012static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3013 unsigned int alloc_flags,
2937 struct per_cpu_pages *pcp, 3014 struct per_cpu_pages *pcp,
2938 struct list_head *list) 3015 struct list_head *list)
2939{ 3016{
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2943 if (list_empty(list)) { 3020 if (list_empty(list)) {
2944 pcp->count += rmqueue_bulk(zone, 0, 3021 pcp->count += rmqueue_bulk(zone, 0,
2945 pcp->batch, list, 3022 pcp->batch, list,
2946 migratetype); 3023 migratetype, alloc_flags);
2947 if (unlikely(list_empty(list))) 3024 if (unlikely(list_empty(list)))
2948 return NULL; 3025 return NULL;
2949 } 3026 }
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2959/* Lock and remove page from the per-cpu list */ 3036/* Lock and remove page from the per-cpu list */
2960static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3037static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2961 struct zone *zone, unsigned int order, 3038 struct zone *zone, unsigned int order,
2962 gfp_t gfp_flags, int migratetype) 3039 gfp_t gfp_flags, int migratetype,
3040 unsigned int alloc_flags)
2963{ 3041{
2964 struct per_cpu_pages *pcp; 3042 struct per_cpu_pages *pcp;
2965 struct list_head *list; 3043 struct list_head *list;
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2969 local_irq_save(flags); 3047 local_irq_save(flags);
2970 pcp = &this_cpu_ptr(zone->pageset)->pcp; 3048 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2971 list = &pcp->lists[migratetype]; 3049 list = &pcp->lists[migratetype];
2972 page = __rmqueue_pcplist(zone, migratetype, pcp, list); 3050 page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
2973 if (page) { 3051 if (page) {
2974 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3052 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2975 zone_statistics(preferred_zone, zone); 3053 zone_statistics(preferred_zone, zone);
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone,
2992 3070
2993 if (likely(order == 0)) { 3071 if (likely(order == 0)) {
2994 page = rmqueue_pcplist(preferred_zone, zone, order, 3072 page = rmqueue_pcplist(preferred_zone, zone, order,
2995 gfp_flags, migratetype); 3073 gfp_flags, migratetype, alloc_flags);
2996 goto out; 3074 goto out;
2997 } 3075 }
2998 3076
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone,
3011 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3089 trace_mm_page_alloc_zone_locked(page, order, migratetype);
3012 } 3090 }
3013 if (!page) 3091 if (!page)
3014 page = __rmqueue(zone, order, migratetype); 3092 page = __rmqueue(zone, order, migratetype, alloc_flags);
3015 } while (page && check_new_pages(page, order)); 3093 } while (page && check_new_pages(page, order));
3016 spin_unlock(&zone->lock); 3094 spin_unlock(&zone->lock);
3017 if (!page) 3095 if (!page)
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str)
3053} 3131}
3054__setup("fail_page_alloc=", setup_fail_page_alloc); 3132__setup("fail_page_alloc=", setup_fail_page_alloc);
3055 3133
3056static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3134static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3057{ 3135{
3058 if (order < fail_page_alloc.min_order) 3136 if (order < fail_page_alloc.min_order)
3059 return false; 3137 return false;
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs);
3103 3181
3104#else /* CONFIG_FAIL_PAGE_ALLOC */ 3182#else /* CONFIG_FAIL_PAGE_ALLOC */
3105 3183
3106static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3184static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3107{ 3185{
3108 return false; 3186 return false;
3109} 3187}
3110 3188
3111#endif /* CONFIG_FAIL_PAGE_ALLOC */ 3189#endif /* CONFIG_FAIL_PAGE_ALLOC */
3112 3190
3191static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3192{
3193 return __should_fail_alloc_page(gfp_mask, order);
3194}
3195ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3196
3113/* 3197/*
3114 * Return true if free base pages are above 'mark'. For high-order checks it 3198 * Return true if free base pages are above 'mark'. For high-order checks it
3115 * will return true if the order-0 watermark is reached and there is at least 3199
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3254#endif /* CONFIG_NUMA */ 3338#endif /* CONFIG_NUMA */
3255 3339
3256/* 3340/*
3341 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3342 * fragmentation is subtle. If the preferred zone was HIGHMEM then
3343 * premature use of a lower zone may cause lowmem pressure problems that
3344 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3345 * probably too small. It only makes sense to spread allocations to avoid
3346 * fragmentation between the Normal and DMA32 zones.
3347 */
3348static inline unsigned int
3349alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3350{
3351 unsigned int alloc_flags = 0;
3352
3353 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3354 alloc_flags |= ALLOC_KSWAPD;
3355
3356#ifdef CONFIG_ZONE_DMA32
3357 if (zone_idx(zone) != ZONE_NORMAL)
3358 goto out;
3359
3360 /*
3361 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3362 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3363 * on UMA that if Normal is populated then so is DMA32.
3364 */
3365 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3366 if (nr_online_nodes > 1 && !populated_zone(--zone))
3367 goto out;
3368
3369out:
3370#endif /* CONFIG_ZONE_DMA32 */
3371 return alloc_flags;
3372}
3373
3374/*
3257 * get_page_from_freelist goes through the zonelist trying to allocate 3375 * get_page_from_freelist goes through the zonelist trying to allocate
3258 * a page. 3376 * a page.
3259 */ 3377 */
@@ -3261,14 +3379,18 @@ static struct page *
3261get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3379get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3262 const struct alloc_context *ac) 3380 const struct alloc_context *ac)
3263{ 3381{
3264 struct zoneref *z = ac->preferred_zoneref; 3382 struct zoneref *z;
3265 struct zone *zone; 3383 struct zone *zone;
3266 struct pglist_data *last_pgdat_dirty_limit = NULL; 3384 struct pglist_data *last_pgdat_dirty_limit = NULL;
3385 bool no_fallback;
3267 3386
3387retry:
3268 /* 3388 /*
3269 * Scan zonelist, looking for a zone with enough free. 3389 * Scan zonelist, looking for a zone with enough free.
3270 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3390 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3271 */ 3391 */
3392 no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3393 z = ac->preferred_zoneref;
3272 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3394 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3273 ac->nodemask) { 3395 ac->nodemask) {
3274 struct page *page; 3396 struct page *page;
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3307 } 3429 }
3308 } 3430 }
3309 3431
3310 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3432 if (no_fallback && nr_online_nodes > 1 &&
3433 zone != ac->preferred_zoneref->zone) {
3434 int local_nid;
3435
3436 /*
3437 * If moving to a remote node, retry but allow
3438 * fragmenting fallbacks. Locality is more important
3439 * than fragmentation avoidance.
3440 */
3441 local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3442 if (zone_to_nid(zone) != local_nid) {
3443 alloc_flags &= ~ALLOC_NOFRAGMENT;
3444 goto retry;
3445 }
3446 }
3447
3448 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3311 if (!zone_watermark_fast(zone, order, mark, 3449 if (!zone_watermark_fast(zone, order, mark,
3312 ac_classzone_idx(ac), alloc_flags)) { 3450 ac_classzone_idx(ac), alloc_flags)) {
3313 int ret; 3451 int ret;
@@ -3374,6 +3512,15 @@ try_this_zone:
3374 } 3512 }
3375 } 3513 }
3376 3514
3515 /*
3516 * It's possible on a UMA machine to get through all zones and find them
3517 * all fragmented. If avoiding fragmentation, reset and try again.
3518 */
3519 if (no_fallback) {
3520 alloc_flags &= ~ALLOC_NOFRAGMENT;
3521 goto retry;
3522 }
3523
3377 return NULL; 3524 return NULL;
3378} 3525}
3379 3526
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3413 va_start(args, fmt); 3560 va_start(args, fmt);
3414 vaf.fmt = fmt; 3561 vaf.fmt = fmt;
3415 vaf.va = &args; 3562 vaf.va = &args;
3416 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", 3563 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3417 current->comm, &vaf, gfp_mask, &gfp_mask, 3564 current->comm, &vaf, gfp_mask, &gfp_mask,
3418 nodemask_pr_args(nodemask)); 3565 nodemask_pr_args(nodemask));
3419 va_end(args); 3566 va_end(args);
3420 3567
3421 cpuset_print_current_mems_allowed(); 3568 cpuset_print_current_mems_allowed();
3422 3569 pr_cont("\n");
3423 dump_stack(); 3570 dump_stack();
3424 warn_alloc_show_mem(gfp_mask, nodemask); 3571 warn_alloc_show_mem(gfp_mask, nodemask);
3425} 3572}
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3861 } else if (unlikely(rt_task(current)) && !in_interrupt()) 4008 } else if (unlikely(rt_task(current)) && !in_interrupt())
3862 alloc_flags |= ALLOC_HARDER; 4009 alloc_flags |= ALLOC_HARDER;
3863 4010
4011 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4012 alloc_flags |= ALLOC_KSWAPD;
4013
3864#ifdef CONFIG_CMA 4014#ifdef CONFIG_CMA
3865 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 4015 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3866 alloc_flags |= ALLOC_CMA; 4016 alloc_flags |= ALLOC_CMA;
@@ -4092,7 +4242,7 @@ retry_cpuset:
4092 if (!ac->preferred_zoneref->zone) 4242 if (!ac->preferred_zoneref->zone)
4093 goto nopage; 4243 goto nopage;
4094 4244
4095 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 4245 if (alloc_flags & ALLOC_KSWAPD)
4096 wake_all_kswapds(order, gfp_mask, ac); 4246 wake_all_kswapds(order, gfp_mask, ac);
4097 4247
4098 /* 4248 /*
@@ -4150,7 +4300,7 @@ retry_cpuset:
4150 4300
4151retry: 4301retry:
4152 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4302 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4153 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 4303 if (alloc_flags & ALLOC_KSWAPD)
4154 wake_all_kswapds(order, gfp_mask, ac); 4304 wake_all_kswapds(order, gfp_mask, ac);
4155 4305
4156 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4306 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4369 4519
4370 finalise_ac(gfp_mask, &ac); 4520 finalise_ac(gfp_mask, &ac);
4371 4521
4522 /*
4523 * Forbid the first pass from falling back to types that fragment
4524 * memory until all local zones are considered.
4525 */
4526 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
4527
4372 /* First allocation attempt */ 4528 /* First allocation attempt */
4373 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4529 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4374 if (likely(page)) 4530 if (likely(page))
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
4427} 4583}
4428EXPORT_SYMBOL(get_zeroed_page); 4584EXPORT_SYMBOL(get_zeroed_page);
4429 4585
4430void __free_pages(struct page *page, unsigned int order) 4586static inline void free_the_page(struct page *page, unsigned int order)
4431{ 4587{
4432 if (put_page_testzero(page)) { 4588 if (order == 0) /* Via pcp? */
4433 if (order == 0) 4589 free_unref_page(page);
4434 free_unref_page(page); 4590 else
4435 else 4591 __free_pages_ok(page, order);
4436 __free_pages_ok(page, order);
4437 }
4438} 4592}
4439 4593
4594void __free_pages(struct page *page, unsigned int order)
4595{
4596 if (put_page_testzero(page))
4597 free_the_page(page, order);
4598}
4440EXPORT_SYMBOL(__free_pages); 4599EXPORT_SYMBOL(__free_pages);
4441 4600
4442void free_pages(unsigned long addr, unsigned int order) 4601void free_pages(unsigned long addr, unsigned int order)
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4485{ 4644{
4486 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4645 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4487 4646
4488 if (page_ref_sub_and_test(page, count)) { 4647 if (page_ref_sub_and_test(page, count))
4489 unsigned int order = compound_order(page); 4648 free_the_page(page, compound_order(page));
4490
4491 if (order == 0)
4492 free_unref_page(page);
4493 else
4494 __free_pages_ok(page, order);
4495 }
4496} 4649}
4497EXPORT_SYMBOL(__page_frag_cache_drain); 4650EXPORT_SYMBOL(__page_frag_cache_drain);
4498 4651
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr)
4558 struct page *page = virt_to_head_page(addr); 4711 struct page *page = virt_to_head_page(addr);
4559 4712
4560 if (unlikely(put_page_testzero(page))) 4713 if (unlikely(put_page_testzero(page)))
4561 __free_pages_ok(page, compound_order(page)); 4714 free_the_page(page, compound_order(page));
4562} 4715}
4563EXPORT_SYMBOL(page_frag_free); 4716EXPORT_SYMBOL(page_frag_free);
4564 4717
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset)
4660 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4813 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4661 4814
4662 for_each_zone_zonelist(zone, z, zonelist, offset) { 4815 for_each_zone_zonelist(zone, z, zonelist, offset) {
4663 unsigned long size = zone->managed_pages; 4816 unsigned long size = zone_managed_pages(zone);
4664 unsigned long high = high_wmark_pages(zone); 4817 unsigned long high = high_wmark_pages(zone);
4665 if (size > high) 4818 if (size > high)
4666 sum += size - high; 4819 sum += size - high;
@@ -4712,7 +4865,7 @@ long si_mem_available(void)
4712 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 4865 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4713 4866
4714 for_each_zone(zone) 4867 for_each_zone(zone)
4715 wmark_low += zone->watermark[WMARK_LOW]; 4868 wmark_low += low_wmark_pages(zone);
4716 4869
4717 /* 4870 /*
4718 * Estimate the amount of memory available for userspace allocations, 4871 * Estimate the amount of memory available for userspace allocations,
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
4746 4899
4747void si_meminfo(struct sysinfo *val) 4900void si_meminfo(struct sysinfo *val)
4748{ 4901{
4749 val->totalram = totalram_pages; 4902 val->totalram = totalram_pages();
4750 val->sharedram = global_node_page_state(NR_SHMEM); 4903 val->sharedram = global_node_page_state(NR_SHMEM);
4751 val->freeram = global_zone_page_state(NR_FREE_PAGES); 4904 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4752 val->bufferram = nr_blockdev_pages(); 4905 val->bufferram = nr_blockdev_pages();
4753 val->totalhigh = totalhigh_pages; 4906 val->totalhigh = totalhigh_pages();
4754 val->freehigh = nr_free_highpages(); 4907 val->freehigh = nr_free_highpages();
4755 val->mem_unit = PAGE_SIZE; 4908 val->mem_unit = PAGE_SIZE;
4756} 4909}
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4767 pg_data_t *pgdat = NODE_DATA(nid); 4920 pg_data_t *pgdat = NODE_DATA(nid);
4768 4921
4769 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4922 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4770 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4923 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
4771 val->totalram = managed_pages; 4924 val->totalram = managed_pages;
4772 val->sharedram = node_page_state(pgdat, NR_SHMEM); 4925 val->sharedram = node_page_state(pgdat, NR_SHMEM);
4773 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 4926 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4776 struct zone *zone = &pgdat->node_zones[zone_type]; 4929 struct zone *zone = &pgdat->node_zones[zone_type];
4777 4930
4778 if (is_highmem(zone)) { 4931 if (is_highmem(zone)) {
4779 managed_highpages += zone->managed_pages; 4932 managed_highpages += zone_managed_pages(zone);
4780 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 4933 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4781 } 4934 }
4782 } 4935 }
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4983 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 5136 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4984 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 5137 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4985 K(zone->present_pages), 5138 K(zone->present_pages),
4986 K(zone->managed_pages), 5139 K(zone_managed_pages(zone)),
4987 K(zone_page_state(zone, NR_MLOCK)), 5140 K(zone_page_state(zone, NR_MLOCK)),
4988 zone_page_state(zone, NR_KERNEL_STACK_KB), 5141 zone_page_state(zone, NR_KERNEL_STACK_KB),
4989 K(zone_page_state(zone, NR_PAGETABLE)), 5142 K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone)
5655 * The per-cpu-pages pools are set to around 1000th of the 5808 * The per-cpu-pages pools are set to around 1000th of the
5656 * size of the zone. 5809 * size of the zone.
5657 */ 5810 */
5658 batch = zone->managed_pages / 1024; 5811 batch = zone_managed_pages(zone) / 1024;
5659 /* But no more than a meg. */ 5812 /* But no more than a meg. */
5660 if (batch * PAGE_SIZE > 1024 * 1024) 5813 if (batch * PAGE_SIZE > 1024 * 1024)
5661 batch = (1024 * 1024) / PAGE_SIZE; 5814 batch = (1024 * 1024) / PAGE_SIZE;
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p)
5736 memset(p, 0, sizeof(*p)); 5889 memset(p, 0, sizeof(*p));
5737 5890
5738 pcp = &p->pcp; 5891 pcp = &p->pcp;
5739 pcp->count = 0;
5740 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 5892 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5741 INIT_LIST_HEAD(&pcp->lists[migratetype]); 5893 INIT_LIST_HEAD(&pcp->lists[migratetype]);
5742} 5894}
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
5766{ 5918{
5767 if (percpu_pagelist_fraction) 5919 if (percpu_pagelist_fraction)
5768 pageset_set_high(pcp, 5920 pageset_set_high(pcp,
5769 (zone->managed_pages / 5921 (zone_managed_pages(zone) /
5770 percpu_pagelist_fraction)); 5922 percpu_pagelist_fraction));
5771 else 5923 else
5772 pageset_set_batch(pcp, zone_batchsize(zone)); 5924 pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
5920 * with no available memory, a warning is printed and the start and end 6072 * with no available memory, a warning is printed and the start and end
5921 * PFNs will be 0. 6073 * PFNs will be 0.
5922 */ 6074 */
5923void __meminit get_pfn_range_for_nid(unsigned int nid, 6075void __init get_pfn_range_for_nid(unsigned int nid,
5924 unsigned long *start_pfn, unsigned long *end_pfn) 6076 unsigned long *start_pfn, unsigned long *end_pfn)
5925{ 6077{
5926 unsigned long this_start_pfn, this_end_pfn; 6078 unsigned long this_start_pfn, this_end_pfn;
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void)
5969 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 6121 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5970 * zones within a node are in order of monotonically increasing memory addresses 6122
5971 */ 6123 */
5972static void __meminit adjust_zone_range_for_zone_movable(int nid, 6124static void __init adjust_zone_range_for_zone_movable(int nid,
5973 unsigned long zone_type, 6125 unsigned long zone_type,
5974 unsigned long node_start_pfn, 6126 unsigned long node_start_pfn,
5975 unsigned long node_end_pfn, 6127 unsigned long node_end_pfn,
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
6000 * Return the number of pages a zone spans in a node, including holes 6152 * Return the number of pages a zone spans in a node, including holes
6001 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 6153 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6002 */ 6154 */
6003static unsigned long __meminit zone_spanned_pages_in_node(int nid, 6155static unsigned long __init zone_spanned_pages_in_node(int nid,
6004 unsigned long zone_type, 6156 unsigned long zone_type,
6005 unsigned long node_start_pfn, 6157 unsigned long node_start_pfn,
6006 unsigned long node_end_pfn, 6158 unsigned long node_end_pfn,
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6035 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 6187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
6036 * then all holes in the requested range will be accounted for. 6188 * then all holes in the requested range will be accounted for.
6037 */ 6189 */
6038unsigned long __meminit __absent_pages_in_range(int nid, 6190unsigned long __init __absent_pages_in_range(int nid,
6039 unsigned long range_start_pfn, 6191 unsigned long range_start_pfn,
6040 unsigned long range_end_pfn) 6192 unsigned long range_end_pfn)
6041{ 6193{
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6065} 6217}
6066 6218
6067/* Return the number of page frames in holes in a zone on a node */ 6219/* Return the number of page frames in holes in a zone on a node */
6068static unsigned long __meminit zone_absent_pages_in_node(int nid, 6220static unsigned long __init zone_absent_pages_in_node(int nid,
6069 unsigned long zone_type, 6221 unsigned long zone_type,
6070 unsigned long node_start_pfn, 6222 unsigned long node_start_pfn,
6071 unsigned long node_end_pfn, 6223 unsigned long node_end_pfn,
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
6117} 6269}
6118 6270
6119#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6271#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6120static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 6272static inline unsigned long __init zone_spanned_pages_in_node(int nid,
6121 unsigned long zone_type, 6273 unsigned long zone_type,
6122 unsigned long node_start_pfn, 6274 unsigned long node_start_pfn,
6123 unsigned long node_end_pfn, 6275 unsigned long node_end_pfn,
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6136 return zones_size[zone_type]; 6288 return zones_size[zone_type];
6137} 6289}
6138 6290
6139static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 6291static inline unsigned long __init zone_absent_pages_in_node(int nid,
6140 unsigned long zone_type, 6292 unsigned long zone_type,
6141 unsigned long node_start_pfn, 6293 unsigned long node_start_pfn,
6142 unsigned long node_end_pfn, 6294 unsigned long node_end_pfn,
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6150 6302
6151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6303#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6152 6304
6153static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 6305static void __init calculate_node_totalpages(struct pglist_data *pgdat,
6154 unsigned long node_start_pfn, 6306 unsigned long node_start_pfn,
6155 unsigned long node_end_pfn, 6307 unsigned long node_end_pfn,
6156 unsigned long *zones_size, 6308 unsigned long *zones_size,
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
6323static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 6475static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
6324 unsigned long remaining_pages) 6476 unsigned long remaining_pages)
6325{ 6477{
6326 zone->managed_pages = remaining_pages; 6478 atomic_long_set(&zone->managed_pages, remaining_pages);
6327 zone_set_nid(zone, nid); 6479 zone_set_nid(zone, nid);
6328 zone->name = zone_names[idx]; 6480 zone->name = zone_names[idx];
6329 zone->zone_pgdat = NODE_DATA(nid); 6481 zone->zone_pgdat = NODE_DATA(nid);
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6476#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 6628#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6477static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 6629static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
6478{ 6630{
6479 /*
6480 * We start only with one section of pages, more pages are added as
6481 * needed until the rest of deferred pages are initialized.
6482 */
6483 pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6484 pgdat->node_spanned_pages);
6485 pgdat->first_deferred_pfn = ULONG_MAX; 6631 pgdat->first_deferred_pfn = ULONG_MAX;
6486} 6632}
6487#else 6633#else
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore);
7075 7221
7076void adjust_managed_page_count(struct page *page, long count) 7222void adjust_managed_page_count(struct page *page, long count)
7077{ 7223{
7078 spin_lock(&managed_page_count_lock); 7224 atomic_long_add(count, &page_zone(page)->managed_pages);
7079 page_zone(page)->managed_pages += count; 7225 totalram_pages_add(count);
7080 totalram_pages += count;
7081#ifdef CONFIG_HIGHMEM 7226#ifdef CONFIG_HIGHMEM
7082 if (PageHighMem(page)) 7227 if (PageHighMem(page))
7083 totalhigh_pages += count; 7228 totalhigh_pages_add(count);
7084#endif 7229#endif
7085 spin_unlock(&managed_page_count_lock);
7086} 7230}
7087EXPORT_SYMBOL(adjust_managed_page_count); 7231EXPORT_SYMBOL(adjust_managed_page_count);
7088 7232
7089unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 7233unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
7090{ 7234{
7091 void *pos; 7235 void *pos;
7092 unsigned long pages = 0; 7236 unsigned long pages = 0;
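
Several hunks above convert zone->managed_pages, totalram_pages and totalhigh_pages from spinlock-protected counters into atomic longs read through small accessors such as zone_managed_pages(). A loose userspace analogue of that conversion using C11 atomics; this is only an analogy, not the kernel's atomic_long_t machinery:

#include <stdatomic.h>
#include <stdio.h>

struct zone {
        /* was: unsigned long managed_pages protected by managed_page_count_lock */
        atomic_long managed_pages;
};

static long zone_managed_pages(struct zone *zone)
{
        return atomic_load_explicit(&zone->managed_pages, memory_order_relaxed);
}

static void adjust_managed_page_count(struct zone *zone, long count)
{
        /* No lock needed: a single atomic add keeps the counter consistent. */
        atomic_fetch_add_explicit(&zone->managed_pages, count, memory_order_relaxed);
}

int main(void)
{
        struct zone z;

        atomic_init(&z.managed_pages, 1000);
        adjust_managed_page_count(&z, 512);
        adjust_managed_page_count(&z, -8);
        printf("managed pages: %ld\n", zone_managed_pages(&z));  /* 1504 */
        return 0;
}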
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area);
7123void free_highmem_page(struct page *page) 7267void free_highmem_page(struct page *page)
7124{ 7268{
7125 __free_reserved_page(page); 7269 __free_reserved_page(page);
7126 totalram_pages++; 7270 totalram_pages_inc();
7127 page_zone(page)->managed_pages++; 7271 atomic_long_inc(&page_zone(page)->managed_pages);
7128 totalhigh_pages++; 7272 totalhigh_pages_inc();
7129} 7273}
7130#endif 7274#endif
7131 7275
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str)
7174 physpages << (PAGE_SHIFT - 10), 7318 physpages << (PAGE_SHIFT - 10),
7175 codesize >> 10, datasize >> 10, rosize >> 10, 7319 codesize >> 10, datasize >> 10, rosize >> 10,
7176 (init_data_size + init_code_size) >> 10, bss_size >> 10, 7320 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7177 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), 7321 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
7178 totalcma_pages << (PAGE_SHIFT - 10), 7322 totalcma_pages << (PAGE_SHIFT - 10),
7179#ifdef CONFIG_HIGHMEM 7323#ifdef CONFIG_HIGHMEM
7180 totalhigh_pages << (PAGE_SHIFT - 10), 7324 totalhigh_pages() << (PAGE_SHIFT - 10),
7181#endif 7325#endif
7182 str ? ", " : "", str ? str : ""); 7326 str ? ", " : "", str ? str : "");
7183} 7327}
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void)
7257 for (i = 0; i < MAX_NR_ZONES; i++) { 7401 for (i = 0; i < MAX_NR_ZONES; i++) {
7258 struct zone *zone = pgdat->node_zones + i; 7402 struct zone *zone = pgdat->node_zones + i;
7259 long max = 0; 7403 long max = 0;
7404 unsigned long managed_pages = zone_managed_pages(zone);
7260 7405
7261 /* Find valid and maximum lowmem_reserve in the zone */ 7406 /* Find valid and maximum lowmem_reserve in the zone */
7262 for (j = i; j < MAX_NR_ZONES; j++) { 7407 for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void)
7267 /* we treat the high watermark as reserved pages. */ 7412 /* we treat the high watermark as reserved pages. */
7268 max += high_wmark_pages(zone); 7413 max += high_wmark_pages(zone);
7269 7414
7270 if (max > zone->managed_pages) 7415 if (max > managed_pages)
7271 max = zone->managed_pages; 7416 max = managed_pages;
7272 7417
7273 pgdat->totalreserve_pages += max; 7418 pgdat->totalreserve_pages += max;
7274 7419
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void)
7292 for_each_online_pgdat(pgdat) { 7437 for_each_online_pgdat(pgdat) {
7293 for (j = 0; j < MAX_NR_ZONES; j++) { 7438 for (j = 0; j < MAX_NR_ZONES; j++) {
7294 struct zone *zone = pgdat->node_zones + j; 7439 struct zone *zone = pgdat->node_zones + j;
7295 unsigned long managed_pages = zone->managed_pages; 7440 unsigned long managed_pages = zone_managed_pages(zone);
7296 7441
7297 zone->lowmem_reserve[j] = 0; 7442 zone->lowmem_reserve[j] = 0;
7298 7443
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void)
7310 lower_zone->lowmem_reserve[j] = 7455 lower_zone->lowmem_reserve[j] =
7311 managed_pages / sysctl_lowmem_reserve_ratio[idx]; 7456 managed_pages / sysctl_lowmem_reserve_ratio[idx];
7312 } 7457 }
7313 managed_pages += lower_zone->managed_pages; 7458 managed_pages += zone_managed_pages(lower_zone);
7314 } 7459 }
7315 } 7460 }
7316 } 7461 }
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void)
7329 /* Calculate total number of !ZONE_HIGHMEM pages */ 7474 /* Calculate total number of !ZONE_HIGHMEM pages */
7330 for_each_zone(zone) { 7475 for_each_zone(zone) {
7331 if (!is_highmem(zone)) 7476 if (!is_highmem(zone))
7332 lowmem_pages += zone->managed_pages; 7477 lowmem_pages += zone_managed_pages(zone);
7333 } 7478 }
7334 7479
7335 for_each_zone(zone) { 7480 for_each_zone(zone) {
7336 u64 tmp; 7481 u64 tmp;
7337 7482
7338 spin_lock_irqsave(&zone->lock, flags); 7483 spin_lock_irqsave(&zone->lock, flags);
7339 tmp = (u64)pages_min * zone->managed_pages; 7484 tmp = (u64)pages_min * zone_managed_pages(zone);
7340 do_div(tmp, lowmem_pages); 7485 do_div(tmp, lowmem_pages);
7341 if (is_highmem(zone)) { 7486 if (is_highmem(zone)) {
7342 /* 7487 /*
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void)
7350 */ 7495 */
7351 unsigned long min_pages; 7496 unsigned long min_pages;
7352 7497
7353 min_pages = zone->managed_pages / 1024; 7498 min_pages = zone_managed_pages(zone) / 1024;
7354 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 7499 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7355 zone->watermark[WMARK_MIN] = min_pages; 7500 zone->_watermark[WMARK_MIN] = min_pages;
7356 } else { 7501 } else {
7357 /* 7502 /*
7358 * If it's a lowmem zone, reserve a number of pages 7503 * If it's a lowmem zone, reserve a number of pages
7359 * proportionate to the zone's size. 7504 * proportionate to the zone's size.
7360 */ 7505 */
7361 zone->watermark[WMARK_MIN] = tmp; 7506 zone->_watermark[WMARK_MIN] = tmp;
7362 } 7507 }
7363 7508
7364 /* 7509 /*
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void)
7367 * ensure a minimum size on small systems. 7512 * ensure a minimum size on small systems.
7368 */ 7513 */
7369 tmp = max_t(u64, tmp >> 2, 7514 tmp = max_t(u64, tmp >> 2,
7370 mult_frac(zone->managed_pages, 7515 mult_frac(zone_managed_pages(zone),
7371 watermark_scale_factor, 10000)); 7516 watermark_scale_factor, 10000));
7372 7517
7373 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 7518 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
7374 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 7519 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
7520 zone->watermark_boost = 0;
7375 7521
7376 spin_unlock_irqrestore(&zone->lock, flags); 7522 spin_unlock_irqrestore(&zone->lock, flags);
7377 } 7523 }
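
__setup_per_zone_wmarks() above gives each zone a min watermark proportional to its share of managed pages, then spaces the low and high marks by the larger of min/4 and watermark_scale_factor/10000 of managed pages, and clears watermark_boost. The same arithmetic on invented zone sizes; every number below is illustrative only:

#include <stdio.h>

#define mult_frac(x, num, den) (((x) / (den)) * (num) + ((x) % (den)) * (num) / (den))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long pages_min = 1024 * 1024 / 4096;   /* min_free_kbytes=1024 -> 256 pages   */
        unsigned long lowmem_pages = 2000000;           /* all !HIGHMEM managed pages (made up) */
        unsigned long managed = 500000;                 /* this zone's managed pages (made up)  */
        unsigned long watermark_scale_factor = 10;

        unsigned long wmark_min = pages_min * managed / lowmem_pages;
        unsigned long tmp = max(wmark_min / 4,
                                mult_frac(managed, watermark_scale_factor, 10000));

        /* min=64 low=564 high=1064 for these made-up numbers */
        printf("min=%lu low=%lu high=%lu boost=0\n",
               wmark_min, wmark_min + tmp, wmark_min + 2 * tmp);
        return 0;
}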
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7472 return 0; 7618 return 0;
7473} 7619}
7474 7620
7621int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
7622 void __user *buffer, size_t *length, loff_t *ppos)
7623{
7624 int rc;
7625
7626 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7627 if (rc)
7628 return rc;
7629
7630 return 0;
7631}
7632
7475int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 7633int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7476 void __user *buffer, size_t *length, loff_t *ppos) 7634 void __user *buffer, size_t *length, loff_t *ppos)
7477{ 7635{
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void)
7497 pgdat->min_unmapped_pages = 0; 7655 pgdat->min_unmapped_pages = 0;
7498 7656
7499 for_each_zone(zone) 7657 for_each_zone(zone)
7500 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * 7658 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
7501 sysctl_min_unmapped_ratio) / 100; 7659 sysctl_min_unmapped_ratio) / 100;
7502} 7660}
7503 7661
7504 7662
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void)
7525 pgdat->min_slab_pages = 0; 7683 pgdat->min_slab_pages = 0;
7526 7684
7527 for_each_zone(zone) 7685 for_each_zone(zone)
7528 zone->zone_pgdat->min_slab_pages += (zone->managed_pages * 7686 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
7529 sysctl_min_slab_ratio) / 100; 7687 sysctl_min_slab_ratio) / 100;
7530} 7688}
7531 7689
7532int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 7690int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
7766 * race condition. So you can't expect this function to be exact. 7924
7767 */ 7925 */
7768bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7926bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7769 int migratetype, 7927 int migratetype, int flags)
7770 bool skip_hwpoisoned_pages)
7771{ 7928{
7772 unsigned long pfn, iter, found; 7929 unsigned long pfn, iter, found;
7773 7930
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7841 * The HWPoisoned page may be not in buddy system, and 7998 * The HWPoisoned page may be not in buddy system, and
7842 * page_count() is not 0. 7999 * page_count() is not 0.
7843 */ 8000 */
7844 if (skip_hwpoisoned_pages && PageHWPoison(page)) 8001 if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
7845 continue; 8002 continue;
7846 8003
7847 if (__PageMovable(page)) 8004 if (__PageMovable(page))
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7868 return false; 8025 return false;
7869unmovable: 8026unmovable:
7870 WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); 8027 WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8028 if (flags & REPORT_FAILURE)
8029 dump_page(pfn_to_page(pfn+iter), "unmovable page");
7871 return true; 8030 return true;
7872} 8031}
7873 8032
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7994 */ 8153 */
7995 8154
7996 ret = start_isolate_page_range(pfn_max_align_down(start), 8155 ret = start_isolate_page_range(pfn_max_align_down(start),
7997 pfn_max_align_up(end), migratetype, 8156 pfn_max_align_up(end), migratetype, 0);
7998 false);
7999 if (ret) 8157 if (ret)
8000 return ret; 8158 return ret;
8001 8159
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 43e085608846..ce323e56b34d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,8 +15,7 @@
15#define CREATE_TRACE_POINTS 15#define CREATE_TRACE_POINTS
16#include <trace/events/page_isolation.h> 16#include <trace/events/page_isolation.h>
17 17
18static int set_migratetype_isolate(struct page *page, int migratetype, 18static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
19 bool skip_hwpoisoned_pages)
20{ 19{
21 struct zone *zone; 20 struct zone *zone;
22 unsigned long flags, pfn; 21 unsigned long flags, pfn;
@@ -60,8 +59,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype,
60 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 59 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
61 * We just check MOVABLE pages. 60 * We just check MOVABLE pages.
62 */ 61 */
63 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, 62 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags))
64 skip_hwpoisoned_pages))
65 ret = 0; 63 ret = 0;
66 64
67 /* 65 /*
@@ -185,7 +183,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
185 * prevents two threads from simultaneously working on overlapping ranges. 183 * prevents two threads from simultaneously working on overlapping ranges.
186 */ 184 */
187int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 185int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
188 unsigned migratetype, bool skip_hwpoisoned_pages) 186 unsigned migratetype, int flags)
189{ 187{
190 unsigned long pfn; 188 unsigned long pfn;
191 unsigned long undo_pfn; 189 unsigned long undo_pfn;
@@ -199,7 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
199 pfn += pageblock_nr_pages) { 197 pfn += pageblock_nr_pages) {
200 page = __first_valid_page(pfn, pageblock_nr_pages); 198 page = __first_valid_page(pfn, pageblock_nr_pages);
201 if (page && 199 if (page &&
202 set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { 200 set_migratetype_isolate(page, migratetype, flags)) {
203 undo_pfn = pfn; 201 undo_pfn = pfn;
204 goto undo; 202 goto undo;
205 } 203 }
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 87bc0dfdb52b..28b06524939f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
351 .skip = 0 351 .skip = 0
352 }; 352 };
353 353
354 count = min_t(size_t, count, PAGE_SIZE);
354 kbuf = kmalloc(count, GFP_KERNEL); 355 kbuf = kmalloc(count, GFP_KERNEL);
355 if (!kbuf) 356 if (!kbuf)
356 return -ENOMEM; 357 return -ENOMEM;
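
Note: the one-line page_owner.c change above bounds the user-supplied read length before it sizes the kmalloc() in print_page_owner(), so a read() with a huge count can no longer request an arbitrarily large kernel buffer. A small userspace sketch of the same clamp-before-allocate pattern; the helper name is invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    /* Hypothetical helper mirroring the fix: clamp the caller-controlled
     * length to one page before it becomes an allocation size. */
    static char *alloc_report_buffer(size_t count)
    {
            if (count > PAGE_SIZE)
                    count = PAGE_SIZE;  /* min_t(size_t, count, PAGE_SIZE) */
            return malloc(count);
    }

    int main(void)
    {
            char *buf = alloc_report_buffer(1UL << 30); /* caller asked for 1 GiB */

            if (buf)
                    printf("allocated at most %lu bytes\n", PAGE_SIZE);
            free(buf);
            return 0;
    }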
diff --git a/mm/readahead.c b/mm/readahead.c
index f3d6f9656a3c..1ae16522412a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -270,17 +270,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
270 * return it as the new window size. 270 * return it as the new window size.
271 */ 271 */
272static unsigned long get_next_ra_size(struct file_ra_state *ra, 272static unsigned long get_next_ra_size(struct file_ra_state *ra,
273 unsigned long max) 273 unsigned long max)
274{ 274{
275 unsigned long cur = ra->size; 275 unsigned long cur = ra->size;
276 unsigned long newsize;
277 276
278 if (cur < max / 16) 277 if (cur < max / 16)
279 newsize = 4 * cur; 278 return 4 * cur;
280 else 279 if (cur <= max / 2)
281 newsize = 2 * cur; 280 return 2 * cur;
282 281 return max;
283 return min(newsize, max);
284} 282}
285 283
286/* 284/*
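
Note: the rewritten get_next_ra_size() above drops the temporary and the trailing min(): grow the readahead window 4x while it is small, 2x up to half of the maximum, then saturate. A stand-alone copy of that rule, printing one growth sequence; the starting size and maximum are made up:

    #include <stdio.h>

    /* Same growth rule as the new get_next_ra_size(), lifted out of the
     * kernel for illustration. Sizes are in pages. */
    static unsigned long next_ra_size(unsigned long cur, unsigned long max)
    {
            if (cur < max / 16)
                    return 4 * cur;
            if (cur <= max / 2)
                    return 2 * cur;
            return max;
    }

    int main(void)
    {
            unsigned long ra = 4, max = 512;

            while (ra < max) {
                    printf("%lu -> ", ra);
                    ra = next_ra_size(ra, max);
            }
            printf("%lu\n", ra);        /* 4 -> 16 -> 64 -> 128 -> 256 -> 512 */
            return 0;
    }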
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..21a26cf51114 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,6 +25,7 @@
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 * mapping->i_mmap_rwsem 27 * mapping->i_mmap_rwsem
28 * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
28 * anon_vma->rwsem 29 * anon_vma->rwsem
29 * mm->page_table_lock or pte_lock 30 * mm->page_table_lock or pte_lock
30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 31 * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -889,15 +890,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
889 .address = address, 890 .address = address,
890 .flags = PVMW_SYNC, 891 .flags = PVMW_SYNC,
891 }; 892 };
892 unsigned long start = address, end; 893 struct mmu_notifier_range range;
893 int *cleaned = arg; 894 int *cleaned = arg;
894 895
895 /* 896 /*
896 * We have to assume the worse case ie pmd for invalidation. Note that 897 * We have to assume the worse case ie pmd for invalidation. Note that
897 * the page can not be free from this function. 898 * the page can not be free from this function.
898 */ 899 */
899 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 900 mmu_notifier_range_init(&range, vma->vm_mm, address,
900 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 901 min(vma->vm_end, address +
902 (PAGE_SIZE << compound_order(page))));
903 mmu_notifier_invalidate_range_start(&range);
901 904
902 while (page_vma_mapped_walk(&pvmw)) { 905 while (page_vma_mapped_walk(&pvmw)) {
903 unsigned long cstart; 906 unsigned long cstart;
@@ -949,7 +952,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
949 (*cleaned)++; 952 (*cleaned)++;
950 } 953 }
951 954
952 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 955 mmu_notifier_invalidate_range_end(&range);
953 956
954 return true; 957 return true;
955} 958}
@@ -1017,7 +1020,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1017 1020
1018/** 1021/**
1019 * __page_set_anon_rmap - set up new anonymous rmap 1022 * __page_set_anon_rmap - set up new anonymous rmap
1020 * @page: Page to add to rmap 1023 * @page: Page or Hugepage to add to rmap
1021 * @vma: VM area to add page to. 1024 * @vma: VM area to add page to.
1022 * @address: User virtual address of the mapping 1025 * @address: User virtual address of the mapping
1023 * @exclusive: the page is exclusively owned by the current process 1026 * @exclusive: the page is exclusively owned by the current process
@@ -1345,7 +1348,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1345 pte_t pteval; 1348 pte_t pteval;
1346 struct page *subpage; 1349 struct page *subpage;
1347 bool ret = true; 1350 bool ret = true;
1348 unsigned long start = address, end; 1351 struct mmu_notifier_range range;
1349 enum ttu_flags flags = (enum ttu_flags)arg; 1352 enum ttu_flags flags = (enum ttu_flags)arg;
1350 1353
1351 /* munlock has nothing to gain from examining un-locked vmas */ 1354 /* munlock has nothing to gain from examining un-locked vmas */
@@ -1369,15 +1372,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1369 * Note that the page can not be free in this function as call of 1372 * Note that the page can not be free in this function as call of
1370 * try_to_unmap() must hold a reference on the page. 1373 * try_to_unmap() must hold a reference on the page.
1371 */ 1374 */
1372 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 1375 mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
1376 min(vma->vm_end, vma->vm_start +
1377 (PAGE_SIZE << compound_order(page))));
1373 if (PageHuge(page)) { 1378 if (PageHuge(page)) {
1374 /* 1379 /*
1375 * If sharing is possible, start and end will be adjusted 1380 * If sharing is possible, start and end will be adjusted
1376 * accordingly. 1381 * accordingly.
1382 *
1383 * If called for a huge page, caller must hold i_mmap_rwsem
1384 * in write mode as it is possible to call huge_pmd_unshare.
1377 */ 1385 */
1378 adjust_range_if_pmd_sharing_possible(vma, &start, &end); 1386 adjust_range_if_pmd_sharing_possible(vma, &range.start,
1387 &range.end);
1379 } 1388 }
1380 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 1389 mmu_notifier_invalidate_range_start(&range);
1381 1390
1382 while (page_vma_mapped_walk(&pvmw)) { 1391 while (page_vma_mapped_walk(&pvmw)) {
1383#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1392#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1428,9 +1437,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1428 * we must flush them all. start/end were 1437 * we must flush them all. start/end were
1429 * already adjusted above to cover this range. 1438 * already adjusted above to cover this range.
1430 */ 1439 */
1431 flush_cache_range(vma, start, end); 1440 flush_cache_range(vma, range.start, range.end);
1432 flush_tlb_range(vma, start, end); 1441 flush_tlb_range(vma, range.start, range.end);
1433 mmu_notifier_invalidate_range(mm, start, end); 1442 mmu_notifier_invalidate_range(mm, range.start,
1443 range.end);
1434 1444
1435 /* 1445 /*
1436 * The ref count of the PMD page was dropped 1446 * The ref count of the PMD page was dropped
@@ -1650,7 +1660,7 @@ discard:
1650 put_page(page); 1660 put_page(page);
1651 } 1661 }
1652 1662
1653 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 1663 mmu_notifier_invalidate_range_end(&range);
1654 1664
1655 return ret; 1665 return ret;
1656} 1666}
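
Note: both rmap.c hunks above stop passing (mm, start, end) triples and instead bundle them in a struct mmu_notifier_range that is initialised once and handed to the start/end calls. A kernel-context sketch of the call pattern exactly as it appears in this diff (not a stand-alone program; start and end stand for whatever bounds the caller computed):

    struct mmu_notifier_range range;

    mmu_notifier_range_init(&range, vma->vm_mm, start, end);
    mmu_notifier_invalidate_range_start(&range);

    /* ... walk the rmap and clear or unmap PTEs; a mid-walk flush can
     * still use mmu_notifier_invalidate_range(mm, range.start, range.end)
     * as the try_to_unmap_one() hunk shows ... */

    mmu_notifier_invalidate_range_end(&range);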
@@ -1910,27 +1920,10 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
1910 1920
1911#ifdef CONFIG_HUGETLB_PAGE 1921#ifdef CONFIG_HUGETLB_PAGE
1912/* 1922/*
1913 * The following three functions are for anonymous (private mapped) hugepages. 1923 * The following two functions are for anonymous (private mapped) hugepages.
1914 * Unlike common anonymous pages, anonymous hugepages have no accounting code 1924 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1915 * and no lru code, because we handle hugepages differently from common pages. 1925 * and no lru code, because we handle hugepages differently from common pages.
1916 */ 1926 */
1917static void __hugepage_set_anon_rmap(struct page *page,
1918 struct vm_area_struct *vma, unsigned long address, int exclusive)
1919{
1920 struct anon_vma *anon_vma = vma->anon_vma;
1921
1922 BUG_ON(!anon_vma);
1923
1924 if (PageAnon(page))
1925 return;
1926 if (!exclusive)
1927 anon_vma = anon_vma->root;
1928
1929 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1930 page->mapping = (struct address_space *) anon_vma;
1931 page->index = linear_page_index(vma, address);
1932}
1933
1934void hugepage_add_anon_rmap(struct page *page, 1927void hugepage_add_anon_rmap(struct page *page,
1935 struct vm_area_struct *vma, unsigned long address) 1928 struct vm_area_struct *vma, unsigned long address)
1936{ 1929{
@@ -1942,7 +1935,7 @@ void hugepage_add_anon_rmap(struct page *page,
1942 /* address might be in next vma when migration races vma_adjust */ 1935 /* address might be in next vma when migration races vma_adjust */
1943 first = atomic_inc_and_test(compound_mapcount_ptr(page)); 1936 first = atomic_inc_and_test(compound_mapcount_ptr(page));
1944 if (first) 1937 if (first)
1945 __hugepage_set_anon_rmap(page, vma, address, 0); 1938 __page_set_anon_rmap(page, vma, address, 0);
1946} 1939}
1947 1940
1948void hugepage_add_new_anon_rmap(struct page *page, 1941void hugepage_add_new_anon_rmap(struct page *page,
@@ -1950,6 +1943,6 @@ void hugepage_add_new_anon_rmap(struct page *page,
1950{ 1943{
1951 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1944 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1952 atomic_set(compound_mapcount_ptr(page), 0); 1945 atomic_set(compound_mapcount_ptr(page), 0);
1953 __hugepage_set_anon_rmap(page, vma, address, 1); 1946 __page_set_anon_rmap(page, vma, address, 1);
1954} 1947}
1955#endif /* CONFIG_HUGETLB_PAGE */ 1948#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 375f3ac19bb8..6ece1e2fe76e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -109,12 +109,14 @@ struct shmem_falloc {
109#ifdef CONFIG_TMPFS 109#ifdef CONFIG_TMPFS
110static unsigned long shmem_default_max_blocks(void) 110static unsigned long shmem_default_max_blocks(void)
111{ 111{
112 return totalram_pages / 2; 112 return totalram_pages() / 2;
113} 113}
114 114
115static unsigned long shmem_default_max_inodes(void) 115static unsigned long shmem_default_max_inodes(void)
116{ 116{
117 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 117 unsigned long nr_pages = totalram_pages();
118
119 return min(nr_pages - totalhigh_pages(), nr_pages / 2);
118} 120}
119#endif 121#endif
120 122
@@ -3301,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3301 size = memparse(value,&rest); 3303 size = memparse(value,&rest);
3302 if (*rest == '%') { 3304 if (*rest == '%') {
3303 size <<= PAGE_SHIFT; 3305 size <<= PAGE_SHIFT;
3304 size *= totalram_pages; 3306 size *= totalram_pages();
3305 do_div(size, 100); 3307 do_div(size, 100);
3306 rest++; 3308 rest++;
3307 } 3309 }
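
Note: the shmem.c hunk is one of many in this series (see also slab.c, swap.c, util.c, vmalloc.c, workingset.c and zswap.c below) that replace direct reads of the totalram_pages and totalhigh_pages variables with totalram_pages()/totalhigh_pages() accessor calls. A toy userspace model of the conversion; the real counter behind the kernel helper is an atomic type, and that plumbing is deliberately omitted here:

    #include <stdio.h>

    /* Simplified stand-in: callers stop touching the counter directly and
     * go through an accessor, which lets the backing storage change (for
     * example to an atomic) without touching every call site. */
    static unsigned long totalram_counter = 2097152;    /* 8 GiB in 4 KiB pages, made up */

    static unsigned long totalram_pages(void)
    {
            return totalram_counter;
    }

    int main(void)
    {
            printf("default tmpfs size: %lu pages\n", totalram_pages() / 2);
            return 0;
    }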
diff --git a/mm/slab.c b/mm/slab.c
index 3abb9feb3818..73fe23e649c9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
406 return page->s_mem + cache->size * idx; 406 return page->s_mem + cache->size * idx;
407} 407}
408 408
409/*
410 * We want to avoid an expensive divide : (offset / cache->size)
411 * Using the fact that size is a constant for a particular cache,
412 * we can replace (offset / cache->size) by
413 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
414 */
415static inline unsigned int obj_to_index(const struct kmem_cache *cache,
416 const struct page *page, void *obj)
417{
418 u32 offset = (obj - page->s_mem);
419 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
420}
421
422#define BOOT_CPUCACHE_ENTRIES 1 409#define BOOT_CPUCACHE_ENTRIES 1
423/* internal cache of cache description objs */ 410/* internal cache of cache description objs */
424static struct kmem_cache kmem_cache_boot = { 411static struct kmem_cache kmem_cache_boot = {
@@ -1248,7 +1235,7 @@ void __init kmem_cache_init(void)
1248 * page orders on machines with more than 32MB of memory if 1235 * page orders on machines with more than 32MB of memory if
1249 * not overridden on the command line. 1236 * not overridden on the command line.
1250 */ 1237 */
1251 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1238 if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
1252 slab_max_order = SLAB_MAX_ORDER_HI; 1239 slab_max_order = SLAB_MAX_ORDER_HI;
1253 1240
1254 /* Bootstrap is tricky, because several objects are allocated 1241 /* Bootstrap is tricky, because several objects are allocated
@@ -2370,7 +2357,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2370 void *freelist; 2357 void *freelist;
2371 void *addr = page_address(page); 2358 void *addr = page_address(page);
2372 2359
2373 page->s_mem = addr + colour_off; 2360 page->s_mem = kasan_reset_tag(addr) + colour_off;
2374 page->active = 0; 2361 page->active = 0;
2375 2362
2376 if (OBJFREELIST_SLAB(cachep)) 2363 if (OBJFREELIST_SLAB(cachep))
@@ -2574,7 +2561,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2574 2561
2575 for (i = 0; i < cachep->num; i++) { 2562 for (i = 0; i < cachep->num; i++) {
2576 objp = index_to_obj(cachep, page, i); 2563 objp = index_to_obj(cachep, page, i);
2577 kasan_init_slab_obj(cachep, objp); 2564 objp = kasan_init_slab_obj(cachep, objp);
2578 2565
2579 /* constructor could break poison info */ 2566 /* constructor could break poison info */
2580 if (DEBUG == 0 && cachep->ctor) { 2567 if (DEBUG == 0 && cachep->ctor) {
@@ -3551,7 +3538,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3551{ 3538{
3552 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3539 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3553 3540
3554 kasan_slab_alloc(cachep, ret, flags); 3541 ret = kasan_slab_alloc(cachep, ret, flags);
3555 trace_kmem_cache_alloc(_RET_IP_, ret, 3542 trace_kmem_cache_alloc(_RET_IP_, ret,
3556 cachep->object_size, cachep->size, flags); 3543 cachep->object_size, cachep->size, flags);
3557 3544
@@ -3617,7 +3604,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3617 3604
3618 ret = slab_alloc(cachep, flags, _RET_IP_); 3605 ret = slab_alloc(cachep, flags, _RET_IP_);
3619 3606
3620 kasan_kmalloc(cachep, ret, size, flags); 3607 ret = kasan_kmalloc(cachep, ret, size, flags);
3621 trace_kmalloc(_RET_IP_, ret, 3608 trace_kmalloc(_RET_IP_, ret,
3622 size, cachep->size, flags); 3609 size, cachep->size, flags);
3623 return ret; 3610 return ret;
@@ -3641,7 +3628,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3641{ 3628{
3642 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3629 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3643 3630
3644 kasan_slab_alloc(cachep, ret, flags); 3631 ret = kasan_slab_alloc(cachep, ret, flags);
3645 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3632 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3646 cachep->object_size, cachep->size, 3633 cachep->object_size, cachep->size,
3647 flags, nodeid); 3634 flags, nodeid);
@@ -3660,7 +3647,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3660 3647
3661 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3648 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3662 3649
3663 kasan_kmalloc(cachep, ret, size, flags); 3650 ret = kasan_kmalloc(cachep, ret, size, flags);
3664 trace_kmalloc_node(_RET_IP_, ret, 3651 trace_kmalloc_node(_RET_IP_, ret,
3665 size, cachep->size, 3652 size, cachep->size,
3666 flags, nodeid); 3653 flags, nodeid);
@@ -3681,7 +3668,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3681 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3668 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3682 return cachep; 3669 return cachep;
3683 ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); 3670 ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
3684 kasan_kmalloc(cachep, ret, size, flags); 3671 ret = kasan_kmalloc(cachep, ret, size, flags);
3685 3672
3686 return ret; 3673 return ret;
3687} 3674}
@@ -3719,7 +3706,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3719 return cachep; 3706 return cachep;
3720 ret = slab_alloc(cachep, flags, caller); 3707 ret = slab_alloc(cachep, flags, caller);
3721 3708
3722 kasan_kmalloc(cachep, ret, size, flags); 3709 ret = kasan_kmalloc(cachep, ret, size, flags);
3723 trace_kmalloc(caller, ret, 3710 trace_kmalloc(caller, ret,
3724 size, cachep->size, flags); 3711 size, cachep->size, flags);
3725 3712
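
Note: every kasan_slab_alloc()/kasan_kmalloc()/kasan_init_slab_obj() call in slab.c (and in slab.h, slab_common.c and slub.c below) now assigns the hook's return value back to the object pointer. With the software tag-based mode added in this series, the hook may hand back a pointer whose top byte carries a tag, so callers must continue with the returned value; the same reasoning explains why the krealloc() hunk below compares tag-stripped pointers. A userspace model of that top-byte tagging; the tag values and helpers are illustrative only and assume 64-bit pointers:

    #include <stdint.h>
    #include <stdio.h>

    #define TAG_SHIFT 56

    /* Place an arbitrary tag in the top byte of a pointer (ignored by the
     * hardware on arm64 with Top Byte Ignore) and strip it again. */
    static void *set_tag(void *ptr, uint8_t tag)
    {
            uintptr_t p = (uintptr_t)ptr & ~((uintptr_t)0xff << TAG_SHIFT);

            return (void *)(p | ((uintptr_t)tag << TAG_SHIFT));
    }

    static void *reset_tag(void *ptr)
    {
            return set_tag(ptr, 0x00);
    }

    int main(void)
    {
            int object;
            void *a = set_tag(&object, 0x2a);   /* what a tagging allocator might return */
            void *b = set_tag(&object, 0x07);   /* same memory, different tag */

            printf("a=%p b=%p\n", a, b);
            /* Same allocation once tags are stripped: the check krealloc()
             * now performs with kasan_reset_tag(). */
            printf("same object: %d\n", reset_tag(a) == reset_tag(b));
            return 0;
    }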
diff --git a/mm/slab.h b/mm/slab.h
index 58c6c1c2a78e..4190c24ef0e9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -441,7 +441,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
441 441
442 kmemleak_alloc_recursive(object, s->object_size, 1, 442 kmemleak_alloc_recursive(object, s->object_size, 1,
443 s->flags, flags); 443 s->flags, flags);
444 kasan_slab_alloc(s, object, flags); 444 p[i] = kasan_slab_alloc(s, object, flags);
445 } 445 }
446 446
447 if (memcg_kmem_enabled()) 447 if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9c11e8a937d2..70b0cc85db67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1029,10 +1029,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
1029 1029
1030 index = size_index[size_index_elem(size)]; 1030 index = size_index[size_index_elem(size)];
1031 } else { 1031 } else {
1032 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 1032 if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
1033 WARN_ON(1);
1034 return NULL; 1033 return NULL;
1035 }
1036 index = fls(size - 1); 1034 index = fls(size - 1);
1037 } 1035 }
1038 1036
@@ -1204,7 +1202,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1204 page = alloc_pages(flags, order); 1202 page = alloc_pages(flags, order);
1205 ret = page ? page_address(page) : NULL; 1203 ret = page ? page_address(page) : NULL;
1206 kmemleak_alloc(ret, size, 1, flags); 1204 kmemleak_alloc(ret, size, 1, flags);
1207 kasan_kmalloc_large(ret, size, flags); 1205 ret = kasan_kmalloc_large(ret, size, flags);
1208 return ret; 1206 return ret;
1209} 1207}
1210EXPORT_SYMBOL(kmalloc_order); 1208EXPORT_SYMBOL(kmalloc_order);
@@ -1482,7 +1480,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1482 ks = ksize(p); 1480 ks = ksize(p);
1483 1481
1484 if (ks >= new_size) { 1482 if (ks >= new_size) {
1485 kasan_krealloc((void *)p, new_size, flags); 1483 p = kasan_krealloc((void *)p, new_size, flags);
1486 return (void *)p; 1484 return (void *)p;
1487 } 1485 }
1488 1486
@@ -1534,7 +1532,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
1534 } 1532 }
1535 1533
1536 ret = __do_krealloc(p, new_size, flags); 1534 ret = __do_krealloc(p, new_size, flags);
1537 if (ret && p != ret) 1535 if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
1538 kfree(p); 1536 kfree(p);
1539 1537
1540 return ret; 1538 return ret;
diff --git a/mm/slub.c b/mm/slub.c
index e3629cd7aff1..36c0befeebd8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1372,10 +1372,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1372 * Hooks for other subsystems that check memory allocations. In a typical 1372 * Hooks for other subsystems that check memory allocations. In a typical
1373 * production configuration these hooks all should produce no code at all. 1373 * production configuration these hooks all should produce no code at all.
1374 */ 1374 */
1375static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1375static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1376{ 1376{
1377 kmemleak_alloc(ptr, size, 1, flags); 1377 kmemleak_alloc(ptr, size, 1, flags);
1378 kasan_kmalloc_large(ptr, size, flags); 1378 return kasan_kmalloc_large(ptr, size, flags);
1379} 1379}
1380 1380
1381static __always_inline void kfree_hook(void *x) 1381static __always_inline void kfree_hook(void *x)
@@ -1451,16 +1451,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
1451#endif 1451#endif
1452} 1452}
1453 1453
1454static void setup_object(struct kmem_cache *s, struct page *page, 1454static void *setup_object(struct kmem_cache *s, struct page *page,
1455 void *object) 1455 void *object)
1456{ 1456{
1457 setup_object_debug(s, page, object); 1457 setup_object_debug(s, page, object);
1458 kasan_init_slab_obj(s, object); 1458 object = kasan_init_slab_obj(s, object);
1459 if (unlikely(s->ctor)) { 1459 if (unlikely(s->ctor)) {
1460 kasan_unpoison_object_data(s, object); 1460 kasan_unpoison_object_data(s, object);
1461 s->ctor(object); 1461 s->ctor(object);
1462 kasan_poison_object_data(s, object); 1462 kasan_poison_object_data(s, object);
1463 } 1463 }
1464 return object;
1464} 1465}
1465 1466
1466/* 1467/*
@@ -1568,16 +1569,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1568 /* First entry is used as the base of the freelist */ 1569 /* First entry is used as the base of the freelist */
1569 cur = next_freelist_entry(s, page, &pos, start, page_limit, 1570 cur = next_freelist_entry(s, page, &pos, start, page_limit,
1570 freelist_count); 1571 freelist_count);
1572 cur = setup_object(s, page, cur);
1571 page->freelist = cur; 1573 page->freelist = cur;
1572 1574
1573 for (idx = 1; idx < page->objects; idx++) { 1575 for (idx = 1; idx < page->objects; idx++) {
1574 setup_object(s, page, cur);
1575 next = next_freelist_entry(s, page, &pos, start, page_limit, 1576 next = next_freelist_entry(s, page, &pos, start, page_limit,
1576 freelist_count); 1577 freelist_count);
1578 next = setup_object(s, page, next);
1577 set_freepointer(s, cur, next); 1579 set_freepointer(s, cur, next);
1578 cur = next; 1580 cur = next;
1579 } 1581 }
1580 setup_object(s, page, cur);
1581 set_freepointer(s, cur, NULL); 1582 set_freepointer(s, cur, NULL);
1582 1583
1583 return true; 1584 return true;
@@ -1599,7 +1600,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1599 struct page *page; 1600 struct page *page;
1600 struct kmem_cache_order_objects oo = s->oo; 1601 struct kmem_cache_order_objects oo = s->oo;
1601 gfp_t alloc_gfp; 1602 gfp_t alloc_gfp;
1602 void *start, *p; 1603 void *start, *p, *next;
1603 int idx, order; 1604 int idx, order;
1604 bool shuffle; 1605 bool shuffle;
1605 1606
@@ -1651,13 +1652,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1651 1652
1652 if (!shuffle) { 1653 if (!shuffle) {
1653 for_each_object_idx(p, idx, s, start, page->objects) { 1654 for_each_object_idx(p, idx, s, start, page->objects) {
1654 setup_object(s, page, p); 1655 if (likely(idx < page->objects)) {
1655 if (likely(idx < page->objects)) 1656 next = p + s->size;
1656 set_freepointer(s, p, p + s->size); 1657 next = setup_object(s, page, next);
1657 else 1658 set_freepointer(s, p, next);
1659 } else
1658 set_freepointer(s, p, NULL); 1660 set_freepointer(s, p, NULL);
1659 } 1661 }
1660 page->freelist = fixup_red_left(s, start); 1662 start = fixup_red_left(s, start);
1663 start = setup_object(s, page, start);
1664 page->freelist = start;
1661 } 1665 }
1662 1666
1663 page->inuse = page->objects; 1667 page->inuse = page->objects;
@@ -2127,26 +2131,15 @@ redo:
2127 } 2131 }
2128 2132
2129 if (l != m) { 2133 if (l != m) {
2130
2131 if (l == M_PARTIAL) 2134 if (l == M_PARTIAL)
2132
2133 remove_partial(n, page); 2135 remove_partial(n, page);
2134
2135 else if (l == M_FULL) 2136 else if (l == M_FULL)
2136
2137 remove_full(s, n, page); 2137 remove_full(s, n, page);
2138 2138
2139 if (m == M_PARTIAL) { 2139 if (m == M_PARTIAL)
2140
2141 add_partial(n, page, tail); 2140 add_partial(n, page, tail);
2142 stat(s, tail); 2141 else if (m == M_FULL)
2143
2144 } else if (m == M_FULL) {
2145
2146 stat(s, DEACTIVATE_FULL);
2147 add_full(s, n, page); 2142 add_full(s, n, page);
2148
2149 }
2150 } 2143 }
2151 2144
2152 l = m; 2145 l = m;
@@ -2159,7 +2152,11 @@ redo:
2159 if (lock) 2152 if (lock)
2160 spin_unlock(&n->list_lock); 2153 spin_unlock(&n->list_lock);
2161 2154
2162 if (m == M_FREE) { 2155 if (m == M_PARTIAL)
2156 stat(s, tail);
2157 else if (m == M_FULL)
2158 stat(s, DEACTIVATE_FULL);
2159 else if (m == M_FREE) {
2163 stat(s, DEACTIVATE_EMPTY); 2160 stat(s, DEACTIVATE_EMPTY);
2164 discard_slab(s, page); 2161 discard_slab(s, page);
2165 stat(s, FREE_SLAB); 2162 stat(s, FREE_SLAB);
@@ -2313,12 +2310,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2313{ 2310{
2314 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2311 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2315 2312
2316 if (likely(c)) { 2313 if (c->page)
2317 if (c->page) 2314 flush_slab(s, c);
2318 flush_slab(s, c);
2319 2315
2320 unfreeze_partials(s, c); 2316 unfreeze_partials(s, c);
2321 }
2322} 2317}
2323 2318
2324static void flush_cpu_slab(void *d) 2319static void flush_cpu_slab(void *d)
@@ -2367,7 +2362,7 @@ static int slub_cpu_dead(unsigned int cpu)
2367static inline int node_match(struct page *page, int node) 2362static inline int node_match(struct page *page, int node)
2368{ 2363{
2369#ifdef CONFIG_NUMA 2364#ifdef CONFIG_NUMA
2370 if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) 2365 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2371 return 0; 2366 return 0;
2372#endif 2367#endif
2373 return 1; 2368 return 1;
@@ -2768,7 +2763,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2768{ 2763{
2769 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2764 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2770 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2765 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2771 kasan_kmalloc(s, ret, size, gfpflags); 2766 ret = kasan_kmalloc(s, ret, size, gfpflags);
2772 return ret; 2767 return ret;
2773} 2768}
2774EXPORT_SYMBOL(kmem_cache_alloc_trace); 2769EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2796,7 +2791,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2796 trace_kmalloc_node(_RET_IP_, ret, 2791 trace_kmalloc_node(_RET_IP_, ret,
2797 size, s->size, gfpflags, node); 2792 size, s->size, gfpflags, node);
2798 2793
2799 kasan_kmalloc(s, ret, size, gfpflags); 2794 ret = kasan_kmalloc(s, ret, size, gfpflags);
2800 return ret; 2795 return ret;
2801} 2796}
2802EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2797EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2992,7 +2987,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2992 do_slab_free(s, page, head, tail, cnt, addr); 2987 do_slab_free(s, page, head, tail, cnt, addr);
2993} 2988}
2994 2989
2995#ifdef CONFIG_KASAN 2990#ifdef CONFIG_KASAN_GENERIC
2996void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 2991void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
2997{ 2992{
2998 do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); 2993 do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
@@ -3364,16 +3359,16 @@ static void early_kmem_cache_node_alloc(int node)
3364 3359
3365 n = page->freelist; 3360 n = page->freelist;
3366 BUG_ON(!n); 3361 BUG_ON(!n);
3367 page->freelist = get_freepointer(kmem_cache_node, n);
3368 page->inuse = 1;
3369 page->frozen = 0;
3370 kmem_cache_node->node[node] = n;
3371#ifdef CONFIG_SLUB_DEBUG 3362#ifdef CONFIG_SLUB_DEBUG
3372 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 3363 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3373 init_tracking(kmem_cache_node, n); 3364 init_tracking(kmem_cache_node, n);
3374#endif 3365#endif
3375 kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), 3366 n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
3376 GFP_KERNEL); 3367 GFP_KERNEL);
3368 page->freelist = get_freepointer(kmem_cache_node, n);
3369 page->inuse = 1;
3370 page->frozen = 0;
3371 kmem_cache_node->node[node] = n;
3377 init_kmem_cache_node(n); 3372 init_kmem_cache_node(n);
3378 inc_slabs_node(kmem_cache_node, node, page->objects); 3373 inc_slabs_node(kmem_cache_node, node, page->objects);
3379 3374
@@ -3784,7 +3779,7 @@ void *__kmalloc(size_t size, gfp_t flags)
3784 3779
3785 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3780 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3786 3781
3787 kasan_kmalloc(s, ret, size, flags); 3782 ret = kasan_kmalloc(s, ret, size, flags);
3788 3783
3789 return ret; 3784 return ret;
3790} 3785}
@@ -3801,8 +3796,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3801 if (page) 3796 if (page)
3802 ptr = page_address(page); 3797 ptr = page_address(page);
3803 3798
3804 kmalloc_large_node_hook(ptr, size, flags); 3799 return kmalloc_large_node_hook(ptr, size, flags);
3805 return ptr;
3806} 3800}
3807 3801
3808void *__kmalloc_node(size_t size, gfp_t flags, int node) 3802void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3829,7 +3823,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3829 3823
3830 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3824 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3831 3825
3832 kasan_kmalloc(s, ret, size, flags); 3826 ret = kasan_kmalloc(s, ret, size, flags);
3833 3827
3834 return ret; 3828 return ret;
3835} 3829}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3abc8cc50201..7ea5dc6c6b19 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -678,25 +678,24 @@ static void free_map_bootmem(struct page *memmap)
678 * set. If this is <=0, then that means that the passed-in 678 * set. If this is <=0, then that means that the passed-in
679 * map was not consumed and must be freed. 679 * map was not consumed and must be freed.
680 */ 680 */
681int __meminit sparse_add_one_section(struct pglist_data *pgdat, 681int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
682 unsigned long start_pfn, struct vmem_altmap *altmap) 682 struct vmem_altmap *altmap)
683{ 683{
684 unsigned long section_nr = pfn_to_section_nr(start_pfn); 684 unsigned long section_nr = pfn_to_section_nr(start_pfn);
685 struct mem_section *ms; 685 struct mem_section *ms;
686 struct page *memmap; 686 struct page *memmap;
687 unsigned long *usemap; 687 unsigned long *usemap;
688 unsigned long flags;
689 int ret; 688 int ret;
690 689
691 /* 690 /*
692 * no locking for this, because it does its own 691 * no locking for this, because it does its own
693 * plus, it does a kmalloc 692 * plus, it does a kmalloc
694 */ 693 */
695 ret = sparse_index_init(section_nr, pgdat->node_id); 694 ret = sparse_index_init(section_nr, nid);
696 if (ret < 0 && ret != -EEXIST) 695 if (ret < 0 && ret != -EEXIST)
697 return ret; 696 return ret;
698 ret = 0; 697 ret = 0;
699 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); 698 memmap = kmalloc_section_memmap(section_nr, nid, altmap);
700 if (!memmap) 699 if (!memmap)
701 return -ENOMEM; 700 return -ENOMEM;
702 usemap = __kmalloc_section_usemap(); 701 usemap = __kmalloc_section_usemap();
@@ -705,8 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
705 return -ENOMEM; 704 return -ENOMEM;
706 } 705 }
707 706
708 pgdat_resize_lock(pgdat, &flags);
709
710 ms = __pfn_to_section(start_pfn); 707 ms = __pfn_to_section(start_pfn);
711 if (ms->section_mem_map & SECTION_MARKED_PRESENT) { 708 if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
712 ret = -EEXIST; 709 ret = -EEXIST;
@@ -723,7 +720,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
723 sparse_init_one_section(ms, section_nr, memmap, usemap); 720 sparse_init_one_section(ms, section_nr, memmap, usemap);
724 721
725out: 722out:
726 pgdat_resize_unlock(pgdat, &flags);
727 if (ret < 0) { 723 if (ret < 0) {
728 kfree(usemap); 724 kfree(usemap);
729 __kfree_section_memmap(memmap, altmap); 725 __kfree_section_memmap(memmap, altmap);
@@ -740,6 +736,15 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
740 if (!memmap) 736 if (!memmap)
741 return; 737 return;
742 738
739 /*
740 * A further optimization is to have per section refcounted
741 * num_poisoned_pages. But that would need more space per memmap, so
742 * for now just do a quick global check to speed up this routine in the
743 * absence of bad pages.
744 */
745 if (atomic_long_read(&num_poisoned_pages) == 0)
746 return;
747
743 for (i = 0; i < nr_pages; i++) { 748 for (i = 0; i < nr_pages; i++) {
744 if (PageHWPoison(&memmap[i])) { 749 if (PageHWPoison(&memmap[i])) {
745 atomic_long_sub(1, &num_poisoned_pages); 750 atomic_long_sub(1, &num_poisoned_pages);
@@ -785,10 +790,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
785 unsigned long map_offset, struct vmem_altmap *altmap) 790 unsigned long map_offset, struct vmem_altmap *altmap)
786{ 791{
787 struct page *memmap = NULL; 792 struct page *memmap = NULL;
788 unsigned long *usemap = NULL, flags; 793 unsigned long *usemap = NULL;
789 struct pglist_data *pgdat = zone->zone_pgdat;
790 794
791 pgdat_resize_lock(pgdat, &flags);
792 if (ms->section_mem_map) { 795 if (ms->section_mem_map) {
793 usemap = ms->pageblock_flags; 796 usemap = ms->pageblock_flags;
794 memmap = sparse_decode_mem_map(ms->section_mem_map, 797 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -796,7 +799,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
796 ms->section_mem_map = 0; 799 ms->section_mem_map = 0;
797 ms->pageblock_flags = NULL; 800 ms->pageblock_flags = NULL;
798 } 801 }
799 pgdat_resize_unlock(pgdat, &flags);
800 802
801 clear_hwpoisoned_pages(memmap + map_offset, 803 clear_hwpoisoned_pages(memmap + map_offset,
802 PAGES_PER_SECTION - map_offset); 804 PAGES_PER_SECTION - map_offset);
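
Note: the sparse.c hunks above narrow sparse_add_one_section() to take a node id instead of a pgdat and drop the pgdat_resize_lock around section setup and teardown. A sketch of the call-site change implied by the new prototype; the caller shown is hypothetical and only illustrates where the nid would come from:

    /* before: the hot-add path handed in the whole pgdat */
    ret = sparse_add_one_section(NODE_DATA(nid), start_pfn, altmap);

    /* after: only the node id is needed */
    ret = sparse_add_one_section(nid, start_pfn, altmap);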
diff --git a/mm/swap.c b/mm/swap.c
index 5d786019eab9..4d8a1f1afaab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
1022 */ 1022 */
1023void __init swap_setup(void) 1023void __init swap_setup(void)
1024{ 1024{
1025 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 1025 unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1026 1026
1027 /* Use a smaller cluster for small-memory machines */ 1027 /* Use a smaller cluster for small-memory machines */
1028 if (megs < 16) 1028 if (megs < 16)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..dbac1d49469d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
2197 */ 2197 */
2198 if (PageSwapCache(page) && 2198 if (PageSwapCache(page) &&
2199 likely(page_private(page) == entry.val) && 2199 likely(page_private(page) == entry.val) &&
2200 !page_swapped(page)) 2200 (!PageTransCompound(page) ||
2201 !swap_page_trans_huge_swapped(si, entry)))
2201 delete_from_swap_cache(compound_head(page)); 2202 delete_from_swap_cache(compound_head(page));
2202 2203
2203 /* 2204 /*
@@ -2812,8 +2813,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2812 struct swap_info_struct *p; 2813 struct swap_info_struct *p;
2813 unsigned int type; 2814 unsigned int type;
2814 int i; 2815 int i;
2816 int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2815 2817
2816 p = kvzalloc(sizeof(*p), GFP_KERNEL); 2818 p = kvzalloc(size, GFP_KERNEL);
2817 if (!p) 2819 if (!p)
2818 return ERR_PTR(-ENOMEM); 2820 return ERR_PTR(-ENOMEM);
2819 2821
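
Note: the alloc_swap_info() hunk sizes the allocation as sizeof(*p) plus one plist_node per NUMA node, because swap_info_struct now ends in a per-node array carved out of the same allocation. A stand-alone model of sizing a struct with a trailing flexible array at run time; the type and function names are invented for the example:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_plist_node {
            struct toy_plist_node *prev, *next;
    };

    struct toy_swap_info {
            int type;
            unsigned int flags;
            struct toy_plist_node avail_lists[];        /* one entry per NUMA node */
    };

    static struct toy_swap_info *alloc_toy_swap_info(int nr_node_ids)
    {
            size_t size = sizeof(struct toy_swap_info) +
                          nr_node_ids * sizeof(struct toy_plist_node);

            return calloc(1, size);                     /* kvzalloc() stand-in */
    }

    int main(void)
    {
            struct toy_swap_info *p = alloc_toy_swap_info(4);

            printf("header plus 4 per-node tails: %zu bytes\n",
                   sizeof(struct toy_swap_info) + 4 * sizeof(struct toy_plist_node));
            free(p);
            return 0;
    }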
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..48368589f519 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,10 +267,14 @@ retry:
267 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 267 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
268 268
269 /* 269 /*
270 * Serialize via hugetlb_fault_mutex 270 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
271 * i_mmap_rwsem ensures the dst_pte remains valid even
272 * in the case of shared pmds. fault mutex prevents
273 * races with other faulting threads.
271 */ 274 */
272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping; 275 mapping = dst_vma->vm_file->f_mapping;
276 i_mmap_lock_read(mapping);
277 idx = linear_page_index(dst_vma, dst_addr);
274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 278 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
275 idx, dst_addr); 279 idx, dst_addr);
276 mutex_lock(&hugetlb_fault_mutex_table[hash]); 280 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -279,6 +283,7 @@ retry:
279 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 283 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
280 if (!dst_pte) { 284 if (!dst_pte) {
281 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 285 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
286 i_mmap_unlock_read(mapping);
282 goto out_unlock; 287 goto out_unlock;
283 } 288 }
284 289
@@ -286,6 +291,7 @@ retry:
286 dst_pteval = huge_ptep_get(dst_pte); 291 dst_pteval = huge_ptep_get(dst_pte);
287 if (!huge_pte_none(dst_pteval)) { 292 if (!huge_pte_none(dst_pteval)) {
288 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 293 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
294 i_mmap_unlock_read(mapping);
289 goto out_unlock; 295 goto out_unlock;
290 } 296 }
291 297
@@ -293,6 +299,7 @@ retry:
293 dst_addr, src_addr, &page); 299 dst_addr, src_addr, &page);
294 300
295 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 301 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
302 i_mmap_unlock_read(mapping);
296 vm_alloc_shared = vm_shared; 303 vm_alloc_shared = vm_shared;
297 304
298 cond_resched(); 305 cond_resched();
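
Note: the userfaultfd.c hunks above take i_mmap_rwsem in read mode around the existing hugetlb fault mutex so that a shared PMD cannot be unshared, and dst_pte freed, while the copy is in flight; the rmap.c locking comment added earlier in this diff documents the same i_mmap_rwsem before hugetlb_fault_mutex order. A kernel-context sketch of the resulting critical section, condensed from the hunks above (not stand-alone code):

    i_mmap_lock_read(mapping);
    idx = linear_page_index(dst_vma, dst_addr);
    hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr);
    mutex_lock(&hugetlb_fault_mutex_table[hash]);

    /* ... allocate or look up dst_pte and copy the huge page ... */

    mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    i_mmap_unlock_read(mapping);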
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..4df23d64aac7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void)
593 if (sysctl_overcommit_kbytes) 593 if (sysctl_overcommit_kbytes)
594 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); 594 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
595 else 595 else
596 allowed = ((totalram_pages - hugetlb_total_pages()) 596 allowed = ((totalram_pages() - hugetlb_total_pages())
597 * sysctl_overcommit_ratio / 100); 597 * sysctl_overcommit_ratio / 100);
598 allowed += total_swap_pages; 598 allowed += total_swap_pages;
599 599
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..871e41c55e23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count,
1634 1634
1635 might_sleep(); 1635 might_sleep();
1636 1636
1637 if (count > totalram_pages) 1637 if (count > totalram_pages())
1638 return NULL; 1638 return NULL;
1639 1639
1640 size = (unsigned long)count << PAGE_SHIFT; 1640 size = (unsigned long)count << PAGE_SHIFT;
@@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1739 unsigned long real_size = size; 1739 unsigned long real_size = size;
1740 1740
1741 size = PAGE_ALIGN(size); 1741 size = PAGE_ALIGN(size);
1742 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1742 if (!size || (size >> PAGE_SHIFT) > totalram_pages())
1743 goto fail; 1743 goto fail;
1744 1744
1745 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | 1745 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
88 /* Can pages be swapped as part of reclaim? */ 88 /* Can pages be swapped as part of reclaim? */
89 unsigned int may_swap:1; 89 unsigned int may_swap:1;
90 90
91 /* e.g. boosted watermark reclaim leaves slabs alone */
92 unsigned int may_shrinkslab:1;
93
91 /* 94 /*
92 * Cgroups are not reclaimed below their configured memory.low, 95 * Cgroups are not reclaimed below their configured memory.low,
93 * unless we threaten to OOM. If any cgroups are skipped due to 96 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1457 count_memcg_page_event(page, PGLAZYFREED); 1460 count_memcg_page_event(page, PGLAZYFREED);
1458 } else if (!mapping || !__remove_mapping(mapping, page, true)) 1461 } else if (!mapping || !__remove_mapping(mapping, page, true))
1459 goto keep_locked; 1462 goto keep_locked;
1460 /* 1463
1461 * At this point, we have no other references and there is 1464 unlock_page(page);
1462 * no way to pick any more up (removed from LRU, removed
1463 * from pagecache). Can use non-atomic bitops now (and
1464 * we obviously don't have to worry about waking up a process
1465 * waiting on the page lock, because there are no references.
1466 */
1467 __ClearPageLocked(page);
1468free_it: 1465free_it:
1469 nr_reclaimed++; 1466 nr_reclaimed++;
1470 1467
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2756 shrink_node_memcg(pgdat, memcg, sc, &lru_pages); 2753 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2757 node_lru_pages += lru_pages; 2754 node_lru_pages += lru_pages;
2758 2755
2759 shrink_slab(sc->gfp_mask, pgdat->node_id, 2756 if (sc->may_shrinkslab) {
2757 shrink_slab(sc->gfp_mask, pgdat->node_id,
2760 memcg, sc->priority); 2758 memcg, sc->priority);
2759 }
2761 2760
2762 /* Record the group's reclaim efficiency */ 2761 /* Record the group's reclaim efficiency */
2763 vmpressure(sc->gfp_mask, memcg, false, 2762 vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3239 .may_writepage = !laptop_mode, 3238 .may_writepage = !laptop_mode,
3240 .may_unmap = 1, 3239 .may_unmap = 1,
3241 .may_swap = 1, 3240 .may_swap = 1,
3241 .may_shrinkslab = 1,
3242 }; 3242 };
3243 3243
3244 /* 3244 /*
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3283 .may_unmap = 1, 3283 .may_unmap = 1,
3284 .reclaim_idx = MAX_NR_ZONES - 1, 3284 .reclaim_idx = MAX_NR_ZONES - 1,
3285 .may_swap = !noswap, 3285 .may_swap = !noswap,
3286 .may_shrinkslab = 1,
3286 }; 3287 };
3287 unsigned long lru_pages; 3288 unsigned long lru_pages;
3288 3289
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3329 .may_writepage = !laptop_mode, 3330 .may_writepage = !laptop_mode,
3330 .may_unmap = 1, 3331 .may_unmap = 1,
3331 .may_swap = may_swap, 3332 .may_swap = may_swap,
3333 .may_shrinkslab = 1,
3332 }; 3334 };
3333 3335
3334 /* 3336 /*
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat,
3379 } while (memcg); 3381 } while (memcg);
3380} 3382}
3381 3383
3384static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
3385{
3386 int i;
3387 struct zone *zone;
3388
3389 /*
3390 * Check for watermark boosts top-down as the higher zones
3391 * are more likely to be boosted. Both watermarks and boosts
3392 * should not be checked at the same time as reclaim would
3393 * start prematurely when there is no boosting and a lower
3394 * zone is balanced.
3395 */
3396 for (i = classzone_idx; i >= 0; i--) {
3397 zone = pgdat->node_zones + i;
3398 if (!managed_zone(zone))
3399 continue;
3400
3401 if (zone->watermark_boost)
3402 return true;
3403 }
3404
3405 return false;
3406}
3407
3382/* 3408/*
3383 * Returns true if there is an eligible zone balanced for the request order 3409 * Returns true if there is an eligible zone balanced for the request order
3384 * and classzone_idx 3410 * and classzone_idx
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3389 unsigned long mark = -1; 3415 unsigned long mark = -1;
3390 struct zone *zone; 3416 struct zone *zone;
3391 3417
3418 /*
3419 * Check watermarks bottom-up as lower zones are more likely to
3420 * meet watermarks.
3421 */
3392 for (i = 0; i <= classzone_idx; i++) { 3422 for (i = 0; i <= classzone_idx; i++) {
3393 zone = pgdat->node_zones + i; 3423 zone = pgdat->node_zones + i;
3394 3424
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3517 unsigned long nr_soft_reclaimed; 3547 unsigned long nr_soft_reclaimed;
3518 unsigned long nr_soft_scanned; 3548 unsigned long nr_soft_scanned;
3519 unsigned long pflags; 3549 unsigned long pflags;
3550 unsigned long nr_boost_reclaim;
3551 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3552 bool boosted;
3520 struct zone *zone; 3553 struct zone *zone;
3521 struct scan_control sc = { 3554 struct scan_control sc = {
3522 .gfp_mask = GFP_KERNEL, 3555 .gfp_mask = GFP_KERNEL,
3523 .order = order, 3556 .order = order,
3524 .priority = DEF_PRIORITY,
3525 .may_writepage = !laptop_mode,
3526 .may_unmap = 1, 3557 .may_unmap = 1,
3527 .may_swap = 1,
3528 }; 3558 };
3529 3559
3530 psi_memstall_enter(&pflags); 3560 psi_memstall_enter(&pflags);
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3532 3562
3533 count_vm_event(PAGEOUTRUN); 3563 count_vm_event(PAGEOUTRUN);
3534 3564
3565 /*
3566 * Account for the reclaim boost. Note that the zone boost is left in
3567 * place so that parallel allocations that are near the watermark will
3568 * stall or direct reclaim until kswapd is finished.
3569 */
3570 nr_boost_reclaim = 0;
3571 for (i = 0; i <= classzone_idx; i++) {
3572 zone = pgdat->node_zones + i;
3573 if (!managed_zone(zone))
3574 continue;
3575
3576 nr_boost_reclaim += zone->watermark_boost;
3577 zone_boosts[i] = zone->watermark_boost;
3578 }
3579 boosted = nr_boost_reclaim;
3580
3581restart:
3582 sc.priority = DEF_PRIORITY;
3535 do { 3583 do {
3536 unsigned long nr_reclaimed = sc.nr_reclaimed; 3584 unsigned long nr_reclaimed = sc.nr_reclaimed;
3537 bool raise_priority = true; 3585 bool raise_priority = true;
3586 bool balanced;
3538 bool ret; 3587 bool ret;
3539 3588
3540 sc.reclaim_idx = classzone_idx; 3589 sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3561 } 3610 }
3562 3611
3563 /* 3612 /*
3564 * Only reclaim if there are no eligible zones. Note that 3613 * If the pgdat is imbalanced then ignore boosting and preserve
3565 * sc.reclaim_idx is not used as buffer_heads_over_limit may 3614 * the watermarks for a later time and restart. Note that the
3566 * have adjusted it. 3615 * zone watermarks will be still reset at the end of balancing
3616 * on the grounds that the normal reclaim should be enough to
3617 * re-evaluate if boosting is required when kswapd next wakes.
3567 */ 3618 */
3568 if (pgdat_balanced(pgdat, sc.order, classzone_idx)) 3619 balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
3620 if (!balanced && nr_boost_reclaim) {
3621 nr_boost_reclaim = 0;
3622 goto restart;
3623 }
3624
3625 /*
3626 * If boosting is not active then only reclaim if there are no
3627 * eligible zones. Note that sc.reclaim_idx is not used as
3628 * buffer_heads_over_limit may have adjusted it.
3629 */
3630 if (!nr_boost_reclaim && balanced)
3569 goto out; 3631 goto out;
3570 3632
3633 /* Limit the priority of boosting to avoid reclaim writeback */
3634 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3635 raise_priority = false;
3636
3637 /*
3638 * Do not writeback or swap pages for boosted reclaim. The
3639 * intent is to relieve pressure not issue sub-optimal IO
3640 * from reclaim context. If no pages are reclaimed, the
3641 * reclaim will be aborted.
3642 */
3643 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3644 sc.may_swap = !nr_boost_reclaim;
3645 sc.may_shrinkslab = !nr_boost_reclaim;
3646
3571 /* 3647 /*
3572 * Do some background aging of the anon list, to give 3648 * Do some background aging of the anon list, to give
3573 * pages a chance to be referenced before reclaiming. All 3649 * pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3619 * progress in reclaiming pages 3695 * progress in reclaiming pages
3620 */ 3696 */
3621 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 3697 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3698 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3699
3700 /*
3701 * If reclaim made no progress for a boost, stop reclaim as
3702 * IO cannot be queued and it could be an infinite loop in
3703 * extreme circumstances.
3704 */
3705 if (nr_boost_reclaim && !nr_reclaimed)
3706 break;
3707
3622 if (raise_priority || !nr_reclaimed) 3708 if (raise_priority || !nr_reclaimed)
3623 sc.priority--; 3709 sc.priority--;
3624 } while (sc.priority >= 1); 3710 } while (sc.priority >= 1);
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3627 pgdat->kswapd_failures++; 3713 pgdat->kswapd_failures++;
3628 3714
3629out: 3715out:
3716 /* If reclaim was boosted, account for the reclaim done in this pass */
3717 if (boosted) {
3718 unsigned long flags;
3719
3720 for (i = 0; i <= classzone_idx; i++) {
3721 if (!zone_boosts[i])
3722 continue;
3723
3724 /* Increments are under the zone lock */
3725 zone = pgdat->node_zones + i;
3726 spin_lock_irqsave(&zone->lock, flags);
3727 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3728 spin_unlock_irqrestore(&zone->lock, flags);
3729 }
3730
3731 /*
3732 * As there is now likely space, wake up kcompactd to defragment
3733 * pageblocks.
3734 */
3735 wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
3736 }
3737
3630 snapshot_refaults(NULL, pgdat); 3738 snapshot_refaults(NULL, pgdat);
3631 __fs_reclaim_release(); 3739 __fs_reclaim_release();
3632 psi_memstall_leave(&pflags); 3740 psi_memstall_leave(&pflags);
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3855 3963
3856 /* Hopeless node, leave it to direct reclaim if possible */ 3964 /* Hopeless node, leave it to direct reclaim if possible */
3857 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || 3965 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3858 pgdat_balanced(pgdat, order, classzone_idx)) { 3966 (pgdat_balanced(pgdat, order, classzone_idx) &&
3967 !pgdat_watermark_boosted(pgdat, classzone_idx))) {
3859 /* 3968 /*
3860 * There may be plenty of free memory available, but it's too 3969 * There may be plenty of free memory available, but it's too
3861 * fragmented for high-order allocations. Wake up kcompactd 3970 * fragmented for high-order allocations. Wake up kcompactd
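
Note: the balance_pgdat() changes above sum each zone's watermark_boost into a reclaim target, shrink that target by whatever each priority pass reclaims, and abandon boosted reclaim as soon as a pass makes no progress. A stand-alone model of just the accounting, showing why the min() keeps the unsigned counter from wrapping; the numbers are made up:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            /* Made-up per-zone boosts (in pages) and per-pass reclaim counts. */
            unsigned long zone_boosts[] = { 0, 128, 512 };
            unsigned long reclaimed_per_pass[] = { 200, 600 };
            unsigned long nr_boost_reclaim = 0;

            for (int i = 0; i < 3; i++)
                    nr_boost_reclaim += zone_boosts[i];

            for (int i = 0; i < 2 && nr_boost_reclaim; i++) {
                    /* min() prevents underflow when a pass reclaims more
                     * than the remaining boost target. */
                    nr_boost_reclaim -= min_ul(nr_boost_reclaim,
                                               reclaimed_per_pass[i]);
                    printf("after pass %d: %lu boost pages left\n",
                           i, nr_boost_reclaim);
            }
            return 0;
    }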
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9c624595e904..83b30edc2f7f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone)
227 * 125 1024 10 16-32 GB 9 227 * 125 1024 10 16-32 GB 9
228 */ 228 */
229 229
230 mem = zone->managed_pages >> (27 - PAGE_SHIFT); 230 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
231 231
232 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 232 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
233 233
@@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1569 high_wmark_pages(zone), 1569 high_wmark_pages(zone),
1570 zone->spanned_pages, 1570 zone->spanned_pages,
1571 zone->present_pages, 1571 zone->present_pages,
1572 zone->managed_pages); 1572 zone_managed_pages(zone));
1573 1573
1574 seq_printf(m, 1574 seq_printf(m,
1575 "\n protection: (%ld", 1575 "\n protection: (%ld",
diff --git a/mm/workingset.c b/mm/workingset.c
index d46f8c92aa2f..dcb994f2acc2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -549,7 +549,7 @@ static int __init workingset_init(void)
549 * double the initial memory by using totalram_pages as-is. 549 * double the initial memory by using totalram_pages as-is.
550 */ 550 */
551 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; 551 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
552 max_order = fls_long(totalram_pages - 1); 552 max_order = fls_long(totalram_pages() - 1);
553 if (max_order > timestamp_bits) 553 if (max_order > timestamp_bits)
554 bucket_order = max_order - timestamp_bits; 554 bucket_order = max_order - timestamp_bits;
555 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", 555 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9d96b8..a4e4d36ec085 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = {
219 219
220static bool zswap_is_full(void) 220static bool zswap_is_full(void)
221{ 221{
222 return totalram_pages * zswap_max_pool_percent / 100 < 222 return totalram_pages() * zswap_max_pool_percent / 100 <
223 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 223 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
224} 224}
225 225
226static void zswap_update_total_size(void) 226static void zswap_update_total_size(void)