author     Linus Torvalds <torvalds@linux-foundation.org>  2018-12-28 19:55:46 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-12-28 19:55:46 -0500
commit     f346b0becb1bc62e45495f9cdbae3eef35d0b635 (patch)
tree       ae79f3dfb8e031da51d38f0f095f89d7d23f3643 /mm
parent     00d59fde8532b2d42e80909d2e58678755e04da9 (diff)
parent     0f4991e8fd48987ae476a92cdee6bfec4aff31b8 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
- large KASAN update to use arm's "software tag-based mode"
- a few misc things
- sh updates
- ocfs2 updates
- just about all of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (167 commits)
kernel/fork.c: mark 'stack_vm_area' with __maybe_unused
memcg, oom: notify on oom killer invocation from the charge path
mm, swap: fix swapoff with KSM pages
include/linux/gfp.h: fix typo
mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm
hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
memory_hotplug: add missing newlines to debugging output
mm: remove __hugepage_set_anon_rmap()
include/linux/vmstat.h: remove unused page state adjustment macro
mm/page_alloc.c: allow error injection
mm: migrate: drop unused argument of migrate_page_move_mapping()
blkdev: avoid migration stalls for blkdev pages
mm: migrate: provide buffer_migrate_page_norefs()
mm: migrate: move migrate_page_lock_buffers()
mm: migrate: lock buffers before migrate_page_move_mapping()
mm: migration: factor out code to compute expected number of page references
mm, page_alloc: enable pcpu_drain with zone capability
kmemleak: add config to select auto scan
mm/page_alloc.c: don't call kasan_free_pages() at deferred mem init
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/cma.c | 11
-rw-r--r--  mm/compaction.c | 2
-rw-r--r--  mm/debug.c | 27
-rw-r--r--  mm/filemap.c | 96
-rw-r--r--  mm/highmem.c | 5
-rw-r--r--  mm/hmm.c | 331
-rw-r--r--  mm/huge_memory.c | 74
-rw-r--r--  mm/hugetlb.c | 133
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/kasan/Makefile | 15
-rw-r--r--  mm/kasan/common.c (renamed from mm/kasan/kasan.c) | 656
-rw-r--r--  mm/kasan/generic.c | 344
-rw-r--r--  mm/kasan/generic_report.c | 153
-rw-r--r--  mm/kasan/init.c (renamed from mm/kasan/kasan_init.c) | 71
-rw-r--r--  mm/kasan/kasan.h | 59
-rw-r--r--  mm/kasan/quarantine.c | 3
-rw-r--r--  mm/kasan/report.c | 272
-rw-r--r--  mm/kasan/tags.c | 161
-rw-r--r--  mm/kasan/tags_report.c | 58
-rw-r--r--  mm/khugepaged.c | 10
-rw-r--r--  mm/kmemleak.c | 19
-rw-r--r--  mm/ksm.c | 35
-rw-r--r--  mm/madvise.c | 21
-rw-r--r--  mm/memblock.c | 52
-rw-r--r--  mm/memcontrol.c | 53
-rw-r--r--  mm/memory-failure.c | 16
-rw-r--r--  mm/memory.c | 103
-rw-r--r--  mm/memory_hotplug.c | 172
-rw-r--r--  mm/migrate.c | 264
-rw-r--r--  mm/mm_init.c | 2
-rw-r--r--  mm/mmap.c | 16
-rw-r--r--  mm/mmu_notifier.c | 31
-rw-r--r--  mm/mprotect.c | 15
-rw-r--r--  mm/mremap.c | 10
-rw-r--r--  mm/oom_kill.c | 51
-rw-r--r--  mm/page-writeback.c | 35
-rw-r--r--  mm/page_alloc.c | 404
-rw-r--r--  mm/page_isolation.c | 10
-rw-r--r--  mm/page_owner.c | 1
-rw-r--r--  mm/readahead.c | 12
-rw-r--r--  mm/rmap.c | 59
-rw-r--r--  mm/shmem.c | 8
-rw-r--r--  mm/slab.c | 31
-rw-r--r--  mm/slab.h | 2
-rw-r--r--  mm/slab_common.c | 10
-rw-r--r--  mm/slub.c | 82
-rw-r--r--  mm/sparse.c | 26
-rw-r--r--  mm/swap.c | 2
-rw-r--r--  mm/swapfile.c | 6
-rw-r--r--  mm/userfaultfd.c | 11
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 143
-rw-r--r--  mm/vmstat.c | 4
-rw-r--r--  mm/workingset.c | 2
-rw-r--r--  mm/zswap.c | 4
57 files changed, 2454 insertions, 1770 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..25c71eb8a7db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -291,6 +291,7 @@ config MMU_NOTIFIER | |||
291 | config KSM | 291 | config KSM |
292 | bool "Enable KSM for page merging" | 292 | bool "Enable KSM for page merging" |
293 | depends on MMU | 293 | depends on MMU |
294 | select XXHASH | ||
294 | help | 295 | help |
295 | Enable Kernel Samepage Merging: KSM periodically scans those areas | 296 | Enable Kernel Samepage Merging: KSM periodically scans those areas |
296 | of an application's address space that an app has advised may be | 297 | of an application's address space that an app has advised may be |
diff --git a/mm/cma.c b/mm/cma.c
@@ -407,6 +407,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
407 | unsigned long pfn = -1; | 407 | unsigned long pfn = -1; |
408 | unsigned long start = 0; | 408 | unsigned long start = 0; |
409 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 409 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
410 | size_t i; | ||
410 | struct page *page = NULL; | 411 | struct page *page = NULL; |
411 | int ret = -ENOMEM; | 412 | int ret = -ENOMEM; |
412 | 413 | ||
@@ -466,6 +467,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, | |||
466 | 467 | ||
467 | trace_cma_alloc(pfn, page, count, align); | 468 | trace_cma_alloc(pfn, page, count, align); |
468 | 469 | ||
470 | /* | ||
471 | * CMA can allocate multiple page blocks, which results in different | ||
472 | * blocks being marked with different tags. Reset the tags to ignore | ||
473 | * those page blocks. | ||
474 | */ | ||
475 | if (page) { | ||
476 | for (i = 0; i < count; i++) | ||
477 | page_kasan_tag_reset(page + i); | ||
478 | } | ||
479 | |||
469 | if (ret && !no_warn) { | 480 | if (ret && !no_warn) { |
470 | pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", | 481 | pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", |
471 | __func__, count, ret); | 482 | __func__, count, ret); |
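The comment added to cma_alloc() above refers to the software tag-based KASAN mode brought in by this merge: each page carries an 8-bit KASAN tag, and every access is checked against the tag embedded in the top byte of the pointer doing the access. The sketch below is only a conceptual illustration of that check and of why resetting a page's tag to a reserved match-all value (assumed here to be 0xff) silences mismatches for a multi-pageblock CMA allocation; it is not the kernel's implementation, and the tag layout and constants are assumptions.

/* Conceptual sketch, not kernel code: tag placement and values are assumed. */
#define TAG_SHIFT      56            /* arm64 top-byte-ignore keeps the tag here */
#define TAG_MATCH_ALL  0xffu         /* assumed reserved "match-all" tag         */

struct page_sketch {
	unsigned char kasan_tag;     /* the kernel stores this in page->flags    */
};

/* The per-access check software tag-based KASAN performs. */
static int tag_mismatch(const void *ptr, const struct page_sketch *page)
{
	unsigned char ptr_tag = (unsigned long)ptr >> TAG_SHIFT;

	return ptr_tag != TAG_MATCH_ALL && ptr_tag != page->kasan_tag;
}

/* What page_kasan_tag_reset() achieves for the CMA case above: after the
 * reset, any pointer tag passes the check for this page. */
static void page_tag_reset(struct page_sketch *page)
{
	page->kasan_tag = TAG_MATCH_ALL;
}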
diff --git a/mm/compaction.c b/mm/compaction.c
index 7c607479de4a..ef29490b0f46 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1431,7 +1431,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, | |||
1431 | if (is_via_compact_memory(order)) | 1431 | if (is_via_compact_memory(order)) |
1432 | return COMPACT_CONTINUE; | 1432 | return COMPACT_CONTINUE; |
1433 | 1433 | ||
1434 | watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1434 | watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); |
1435 | /* | 1435 | /* |
1436 | * If watermarks for high-order allocation are already met, there | 1436 | * If watermarks for high-order allocation are already met, there |
1437 | * should be no need for compaction at all. | 1437 | * should be no need for compaction at all. |
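The switch from indexing zone->watermark[] directly to the wmark_pages() accessor goes with the watermark-boost work that is part of this same merge: the effective watermark is no longer just the stored array entry but includes a temporary per-zone boost, so callers have to go through the helper. The snippet below is a stand-alone sketch of that relationship; the field names _watermark and watermark_boost come from my reading of that series, not from the hunk above.

enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

struct zone_sketch {
	unsigned long _watermark[NR_WMARK];  /* assumed rename of ->watermark */
	unsigned long watermark_boost;       /* temporary boost added on top  */
};

/* Stand-in for the real wmark_pages() accessor: stored value plus boost. */
static unsigned long wmark_pages_sketch(const struct zone_sketch *z,
					enum zone_watermarks w)
{
	return z->_watermark[w] + z->watermark_boost;
}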
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..0abb987dad9b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -17,7 +17,7 @@ | |||
17 | 17 | ||
18 | #include "internal.h" | 18 | #include "internal.h" |
19 | 19 | ||
20 | char *migrate_reason_names[MR_TYPES] = { | 20 | const char *migrate_reason_names[MR_TYPES] = { |
21 | "compaction", | 21 | "compaction", |
22 | "memory_failure", | 22 | "memory_failure", |
23 | "memory_hotplug", | 23 | "memory_hotplug", |
@@ -44,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = { | |||
44 | 44 | ||
45 | void __dump_page(struct page *page, const char *reason) | 45 | void __dump_page(struct page *page, const char *reason) |
46 | { | 46 | { |
47 | struct address_space *mapping = page_mapping(page); | ||
47 | bool page_poisoned = PagePoisoned(page); | 48 | bool page_poisoned = PagePoisoned(page); |
48 | int mapcount; | 49 | int mapcount; |
49 | 50 | ||
@@ -53,7 +54,7 @@ void __dump_page(struct page *page, const char *reason) | |||
53 | * dump_page() when detected. | 54 | * dump_page() when detected. |
54 | */ | 55 | */ |
55 | if (page_poisoned) { | 56 | if (page_poisoned) { |
56 | pr_emerg("page:%px is uninitialized and poisoned", page); | 57 | pr_warn("page:%px is uninitialized and poisoned", page); |
57 | goto hex_only; | 58 | goto hex_only; |
58 | } | 59 | } |
59 | 60 | ||
@@ -64,27 +65,39 @@ void __dump_page(struct page *page, const char *reason) | |||
64 | */ | 65 | */ |
65 | mapcount = PageSlab(page) ? 0 : page_mapcount(page); | 66 | mapcount = PageSlab(page) ? 0 : page_mapcount(page); |
66 | 67 | ||
67 | pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", | 68 | pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", |
68 | page, page_ref_count(page), mapcount, | 69 | page, page_ref_count(page), mapcount, |
69 | page->mapping, page_to_pgoff(page)); | 70 | page->mapping, page_to_pgoff(page)); |
70 | if (PageCompound(page)) | 71 | if (PageCompound(page)) |
71 | pr_cont(" compound_mapcount: %d", compound_mapcount(page)); | 72 | pr_cont(" compound_mapcount: %d", compound_mapcount(page)); |
72 | pr_cont("\n"); | 73 | pr_cont("\n"); |
74 | if (PageAnon(page)) | ||
75 | pr_warn("anon "); | ||
76 | else if (PageKsm(page)) | ||
77 | pr_warn("ksm "); | ||
78 | else if (mapping) { | ||
79 | pr_warn("%ps ", mapping->a_ops); | ||
80 | if (mapping->host->i_dentry.first) { | ||
81 | struct dentry *dentry; | ||
82 | dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); | ||
83 | pr_warn("name:\"%pd\" ", dentry); | ||
84 | } | ||
85 | } | ||
73 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); | 86 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); |
74 | 87 | ||
75 | pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); | 88 | pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); |
76 | 89 | ||
77 | hex_only: | 90 | hex_only: |
78 | print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, | 91 | print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, |
79 | sizeof(unsigned long), page, | 92 | sizeof(unsigned long), page, |
80 | sizeof(struct page), false); | 93 | sizeof(struct page), false); |
81 | 94 | ||
82 | if (reason) | 95 | if (reason) |
83 | pr_alert("page dumped because: %s\n", reason); | 96 | pr_warn("page dumped because: %s\n", reason); |
84 | 97 | ||
85 | #ifdef CONFIG_MEMCG | 98 | #ifdef CONFIG_MEMCG |
86 | if (!page_poisoned && page->mem_cgroup) | 99 | if (!page_poisoned && page->mem_cgroup) |
87 | pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); | 100 | pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); |
88 | #endif | 101 | #endif |
89 | } | 102 | } |
90 | 103 | ||
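The extra mapping information printed by __dump_page() above relies on two printk pointer-format extensions: %ps resolves a kernel address to its symbol name (so printing mapping->a_ops effectively names the filesystem's address_space_operations), and %pd prints a dentry's name without assembling a full path. A minimal illustration of the same idiom follows; the helper name is invented and not part of the patch.

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/printk.h>

/* Illustrative helper only: name the a_ops table and the backing file. */
static void report_mapping(struct address_space *mapping, struct dentry *dentry)
{
	/* %ps prints the symbol an address resolves to; %pd prints the dentry name */
	pr_info("a_ops=%ps name:\"%pd\"\n", mapping->a_ops, dentry);
}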
diff --git a/mm/filemap.c b/mm/filemap.c
index 81adec8ee02c..29655fb47a2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -981,7 +981,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, | |||
981 | if (wait_page->bit_nr != key->bit_nr) | 981 | if (wait_page->bit_nr != key->bit_nr) |
982 | return 0; | 982 | return 0; |
983 | 983 | ||
984 | /* Stop walking if it's locked */ | 984 | /* |
985 | * Stop walking if it's locked. | ||
986 | * Is this safe if put_and_wait_on_page_locked() is in use? | ||
987 | * Yes: the waker must hold a reference to this page, and if PG_locked | ||
988 | * has now already been set by another task, that task must also hold | ||
989 | * a reference to the *same usage* of this page; so there is no need | ||
990 | * to walk on to wake even the put_and_wait_on_page_locked() callers. | ||
991 | */ | ||
985 | if (test_bit(key->bit_nr, &key->page->flags)) | 992 | if (test_bit(key->bit_nr, &key->page->flags)) |
986 | return -1; | 993 | return -1; |
987 | 994 | ||
@@ -1049,25 +1056,44 @@ static void wake_up_page(struct page *page, int bit) | |||
1049 | wake_up_page_bit(page, bit); | 1056 | wake_up_page_bit(page, bit); |
1050 | } | 1057 | } |
1051 | 1058 | ||
1059 | /* | ||
1060 | * A choice of three behaviors for wait_on_page_bit_common(): | ||
1061 | */ | ||
1062 | enum behavior { | ||
1063 | EXCLUSIVE, /* Hold ref to page and take the bit when woken, like | ||
1064 | * __lock_page() waiting on then setting PG_locked. | ||
1065 | */ | ||
1066 | SHARED, /* Hold ref to page and check the bit when woken, like | ||
1067 | * wait_on_page_writeback() waiting on PG_writeback. | ||
1068 | */ | ||
1069 | DROP, /* Drop ref to page before wait, no check when woken, | ||
1070 | * like put_and_wait_on_page_locked() on PG_locked. | ||
1071 | */ | ||
1072 | }; | ||
1073 | |||
1052 | static inline int wait_on_page_bit_common(wait_queue_head_t *q, | 1074 | static inline int wait_on_page_bit_common(wait_queue_head_t *q, |
1053 | struct page *page, int bit_nr, int state, bool lock) | 1075 | struct page *page, int bit_nr, int state, enum behavior behavior) |
1054 | { | 1076 | { |
1055 | struct wait_page_queue wait_page; | 1077 | struct wait_page_queue wait_page; |
1056 | wait_queue_entry_t *wait = &wait_page.wait; | 1078 | wait_queue_entry_t *wait = &wait_page.wait; |
1079 | bool bit_is_set; | ||
1057 | bool thrashing = false; | 1080 | bool thrashing = false; |
1081 | bool delayacct = false; | ||
1058 | unsigned long pflags; | 1082 | unsigned long pflags; |
1059 | int ret = 0; | 1083 | int ret = 0; |
1060 | 1084 | ||
1061 | if (bit_nr == PG_locked && | 1085 | if (bit_nr == PG_locked && |
1062 | !PageUptodate(page) && PageWorkingset(page)) { | 1086 | !PageUptodate(page) && PageWorkingset(page)) { |
1063 | if (!PageSwapBacked(page)) | 1087 | if (!PageSwapBacked(page)) { |
1064 | delayacct_thrashing_start(); | 1088 | delayacct_thrashing_start(); |
1089 | delayacct = true; | ||
1090 | } | ||
1065 | psi_memstall_enter(&pflags); | 1091 | psi_memstall_enter(&pflags); |
1066 | thrashing = true; | 1092 | thrashing = true; |
1067 | } | 1093 | } |
1068 | 1094 | ||
1069 | init_wait(wait); | 1095 | init_wait(wait); |
1070 | wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; | 1096 | wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0; |
1071 | wait->func = wake_page_function; | 1097 | wait->func = wake_page_function; |
1072 | wait_page.page = page; | 1098 | wait_page.page = page; |
1073 | wait_page.bit_nr = bit_nr; | 1099 | wait_page.bit_nr = bit_nr; |
@@ -1084,14 +1110,17 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, | |||
1084 | 1110 | ||
1085 | spin_unlock_irq(&q->lock); | 1111 | spin_unlock_irq(&q->lock); |
1086 | 1112 | ||
1087 | if (likely(test_bit(bit_nr, &page->flags))) { | 1113 | bit_is_set = test_bit(bit_nr, &page->flags); |
1114 | if (behavior == DROP) | ||
1115 | put_page(page); | ||
1116 | |||
1117 | if (likely(bit_is_set)) | ||
1088 | io_schedule(); | 1118 | io_schedule(); |
1089 | } | ||
1090 | 1119 | ||
1091 | if (lock) { | 1120 | if (behavior == EXCLUSIVE) { |
1092 | if (!test_and_set_bit_lock(bit_nr, &page->flags)) | 1121 | if (!test_and_set_bit_lock(bit_nr, &page->flags)) |
1093 | break; | 1122 | break; |
1094 | } else { | 1123 | } else if (behavior == SHARED) { |
1095 | if (!test_bit(bit_nr, &page->flags)) | 1124 | if (!test_bit(bit_nr, &page->flags)) |
1096 | break; | 1125 | break; |
1097 | } | 1126 | } |
@@ -1100,12 +1129,23 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, | |||
1100 | ret = -EINTR; | 1129 | ret = -EINTR; |
1101 | break; | 1130 | break; |
1102 | } | 1131 | } |
1132 | |||
1133 | if (behavior == DROP) { | ||
1134 | /* | ||
1135 | * We can no longer safely access page->flags: | ||
1136 | * even if CONFIG_MEMORY_HOTREMOVE is not enabled, | ||
1137 | * there is a risk of waiting forever on a page reused | ||
1138 | * for something that keeps it locked indefinitely. | ||
1139 | * But best check for -EINTR above before breaking. | ||
1140 | */ | ||
1141 | break; | ||
1142 | } | ||
1103 | } | 1143 | } |
1104 | 1144 | ||
1105 | finish_wait(q, wait); | 1145 | finish_wait(q, wait); |
1106 | 1146 | ||
1107 | if (thrashing) { | 1147 | if (thrashing) { |
1108 | if (!PageSwapBacked(page)) | 1148 | if (delayacct) |
1109 | delayacct_thrashing_end(); | 1149 | delayacct_thrashing_end(); |
1110 | psi_memstall_leave(&pflags); | 1150 | psi_memstall_leave(&pflags); |
1111 | } | 1151 | } |
@@ -1124,18 +1164,37 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, | |||
1124 | void wait_on_page_bit(struct page *page, int bit_nr) | 1164 | void wait_on_page_bit(struct page *page, int bit_nr) |
1125 | { | 1165 | { |
1126 | wait_queue_head_t *q = page_waitqueue(page); | 1166 | wait_queue_head_t *q = page_waitqueue(page); |
1127 | wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); | 1167 | wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); |
1128 | } | 1168 | } |
1129 | EXPORT_SYMBOL(wait_on_page_bit); | 1169 | EXPORT_SYMBOL(wait_on_page_bit); |
1130 | 1170 | ||
1131 | int wait_on_page_bit_killable(struct page *page, int bit_nr) | 1171 | int wait_on_page_bit_killable(struct page *page, int bit_nr) |
1132 | { | 1172 | { |
1133 | wait_queue_head_t *q = page_waitqueue(page); | 1173 | wait_queue_head_t *q = page_waitqueue(page); |
1134 | return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); | 1174 | return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); |
1135 | } | 1175 | } |
1136 | EXPORT_SYMBOL(wait_on_page_bit_killable); | 1176 | EXPORT_SYMBOL(wait_on_page_bit_killable); |
1137 | 1177 | ||
1138 | /** | 1178 | /** |
1179 | * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked | ||
1180 | * @page: The page to wait for. | ||
1181 | * | ||
1182 | * The caller should hold a reference on @page. They expect the page to | ||
1183 | * become unlocked relatively soon, but do not wish to hold up migration | ||
1184 | * (for example) by holding the reference while waiting for the page to | ||
1185 | * come unlocked. After this function returns, the caller should not | ||
1186 | * dereference @page. | ||
1187 | */ | ||
1188 | void put_and_wait_on_page_locked(struct page *page) | ||
1189 | { | ||
1190 | wait_queue_head_t *q; | ||
1191 | |||
1192 | page = compound_head(page); | ||
1193 | q = page_waitqueue(page); | ||
1194 | wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); | ||
1195 | } | ||
1196 | |||
1197 | /** | ||
1139 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue | 1198 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
1140 | * @page: Page defining the wait queue of interest | 1199 | * @page: Page defining the wait queue of interest |
1141 | * @waiter: Waiter to add to the queue | 1200 | * @waiter: Waiter to add to the queue |
@@ -1264,7 +1323,8 @@ void __lock_page(struct page *__page) | |||
1264 | { | 1323 | { |
1265 | struct page *page = compound_head(__page); | 1324 | struct page *page = compound_head(__page); |
1266 | wait_queue_head_t *q = page_waitqueue(page); | 1325 | wait_queue_head_t *q = page_waitqueue(page); |
1267 | wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); | 1326 | wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, |
1327 | EXCLUSIVE); | ||
1268 | } | 1328 | } |
1269 | EXPORT_SYMBOL(__lock_page); | 1329 | EXPORT_SYMBOL(__lock_page); |
1270 | 1330 | ||
@@ -1272,7 +1332,8 @@ int __lock_page_killable(struct page *__page) | |||
1272 | { | 1332 | { |
1273 | struct page *page = compound_head(__page); | 1333 | struct page *page = compound_head(__page); |
1274 | wait_queue_head_t *q = page_waitqueue(page); | 1334 | wait_queue_head_t *q = page_waitqueue(page); |
1275 | return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); | 1335 | return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, |
1336 | EXCLUSIVE); | ||
1276 | } | 1337 | } |
1277 | EXPORT_SYMBOL_GPL(__lock_page_killable); | 1338 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
1278 | 1339 | ||
@@ -1540,7 +1601,7 @@ repeat: | |||
1540 | VM_BUG_ON_PAGE(page->index != offset, page); | 1601 | VM_BUG_ON_PAGE(page->index != offset, page); |
1541 | } | 1602 | } |
1542 | 1603 | ||
1543 | if (page && (fgp_flags & FGP_ACCESSED)) | 1604 | if (fgp_flags & FGP_ACCESSED) |
1544 | mark_page_accessed(page); | 1605 | mark_page_accessed(page); |
1545 | 1606 | ||
1546 | no_page: | 1607 | no_page: |
@@ -2553,6 +2614,13 @@ void filemap_map_pages(struct vm_fault *vmf, | |||
2553 | goto next; | 2614 | goto next; |
2554 | 2615 | ||
2555 | head = compound_head(page); | 2616 | head = compound_head(page); |
2617 | |||
2618 | /* | ||
2619 | * Check for a locked page first, as a speculative | ||
2620 | * reference may adversely influence page migration. | ||
2621 | */ | ||
2622 | if (PageLocked(head)) | ||
2623 | goto next; | ||
2556 | if (!page_cache_get_speculative(head)) | 2624 | if (!page_cache_get_speculative(head)) |
2557 | goto next; | 2625 | goto next; |
2558 | 2626 | ||
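The DROP behaviour and put_and_wait_on_page_locked() above exist for callers that only need to wait until a page comes unlocked and will not touch it afterwards: they hand their reference to the wait code, which drops it before sleeping, so the sleeper no longer pins the page and stalls migration. The caller-side conversion, mirroring the mm/huge_memory.c and mm/migrate.c changes in this series, looks like this (the wrapper function names are illustrative):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Old pattern: the extra reference is held across the sleep, letting a
 * waiter block migration of the page indefinitely. */
static void wait_for_page_old(struct page *page)
{
	if (!get_page_unless_zero(page))
		return;
	wait_on_page_locked(page);
	put_page(page);
}

/* New pattern: the reference is consumed before sleeping; the page must
 * not be dereferenced after the call returns. */
static void wait_for_page_new(struct page *page)
{
	if (!get_page_unless_zero(page))
		return;
	put_and_wait_on_page_locked(page);
}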
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..107b10f9878e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) | |||
105 | } | 105 | } |
106 | #endif | 106 | #endif |
107 | 107 | ||
108 | unsigned long totalhigh_pages __read_mostly; | 108 | atomic_long_t _totalhigh_pages __read_mostly; |
109 | EXPORT_SYMBOL(totalhigh_pages); | 109 | EXPORT_SYMBOL(_totalhigh_pages); |
110 | |||
111 | 110 | ||
112 | EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); | 111 | EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); |
113 | 112 | ||
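Making _totalhigh_pages an atomic_long_t goes together with converting its readers to accessor helpers (the mm/huge_memory.c hunk further down already calls totalram_pages() as a function for the same reason). The shape of the new helpers, as I understand that series, is roughly the following; the exact bodies are an assumption, since they are not shown in this diff.

#include <linux/atomic.h>

extern atomic_long_t _totalhigh_pages;      /* the variable renamed above */

/* Assumed accessor pair from the counter-conversion series. */
static inline unsigned long totalhigh_pages(void)
{
	return (unsigned long)atomic_long_read(&_totalhigh_pages);
}

static inline void totalhigh_pages_add(long count)
{
	atomic_long_add(count, &_totalhigh_pages);
}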
diff --git a/mm/hmm.c b/mm/hmm.c
@@ -189,35 +189,30 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
189 | } | 189 | } |
190 | 190 | ||
191 | static int hmm_invalidate_range_start(struct mmu_notifier *mn, | 191 | static int hmm_invalidate_range_start(struct mmu_notifier *mn, |
192 | struct mm_struct *mm, | 192 | const struct mmu_notifier_range *range) |
193 | unsigned long start, | ||
194 | unsigned long end, | ||
195 | bool blockable) | ||
196 | { | 193 | { |
197 | struct hmm_update update; | 194 | struct hmm_update update; |
198 | struct hmm *hmm = mm->hmm; | 195 | struct hmm *hmm = range->mm->hmm; |
199 | 196 | ||
200 | VM_BUG_ON(!hmm); | 197 | VM_BUG_ON(!hmm); |
201 | 198 | ||
202 | update.start = start; | 199 | update.start = range->start; |
203 | update.end = end; | 200 | update.end = range->end; |
204 | update.event = HMM_UPDATE_INVALIDATE; | 201 | update.event = HMM_UPDATE_INVALIDATE; |
205 | update.blockable = blockable; | 202 | update.blockable = range->blockable; |
206 | return hmm_invalidate_range(hmm, true, &update); | 203 | return hmm_invalidate_range(hmm, true, &update); |
207 | } | 204 | } |
208 | 205 | ||
209 | static void hmm_invalidate_range_end(struct mmu_notifier *mn, | 206 | static void hmm_invalidate_range_end(struct mmu_notifier *mn, |
210 | struct mm_struct *mm, | 207 | const struct mmu_notifier_range *range) |
211 | unsigned long start, | ||
212 | unsigned long end) | ||
213 | { | 208 | { |
214 | struct hmm_update update; | 209 | struct hmm_update update; |
215 | struct hmm *hmm = mm->hmm; | 210 | struct hmm *hmm = range->mm->hmm; |
216 | 211 | ||
217 | VM_BUG_ON(!hmm); | 212 | VM_BUG_ON(!hmm); |
218 | 213 | ||
219 | update.start = start; | 214 | update.start = range->start; |
220 | update.end = end; | 215 | update.end = range->end; |
221 | update.event = HMM_UPDATE_INVALIDATE; | 216 | update.event = HMM_UPDATE_INVALIDATE; |
222 | update.blockable = true; | 217 | update.blockable = true; |
223 | hmm_invalidate_range(hmm, false, &update); | 218 | hmm_invalidate_range(hmm, false, &update); |
@@ -986,19 +981,13 @@ static void hmm_devmem_ref_exit(void *data) | |||
986 | struct hmm_devmem *devmem; | 981 | struct hmm_devmem *devmem; |
987 | 982 | ||
988 | devmem = container_of(ref, struct hmm_devmem, ref); | 983 | devmem = container_of(ref, struct hmm_devmem, ref); |
984 | wait_for_completion(&devmem->completion); | ||
989 | percpu_ref_exit(ref); | 985 | percpu_ref_exit(ref); |
990 | devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); | ||
991 | } | 986 | } |
992 | 987 | ||
993 | static void hmm_devmem_ref_kill(void *data) | 988 | static void hmm_devmem_ref_kill(struct percpu_ref *ref) |
994 | { | 989 | { |
995 | struct percpu_ref *ref = data; | ||
996 | struct hmm_devmem *devmem; | ||
997 | |||
998 | devmem = container_of(ref, struct hmm_devmem, ref); | ||
999 | percpu_ref_kill(ref); | 990 | percpu_ref_kill(ref); |
1000 | wait_for_completion(&devmem->completion); | ||
1001 | devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); | ||
1002 | } | 991 | } |
1003 | 992 | ||
1004 | static int hmm_devmem_fault(struct vm_area_struct *vma, | 993 | static int hmm_devmem_fault(struct vm_area_struct *vma, |
@@ -1021,172 +1010,6 @@ static void hmm_devmem_free(struct page *page, void *data) | |||
1021 | devmem->ops->free(devmem, page); | 1010 | devmem->ops->free(devmem, page); |
1022 | } | 1011 | } |
1023 | 1012 | ||
1024 | static DEFINE_MUTEX(hmm_devmem_lock); | ||
1025 | static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); | ||
1026 | |||
1027 | static void hmm_devmem_radix_release(struct resource *resource) | ||
1028 | { | ||
1029 | resource_size_t key; | ||
1030 | |||
1031 | mutex_lock(&hmm_devmem_lock); | ||
1032 | for (key = resource->start; | ||
1033 | key <= resource->end; | ||
1034 | key += PA_SECTION_SIZE) | ||
1035 | radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); | ||
1036 | mutex_unlock(&hmm_devmem_lock); | ||
1037 | } | ||
1038 | |||
1039 | static void hmm_devmem_release(struct device *dev, void *data) | ||
1040 | { | ||
1041 | struct hmm_devmem *devmem = data; | ||
1042 | struct resource *resource = devmem->resource; | ||
1043 | unsigned long start_pfn, npages; | ||
1044 | struct zone *zone; | ||
1045 | struct page *page; | ||
1046 | |||
1047 | if (percpu_ref_tryget_live(&devmem->ref)) { | ||
1048 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); | ||
1049 | percpu_ref_put(&devmem->ref); | ||
1050 | } | ||
1051 | |||
1052 | /* pages are dead and unused, undo the arch mapping */ | ||
1053 | start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; | ||
1054 | npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; | ||
1055 | |||
1056 | page = pfn_to_page(start_pfn); | ||
1057 | zone = page_zone(page); | ||
1058 | |||
1059 | mem_hotplug_begin(); | ||
1060 | if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) | ||
1061 | __remove_pages(zone, start_pfn, npages, NULL); | ||
1062 | else | ||
1063 | arch_remove_memory(start_pfn << PAGE_SHIFT, | ||
1064 | npages << PAGE_SHIFT, NULL); | ||
1065 | mem_hotplug_done(); | ||
1066 | |||
1067 | hmm_devmem_radix_release(resource); | ||
1068 | } | ||
1069 | |||
1070 | static int hmm_devmem_pages_create(struct hmm_devmem *devmem) | ||
1071 | { | ||
1072 | resource_size_t key, align_start, align_size, align_end; | ||
1073 | struct device *device = devmem->device; | ||
1074 | int ret, nid, is_ram; | ||
1075 | |||
1076 | align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); | ||
1077 | align_size = ALIGN(devmem->resource->start + | ||
1078 | resource_size(devmem->resource), | ||
1079 | PA_SECTION_SIZE) - align_start; | ||
1080 | |||
1081 | is_ram = region_intersects(align_start, align_size, | ||
1082 | IORESOURCE_SYSTEM_RAM, | ||
1083 | IORES_DESC_NONE); | ||
1084 | if (is_ram == REGION_MIXED) { | ||
1085 | WARN_ONCE(1, "%s attempted on mixed region %pr\n", | ||
1086 | __func__, devmem->resource); | ||
1087 | return -ENXIO; | ||
1088 | } | ||
1089 | if (is_ram == REGION_INTERSECTS) | ||
1090 | return -ENXIO; | ||
1091 | |||
1092 | if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) | ||
1093 | devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; | ||
1094 | else | ||
1095 | devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; | ||
1096 | |||
1097 | devmem->pagemap.res = *devmem->resource; | ||
1098 | devmem->pagemap.page_fault = hmm_devmem_fault; | ||
1099 | devmem->pagemap.page_free = hmm_devmem_free; | ||
1100 | devmem->pagemap.dev = devmem->device; | ||
1101 | devmem->pagemap.ref = &devmem->ref; | ||
1102 | devmem->pagemap.data = devmem; | ||
1103 | |||
1104 | mutex_lock(&hmm_devmem_lock); | ||
1105 | align_end = align_start + align_size - 1; | ||
1106 | for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { | ||
1107 | struct hmm_devmem *dup; | ||
1108 | |||
1109 | dup = radix_tree_lookup(&hmm_devmem_radix, | ||
1110 | key >> PA_SECTION_SHIFT); | ||
1111 | if (dup) { | ||
1112 | dev_err(device, "%s: collides with mapping for %s\n", | ||
1113 | __func__, dev_name(dup->device)); | ||
1114 | mutex_unlock(&hmm_devmem_lock); | ||
1115 | ret = -EBUSY; | ||
1116 | goto error; | ||
1117 | } | ||
1118 | ret = radix_tree_insert(&hmm_devmem_radix, | ||
1119 | key >> PA_SECTION_SHIFT, | ||
1120 | devmem); | ||
1121 | if (ret) { | ||
1122 | dev_err(device, "%s: failed: %d\n", __func__, ret); | ||
1123 | mutex_unlock(&hmm_devmem_lock); | ||
1124 | goto error_radix; | ||
1125 | } | ||
1126 | } | ||
1127 | mutex_unlock(&hmm_devmem_lock); | ||
1128 | |||
1129 | nid = dev_to_node(device); | ||
1130 | if (nid < 0) | ||
1131 | nid = numa_mem_id(); | ||
1132 | |||
1133 | mem_hotplug_begin(); | ||
1134 | /* | ||
1135 | * For device private memory we call add_pages() as we only need to | ||
1136 | * allocate and initialize struct page for the device memory. More- | ||
1137 | * over the device memory is un-accessible thus we do not want to | ||
1138 | * create a linear mapping for the memory like arch_add_memory() | ||
1139 | * would do. | ||
1140 | * | ||
1141 | * For device public memory, which is accesible by the CPU, we do | ||
1142 | * want the linear mapping and thus use arch_add_memory(). | ||
1143 | */ | ||
1144 | if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) | ||
1145 | ret = arch_add_memory(nid, align_start, align_size, NULL, | ||
1146 | false); | ||
1147 | else | ||
1148 | ret = add_pages(nid, align_start >> PAGE_SHIFT, | ||
1149 | align_size >> PAGE_SHIFT, NULL, false); | ||
1150 | if (ret) { | ||
1151 | mem_hotplug_done(); | ||
1152 | goto error_add_memory; | ||
1153 | } | ||
1154 | move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | ||
1155 | align_start >> PAGE_SHIFT, | ||
1156 | align_size >> PAGE_SHIFT, NULL); | ||
1157 | mem_hotplug_done(); | ||
1158 | |||
1159 | /* | ||
1160 | * Initialization of the pages has been deferred until now in order | ||
1161 | * to allow us to do the work while not holding the hotplug lock. | ||
1162 | */ | ||
1163 | memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | ||
1164 | align_start >> PAGE_SHIFT, | ||
1165 | align_size >> PAGE_SHIFT, &devmem->pagemap); | ||
1166 | |||
1167 | return 0; | ||
1168 | |||
1169 | error_add_memory: | ||
1170 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | ||
1171 | error_radix: | ||
1172 | hmm_devmem_radix_release(devmem->resource); | ||
1173 | error: | ||
1174 | return ret; | ||
1175 | } | ||
1176 | |||
1177 | static int hmm_devmem_match(struct device *dev, void *data, void *match_data) | ||
1178 | { | ||
1179 | struct hmm_devmem *devmem = data; | ||
1180 | |||
1181 | return devmem->resource == match_data; | ||
1182 | } | ||
1183 | |||
1184 | static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) | ||
1185 | { | ||
1186 | devres_release(devmem->device, &hmm_devmem_release, | ||
1187 | &hmm_devmem_match, devmem->resource); | ||
1188 | } | ||
1189 | |||
1190 | /* | 1013 | /* |
1191 | * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory | 1014 | * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory |
1192 | * | 1015 | * |
@@ -1210,12 +1033,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, | |||
1210 | { | 1033 | { |
1211 | struct hmm_devmem *devmem; | 1034 | struct hmm_devmem *devmem; |
1212 | resource_size_t addr; | 1035 | resource_size_t addr; |
1036 | void *result; | ||
1213 | int ret; | 1037 | int ret; |
1214 | 1038 | ||
1215 | dev_pagemap_get_ops(); | 1039 | dev_pagemap_get_ops(); |
1216 | 1040 | ||
1217 | devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), | 1041 | devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); |
1218 | GFP_KERNEL, dev_to_node(device)); | ||
1219 | if (!devmem) | 1042 | if (!devmem) |
1220 | return ERR_PTR(-ENOMEM); | 1043 | return ERR_PTR(-ENOMEM); |
1221 | 1044 | ||
@@ -1229,11 +1052,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, | |||
1229 | ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, | 1052 | ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, |
1230 | 0, GFP_KERNEL); | 1053 | 0, GFP_KERNEL); |
1231 | if (ret) | 1054 | if (ret) |
1232 | goto error_percpu_ref; | 1055 | return ERR_PTR(ret); |
1233 | 1056 | ||
1234 | ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); | 1057 | ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref); |
1235 | if (ret) | 1058 | if (ret) |
1236 | goto error_devm_add_action; | 1059 | return ERR_PTR(ret); |
1237 | 1060 | ||
1238 | size = ALIGN(size, PA_SECTION_SIZE); | 1061 | size = ALIGN(size, PA_SECTION_SIZE); |
1239 | addr = min((unsigned long)iomem_resource.end, | 1062 | addr = min((unsigned long)iomem_resource.end, |
@@ -1253,54 +1076,40 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, | |||
1253 | 1076 | ||
1254 | devmem->resource = devm_request_mem_region(device, addr, size, | 1077 | devmem->resource = devm_request_mem_region(device, addr, size, |
1255 | dev_name(device)); | 1078 | dev_name(device)); |
1256 | if (!devmem->resource) { | 1079 | if (!devmem->resource) |
1257 | ret = -ENOMEM; | 1080 | return ERR_PTR(-ENOMEM); |
1258 | goto error_no_resource; | ||
1259 | } | ||
1260 | break; | 1081 | break; |
1261 | } | 1082 | } |
1262 | if (!devmem->resource) { | 1083 | if (!devmem->resource) |
1263 | ret = -ERANGE; | 1084 | return ERR_PTR(-ERANGE); |
1264 | goto error_no_resource; | ||
1265 | } | ||
1266 | 1085 | ||
1267 | devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; | 1086 | devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; |
1268 | devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; | 1087 | devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; |
1269 | devmem->pfn_last = devmem->pfn_first + | 1088 | devmem->pfn_last = devmem->pfn_first + |
1270 | (resource_size(devmem->resource) >> PAGE_SHIFT); | 1089 | (resource_size(devmem->resource) >> PAGE_SHIFT); |
1090 | devmem->page_fault = hmm_devmem_fault; | ||
1271 | 1091 | ||
1272 | ret = hmm_devmem_pages_create(devmem); | 1092 | devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; |
1273 | if (ret) | 1093 | devmem->pagemap.res = *devmem->resource; |
1274 | goto error_pages; | 1094 | devmem->pagemap.page_free = hmm_devmem_free; |
1275 | 1095 | devmem->pagemap.altmap_valid = false; | |
1276 | devres_add(device, devmem); | 1096 | devmem->pagemap.ref = &devmem->ref; |
1277 | 1097 | devmem->pagemap.data = devmem; | |
1278 | ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); | 1098 | devmem->pagemap.kill = hmm_devmem_ref_kill; |
1279 | if (ret) { | ||
1280 | hmm_devmem_remove(devmem); | ||
1281 | return ERR_PTR(ret); | ||
1282 | } | ||
1283 | 1099 | ||
1100 | result = devm_memremap_pages(devmem->device, &devmem->pagemap); | ||
1101 | if (IS_ERR(result)) | ||
1102 | return result; | ||
1284 | return devmem; | 1103 | return devmem; |
1285 | |||
1286 | error_pages: | ||
1287 | devm_release_mem_region(device, devmem->resource->start, | ||
1288 | resource_size(devmem->resource)); | ||
1289 | error_no_resource: | ||
1290 | error_devm_add_action: | ||
1291 | hmm_devmem_ref_kill(&devmem->ref); | ||
1292 | hmm_devmem_ref_exit(&devmem->ref); | ||
1293 | error_percpu_ref: | ||
1294 | devres_free(devmem); | ||
1295 | return ERR_PTR(ret); | ||
1296 | } | 1104 | } |
1297 | EXPORT_SYMBOL(hmm_devmem_add); | 1105 | EXPORT_SYMBOL_GPL(hmm_devmem_add); |
1298 | 1106 | ||
1299 | struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, | 1107 | struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, |
1300 | struct device *device, | 1108 | struct device *device, |
1301 | struct resource *res) | 1109 | struct resource *res) |
1302 | { | 1110 | { |
1303 | struct hmm_devmem *devmem; | 1111 | struct hmm_devmem *devmem; |
1112 | void *result; | ||
1304 | int ret; | 1113 | int ret; |
1305 | 1114 | ||
1306 | if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) | 1115 | if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) |
@@ -1308,8 +1117,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, | |||
1308 | 1117 | ||
1309 | dev_pagemap_get_ops(); | 1118 | dev_pagemap_get_ops(); |
1310 | 1119 | ||
1311 | devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), | 1120 | devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); |
1312 | GFP_KERNEL, dev_to_node(device)); | ||
1313 | if (!devmem) | 1121 | if (!devmem) |
1314 | return ERR_PTR(-ENOMEM); | 1122 | return ERR_PTR(-ENOMEM); |
1315 | 1123 | ||
@@ -1323,71 +1131,32 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, | |||
1323 | ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, | 1131 | ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, |
1324 | 0, GFP_KERNEL); | 1132 | 0, GFP_KERNEL); |
1325 | if (ret) | 1133 | if (ret) |
1326 | goto error_percpu_ref; | 1134 | return ERR_PTR(ret); |
1327 | 1135 | ||
1328 | ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); | 1136 | ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, |
1137 | &devmem->ref); | ||
1329 | if (ret) | 1138 | if (ret) |
1330 | goto error_devm_add_action; | 1139 | return ERR_PTR(ret); |
1331 | |||
1332 | 1140 | ||
1333 | devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; | 1141 | devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; |
1334 | devmem->pfn_last = devmem->pfn_first + | 1142 | devmem->pfn_last = devmem->pfn_first + |
1335 | (resource_size(devmem->resource) >> PAGE_SHIFT); | 1143 | (resource_size(devmem->resource) >> PAGE_SHIFT); |
1144 | devmem->page_fault = hmm_devmem_fault; | ||
1336 | 1145 | ||
1337 | ret = hmm_devmem_pages_create(devmem); | 1146 | devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; |
1338 | if (ret) | 1147 | devmem->pagemap.res = *devmem->resource; |
1339 | goto error_devm_add_action; | 1148 | devmem->pagemap.page_free = hmm_devmem_free; |
1340 | 1149 | devmem->pagemap.altmap_valid = false; | |
1341 | devres_add(device, devmem); | 1150 | devmem->pagemap.ref = &devmem->ref; |
1342 | 1151 | devmem->pagemap.data = devmem; | |
1343 | ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); | 1152 | devmem->pagemap.kill = hmm_devmem_ref_kill; |
1344 | if (ret) { | ||
1345 | hmm_devmem_remove(devmem); | ||
1346 | return ERR_PTR(ret); | ||
1347 | } | ||
1348 | 1153 | ||
1154 | result = devm_memremap_pages(devmem->device, &devmem->pagemap); | ||
1155 | if (IS_ERR(result)) | ||
1156 | return result; | ||
1349 | return devmem; | 1157 | return devmem; |
1350 | |||
1351 | error_devm_add_action: | ||
1352 | hmm_devmem_ref_kill(&devmem->ref); | ||
1353 | hmm_devmem_ref_exit(&devmem->ref); | ||
1354 | error_percpu_ref: | ||
1355 | devres_free(devmem); | ||
1356 | return ERR_PTR(ret); | ||
1357 | } | ||
1358 | EXPORT_SYMBOL(hmm_devmem_add_resource); | ||
1359 | |||
1360 | /* | ||
1361 | * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) | ||
1362 | * | ||
1363 | * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory | ||
1364 | * | ||
1365 | * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf | ||
1366 | * of the device driver. It will free struct page and remove the resource that | ||
1367 | * reserved the physical address range for this device memory. | ||
1368 | */ | ||
1369 | void hmm_devmem_remove(struct hmm_devmem *devmem) | ||
1370 | { | ||
1371 | resource_size_t start, size; | ||
1372 | struct device *device; | ||
1373 | bool cdm = false; | ||
1374 | |||
1375 | if (!devmem) | ||
1376 | return; | ||
1377 | |||
1378 | device = devmem->device; | ||
1379 | start = devmem->resource->start; | ||
1380 | size = resource_size(devmem->resource); | ||
1381 | |||
1382 | cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; | ||
1383 | hmm_devmem_ref_kill(&devmem->ref); | ||
1384 | hmm_devmem_ref_exit(&devmem->ref); | ||
1385 | hmm_devmem_pages_remove(devmem); | ||
1386 | |||
1387 | if (!cdm) | ||
1388 | devm_release_mem_region(device, start, size); | ||
1389 | } | 1158 | } |
1390 | EXPORT_SYMBOL(hmm_devmem_remove); | 1159 | EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); |
1391 | 1160 | ||
1392 | /* | 1161 | /* |
1393 | * A device driver that wants to handle multiple devices memory through a | 1162 | * A device driver that wants to handle multiple devices memory through a |
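The net effect of the mm/hmm.c rework above is that the hand-rolled section hotplug, radix-tree tracking and devres teardown are gone: hmm_devmem_add() now just fills in a struct dev_pagemap, points the new kill callback at hmm_devmem_ref_kill(), and lets devm_memremap_pages() own hotplug and teardown. The sketch below is condensed from the new code in the hunks above, with the error handling and resource setup trimmed; it is not a drop-in replacement.

/* Condensed sketch of the new hmm_devmem_add() tail, per the hunks above. */
static void *sketch_devmem_pages_setup(struct hmm_devmem *devmem)
{
	devmem->pagemap.type         = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.res          = *devmem->resource;
	devmem->pagemap.page_free    = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref          = &devmem->ref;
	devmem->pagemap.data         = devmem;
	devmem->pagemap.kill         = hmm_devmem_ref_kill;

	/* devm_memremap_pages() now does the hotplug and arranges teardown. */
	return devm_memremap_pages(devmem->device, &devmem->pagemap);
}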
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e84a10b0d310..cbd977b1d60d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker; | |||
62 | static atomic_t huge_zero_refcount; | 62 | static atomic_t huge_zero_refcount; |
63 | struct page *huge_zero_page __read_mostly; | 63 | struct page *huge_zero_page __read_mostly; |
64 | 64 | ||
65 | bool transparent_hugepage_enabled(struct vm_area_struct *vma) | ||
66 | { | ||
67 | if (vma_is_anonymous(vma)) | ||
68 | return __transparent_hugepage_enabled(vma); | ||
69 | if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) | ||
70 | return __transparent_hugepage_enabled(vma); | ||
71 | |||
72 | return false; | ||
73 | } | ||
74 | |||
65 | static struct page *get_huge_zero_page(void) | 75 | static struct page *get_huge_zero_page(void) |
66 | { | 76 | { |
67 | struct page *zero_page; | 77 | struct page *zero_page; |
@@ -420,7 +430,7 @@ static int __init hugepage_init(void) | |||
420 | * where the extra memory used could hurt more than TLB overhead | 430 | * where the extra memory used could hurt more than TLB overhead |
421 | * is likely to save. The admin can still enable it through /sys. | 431 | * is likely to save. The admin can still enable it through /sys. |
422 | */ | 432 | */ |
423 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { | 433 | if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { |
424 | transparent_hugepage_flags = 0; | 434 | transparent_hugepage_flags = 0; |
425 | return 0; | 435 | return 0; |
426 | } | 436 | } |
@@ -1134,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, | |||
1134 | int i; | 1144 | int i; |
1135 | vm_fault_t ret = 0; | 1145 | vm_fault_t ret = 0; |
1136 | struct page **pages; | 1146 | struct page **pages; |
1137 | unsigned long mmun_start; /* For mmu_notifiers */ | 1147 | struct mmu_notifier_range range; |
1138 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1139 | 1148 | ||
1140 | pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), | 1149 | pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), |
1141 | GFP_KERNEL); | 1150 | GFP_KERNEL); |
@@ -1173,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, | |||
1173 | cond_resched(); | 1182 | cond_resched(); |
1174 | } | 1183 | } |
1175 | 1184 | ||
1176 | mmun_start = haddr; | 1185 | mmu_notifier_range_init(&range, vma->vm_mm, haddr, |
1177 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1186 | haddr + HPAGE_PMD_SIZE); |
1178 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 1187 | mmu_notifier_invalidate_range_start(&range); |
1179 | 1188 | ||
1180 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); | 1189 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
1181 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) | 1190 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
@@ -1220,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, | |||
1220 | * No need to double call mmu_notifier->invalidate_range() callback as | 1229 | * No need to double call mmu_notifier->invalidate_range() callback as |
1221 | * the above pmdp_huge_clear_flush_notify() did already call it. | 1230 | * the above pmdp_huge_clear_flush_notify() did already call it. |
1222 | */ | 1231 | */ |
1223 | mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, | 1232 | mmu_notifier_invalidate_range_only_end(&range); |
1224 | mmun_end); | ||
1225 | 1233 | ||
1226 | ret |= VM_FAULT_WRITE; | 1234 | ret |= VM_FAULT_WRITE; |
1227 | put_page(page); | 1235 | put_page(page); |
@@ -1231,7 +1239,7 @@ out: | |||
1231 | 1239 | ||
1232 | out_free_pages: | 1240 | out_free_pages: |
1233 | spin_unlock(vmf->ptl); | 1241 | spin_unlock(vmf->ptl); |
1234 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 1242 | mmu_notifier_invalidate_range_end(&range); |
1235 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1243 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1236 | memcg = (void *)page_private(pages[i]); | 1244 | memcg = (void *)page_private(pages[i]); |
1237 | set_page_private(pages[i], 0); | 1245 | set_page_private(pages[i], 0); |
@@ -1248,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) | |||
1248 | struct page *page = NULL, *new_page; | 1256 | struct page *page = NULL, *new_page; |
1249 | struct mem_cgroup *memcg; | 1257 | struct mem_cgroup *memcg; |
1250 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; | 1258 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
1251 | unsigned long mmun_start; /* For mmu_notifiers */ | 1259 | struct mmu_notifier_range range; |
1252 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1253 | gfp_t huge_gfp; /* for allocation and charge */ | 1260 | gfp_t huge_gfp; /* for allocation and charge */ |
1254 | vm_fault_t ret = 0; | 1261 | vm_fault_t ret = 0; |
1255 | 1262 | ||
@@ -1293,7 +1300,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) | |||
1293 | get_page(page); | 1300 | get_page(page); |
1294 | spin_unlock(vmf->ptl); | 1301 | spin_unlock(vmf->ptl); |
1295 | alloc: | 1302 | alloc: |
1296 | if (transparent_hugepage_enabled(vma) && | 1303 | if (__transparent_hugepage_enabled(vma) && |
1297 | !transparent_hugepage_debug_cow()) { | 1304 | !transparent_hugepage_debug_cow()) { |
1298 | huge_gfp = alloc_hugepage_direct_gfpmask(vma); | 1305 | huge_gfp = alloc_hugepage_direct_gfpmask(vma); |
1299 | new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); | 1306 | new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); |
@@ -1338,9 +1345,9 @@ alloc: | |||
1338 | vma, HPAGE_PMD_NR); | 1345 | vma, HPAGE_PMD_NR); |
1339 | __SetPageUptodate(new_page); | 1346 | __SetPageUptodate(new_page); |
1340 | 1347 | ||
1341 | mmun_start = haddr; | 1348 | mmu_notifier_range_init(&range, vma->vm_mm, haddr, |
1342 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1349 | haddr + HPAGE_PMD_SIZE); |
1343 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 1350 | mmu_notifier_invalidate_range_start(&range); |
1344 | 1351 | ||
1345 | spin_lock(vmf->ptl); | 1352 | spin_lock(vmf->ptl); |
1346 | if (page) | 1353 | if (page) |
@@ -1375,8 +1382,7 @@ out_mn: | |||
1375 | * No need to double call mmu_notifier->invalidate_range() callback as | 1382 | * No need to double call mmu_notifier->invalidate_range() callback as |
1376 | * the above pmdp_huge_clear_flush_notify() did already call it. | 1383 | * the above pmdp_huge_clear_flush_notify() did already call it. |
1377 | */ | 1384 | */ |
1378 | mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, | 1385 | mmu_notifier_invalidate_range_only_end(&range); |
1379 | mmun_end); | ||
1380 | out: | 1386 | out: |
1381 | return ret; | 1387 | return ret; |
1382 | out_unlock: | 1388 | out_unlock: |
@@ -1490,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1490 | if (!get_page_unless_zero(page)) | 1496 | if (!get_page_unless_zero(page)) |
1491 | goto out_unlock; | 1497 | goto out_unlock; |
1492 | spin_unlock(vmf->ptl); | 1498 | spin_unlock(vmf->ptl); |
1493 | wait_on_page_locked(page); | 1499 | put_and_wait_on_page_locked(page); |
1494 | put_page(page); | ||
1495 | goto out; | 1500 | goto out; |
1496 | } | 1501 | } |
1497 | 1502 | ||
@@ -1527,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1527 | if (!get_page_unless_zero(page)) | 1532 | if (!get_page_unless_zero(page)) |
1528 | goto out_unlock; | 1533 | goto out_unlock; |
1529 | spin_unlock(vmf->ptl); | 1534 | spin_unlock(vmf->ptl); |
1530 | wait_on_page_locked(page); | 1535 | put_and_wait_on_page_locked(page); |
1531 | put_page(page); | ||
1532 | goto out; | 1536 | goto out; |
1533 | } | 1537 | } |
1534 | 1538 | ||
@@ -2017,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, | |||
2017 | unsigned long address) | 2021 | unsigned long address) |
2018 | { | 2022 | { |
2019 | spinlock_t *ptl; | 2023 | spinlock_t *ptl; |
2020 | struct mm_struct *mm = vma->vm_mm; | 2024 | struct mmu_notifier_range range; |
2021 | unsigned long haddr = address & HPAGE_PUD_MASK; | ||
2022 | 2025 | ||
2023 | mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); | 2026 | mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, |
2024 | ptl = pud_lock(mm, pud); | 2027 | (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); |
2028 | mmu_notifier_invalidate_range_start(&range); | ||
2029 | ptl = pud_lock(vma->vm_mm, pud); | ||
2025 | if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) | 2030 | if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) |
2026 | goto out; | 2031 | goto out; |
2027 | __split_huge_pud_locked(vma, pud, haddr); | 2032 | __split_huge_pud_locked(vma, pud, range.start); |
2028 | 2033 | ||
2029 | out: | 2034 | out: |
2030 | spin_unlock(ptl); | 2035 | spin_unlock(ptl); |
@@ -2032,8 +2037,7 @@ out: | |||
2032 | * No need to double call mmu_notifier->invalidate_range() callback as | 2037 | * No need to double call mmu_notifier->invalidate_range() callback as |
2033 | * the above pudp_huge_clear_flush_notify() did already call it. | 2038 | * the above pudp_huge_clear_flush_notify() did already call it. |
2034 | */ | 2039 | */ |
2035 | mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + | 2040 | mmu_notifier_invalidate_range_only_end(&range); |
2036 | HPAGE_PUD_SIZE); | ||
2037 | } | 2041 | } |
2038 | #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ | 2042 | #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ |
2039 | 2043 | ||
@@ -2235,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
2235 | unsigned long address, bool freeze, struct page *page) | 2239 | unsigned long address, bool freeze, struct page *page) |
2236 | { | 2240 | { |
2237 | spinlock_t *ptl; | 2241 | spinlock_t *ptl; |
2238 | struct mm_struct *mm = vma->vm_mm; | 2242 | struct mmu_notifier_range range; |
2239 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
2240 | 2243 | ||
2241 | mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); | 2244 | mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, |
2242 | ptl = pmd_lock(mm, pmd); | 2245 | (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); |
2246 | mmu_notifier_invalidate_range_start(&range); | ||
2247 | ptl = pmd_lock(vma->vm_mm, pmd); | ||
2243 | 2248 | ||
2244 | /* | 2249 | /* |
2245 | * If caller asks to setup a migration entries, we need a page to check | 2250 | * If caller asks to setup a migration entries, we need a page to check |
@@ -2255,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
2255 | clear_page_mlock(page); | 2260 | clear_page_mlock(page); |
2256 | } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) | 2261 | } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) |
2257 | goto out; | 2262 | goto out; |
2258 | __split_huge_pmd_locked(vma, pmd, haddr, freeze); | 2263 | __split_huge_pmd_locked(vma, pmd, range.start, freeze); |
2259 | out: | 2264 | out: |
2260 | spin_unlock(ptl); | 2265 | spin_unlock(ptl); |
2261 | /* | 2266 | /* |
@@ -2271,8 +2276,7 @@ out: | |||
2271 | * any further changes to individual pte will notify. So no need | 2276 | * any further changes to individual pte will notify. So no need |
2272 | * to call mmu_notifier->invalidate_range() | 2277 | * to call mmu_notifier->invalidate_range() |
2273 | */ | 2278 | */ |
2274 | mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + | 2279 | mmu_notifier_invalidate_range_only_end(&range); |
2275 | HPAGE_PMD_SIZE); | ||
2276 | } | 2280 | } |
2277 | 2281 | ||
2278 | void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, | 2282 | void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, |
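A conversion that repeats through mm/huge_memory.c above and mm/hugetlb.c below: the mmun_start/mmun_end local pairs are replaced by a struct mmu_notifier_range that is initialised once and then passed to the invalidate start/end calls. The wrapper below only illustrates the new calling convention taken from those hunks; it is not a function from the patch.

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Illustrative only: the old API took (mm, start, end) at every call site. */
static void sketch_notify_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, vma->vm_mm, start, end);
	mmu_notifier_invalidate_range_start(&range);

	/* ... page table updates happen here, under the usual locks ... */

	mmu_notifier_invalidate_range_end(&range);
}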
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a80832487981..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,24 +3238,35 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3238 | struct page *ptepage; | 3238 | struct page *ptepage; |
3239 | unsigned long addr; | 3239 | unsigned long addr; |
3240 | int cow; | 3240 | int cow; |
3241 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
3241 | struct hstate *h = hstate_vma(vma); | 3242 | struct hstate *h = hstate_vma(vma); |
3242 | unsigned long sz = huge_page_size(h); | 3243 | unsigned long sz = huge_page_size(h); |
3243 | unsigned long mmun_start; /* For mmu_notifiers */ | 3244 | struct mmu_notifier_range range; |
3244 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
3245 | int ret = 0; | 3245 | int ret = 0; |
3246 | 3246 | ||
3247 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 3247 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
3248 | 3248 | ||
3249 | mmun_start = vma->vm_start; | 3249 | if (cow) { |
3250 | mmun_end = vma->vm_end; | 3250 | mmu_notifier_range_init(&range, src, vma->vm_start, |
3251 | if (cow) | 3251 | vma->vm_end); |
3252 | mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); | 3252 | mmu_notifier_invalidate_range_start(&range); |
3253 | } else { | ||
3254 | /* | ||
3255 | * For shared mappings i_mmap_rwsem must be held to call | ||
3256 | * huge_pte_alloc, otherwise the returned ptep could go | ||
3257 | * away if part of a shared pmd and another thread calls | ||
3258 | * huge_pmd_unshare. | ||
3259 | */ | ||
3260 | i_mmap_lock_read(mapping); | ||
3261 | } | ||
3253 | 3262 | ||
3254 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 3263 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
3255 | spinlock_t *src_ptl, *dst_ptl; | 3264 | spinlock_t *src_ptl, *dst_ptl; |
3265 | |||
3256 | src_pte = huge_pte_offset(src, addr, sz); | 3266 | src_pte = huge_pte_offset(src, addr, sz); |
3257 | if (!src_pte) | 3267 | if (!src_pte) |
3258 | continue; | 3268 | continue; |
3269 | |||
3259 | dst_pte = huge_pte_alloc(dst, addr, sz); | 3270 | dst_pte = huge_pte_alloc(dst, addr, sz); |
3260 | if (!dst_pte) { | 3271 | if (!dst_pte) { |
3261 | ret = -ENOMEM; | 3272 | ret = -ENOMEM; |
@@ -3325,7 +3336,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3325 | } | 3336 | } |
3326 | 3337 | ||
3327 | if (cow) | 3338 | if (cow) |
3328 | mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); | 3339 | mmu_notifier_invalidate_range_end(&range); |
3340 | else | ||
3341 | i_mmap_unlock_read(mapping); | ||
3329 | 3342 | ||
3330 | return ret; | 3343 | return ret; |
3331 | } | 3344 | } |
@@ -3342,8 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
3342 | struct page *page; | 3355 | struct page *page; |
3343 | struct hstate *h = hstate_vma(vma); | 3356 | struct hstate *h = hstate_vma(vma); |
3344 | unsigned long sz = huge_page_size(h); | 3357 | unsigned long sz = huge_page_size(h); |
3345 | unsigned long mmun_start = start; /* For mmu_notifiers */ | 3358 | struct mmu_notifier_range range; |
3346 | unsigned long mmun_end = end; /* For mmu_notifiers */ | ||
3347 | 3359 | ||
3348 | WARN_ON(!is_vm_hugetlb_page(vma)); | 3360 | WARN_ON(!is_vm_hugetlb_page(vma)); |
3349 | BUG_ON(start & ~huge_page_mask(h)); | 3361 | BUG_ON(start & ~huge_page_mask(h)); |
@@ -3359,8 +3371,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
3359 | /* | 3371 | /* |
3360 | * If sharing possible, alert mmu notifiers of worst case. | 3372 | * If sharing possible, alert mmu notifiers of worst case. |
3361 | */ | 3373 | */ |
3362 | adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); | 3374 | mmu_notifier_range_init(&range, mm, start, end); |
3363 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 3375 | adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); |
3376 | mmu_notifier_invalidate_range_start(&range); | ||
3364 | address = start; | 3377 | address = start; |
3365 | for (; address < end; address += sz) { | 3378 | for (; address < end; address += sz) { |
3366 | ptep = huge_pte_offset(mm, address, sz); | 3379 | ptep = huge_pte_offset(mm, address, sz); |
@@ -3428,7 +3441,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
3428 | if (ref_page) | 3441 | if (ref_page) |
3429 | break; | 3442 | break; |
3430 | } | 3443 | } |
3431 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 3444 | mmu_notifier_invalidate_range_end(&range); |
3432 | tlb_end_vma(tlb, vma); | 3445 | tlb_end_vma(tlb, vma); |
3433 | } | 3446 | } |
3434 | 3447 | ||
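For __unmap_hugepage_range() the conversion is mechanical: the old (mm, start, end) triple becomes a struct mmu_notifier_range that is initialized once and then widened in place to the worst case for PMD sharing. Old and new forms, both taken from the hunk above:

/* before */
adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/* ... unmap loop ... */
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

/* after */
mmu_notifier_range_init(&range, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
/* ... unmap loop ... */
mmu_notifier_invalidate_range_end(&range);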
@@ -3546,9 +3559,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3546 | struct page *old_page, *new_page; | 3559 | struct page *old_page, *new_page; |
3547 | int outside_reserve = 0; | 3560 | int outside_reserve = 0; |
3548 | vm_fault_t ret = 0; | 3561 | vm_fault_t ret = 0; |
3549 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
3550 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
3551 | unsigned long haddr = address & huge_page_mask(h); | 3562 | unsigned long haddr = address & huge_page_mask(h); |
3563 | struct mmu_notifier_range range; | ||
3552 | 3564 | ||
3553 | pte = huge_ptep_get(ptep); | 3565 | pte = huge_ptep_get(ptep); |
3554 | old_page = pte_page(pte); | 3566 | old_page = pte_page(pte); |
@@ -3627,9 +3639,8 @@ retry_avoidcopy: | |||
3627 | __SetPageUptodate(new_page); | 3639 | __SetPageUptodate(new_page); |
3628 | set_page_huge_active(new_page); | 3640 | set_page_huge_active(new_page); |
3629 | 3641 | ||
3630 | mmun_start = haddr; | 3642 | mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); |
3631 | mmun_end = mmun_start + huge_page_size(h); | 3643 | mmu_notifier_invalidate_range_start(&range); |
3632 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
3633 | 3644 | ||
3634 | /* | 3645 | /* |
3635 | * Retake the page table lock to check for racing updates | 3646 | * Retake the page table lock to check for racing updates |
@@ -3642,7 +3653,7 @@ retry_avoidcopy: | |||
3642 | 3653 | ||
3643 | /* Break COW */ | 3654 | /* Break COW */ |
3644 | huge_ptep_clear_flush(vma, haddr, ptep); | 3655 | huge_ptep_clear_flush(vma, haddr, ptep); |
3645 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); | 3656 | mmu_notifier_invalidate_range(mm, range.start, range.end); |
3646 | set_huge_pte_at(mm, haddr, ptep, | 3657 | set_huge_pte_at(mm, haddr, ptep, |
3647 | make_huge_pte(vma, new_page, 1)); | 3658 | make_huge_pte(vma, new_page, 1)); |
3648 | page_remove_rmap(old_page, true); | 3659 | page_remove_rmap(old_page, true); |
@@ -3651,7 +3662,7 @@ retry_avoidcopy: | |||
3651 | new_page = old_page; | 3662 | new_page = old_page; |
3652 | } | 3663 | } |
3653 | spin_unlock(ptl); | 3664 | spin_unlock(ptl); |
3654 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 3665 | mmu_notifier_invalidate_range_end(&range); |
3655 | out_release_all: | 3666 | out_release_all: |
3656 | restore_reserve_on_error(h, vma, haddr, new_page); | 3667 | restore_reserve_on_error(h, vma, haddr, new_page); |
3657 | put_page(new_page); | 3668 | put_page(new_page); |
@@ -3744,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, | |||
3744 | } | 3755 | } |
3745 | 3756 | ||
3746 | /* | 3757 | /* |
3747 | * Use page lock to guard against racing truncation | 3758 | * We can not race with truncation due to holding i_mmap_rwsem. |
3748 | * before we get page_table_lock. | 3759 | * Check once here for faults beyond end of file. |
3749 | */ | 3760 | */ |
3761 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3762 | if (idx >= size) | ||
3763 | goto out; | ||
3764 | |||
3750 | retry: | 3765 | retry: |
3751 | page = find_lock_page(mapping, idx); | 3766 | page = find_lock_page(mapping, idx); |
3752 | if (!page) { | 3767 | if (!page) { |
3753 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3754 | if (idx >= size) | ||
3755 | goto out; | ||
3756 | |||
3757 | /* | 3768 | /* |
3758 | * Check for page in userfault range | 3769 | * Check for page in userfault range |
3759 | */ | 3770 | */ |
@@ -3773,14 +3784,18 @@ retry: | |||
3773 | }; | 3784 | }; |
3774 | 3785 | ||
3775 | /* | 3786 | /* |
3776 | * hugetlb_fault_mutex must be dropped before | 3787 | * hugetlb_fault_mutex and i_mmap_rwsem must be |
3777 | * handling userfault. Reacquire after handling | 3788 | * dropped before handling userfault. Reacquire |
3778 | * fault to make calling code simpler. | 3789 | * after handling fault to make calling code simpler. |
3779 | */ | 3790 | */ |
3780 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, | 3791 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, |
3781 | idx, haddr); | 3792 | idx, haddr); |
3782 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 3793 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
3794 | i_mmap_unlock_read(mapping); | ||
3795 | |||
3783 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); | 3796 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); |
3797 | |||
3798 | i_mmap_lock_read(mapping); | ||
3784 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 3799 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
3785 | goto out; | 3800 | goto out; |
3786 | } | 3801 | } |
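Because the fault path now enters hugetlb_no_page() with i_mmap_rwsem held, the userfault branch has to drop it together with the fault mutex before calling handle_userfault(), which can sleep for as long as userspace takes to resolve the fault, and retake both afterwards so the rest of the function is unchanged. The locking skeleton of that branch, reduced from the hunk above:

hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);            /* new: also drop i_mmap_rwsem */

ret = handle_userfault(&vmf, VM_UFFD_MISSING);

i_mmap_lock_read(mapping);              /* reacquire in the usual order */
mutex_lock(&hugetlb_fault_mutex_table[hash]);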
@@ -3839,9 +3854,6 @@ retry: | |||
3839 | } | 3854 | } |
3840 | 3855 | ||
3841 | ptl = huge_pte_lock(h, mm, ptep); | 3856 | ptl = huge_pte_lock(h, mm, ptep); |
3842 | size = i_size_read(mapping->host) >> huge_page_shift(h); | ||
3843 | if (idx >= size) | ||
3844 | goto backout; | ||
3845 | 3857 | ||
3846 | ret = 0; | 3858 | ret = 0; |
3847 | if (!huge_pte_none(huge_ptep_get(ptep))) | 3859 | if (!huge_pte_none(huge_ptep_get(ptep))) |
@@ -3928,6 +3940,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3928 | 3940 | ||
3929 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); | 3941 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); |
3930 | if (ptep) { | 3942 | if (ptep) { |
3943 | /* | ||
3944 | * Since we hold no locks, ptep could be stale. That is | ||
3945 | * OK as we are only making decisions based on content and | ||
3946 | * not actually modifying content here. | ||
3947 | */ | ||
3931 | entry = huge_ptep_get(ptep); | 3948 | entry = huge_ptep_get(ptep); |
3932 | if (unlikely(is_hugetlb_entry_migration(entry))) { | 3949 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
3933 | migration_entry_wait_huge(vma, mm, ptep); | 3950 | migration_entry_wait_huge(vma, mm, ptep); |
@@ -3935,20 +3952,33 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3935 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 3952 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
3936 | return VM_FAULT_HWPOISON_LARGE | | 3953 | return VM_FAULT_HWPOISON_LARGE | |
3937 | VM_FAULT_SET_HINDEX(hstate_index(h)); | 3954 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
3938 | } else { | ||
3939 | ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); | ||
3940 | if (!ptep) | ||
3941 | return VM_FAULT_OOM; | ||
3942 | } | 3955 | } |
3943 | 3956 | ||
3957 | /* | ||
3958 | * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold | ||
3959 | * until finished with ptep. This serves two purposes: | ||
3960 | * 1) It prevents huge_pmd_unshare from being called elsewhere | ||
3961 | * and making the ptep no longer valid. | ||
3962 | * 2) It synchronizes us with file truncation. | ||
3963 | * | ||
3964 | * ptep could have already been assigned via huge_pte_offset. That | ||
3965 | * is OK, as huge_pte_alloc will return the same value unless | ||
3966 | * something changed. | ||
3967 | */ | ||
3944 | mapping = vma->vm_file->f_mapping; | 3968 | mapping = vma->vm_file->f_mapping; |
3945 | idx = vma_hugecache_offset(h, vma, haddr); | 3969 | i_mmap_lock_read(mapping); |
3970 | ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); | ||
3971 | if (!ptep) { | ||
3972 | i_mmap_unlock_read(mapping); | ||
3973 | return VM_FAULT_OOM; | ||
3974 | } | ||
3946 | 3975 | ||
3947 | /* | 3976 | /* |
3948 | * Serialize hugepage allocation and instantiation, so that we don't | 3977 | * Serialize hugepage allocation and instantiation, so that we don't |
3949 | * get spurious allocation failures if two CPUs race to instantiate | 3978 | * get spurious allocation failures if two CPUs race to instantiate |
3950 | * the same page in the page cache. | 3979 | * the same page in the page cache. |
3951 | */ | 3980 | */ |
3981 | idx = vma_hugecache_offset(h, vma, haddr); | ||
3952 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); | 3982 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); |
3953 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 3983 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
3954 | 3984 | ||
@@ -4036,6 +4066,7 @@ out_ptl: | |||
4036 | } | 4066 | } |
4037 | out_mutex: | 4067 | out_mutex: |
4038 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 4068 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
4069 | i_mmap_unlock_read(mapping); | ||
4039 | /* | 4070 | /* |
4040 | * Generally it's safe to hold refcount during waiting page lock. But | 4071 | * Generally it's safe to hold refcount during waiting page lock. But |
4041 | * here we just wait to defer the next page fault to avoid busy loop and | 4072 | * here we just wait to defer the next page fault to avoid busy loop and |
@@ -4340,21 +4371,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
4340 | pte_t pte; | 4371 | pte_t pte; |
4341 | struct hstate *h = hstate_vma(vma); | 4372 | struct hstate *h = hstate_vma(vma); |
4342 | unsigned long pages = 0; | 4373 | unsigned long pages = 0; |
4343 | unsigned long f_start = start; | ||
4344 | unsigned long f_end = end; | ||
4345 | bool shared_pmd = false; | 4374 | bool shared_pmd = false; |
4375 | struct mmu_notifier_range range; | ||
4346 | 4376 | ||
4347 | /* | 4377 | /* |
4348 | * In the case of shared PMDs, the area to flush could be beyond | 4378 | * In the case of shared PMDs, the area to flush could be beyond |
4349 | * start/end. Set f_start/f_end to cover the maximum possible | 4379 | * start/end. Set range.start/range.end to cover the maximum possible |
4350 | * range if PMD sharing is possible. | 4380 | * range if PMD sharing is possible. |
4351 | */ | 4381 | */ |
4352 | adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); | 4382 | mmu_notifier_range_init(&range, mm, start, end); |
4383 | adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); | ||
4353 | 4384 | ||
4354 | BUG_ON(address >= end); | 4385 | BUG_ON(address >= end); |
4355 | flush_cache_range(vma, f_start, f_end); | 4386 | flush_cache_range(vma, range.start, range.end); |
4356 | 4387 | ||
4357 | mmu_notifier_invalidate_range_start(mm, f_start, f_end); | 4388 | mmu_notifier_invalidate_range_start(&range); |
4358 | i_mmap_lock_write(vma->vm_file->f_mapping); | 4389 | i_mmap_lock_write(vma->vm_file->f_mapping); |
4359 | for (; address < end; address += huge_page_size(h)) { | 4390 | for (; address < end; address += huge_page_size(h)) { |
4360 | spinlock_t *ptl; | 4391 | spinlock_t *ptl; |
@@ -4405,7 +4436,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
4405 | * did unshare a page of pmds, flush the range corresponding to the pud. | 4436 | * did unshare a page of pmds, flush the range corresponding to the pud. |
4406 | */ | 4437 | */ |
4407 | if (shared_pmd) | 4438 | if (shared_pmd) |
4408 | flush_hugetlb_tlb_range(vma, f_start, f_end); | 4439 | flush_hugetlb_tlb_range(vma, range.start, range.end); |
4409 | else | 4440 | else |
4410 | flush_hugetlb_tlb_range(vma, start, end); | 4441 | flush_hugetlb_tlb_range(vma, start, end); |
4411 | /* | 4442 | /* |
@@ -4415,7 +4446,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
4415 | * See Documentation/vm/mmu_notifier.rst | 4446 | * See Documentation/vm/mmu_notifier.rst |
4416 | */ | 4447 | */ |
4417 | i_mmap_unlock_write(vma->vm_file->f_mapping); | 4448 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
4418 | mmu_notifier_invalidate_range_end(mm, f_start, f_end); | 4449 | mmu_notifier_invalidate_range_end(&range); |
4419 | 4450 | ||
4420 | return pages << h->order; | 4451 | return pages << h->order; |
4421 | } | 4452 | } |
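hugetlb_change_protection() follows the same pattern: range.start/range.end replace the old f_start/f_end locals and may be widened for PMD sharing, and the final TLB flush picks the widened extent only if a pmd page was actually unshared. Condensed from the hunk above:

mmu_notifier_range_init(&range, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);

flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);

/* ... walk the range and update protections ... */

if (shared_pmd)
        /* a page of pmds was unshared: flush the range covering the pud */
        flush_hugetlb_tlb_range(vma, range.start, range.end);
else
        flush_hugetlb_tlb_range(vma, start, end);

i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(&range);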
@@ -4640,10 +4671,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, | |||
4640 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() | 4671 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() |
4641 | * and returns the corresponding pte. While this is not necessary for the | 4672 | * and returns the corresponding pte. While this is not necessary for the |
4642 | * !shared pmd case because we can allocate the pmd later as well, it makes the | 4673 | * !shared pmd case because we can allocate the pmd later as well, it makes the |
4643 | * code much cleaner. pmd allocation is essential for the shared case because | 4674 | * code much cleaner. |
4644 | * pud has to be populated inside the same i_mmap_rwsem section - otherwise | 4675 | * |
4645 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a | 4676 | * This routine must be called with i_mmap_rwsem held in at least read mode. |
4646 | * bad pmd for sharing. | 4677 | * For hugetlbfs, this prevents removal of any page table entries associated |
4678 | * with the address space. This is important as we are setting up sharing | ||
4679 | * based on existing page table entries (mappings). | ||
4647 | */ | 4680 | */ |
4648 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | 4681 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) |
4649 | { | 4682 | { |
@@ -4660,7 +4693,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
4660 | if (!vma_shareable(vma, addr)) | 4693 | if (!vma_shareable(vma, addr)) |
4661 | return (pte_t *)pmd_alloc(mm, pud, addr); | 4694 | return (pte_t *)pmd_alloc(mm, pud, addr); |
4662 | 4695 | ||
4663 | i_mmap_lock_write(mapping); | ||
4664 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { | 4696 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
4665 | if (svma == vma) | 4697 | if (svma == vma) |
4666 | continue; | 4698 | continue; |
@@ -4690,7 +4722,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
4690 | spin_unlock(ptl); | 4722 | spin_unlock(ptl); |
4691 | out: | 4723 | out: |
4692 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 4724 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
4693 | i_mmap_unlock_write(mapping); | ||
4694 | return pte; | 4725 | return pte; |
4695 | } | 4726 | } |
4696 | 4727 | ||
@@ -4701,7 +4732,7 @@ out: | |||
4701 | * indicated by page_count > 1, unmap is achieved by clearing pud and | 4732 | * indicated by page_count > 1, unmap is achieved by clearing pud and |
4702 | * decrementing the ref count. If count == 1, the pte page is not shared. | 4733 | * decrementing the ref count. If count == 1, the pte page is not shared. |
4703 | * | 4734 | * |
4704 | * called with page table lock held. | 4735 | * Called with page table lock held and i_mmap_rwsem held in write mode. |
4705 | * | 4736 | * |
4706 | * returns: 1 successfully unmapped a shared pte page | 4737 | * returns: 1 successfully unmapped a shared pte page |
4707 | * 0 the underlying pte page is not shared, or it is the last user | 4738 | * 0 the underlying pte page is not shared, or it is the last user |
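The two comment updates that close out the hugetlb.c changes state the new contract explicitly: huge_pmd_share() must be entered with i_mmap_rwsem held in at least read mode (it no longer takes the lock itself, hence the deleted i_mmap_lock_write()/i_mmap_unlock_write() pair), and huge_pmd_unshare() additionally needs the semaphore in write mode on top of the page table lock. A caller-side sketch of the write-mode case, built only from calls that appear elsewhere in this diff:

/* teardown side: write mode first, then the huge page table lock */
i_mmap_lock_write(vma->vm_file->f_mapping);

ptl = huge_pte_lock(h, mm, ptep);
/* huge_pmd_unshare() may be called here, with both locks held */
spin_unlock(ptl);

i_mmap_unlock_write(vma->vm_file->f_mapping);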
diff --git a/mm/internal.h b/mm/internal.h index 291eb2b6d1d8..f4a7bb02decf 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -444,6 +444,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
444 | #define NODE_RECLAIM_SOME 0 | 444 | #define NODE_RECLAIM_SOME 0 |
445 | #define NODE_RECLAIM_SUCCESS 1 | 445 | #define NODE_RECLAIM_SUCCESS 1 |
446 | 446 | ||
447 | #ifdef CONFIG_NUMA | ||
448 | extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); | ||
449 | #else | ||
450 | static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, | ||
451 | unsigned int order) | ||
452 | { | ||
453 | return NODE_RECLAIM_NOSCAN; | ||
454 | } | ||
455 | #endif | ||
456 | |||
447 | extern int hwpoison_filter(struct page *p); | 457 | extern int hwpoison_filter(struct page *p); |
448 | 458 | ||
449 | extern u32 hwpoison_filter_dev_major; | 459 | extern u32 hwpoison_filter_dev_major; |
@@ -480,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
480 | #define ALLOC_OOM ALLOC_NO_WATERMARKS | 490 | #define ALLOC_OOM ALLOC_NO_WATERMARKS |
481 | #endif | 491 | #endif |
482 | 492 | ||
483 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 493 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
484 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 494 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
485 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 495 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
486 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | 496 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ |
497 | #ifdef CONFIG_ZONE_DMA32 | ||
498 | #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ | ||
499 | #else | ||
500 | #define ALLOC_NOFRAGMENT 0x0 | ||
501 | #endif | ||
502 | #define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */ | ||
487 | 503 | ||
488 | enum ttu_flags; | 504 | enum ttu_flags; |
489 | struct tlbflush_unmap_batch; | 505 | struct tlbflush_unmap_batch; |
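Both mm/internal.h additions use the same convention: when the controlling config option is off, the symbol degrades to a harmless constant (a node_reclaim() stub returning NODE_RECLAIM_NOSCAN, ALLOC_NOFRAGMENT defined as 0x0), so call sites need no #ifdef of their own. A small stand-alone program illustrating why the 0x0 definition is enough — the flag test simply becomes constant false and the branch can be dropped (names mirror the hunk; the program itself is only a demonstration):

#include <stdio.h>

/* Mirrors the mm/internal.h pattern from the hunk above. */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT 0x100  /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT 0x0    /* flag still exists, but is never set */
#endif
#define ALLOC_KSWAPD     0x200  /* allow waking of kswapd */

int main(void)
{
        unsigned int alloc_flags = ALLOC_KSWAPD | ALLOC_NOFRAGMENT;

        /* Without CONFIG_ZONE_DMA32 this condition is constant false. */
        if (alloc_flags & ALLOC_NOFRAGMENT)
                printf("avoiding pageblock type mixing\n");
        else
                printf("ALLOC_NOFRAGMENT compiled out, flags = 0x%x\n",
                       alloc_flags);
        return 0;
}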
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 3289db38bc87..0a14fcff70ed 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile | |||
@@ -1,11 +1,18 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
2 | KASAN_SANITIZE := n | 2 | KASAN_SANITIZE := n |
3 | UBSAN_SANITIZE_kasan.o := n | 3 | UBSAN_SANITIZE_common.o := n |
4 | UBSAN_SANITIZE_generic.o := n | ||
5 | UBSAN_SANITIZE_tags.o := n | ||
4 | KCOV_INSTRUMENT := n | 6 | KCOV_INSTRUMENT := n |
5 | 7 | ||
6 | CFLAGS_REMOVE_kasan.o = -pg | 8 | CFLAGS_REMOVE_generic.o = -pg |
7 | # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 | 9 | # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 |
8 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 | 10 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 |
9 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
10 | 11 | ||
11 | obj-y := kasan.o report.o kasan_init.o quarantine.o | 12 | CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) |
13 | CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
14 | CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
15 | |||
16 | obj-$(CONFIG_KASAN) := common.o init.o report.o | ||
17 | obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o | ||
18 | obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c index c3bd5209da38..03d5d1374ca7 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/common.c | |||
@@ -1,5 +1,6 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
1 | /* | 2 | /* |
2 | * This file contains shadow memory manipulation code. | 3 | * This file contains common generic and tag-based KASAN code. |
3 | * | 4 | * |
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | 5 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. |
5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | 6 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> |
@@ -13,9 +14,6 @@ | |||
13 | * | 14 | * |
14 | */ | 15 | */ |
15 | 16 | ||
16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
17 | #define DISABLE_BRANCH_PROFILING | ||
18 | |||
19 | #include <linux/export.h> | 17 | #include <linux/export.h> |
20 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
21 | #include <linux/init.h> | 19 | #include <linux/init.h> |
@@ -40,6 +38,53 @@ | |||
40 | #include "kasan.h" | 38 | #include "kasan.h" |
41 | #include "../slab.h" | 39 | #include "../slab.h" |
42 | 40 | ||
41 | static inline int in_irqentry_text(unsigned long ptr) | ||
42 | { | ||
43 | return (ptr >= (unsigned long)&__irqentry_text_start && | ||
44 | ptr < (unsigned long)&__irqentry_text_end) || | ||
45 | (ptr >= (unsigned long)&__softirqentry_text_start && | ||
46 | ptr < (unsigned long)&__softirqentry_text_end); | ||
47 | } | ||
48 | |||
49 | static inline void filter_irq_stacks(struct stack_trace *trace) | ||
50 | { | ||
51 | int i; | ||
52 | |||
53 | if (!trace->nr_entries) | ||
54 | return; | ||
55 | for (i = 0; i < trace->nr_entries; i++) | ||
56 | if (in_irqentry_text(trace->entries[i])) { | ||
57 | /* Include the irqentry function into the stack. */ | ||
58 | trace->nr_entries = i + 1; | ||
59 | break; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | static inline depot_stack_handle_t save_stack(gfp_t flags) | ||
64 | { | ||
65 | unsigned long entries[KASAN_STACK_DEPTH]; | ||
66 | struct stack_trace trace = { | ||
67 | .nr_entries = 0, | ||
68 | .entries = entries, | ||
69 | .max_entries = KASAN_STACK_DEPTH, | ||
70 | .skip = 0 | ||
71 | }; | ||
72 | |||
73 | save_stack_trace(&trace); | ||
74 | filter_irq_stacks(&trace); | ||
75 | if (trace.nr_entries != 0 && | ||
76 | trace.entries[trace.nr_entries-1] == ULONG_MAX) | ||
77 | trace.nr_entries--; | ||
78 | |||
79 | return depot_save_stack(&trace, flags); | ||
80 | } | ||
81 | |||
82 | static inline void set_track(struct kasan_track *track, gfp_t flags) | ||
83 | { | ||
84 | track->pid = current->pid; | ||
85 | track->stack = save_stack(flags); | ||
86 | } | ||
87 | |||
43 | void kasan_enable_current(void) | 88 | void kasan_enable_current(void) |
44 | { | 89 | { |
45 | current->kasan_depth++; | 90 | current->kasan_depth++; |
@@ -50,27 +95,85 @@ void kasan_disable_current(void) | |||
50 | current->kasan_depth--; | 95 | current->kasan_depth--; |
51 | } | 96 | } |
52 | 97 | ||
98 | void kasan_check_read(const volatile void *p, unsigned int size) | ||
99 | { | ||
100 | check_memory_region((unsigned long)p, size, false, _RET_IP_); | ||
101 | } | ||
102 | EXPORT_SYMBOL(kasan_check_read); | ||
103 | |||
104 | void kasan_check_write(const volatile void *p, unsigned int size) | ||
105 | { | ||
106 | check_memory_region((unsigned long)p, size, true, _RET_IP_); | ||
107 | } | ||
108 | EXPORT_SYMBOL(kasan_check_write); | ||
109 | |||
110 | #undef memset | ||
111 | void *memset(void *addr, int c, size_t len) | ||
112 | { | ||
113 | check_memory_region((unsigned long)addr, len, true, _RET_IP_); | ||
114 | |||
115 | return __memset(addr, c, len); | ||
116 | } | ||
117 | |||
118 | #undef memmove | ||
119 | void *memmove(void *dest, const void *src, size_t len) | ||
120 | { | ||
121 | check_memory_region((unsigned long)src, len, false, _RET_IP_); | ||
122 | check_memory_region((unsigned long)dest, len, true, _RET_IP_); | ||
123 | |||
124 | return __memmove(dest, src, len); | ||
125 | } | ||
126 | |||
127 | #undef memcpy | ||
128 | void *memcpy(void *dest, const void *src, size_t len) | ||
129 | { | ||
130 | check_memory_region((unsigned long)src, len, false, _RET_IP_); | ||
131 | check_memory_region((unsigned long)dest, len, true, _RET_IP_); | ||
132 | |||
133 | return __memcpy(dest, src, len); | ||
134 | } | ||
135 | |||
53 | /* | 136 | /* |
54 | * Poisons the shadow memory for 'size' bytes starting from 'addr'. | 137 | * Poisons the shadow memory for 'size' bytes starting from 'addr'. |
55 | * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. | 138 | * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. |
56 | */ | 139 | */ |
57 | static void kasan_poison_shadow(const void *address, size_t size, u8 value) | 140 | void kasan_poison_shadow(const void *address, size_t size, u8 value) |
58 | { | 141 | { |
59 | void *shadow_start, *shadow_end; | 142 | void *shadow_start, *shadow_end; |
60 | 143 | ||
144 | /* | ||
145 | * Perform shadow offset calculation based on untagged address, as | ||
146 | * some of the callers (e.g. kasan_poison_object_data) pass tagged | ||
147 | * addresses to this function. | ||
148 | */ | ||
149 | address = reset_tag(address); | ||
150 | |||
61 | shadow_start = kasan_mem_to_shadow(address); | 151 | shadow_start = kasan_mem_to_shadow(address); |
62 | shadow_end = kasan_mem_to_shadow(address + size); | 152 | shadow_end = kasan_mem_to_shadow(address + size); |
63 | 153 | ||
64 | memset(shadow_start, value, shadow_end - shadow_start); | 154 | __memset(shadow_start, value, shadow_end - shadow_start); |
65 | } | 155 | } |
66 | 156 | ||
67 | void kasan_unpoison_shadow(const void *address, size_t size) | 157 | void kasan_unpoison_shadow(const void *address, size_t size) |
68 | { | 158 | { |
69 | kasan_poison_shadow(address, size, 0); | 159 | u8 tag = get_tag(address); |
160 | |||
161 | /* | ||
162 | * Perform shadow offset calculation based on untagged address, as | ||
163 | * some of the callers (e.g. kasan_unpoison_object_data) pass tagged | ||
164 | * addresses to this function. | ||
165 | */ | ||
166 | address = reset_tag(address); | ||
167 | |||
168 | kasan_poison_shadow(address, size, tag); | ||
70 | 169 | ||
71 | if (size & KASAN_SHADOW_MASK) { | 170 | if (size & KASAN_SHADOW_MASK) { |
72 | u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); | 171 | u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); |
73 | *shadow = size & KASAN_SHADOW_MASK; | 172 | |
173 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) | ||
174 | *shadow = tag; | ||
175 | else | ||
176 | *shadow = size & KASAN_SHADOW_MASK; | ||
74 | } | 177 | } |
75 | } | 178 | } |
76 | 179 | ||
@@ -116,199 +219,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark) | |||
116 | kasan_unpoison_shadow(sp, size); | 219 | kasan_unpoison_shadow(sp, size); |
117 | } | 220 | } |
118 | 221 | ||
119 | /* | 222 | void kasan_alloc_pages(struct page *page, unsigned int order) |
120 | * All functions below always inlined so compiler could | ||
121 | * perform better optimizations in each of __asan_loadX/__assn_storeX | ||
122 | * depending on memory access size X. | ||
123 | */ | ||
124 | |||
125 | static __always_inline bool memory_is_poisoned_1(unsigned long addr) | ||
126 | { | ||
127 | s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); | ||
128 | |||
129 | if (unlikely(shadow_value)) { | ||
130 | s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; | ||
131 | return unlikely(last_accessible_byte >= shadow_value); | ||
132 | } | ||
133 | |||
134 | return false; | ||
135 | } | ||
136 | |||
137 | static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, | ||
138 | unsigned long size) | ||
139 | { | ||
140 | u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); | ||
141 | |||
142 | /* | ||
143 | * Access crosses 8(shadow size)-byte boundary. Such access maps | ||
144 | * into 2 shadow bytes, so we need to check them both. | ||
145 | */ | ||
146 | if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) | ||
147 | return *shadow_addr || memory_is_poisoned_1(addr + size - 1); | ||
148 | |||
149 | return memory_is_poisoned_1(addr + size - 1); | ||
150 | } | ||
151 | |||
152 | static __always_inline bool memory_is_poisoned_16(unsigned long addr) | ||
153 | { | ||
154 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
155 | |||
156 | /* Unaligned 16-bytes access maps into 3 shadow bytes. */ | ||
157 | if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) | ||
158 | return *shadow_addr || memory_is_poisoned_1(addr + 15); | ||
159 | |||
160 | return *shadow_addr; | ||
161 | } | ||
162 | |||
163 | static __always_inline unsigned long bytes_is_nonzero(const u8 *start, | ||
164 | size_t size) | ||
165 | { | ||
166 | while (size) { | ||
167 | if (unlikely(*start)) | ||
168 | return (unsigned long)start; | ||
169 | start++; | ||
170 | size--; | ||
171 | } | ||
172 | |||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | static __always_inline unsigned long memory_is_nonzero(const void *start, | ||
177 | const void *end) | ||
178 | { | ||
179 | unsigned int words; | ||
180 | unsigned long ret; | ||
181 | unsigned int prefix = (unsigned long)start % 8; | ||
182 | |||
183 | if (end - start <= 16) | ||
184 | return bytes_is_nonzero(start, end - start); | ||
185 | |||
186 | if (prefix) { | ||
187 | prefix = 8 - prefix; | ||
188 | ret = bytes_is_nonzero(start, prefix); | ||
189 | if (unlikely(ret)) | ||
190 | return ret; | ||
191 | start += prefix; | ||
192 | } | ||
193 | |||
194 | words = (end - start) / 8; | ||
195 | while (words) { | ||
196 | if (unlikely(*(u64 *)start)) | ||
197 | return bytes_is_nonzero(start, 8); | ||
198 | start += 8; | ||
199 | words--; | ||
200 | } | ||
201 | |||
202 | return bytes_is_nonzero(start, (end - start) % 8); | ||
203 | } | ||
204 | |||
205 | static __always_inline bool memory_is_poisoned_n(unsigned long addr, | ||
206 | size_t size) | ||
207 | { | ||
208 | unsigned long ret; | ||
209 | |||
210 | ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), | ||
211 | kasan_mem_to_shadow((void *)addr + size - 1) + 1); | ||
212 | |||
213 | if (unlikely(ret)) { | ||
214 | unsigned long last_byte = addr + size - 1; | ||
215 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | ||
216 | |||
217 | if (unlikely(ret != (unsigned long)last_shadow || | ||
218 | ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | ||
219 | return true; | ||
220 | } | ||
221 | return false; | ||
222 | } | ||
223 | |||
224 | static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | ||
225 | { | ||
226 | if (__builtin_constant_p(size)) { | ||
227 | switch (size) { | ||
228 | case 1: | ||
229 | return memory_is_poisoned_1(addr); | ||
230 | case 2: | ||
231 | case 4: | ||
232 | case 8: | ||
233 | return memory_is_poisoned_2_4_8(addr, size); | ||
234 | case 16: | ||
235 | return memory_is_poisoned_16(addr); | ||
236 | default: | ||
237 | BUILD_BUG(); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | return memory_is_poisoned_n(addr, size); | ||
242 | } | ||
243 | |||
244 | static __always_inline void check_memory_region_inline(unsigned long addr, | ||
245 | size_t size, bool write, | ||
246 | unsigned long ret_ip) | ||
247 | { | 223 | { |
248 | if (unlikely(size == 0)) | 224 | u8 tag; |
249 | return; | 225 | unsigned long i; |
250 | 226 | ||
251 | if (unlikely((void *)addr < | 227 | if (unlikely(PageHighMem(page))) |
252 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
253 | kasan_report(addr, size, write, ret_ip); | ||
254 | return; | 228 | return; |
255 | } | ||
256 | 229 | ||
257 | if (likely(!memory_is_poisoned(addr, size))) | 230 | tag = random_tag(); |
258 | return; | 231 | for (i = 0; i < (1 << order); i++) |
259 | 232 | page_kasan_tag_set(page + i, tag); | |
260 | kasan_report(addr, size, write, ret_ip); | 233 | kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); |
261 | } | ||
262 | |||
263 | static void check_memory_region(unsigned long addr, | ||
264 | size_t size, bool write, | ||
265 | unsigned long ret_ip) | ||
266 | { | ||
267 | check_memory_region_inline(addr, size, write, ret_ip); | ||
268 | } | ||
269 | |||
270 | void kasan_check_read(const volatile void *p, unsigned int size) | ||
271 | { | ||
272 | check_memory_region((unsigned long)p, size, false, _RET_IP_); | ||
273 | } | ||
274 | EXPORT_SYMBOL(kasan_check_read); | ||
275 | |||
276 | void kasan_check_write(const volatile void *p, unsigned int size) | ||
277 | { | ||
278 | check_memory_region((unsigned long)p, size, true, _RET_IP_); | ||
279 | } | ||
280 | EXPORT_SYMBOL(kasan_check_write); | ||
281 | |||
282 | #undef memset | ||
283 | void *memset(void *addr, int c, size_t len) | ||
284 | { | ||
285 | check_memory_region((unsigned long)addr, len, true, _RET_IP_); | ||
286 | |||
287 | return __memset(addr, c, len); | ||
288 | } | ||
289 | |||
290 | #undef memmove | ||
291 | void *memmove(void *dest, const void *src, size_t len) | ||
292 | { | ||
293 | check_memory_region((unsigned long)src, len, false, _RET_IP_); | ||
294 | check_memory_region((unsigned long)dest, len, true, _RET_IP_); | ||
295 | |||
296 | return __memmove(dest, src, len); | ||
297 | } | ||
298 | |||
299 | #undef memcpy | ||
300 | void *memcpy(void *dest, const void *src, size_t len) | ||
301 | { | ||
302 | check_memory_region((unsigned long)src, len, false, _RET_IP_); | ||
303 | check_memory_region((unsigned long)dest, len, true, _RET_IP_); | ||
304 | |||
305 | return __memcpy(dest, src, len); | ||
306 | } | ||
307 | |||
308 | void kasan_alloc_pages(struct page *page, unsigned int order) | ||
309 | { | ||
310 | if (likely(!PageHighMem(page))) | ||
311 | kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); | ||
312 | } | 234 | } |
313 | 235 | ||
314 | void kasan_free_pages(struct page *page, unsigned int order) | 236 | void kasan_free_pages(struct page *page, unsigned int order) |
@@ -323,8 +245,11 @@ void kasan_free_pages(struct page *page, unsigned int order) | |||
323 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. | 245 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. |
324 | * For larger allocations larger redzones are used. | 246 | * For larger allocations larger redzones are used. |
325 | */ | 247 | */ |
326 | static unsigned int optimal_redzone(unsigned int object_size) | 248 | static inline unsigned int optimal_redzone(unsigned int object_size) |
327 | { | 249 | { |
250 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) | ||
251 | return 0; | ||
252 | |||
328 | return | 253 | return |
329 | object_size <= 64 - 16 ? 16 : | 254 | object_size <= 64 - 16 ? 16 : |
330 | object_size <= 128 - 32 ? 32 : | 255 | object_size <= 128 - 32 ? 32 : |
@@ -339,6 +264,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, | |||
339 | slab_flags_t *flags) | 264 | slab_flags_t *flags) |
340 | { | 265 | { |
341 | unsigned int orig_size = *size; | 266 | unsigned int orig_size = *size; |
267 | unsigned int redzone_size; | ||
342 | int redzone_adjust; | 268 | int redzone_adjust; |
343 | 269 | ||
344 | /* Add alloc meta. */ | 270 | /* Add alloc meta. */ |
@@ -346,20 +272,20 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, | |||
346 | *size += sizeof(struct kasan_alloc_meta); | 272 | *size += sizeof(struct kasan_alloc_meta); |
347 | 273 | ||
348 | /* Add free meta. */ | 274 | /* Add free meta. */ |
349 | if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || | 275 | if (IS_ENABLED(CONFIG_KASAN_GENERIC) && |
350 | cache->object_size < sizeof(struct kasan_free_meta)) { | 276 | (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || |
277 | cache->object_size < sizeof(struct kasan_free_meta))) { | ||
351 | cache->kasan_info.free_meta_offset = *size; | 278 | cache->kasan_info.free_meta_offset = *size; |
352 | *size += sizeof(struct kasan_free_meta); | 279 | *size += sizeof(struct kasan_free_meta); |
353 | } | 280 | } |
354 | redzone_adjust = optimal_redzone(cache->object_size) - | ||
355 | (*size - cache->object_size); | ||
356 | 281 | ||
282 | redzone_size = optimal_redzone(cache->object_size); | ||
283 | redzone_adjust = redzone_size - (*size - cache->object_size); | ||
357 | if (redzone_adjust > 0) | 284 | if (redzone_adjust > 0) |
358 | *size += redzone_adjust; | 285 | *size += redzone_adjust; |
359 | 286 | ||
360 | *size = min_t(unsigned int, KMALLOC_MAX_SIZE, | 287 | *size = min_t(unsigned int, KMALLOC_MAX_SIZE, |
361 | max(*size, cache->object_size + | 288 | max(*size, cache->object_size + redzone_size)); |
362 | optimal_redzone(cache->object_size))); | ||
363 | 289 | ||
364 | /* | 290 | /* |
365 | * If the metadata doesn't fit, don't enable KASAN at all. | 291 | * If the metadata doesn't fit, don't enable KASAN at all. |
@@ -372,30 +298,39 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, | |||
372 | return; | 298 | return; |
373 | } | 299 | } |
374 | 300 | ||
301 | cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE); | ||
302 | |||
375 | *flags |= SLAB_KASAN; | 303 | *flags |= SLAB_KASAN; |
376 | } | 304 | } |
377 | 305 | ||
378 | void kasan_cache_shrink(struct kmem_cache *cache) | 306 | size_t kasan_metadata_size(struct kmem_cache *cache) |
379 | { | 307 | { |
380 | quarantine_remove_cache(cache); | 308 | return (cache->kasan_info.alloc_meta_offset ? |
309 | sizeof(struct kasan_alloc_meta) : 0) + | ||
310 | (cache->kasan_info.free_meta_offset ? | ||
311 | sizeof(struct kasan_free_meta) : 0); | ||
381 | } | 312 | } |
382 | 313 | ||
383 | void kasan_cache_shutdown(struct kmem_cache *cache) | 314 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, |
315 | const void *object) | ||
384 | { | 316 | { |
385 | if (!__kmem_cache_empty(cache)) | 317 | BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); |
386 | quarantine_remove_cache(cache); | 318 | return (void *)object + cache->kasan_info.alloc_meta_offset; |
387 | } | 319 | } |
388 | 320 | ||
389 | size_t kasan_metadata_size(struct kmem_cache *cache) | 321 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, |
322 | const void *object) | ||
390 | { | 323 | { |
391 | return (cache->kasan_info.alloc_meta_offset ? | 324 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); |
392 | sizeof(struct kasan_alloc_meta) : 0) + | 325 | return (void *)object + cache->kasan_info.free_meta_offset; |
393 | (cache->kasan_info.free_meta_offset ? | ||
394 | sizeof(struct kasan_free_meta) : 0); | ||
395 | } | 326 | } |
396 | 327 | ||
397 | void kasan_poison_slab(struct page *page) | 328 | void kasan_poison_slab(struct page *page) |
398 | { | 329 | { |
330 | unsigned long i; | ||
331 | |||
332 | for (i = 0; i < (1 << compound_order(page)); i++) | ||
333 | page_kasan_tag_reset(page + i); | ||
399 | kasan_poison_shadow(page_address(page), | 334 | kasan_poison_shadow(page_address(page), |
400 | PAGE_SIZE << compound_order(page), | 335 | PAGE_SIZE << compound_order(page), |
401 | KASAN_KMALLOC_REDZONE); | 336 | KASAN_KMALLOC_REDZONE); |
@@ -413,92 +348,79 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) | |||
413 | KASAN_KMALLOC_REDZONE); | 348 | KASAN_KMALLOC_REDZONE); |
414 | } | 349 | } |
415 | 350 | ||
416 | static inline int in_irqentry_text(unsigned long ptr) | 351 | /* |
417 | { | 352 | * Since it's desirable to only call object contructors once during slab |
418 | return (ptr >= (unsigned long)&__irqentry_text_start && | 353 | * allocation, we preassign tags to all such objects. Also preassign tags for |
419 | ptr < (unsigned long)&__irqentry_text_end) || | 354 | * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports. |
420 | (ptr >= (unsigned long)&__softirqentry_text_start && | 355 | * For SLAB allocator we can't preassign tags randomly since the freelist is |
421 | ptr < (unsigned long)&__softirqentry_text_end); | 356 | * stored as an array of indexes instead of a linked list. Assign tags based |
422 | } | 357 | * on objects indexes, so that objects that are next to each other get |
423 | 358 | * different tags. | |
424 | static inline void filter_irq_stacks(struct stack_trace *trace) | 359 | * After a tag is assigned, the object always gets allocated with the same tag. |
360 | * The reason is that we can't change tags for objects with constructors on | ||
361 | * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor | ||
362 | * code can save the pointer to the object somewhere (e.g. in the object | ||
363 | * itself). Then if we retag it, the old saved pointer will become invalid. | ||
364 | */ | ||
365 | static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new) | ||
425 | { | 366 | { |
426 | int i; | 367 | if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
368 | return new ? KASAN_TAG_KERNEL : random_tag(); | ||
427 | 369 | ||
428 | if (!trace->nr_entries) | 370 | #ifdef CONFIG_SLAB |
429 | return; | 371 | return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); |
430 | for (i = 0; i < trace->nr_entries; i++) | 372 | #else |
431 | if (in_irqentry_text(trace->entries[i])) { | 373 | return new ? random_tag() : get_tag(object); |
432 | /* Include the irqentry function into the stack. */ | 374 | #endif |
433 | trace->nr_entries = i + 1; | ||
434 | break; | ||
435 | } | ||
436 | } | 375 | } |
437 | 376 | ||
438 | static inline depot_stack_handle_t save_stack(gfp_t flags) | 377 | void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, |
378 | const void *object) | ||
439 | { | 379 | { |
440 | unsigned long entries[KASAN_STACK_DEPTH]; | 380 | struct kasan_alloc_meta *alloc_info; |
441 | struct stack_trace trace = { | ||
442 | .nr_entries = 0, | ||
443 | .entries = entries, | ||
444 | .max_entries = KASAN_STACK_DEPTH, | ||
445 | .skip = 0 | ||
446 | }; | ||
447 | |||
448 | save_stack_trace(&trace); | ||
449 | filter_irq_stacks(&trace); | ||
450 | if (trace.nr_entries != 0 && | ||
451 | trace.entries[trace.nr_entries-1] == ULONG_MAX) | ||
452 | trace.nr_entries--; | ||
453 | 381 | ||
454 | return depot_save_stack(&trace, flags); | 382 | if (!(cache->flags & SLAB_KASAN)) |
455 | } | 383 | return (void *)object; |
456 | 384 | ||
457 | static inline void set_track(struct kasan_track *track, gfp_t flags) | 385 | alloc_info = get_alloc_info(cache, object); |
458 | { | 386 | __memset(alloc_info, 0, sizeof(*alloc_info)); |
459 | track->pid = current->pid; | ||
460 | track->stack = save_stack(flags); | ||
461 | } | ||
462 | 387 | ||
463 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | 388 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
464 | const void *object) | 389 | object = set_tag(object, assign_tag(cache, object, true)); |
465 | { | ||
466 | BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); | ||
467 | return (void *)object + cache->kasan_info.alloc_meta_offset; | ||
468 | } | ||
469 | 390 | ||
470 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | 391 | return (void *)object; |
471 | const void *object) | ||
472 | { | ||
473 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); | ||
474 | return (void *)object + cache->kasan_info.free_meta_offset; | ||
475 | } | 392 | } |
476 | 393 | ||
477 | void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) | 394 | void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, |
395 | gfp_t flags) | ||
478 | { | 396 | { |
479 | struct kasan_alloc_meta *alloc_info; | 397 | return kasan_kmalloc(cache, object, cache->object_size, flags); |
480 | |||
481 | if (!(cache->flags & SLAB_KASAN)) | ||
482 | return; | ||
483 | |||
484 | alloc_info = get_alloc_info(cache, object); | ||
485 | __memset(alloc_info, 0, sizeof(*alloc_info)); | ||
486 | } | 398 | } |
487 | 399 | ||
488 | void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) | 400 | static inline bool shadow_invalid(u8 tag, s8 shadow_byte) |
489 | { | 401 | { |
490 | kasan_kmalloc(cache, object, cache->object_size, flags); | 402 | if (IS_ENABLED(CONFIG_KASAN_GENERIC)) |
403 | return shadow_byte < 0 || | ||
404 | shadow_byte >= KASAN_SHADOW_SCALE_SIZE; | ||
405 | else | ||
406 | return tag != (u8)shadow_byte; | ||
491 | } | 407 | } |
492 | 408 | ||
493 | static bool __kasan_slab_free(struct kmem_cache *cache, void *object, | 409 | static bool __kasan_slab_free(struct kmem_cache *cache, void *object, |
494 | unsigned long ip, bool quarantine) | 410 | unsigned long ip, bool quarantine) |
495 | { | 411 | { |
496 | s8 shadow_byte; | 412 | s8 shadow_byte; |
413 | u8 tag; | ||
414 | void *tagged_object; | ||
497 | unsigned long rounded_up_size; | 415 | unsigned long rounded_up_size; |
498 | 416 | ||
417 | tag = get_tag(object); | ||
418 | tagged_object = object; | ||
419 | object = reset_tag(object); | ||
420 | |||
499 | if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != | 421 | if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != |
500 | object)) { | 422 | object)) { |
501 | kasan_report_invalid_free(object, ip); | 423 | kasan_report_invalid_free(tagged_object, ip); |
502 | return true; | 424 | return true; |
503 | } | 425 | } |
504 | 426 | ||
@@ -507,20 +429,22 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, | |||
507 | return false; | 429 | return false; |
508 | 430 | ||
509 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); | 431 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); |
510 | if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { | 432 | if (shadow_invalid(tag, shadow_byte)) { |
511 | kasan_report_invalid_free(object, ip); | 433 | kasan_report_invalid_free(tagged_object, ip); |
512 | return true; | 434 | return true; |
513 | } | 435 | } |
514 | 436 | ||
515 | rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); | 437 | rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); |
516 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | 438 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); |
517 | 439 | ||
518 | if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN))) | 440 | if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || |
441 | unlikely(!(cache->flags & SLAB_KASAN))) | ||
519 | return false; | 442 | return false; |
520 | 443 | ||
521 | set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); | 444 | set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); |
522 | quarantine_put(get_free_info(cache, object), cache); | 445 | quarantine_put(get_free_info(cache, object), cache); |
523 | return true; | 446 | |
447 | return IS_ENABLED(CONFIG_KASAN_GENERIC); | ||
524 | } | 448 | } |
525 | 449 | ||
526 | bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) | 450 | bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) |
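__kasan_slab_free() now has a mode-dependent notion of an invalid free, expressed by shadow_invalid() from the previous hunk: generic KASAN rejects shadow bytes outside 0..KASAN_SHADOW_SCALE_SIZE-1, tag-based KASAN rejects a mismatch between the pointer tag and the shadow byte. The stand-alone program below replicates that predicate, with the IS_ENABLED() compile-time switch turned into a runtime flag purely for demonstration:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define KASAN_SHADOW_SCALE_SIZE 8       /* generic KASAN granule size */

/* User-space replica of shadow_invalid() from the hunk above. */
static bool shadow_invalid(bool generic, uint8_t tag, int8_t shadow_byte)
{
        if (generic)
                return shadow_byte < 0 ||
                       shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
        else
                return tag != (uint8_t)shadow_byte;
}

int main(void)
{
        /* generic: a negative shadow byte (redzone/freed) means bad free */
        printf("generic, shadow -5:     %s\n",
               shadow_invalid(true, 0, -5) ? "invalid free" : "ok");

        /* tags: a stale pointer tag no longer matches the shadow byte */
        printf("tags, tag 0x2a vs 0x17: %s\n",
               shadow_invalid(false, 0x2a, 0x17) ? "invalid free" : "ok");
        return 0;
}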
@@ -528,33 +452,41 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) | |||
528 | return __kasan_slab_free(cache, object, ip, true); | 452 | return __kasan_slab_free(cache, object, ip, true); |
529 | } | 453 | } |
530 | 454 | ||
531 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, | 455 | void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, |
532 | gfp_t flags) | 456 | size_t size, gfp_t flags) |
533 | { | 457 | { |
534 | unsigned long redzone_start; | 458 | unsigned long redzone_start; |
535 | unsigned long redzone_end; | 459 | unsigned long redzone_end; |
460 | u8 tag; | ||
536 | 461 | ||
537 | if (gfpflags_allow_blocking(flags)) | 462 | if (gfpflags_allow_blocking(flags)) |
538 | quarantine_reduce(); | 463 | quarantine_reduce(); |
539 | 464 | ||
540 | if (unlikely(object == NULL)) | 465 | if (unlikely(object == NULL)) |
541 | return; | 466 | return NULL; |
542 | 467 | ||
543 | redzone_start = round_up((unsigned long)(object + size), | 468 | redzone_start = round_up((unsigned long)(object + size), |
544 | KASAN_SHADOW_SCALE_SIZE); | 469 | KASAN_SHADOW_SCALE_SIZE); |
545 | redzone_end = round_up((unsigned long)object + cache->object_size, | 470 | redzone_end = round_up((unsigned long)object + cache->object_size, |
546 | KASAN_SHADOW_SCALE_SIZE); | 471 | KASAN_SHADOW_SCALE_SIZE); |
547 | 472 | ||
548 | kasan_unpoison_shadow(object, size); | 473 | if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
474 | tag = assign_tag(cache, object, false); | ||
475 | |||
476 | /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ | ||
477 | kasan_unpoison_shadow(set_tag(object, tag), size); | ||
549 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 478 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
550 | KASAN_KMALLOC_REDZONE); | 479 | KASAN_KMALLOC_REDZONE); |
551 | 480 | ||
552 | if (cache->flags & SLAB_KASAN) | 481 | if (cache->flags & SLAB_KASAN) |
553 | set_track(&get_alloc_info(cache, object)->alloc_track, flags); | 482 | set_track(&get_alloc_info(cache, object)->alloc_track, flags); |
483 | |||
484 | return set_tag(object, tag); | ||
554 | } | 485 | } |
555 | EXPORT_SYMBOL(kasan_kmalloc); | 486 | EXPORT_SYMBOL(kasan_kmalloc); |
556 | 487 | ||
557 | void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) | 488 | void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, |
489 | gfp_t flags) | ||
558 | { | 490 | { |
559 | struct page *page; | 491 | struct page *page; |
560 | unsigned long redzone_start; | 492 | unsigned long redzone_start; |
@@ -564,7 +496,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) | |||
564 | quarantine_reduce(); | 496 | quarantine_reduce(); |
565 | 497 | ||
566 | if (unlikely(ptr == NULL)) | 498 | if (unlikely(ptr == NULL)) |
567 | return; | 499 | return NULL; |
568 | 500 | ||
569 | page = virt_to_page(ptr); | 501 | page = virt_to_page(ptr); |
570 | redzone_start = round_up((unsigned long)(ptr + size), | 502 | redzone_start = round_up((unsigned long)(ptr + size), |
@@ -574,21 +506,23 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) | |||
574 | kasan_unpoison_shadow(ptr, size); | 506 | kasan_unpoison_shadow(ptr, size); |
575 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 507 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
576 | KASAN_PAGE_REDZONE); | 508 | KASAN_PAGE_REDZONE); |
509 | |||
510 | return (void *)ptr; | ||
577 | } | 511 | } |
578 | 512 | ||
579 | void kasan_krealloc(const void *object, size_t size, gfp_t flags) | 513 | void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) |
580 | { | 514 | { |
581 | struct page *page; | 515 | struct page *page; |
582 | 516 | ||
583 | if (unlikely(object == ZERO_SIZE_PTR)) | 517 | if (unlikely(object == ZERO_SIZE_PTR)) |
584 | return; | 518 | return (void *)object; |
585 | 519 | ||
586 | page = virt_to_head_page(object); | 520 | page = virt_to_head_page(object); |
587 | 521 | ||
588 | if (unlikely(!PageSlab(page))) | 522 | if (unlikely(!PageSlab(page))) |
589 | kasan_kmalloc_large(object, size, flags); | 523 | return kasan_kmalloc_large(object, size, flags); |
590 | else | 524 | else |
591 | kasan_kmalloc(page->slab_cache, object, size, flags); | 525 | return kasan_kmalloc(page->slab_cache, object, size, flags); |
592 | } | 526 | } |
593 | 527 | ||
594 | void kasan_poison_kfree(void *ptr, unsigned long ip) | 528 | void kasan_poison_kfree(void *ptr, unsigned long ip) |
@@ -632,11 +566,12 @@ int kasan_module_alloc(void *addr, size_t size) | |||
632 | 566 | ||
633 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, | 567 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, |
634 | shadow_start + shadow_size, | 568 | shadow_start + shadow_size, |
635 | GFP_KERNEL | __GFP_ZERO, | 569 | GFP_KERNEL, |
636 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | 570 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, |
637 | __builtin_return_address(0)); | 571 | __builtin_return_address(0)); |
638 | 572 | ||
639 | if (ret) { | 573 | if (ret) { |
574 | __memset(ret, KASAN_SHADOW_INIT, shadow_size); | ||
640 | find_vm_area(addr)->flags |= VM_KASAN; | 575 | find_vm_area(addr)->flags |= VM_KASAN; |
641 | kmemleak_ignore(ret); | 576 | kmemleak_ignore(ret); |
642 | return 0; | 577 | return 0; |
@@ -651,147 +586,6 @@ void kasan_free_shadow(const struct vm_struct *vm) | |||
651 | vfree(kasan_mem_to_shadow(vm->addr)); | 586 | vfree(kasan_mem_to_shadow(vm->addr)); |
652 | } | 587 | } |
653 | 588 | ||
654 | static void register_global(struct kasan_global *global) | ||
655 | { | ||
656 | size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); | ||
657 | |||
658 | kasan_unpoison_shadow(global->beg, global->size); | ||
659 | |||
660 | kasan_poison_shadow(global->beg + aligned_size, | ||
661 | global->size_with_redzone - aligned_size, | ||
662 | KASAN_GLOBAL_REDZONE); | ||
663 | } | ||
664 | |||
665 | void __asan_register_globals(struct kasan_global *globals, size_t size) | ||
666 | { | ||
667 | int i; | ||
668 | |||
669 | for (i = 0; i < size; i++) | ||
670 | register_global(&globals[i]); | ||
671 | } | ||
672 | EXPORT_SYMBOL(__asan_register_globals); | ||
673 | |||
674 | void __asan_unregister_globals(struct kasan_global *globals, size_t size) | ||
675 | { | ||
676 | } | ||
677 | EXPORT_SYMBOL(__asan_unregister_globals); | ||
678 | |||
679 | #define DEFINE_ASAN_LOAD_STORE(size) \ | ||
680 | void __asan_load##size(unsigned long addr) \ | ||
681 | { \ | ||
682 | check_memory_region_inline(addr, size, false, _RET_IP_);\ | ||
683 | } \ | ||
684 | EXPORT_SYMBOL(__asan_load##size); \ | ||
685 | __alias(__asan_load##size) \ | ||
686 | void __asan_load##size##_noabort(unsigned long); \ | ||
687 | EXPORT_SYMBOL(__asan_load##size##_noabort); \ | ||
688 | void __asan_store##size(unsigned long addr) \ | ||
689 | { \ | ||
690 | check_memory_region_inline(addr, size, true, _RET_IP_); \ | ||
691 | } \ | ||
692 | EXPORT_SYMBOL(__asan_store##size); \ | ||
693 | __alias(__asan_store##size) \ | ||
694 | void __asan_store##size##_noabort(unsigned long); \ | ||
695 | EXPORT_SYMBOL(__asan_store##size##_noabort) | ||
696 | |||
697 | DEFINE_ASAN_LOAD_STORE(1); | ||
698 | DEFINE_ASAN_LOAD_STORE(2); | ||
699 | DEFINE_ASAN_LOAD_STORE(4); | ||
700 | DEFINE_ASAN_LOAD_STORE(8); | ||
701 | DEFINE_ASAN_LOAD_STORE(16); | ||
702 | |||
703 | void __asan_loadN(unsigned long addr, size_t size) | ||
704 | { | ||
705 | check_memory_region(addr, size, false, _RET_IP_); | ||
706 | } | ||
707 | EXPORT_SYMBOL(__asan_loadN); | ||
708 | |||
709 | __alias(__asan_loadN) | ||
710 | void __asan_loadN_noabort(unsigned long, size_t); | ||
711 | EXPORT_SYMBOL(__asan_loadN_noabort); | ||
712 | |||
713 | void __asan_storeN(unsigned long addr, size_t size) | ||
714 | { | ||
715 | check_memory_region(addr, size, true, _RET_IP_); | ||
716 | } | ||
717 | EXPORT_SYMBOL(__asan_storeN); | ||
718 | |||
719 | __alias(__asan_storeN) | ||
720 | void __asan_storeN_noabort(unsigned long, size_t); | ||
721 | EXPORT_SYMBOL(__asan_storeN_noabort); | ||
722 | |||
723 | /* to shut up compiler complaints */ | ||
724 | void __asan_handle_no_return(void) {} | ||
725 | EXPORT_SYMBOL(__asan_handle_no_return); | ||
726 | |||
727 | /* Emitted by compiler to poison large objects when they go out of scope. */ | ||
728 | void __asan_poison_stack_memory(const void *addr, size_t size) | ||
729 | { | ||
730 | /* | ||
731 | * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded | ||
732 | * by redzones, so we simply round up size to simplify logic. | ||
733 | */ | ||
734 | kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), | ||
735 | KASAN_USE_AFTER_SCOPE); | ||
736 | } | ||
737 | EXPORT_SYMBOL(__asan_poison_stack_memory); | ||
738 | |||
739 | /* Emitted by compiler to unpoison large objects when they go into scope. */ | ||
740 | void __asan_unpoison_stack_memory(const void *addr, size_t size) | ||
741 | { | ||
742 | kasan_unpoison_shadow(addr, size); | ||
743 | } | ||
744 | EXPORT_SYMBOL(__asan_unpoison_stack_memory); | ||
745 | |||
746 | /* Emitted by compiler to poison alloca()ed objects. */ | ||
747 | void __asan_alloca_poison(unsigned long addr, size_t size) | ||
748 | { | ||
749 | size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | ||
750 | size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - | ||
751 | rounded_up_size; | ||
752 | size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); | ||
753 | |||
754 | const void *left_redzone = (const void *)(addr - | ||
755 | KASAN_ALLOCA_REDZONE_SIZE); | ||
756 | const void *right_redzone = (const void *)(addr + rounded_up_size); | ||
757 | |||
758 | WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); | ||
759 | |||
760 | kasan_unpoison_shadow((const void *)(addr + rounded_down_size), | ||
761 | size - rounded_down_size); | ||
762 | kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, | ||
763 | KASAN_ALLOCA_LEFT); | ||
764 | kasan_poison_shadow(right_redzone, | ||
765 | padding_size + KASAN_ALLOCA_REDZONE_SIZE, | ||
766 | KASAN_ALLOCA_RIGHT); | ||
767 | } | ||
768 | EXPORT_SYMBOL(__asan_alloca_poison); | ||
769 | |||
770 | /* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ | ||
771 | void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) | ||
772 | { | ||
773 | if (unlikely(!stack_top || stack_top > stack_bottom)) | ||
774 | return; | ||
775 | |||
776 | kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); | ||
777 | } | ||
778 | EXPORT_SYMBOL(__asan_allocas_unpoison); | ||
779 | |||
780 | /* Emitted by the compiler to [un]poison local variables. */ | ||
781 | #define DEFINE_ASAN_SET_SHADOW(byte) \ | ||
782 | void __asan_set_shadow_##byte(const void *addr, size_t size) \ | ||
783 | { \ | ||
784 | __memset((void *)addr, 0x##byte, size); \ | ||
785 | } \ | ||
786 | EXPORT_SYMBOL(__asan_set_shadow_##byte) | ||
787 | |||
788 | DEFINE_ASAN_SET_SHADOW(00); | ||
789 | DEFINE_ASAN_SET_SHADOW(f1); | ||
790 | DEFINE_ASAN_SET_SHADOW(f2); | ||
791 | DEFINE_ASAN_SET_SHADOW(f3); | ||
792 | DEFINE_ASAN_SET_SHADOW(f5); | ||
793 | DEFINE_ASAN_SET_SHADOW(f8); | ||
794 | |||
795 | #ifdef CONFIG_MEMORY_HOTPLUG | 589 | #ifdef CONFIG_MEMORY_HOTPLUG |
796 | static bool shadow_mapped(unsigned long addr) | 590 | static bool shadow_mapped(unsigned long addr) |
797 | { | 591 | { |
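The __asan_load*()/__asan_store*() entry points and their *_noabort aliases removed above (and re-added in generic.c below) are never called by hand: the compiler emits a call in front of every memory access when the kernel is built with KASAN instrumentation. A minimal sketch of what that instrumentation conceptually produces for a 4-byte store; the __asan_store4 name matches the ABI above, while the surrounding struct and function are invented for the example and the real transformation happens inside GCC/Clang:

/* Provided by mm/kasan at link time (see DEFINE_ASAN_LOAD_STORE above). */
void __asan_store4(unsigned long addr);

struct foo {
	int a;
	int b;
};

void set_b(struct foo *f, int v)
{
	__asan_store4((unsigned long)&f->b);	/* emitted check: reports if the
						 * shadow marks these 4 bytes bad */
	f->b = v;				/* the original 4-byte store */
}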
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c new file mode 100644 index 000000000000..ccb6207276e3 --- /dev/null +++ b/mm/kasan/generic.c | |||
@@ -0,0 +1,344 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * This file contains core generic KASAN code. | ||
4 | * | ||
5 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
6 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | ||
7 | * | ||
8 | * Some code borrowed from https://github.com/xairy/kasan-prototype by | ||
9 | * Andrey Konovalov <andreyknvl@gmail.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License version 2 as | ||
13 | * published by the Free Software Foundation. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
18 | #define DISABLE_BRANCH_PROFILING | ||
19 | |||
20 | #include <linux/export.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/kasan.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/kmemleak.h> | ||
26 | #include <linux/linkage.h> | ||
27 | #include <linux/memblock.h> | ||
28 | #include <linux/memory.h> | ||
29 | #include <linux/mm.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/printk.h> | ||
32 | #include <linux/sched.h> | ||
33 | #include <linux/sched/task_stack.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/stacktrace.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/types.h> | ||
38 | #include <linux/vmalloc.h> | ||
39 | #include <linux/bug.h> | ||
40 | |||
41 | #include "kasan.h" | ||
42 | #include "../slab.h" | ||
43 | |||
44 | /* | ||
45 | * All functions below are always inlined, so the compiler can | ||
46 | * perform better optimizations in each of __asan_loadX/__asan_storeX | ||
47 | * depending on the memory access size X. | ||
48 | */ | ||
49 | |||
50 | static __always_inline bool memory_is_poisoned_1(unsigned long addr) | ||
51 | { | ||
52 | s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); | ||
53 | |||
54 | if (unlikely(shadow_value)) { | ||
55 | s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; | ||
56 | return unlikely(last_accessible_byte >= shadow_value); | ||
57 | } | ||
58 | |||
59 | return false; | ||
60 | } | ||
61 | |||
62 | static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, | ||
63 | unsigned long size) | ||
64 | { | ||
65 | u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); | ||
66 | |||
67 | /* | ||
68 | * The access crosses an 8-byte (shadow size) boundary. Such an access | ||
69 | * maps into 2 shadow bytes, so we need to check them both. | ||
70 | */ | ||
71 | if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) | ||
72 | return *shadow_addr || memory_is_poisoned_1(addr + size - 1); | ||
73 | |||
74 | return memory_is_poisoned_1(addr + size - 1); | ||
75 | } | ||
76 | |||
77 | static __always_inline bool memory_is_poisoned_16(unsigned long addr) | ||
78 | { | ||
79 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
80 | |||
81 | /* An unaligned 16-byte access maps into 3 shadow bytes. */ | ||
82 | if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) | ||
83 | return *shadow_addr || memory_is_poisoned_1(addr + 15); | ||
84 | |||
85 | return *shadow_addr; | ||
86 | } | ||
87 | |||
88 | static __always_inline unsigned long bytes_is_nonzero(const u8 *start, | ||
89 | size_t size) | ||
90 | { | ||
91 | while (size) { | ||
92 | if (unlikely(*start)) | ||
93 | return (unsigned long)start; | ||
94 | start++; | ||
95 | size--; | ||
96 | } | ||
97 | |||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | static __always_inline unsigned long memory_is_nonzero(const void *start, | ||
102 | const void *end) | ||
103 | { | ||
104 | unsigned int words; | ||
105 | unsigned long ret; | ||
106 | unsigned int prefix = (unsigned long)start % 8; | ||
107 | |||
108 | if (end - start <= 16) | ||
109 | return bytes_is_nonzero(start, end - start); | ||
110 | |||
111 | if (prefix) { | ||
112 | prefix = 8 - prefix; | ||
113 | ret = bytes_is_nonzero(start, prefix); | ||
114 | if (unlikely(ret)) | ||
115 | return ret; | ||
116 | start += prefix; | ||
117 | } | ||
118 | |||
119 | words = (end - start) / 8; | ||
120 | while (words) { | ||
121 | if (unlikely(*(u64 *)start)) | ||
122 | return bytes_is_nonzero(start, 8); | ||
123 | start += 8; | ||
124 | words--; | ||
125 | } | ||
126 | |||
127 | return bytes_is_nonzero(start, (end - start) % 8); | ||
128 | } | ||
129 | |||
130 | static __always_inline bool memory_is_poisoned_n(unsigned long addr, | ||
131 | size_t size) | ||
132 | { | ||
133 | unsigned long ret; | ||
134 | |||
135 | ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), | ||
136 | kasan_mem_to_shadow((void *)addr + size - 1) + 1); | ||
137 | |||
138 | if (unlikely(ret)) { | ||
139 | unsigned long last_byte = addr + size - 1; | ||
140 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | ||
141 | |||
142 | if (unlikely(ret != (unsigned long)last_shadow || | ||
143 | ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | ||
144 | return true; | ||
145 | } | ||
146 | return false; | ||
147 | } | ||
148 | |||
149 | static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | ||
150 | { | ||
151 | if (__builtin_constant_p(size)) { | ||
152 | switch (size) { | ||
153 | case 1: | ||
154 | return memory_is_poisoned_1(addr); | ||
155 | case 2: | ||
156 | case 4: | ||
157 | case 8: | ||
158 | return memory_is_poisoned_2_4_8(addr, size); | ||
159 | case 16: | ||
160 | return memory_is_poisoned_16(addr); | ||
161 | default: | ||
162 | BUILD_BUG(); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | return memory_is_poisoned_n(addr, size); | ||
167 | } | ||
168 | |||
169 | static __always_inline void check_memory_region_inline(unsigned long addr, | ||
170 | size_t size, bool write, | ||
171 | unsigned long ret_ip) | ||
172 | { | ||
173 | if (unlikely(size == 0)) | ||
174 | return; | ||
175 | |||
176 | if (unlikely((void *)addr < | ||
177 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
178 | kasan_report(addr, size, write, ret_ip); | ||
179 | return; | ||
180 | } | ||
181 | |||
182 | if (likely(!memory_is_poisoned(addr, size))) | ||
183 | return; | ||
184 | |||
185 | kasan_report(addr, size, write, ret_ip); | ||
186 | } | ||
187 | |||
188 | void check_memory_region(unsigned long addr, size_t size, bool write, | ||
189 | unsigned long ret_ip) | ||
190 | { | ||
191 | check_memory_region_inline(addr, size, write, ret_ip); | ||
192 | } | ||
193 | |||
194 | void kasan_cache_shrink(struct kmem_cache *cache) | ||
195 | { | ||
196 | quarantine_remove_cache(cache); | ||
197 | } | ||
198 | |||
199 | void kasan_cache_shutdown(struct kmem_cache *cache) | ||
200 | { | ||
201 | if (!__kmem_cache_empty(cache)) | ||
202 | quarantine_remove_cache(cache); | ||
203 | } | ||
204 | |||
205 | static void register_global(struct kasan_global *global) | ||
206 | { | ||
207 | size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); | ||
208 | |||
209 | kasan_unpoison_shadow(global->beg, global->size); | ||
210 | |||
211 | kasan_poison_shadow(global->beg + aligned_size, | ||
212 | global->size_with_redzone - aligned_size, | ||
213 | KASAN_GLOBAL_REDZONE); | ||
214 | } | ||
215 | |||
216 | void __asan_register_globals(struct kasan_global *globals, size_t size) | ||
217 | { | ||
218 | int i; | ||
219 | |||
220 | for (i = 0; i < size; i++) | ||
221 | register_global(&globals[i]); | ||
222 | } | ||
223 | EXPORT_SYMBOL(__asan_register_globals); | ||
224 | |||
225 | void __asan_unregister_globals(struct kasan_global *globals, size_t size) | ||
226 | { | ||
227 | } | ||
228 | EXPORT_SYMBOL(__asan_unregister_globals); | ||
229 | |||
230 | #define DEFINE_ASAN_LOAD_STORE(size) \ | ||
231 | void __asan_load##size(unsigned long addr) \ | ||
232 | { \ | ||
233 | check_memory_region_inline(addr, size, false, _RET_IP_);\ | ||
234 | } \ | ||
235 | EXPORT_SYMBOL(__asan_load##size); \ | ||
236 | __alias(__asan_load##size) \ | ||
237 | void __asan_load##size##_noabort(unsigned long); \ | ||
238 | EXPORT_SYMBOL(__asan_load##size##_noabort); \ | ||
239 | void __asan_store##size(unsigned long addr) \ | ||
240 | { \ | ||
241 | check_memory_region_inline(addr, size, true, _RET_IP_); \ | ||
242 | } \ | ||
243 | EXPORT_SYMBOL(__asan_store##size); \ | ||
244 | __alias(__asan_store##size) \ | ||
245 | void __asan_store##size##_noabort(unsigned long); \ | ||
246 | EXPORT_SYMBOL(__asan_store##size##_noabort) | ||
247 | |||
248 | DEFINE_ASAN_LOAD_STORE(1); | ||
249 | DEFINE_ASAN_LOAD_STORE(2); | ||
250 | DEFINE_ASAN_LOAD_STORE(4); | ||
251 | DEFINE_ASAN_LOAD_STORE(8); | ||
252 | DEFINE_ASAN_LOAD_STORE(16); | ||
253 | |||
254 | void __asan_loadN(unsigned long addr, size_t size) | ||
255 | { | ||
256 | check_memory_region(addr, size, false, _RET_IP_); | ||
257 | } | ||
258 | EXPORT_SYMBOL(__asan_loadN); | ||
259 | |||
260 | __alias(__asan_loadN) | ||
261 | void __asan_loadN_noabort(unsigned long, size_t); | ||
262 | EXPORT_SYMBOL(__asan_loadN_noabort); | ||
263 | |||
264 | void __asan_storeN(unsigned long addr, size_t size) | ||
265 | { | ||
266 | check_memory_region(addr, size, true, _RET_IP_); | ||
267 | } | ||
268 | EXPORT_SYMBOL(__asan_storeN); | ||
269 | |||
270 | __alias(__asan_storeN) | ||
271 | void __asan_storeN_noabort(unsigned long, size_t); | ||
272 | EXPORT_SYMBOL(__asan_storeN_noabort); | ||
273 | |||
274 | /* to shut up compiler complaints */ | ||
275 | void __asan_handle_no_return(void) {} | ||
276 | EXPORT_SYMBOL(__asan_handle_no_return); | ||
277 | |||
278 | /* Emitted by compiler to poison large objects when they go out of scope. */ | ||
279 | void __asan_poison_stack_memory(const void *addr, size_t size) | ||
280 | { | ||
281 | /* | ||
282 | * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded | ||
283 | * by redzones, so we simply round up size to simplify logic. | ||
284 | */ | ||
285 | kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), | ||
286 | KASAN_USE_AFTER_SCOPE); | ||
287 | } | ||
288 | EXPORT_SYMBOL(__asan_poison_stack_memory); | ||
289 | |||
290 | /* Emitted by compiler to unpoison large objects when they go into scope. */ | ||
291 | void __asan_unpoison_stack_memory(const void *addr, size_t size) | ||
292 | { | ||
293 | kasan_unpoison_shadow(addr, size); | ||
294 | } | ||
295 | EXPORT_SYMBOL(__asan_unpoison_stack_memory); | ||
296 | |||
297 | /* Emitted by compiler to poison alloca()ed objects. */ | ||
298 | void __asan_alloca_poison(unsigned long addr, size_t size) | ||
299 | { | ||
300 | size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | ||
301 | size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - | ||
302 | rounded_up_size; | ||
303 | size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); | ||
304 | |||
305 | const void *left_redzone = (const void *)(addr - | ||
306 | KASAN_ALLOCA_REDZONE_SIZE); | ||
307 | const void *right_redzone = (const void *)(addr + rounded_up_size); | ||
308 | |||
309 | WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); | ||
310 | |||
311 | kasan_unpoison_shadow((const void *)(addr + rounded_down_size), | ||
312 | size - rounded_down_size); | ||
313 | kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, | ||
314 | KASAN_ALLOCA_LEFT); | ||
315 | kasan_poison_shadow(right_redzone, | ||
316 | padding_size + KASAN_ALLOCA_REDZONE_SIZE, | ||
317 | KASAN_ALLOCA_RIGHT); | ||
318 | } | ||
319 | EXPORT_SYMBOL(__asan_alloca_poison); | ||
320 | |||
321 | /* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ | ||
322 | void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) | ||
323 | { | ||
324 | if (unlikely(!stack_top || stack_top > stack_bottom)) | ||
325 | return; | ||
326 | |||
327 | kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); | ||
328 | } | ||
329 | EXPORT_SYMBOL(__asan_allocas_unpoison); | ||
330 | |||
331 | /* Emitted by the compiler to [un]poison local variables. */ | ||
332 | #define DEFINE_ASAN_SET_SHADOW(byte) \ | ||
333 | void __asan_set_shadow_##byte(const void *addr, size_t size) \ | ||
334 | { \ | ||
335 | __memset((void *)addr, 0x##byte, size); \ | ||
336 | } \ | ||
337 | EXPORT_SYMBOL(__asan_set_shadow_##byte) | ||
338 | |||
339 | DEFINE_ASAN_SET_SHADOW(00); | ||
340 | DEFINE_ASAN_SET_SHADOW(f1); | ||
341 | DEFINE_ASAN_SET_SHADOW(f2); | ||
342 | DEFINE_ASAN_SET_SHADOW(f3); | ||
343 | DEFINE_ASAN_SET_SHADOW(f5); | ||
344 | DEFINE_ASAN_SET_SHADOW(f8); | ||
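For readers new to the generic mode, the encoding that memory_is_poisoned_1() and friends rely on is: each shadow byte covers KASAN_SHADOW_SCALE_SIZE (8) bytes of memory; 0 means the whole granule is accessible, a value 1..7 means only that many leading bytes are, and a negative value is one of the KASAN_* poison markers. A minimal user-space model of that check follows; the shadow array, the helper names and main() are invented for the example, only the encoding itself mirrors the code above:

#include <stdbool.h>
#include <stdio.h>

#define SCALE_SHIFT	3
#define SCALE_SIZE	(1UL << SCALE_SHIFT)
#define SCALE_MASK	(SCALE_SIZE - 1)

static signed char shadow[16];	/* covers a toy 128-byte "address space" */

/* Same test as memory_is_poisoned_1(), minus the kernel plumbing. */
static bool byte_is_poisoned(unsigned long addr)
{
	signed char s = shadow[addr >> SCALE_SHIFT];

	if (s == 0)
		return false;				/* whole granule accessible */
	if (s < 0)
		return true;				/* redzone or freed marker */
	return (signed char)(addr & SCALE_MASK) >= s;	/* partially usable granule */
}

int main(void)
{
	shadow[1] = 5;	/* bytes 8..12 usable, 13..15 poisoned */
	printf("%d %d\n", byte_is_poisoned(12), byte_is_poisoned(13));	/* prints "0 1" */
	return 0;
}

The multi-byte checkers above are just optimized forms of this test applied to the one or two shadow bytes an access can span.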
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c new file mode 100644 index 000000000000..5e12035888f2 --- /dev/null +++ b/mm/kasan/generic_report.c | |||
@@ -0,0 +1,153 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * This file contains generic KASAN specific error reporting code. | ||
4 | * | ||
5 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
6 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | ||
7 | * | ||
8 | * Some code borrowed from https://github.com/xairy/kasan-prototype by | ||
9 | * Andrey Konovalov <andreyknvl@gmail.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License version 2 as | ||
13 | * published by the Free Software Foundation. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/bitops.h> | ||
18 | #include <linux/ftrace.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/printk.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/stackdepot.h> | ||
26 | #include <linux/stacktrace.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/kasan.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | #include <asm/sections.h> | ||
33 | |||
34 | #include "kasan.h" | ||
35 | #include "../slab.h" | ||
36 | |||
37 | void *find_first_bad_addr(void *addr, size_t size) | ||
38 | { | ||
39 | void *p = addr; | ||
40 | |||
41 | while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) | ||
42 | p += KASAN_SHADOW_SCALE_SIZE; | ||
43 | return p; | ||
44 | } | ||
45 | |||
46 | static const char *get_shadow_bug_type(struct kasan_access_info *info) | ||
47 | { | ||
48 | const char *bug_type = "unknown-crash"; | ||
49 | u8 *shadow_addr; | ||
50 | |||
51 | shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); | ||
52 | |||
53 | /* | ||
54 | * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look | ||
55 | * at the next shadow byte to determine the type of the bad access. | ||
56 | */ | ||
57 | if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) | ||
58 | shadow_addr++; | ||
59 | |||
60 | switch (*shadow_addr) { | ||
61 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
62 | /* | ||
63 | * In theory it's still possible to see these shadow values | ||
64 | * due to a data race in the kernel code. | ||
65 | */ | ||
66 | bug_type = "out-of-bounds"; | ||
67 | break; | ||
68 | case KASAN_PAGE_REDZONE: | ||
69 | case KASAN_KMALLOC_REDZONE: | ||
70 | bug_type = "slab-out-of-bounds"; | ||
71 | break; | ||
72 | case KASAN_GLOBAL_REDZONE: | ||
73 | bug_type = "global-out-of-bounds"; | ||
74 | break; | ||
75 | case KASAN_STACK_LEFT: | ||
76 | case KASAN_STACK_MID: | ||
77 | case KASAN_STACK_RIGHT: | ||
78 | case KASAN_STACK_PARTIAL: | ||
79 | bug_type = "stack-out-of-bounds"; | ||
80 | break; | ||
81 | case KASAN_FREE_PAGE: | ||
82 | case KASAN_KMALLOC_FREE: | ||
83 | bug_type = "use-after-free"; | ||
84 | break; | ||
85 | case KASAN_USE_AFTER_SCOPE: | ||
86 | bug_type = "use-after-scope"; | ||
87 | break; | ||
88 | case KASAN_ALLOCA_LEFT: | ||
89 | case KASAN_ALLOCA_RIGHT: | ||
90 | bug_type = "alloca-out-of-bounds"; | ||
91 | break; | ||
92 | } | ||
93 | |||
94 | return bug_type; | ||
95 | } | ||
96 | |||
97 | static const char *get_wild_bug_type(struct kasan_access_info *info) | ||
98 | { | ||
99 | const char *bug_type = "unknown-crash"; | ||
100 | |||
101 | if ((unsigned long)info->access_addr < PAGE_SIZE) | ||
102 | bug_type = "null-ptr-deref"; | ||
103 | else if ((unsigned long)info->access_addr < TASK_SIZE) | ||
104 | bug_type = "user-memory-access"; | ||
105 | else | ||
106 | bug_type = "wild-memory-access"; | ||
107 | |||
108 | return bug_type; | ||
109 | } | ||
110 | |||
111 | const char *get_bug_type(struct kasan_access_info *info) | ||
112 | { | ||
113 | if (addr_has_shadow(info->access_addr)) | ||
114 | return get_shadow_bug_type(info); | ||
115 | return get_wild_bug_type(info); | ||
116 | } | ||
117 | |||
118 | #define DEFINE_ASAN_REPORT_LOAD(size) \ | ||
119 | void __asan_report_load##size##_noabort(unsigned long addr) \ | ||
120 | { \ | ||
121 | kasan_report(addr, size, false, _RET_IP_); \ | ||
122 | } \ | ||
123 | EXPORT_SYMBOL(__asan_report_load##size##_noabort) | ||
124 | |||
125 | #define DEFINE_ASAN_REPORT_STORE(size) \ | ||
126 | void __asan_report_store##size##_noabort(unsigned long addr) \ | ||
127 | { \ | ||
128 | kasan_report(addr, size, true, _RET_IP_); \ | ||
129 | } \ | ||
130 | EXPORT_SYMBOL(__asan_report_store##size##_noabort) | ||
131 | |||
132 | DEFINE_ASAN_REPORT_LOAD(1); | ||
133 | DEFINE_ASAN_REPORT_LOAD(2); | ||
134 | DEFINE_ASAN_REPORT_LOAD(4); | ||
135 | DEFINE_ASAN_REPORT_LOAD(8); | ||
136 | DEFINE_ASAN_REPORT_LOAD(16); | ||
137 | DEFINE_ASAN_REPORT_STORE(1); | ||
138 | DEFINE_ASAN_REPORT_STORE(2); | ||
139 | DEFINE_ASAN_REPORT_STORE(4); | ||
140 | DEFINE_ASAN_REPORT_STORE(8); | ||
141 | DEFINE_ASAN_REPORT_STORE(16); | ||
142 | |||
143 | void __asan_report_load_n_noabort(unsigned long addr, size_t size) | ||
144 | { | ||
145 | kasan_report(addr, size, false, _RET_IP_); | ||
146 | } | ||
147 | EXPORT_SYMBOL(__asan_report_load_n_noabort); | ||
148 | |||
149 | void __asan_report_store_n_noabort(unsigned long addr, size_t size) | ||
150 | { | ||
151 | kasan_report(addr, size, true, _RET_IP_); | ||
152 | } | ||
153 | EXPORT_SYMBOL(__asan_report_store_n_noabort); | ||
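A concrete, invented example of the classification in get_shadow_bug_type(): for a kmalloc(10) object the generic shadow holds 0x00 for the first 8-byte granule, 0x02 for the second (only two bytes valid) and KASAN_KMALLOC_REDZONE (0xFC per kasan.h) for the granules behind it. A one-byte read at offset 10 makes find_first_bad_addr() stop at the 0x02 granule; because 0x02 is a partial value in [0, KASAN_SHADOW_SCALE_SIZE), the code peeks at the next shadow byte, finds KASAN_KMALLOC_REDZONE and reports "slab-out-of-bounds" instead of the vaguer "out-of-bounds".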
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c index c7550eb65922..34afad56497b 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/init.c | |||
@@ -1,3 +1,4 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
1 | /* | 2 | /* |
2 | * This file contains some kasan initialization code. | 3 | * This file contains some kasan initialization code. |
3 | * | 4 | * |
@@ -30,13 +31,13 @@ | |||
30 | * - Later it is reused as the zero shadow to cover large ranges of memory | 31 | * - Later it is reused as the zero shadow to cover large ranges of memory |
31 | * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...). | 32 | * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...). |
32 | */ | 33 | */ |
33 | unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; | 34 | unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss; |
34 | 35 | ||
35 | #if CONFIG_PGTABLE_LEVELS > 4 | 36 | #if CONFIG_PGTABLE_LEVELS > 4 |
36 | p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; | 37 | p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; |
37 | static inline bool kasan_p4d_table(pgd_t pgd) | 38 | static inline bool kasan_p4d_table(pgd_t pgd) |
38 | { | 39 | { |
39 | return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); | 40 | return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d)); |
40 | } | 41 | } |
41 | #else | 42 | #else |
42 | static inline bool kasan_p4d_table(pgd_t pgd) | 43 | static inline bool kasan_p4d_table(pgd_t pgd) |
@@ -45,10 +46,10 @@ static inline bool kasan_p4d_table(pgd_t pgd) | |||
45 | } | 46 | } |
46 | #endif | 47 | #endif |
47 | #if CONFIG_PGTABLE_LEVELS > 3 | 48 | #if CONFIG_PGTABLE_LEVELS > 3 |
48 | pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; | 49 | pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss; |
49 | static inline bool kasan_pud_table(p4d_t p4d) | 50 | static inline bool kasan_pud_table(p4d_t p4d) |
50 | { | 51 | { |
51 | return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); | 52 | return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); |
52 | } | 53 | } |
53 | #else | 54 | #else |
54 | static inline bool kasan_pud_table(p4d_t p4d) | 55 | static inline bool kasan_pud_table(p4d_t p4d) |
@@ -57,10 +58,10 @@ static inline bool kasan_pud_table(p4d_t p4d) | |||
57 | } | 58 | } |
58 | #endif | 59 | #endif |
59 | #if CONFIG_PGTABLE_LEVELS > 2 | 60 | #if CONFIG_PGTABLE_LEVELS > 2 |
60 | pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; | 61 | pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss; |
61 | static inline bool kasan_pmd_table(pud_t pud) | 62 | static inline bool kasan_pmd_table(pud_t pud) |
62 | { | 63 | { |
63 | return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); | 64 | return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); |
64 | } | 65 | } |
65 | #else | 66 | #else |
66 | static inline bool kasan_pmd_table(pud_t pud) | 67 | static inline bool kasan_pmd_table(pud_t pud) |
@@ -68,16 +69,16 @@ static inline bool kasan_pmd_table(pud_t pud) | |||
68 | return 0; | 69 | return 0; |
69 | } | 70 | } |
70 | #endif | 71 | #endif |
71 | pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; | 72 | pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; |
72 | 73 | ||
73 | static inline bool kasan_pte_table(pmd_t pmd) | 74 | static inline bool kasan_pte_table(pmd_t pmd) |
74 | { | 75 | { |
75 | return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); | 76 | return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); |
76 | } | 77 | } |
77 | 78 | ||
78 | static inline bool kasan_zero_page_entry(pte_t pte) | 79 | static inline bool kasan_early_shadow_page_entry(pte_t pte) |
79 | { | 80 | { |
80 | return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); | 81 | return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page)); |
81 | } | 82 | } |
82 | 83 | ||
83 | static __init void *early_alloc(size_t size, int node) | 84 | static __init void *early_alloc(size_t size, int node) |
@@ -92,7 +93,8 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, | |||
92 | pte_t *pte = pte_offset_kernel(pmd, addr); | 93 | pte_t *pte = pte_offset_kernel(pmd, addr); |
93 | pte_t zero_pte; | 94 | pte_t zero_pte; |
94 | 95 | ||
95 | zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); | 96 | zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)), |
97 | PAGE_KERNEL); | ||
96 | zero_pte = pte_wrprotect(zero_pte); | 98 | zero_pte = pte_wrprotect(zero_pte); |
97 | 99 | ||
98 | while (addr + PAGE_SIZE <= end) { | 100 | while (addr + PAGE_SIZE <= end) { |
@@ -112,7 +114,8 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, | |||
112 | next = pmd_addr_end(addr, end); | 114 | next = pmd_addr_end(addr, end); |
113 | 115 | ||
114 | if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { | 116 | if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { |
115 | pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); | 117 | pmd_populate_kernel(&init_mm, pmd, |
118 | lm_alias(kasan_early_shadow_pte)); | ||
116 | continue; | 119 | continue; |
117 | } | 120 | } |
118 | 121 | ||
@@ -145,9 +148,11 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, | |||
145 | if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { | 148 | if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { |
146 | pmd_t *pmd; | 149 | pmd_t *pmd; |
147 | 150 | ||
148 | pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); | 151 | pud_populate(&init_mm, pud, |
152 | lm_alias(kasan_early_shadow_pmd)); | ||
149 | pmd = pmd_offset(pud, addr); | 153 | pmd = pmd_offset(pud, addr); |
150 | pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); | 154 | pmd_populate_kernel(&init_mm, pmd, |
155 | lm_alias(kasan_early_shadow_pte)); | ||
151 | continue; | 156 | continue; |
152 | } | 157 | } |
153 | 158 | ||
@@ -181,12 +186,14 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, | |||
181 | pud_t *pud; | 186 | pud_t *pud; |
182 | pmd_t *pmd; | 187 | pmd_t *pmd; |
183 | 188 | ||
184 | p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); | 189 | p4d_populate(&init_mm, p4d, |
190 | lm_alias(kasan_early_shadow_pud)); | ||
185 | pud = pud_offset(p4d, addr); | 191 | pud = pud_offset(p4d, addr); |
186 | pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); | 192 | pud_populate(&init_mm, pud, |
193 | lm_alias(kasan_early_shadow_pmd)); | ||
187 | pmd = pmd_offset(pud, addr); | 194 | pmd = pmd_offset(pud, addr); |
188 | pmd_populate_kernel(&init_mm, pmd, | 195 | pmd_populate_kernel(&init_mm, pmd, |
189 | lm_alias(kasan_zero_pte)); | 196 | lm_alias(kasan_early_shadow_pte)); |
190 | continue; | 197 | continue; |
191 | } | 198 | } |
192 | 199 | ||
@@ -209,13 +216,13 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, | |||
209 | } | 216 | } |
210 | 217 | ||
211 | /** | 218 | /** |
212 | * kasan_populate_zero_shadow - populate shadow memory region with | 219 | * kasan_populate_early_shadow - populate shadow memory region with |
213 | * kasan_zero_page | 220 | * kasan_early_shadow_page |
214 | * @shadow_start - start of the memory range to populate | 221 | * @shadow_start - start of the memory range to populate |
215 | * @shadow_end - end of the memory range to populate | 222 | * @shadow_end - end of the memory range to populate |
216 | */ | 223 | */ |
217 | int __ref kasan_populate_zero_shadow(const void *shadow_start, | 224 | int __ref kasan_populate_early_shadow(const void *shadow_start, |
218 | const void *shadow_end) | 225 | const void *shadow_end) |
219 | { | 226 | { |
220 | unsigned long addr = (unsigned long)shadow_start; | 227 | unsigned long addr = (unsigned long)shadow_start; |
221 | unsigned long end = (unsigned long)shadow_end; | 228 | unsigned long end = (unsigned long)shadow_end; |
@@ -231,7 +238,7 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start, | |||
231 | pmd_t *pmd; | 238 | pmd_t *pmd; |
232 | 239 | ||
233 | /* | 240 | /* |
234 | * kasan_zero_pud should be populated with pmds | 241 | * kasan_early_shadow_pud should be populated with pmds |
235 | * at this moment. | 242 | * at this moment. |
236 | * [pud,pmd]_populate*() below needed only for | 243 | * [pud,pmd]_populate*() below needed only for |
237 | * 3,2 - level page tables where we don't have | 244 | * 3,2 - level page tables where we don't have |
@@ -241,21 +248,25 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start, | |||
241 | * The ifndef is required to avoid build breakage. | 248 | * The ifndef is required to avoid build breakage. |
242 | * | 249 | * |
243 | * With 5level-fixup.h, pgd_populate() is not nop and | 250 | * With 5level-fixup.h, pgd_populate() is not nop and |
244 | * we reference kasan_zero_p4d. It's not defined | 251 | * we reference kasan_early_shadow_p4d. It's not defined |
245 | * unless 5-level paging enabled. | 252 | * unless 5-level paging enabled. |
246 | * | 253 | * |
247 | * The ifndef can be dropped once all KASAN-enabled | 254 | * The ifndef can be dropped once all KASAN-enabled |
248 | * architectures will switch to pgtable-nop4d.h. | 255 | * architectures will switch to pgtable-nop4d.h. |
249 | */ | 256 | */ |
250 | #ifndef __ARCH_HAS_5LEVEL_HACK | 257 | #ifndef __ARCH_HAS_5LEVEL_HACK |
251 | pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); | 258 | pgd_populate(&init_mm, pgd, |
259 | lm_alias(kasan_early_shadow_p4d)); | ||
252 | #endif | 260 | #endif |
253 | p4d = p4d_offset(pgd, addr); | 261 | p4d = p4d_offset(pgd, addr); |
254 | p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); | 262 | p4d_populate(&init_mm, p4d, |
263 | lm_alias(kasan_early_shadow_pud)); | ||
255 | pud = pud_offset(p4d, addr); | 264 | pud = pud_offset(p4d, addr); |
256 | pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); | 265 | pud_populate(&init_mm, pud, |
266 | lm_alias(kasan_early_shadow_pmd)); | ||
257 | pmd = pmd_offset(pud, addr); | 267 | pmd = pmd_offset(pud, addr); |
258 | pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); | 268 | pmd_populate_kernel(&init_mm, pmd, |
269 | lm_alias(kasan_early_shadow_pte)); | ||
259 | continue; | 270 | continue; |
260 | } | 271 | } |
261 | 272 | ||
@@ -350,7 +361,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, | |||
350 | if (!pte_present(*pte)) | 361 | if (!pte_present(*pte)) |
351 | continue; | 362 | continue; |
352 | 363 | ||
353 | if (WARN_ON(!kasan_zero_page_entry(*pte))) | 364 | if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) |
354 | continue; | 365 | continue; |
355 | pte_clear(&init_mm, addr, pte); | 366 | pte_clear(&init_mm, addr, pte); |
356 | } | 367 | } |
@@ -480,7 +491,7 @@ int kasan_add_zero_shadow(void *start, unsigned long size) | |||
480 | WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) | 491 | WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) |
481 | return -EINVAL; | 492 | return -EINVAL; |
482 | 493 | ||
483 | ret = kasan_populate_zero_shadow(shadow_start, shadow_end); | 494 | ret = kasan_populate_early_shadow(shadow_start, shadow_end); |
484 | if (ret) | 495 | if (ret) |
485 | kasan_remove_zero_shadow(shadow_start, | 496 | kasan_remove_zero_shadow(shadow_start, |
486 | size >> KASAN_SHADOW_SCALE_SHIFT); | 497 | size >> KASAN_SHADOW_SCALE_SHIFT); |
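The early (zero) shadow populated above backs the shadow of regions that are accessible but not tracked (vmalloc, vmemmap, ...) with a single write-protected page whose bytes are all zero, meaning "fully accessible", and with one shared page-table page per level instead of real shadow allocations. Since one shadow byte covers 2^KASAN_SHADOW_SCALE_SHIFT = 8 bytes, the savings compound quickly; on a 4 KiB-page, 4-level configuration (figures given for illustration, not taken from the patch) one 4 KiB shadow page covers 32 KiB of address space, a single shared PTE table covers 16 MiB, a shared PMD table 8 GiB and a shared PUD table about 4 TiB. The translation those tables serve is the usual one:

/* Sketch of the generic address-to-shadow mapping used throughout mm/kasan;
 * KASAN_SHADOW_OFFSET is architecture-specific. */
shadow = (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;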
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c12dcfde2ebd..ea51b2d898ec 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -8,10 +8,22 @@ | |||
8 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | 8 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) |
9 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | 9 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) |
10 | 10 | ||
11 | #define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ | ||
12 | #define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ | ||
13 | #define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ | ||
14 | |||
15 | #ifdef CONFIG_KASAN_GENERIC | ||
11 | #define KASAN_FREE_PAGE 0xFF /* page was freed */ | 16 | #define KASAN_FREE_PAGE 0xFF /* page was freed */ |
12 | #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ | 17 | #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ |
13 | #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ | 18 | #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ |
14 | #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ | 19 | #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ |
20 | #else | ||
21 | #define KASAN_FREE_PAGE KASAN_TAG_INVALID | ||
22 | #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID | ||
23 | #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID | ||
24 | #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID | ||
25 | #endif | ||
26 | |||
15 | #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ | 27 | #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ |
16 | 28 | ||
17 | /* | 29 | /* |
@@ -105,11 +117,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | |||
105 | << KASAN_SHADOW_SCALE_SHIFT); | 117 | << KASAN_SHADOW_SCALE_SHIFT); |
106 | } | 118 | } |
107 | 119 | ||
120 | static inline bool addr_has_shadow(const void *addr) | ||
121 | { | ||
122 | return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); | ||
123 | } | ||
124 | |||
125 | void kasan_poison_shadow(const void *address, size_t size, u8 value); | ||
126 | |||
127 | void check_memory_region(unsigned long addr, size_t size, bool write, | ||
128 | unsigned long ret_ip); | ||
129 | |||
130 | void *find_first_bad_addr(void *addr, size_t size); | ||
131 | const char *get_bug_type(struct kasan_access_info *info); | ||
132 | |||
108 | void kasan_report(unsigned long addr, size_t size, | 133 | void kasan_report(unsigned long addr, size_t size, |
109 | bool is_write, unsigned long ip); | 134 | bool is_write, unsigned long ip); |
110 | void kasan_report_invalid_free(void *object, unsigned long ip); | 135 | void kasan_report_invalid_free(void *object, unsigned long ip); |
111 | 136 | ||
112 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) | 137 | #if defined(CONFIG_KASAN_GENERIC) && \ |
138 | (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) | ||
113 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); | 139 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); |
114 | void quarantine_reduce(void); | 140 | void quarantine_reduce(void); |
115 | void quarantine_remove_cache(struct kmem_cache *cache); | 141 | void quarantine_remove_cache(struct kmem_cache *cache); |
@@ -120,6 +146,37 @@ static inline void quarantine_reduce(void) { } | |||
120 | static inline void quarantine_remove_cache(struct kmem_cache *cache) { } | 146 | static inline void quarantine_remove_cache(struct kmem_cache *cache) { } |
121 | #endif | 147 | #endif |
122 | 148 | ||
149 | #ifdef CONFIG_KASAN_SW_TAGS | ||
150 | |||
151 | void print_tags(u8 addr_tag, const void *addr); | ||
152 | |||
153 | u8 random_tag(void); | ||
154 | |||
155 | #else | ||
156 | |||
157 | static inline void print_tags(u8 addr_tag, const void *addr) { } | ||
158 | |||
159 | static inline u8 random_tag(void) | ||
160 | { | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | #endif | ||
165 | |||
166 | #ifndef arch_kasan_set_tag | ||
167 | #define arch_kasan_set_tag(addr, tag) ((void *)(addr)) | ||
168 | #endif | ||
169 | #ifndef arch_kasan_reset_tag | ||
170 | #define arch_kasan_reset_tag(addr) ((void *)(addr)) | ||
171 | #endif | ||
172 | #ifndef arch_kasan_get_tag | ||
173 | #define arch_kasan_get_tag(addr) 0 | ||
174 | #endif | ||
175 | |||
176 | #define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) | ||
177 | #define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) | ||
178 | #define get_tag(addr) arch_kasan_get_tag(addr) | ||
179 | |||
123 | /* | 180 | /* |
124 | * Exported functions for interfaces called from assembly or from generated | 181 | * Exported functions for interfaces called from assembly or from generated |
125 | * code. Declarations here to avoid warning about missing declarations. | 182 | * code. Declarations here to avoid warning about missing declarations. |
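The arch_kasan_set_tag()/arch_kasan_reset_tag()/arch_kasan_get_tag() hooks above intentionally default to no-ops so that the generic mode keeps working unchanged. On an architecture with top-byte-ignore (arm64 is the one this series targets) they would manipulate bits 63:56 of the pointer. A hedged sketch of what such an implementation might look like; the hook names come from kasan.h above, while the bit layout and the EXAMPLE_TAG_SHIFT constant are illustrative rather than the in-tree arm64 code:

#define EXAMPLE_TAG_SHIFT	56

#define arch_kasan_set_tag(addr, tag) \
	((void *)(((u64)(addr) & ~(0xffUL << EXAMPLE_TAG_SHIFT)) | \
		  ((u64)(tag) << EXAMPLE_TAG_SHIFT)))
/* Resetting restores the native kernel tag, KASAN_TAG_KERNEL (0xFF). */
#define arch_kasan_reset_tag(addr) \
	((void *)((u64)(addr) | (0xffUL << EXAMPLE_TAG_SHIFT)))
#define arch_kasan_get_tag(addr)	((u8)((u64)(addr) >> EXAMPLE_TAG_SHIFT))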
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index b209dbaefde8..978bc4a3eb51 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c | |||
@@ -1,3 +1,4 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
1 | /* | 2 | /* |
2 | * KASAN quarantine. | 3 | * KASAN quarantine. |
3 | * | 4 | * |
@@ -236,7 +237,7 @@ void quarantine_reduce(void) | |||
236 | * Update quarantine size in case of hotplug. Allocate a fraction of | 237 | * Update quarantine size in case of hotplug. Allocate a fraction of |
237 | * the installed memory to quarantine minus per-cpu queue limits. | 238 | * the installed memory to quarantine minus per-cpu queue limits. |
238 | */ | 239 | */ |
239 | total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / | 240 | total_size = (totalram_pages() << PAGE_SHIFT) / |
240 | QUARANTINE_FRACTION; | 241 | QUARANTINE_FRACTION; |
241 | percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); | 242 | percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); |
242 | new_quarantine_size = (total_size < percpu_quarantines) ? | 243 | new_quarantine_size = (total_size < percpu_quarantines) ? |
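The only functional change in this hunk is the switch from READ_ONCE(totalram_pages) to the new totalram_pages() accessor; the sizing policy itself is untouched. As a rough worked example, assuming the constants used elsewhere in this file (a QUARANTINE_FRACTION of 32 and 1 MiB per-CPU queues, neither visible in the hunk): on an 8 GiB machine total_size comes out at 256 MiB, and with four online CPUs the global quarantine limit becomes roughly 252 MiB.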
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5c169aa688fd..ca9418fe9232 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -1,5 +1,6 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
1 | /* | 2 | /* |
2 | * This file contains error reporting code. | 3 | * This file contains common generic and tag-based KASAN error reporting code. |
3 | * | 4 | * |
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | 5 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. |
5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | 6 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> |
@@ -39,129 +40,43 @@ | |||
39 | #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) | 40 | #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) |
40 | #define SHADOW_ROWS_AROUND_ADDR 2 | 41 | #define SHADOW_ROWS_AROUND_ADDR 2 |
41 | 42 | ||
42 | static const void *find_first_bad_addr(const void *addr, size_t size) | 43 | static unsigned long kasan_flags; |
43 | { | ||
44 | u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); | ||
45 | const void *first_bad_addr = addr; | ||
46 | |||
47 | while (!shadow_val && first_bad_addr < addr + size) { | ||
48 | first_bad_addr += KASAN_SHADOW_SCALE_SIZE; | ||
49 | shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); | ||
50 | } | ||
51 | return first_bad_addr; | ||
52 | } | ||
53 | 44 | ||
54 | static bool addr_has_shadow(struct kasan_access_info *info) | 45 | #define KASAN_BIT_REPORTED 0 |
55 | { | 46 | #define KASAN_BIT_MULTI_SHOT 1 |
56 | return (info->access_addr >= | ||
57 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); | ||
58 | } | ||
59 | 47 | ||
60 | static const char *get_shadow_bug_type(struct kasan_access_info *info) | 48 | bool kasan_save_enable_multi_shot(void) |
61 | { | 49 | { |
62 | const char *bug_type = "unknown-crash"; | 50 | return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); |
63 | u8 *shadow_addr; | ||
64 | |||
65 | info->first_bad_addr = find_first_bad_addr(info->access_addr, | ||
66 | info->access_size); | ||
67 | |||
68 | shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); | ||
69 | |||
70 | /* | ||
71 | * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look | ||
72 | * at the next shadow byte to determine the type of the bad access. | ||
73 | */ | ||
74 | if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) | ||
75 | shadow_addr++; | ||
76 | |||
77 | switch (*shadow_addr) { | ||
78 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
79 | /* | ||
80 | * In theory it's still possible to see these shadow values | ||
81 | * due to a data race in the kernel code. | ||
82 | */ | ||
83 | bug_type = "out-of-bounds"; | ||
84 | break; | ||
85 | case KASAN_PAGE_REDZONE: | ||
86 | case KASAN_KMALLOC_REDZONE: | ||
87 | bug_type = "slab-out-of-bounds"; | ||
88 | break; | ||
89 | case KASAN_GLOBAL_REDZONE: | ||
90 | bug_type = "global-out-of-bounds"; | ||
91 | break; | ||
92 | case KASAN_STACK_LEFT: | ||
93 | case KASAN_STACK_MID: | ||
94 | case KASAN_STACK_RIGHT: | ||
95 | case KASAN_STACK_PARTIAL: | ||
96 | bug_type = "stack-out-of-bounds"; | ||
97 | break; | ||
98 | case KASAN_FREE_PAGE: | ||
99 | case KASAN_KMALLOC_FREE: | ||
100 | bug_type = "use-after-free"; | ||
101 | break; | ||
102 | case KASAN_USE_AFTER_SCOPE: | ||
103 | bug_type = "use-after-scope"; | ||
104 | break; | ||
105 | case KASAN_ALLOCA_LEFT: | ||
106 | case KASAN_ALLOCA_RIGHT: | ||
107 | bug_type = "alloca-out-of-bounds"; | ||
108 | break; | ||
109 | } | ||
110 | |||
111 | return bug_type; | ||
112 | } | 51 | } |
52 | EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); | ||
113 | 53 | ||
114 | static const char *get_wild_bug_type(struct kasan_access_info *info) | 54 | void kasan_restore_multi_shot(bool enabled) |
115 | { | 55 | { |
116 | const char *bug_type = "unknown-crash"; | 56 | if (!enabled) |
117 | 57 | clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); | |
118 | if ((unsigned long)info->access_addr < PAGE_SIZE) | ||
119 | bug_type = "null-ptr-deref"; | ||
120 | else if ((unsigned long)info->access_addr < TASK_SIZE) | ||
121 | bug_type = "user-memory-access"; | ||
122 | else | ||
123 | bug_type = "wild-memory-access"; | ||
124 | |||
125 | return bug_type; | ||
126 | } | 58 | } |
59 | EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); | ||
127 | 60 | ||
128 | static const char *get_bug_type(struct kasan_access_info *info) | 61 | static int __init kasan_set_multi_shot(char *str) |
129 | { | 62 | { |
130 | if (addr_has_shadow(info)) | 63 | set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); |
131 | return get_shadow_bug_type(info); | 64 | return 1; |
132 | return get_wild_bug_type(info); | ||
133 | } | 65 | } |
66 | __setup("kasan_multi_shot", kasan_set_multi_shot); | ||
134 | 67 | ||
135 | static void print_error_description(struct kasan_access_info *info) | 68 | static void print_error_description(struct kasan_access_info *info) |
136 | { | 69 | { |
137 | const char *bug_type = get_bug_type(info); | ||
138 | |||
139 | pr_err("BUG: KASAN: %s in %pS\n", | 70 | pr_err("BUG: KASAN: %s in %pS\n", |
140 | bug_type, (void *)info->ip); | 71 | get_bug_type(info), (void *)info->ip); |
141 | pr_err("%s of size %zu at addr %px by task %s/%d\n", | 72 | pr_err("%s of size %zu at addr %px by task %s/%d\n", |
142 | info->is_write ? "Write" : "Read", info->access_size, | 73 | info->is_write ? "Write" : "Read", info->access_size, |
143 | info->access_addr, current->comm, task_pid_nr(current)); | 74 | info->access_addr, current->comm, task_pid_nr(current)); |
144 | } | 75 | } |
145 | 76 | ||
146 | static inline bool kernel_or_module_addr(const void *addr) | ||
147 | { | ||
148 | if (addr >= (void *)_stext && addr < (void *)_end) | ||
149 | return true; | ||
150 | if (is_module_address((unsigned long)addr)) | ||
151 | return true; | ||
152 | return false; | ||
153 | } | ||
154 | |||
155 | static inline bool init_task_stack_addr(const void *addr) | ||
156 | { | ||
157 | return addr >= (void *)&init_thread_union.stack && | ||
158 | (addr <= (void *)&init_thread_union.stack + | ||
159 | sizeof(init_thread_union.stack)); | ||
160 | } | ||
161 | |||
162 | static DEFINE_SPINLOCK(report_lock); | 77 | static DEFINE_SPINLOCK(report_lock); |
163 | 78 | ||
164 | static void kasan_start_report(unsigned long *flags) | 79 | static void start_report(unsigned long *flags) |
165 | { | 80 | { |
166 | /* | 81 | /* |
167 | * Make sure we don't end up in loop. | 82 | * Make sure we don't end up in loop. |
@@ -171,7 +86,7 @@ static void kasan_start_report(unsigned long *flags) | |||
171 | pr_err("==================================================================\n"); | 86 | pr_err("==================================================================\n"); |
172 | } | 87 | } |
173 | 88 | ||
174 | static void kasan_end_report(unsigned long *flags) | 89 | static void end_report(unsigned long *flags) |
175 | { | 90 | { |
176 | pr_err("==================================================================\n"); | 91 | pr_err("==================================================================\n"); |
177 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 92 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
@@ -249,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object, | |||
249 | describe_object_addr(cache, object, addr); | 164 | describe_object_addr(cache, object, addr); |
250 | } | 165 | } |
251 | 166 | ||
167 | static inline bool kernel_or_module_addr(const void *addr) | ||
168 | { | ||
169 | if (addr >= (void *)_stext && addr < (void *)_end) | ||
170 | return true; | ||
171 | if (is_module_address((unsigned long)addr)) | ||
172 | return true; | ||
173 | return false; | ||
174 | } | ||
175 | |||
176 | static inline bool init_task_stack_addr(const void *addr) | ||
177 | { | ||
178 | return addr >= (void *)&init_thread_union.stack && | ||
179 | (addr <= (void *)&init_thread_union.stack + | ||
180 | sizeof(init_thread_union.stack)); | ||
181 | } | ||
182 | |||
252 | static void print_address_description(void *addr) | 183 | static void print_address_description(void *addr) |
253 | { | 184 | { |
254 | struct page *page = addr_to_page(addr); | 185 | struct page *page = addr_to_page(addr); |
@@ -326,126 +257,69 @@ static void print_shadow_for_address(const void *addr) | |||
326 | } | 257 | } |
327 | } | 258 | } |
328 | 259 | ||
260 | static bool report_enabled(void) | ||
261 | { | ||
262 | if (current->kasan_depth) | ||
263 | return false; | ||
264 | if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) | ||
265 | return true; | ||
266 | return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); | ||
267 | } | ||
268 | |||
329 | void kasan_report_invalid_free(void *object, unsigned long ip) | 269 | void kasan_report_invalid_free(void *object, unsigned long ip) |
330 | { | 270 | { |
331 | unsigned long flags; | 271 | unsigned long flags; |
332 | 272 | ||
333 | kasan_start_report(&flags); | 273 | start_report(&flags); |
334 | pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); | 274 | pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); |
275 | print_tags(get_tag(object), reset_tag(object)); | ||
276 | object = reset_tag(object); | ||
335 | pr_err("\n"); | 277 | pr_err("\n"); |
336 | print_address_description(object); | 278 | print_address_description(object); |
337 | pr_err("\n"); | 279 | pr_err("\n"); |
338 | print_shadow_for_address(object); | 280 | print_shadow_for_address(object); |
339 | kasan_end_report(&flags); | 281 | end_report(&flags); |
340 | } | ||
341 | |||
342 | static void kasan_report_error(struct kasan_access_info *info) | ||
343 | { | ||
344 | unsigned long flags; | ||
345 | |||
346 | kasan_start_report(&flags); | ||
347 | |||
348 | print_error_description(info); | ||
349 | pr_err("\n"); | ||
350 | |||
351 | if (!addr_has_shadow(info)) { | ||
352 | dump_stack(); | ||
353 | } else { | ||
354 | print_address_description((void *)info->access_addr); | ||
355 | pr_err("\n"); | ||
356 | print_shadow_for_address(info->first_bad_addr); | ||
357 | } | ||
358 | |||
359 | kasan_end_report(&flags); | ||
360 | } | ||
361 | |||
362 | static unsigned long kasan_flags; | ||
363 | |||
364 | #define KASAN_BIT_REPORTED 0 | ||
365 | #define KASAN_BIT_MULTI_SHOT 1 | ||
366 | |||
367 | bool kasan_save_enable_multi_shot(void) | ||
368 | { | ||
369 | return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); | ||
370 | } | ||
371 | EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); | ||
372 | |||
373 | void kasan_restore_multi_shot(bool enabled) | ||
374 | { | ||
375 | if (!enabled) | ||
376 | clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); | ||
377 | } | ||
378 | EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); | ||
379 | |||
380 | static int __init kasan_set_multi_shot(char *str) | ||
381 | { | ||
382 | set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); | ||
383 | return 1; | ||
384 | } | ||
385 | __setup("kasan_multi_shot", kasan_set_multi_shot); | ||
386 | |||
387 | static inline bool kasan_report_enabled(void) | ||
388 | { | ||
389 | if (current->kasan_depth) | ||
390 | return false; | ||
391 | if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) | ||
392 | return true; | ||
393 | return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); | ||
394 | } | 282 | } |
395 | 283 | ||
396 | void kasan_report(unsigned long addr, size_t size, | 284 | void kasan_report(unsigned long addr, size_t size, |
397 | bool is_write, unsigned long ip) | 285 | bool is_write, unsigned long ip) |
398 | { | 286 | { |
399 | struct kasan_access_info info; | 287 | struct kasan_access_info info; |
288 | void *tagged_addr; | ||
289 | void *untagged_addr; | ||
290 | unsigned long flags; | ||
400 | 291 | ||
401 | if (likely(!kasan_report_enabled())) | 292 | if (likely(!report_enabled())) |
402 | return; | 293 | return; |
403 | 294 | ||
404 | disable_trace_on_warning(); | 295 | disable_trace_on_warning(); |
405 | 296 | ||
406 | info.access_addr = (void *)addr; | 297 | tagged_addr = (void *)addr; |
407 | info.first_bad_addr = (void *)addr; | 298 | untagged_addr = reset_tag(tagged_addr); |
299 | |||
300 | info.access_addr = tagged_addr; | ||
301 | if (addr_has_shadow(untagged_addr)) | ||
302 | info.first_bad_addr = find_first_bad_addr(tagged_addr, size); | ||
303 | else | ||
304 | info.first_bad_addr = untagged_addr; | ||
408 | info.access_size = size; | 305 | info.access_size = size; |
409 | info.is_write = is_write; | 306 | info.is_write = is_write; |
410 | info.ip = ip; | 307 | info.ip = ip; |
411 | 308 | ||
412 | kasan_report_error(&info); | 309 | start_report(&flags); |
413 | } | ||
414 | 310 | ||
311 | print_error_description(&info); | ||
312 | if (addr_has_shadow(untagged_addr)) | ||
313 | print_tags(get_tag(tagged_addr), info.first_bad_addr); | ||
314 | pr_err("\n"); | ||
415 | 315 | ||
416 | #define DEFINE_ASAN_REPORT_LOAD(size) \ | 316 | if (addr_has_shadow(untagged_addr)) { |
417 | void __asan_report_load##size##_noabort(unsigned long addr) \ | 317 | print_address_description(untagged_addr); |
418 | { \ | 318 | pr_err("\n"); |
419 | kasan_report(addr, size, false, _RET_IP_); \ | 319 | print_shadow_for_address(info.first_bad_addr); |
420 | } \ | 320 | } else { |
421 | EXPORT_SYMBOL(__asan_report_load##size##_noabort) | 321 | dump_stack(); |
422 | 322 | } | |
423 | #define DEFINE_ASAN_REPORT_STORE(size) \ | ||
424 | void __asan_report_store##size##_noabort(unsigned long addr) \ | ||
425 | { \ | ||
426 | kasan_report(addr, size, true, _RET_IP_); \ | ||
427 | } \ | ||
428 | EXPORT_SYMBOL(__asan_report_store##size##_noabort) | ||
429 | |||
430 | DEFINE_ASAN_REPORT_LOAD(1); | ||
431 | DEFINE_ASAN_REPORT_LOAD(2); | ||
432 | DEFINE_ASAN_REPORT_LOAD(4); | ||
433 | DEFINE_ASAN_REPORT_LOAD(8); | ||
434 | DEFINE_ASAN_REPORT_LOAD(16); | ||
435 | DEFINE_ASAN_REPORT_STORE(1); | ||
436 | DEFINE_ASAN_REPORT_STORE(2); | ||
437 | DEFINE_ASAN_REPORT_STORE(4); | ||
438 | DEFINE_ASAN_REPORT_STORE(8); | ||
439 | DEFINE_ASAN_REPORT_STORE(16); | ||
440 | |||
441 | void __asan_report_load_n_noabort(unsigned long addr, size_t size) | ||
442 | { | ||
443 | kasan_report(addr, size, false, _RET_IP_); | ||
444 | } | ||
445 | EXPORT_SYMBOL(__asan_report_load_n_noabort); | ||
446 | 323 | ||
447 | void __asan_report_store_n_noabort(unsigned long addr, size_t size) | 324 | end_report(&flags); |
448 | { | ||
449 | kasan_report(addr, size, true, _RET_IP_); | ||
450 | } | 325 | } |
451 | EXPORT_SYMBOL(__asan_report_store_n_noabort); | ||
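kasan_save_enable_multi_shot() and kasan_restore_multi_shot(), which this rewrite moves to the top of report.c, let test code temporarily get one report per bug instead of only the first report per boot. A hedged usage sketch of that pattern; the two exported functions are the ones above, while the trigger_*() helpers are invented for the example:

static void run_expected_failures(void)
{
	bool multishot = kasan_save_enable_multi_shot();

	trigger_out_of_bounds();	/* each hypothetical helper provokes */
	trigger_use_after_free();	/* one KASAN report                  */

	kasan_restore_multi_shot(multishot);
}

Passing the saved value back to kasan_restore_multi_shot() keeps a kasan_multi_shot boot parameter, if one was given, in effect after the test.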
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c new file mode 100644 index 000000000000..0777649e07c4 --- /dev/null +++ b/mm/kasan/tags.c | |||
@@ -0,0 +1,161 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * This file contains core tag-based KASAN code. | ||
4 | * | ||
5 | * Copyright (c) 2018 Google, Inc. | ||
6 | * Author: Andrey Konovalov <andreyknvl@google.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | * | ||
12 | */ | ||
13 | |||
14 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
15 | #define DISABLE_BRANCH_PROFILING | ||
16 | |||
17 | #include <linux/export.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/kasan.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/kmemleak.h> | ||
23 | #include <linux/linkage.h> | ||
24 | #include <linux/memblock.h> | ||
25 | #include <linux/memory.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/printk.h> | ||
29 | #include <linux/random.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/sched/task_stack.h> | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/stacktrace.h> | ||
34 | #include <linux/string.h> | ||
35 | #include <linux/types.h> | ||
36 | #include <linux/vmalloc.h> | ||
37 | #include <linux/bug.h> | ||
38 | |||
39 | #include "kasan.h" | ||
40 | #include "../slab.h" | ||
41 | |||
42 | static DEFINE_PER_CPU(u32, prng_state); | ||
43 | |||
44 | void kasan_init_tags(void) | ||
45 | { | ||
46 | int cpu; | ||
47 | |||
48 | for_each_possible_cpu(cpu) | ||
49 | per_cpu(prng_state, cpu) = get_random_u32(); | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * If a preemption happens between this_cpu_read and this_cpu_write, the only | ||
54 | * side effect is that we'll give a few objects allocated in different contexts | ||
55 | * the same tag. Since tag-based KASAN is meant to be used as a probabilistic | ||
56 | * bug-detection debug feature, this doesn't have a significant negative impact. | ||
57 | * | ||
58 | * Ideally the tags use strong randomness to prevent any attempts to predict | ||
59 | * them during explicit exploit attempts. But strong randomness is expensive, | ||
60 | * and we made an intentional trade-off to use a PRNG. This non-atomic RMW | ||
61 | * sequence in fact has a positive effect, since interrupts that randomly skew | ||
62 | * the PRNG at unpredictable points only do good. | ||
63 | */ | ||
64 | u8 random_tag(void) | ||
65 | { | ||
66 | u32 state = this_cpu_read(prng_state); | ||
67 | |||
68 | state = 1664525 * state + 1013904223; | ||
69 | this_cpu_write(prng_state, state); | ||
70 | |||
71 | return (u8)(state % (KASAN_TAG_MAX + 1)); | ||
72 | } | ||
73 | |||
74 | void *kasan_reset_tag(const void *addr) | ||
75 | { | ||
76 | return reset_tag(addr); | ||
77 | } | ||
78 | |||
79 | void check_memory_region(unsigned long addr, size_t size, bool write, | ||
80 | unsigned long ret_ip) | ||
81 | { | ||
82 | u8 tag; | ||
83 | u8 *shadow_first, *shadow_last, *shadow; | ||
84 | void *untagged_addr; | ||
85 | |||
86 | if (unlikely(size == 0)) | ||
87 | return; | ||
88 | |||
89 | tag = get_tag((const void *)addr); | ||
90 | |||
91 | /* | ||
92 | * Ignore accesses for pointers tagged with 0xff (native kernel | ||
93 | * pointer tag) to suppress false positives caused by kmap. | ||
94 | * | ||
95 | * Some kernel code was written to account for archs that don't keep | ||
96 | * high memory mapped all the time, but rather map and unmap particular | ||
97 | * pages when needed. Instead of storing a pointer to the kernel memory, | ||
98 | * this code saves the address of the page structure and offset within | ||
99 | * that page for later use. Those pages are then mapped and unmapped | ||
100 | * with kmap/kunmap when necessary and virt_to_page is used to get the | ||
101 | * virtual address of the page. For arm64 (that keeps the high memory | ||
102 | * mapped all the time), kmap is turned into a page_address call. | ||
103 | * | ||
104 | * The issue is that with use of the page_address + virt_to_page | ||
105 | * sequence the top byte value of the original pointer gets lost (gets | ||
106 | * set to KASAN_TAG_KERNEL (0xFF)). | ||
107 | */ | ||
108 | if (tag == KASAN_TAG_KERNEL) | ||
109 | return; | ||
110 | |||
111 | untagged_addr = reset_tag((const void *)addr); | ||
112 | if (unlikely(untagged_addr < | ||
113 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
114 | kasan_report(addr, size, write, ret_ip); | ||
115 | return; | ||
116 | } | ||
117 | shadow_first = kasan_mem_to_shadow(untagged_addr); | ||
118 | shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); | ||
119 | for (shadow = shadow_first; shadow <= shadow_last; shadow++) { | ||
120 | if (*shadow != tag) { | ||
121 | kasan_report(addr, size, write, ret_ip); | ||
122 | return; | ||
123 | } | ||
124 | } | ||
125 | } | ||
126 | |||
127 | #define DEFINE_HWASAN_LOAD_STORE(size) \ | ||
128 | void __hwasan_load##size##_noabort(unsigned long addr) \ | ||
129 | { \ | ||
130 | check_memory_region(addr, size, false, _RET_IP_); \ | ||
131 | } \ | ||
132 | EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ | ||
133 | void __hwasan_store##size##_noabort(unsigned long addr) \ | ||
134 | { \ | ||
135 | check_memory_region(addr, size, true, _RET_IP_); \ | ||
136 | } \ | ||
137 | EXPORT_SYMBOL(__hwasan_store##size##_noabort) | ||
138 | |||
139 | DEFINE_HWASAN_LOAD_STORE(1); | ||
140 | DEFINE_HWASAN_LOAD_STORE(2); | ||
141 | DEFINE_HWASAN_LOAD_STORE(4); | ||
142 | DEFINE_HWASAN_LOAD_STORE(8); | ||
143 | DEFINE_HWASAN_LOAD_STORE(16); | ||
144 | |||
145 | void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) | ||
146 | { | ||
147 | check_memory_region(addr, size, false, _RET_IP_); | ||
148 | } | ||
149 | EXPORT_SYMBOL(__hwasan_loadN_noabort); | ||
150 | |||
151 | void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) | ||
152 | { | ||
153 | check_memory_region(addr, size, true, _RET_IP_); | ||
154 | } | ||
155 | EXPORT_SYMBOL(__hwasan_storeN_noabort); | ||
156 | |||
157 | void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) | ||
158 | { | ||
159 | kasan_poison_shadow((void *)addr, size, tag); | ||
160 | } | ||
161 | EXPORT_SYMBOL(__hwasan_tag_memory); | ||
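The get_tag()/reset_tag() helpers used throughout tags.c are defined elsewhere in this series (mm/kasan/kasan.h), not in this hunk. As a rough sketch only — assuming 64-bit pointers and the arm64 top-byte placement that the comment in check_memory_region() describes, with names that merely mirror the kernel helpers — the tagging scheme can be modelled in user space like this:

#include <stdint.h>
#include <stdio.h>

#define TAG_KERNEL 0xffu                /* native tag, never reported */
#define TAG_SHIFT  56                   /* assumed top-byte placement */

/* Extract the tag stored in the pointer's top byte. */
static uint8_t model_get_tag(const void *addr)
{
        return (uint8_t)((uintptr_t)addr >> TAG_SHIFT);
}

/* Return the pointer with its top byte replaced by the given tag. */
static void *model_set_tag(const void *addr, uint8_t tag)
{
        uintptr_t p = (uintptr_t)addr & ~((uintptr_t)0xff << TAG_SHIFT);

        return (void *)(p | ((uintptr_t)tag << TAG_SHIFT));
}

/* Equivalent of reset_tag(): restore the native 0xff kernel tag. */
static void *model_reset_tag(const void *addr)
{
        return model_set_tag(addr, TAG_KERNEL);
}

int main(void)
{
        long object;
        void *tagged = model_set_tag(&object, 0x2a);

        printf("tag=%02x, untagged=%p\n",
               model_get_tag(tagged), model_reset_tag(tagged));
        return 0;
}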
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c new file mode 100644 index 000000000000..8eaf5f722271 --- /dev/null +++ b/mm/kasan/tags_report.c | |||
@@ -0,0 +1,58 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * This file contains tag-based KASAN specific error reporting code. | ||
4 | * | ||
5 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
6 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | ||
7 | * | ||
8 | * Some code borrowed from https://github.com/xairy/kasan-prototype by | ||
9 | * Andrey Konovalov <andreyknvl@gmail.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License version 2 as | ||
13 | * published by the Free Software Foundation. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/bitops.h> | ||
18 | #include <linux/ftrace.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/printk.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/stackdepot.h> | ||
26 | #include <linux/stacktrace.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/kasan.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | #include <asm/sections.h> | ||
33 | |||
34 | #include "kasan.h" | ||
35 | #include "../slab.h" | ||
36 | |||
37 | const char *get_bug_type(struct kasan_access_info *info) | ||
38 | { | ||
39 | return "invalid-access"; | ||
40 | } | ||
41 | |||
42 | void *find_first_bad_addr(void *addr, size_t size) | ||
43 | { | ||
44 | u8 tag = get_tag(addr); | ||
45 | void *p = reset_tag(addr); | ||
46 | void *end = p + size; | ||
47 | |||
48 | while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) | ||
49 | p += KASAN_SHADOW_SCALE_SIZE; | ||
50 | return p; | ||
51 | } | ||
52 | |||
53 | void print_tags(u8 addr_tag, const void *addr) | ||
54 | { | ||
55 | u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); | ||
56 | |||
57 | pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); | ||
58 | } | ||
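find_first_bad_addr() above advances one shadow granule at a time until the memory tag stops matching the pointer tag. A small worked model, assuming a 16-byte granule (the kernel value is KASAN_SHADOW_SCALE_SIZE and is not shown in this hunk):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define GRANULE 16      /* assumed KASAN_SHADOW_SCALE_SIZE for the tag-based mode */

/* Offset of the first granule whose shadow byte mismatches the pointer tag. */
static size_t first_bad_offset(uint8_t tag, const uint8_t *shadow, size_t granules)
{
        size_t i;

        for (i = 0; i < granules && shadow[i] == tag; i++)
                ;
        return i * GRANULE;
}

int main(void)
{
        /* Access with pointer tag 0x2a; the third granule was repoisoned. */
        const uint8_t shadow[] = { 0x2a, 0x2a, 0x07, 0x2a };

        /* Prints 32: the report points at the access base + 32. */
        printf("%zu\n", first_bad_offset(0x2a, shadow, sizeof(shadow)));
        return 0;
}

For that address, print_tags() above would emit "Pointer tag: [2a], memory tag: [07]".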
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43ce2f4d2551..4f017339ddb2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
944 | int isolated = 0, result = 0; | 944 | int isolated = 0, result = 0; |
945 | struct mem_cgroup *memcg; | 945 | struct mem_cgroup *memcg; |
946 | struct vm_area_struct *vma; | 946 | struct vm_area_struct *vma; |
947 | unsigned long mmun_start; /* For mmu_notifiers */ | 947 | struct mmu_notifier_range range; |
948 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
949 | gfp_t gfp; | 948 | gfp_t gfp; |
950 | 949 | ||
951 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 950 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
@@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1017 | pte = pte_offset_map(pmd, address); | 1016 | pte = pte_offset_map(pmd, address); |
1018 | pte_ptl = pte_lockptr(mm, pmd); | 1017 | pte_ptl = pte_lockptr(mm, pmd); |
1019 | 1018 | ||
1020 | mmun_start = address; | 1019 | mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); |
1021 | mmun_end = address + HPAGE_PMD_SIZE; | 1020 | mmu_notifier_invalidate_range_start(&range); |
1022 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1023 | pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ | 1021 | pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ |
1024 | /* | 1022 | /* |
1025 | * After this gup_fast can't run anymore. This also removes | 1023 | * After this gup_fast can't run anymore. This also removes |
@@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1029 | */ | 1027 | */ |
1030 | _pmd = pmdp_collapse_flush(vma, address, pmd); | 1028 | _pmd = pmdp_collapse_flush(vma, address, pmd); |
1031 | spin_unlock(pmd_ptl); | 1029 | spin_unlock(pmd_ptl); |
1032 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1030 | mmu_notifier_invalidate_range_end(&range); |
1033 | 1031 | ||
1034 | spin_lock(pte_ptl); | 1032 | spin_lock(pte_ptl); |
1035 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 1033 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
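The khugepaged hunk above is one instance of a conversion that repeats through the rest of this merge: the (mm, start, end) triple formerly passed to both invalidate hooks is carried in a struct mmu_notifier_range that is initialised once. Stripped down to only the calls visible in these hunks (the function and argument names here are placeholders), the new convention is:

#include <linux/mm_types.h>
#include <linux/mmu_notifier.h>

/* Placeholder helper showing the struct-based notifier convention. */
static void example_invalidate(struct mm_struct *mm, unsigned long start,
                               unsigned long end)
{
        struct mmu_notifier_range range;

        mmu_notifier_range_init(&range, mm, start, end);
        mmu_notifier_invalidate_range_start(&range);

        /* ... clear, write-protect or collapse page tables in [start, end) ... */

        mmu_notifier_invalidate_range_end(&range);
}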
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 877de4fa0720..f9d9dc250428 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1547,11 +1547,14 @@ static void kmemleak_scan(void) | |||
1547 | unsigned long pfn; | 1547 | unsigned long pfn; |
1548 | 1548 | ||
1549 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1549 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1550 | struct page *page; | 1550 | struct page *page = pfn_to_online_page(pfn); |
1551 | 1551 | ||
1552 | if (!pfn_valid(pfn)) | 1552 | if (!page) |
1553 | continue; | ||
1554 | |||
1555 | /* only scan pages belonging to this node */ | ||
1556 | if (page_to_nid(page) != i) | ||
1553 | continue; | 1557 | continue; |
1554 | page = pfn_to_page(pfn); | ||
1555 | /* only scan if page is in use */ | 1558 | /* only scan if page is in use */ |
1556 | if (page_count(page) == 0) | 1559 | if (page_count(page) == 0) |
1557 | continue; | 1560 | continue; |
@@ -1647,7 +1650,7 @@ static void kmemleak_scan(void) | |||
1647 | */ | 1650 | */ |
1648 | static int kmemleak_scan_thread(void *arg) | 1651 | static int kmemleak_scan_thread(void *arg) |
1649 | { | 1652 | { |
1650 | static int first_run = 1; | 1653 | static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN); |
1651 | 1654 | ||
1652 | pr_info("Automatic memory scanning thread started\n"); | 1655 | pr_info("Automatic memory scanning thread started\n"); |
1653 | set_user_nice(current, 10); | 1656 | set_user_nice(current, 10); |
@@ -2141,9 +2144,11 @@ static int __init kmemleak_late_init(void) | |||
2141 | return -ENOMEM; | 2144 | return -ENOMEM; |
2142 | } | 2145 | } |
2143 | 2146 | ||
2144 | mutex_lock(&scan_mutex); | 2147 | if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) { |
2145 | start_scan_thread(); | 2148 | mutex_lock(&scan_mutex); |
2146 | mutex_unlock(&scan_mutex); | 2149 | start_scan_thread(); |
2150 | mutex_unlock(&scan_mutex); | ||
2151 | } | ||
2147 | 2152 | ||
2148 | pr_info("Kernel memory leak detector initialized\n"); | 2153 | pr_info("Kernel memory leak detector initialized\n"); |
2149 | 2154 | ||
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
27 | #include <linux/spinlock.h> | 27 | #include <linux/spinlock.h> |
28 | #include <linux/jhash.h> | 28 | #include <linux/xxhash.h> |
29 | #include <linux/delay.h> | 29 | #include <linux/delay.h> |
30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
31 | #include <linux/wait.h> | 31 | #include <linux/wait.h> |
@@ -296,6 +296,7 @@ static unsigned long ksm_run = KSM_RUN_STOP; | |||
296 | static void wait_while_offlining(void); | 296 | static void wait_while_offlining(void); |
297 | 297 | ||
298 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 298 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
299 | static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); | ||
299 | static DEFINE_MUTEX(ksm_thread_mutex); | 300 | static DEFINE_MUTEX(ksm_thread_mutex); |
300 | static DEFINE_SPINLOCK(ksm_mmlist_lock); | 301 | static DEFINE_SPINLOCK(ksm_mmlist_lock); |
301 | 302 | ||
@@ -1009,7 +1010,7 @@ static u32 calc_checksum(struct page *page) | |||
1009 | { | 1010 | { |
1010 | u32 checksum; | 1011 | u32 checksum; |
1011 | void *addr = kmap_atomic(page); | 1012 | void *addr = kmap_atomic(page); |
1012 | checksum = jhash2(addr, PAGE_SIZE / 4, 17); | 1013 | checksum = xxhash(addr, PAGE_SIZE, 0); |
1013 | kunmap_atomic(addr); | 1014 | kunmap_atomic(addr); |
1014 | return checksum; | 1015 | return checksum; |
1015 | } | 1016 | } |
@@ -1042,8 +1043,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
1042 | }; | 1043 | }; |
1043 | int swapped; | 1044 | int swapped; |
1044 | int err = -EFAULT; | 1045 | int err = -EFAULT; |
1045 | unsigned long mmun_start; /* For mmu_notifiers */ | 1046 | struct mmu_notifier_range range; |
1046 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1047 | 1047 | ||
1048 | pvmw.address = page_address_in_vma(page, vma); | 1048 | pvmw.address = page_address_in_vma(page, vma); |
1049 | if (pvmw.address == -EFAULT) | 1049 | if (pvmw.address == -EFAULT) |
@@ -1051,9 +1051,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
1051 | 1051 | ||
1052 | BUG_ON(PageTransCompound(page)); | 1052 | BUG_ON(PageTransCompound(page)); |
1053 | 1053 | ||
1054 | mmun_start = pvmw.address; | 1054 | mmu_notifier_range_init(&range, mm, pvmw.address, |
1055 | mmun_end = pvmw.address + PAGE_SIZE; | 1055 | pvmw.address + PAGE_SIZE); |
1056 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1056 | mmu_notifier_invalidate_range_start(&range); |
1057 | 1057 | ||
1058 | if (!page_vma_mapped_walk(&pvmw)) | 1058 | if (!page_vma_mapped_walk(&pvmw)) |
1059 | goto out_mn; | 1059 | goto out_mn; |
@@ -1105,7 +1105,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
1105 | out_unlock: | 1105 | out_unlock: |
1106 | page_vma_mapped_walk_done(&pvmw); | 1106 | page_vma_mapped_walk_done(&pvmw); |
1107 | out_mn: | 1107 | out_mn: |
1108 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1108 | mmu_notifier_invalidate_range_end(&range); |
1109 | out: | 1109 | out: |
1110 | return err; | 1110 | return err; |
1111 | } | 1111 | } |
@@ -1129,8 +1129,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
1129 | spinlock_t *ptl; | 1129 | spinlock_t *ptl; |
1130 | unsigned long addr; | 1130 | unsigned long addr; |
1131 | int err = -EFAULT; | 1131 | int err = -EFAULT; |
1132 | unsigned long mmun_start; /* For mmu_notifiers */ | 1132 | struct mmu_notifier_range range; |
1133 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1134 | 1133 | ||
1135 | addr = page_address_in_vma(page, vma); | 1134 | addr = page_address_in_vma(page, vma); |
1136 | if (addr == -EFAULT) | 1135 | if (addr == -EFAULT) |
@@ -1140,9 +1139,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
1140 | if (!pmd) | 1139 | if (!pmd) |
1141 | goto out; | 1140 | goto out; |
1142 | 1141 | ||
1143 | mmun_start = addr; | 1142 | mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); |
1144 | mmun_end = addr + PAGE_SIZE; | 1143 | mmu_notifier_invalidate_range_start(&range); |
1145 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1146 | 1144 | ||
1147 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1145 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
1148 | if (!pte_same(*ptep, orig_pte)) { | 1146 | if (!pte_same(*ptep, orig_pte)) { |
@@ -1188,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
1188 | pte_unmap_unlock(ptep, ptl); | 1186 | pte_unmap_unlock(ptep, ptl); |
1189 | err = 0; | 1187 | err = 0; |
1190 | out_mn: | 1188 | out_mn: |
1191 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1189 | mmu_notifier_invalidate_range_end(&range); |
1192 | out: | 1190 | out: |
1193 | return err; | 1191 | return err; |
1194 | } | 1192 | } |
@@ -2391,6 +2389,8 @@ static int ksmd_should_run(void) | |||
2391 | 2389 | ||
2392 | static int ksm_scan_thread(void *nothing) | 2390 | static int ksm_scan_thread(void *nothing) |
2393 | { | 2391 | { |
2392 | unsigned int sleep_ms; | ||
2393 | |||
2394 | set_freezable(); | 2394 | set_freezable(); |
2395 | set_user_nice(current, 5); | 2395 | set_user_nice(current, 5); |
2396 | 2396 | ||
@@ -2404,8 +2404,10 @@ static int ksm_scan_thread(void *nothing) | |||
2404 | try_to_freeze(); | 2404 | try_to_freeze(); |
2405 | 2405 | ||
2406 | if (ksmd_should_run()) { | 2406 | if (ksmd_should_run()) { |
2407 | schedule_timeout_interruptible( | 2407 | sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); |
2408 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); | 2408 | wait_event_interruptible_timeout(ksm_iter_wait, |
2409 | sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), | ||
2410 | msecs_to_jiffies(sleep_ms)); | ||
2409 | } else { | 2411 | } else { |
2410 | wait_event_freezable(ksm_thread_wait, | 2412 | wait_event_freezable(ksm_thread_wait, |
2411 | ksmd_should_run() || kthread_should_stop()); | 2413 | ksmd_should_run() || kthread_should_stop()); |
@@ -2824,6 +2826,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, | |||
2824 | return -EINVAL; | 2826 | return -EINVAL; |
2825 | 2827 | ||
2826 | ksm_thread_sleep_millisecs = msecs; | 2828 | ksm_thread_sleep_millisecs = msecs; |
2829 | wake_up_interruptible(&ksm_iter_wait); | ||
2827 | 2830 | ||
2828 | return count; | 2831 | return count; |
2829 | } | 2832 | } |
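The two ksm.c waitqueue hunks above cooperate: sleep_millisecs_store() wakes ksm_iter_wait after updating the interval, and ksm_scan_thread() sleeps on that queue until either the timeout expires or the interval it sampled no longer matches the tunable, so a shorter interval takes effect immediately instead of after the old sleep runs out. The pattern in isolation (names are placeholders mirroring the ksm ones):

#include <linux/wait.h>
#include <linux/jiffies.h>

static DECLARE_WAIT_QUEUE_HEAD(iter_wait);      /* mirrors ksm_iter_wait */
static unsigned int scan_sleep_millisecs = 20;  /* placeholder tunable */

/* Scanner side: sample the tunable, then sleep until it changes or expires. */
static void scanner_sleep(void)
{
        unsigned int sleep_ms = READ_ONCE(scan_sleep_millisecs);

        wait_event_interruptible_timeout(iter_wait,
                        sleep_ms != READ_ONCE(scan_sleep_millisecs),
                        msecs_to_jiffies(sleep_ms));
}

/* Store side: publish the new interval and kick a sleeping scanner. */
static void scanner_set_interval(unsigned int msecs)
{
        scan_sleep_millisecs = msecs;
        wake_up_interruptible(&iter_wait);
}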
diff --git a/mm/madvise.c b/mm/madvise.c index 6cb1ca93e290..21a7881a2db4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb, | |||
458 | static int madvise_free_single_vma(struct vm_area_struct *vma, | 458 | static int madvise_free_single_vma(struct vm_area_struct *vma, |
459 | unsigned long start_addr, unsigned long end_addr) | 459 | unsigned long start_addr, unsigned long end_addr) |
460 | { | 460 | { |
461 | unsigned long start, end; | ||
462 | struct mm_struct *mm = vma->vm_mm; | 461 | struct mm_struct *mm = vma->vm_mm; |
462 | struct mmu_notifier_range range; | ||
463 | struct mmu_gather tlb; | 463 | struct mmu_gather tlb; |
464 | 464 | ||
465 | /* MADV_FREE works for only anon vma at the moment */ | 465 | /* MADV_FREE works for only anon vma at the moment */ |
466 | if (!vma_is_anonymous(vma)) | 466 | if (!vma_is_anonymous(vma)) |
467 | return -EINVAL; | 467 | return -EINVAL; |
468 | 468 | ||
469 | start = max(vma->vm_start, start_addr); | 469 | range.start = max(vma->vm_start, start_addr); |
470 | if (start >= vma->vm_end) | 470 | if (range.start >= vma->vm_end) |
471 | return -EINVAL; | 471 | return -EINVAL; |
472 | end = min(vma->vm_end, end_addr); | 472 | range.end = min(vma->vm_end, end_addr); |
473 | if (end <= vma->vm_start) | 473 | if (range.end <= vma->vm_start) |
474 | return -EINVAL; | 474 | return -EINVAL; |
475 | mmu_notifier_range_init(&range, mm, range.start, range.end); | ||
475 | 476 | ||
476 | lru_add_drain(); | 477 | lru_add_drain(); |
477 | tlb_gather_mmu(&tlb, mm, start, end); | 478 | tlb_gather_mmu(&tlb, mm, range.start, range.end); |
478 | update_hiwater_rss(mm); | 479 | update_hiwater_rss(mm); |
479 | 480 | ||
480 | mmu_notifier_invalidate_range_start(mm, start, end); | 481 | mmu_notifier_invalidate_range_start(&range); |
481 | madvise_free_page_range(&tlb, vma, start, end); | 482 | madvise_free_page_range(&tlb, vma, range.start, range.end); |
482 | mmu_notifier_invalidate_range_end(mm, start, end); | 483 | mmu_notifier_invalidate_range_end(&range); |
483 | tlb_finish_mmu(&tlb, start, end); | 484 | tlb_finish_mmu(&tlb, range.start, range.end); |
484 | 485 | ||
485 | return 0; | 486 | return 0; |
486 | } | 487 | } |
diff --git a/mm/memblock.c b/mm/memblock.c index 81ae63ca78d0..022d4cbb3618 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -262,7 +262,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, | |||
262 | phys_addr_t kernel_end, ret; | 262 | phys_addr_t kernel_end, ret; |
263 | 263 | ||
264 | /* pump up @end */ | 264 | /* pump up @end */ |
265 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 265 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE || |
266 | end == MEMBLOCK_ALLOC_KASAN) | ||
266 | end = memblock.current_limit; | 267 | end = memblock.current_limit; |
267 | 268 | ||
268 | /* avoid allocating the first page */ | 269 | /* avoid allocating the first page */ |
@@ -800,7 +801,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) | |||
800 | return memblock_remove_range(&memblock.memory, base, size); | 801 | return memblock_remove_range(&memblock.memory, base, size); |
801 | } | 802 | } |
802 | 803 | ||
803 | 804 | /** | |
805 | * memblock_free - free boot memory block | ||
806 | * @base: phys starting address of the boot memory block | ||
807 | * @size: size of the boot memory block in bytes | ||
808 | * | ||
809 | * Free boot memory block previously allocated by memblock_alloc_xx() API. | ||
810 | * The freeing memory will not be released to the buddy allocator. | ||
811 | */ | ||
804 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | 812 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) |
805 | { | 813 | { |
806 | phys_addr_t end = base + size - 1; | 814 | phys_addr_t end = base + size - 1; |
@@ -1412,13 +1420,15 @@ again: | |||
1412 | done: | 1420 | done: |
1413 | ptr = phys_to_virt(alloc); | 1421 | ptr = phys_to_virt(alloc); |
1414 | 1422 | ||
1415 | /* | 1423 | /* Skip kmemleak for kasan_init() due to high volume. */ |
1416 | * The min_count is set to 0 so that bootmem allocated blocks | 1424 | if (max_addr != MEMBLOCK_ALLOC_KASAN) |
1417 | * are never reported as leaks. This is because many of these blocks | 1425 | /* |
1418 | * are only referred via the physical address which is not | 1426 | * The min_count is set to 0 so that bootmem allocated |
1419 | * looked up by kmemleak. | 1427 | * blocks are never reported as leaks. This is because many |
1420 | */ | 1428 | * of these blocks are only referred via the physical |
1421 | kmemleak_alloc(ptr, size, 0, 0); | 1429 | * address which is not looked up by kmemleak. |
1430 | */ | ||
1431 | kmemleak_alloc(ptr, size, 0, 0); | ||
1422 | 1432 | ||
1423 | return ptr; | 1433 | return ptr; |
1424 | } | 1434 | } |
@@ -1537,24 +1547,6 @@ void * __init memblock_alloc_try_nid( | |||
1537 | } | 1547 | } |
1538 | 1548 | ||
1539 | /** | 1549 | /** |
1540 | * __memblock_free_early - free boot memory block | ||
1541 | * @base: phys starting address of the boot memory block | ||
1542 | * @size: size of the boot memory block in bytes | ||
1543 | * | ||
1544 | * Free boot memory block previously allocated by memblock_alloc_xx() API. | ||
1545 | * The freeing memory will not be released to the buddy allocator. | ||
1546 | */ | ||
1547 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
1548 | { | ||
1549 | phys_addr_t end = base + size - 1; | ||
1550 | |||
1551 | memblock_dbg("%s: [%pa-%pa] %pF\n", | ||
1552 | __func__, &base, &end, (void *)_RET_IP_); | ||
1553 | kmemleak_free_part_phys(base, size); | ||
1554 | memblock_remove_range(&memblock.reserved, base, size); | ||
1555 | } | ||
1556 | |||
1557 | /** | ||
1558 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | 1550 | * __memblock_free_late - free bootmem block pages directly to buddy allocator |
1559 | * @base: phys starting address of the boot memory block | 1551 | * @base: phys starting address of the boot memory block |
1560 | * @size: size of the boot memory block in bytes | 1552 | * @size: size of the boot memory block in bytes |
@@ -1576,7 +1568,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | |||
1576 | 1568 | ||
1577 | for (; cursor < end; cursor++) { | 1569 | for (; cursor < end; cursor++) { |
1578 | memblock_free_pages(pfn_to_page(cursor), cursor, 0); | 1570 | memblock_free_pages(pfn_to_page(cursor), cursor, 0); |
1579 | totalram_pages++; | 1571 | totalram_pages_inc(); |
1580 | } | 1572 | } |
1581 | } | 1573 | } |
1582 | 1574 | ||
@@ -1950,7 +1942,7 @@ void reset_node_managed_pages(pg_data_t *pgdat) | |||
1950 | struct zone *z; | 1942 | struct zone *z; |
1951 | 1943 | ||
1952 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | 1944 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) |
1953 | z->managed_pages = 0; | 1945 | atomic_long_set(&z->managed_pages, 0); |
1954 | } | 1946 | } |
1955 | 1947 | ||
1956 | void __init reset_all_zones_managed_pages(void) | 1948 | void __init reset_all_zones_managed_pages(void) |
@@ -1978,7 +1970,7 @@ unsigned long __init memblock_free_all(void) | |||
1978 | reset_all_zones_managed_pages(); | 1970 | reset_all_zones_managed_pages(); |
1979 | 1971 | ||
1980 | pages = free_low_memory_core_early(); | 1972 | pages = free_low_memory_core_early(); |
1981 | totalram_pages += pages; | 1973 | totalram_pages_add(pages); |
1982 | 1974 | ||
1983 | return pages; | 1975 | return pages; |
1984 | } | 1976 | } |
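Besides the MEMBLOCK_ALLOC_KASAN handling and the moved kernel-doc, the memblock.c hunks pick up the new counter accessors: elsewhere in this series totalram_pages and zone->managed_pages are converted to atomic counters, so the direct arithmetic here becomes wrapper calls. A minimal sketch of the conversion, using only the accessor names visible above (the helper functions are placeholders):

#include <linux/mm.h>
#include <linux/mmzone.h>

/* Placeholder: one page released to the page allocator. */
static void account_freed_page(void)
{
        totalram_pages_inc();                   /* was: totalram_pages++; */
}

/* Placeholder: a batch of pages released by the early allocator. */
static void account_freed_pages(unsigned long pages)
{
        totalram_pages_add(pages);              /* was: totalram_pages += pages; */
}

/* Placeholder: reset a zone's managed-page count. */
static void reset_zone_managed(struct zone *zone)
{
        atomic_long_set(&zone->managed_pages, 0);       /* was: zone->managed_pages = 0; */
}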
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e1469b80cb7..af7f18b32389 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1293,32 +1293,39 @@ static const char *const memcg1_stat_names[] = { | |||
1293 | 1293 | ||
1294 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1294 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1295 | /** | 1295 | /** |
1296 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 1296 | * mem_cgroup_print_oom_context: Print OOM information relevant to |
1297 | * memory controller. | ||
1297 | * @memcg: The memory cgroup that went over limit | 1298 | * @memcg: The memory cgroup that went over limit |
1298 | * @p: Task that is going to be killed | 1299 | * @p: Task that is going to be killed |
1299 | * | 1300 | * |
1300 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is | 1301 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is |
1301 | * enabled | 1302 | * enabled |
1302 | */ | 1303 | */ |
1303 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1304 | void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) |
1304 | { | 1305 | { |
1305 | struct mem_cgroup *iter; | ||
1306 | unsigned int i; | ||
1307 | |||
1308 | rcu_read_lock(); | 1306 | rcu_read_lock(); |
1309 | 1307 | ||
1308 | if (memcg) { | ||
1309 | pr_cont(",oom_memcg="); | ||
1310 | pr_cont_cgroup_path(memcg->css.cgroup); | ||
1311 | } else | ||
1312 | pr_cont(",global_oom"); | ||
1310 | if (p) { | 1313 | if (p) { |
1311 | pr_info("Task in "); | 1314 | pr_cont(",task_memcg="); |
1312 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 1315 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); |
1313 | pr_cont(" killed as a result of limit of "); | ||
1314 | } else { | ||
1315 | pr_info("Memory limit reached of cgroup "); | ||
1316 | } | 1316 | } |
1317 | |||
1318 | pr_cont_cgroup_path(memcg->css.cgroup); | ||
1319 | pr_cont("\n"); | ||
1320 | |||
1321 | rcu_read_unlock(); | 1317 | rcu_read_unlock(); |
1318 | } | ||
1319 | |||
1320 | /** | ||
1321 | * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to | ||
1322 | * memory controller. | ||
1323 | * @memcg: The memory cgroup that went over limit | ||
1324 | */ | ||
1325 | void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) | ||
1326 | { | ||
1327 | struct mem_cgroup *iter; | ||
1328 | unsigned int i; | ||
1322 | 1329 | ||
1323 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", | 1330 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", |
1324 | K((u64)page_counter_read(&memcg->memory)), | 1331 | K((u64)page_counter_read(&memcg->memory)), |
@@ -1666,6 +1673,9 @@ enum oom_status { | |||
1666 | 1673 | ||
1667 | static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 1674 | static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
1668 | { | 1675 | { |
1676 | enum oom_status ret; | ||
1677 | bool locked; | ||
1678 | |||
1669 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 1679 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1670 | return OOM_SKIPPED; | 1680 | return OOM_SKIPPED; |
1671 | 1681 | ||
@@ -1700,10 +1710,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int | |||
1700 | return OOM_ASYNC; | 1710 | return OOM_ASYNC; |
1701 | } | 1711 | } |
1702 | 1712 | ||
1713 | mem_cgroup_mark_under_oom(memcg); | ||
1714 | |||
1715 | locked = mem_cgroup_oom_trylock(memcg); | ||
1716 | |||
1717 | if (locked) | ||
1718 | mem_cgroup_oom_notify(memcg); | ||
1719 | |||
1720 | mem_cgroup_unmark_under_oom(memcg); | ||
1703 | if (mem_cgroup_out_of_memory(memcg, mask, order)) | 1721 | if (mem_cgroup_out_of_memory(memcg, mask, order)) |
1704 | return OOM_SUCCESS; | 1722 | ret = OOM_SUCCESS; |
1723 | else | ||
1724 | ret = OOM_FAILED; | ||
1705 | 1725 | ||
1706 | return OOM_FAILED; | 1726 | if (locked) |
1727 | mem_cgroup_oom_unlock(memcg); | ||
1728 | |||
1729 | return ret; | ||
1707 | } | 1730 | } |
1708 | 1731 | ||
1709 | /** | 1732 | /** |
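mem_cgroup_print_oom_info() is split above into a context part, emitted with pr_cont() onto whatever summary line the OOM killer has already started, and a separate meminfo part. A hedged sketch of how a reporting path would use the pair; the surrounding function and the text of the summary line are placeholders, not the kernel's actual dump_header():

#include <linux/memcontrol.h>
#include <linux/oom.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Placeholder reporting helper. */
static void report_oom(struct oom_control *oc, struct task_struct *victim)
{
        /*
         * Appends ",oom_memcg=<path>" or ",global_oom", plus ",task_memcg=<path>"
         * when a victim is known, to the line started here.
         */
        pr_info("out of memory");
        mem_cgroup_print_oom_context(oc->memcg, victim);
        pr_cont("\n");

        /* Usage, limits, failcnt and per-memcg statistics. */
        if (oc->memcg)
                mem_cgroup_print_oom_meminfo(oc->memcg);
}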
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7c72f2a95785..6379fff1a5ff 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
966 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 966 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
967 | struct address_space *mapping; | 967 | struct address_space *mapping; |
968 | LIST_HEAD(tokill); | 968 | LIST_HEAD(tokill); |
969 | bool unmap_success; | 969 | bool unmap_success = true; |
970 | int kill = 1, forcekill; | 970 | int kill = 1, forcekill; |
971 | struct page *hpage = *hpagep; | 971 | struct page *hpage = *hpagep; |
972 | bool mlocked = PageMlocked(hpage); | 972 | bool mlocked = PageMlocked(hpage); |
@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
1028 | if (kill) | 1028 | if (kill) |
1029 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); | 1029 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); |
1030 | 1030 | ||
1031 | unmap_success = try_to_unmap(hpage, ttu); | 1031 | if (!PageHuge(hpage)) { |
1032 | unmap_success = try_to_unmap(hpage, ttu); | ||
1033 | } else if (mapping) { | ||
1034 | /* | ||
1035 | * For hugetlb pages, try_to_unmap could potentially call | ||
1036 | * huge_pmd_unshare. Because of this, take semaphore in | ||
1037 | * write mode here and set TTU_RMAP_LOCKED to indicate we | ||
1038 | * have taken the lock at this higher level. | ||
1039 | */ | ||
1040 | i_mmap_lock_write(mapping); | ||
1041 | unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); | ||
1042 | i_mmap_unlock_write(mapping); | ||
1043 | } | ||
1032 | if (!unmap_success) | 1044 | if (!unmap_success) |
1033 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", | 1045 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", |
1034 | pfn, page_mapcount(hpage)); | 1046 | pfn, page_mapcount(hpage)); |
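The hugetlb branch added above follows a locking rule introduced elsewhere in this series: because try_to_unmap() can reach huge_pmd_unshare(), the caller takes i_mmap_rwsem in write mode and passes TTU_RMAP_LOCKED so the rmap walk does not take the lock again. The same pattern isolated into a helper (the helper itself is illustrative, not kernel code):

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>

/* Illustrative wrapper around the calls used in the hunk above. */
static bool unmap_hugetlb_page(struct page *hpage,
                               struct address_space *mapping,
                               enum ttu_flags ttu)
{
        bool unmap_success;

        i_mmap_lock_write(mapping);     /* covers a possible huge_pmd_unshare() */
        unmap_success = try_to_unmap(hpage, ttu | TTU_RMAP_LOCKED);
        i_mmap_unlock_write(mapping);

        return unmap_success;
}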
diff --git a/mm/memory.c b/mm/memory.c index 4ad2d293ddc2..2dd2f9ab57f4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
973 | unsigned long next; | 973 | unsigned long next; |
974 | unsigned long addr = vma->vm_start; | 974 | unsigned long addr = vma->vm_start; |
975 | unsigned long end = vma->vm_end; | 975 | unsigned long end = vma->vm_end; |
976 | unsigned long mmun_start; /* For mmu_notifiers */ | 976 | struct mmu_notifier_range range; |
977 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
978 | bool is_cow; | 977 | bool is_cow; |
979 | int ret; | 978 | int ret; |
980 | 979 | ||
@@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1008 | * is_cow_mapping() returns true. | 1007 | * is_cow_mapping() returns true. |
1009 | */ | 1008 | */ |
1010 | is_cow = is_cow_mapping(vma->vm_flags); | 1009 | is_cow = is_cow_mapping(vma->vm_flags); |
1011 | mmun_start = addr; | 1010 | |
1012 | mmun_end = end; | 1011 | if (is_cow) { |
1013 | if (is_cow) | 1012 | mmu_notifier_range_init(&range, src_mm, addr, end); |
1014 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | 1013 | mmu_notifier_invalidate_range_start(&range); |
1015 | mmun_end); | 1014 | } |
1016 | 1015 | ||
1017 | ret = 0; | 1016 | ret = 0; |
1018 | dst_pgd = pgd_offset(dst_mm, addr); | 1017 | dst_pgd = pgd_offset(dst_mm, addr); |
@@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1029 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1028 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1030 | 1029 | ||
1031 | if (is_cow) | 1030 | if (is_cow) |
1032 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); | 1031 | mmu_notifier_invalidate_range_end(&range); |
1033 | return ret; | 1032 | return ret; |
1034 | } | 1033 | } |
1035 | 1034 | ||
@@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1332 | struct vm_area_struct *vma, unsigned long start_addr, | 1331 | struct vm_area_struct *vma, unsigned long start_addr, |
1333 | unsigned long end_addr) | 1332 | unsigned long end_addr) |
1334 | { | 1333 | { |
1335 | struct mm_struct *mm = vma->vm_mm; | 1334 | struct mmu_notifier_range range; |
1336 | 1335 | ||
1337 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1336 | mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); |
1337 | mmu_notifier_invalidate_range_start(&range); | ||
1338 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) | 1338 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) |
1339 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); | 1339 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); |
1340 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1340 | mmu_notifier_invalidate_range_end(&range); |
1341 | } | 1341 | } |
1342 | 1342 | ||
1343 | /** | 1343 | /** |
@@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1351 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, | 1351 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, |
1352 | unsigned long size) | 1352 | unsigned long size) |
1353 | { | 1353 | { |
1354 | struct mm_struct *mm = vma->vm_mm; | 1354 | struct mmu_notifier_range range; |
1355 | struct mmu_gather tlb; | 1355 | struct mmu_gather tlb; |
1356 | unsigned long end = start + size; | ||
1357 | 1356 | ||
1358 | lru_add_drain(); | 1357 | lru_add_drain(); |
1359 | tlb_gather_mmu(&tlb, mm, start, end); | 1358 | mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); |
1360 | update_hiwater_rss(mm); | 1359 | tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); |
1361 | mmu_notifier_invalidate_range_start(mm, start, end); | 1360 | update_hiwater_rss(vma->vm_mm); |
1362 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) | 1361 | mmu_notifier_invalidate_range_start(&range); |
1363 | unmap_single_vma(&tlb, vma, start, end, NULL); | 1362 | for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) |
1364 | mmu_notifier_invalidate_range_end(mm, start, end); | 1363 | unmap_single_vma(&tlb, vma, start, range.end, NULL); |
1365 | tlb_finish_mmu(&tlb, start, end); | 1364 | mmu_notifier_invalidate_range_end(&range); |
1365 | tlb_finish_mmu(&tlb, start, range.end); | ||
1366 | } | 1366 | } |
1367 | 1367 | ||
1368 | /** | 1368 | /** |
@@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
1377 | static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, | 1377 | static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, |
1378 | unsigned long size, struct zap_details *details) | 1378 | unsigned long size, struct zap_details *details) |
1379 | { | 1379 | { |
1380 | struct mm_struct *mm = vma->vm_mm; | 1380 | struct mmu_notifier_range range; |
1381 | struct mmu_gather tlb; | 1381 | struct mmu_gather tlb; |
1382 | unsigned long end = address + size; | ||
1383 | 1382 | ||
1384 | lru_add_drain(); | 1383 | lru_add_drain(); |
1385 | tlb_gather_mmu(&tlb, mm, address, end); | 1384 | mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); |
1386 | update_hiwater_rss(mm); | 1385 | tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); |
1387 | mmu_notifier_invalidate_range_start(mm, address, end); | 1386 | update_hiwater_rss(vma->vm_mm); |
1388 | unmap_single_vma(&tlb, vma, address, end, details); | 1387 | mmu_notifier_invalidate_range_start(&range); |
1389 | mmu_notifier_invalidate_range_end(mm, address, end); | 1388 | unmap_single_vma(&tlb, vma, address, range.end, details); |
1390 | tlb_finish_mmu(&tlb, address, end); | 1389 | mmu_notifier_invalidate_range_end(&range); |
1390 | tlb_finish_mmu(&tlb, address, range.end); | ||
1391 | } | 1391 | } |
1392 | 1392 | ||
1393 | /** | 1393 | /** |
@@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) | |||
2247 | struct page *new_page = NULL; | 2247 | struct page *new_page = NULL; |
2248 | pte_t entry; | 2248 | pte_t entry; |
2249 | int page_copied = 0; | 2249 | int page_copied = 0; |
2250 | const unsigned long mmun_start = vmf->address & PAGE_MASK; | ||
2251 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; | ||
2252 | struct mem_cgroup *memcg; | 2250 | struct mem_cgroup *memcg; |
2251 | struct mmu_notifier_range range; | ||
2253 | 2252 | ||
2254 | if (unlikely(anon_vma_prepare(vma))) | 2253 | if (unlikely(anon_vma_prepare(vma))) |
2255 | goto oom; | 2254 | goto oom; |
@@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) | |||
2272 | 2271 | ||
2273 | __SetPageUptodate(new_page); | 2272 | __SetPageUptodate(new_page); |
2274 | 2273 | ||
2275 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2274 | mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, |
2275 | (vmf->address & PAGE_MASK) + PAGE_SIZE); | ||
2276 | mmu_notifier_invalidate_range_start(&range); | ||
2276 | 2277 | ||
2277 | /* | 2278 | /* |
2278 | * Re-check the pte - we dropped the lock | 2279 | * Re-check the pte - we dropped the lock |
@@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) | |||
2349 | * No need to double call mmu_notifier->invalidate_range() callback as | 2350 | * No need to double call mmu_notifier->invalidate_range() callback as |
2350 | * the above ptep_clear_flush_notify() did already call it. | 2351 | * the above ptep_clear_flush_notify() did already call it. |
2351 | */ | 2352 | */ |
2352 | mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); | 2353 | mmu_notifier_invalidate_range_only_end(&range); |
2353 | if (old_page) { | 2354 | if (old_page) { |
2354 | /* | 2355 | /* |
2355 | * Don't let another task, with possibly unlocked vma, | 2356 | * Don't let another task, with possibly unlocked vma, |
@@ -3830,7 +3831,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |||
3830 | vmf.pud = pud_alloc(mm, p4d, address); | 3831 | vmf.pud = pud_alloc(mm, p4d, address); |
3831 | if (!vmf.pud) | 3832 | if (!vmf.pud) |
3832 | return VM_FAULT_OOM; | 3833 | return VM_FAULT_OOM; |
3833 | if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { | 3834 | if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { |
3834 | ret = create_huge_pud(&vmf); | 3835 | ret = create_huge_pud(&vmf); |
3835 | if (!(ret & VM_FAULT_FALLBACK)) | 3836 | if (!(ret & VM_FAULT_FALLBACK)) |
3836 | return ret; | 3837 | return ret; |
@@ -3856,7 +3857,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |||
3856 | vmf.pmd = pmd_alloc(mm, vmf.pud, address); | 3857 | vmf.pmd = pmd_alloc(mm, vmf.pud, address); |
3857 | if (!vmf.pmd) | 3858 | if (!vmf.pmd) |
3858 | return VM_FAULT_OOM; | 3859 | return VM_FAULT_OOM; |
3859 | if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { | 3860 | if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { |
3860 | ret = create_huge_pmd(&vmf); | 3861 | ret = create_huge_pmd(&vmf); |
3861 | if (!(ret & VM_FAULT_FALLBACK)) | 3862 | if (!(ret & VM_FAULT_FALLBACK)) |
3862 | return ret; | 3863 | return ret; |
@@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
4030 | #endif /* __PAGETABLE_PMD_FOLDED */ | 4031 | #endif /* __PAGETABLE_PMD_FOLDED */ |
4031 | 4032 | ||
4032 | static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, | 4033 | static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, |
4033 | unsigned long *start, unsigned long *end, | 4034 | struct mmu_notifier_range *range, |
4034 | pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) | 4035 | pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) |
4035 | { | 4036 | { |
4036 | pgd_t *pgd; | 4037 | pgd_t *pgd; |
@@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, | |||
4058 | if (!pmdpp) | 4059 | if (!pmdpp) |
4059 | goto out; | 4060 | goto out; |
4060 | 4061 | ||
4061 | if (start && end) { | 4062 | if (range) { |
4062 | *start = address & PMD_MASK; | 4063 | mmu_notifier_range_init(range, mm, address & PMD_MASK, |
4063 | *end = *start + PMD_SIZE; | 4064 | (address & PMD_MASK) + PMD_SIZE); |
4064 | mmu_notifier_invalidate_range_start(mm, *start, *end); | 4065 | mmu_notifier_invalidate_range_start(range); |
4065 | } | 4066 | } |
4066 | *ptlp = pmd_lock(mm, pmd); | 4067 | *ptlp = pmd_lock(mm, pmd); |
4067 | if (pmd_huge(*pmd)) { | 4068 | if (pmd_huge(*pmd)) { |
@@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, | |||
4069 | return 0; | 4070 | return 0; |
4070 | } | 4071 | } |
4071 | spin_unlock(*ptlp); | 4072 | spin_unlock(*ptlp); |
4072 | if (start && end) | 4073 | if (range) |
4073 | mmu_notifier_invalidate_range_end(mm, *start, *end); | 4074 | mmu_notifier_invalidate_range_end(range); |
4074 | } | 4075 | } |
4075 | 4076 | ||
4076 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 4077 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
4077 | goto out; | 4078 | goto out; |
4078 | 4079 | ||
4079 | if (start && end) { | 4080 | if (range) { |
4080 | *start = address & PAGE_MASK; | 4081 | range->start = address & PAGE_MASK; |
4081 | *end = *start + PAGE_SIZE; | 4082 | range->end = range->start + PAGE_SIZE; |
4082 | mmu_notifier_invalidate_range_start(mm, *start, *end); | 4083 | mmu_notifier_invalidate_range_start(range); |
4083 | } | 4084 | } |
4084 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); | 4085 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
4085 | if (!pte_present(*ptep)) | 4086 | if (!pte_present(*ptep)) |
@@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, | |||
4088 | return 0; | 4089 | return 0; |
4089 | unlock: | 4090 | unlock: |
4090 | pte_unmap_unlock(ptep, *ptlp); | 4091 | pte_unmap_unlock(ptep, *ptlp); |
4091 | if (start && end) | 4092 | if (range) |
4092 | mmu_notifier_invalidate_range_end(mm, *start, *end); | 4093 | mmu_notifier_invalidate_range_end(range); |
4093 | out: | 4094 | out: |
4094 | return -EINVAL; | 4095 | return -EINVAL; |
4095 | } | 4096 | } |
@@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, | |||
4101 | 4102 | ||
4102 | /* (void) is needed to make gcc happy */ | 4103 | /* (void) is needed to make gcc happy */ |
4103 | (void) __cond_lock(*ptlp, | 4104 | (void) __cond_lock(*ptlp, |
4104 | !(res = __follow_pte_pmd(mm, address, NULL, NULL, | 4105 | !(res = __follow_pte_pmd(mm, address, NULL, |
4105 | ptepp, NULL, ptlp))); | 4106 | ptepp, NULL, ptlp))); |
4106 | return res; | 4107 | return res; |
4107 | } | 4108 | } |
4108 | 4109 | ||
4109 | int follow_pte_pmd(struct mm_struct *mm, unsigned long address, | 4110 | int follow_pte_pmd(struct mm_struct *mm, unsigned long address, |
4110 | unsigned long *start, unsigned long *end, | 4111 | struct mmu_notifier_range *range, |
4111 | pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) | 4112 | pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) |
4112 | { | 4113 | { |
4113 | int res; | 4114 | int res; |
4114 | 4115 | ||
4115 | /* (void) is needed to make gcc happy */ | 4116 | /* (void) is needed to make gcc happy */ |
4116 | (void) __cond_lock(*ptlp, | 4117 | (void) __cond_lock(*ptlp, |
4117 | !(res = __follow_pte_pmd(mm, address, start, end, | 4118 | !(res = __follow_pte_pmd(mm, address, range, |
4118 | ptepp, pmdpp, ptlp))); | 4119 | ptepp, pmdpp, ptlp))); |
4119 | return res; | 4120 | return res; |
4120 | } | 4121 | } |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2b2b3ccbbfb5..b9a667d36c55 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/hugetlb.h> | 34 | #include <linux/hugetlb.h> |
35 | #include <linux/memblock.h> | 35 | #include <linux/memblock.h> |
36 | #include <linux/compaction.h> | 36 | #include <linux/compaction.h> |
37 | #include <linux/rmap.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
39 | 40 | ||
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, | |||
253 | if (pfn_valid(phys_start_pfn)) | 254 | if (pfn_valid(phys_start_pfn)) |
254 | return -EEXIST; | 255 | return -EEXIST; |
255 | 256 | ||
256 | ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap); | 257 | ret = sparse_add_one_section(nid, phys_start_pfn, altmap); |
257 | if (ret < 0) | 258 | if (ret < 0) |
258 | return ret; | 259 | return ret; |
259 | 260 | ||
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, | |||
743 | int nid = pgdat->node_id; | 744 | int nid = pgdat->node_id; |
744 | unsigned long flags; | 745 | unsigned long flags; |
745 | 746 | ||
746 | if (zone_is_empty(zone)) | ||
747 | init_currently_empty_zone(zone, start_pfn, nr_pages); | ||
748 | |||
749 | clear_zone_contiguous(zone); | 747 | clear_zone_contiguous(zone); |
750 | 748 | ||
751 | /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ | 749 | /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ |
752 | pgdat_resize_lock(pgdat, &flags); | 750 | pgdat_resize_lock(pgdat, &flags); |
753 | zone_span_writelock(zone); | 751 | zone_span_writelock(zone); |
752 | if (zone_is_empty(zone)) | ||
753 | init_currently_empty_zone(zone, start_pfn, nr_pages); | ||
754 | resize_zone_range(zone, start_pfn, nr_pages); | 754 | resize_zone_range(zone, start_pfn, nr_pages); |
755 | zone_span_writeunlock(zone); | 755 | zone_span_writeunlock(zone); |
756 | resize_pgdat_range(pgdat, start_pfn, nr_pages); | 756 | resize_pgdat_range(pgdat, start_pfn, nr_pages); |
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) | |||
1078 | * | 1078 | * |
1079 | * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG | 1079 | * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG |
1080 | */ | 1080 | */ |
1081 | int __ref add_memory_resource(int nid, struct resource *res, bool online) | 1081 | int __ref add_memory_resource(int nid, struct resource *res) |
1082 | { | 1082 | { |
1083 | u64 start, size; | 1083 | u64 start, size; |
1084 | bool new_node = false; | 1084 | bool new_node = false; |
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) | |||
1133 | mem_hotplug_done(); | 1133 | mem_hotplug_done(); |
1134 | 1134 | ||
1135 | /* online pages if requested */ | 1135 | /* online pages if requested */ |
1136 | if (online) | 1136 | if (memhp_auto_online) |
1137 | walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), | 1137 | walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), |
1138 | NULL, online_memory_block); | 1138 | NULL, online_memory_block); |
1139 | 1139 | ||
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) | |||
1157 | if (IS_ERR(res)) | 1157 | if (IS_ERR(res)) |
1158 | return PTR_ERR(res); | 1158 | return PTR_ERR(res); |
1159 | 1159 | ||
1160 | ret = add_memory_resource(nid, res, memhp_auto_online); | 1160 | ret = add_memory_resource(nid, res); |
1161 | if (ret < 0) | 1161 | if (ret < 0) |
1162 | release_memory_resource(res); | 1162 | release_memory_resource(res); |
1163 | return ret; | 1163 | return ret; |
@@ -1226,7 +1226,7 @@ static bool is_pageblock_removable_nolock(struct page *page) | |||
1226 | if (!zone_spans_pfn(zone, pfn)) | 1226 | if (!zone_spans_pfn(zone, pfn)) |
1227 | return false; | 1227 | return false; |
1228 | 1228 | ||
1229 | return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); | 1229 | return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON); |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | /* Checks if this range of memory is likely to be hot-removable. */ | 1232 | /* Checks if this range of memory is likely to be hot-removable. */ |
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private) | |||
1339 | return new_page_nodemask(page, nid, &nmask); | 1339 | return new_page_nodemask(page, nid, &nmask); |
1340 | } | 1340 | } |
1341 | 1341 | ||
1342 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | ||
1343 | static int | 1342 | static int |
1344 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 1343 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
1345 | { | 1344 | { |
1346 | unsigned long pfn; | 1345 | unsigned long pfn; |
1347 | struct page *page; | 1346 | struct page *page; |
1348 | int move_pages = NR_OFFLINE_AT_ONCE_PAGES; | ||
1349 | int not_managed = 0; | 1347 | int not_managed = 0; |
1350 | int ret = 0; | 1348 | int ret = 0; |
1351 | LIST_HEAD(source); | 1349 | LIST_HEAD(source); |
1352 | 1350 | ||
1353 | for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { | 1351 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1354 | if (!pfn_valid(pfn)) | 1352 | if (!pfn_valid(pfn)) |
1355 | continue; | 1353 | continue; |
1356 | page = pfn_to_page(pfn); | 1354 | page = pfn_to_page(pfn); |
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1362 | ret = -EBUSY; | 1360 | ret = -EBUSY; |
1363 | break; | 1361 | break; |
1364 | } | 1362 | } |
1365 | if (isolate_huge_page(page, &source)) | 1363 | isolate_huge_page(page, &source); |
1366 | move_pages -= 1 << compound_order(head); | ||
1367 | continue; | 1364 | continue; |
1368 | } else if (PageTransHuge(page)) | 1365 | } else if (PageTransHuge(page)) |
1369 | pfn = page_to_pfn(compound_head(page)) | 1366 | pfn = page_to_pfn(compound_head(page)) |
1370 | + hpage_nr_pages(page) - 1; | 1367 | + hpage_nr_pages(page) - 1; |
1371 | 1368 | ||
1369 | /* | ||
1370 | * HWPoison pages have elevated reference counts so the migration would | ||
1371 | * fail on them. It also doesn't make any sense to migrate them in the | ||
1372 | * first place. Still try to unmap such a page in case it is still mapped | ||
1373 | * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep | ||
1374 | * the unmap as the catch all safety net). | ||
1375 | */ | ||
1376 | if (PageHWPoison(page)) { | ||
1377 | if (WARN_ON(PageLRU(page))) | ||
1378 | isolate_lru_page(page); | ||
1379 | if (page_mapped(page)) | ||
1380 | try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); | ||
1381 | continue; | ||
1382 | } | ||
1383 | |||
1372 | if (!get_page_unless_zero(page)) | 1384 | if (!get_page_unless_zero(page)) |
1373 | continue; | 1385 | continue; |
1374 | /* | 1386 | /* |
@@ -1382,16 +1394,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1382 | if (!ret) { /* Success */ | 1394 | if (!ret) { /* Success */ |
1383 | put_page(page); | 1395 | put_page(page); |
1384 | list_add_tail(&page->lru, &source); | 1396 | list_add_tail(&page->lru, &source); |
1385 | move_pages--; | ||
1386 | if (!__PageMovable(page)) | 1397 | if (!__PageMovable(page)) |
1387 | inc_node_page_state(page, NR_ISOLATED_ANON + | 1398 | inc_node_page_state(page, NR_ISOLATED_ANON + |
1388 | page_is_file_cache(page)); | 1399 | page_is_file_cache(page)); |
1389 | 1400 | ||
1390 | } else { | 1401 | } else { |
1391 | #ifdef CONFIG_DEBUG_VM | 1402 | pr_warn("failed to isolate pfn %lx\n", pfn); |
1392 | pr_alert("failed to isolate pfn %lx\n", pfn); | ||
1393 | dump_page(page, "isolation failed"); | 1403 | dump_page(page, "isolation failed"); |
1394 | #endif | ||
1395 | put_page(page); | 1404 | put_page(page); |
1396 | /* Because we don't have big zone->lock. we should | 1405 | /* Because we don't have big zone->lock. we should |
1397 | check this again here. */ | 1406 | check this again here. */ |
@@ -1411,8 +1420,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1411 | /* Allocate a new page from the nearest neighbor node */ | 1420 | /* Allocate a new page from the nearest neighbor node */ |
1412 | ret = migrate_pages(&source, new_node_page, NULL, 0, | 1421 | ret = migrate_pages(&source, new_node_page, NULL, 0, |
1413 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1422 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1414 | if (ret) | 1423 | if (ret) { |
1424 | list_for_each_entry(page, &source, lru) { | ||
1425 | pr_warn("migrating pfn %lx failed ret:%d ", | ||
1426 | page_to_pfn(page), ret); | ||
1427 | dump_page(page, "migration failure"); | ||
1428 | } | ||
1415 | putback_movable_pages(&source); | 1429 | putback_movable_pages(&source); |
1430 | } | ||
1416 | } | 1431 | } |
1417 | out: | 1432 | out: |
1418 | return ret; | 1433 | return ret; |
@@ -1553,12 +1568,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1553 | unsigned long valid_start, valid_end; | 1568 | unsigned long valid_start, valid_end; |
1554 | struct zone *zone; | 1569 | struct zone *zone; |
1555 | struct memory_notify arg; | 1570 | struct memory_notify arg; |
1556 | 1571 | char *reason; | |
1557 | /* at least, alignment against pageblock is necessary */ | ||
1558 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | ||
1559 | return -EINVAL; | ||
1560 | if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) | ||
1561 | return -EINVAL; | ||
1562 | 1572 | ||
1563 | mem_hotplug_begin(); | 1573 | mem_hotplug_begin(); |
1564 | 1574 | ||
@@ -1567,7 +1577,9 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1567 | if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, | 1577 | if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, |
1568 | &valid_end)) { | 1578 | &valid_end)) { |
1569 | mem_hotplug_done(); | 1579 | mem_hotplug_done(); |
1570 | return -EINVAL; | 1580 | ret = -EINVAL; |
1581 | reason = "multizone range"; | ||
1582 | goto failed_removal; | ||
1571 | } | 1583 | } |
1572 | 1584 | ||
1573 | zone = page_zone(pfn_to_page(valid_start)); | 1585 | zone = page_zone(pfn_to_page(valid_start)); |
@@ -1576,10 +1588,12 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1576 | 1588 | ||
1577 | /* set above range as isolated */ | 1589 | /* set above range as isolated */ |
1578 | ret = start_isolate_page_range(start_pfn, end_pfn, | 1590 | ret = start_isolate_page_range(start_pfn, end_pfn, |
1579 | MIGRATE_MOVABLE, true); | 1591 | MIGRATE_MOVABLE, |
1592 | SKIP_HWPOISON | REPORT_FAILURE); | ||
1580 | if (ret) { | 1593 | if (ret) { |
1581 | mem_hotplug_done(); | 1594 | mem_hotplug_done(); |
1582 | return ret; | 1595 | reason = "failure to isolate range"; |
1596 | goto failed_removal; | ||
1583 | } | 1597 | } |
1584 | 1598 | ||
1585 | arg.start_pfn = start_pfn; | 1599 | arg.start_pfn = start_pfn; |
@@ -1588,37 +1602,47 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1588 | 1602 | ||
1589 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); | 1603 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); |
1590 | ret = notifier_to_errno(ret); | 1604 | ret = notifier_to_errno(ret); |
1591 | if (ret) | 1605 | if (ret) { |
1592 | goto failed_removal; | 1606 | reason = "notifier failure"; |
1607 | goto failed_removal_isolated; | ||
1608 | } | ||
1593 | 1609 | ||
1594 | pfn = start_pfn; | 1610 | do { |
1595 | repeat: | 1611 | for (pfn = start_pfn; pfn;) { |
1596 | /* start memory hot removal */ | 1612 | if (signal_pending(current)) { |
1597 | ret = -EINTR; | 1613 | ret = -EINTR; |
1598 | if (signal_pending(current)) | 1614 | reason = "signal backoff"; |
1599 | goto failed_removal; | 1615 | goto failed_removal_isolated; |
1616 | } | ||
1600 | 1617 | ||
1601 | cond_resched(); | 1618 | cond_resched(); |
1602 | lru_add_drain_all(); | 1619 | lru_add_drain_all(); |
1603 | drain_all_pages(zone); | 1620 | drain_all_pages(zone); |
1621 | |||
1622 | pfn = scan_movable_pages(pfn, end_pfn); | ||
1623 | if (pfn) { | ||
1624 | /* | ||
1625 | * TODO: fatal migration failures should bail | ||
1626 | * out | ||
1627 | */ | ||
1628 | do_migrate_range(pfn, end_pfn); | ||
1629 | } | ||
1630 | } | ||
1604 | 1631 | ||
1605 | pfn = scan_movable_pages(start_pfn, end_pfn); | 1632 | /* |
1606 | if (pfn) { /* We have movable pages */ | 1633 | * Dissolve free hugepages in the memory block before doing |
1607 | ret = do_migrate_range(pfn, end_pfn); | 1634 | * offlining actually in order to make hugetlbfs's object |
1608 | goto repeat; | 1635 | * counting consistent. |
1609 | } | 1636 | */ |
1637 | ret = dissolve_free_huge_pages(start_pfn, end_pfn); | ||
1638 | if (ret) { | ||
1639 | reason = "failure to dissolve huge pages"; | ||
1640 | goto failed_removal_isolated; | ||
1641 | } | ||
1642 | /* check again */ | ||
1643 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
1644 | } while (offlined_pages < 0); | ||
1610 | 1645 | ||
1611 | /* | ||
1612 | * dissolve free hugepages in the memory block before doing offlining | ||
1613 | * actually in order to make hugetlbfs's object counting consistent. | ||
1614 | */ | ||
1615 | ret = dissolve_free_huge_pages(start_pfn, end_pfn); | ||
1616 | if (ret) | ||
1617 | goto failed_removal; | ||
1618 | /* check again */ | ||
1619 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
1620 | if (offlined_pages < 0) | ||
1621 | goto repeat; | ||
1622 | pr_info("Offlined Pages %ld\n", offlined_pages); | 1646 | pr_info("Offlined Pages %ld\n", offlined_pages); |
1623 | /* Ok, all of our target is isolated. | 1647 | /* Ok, all of our target is isolated. |
1624 | We cannot do rollback at this point. */ | 1648 | We cannot do rollback at this point. */ |
@@ -1654,13 +1678,15 @@ repeat: | |||
1654 | mem_hotplug_done(); | 1678 | mem_hotplug_done(); |
1655 | return 0; | 1679 | return 0; |
1656 | 1680 | ||
1681 | failed_removal_isolated: | ||
1682 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | ||
1657 | failed_removal: | 1683 | failed_removal: |
1658 | pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", | 1684 | pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", |
1659 | (unsigned long long) start_pfn << PAGE_SHIFT, | 1685 | (unsigned long long) start_pfn << PAGE_SHIFT, |
1660 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); | 1686 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, |
1687 | reason); | ||
1661 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 1688 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
1662 | /* pushback to free area */ | 1689 | /* pushback to free area */ |
1663 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | ||
1664 | mem_hotplug_done(); | 1690 | mem_hotplug_done(); |
1665 | return ret; | 1691 | return ret; |
1666 | } | 1692 | } |
@@ -1753,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat) | |||
1753 | return 0; | 1779 | return 0; |
1754 | } | 1780 | } |
1755 | 1781 | ||
1756 | static void unmap_cpu_on_node(pg_data_t *pgdat) | ||
1757 | { | ||
1758 | #ifdef CONFIG_ACPI_NUMA | ||
1759 | int cpu; | ||
1760 | |||
1761 | for_each_possible_cpu(cpu) | ||
1762 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
1763 | numa_clear_node(cpu); | ||
1764 | #endif | ||
1765 | } | ||
1766 | |||
1767 | static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) | ||
1768 | { | ||
1769 | int ret; | ||
1770 | |||
1771 | ret = check_cpu_on_node(pgdat); | ||
1772 | if (ret) | ||
1773 | return ret; | ||
1774 | |||
1775 | /* | ||
1776 | * the node will be offlined when we come here, so we can clear | ||
1777 | * the cpu_to_node() now. | ||
1778 | */ | ||
1779 | |||
1780 | unmap_cpu_on_node(pgdat); | ||
1781 | return 0; | ||
1782 | } | ||
1783 | |||
1784 | /** | 1782 | /** |
1785 | * try_offline_node | 1783 | * try_offline_node |
1786 | * @nid: the node ID | 1784 | * @nid: the node ID |
@@ -1813,7 +1811,7 @@ void try_offline_node(int nid) | |||
1813 | return; | 1811 | return; |
1814 | } | 1812 | } |
1815 | 1813 | ||
1816 | if (check_and_unmap_cpu_on_node(pgdat)) | 1814 | if (check_cpu_on_node(pgdat)) |
1817 | return; | 1815 | return; |
1818 | 1816 | ||
1819 | /* | 1817 | /* |
@@ -1858,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) | |||
1858 | memblock_free(start, size); | 1856 | memblock_free(start, size); |
1859 | memblock_remove(start, size); | 1857 | memblock_remove(start, size); |
1860 | 1858 | ||
1861 | arch_remove_memory(start, size, NULL); | 1859 | arch_remove_memory(nid, start, size, NULL); |
1862 | 1860 | ||
1863 | try_offline_node(nid); | 1861 | try_offline_node(nid); |
1864 | 1862 | ||
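Note: the reworked failure paths above thread a "reason" string down to the final pr_debug(), and only the paths reached after isolation succeeded also unwind the isolation. A minimal sketch of that shape (abridged and renamed, not the full __offline_pages(); only calls visible in the hunks above are used):

    /*
     * Sketch: each failure site records why it failed; failed_removal_isolated
     * additionally undoes the page-range isolation before the common error
     * reporting that names the reason.
     */
    static int offline_sketch(unsigned long start_pfn, unsigned long end_pfn)
    {
            const char *reason;
            long offlined_pages;
            int ret;

            ret = dissolve_free_huge_pages(start_pfn, end_pfn);
            if (ret) {
                    reason = "failure to dissolve huge pages";
                    goto failed_removal_isolated;
            }
            do {
                    offlined_pages = check_pages_isolated(start_pfn, end_pfn);
            } while (offlined_pages < 0);
            pr_info("Offlined Pages %ld\n", offlined_pages);
            return 0;

    failed_removal_isolated:
            undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
            pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
                     (unsigned long long)start_pfn << PAGE_SHIFT,
                     ((unsigned long long)end_pfn << PAGE_SHIFT) - 1, reason);
            return ret;
    }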
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..5d1839a9148d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -327,16 +327,13 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | |||
327 | 327 | ||
328 | /* | 328 | /* |
329 | * Once page cache replacement of page migration started, page_count | 329 | * Once page cache replacement of page migration started, page_count |
330 | * *must* be zero. And, we don't want to call wait_on_page_locked() | 330 | * is zero; but we must not call put_and_wait_on_page_locked() without |
331 | * against a page without get_page(). | 331 | * a ref. Use get_page_unless_zero(), and just fault again if it fails. |
332 | * So, we use get_page_unless_zero(), here. Even failed, page fault | ||
333 | * will occur again. | ||
334 | */ | 332 | */ |
335 | if (!get_page_unless_zero(page)) | 333 | if (!get_page_unless_zero(page)) |
336 | goto out; | 334 | goto out; |
337 | pte_unmap_unlock(ptep, ptl); | 335 | pte_unmap_unlock(ptep, ptl); |
338 | wait_on_page_locked(page); | 336 | put_and_wait_on_page_locked(page); |
339 | put_page(page); | ||
340 | return; | 337 | return; |
341 | out: | 338 | out: |
342 | pte_unmap_unlock(ptep, ptl); | 339 | pte_unmap_unlock(ptep, ptl); |
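Note: put_and_wait_on_page_locked(), introduced elsewhere in this series (see the mm/filemap.c changes in the diffstat), consumes the reference it is handed, so the old get/wait/put sequence collapses. A caller-side sketch using only what the hunk above shows:

    /* Old pattern: hold a reference across the wait, drop it afterwards. */
    if (!get_page_unless_zero(page))
            return;                 /* page already gone; the fault retries */
    wait_on_page_locked(page);
    put_page(page);

    /* New pattern: the helper waits for the page lock to clear and drops
     * the reference itself, so the caller never touches the page again. */
    if (!get_page_unless_zero(page))
            return;
    put_and_wait_on_page_locked(page);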
@@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) | |||
370 | if (!get_page_unless_zero(page)) | 367 | if (!get_page_unless_zero(page)) |
371 | goto unlock; | 368 | goto unlock; |
372 | spin_unlock(ptl); | 369 | spin_unlock(ptl); |
373 | wait_on_page_locked(page); | 370 | put_and_wait_on_page_locked(page); |
374 | put_page(page); | ||
375 | return; | 371 | return; |
376 | unlock: | 372 | unlock: |
377 | spin_unlock(ptl); | 373 | spin_unlock(ptl); |
378 | } | 374 | } |
379 | #endif | 375 | #endif |
380 | 376 | ||
381 | #ifdef CONFIG_BLOCK | 377 | static int expected_page_refs(struct page *page) |
382 | /* Returns true if all buffers are successfully locked */ | ||
383 | static bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
384 | enum migrate_mode mode) | ||
385 | { | 378 | { |
386 | struct buffer_head *bh = head; | 379 | int expected_count = 1; |
387 | |||
388 | /* Simple case, sync compaction */ | ||
389 | if (mode != MIGRATE_ASYNC) { | ||
390 | do { | ||
391 | get_bh(bh); | ||
392 | lock_buffer(bh); | ||
393 | bh = bh->b_this_page; | ||
394 | |||
395 | } while (bh != head); | ||
396 | 380 | ||
397 | return true; | 381 | /* |
398 | } | 382 | * Device public or private pages have an extra refcount as they are |
399 | 383 | * ZONE_DEVICE pages. | |
400 | /* async case, we cannot block on lock_buffer so use trylock_buffer */ | 384 | */ |
401 | do { | 385 | expected_count += is_device_private_page(page); |
402 | get_bh(bh); | 386 | expected_count += is_device_public_page(page); |
403 | if (!trylock_buffer(bh)) { | 387 | if (page_mapping(page)) |
404 | /* | 388 | expected_count += hpage_nr_pages(page) + page_has_private(page); |
405 | * We failed to lock the buffer and cannot stall in | ||
406 | * async migration. Release the taken locks | ||
407 | */ | ||
408 | struct buffer_head *failed_bh = bh; | ||
409 | put_bh(failed_bh); | ||
410 | bh = head; | ||
411 | while (bh != failed_bh) { | ||
412 | unlock_buffer(bh); | ||
413 | put_bh(bh); | ||
414 | bh = bh->b_this_page; | ||
415 | } | ||
416 | return false; | ||
417 | } | ||
418 | 389 | ||
419 | bh = bh->b_this_page; | 390 | return expected_count; |
420 | } while (bh != head); | ||
421 | return true; | ||
422 | } | ||
423 | #else | ||
424 | static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
425 | enum migrate_mode mode) | ||
426 | { | ||
427 | return true; | ||
428 | } | 391 | } |
429 | #endif /* CONFIG_BLOCK */ | ||
430 | 392 | ||
431 | /* | 393 | /* |
432 | * Replace the page in the mapping. | 394 | * Replace the page in the mapping. |
@@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | |||
437 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. | 399 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
438 | */ | 400 | */ |
439 | int migrate_page_move_mapping(struct address_space *mapping, | 401 | int migrate_page_move_mapping(struct address_space *mapping, |
440 | struct page *newpage, struct page *page, | 402 | struct page *newpage, struct page *page, enum migrate_mode mode, |
441 | struct buffer_head *head, enum migrate_mode mode, | ||
442 | int extra_count) | 403 | int extra_count) |
443 | { | 404 | { |
444 | XA_STATE(xas, &mapping->i_pages, page_index(page)); | 405 | XA_STATE(xas, &mapping->i_pages, page_index(page)); |
445 | struct zone *oldzone, *newzone; | 406 | struct zone *oldzone, *newzone; |
446 | int dirty; | 407 | int dirty; |
447 | int expected_count = 1 + extra_count; | 408 | int expected_count = expected_page_refs(page) + extra_count; |
448 | |||
449 | /* | ||
450 | * Device public or private pages have an extra refcount as they are | ||
451 | * ZONE_DEVICE pages. | ||
452 | */ | ||
453 | expected_count += is_device_private_page(page); | ||
454 | expected_count += is_device_public_page(page); | ||
455 | 409 | ||
456 | if (!mapping) { | 410 | if (!mapping) { |
457 | /* Anonymous page without mapping */ | 411 | /* Anonymous page without mapping */ |
@@ -471,8 +425,6 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
471 | newzone = page_zone(newpage); | 425 | newzone = page_zone(newpage); |
472 | 426 | ||
473 | xas_lock_irq(&xas); | 427 | xas_lock_irq(&xas); |
474 | |||
475 | expected_count += hpage_nr_pages(page) + page_has_private(page); | ||
476 | if (page_count(page) != expected_count || xas_load(&xas) != page) { | 428 | if (page_count(page) != expected_count || xas_load(&xas) != page) { |
477 | xas_unlock_irq(&xas); | 429 | xas_unlock_irq(&xas); |
478 | return -EAGAIN; | 430 | return -EAGAIN; |
@@ -484,20 +436,6 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
484 | } | 436 | } |
485 | 437 | ||
486 | /* | 438 | /* |
487 | * In the async migration case of moving a page with buffers, lock the | ||
488 | * buffers using trylock before the mapping is moved. If the mapping | ||
489 | * was moved, we later failed to lock the buffers and could not move | ||
490 | * the mapping back due to an elevated page count, we would have to | ||
491 | * block waiting on other references to be dropped. | ||
492 | */ | ||
493 | if (mode == MIGRATE_ASYNC && head && | ||
494 | !buffer_migrate_lock_buffers(head, mode)) { | ||
495 | page_ref_unfreeze(page, expected_count); | ||
496 | xas_unlock_irq(&xas); | ||
497 | return -EAGAIN; | ||
498 | } | ||
499 | |||
500 | /* | ||
501 | * Now we know that no one else is looking at the page: | 439 | * Now we know that no one else is looking at the page: |
502 | * no turning back from here. | 440 | * no turning back from here. |
503 | */ | 441 | */ |
@@ -748,7 +686,7 @@ int migrate_page(struct address_space *mapping, | |||
748 | 686 | ||
749 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 687 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
750 | 688 | ||
751 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); | 689 | rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); |
752 | 690 | ||
753 | if (rc != MIGRATEPAGE_SUCCESS) | 691 | if (rc != MIGRATEPAGE_SUCCESS) |
754 | return rc; | 692 | return rc; |
@@ -762,34 +700,98 @@ int migrate_page(struct address_space *mapping, | |||
762 | EXPORT_SYMBOL(migrate_page); | 700 | EXPORT_SYMBOL(migrate_page); |
763 | 701 | ||
764 | #ifdef CONFIG_BLOCK | 702 | #ifdef CONFIG_BLOCK |
765 | /* | 703 | /* Returns true if all buffers are successfully locked */ |
766 | * Migration function for pages with buffers. This function can only be used | 704 | static bool buffer_migrate_lock_buffers(struct buffer_head *head, |
767 | * if the underlying filesystem guarantees that no other references to "page" | 705 | enum migrate_mode mode) |
768 | * exist. | 706 | { |
769 | */ | 707 | struct buffer_head *bh = head; |
770 | int buffer_migrate_page(struct address_space *mapping, | 708 | |
771 | struct page *newpage, struct page *page, enum migrate_mode mode) | 709 | /* Simple case, sync compaction */ |
710 | if (mode != MIGRATE_ASYNC) { | ||
711 | do { | ||
712 | get_bh(bh); | ||
713 | lock_buffer(bh); | ||
714 | bh = bh->b_this_page; | ||
715 | |||
716 | } while (bh != head); | ||
717 | |||
718 | return true; | ||
719 | } | ||
720 | |||
721 | /* async case, we cannot block on lock_buffer so use trylock_buffer */ | ||
722 | do { | ||
723 | get_bh(bh); | ||
724 | if (!trylock_buffer(bh)) { | ||
725 | /* | ||
726 | * We failed to lock the buffer and cannot stall in | ||
727 | * async migration. Release the taken locks | ||
728 | */ | ||
729 | struct buffer_head *failed_bh = bh; | ||
730 | put_bh(failed_bh); | ||
731 | bh = head; | ||
732 | while (bh != failed_bh) { | ||
733 | unlock_buffer(bh); | ||
734 | put_bh(bh); | ||
735 | bh = bh->b_this_page; | ||
736 | } | ||
737 | return false; | ||
738 | } | ||
739 | |||
740 | bh = bh->b_this_page; | ||
741 | } while (bh != head); | ||
742 | return true; | ||
743 | } | ||
744 | |||
745 | static int __buffer_migrate_page(struct address_space *mapping, | ||
746 | struct page *newpage, struct page *page, enum migrate_mode mode, | ||
747 | bool check_refs) | ||
772 | { | 748 | { |
773 | struct buffer_head *bh, *head; | 749 | struct buffer_head *bh, *head; |
774 | int rc; | 750 | int rc; |
751 | int expected_count; | ||
775 | 752 | ||
776 | if (!page_has_buffers(page)) | 753 | if (!page_has_buffers(page)) |
777 | return migrate_page(mapping, newpage, page, mode); | 754 | return migrate_page(mapping, newpage, page, mode); |
778 | 755 | ||
756 | /* Check whether page does not have extra refs before we do more work */ | ||
757 | expected_count = expected_page_refs(page); | ||
758 | if (page_count(page) != expected_count) | ||
759 | return -EAGAIN; | ||
760 | |||
779 | head = page_buffers(page); | 761 | head = page_buffers(page); |
762 | if (!buffer_migrate_lock_buffers(head, mode)) | ||
763 | return -EAGAIN; | ||
780 | 764 | ||
781 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); | 765 | if (check_refs) { |
766 | bool busy; | ||
767 | bool invalidated = false; | ||
782 | 768 | ||
783 | if (rc != MIGRATEPAGE_SUCCESS) | 769 | recheck_buffers: |
784 | return rc; | 770 | busy = false; |
771 | spin_lock(&mapping->private_lock); | ||
772 | bh = head; | ||
773 | do { | ||
774 | if (atomic_read(&bh->b_count)) { | ||
775 | busy = true; | ||
776 | break; | ||
777 | } | ||
778 | bh = bh->b_this_page; | ||
779 | } while (bh != head); | ||
780 | spin_unlock(&mapping->private_lock); | ||
781 | if (busy) { | ||
782 | if (invalidated) { | ||
783 | rc = -EAGAIN; | ||
784 | goto unlock_buffers; | ||
785 | } | ||
786 | invalidate_bh_lrus(); | ||
787 | invalidated = true; | ||
788 | goto recheck_buffers; | ||
789 | } | ||
790 | } | ||
785 | 791 | ||
786 | /* | 792 | rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); |
787 | * In the async case, migrate_page_move_mapping locked the buffers | 793 | if (rc != MIGRATEPAGE_SUCCESS) |
788 | * with an IRQ-safe spinlock held. In the sync case, the buffers | 794 | goto unlock_buffers; |
789 | * need to be locked now | ||
790 | */ | ||
791 | if (mode != MIGRATE_ASYNC) | ||
792 | BUG_ON(!buffer_migrate_lock_buffers(head, mode)); | ||
793 | 795 | ||
794 | ClearPagePrivate(page); | 796 | ClearPagePrivate(page); |
795 | set_page_private(newpage, page_private(page)); | 797 | set_page_private(newpage, page_private(page)); |
@@ -811,6 +813,8 @@ int buffer_migrate_page(struct address_space *mapping, | |||
811 | else | 813 | else |
812 | migrate_page_states(newpage, page); | 814 | migrate_page_states(newpage, page); |
813 | 815 | ||
816 | rc = MIGRATEPAGE_SUCCESS; | ||
817 | unlock_buffers: | ||
814 | bh = head; | 818 | bh = head; |
815 | do { | 819 | do { |
816 | unlock_buffer(bh); | 820 | unlock_buffer(bh); |
@@ -819,9 +823,32 @@ int buffer_migrate_page(struct address_space *mapping, | |||
819 | 823 | ||
820 | } while (bh != head); | 824 | } while (bh != head); |
821 | 825 | ||
822 | return MIGRATEPAGE_SUCCESS; | 826 | return rc; |
827 | } | ||
828 | |||
829 | /* | ||
830 | * Migration function for pages with buffers. This function can only be used | ||
831 | * if the underlying filesystem guarantees that no other references to "page" | ||
832 | * exist. For example attached buffer heads are accessed only under page lock. | ||
833 | */ | ||
834 | int buffer_migrate_page(struct address_space *mapping, | ||
835 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
836 | { | ||
837 | return __buffer_migrate_page(mapping, newpage, page, mode, false); | ||
823 | } | 838 | } |
824 | EXPORT_SYMBOL(buffer_migrate_page); | 839 | EXPORT_SYMBOL(buffer_migrate_page); |
840 | |||
841 | /* | ||
842 | * Same as above except that this variant is more careful and checks that there | ||
843 | * are also no buffer head references. This function is the right one for | ||
844 | * mappings where buffer heads are directly looked up and referenced (such as | ||
845 | * block device mappings). | ||
846 | */ | ||
847 | int buffer_migrate_page_norefs(struct address_space *mapping, | ||
848 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
849 | { | ||
850 | return __buffer_migrate_page(mapping, newpage, page, mode, true); | ||
851 | } | ||
825 | #endif | 852 | #endif |
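Note: buffer_migrate_page_norefs() is meant for mappings whose buffer heads are looked up and referenced directly; per the "blkdev: avoid migration stalls for blkdev pages" entry in this series, the block device mapping selects it roughly as sketched below (the aops instance is illustrative, not copied from fs/block_dev.c):

    /*
     * Sketch: the block device mapping opts into the stricter variant that
     * also rejects elevated buffer-head reference counts; ordinary
     * filesystems keep buffer_migrate_page().
     */
    static const struct address_space_operations example_blkdev_aops = {
            /* ... readpage/writepage and friends elided ... */
            .migratepage    = buffer_migrate_page_norefs,
    };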
826 | 853 | ||
827 | /* | 854 | /* |
@@ -1297,8 +1324,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1297 | goto put_anon; | 1324 | goto put_anon; |
1298 | 1325 | ||
1299 | if (page_mapped(hpage)) { | 1326 | if (page_mapped(hpage)) { |
1327 | struct address_space *mapping = page_mapping(hpage); | ||
1328 | |||
1329 | /* | ||
1330 | * try_to_unmap could potentially call huge_pmd_unshare. | ||
1331 | * Because of this, take semaphore in write mode here and | ||
1332 | * set TTU_RMAP_LOCKED to let lower levels know we have | ||
1333 | * taken the lock. | ||
1334 | */ | ||
1335 | i_mmap_lock_write(mapping); | ||
1300 | try_to_unmap(hpage, | 1336 | try_to_unmap(hpage, |
1301 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 1337 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| |
1338 | TTU_RMAP_LOCKED); | ||
1339 | i_mmap_unlock_write(mapping); | ||
1302 | page_was_mapped = 1; | 1340 | page_was_mapped = 1; |
1303 | } | 1341 | } |
1304 | 1342 | ||

@@ -2303,6 +2341,7 @@ next: | |||
2303 | */ | 2341 | */ |
2304 | static void migrate_vma_collect(struct migrate_vma *migrate) | 2342 | static void migrate_vma_collect(struct migrate_vma *migrate) |
2305 | { | 2343 | { |
2344 | struct mmu_notifier_range range; | ||
2306 | struct mm_walk mm_walk; | 2345 | struct mm_walk mm_walk; |
2307 | 2346 | ||
2308 | mm_walk.pmd_entry = migrate_vma_collect_pmd; | 2347 | mm_walk.pmd_entry = migrate_vma_collect_pmd; |
@@ -2314,13 +2353,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate) | |||
2314 | mm_walk.mm = migrate->vma->vm_mm; | 2353 | mm_walk.mm = migrate->vma->vm_mm; |
2315 | mm_walk.private = migrate; | 2354 | mm_walk.private = migrate; |
2316 | 2355 | ||
2317 | mmu_notifier_invalidate_range_start(mm_walk.mm, | 2356 | mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, |
2318 | migrate->start, | 2357 | migrate->end); |
2319 | migrate->end); | 2358 | mmu_notifier_invalidate_range_start(&range); |
2320 | walk_page_range(migrate->start, migrate->end, &mm_walk); | 2359 | walk_page_range(migrate->start, migrate->end, &mm_walk); |
2321 | mmu_notifier_invalidate_range_end(mm_walk.mm, | 2360 | mmu_notifier_invalidate_range_end(&range); |
2322 | migrate->start, | ||
2323 | migrate->end); | ||
2324 | 2361 | ||
2325 | migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); | 2362 | migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); |
2326 | } | 2363 | } |
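Note: the same conversion recurs throughout this merge (see mprotect.c, mremap.c and oom_kill.c below): callers build a struct mmu_notifier_range once on the stack and hand it to the start/end calls. The common caller-side shape, with mm/start/end standing in for whatever the call site already has:

    struct mmu_notifier_range range;

    mmu_notifier_range_init(&range, mm, start, end);
    mmu_notifier_invalidate_range_start(&range);
    /* ... mutate the page tables covering [start, end) ... */
    mmu_notifier_invalidate_range_end(&range);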
@@ -2701,9 +2738,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2701 | { | 2738 | { |
2702 | const unsigned long npages = migrate->npages; | 2739 | const unsigned long npages = migrate->npages; |
2703 | const unsigned long start = migrate->start; | 2740 | const unsigned long start = migrate->start; |
2704 | struct vm_area_struct *vma = migrate->vma; | 2741 | struct mmu_notifier_range range; |
2705 | struct mm_struct *mm = vma->vm_mm; | 2742 | unsigned long addr, i; |
2706 | unsigned long addr, i, mmu_start; | ||
2707 | bool notified = false; | 2743 | bool notified = false; |
2708 | 2744 | ||
2709 | for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { | 2745 | for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { |
@@ -2722,11 +2758,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2722 | continue; | 2758 | continue; |
2723 | } | 2759 | } |
2724 | if (!notified) { | 2760 | if (!notified) { |
2725 | mmu_start = addr; | ||
2726 | notified = true; | 2761 | notified = true; |
2727 | mmu_notifier_invalidate_range_start(mm, | 2762 | |
2728 | mmu_start, | 2763 | mmu_notifier_range_init(&range, |
2729 | migrate->end); | 2764 | migrate->vma->vm_mm, |
2765 | addr, migrate->end); | ||
2766 | mmu_notifier_invalidate_range_start(&range); | ||
2730 | } | 2767 | } |
2731 | migrate_vma_insert_page(migrate, addr, newpage, | 2768 | migrate_vma_insert_page(migrate, addr, newpage, |
2732 | &migrate->src[i], | 2769 | &migrate->src[i], |
@@ -2767,8 +2804,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2767 | * did already call it. | 2804 | * did already call it. |
2768 | */ | 2805 | */ |
2769 | if (notified) | 2806 | if (notified) |
2770 | mmu_notifier_invalidate_range_only_end(mm, mmu_start, | 2807 | mmu_notifier_invalidate_range_only_end(&range); |
2771 | migrate->end); | ||
2772 | } | 2808 | } |
2773 | 2809 | ||
2774 | /* | 2810 | /* |
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a530789b..33917105a3a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void) | |||
146 | s32 batch = max_t(s32, nr*2, 32); | 146 | s32 batch = max_t(s32, nr*2, 32); |
147 | 147 | ||
148 | /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ | 148 | /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ |
149 | memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); | 149 | memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); |
150 | 150 | ||
151 | vm_committed_as_batch = max_t(s32, memsized_batch, batch); | 151 | vm_committed_as_batch = max_t(s32, memsized_batch, batch); |
152 | } | 152 | } |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2973,16 +2973,6 @@ out: | |||
2973 | return ret; | 2973 | return ret; |
2974 | } | 2974 | } |
2975 | 2975 | ||
2976 | static inline void verify_mm_writelocked(struct mm_struct *mm) | ||
2977 | { | ||
2978 | #ifdef CONFIG_DEBUG_VM | ||
2979 | if (unlikely(down_read_trylock(&mm->mmap_sem))) { | ||
2980 | WARN_ON(1); | ||
2981 | up_read(&mm->mmap_sem); | ||
2982 | } | ||
2983 | #endif | ||
2984 | } | ||
2985 | |||
2986 | /* | 2976 | /* |
2987 | * this is really a simplified "do_mmap". it only handles | 2977 | * this is really a simplified "do_mmap". it only handles |
2988 | * anonymous maps. eventually we may be able to do some | 2978 | * anonymous maps. eventually we may be able to do some |
@@ -3010,12 +3000,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla | |||
3010 | return error; | 3000 | return error; |
3011 | 3001 | ||
3012 | /* | 3002 | /* |
3013 | * mm->mmap_sem is required to protect against another thread | ||
3014 | * changing the mappings in case we sleep. | ||
3015 | */ | ||
3016 | verify_mm_writelocked(mm); | ||
3017 | |||
3018 | /* | ||
3019 | * Clear old maps. this also does some error checking for us | 3003 | * Clear old maps. this also does some error checking for us |
3020 | */ | 3004 | */ |
3021 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, | 3005 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5119ff846769..9c884abc7850 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu, | |||
35 | } | 35 | } |
36 | EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); | 36 | EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); |
37 | 37 | ||
38 | void mmu_notifier_synchronize(void) | ||
39 | { | ||
40 | /* Wait for any running method to finish. */ | ||
41 | srcu_barrier(&srcu); | ||
42 | } | ||
43 | EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); | ||
44 | |||
45 | /* | 38 | /* |
46 | * This function can't run concurrently against mmu_notifier_register | 39 | * This function can't run concurrently against mmu_notifier_register |
47 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 40 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -174,22 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
174 | srcu_read_unlock(&srcu, id); | 167 | srcu_read_unlock(&srcu, id); |
175 | } | 168 | } |
176 | 169 | ||
177 | int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 170 | int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) |
178 | unsigned long start, unsigned long end, | ||
179 | bool blockable) | ||
180 | { | 171 | { |
181 | struct mmu_notifier *mn; | 172 | struct mmu_notifier *mn; |
182 | int ret = 0; | 173 | int ret = 0; |
183 | int id; | 174 | int id; |
184 | 175 | ||
185 | id = srcu_read_lock(&srcu); | 176 | id = srcu_read_lock(&srcu); |
186 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { | 177 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { |
187 | if (mn->ops->invalidate_range_start) { | 178 | if (mn->ops->invalidate_range_start) { |
188 | int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); | 179 | int _ret = mn->ops->invalidate_range_start(mn, range); |
189 | if (_ret) { | 180 | if (_ret) { |
190 | pr_info("%pS callback failed with %d in %sblockable context.\n", | 181 | pr_info("%pS callback failed with %d in %sblockable context.\n", |
191 | mn->ops->invalidate_range_start, _ret, | 182 | mn->ops->invalidate_range_start, _ret, |
192 | !blockable ? "non-" : ""); | 183 | !range->blockable ? "non-" : ""); |
193 | ret = _ret; | 184 | ret = _ret; |
194 | } | 185 | } |
195 | } | 186 | } |
@@ -200,16 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
200 | } | 191 | } |
201 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | 192 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); |
202 | 193 | ||
203 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 194 | void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, |
204 | unsigned long start, | ||
205 | unsigned long end, | ||
206 | bool only_end) | 195 | bool only_end) |
207 | { | 196 | { |
208 | struct mmu_notifier *mn; | 197 | struct mmu_notifier *mn; |
209 | int id; | 198 | int id; |
210 | 199 | ||
211 | id = srcu_read_lock(&srcu); | 200 | id = srcu_read_lock(&srcu); |
212 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { | 201 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { |
213 | /* | 202 | /* |
214 | * Call invalidate_range here too to avoid the need for the | 203 | * Call invalidate_range here too to avoid the need for the |
215 | * subsystem of having to register an invalidate_range_end | 204 | * subsystem of having to register an invalidate_range_end |
@@ -224,9 +213,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
224 | * already happen under page table lock. | 213 | * already happen under page table lock. |
225 | */ | 214 | */ |
226 | if (!only_end && mn->ops->invalidate_range) | 215 | if (!only_end && mn->ops->invalidate_range) |
227 | mn->ops->invalidate_range(mn, mm, start, end); | 216 | mn->ops->invalidate_range(mn, range->mm, |
217 | range->start, | ||
218 | range->end); | ||
228 | if (mn->ops->invalidate_range_end) | 219 | if (mn->ops->invalidate_range_end) |
229 | mn->ops->invalidate_range_end(mn, mm, start, end); | 220 | mn->ops->invalidate_range_end(mn, range); |
230 | } | 221 | } |
231 | srcu_read_unlock(&srcu, id); | 222 | srcu_read_unlock(&srcu, id); |
232 | } | 223 | } |
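Note: on the other side of this API, invalidate_range_start/end implementations now receive the range object rather than (mm, start, end). A hedged sketch of an adapted callback; example_flush_secondary_tlb() is a placeholder, and the exact prototype (including the const qualifier) should be taken from include/linux/mmu_notifier.h in this series:

    static int example_invalidate_range_start(struct mmu_notifier *mn,
                                              const struct mmu_notifier_range *range)
    {
            /* Mirrors the fields the core code above passes along. */
            if (!range->blockable)
                    return -EAGAIN;         /* cannot sleep here; ask the caller to retry */

            example_flush_secondary_tlb(range->mm, range->start, range->end);
            return 0;
    }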
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d331620b9e5..36cb358db170 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
167 | pgprot_t newprot, int dirty_accountable, int prot_numa) | 167 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
168 | { | 168 | { |
169 | pmd_t *pmd; | 169 | pmd_t *pmd; |
170 | struct mm_struct *mm = vma->vm_mm; | ||
171 | unsigned long next; | 170 | unsigned long next; |
172 | unsigned long pages = 0; | 171 | unsigned long pages = 0; |
173 | unsigned long nr_huge_updates = 0; | 172 | unsigned long nr_huge_updates = 0; |
174 | unsigned long mni_start = 0; | 173 | struct mmu_notifier_range range; |
174 | |||
175 | range.start = 0; | ||
175 | 176 | ||
176 | pmd = pmd_offset(pud, addr); | 177 | pmd = pmd_offset(pud, addr); |
177 | do { | 178 | do { |
@@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
183 | goto next; | 184 | goto next; |
184 | 185 | ||
185 | /* invoke the mmu notifier if the pmd is populated */ | 186 | /* invoke the mmu notifier if the pmd is populated */ |
186 | if (!mni_start) { | 187 | if (!range.start) { |
187 | mni_start = addr; | 188 | mmu_notifier_range_init(&range, vma->vm_mm, addr, end); |
188 | mmu_notifier_invalidate_range_start(mm, mni_start, end); | 189 | mmu_notifier_invalidate_range_start(&range); |
189 | } | 190 | } |
190 | 191 | ||
191 | if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { | 192 | if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { |
@@ -214,8 +215,8 @@ next: | |||
214 | cond_resched(); | 215 | cond_resched(); |
215 | } while (pmd++, addr = next, addr != end); | 216 | } while (pmd++, addr = next, addr != end); |
216 | 217 | ||
217 | if (mni_start) | 218 | if (range.start) |
218 | mmu_notifier_invalidate_range_end(mm, mni_start, end); | 219 | mmu_notifier_invalidate_range_end(&range); |
219 | 220 | ||
220 | if (nr_huge_updates) | 221 | if (nr_huge_updates) |
221 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); | 222 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); |
diff --git a/mm/mremap.c b/mm/mremap.c
index 7f9f9180e401..def01d86e36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
197 | bool need_rmap_locks) | 197 | bool need_rmap_locks) |
198 | { | 198 | { |
199 | unsigned long extent, next, old_end; | 199 | unsigned long extent, next, old_end; |
200 | struct mmu_notifier_range range; | ||
200 | pmd_t *old_pmd, *new_pmd; | 201 | pmd_t *old_pmd, *new_pmd; |
201 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
202 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
203 | 202 | ||
204 | old_end = old_addr + len; | 203 | old_end = old_addr + len; |
205 | flush_cache_range(vma, old_addr, old_end); | 204 | flush_cache_range(vma, old_addr, old_end); |
206 | 205 | ||
207 | mmun_start = old_addr; | 206 | mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); |
208 | mmun_end = old_end; | 207 | mmu_notifier_invalidate_range_start(&range); |
209 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | ||
210 | 208 | ||
211 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 209 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
212 | cond_resched(); | 210 | cond_resched(); |
@@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
247 | new_pmd, new_addr, need_rmap_locks); | 245 | new_pmd, new_addr, need_rmap_locks); |
248 | } | 246 | } |
249 | 247 | ||
250 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 248 | mmu_notifier_invalidate_range_end(&range); |
251 | 249 | ||
252 | return len + old_addr - old_end; /* how much done */ | 250 | return len + old_addr - old_end; /* how much done */ |
253 | } | 251 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6589f60d5018..f0e8cd9edb1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
245 | return points > 0 ? points : 1; | 245 | return points > 0 ? points : 1; |
246 | } | 246 | } |
247 | 247 | ||
248 | enum oom_constraint { | 248 | static const char * const oom_constraint_text[] = { |
249 | CONSTRAINT_NONE, | 249 | [CONSTRAINT_NONE] = "CONSTRAINT_NONE", |
250 | CONSTRAINT_CPUSET, | 250 | [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", |
251 | CONSTRAINT_MEMORY_POLICY, | 251 | [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", |
252 | CONSTRAINT_MEMCG, | 252 | [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", |
253 | }; | 253 | }; |
254 | 254 | ||
255 | /* | 255 | /* |
@@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) | |||
269 | } | 269 | } |
270 | 270 | ||
271 | /* Default to all available memory */ | 271 | /* Default to all available memory */ |
272 | oc->totalpages = totalram_pages + total_swap_pages; | 272 | oc->totalpages = totalram_pages() + total_swap_pages; |
273 | 273 | ||
274 | if (!IS_ENABLED(CONFIG_NUMA)) | 274 | if (!IS_ENABLED(CONFIG_NUMA)) |
275 | return CONSTRAINT_NONE; | 275 | return CONSTRAINT_NONE; |
@@ -428,19 +428,29 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
428 | rcu_read_unlock(); | 428 | rcu_read_unlock(); |
429 | } | 429 | } |
430 | 430 | ||
431 | static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) | ||
432 | { | ||
433 | /* one line summary of the oom killer context. */ | ||
434 | pr_info("oom-kill:constraint=%s,nodemask=%*pbl", | ||
435 | oom_constraint_text[oc->constraint], | ||
436 | nodemask_pr_args(oc->nodemask)); | ||
437 | cpuset_print_current_mems_allowed(); | ||
438 | mem_cgroup_print_oom_context(oc->memcg, victim); | ||
439 | pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, | ||
440 | from_kuid(&init_user_ns, task_uid(victim))); | ||
441 | } | ||
442 | |||
431 | static void dump_header(struct oom_control *oc, struct task_struct *p) | 443 | static void dump_header(struct oom_control *oc, struct task_struct *p) |
432 | { | 444 | { |
433 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", | 445 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", |
434 | current->comm, oc->gfp_mask, &oc->gfp_mask, | 446 | current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, |
435 | nodemask_pr_args(oc->nodemask), oc->order, | ||
436 | current->signal->oom_score_adj); | 447 | current->signal->oom_score_adj); |
437 | if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) | 448 | if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) |
438 | pr_warn("COMPACTION is disabled!!!\n"); | 449 | pr_warn("COMPACTION is disabled!!!\n"); |
439 | 450 | ||
440 | cpuset_print_current_mems_allowed(); | ||
441 | dump_stack(); | 451 | dump_stack(); |
442 | if (is_memcg_oom(oc)) | 452 | if (is_memcg_oom(oc)) |
443 | mem_cgroup_print_oom_info(oc->memcg, p); | 453 | mem_cgroup_print_oom_meminfo(oc->memcg); |
444 | else { | 454 | else { |
445 | show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); | 455 | show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); |
446 | if (is_dump_unreclaim_slabs()) | 456 | if (is_dump_unreclaim_slabs()) |
@@ -448,6 +458,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) | |||
448 | } | 458 | } |
449 | if (sysctl_oom_dump_tasks) | 459 | if (sysctl_oom_dump_tasks) |
450 | dump_tasks(oc->memcg, oc->nodemask); | 460 | dump_tasks(oc->memcg, oc->nodemask); |
461 | if (p) | ||
462 | dump_oom_summary(oc, p); | ||
451 | } | 463 | } |
452 | 464 | ||
453 | /* | 465 | /* |
@@ -516,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm) | |||
516 | * count elevated without a good reason. | 528 | * count elevated without a good reason. |
517 | */ | 529 | */ |
518 | if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { | 530 | if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { |
519 | const unsigned long start = vma->vm_start; | 531 | struct mmu_notifier_range range; |
520 | const unsigned long end = vma->vm_end; | ||
521 | struct mmu_gather tlb; | 532 | struct mmu_gather tlb; |
522 | 533 | ||
523 | tlb_gather_mmu(&tlb, mm, start, end); | 534 | mmu_notifier_range_init(&range, mm, vma->vm_start, |
524 | if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { | 535 | vma->vm_end); |
525 | tlb_finish_mmu(&tlb, start, end); | 536 | tlb_gather_mmu(&tlb, mm, range.start, range.end); |
537 | if (mmu_notifier_invalidate_range_start_nonblock(&range)) { | ||
538 | tlb_finish_mmu(&tlb, range.start, range.end); | ||
526 | ret = false; | 539 | ret = false; |
527 | continue; | 540 | continue; |
528 | } | 541 | } |
529 | unmap_page_range(&tlb, vma, start, end, NULL); | 542 | unmap_page_range(&tlb, vma, range.start, range.end, NULL); |
530 | mmu_notifier_invalidate_range_end(mm, start, end); | 543 | mmu_notifier_invalidate_range_end(&range); |
531 | tlb_finish_mmu(&tlb, start, end); | 544 | tlb_finish_mmu(&tlb, range.start, range.end); |
532 | } | 545 | } |
533 | } | 546 | } |
534 | 547 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f690bae6b78..7d1010453fb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2154,6 +2154,7 @@ int write_cache_pages(struct address_space *mapping, | |||
2154 | { | 2154 | { |
2155 | int ret = 0; | 2155 | int ret = 0; |
2156 | int done = 0; | 2156 | int done = 0; |
2157 | int error; | ||
2157 | struct pagevec pvec; | 2158 | struct pagevec pvec; |
2158 | int nr_pages; | 2159 | int nr_pages; |
2159 | pgoff_t uninitialized_var(writeback_index); | 2160 | pgoff_t uninitialized_var(writeback_index); |
@@ -2227,25 +2228,31 @@ continue_unlock: | |||
2227 | goto continue_unlock; | 2228 | goto continue_unlock; |
2228 | 2229 | ||
2229 | trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); | 2230 | trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); |
2230 | ret = (*writepage)(page, wbc, data); | 2231 | error = (*writepage)(page, wbc, data); |
2231 | if (unlikely(ret)) { | 2232 | if (unlikely(error)) { |
2232 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 2233 | /* |
2234 | * Handle errors according to the type of | ||
2235 | * writeback. There's no need to continue for | ||
2236 | * background writeback. Just push done_index | ||
2237 | * past this page so media errors won't choke | ||
2238 | * writeout for the entire file. For integrity | ||
2239 | * writeback, we must process the entire dirty | ||
2240 | * set regardless of errors because the fs may | ||
2241 | * still have state to clear for each page. In | ||
2242 | * that case we continue processing and return | ||
2243 | * the first error. | ||
2244 | */ | ||
2245 | if (error == AOP_WRITEPAGE_ACTIVATE) { | ||
2233 | unlock_page(page); | 2246 | unlock_page(page); |
2234 | ret = 0; | 2247 | error = 0; |
2235 | } else { | 2248 | } else if (wbc->sync_mode != WB_SYNC_ALL) { |
2236 | /* | 2249 | ret = error; |
2237 | * done_index is set past this page, | ||
2238 | * so media errors will not choke | ||
2239 | * background writeout for the entire | ||
2240 | * file. This has consequences for | ||
2241 | * range_cyclic semantics (ie. it may | ||
2242 | * not be suitable for data integrity | ||
2243 | * writeout). | ||
2244 | */ | ||
2245 | done_index = page->index + 1; | 2250 | done_index = page->index + 1; |
2246 | done = 1; | 2251 | done = 1; |
2247 | break; | 2252 | break; |
2248 | } | 2253 | } |
2254 | if (!ret) | ||
2255 | ret = error; | ||
2249 | } | 2256 | } |
2250 | 2257 | ||
2251 | /* | 2258 | /* |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@ | |||
16 | 16 | ||
17 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/highmem.h> | ||
19 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
20 | #include <linux/interrupt.h> | 21 | #include <linux/interrupt.h> |
21 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES]; | |||
96 | #endif | 97 | #endif |
97 | 98 | ||
98 | /* work_structs for global per-cpu drains */ | 99 | /* work_structs for global per-cpu drains */ |
100 | struct pcpu_drain { | ||
101 | struct zone *zone; | ||
102 | struct work_struct work; | ||
103 | }; | ||
99 | DEFINE_MUTEX(pcpu_drain_mutex); | 104 | DEFINE_MUTEX(pcpu_drain_mutex); |
100 | DEFINE_PER_CPU(struct work_struct, pcpu_drain); | 105 | DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); |
101 | 106 | ||
102 | #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY | 107 | #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY |
103 | volatile unsigned long latent_entropy __latent_entropy; | 108 | volatile unsigned long latent_entropy __latent_entropy; |
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
121 | }; | 126 | }; |
122 | EXPORT_SYMBOL(node_states); | 127 | EXPORT_SYMBOL(node_states); |
123 | 128 | ||
124 | /* Protect totalram_pages and zone->managed_pages */ | 129 | atomic_long_t _totalram_pages __read_mostly; |
125 | static DEFINE_SPINLOCK(managed_page_count_lock); | 130 | EXPORT_SYMBOL(_totalram_pages); |
126 | |||
127 | unsigned long totalram_pages __read_mostly; | ||
128 | unsigned long totalreserve_pages __read_mostly; | 131 | unsigned long totalreserve_pages __read_mostly; |
129 | unsigned long totalcma_pages __read_mostly; | 132 | unsigned long totalcma_pages __read_mostly; |
130 | 133 | ||
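Note: totalram_pages is no longer a plain unsigned long; readers such as mm_compute_batch() and constrained_alloc() above now go through an accessor over the atomic_long_t. A sketch of the accessor shape this series adds in include/linux/mm.h (shown for orientation, not copied verbatim):

    static inline unsigned long totalram_pages(void)
    {
            return (unsigned long)atomic_long_read(&_totalram_pages);
    }

    static inline void totalram_pages_add(long count)
    {
            atomic_long_add(count, &_totalram_pages);
    }

    /* e.g. in oom_kill.c below: oc->totalpages = totalram_pages() + total_swap_pages; */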
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
237 | #endif | 240 | #endif |
238 | }; | 241 | }; |
239 | 242 | ||
240 | char * const migratetype_names[MIGRATE_TYPES] = { | 243 | const char * const migratetype_names[MIGRATE_TYPES] = { |
241 | "Unmovable", | 244 | "Unmovable", |
242 | "Movable", | 245 | "Movable", |
243 | "Reclaimable", | 246 | "Reclaimable", |
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = { | |||
263 | 266 | ||
264 | int min_free_kbytes = 1024; | 267 | int min_free_kbytes = 1024; |
265 | int user_min_free_kbytes = -1; | 268 | int user_min_free_kbytes = -1; |
269 | int watermark_boost_factor __read_mostly = 15000; | ||
266 | int watermark_scale_factor = 10; | 270 | int watermark_scale_factor = 10; |
267 | 271 | ||
268 | static unsigned long nr_kernel_pages __meminitdata; | 272 | static unsigned long nr_kernel_pages __initdata; |
269 | static unsigned long nr_all_pages __meminitdata; | 273 | static unsigned long nr_all_pages __initdata; |
270 | static unsigned long dma_reserve __meminitdata; | 274 | static unsigned long dma_reserve __initdata; |
271 | 275 | ||
272 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 276 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
273 | static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; | 277 | static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; |
274 | static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; | 278 | static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; |
275 | static unsigned long required_kernelcore __initdata; | 279 | static unsigned long required_kernelcore __initdata; |
276 | static unsigned long required_kernelcore_percent __initdata; | 280 | static unsigned long required_kernelcore_percent __initdata; |
277 | static unsigned long required_movablecore __initdata; | 281 | static unsigned long required_movablecore __initdata; |
278 | static unsigned long required_movablecore_percent __initdata; | 282 | static unsigned long required_movablecore_percent __initdata; |
279 | static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; | 283 | static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; |
280 | static bool mirrored_kernelcore __meminitdata; | 284 | static bool mirrored_kernelcore __meminitdata; |
281 | 285 | ||
282 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 286 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
294 | int page_group_by_mobility_disabled __read_mostly; | 298 | int page_group_by_mobility_disabled __read_mostly; |
295 | 299 | ||
296 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 300 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
301 | /* | ||
302 | * During boot we initialize deferred pages on-demand, as needed, but once | ||
303 | * page_alloc_init_late() has finished, the deferred pages are all initialized, | ||
304 | * and we can permanently disable that path. | ||
305 | */ | ||
306 | static DEFINE_STATIC_KEY_TRUE(deferred_pages); | ||
307 | |||
308 | /* | ||
309 | * Calling kasan_free_pages() only after deferred memory initialization | ||
310 | * has completed. Poisoning pages during deferred memory init will greatly | ||
311 | * lengthen the process and cause problem in large memory systems as the | ||
312 | * deferred pages initialization is done with interrupt disabled. | ||
313 | * | ||
314 | * Assuming that there will be no reference to those newly initialized | ||
315 | * pages before they are ever allocated, this should have no effect on | ||
316 | * KASAN memory tracking as the poison will be properly inserted at page | ||
317 | * allocation time. The only corner case is when pages are allocated by | ||
318 | * on-demand allocation and then freed again before the deferred pages | ||
319 | * initialization is done, but this is not likely to happen. | ||
320 | */ | ||
321 | static inline void kasan_free_nondeferred_pages(struct page *page, int order) | ||
322 | { | ||
323 | if (!static_branch_unlikely(&deferred_pages)) | ||
324 | kasan_free_pages(page, order); | ||
325 | } | ||
326 | |||
297 | /* Returns true if the struct page for the pfn is uninitialised */ | 327 | /* Returns true if the struct page for the pfn is uninitialised */ |
298 | static inline bool __meminit early_page_uninitialised(unsigned long pfn) | 328 | static inline bool __meminit early_page_uninitialised(unsigned long pfn) |
299 | { | 329 | { |
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) | |||
326 | /* Always populate low zones for address-constrained allocations */ | 356 | /* Always populate low zones for address-constrained allocations */ |
327 | if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) | 357 | if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
328 | return false; | 358 | return false; |
359 | |||
360 | /* | ||
361 | * We start only with one section of pages, more pages are added as | ||
362 | * needed until the rest of deferred pages are initialized. | ||
363 | */ | ||
329 | nr_initialised++; | 364 | nr_initialised++; |
330 | if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && | 365 | if ((nr_initialised > PAGES_PER_SECTION) && |
331 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | 366 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
332 | NODE_DATA(nid)->first_deferred_pfn = pfn; | 367 | NODE_DATA(nid)->first_deferred_pfn = pfn; |
333 | return true; | 368 | return true; |
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) | |||
335 | return false; | 370 | return false; |
336 | } | 371 | } |
337 | #else | 372 | #else |
373 | #define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) | ||
374 | |||
338 | static inline bool early_page_uninitialised(unsigned long pfn) | 375 | static inline bool early_page_uninitialised(unsigned long pfn) |
339 | { | 376 | { |
340 | return false; | 377 | return false; |
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, | |||
426 | unsigned long old_word, word; | 463 | unsigned long old_word, word; |
427 | 464 | ||
428 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); | 465 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
466 | BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); | ||
429 | 467 | ||
430 | bitmap = get_pageblock_bitmap(page, pfn); | 468 | bitmap = get_pageblock_bitmap(page, pfn); |
431 | bitidx = pfn_to_bitidx(page, pfn); | 469 | bitidx = pfn_to_bitidx(page, pfn); |
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1037 | arch_free_page(page, order); | 1075 | arch_free_page(page, order); |
1038 | kernel_poison_pages(page, 1 << order, 0); | 1076 | kernel_poison_pages(page, 1 << order, 0); |
1039 | kernel_map_pages(page, 1 << order, 0); | 1077 | kernel_map_pages(page, 1 << order, 0); |
1040 | kasan_free_pages(page, order); | 1078 | kasan_free_nondeferred_pages(page, order); |
1041 | 1079 | ||
1042 | return true; | 1080 | return true; |
1043 | } | 1081 | } |
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, | |||
1183 | init_page_count(page); | 1221 | init_page_count(page); |
1184 | page_mapcount_reset(page); | 1222 | page_mapcount_reset(page); |
1185 | page_cpupid_reset_last(page); | 1223 | page_cpupid_reset_last(page); |
1224 | page_kasan_tag_reset(page); | ||
1186 | 1225 | ||
1187 | INIT_LIST_HEAD(&page->lru); | 1226 | INIT_LIST_HEAD(&page->lru); |
1188 | #ifdef WANT_PAGE_VIRTUAL | 1227 | #ifdef WANT_PAGE_VIRTUAL |
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) | |||
1279 | __ClearPageReserved(p); | 1318 | __ClearPageReserved(p); |
1280 | set_page_count(p, 0); | 1319 | set_page_count(p, 0); |
1281 | 1320 | ||
1282 | page_zone(page)->managed_pages += nr_pages; | 1321 | atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
1283 | set_page_refcounted(page); | 1322 | set_page_refcounted(page); |
1284 | __free_pages(page, order); | 1323 | __free_pages(page, order); |
1285 | } | 1324 | } |
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data) | |||
1606 | } | 1645 | } |
1607 | 1646 | ||
1608 | /* | 1647 | /* |
1609 | * During boot we initialize deferred pages on-demand, as needed, but once | ||
1610 | * page_alloc_init_late() has finished, the deferred pages are all initialized, | ||
1611 | * and we can permanently disable that path. | ||
1612 | */ | ||
1613 | static DEFINE_STATIC_KEY_TRUE(deferred_pages); | ||
1614 | |||
1615 | /* | ||
1616 | * If this zone has deferred pages, try to grow it by initializing enough | 1648 | * If this zone has deferred pages, try to grow it by initializing enough |
1617 | * deferred pages to satisfy the allocation specified by order, rounded up to | 1649 | * deferred pages to satisfy the allocation specified by order, rounded up to |
1618 | * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments | 1650 | * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments |
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
1981 | */ | 2013 | */ |
1982 | static int fallbacks[MIGRATE_TYPES][4] = { | 2014 | static int fallbacks[MIGRATE_TYPES][4] = { |
1983 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, | 2015 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
1984 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, | ||
1985 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, | 2016 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, |
2017 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, | ||
1986 | #ifdef CONFIG_CMA | 2018 | #ifdef CONFIG_CMA |
1987 | [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ | 2019 | [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ |
1988 | #endif | 2020 | #endif |
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt) | |||
2129 | return false; | 2161 | return false; |
2130 | } | 2162 | } |
2131 | 2163 | ||
2164 | static inline void boost_watermark(struct zone *zone) | ||
2165 | { | ||
2166 | unsigned long max_boost; | ||
2167 | |||
2168 | if (!watermark_boost_factor) | ||
2169 | return; | ||
2170 | |||
2171 | max_boost = mult_frac(zone->_watermark[WMARK_HIGH], | ||
2172 | watermark_boost_factor, 10000); | ||
2173 | max_boost = max(pageblock_nr_pages, max_boost); | ||
2174 | |||
2175 | zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, | ||
2176 | max_boost); | ||
2177 | } | ||
2178 | |||
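Note: boost_watermark() grows the boost by one pageblock per fallback event but clamps it; with the default watermark_boost_factor of 15000 set above, mult_frac(high, 15000, 10000) caps the boost at 150% of the high watermark, never below one pageblock. A worked sketch with illustrative numbers only:

    /* Illustrative numbers: WMARK_HIGH = 10000 pages,
     * pageblock_nr_pages = 512, watermark_boost_factor = 15000. */
    static unsigned long boost_once(unsigned long boost)
    {
            unsigned long max_boost = max(512UL, 10000UL * 15000 / 10000); /* 15000 */

            return min(boost + 512, max_boost);
    }
    /* boost_once(0) == 512, boost_once(512) == 1024, ...; after roughly
     * 30 fallbacks the boost saturates at 15000 pages. */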
2132 | /* | 2179 | /* |
2133 | * This function implements actual steal behaviour. If order is large enough, | 2180 | * This function implements actual steal behaviour. If order is large enough, |
2134 | * we can steal whole pageblock. If not, we first move freepages in this | 2181 | * we can steal whole pageblock. If not, we first move freepages in this |
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) | |||
2138 | * itself, so pages freed in the future will be put on the correct free list. | 2185 | * itself, so pages freed in the future will be put on the correct free list. |
2139 | */ | 2186 | */ |
2140 | static void steal_suitable_fallback(struct zone *zone, struct page *page, | 2187 | static void steal_suitable_fallback(struct zone *zone, struct page *page, |
2141 | int start_type, bool whole_block) | 2188 | unsigned int alloc_flags, int start_type, bool whole_block) |
2142 | { | 2189 | { |
2143 | unsigned int current_order = page_order(page); | 2190 | unsigned int current_order = page_order(page); |
2144 | struct free_area *area; | 2191 | struct free_area *area; |
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, | |||
2160 | goto single_page; | 2207 | goto single_page; |
2161 | } | 2208 | } |
2162 | 2209 | ||
2210 | /* | ||
2211 | * Boost watermarks to increase reclaim pressure to reduce the | ||
2212 | * likelihood of future fallbacks. Wake kswapd now as the node | ||
2213 | * may be balanced overall and kswapd will not wake naturally. | ||
2214 | */ | ||
2215 | boost_watermark(zone); | ||
2216 | if (alloc_flags & ALLOC_KSWAPD) | ||
2217 | wakeup_kswapd(zone, 0, 0, zone_idx(zone)); | ||
2218 | |||
2163 | /* We are not allowed to try stealing from the whole block */ | 2219 | /* We are not allowed to try stealing from the whole block */ |
2164 | if (!whole_block) | 2220 | if (!whole_block) |
2165 | goto single_page; | 2221 | goto single_page; |
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, | |||
2258 | * Limit the number reserved to 1 pageblock or roughly 1% of a zone. | 2314 | * Limit the number reserved to 1 pageblock or roughly 1% of a zone. |
2259 | * Check is race-prone but harmless. | 2315 | * Check is race-prone but harmless. |
2260 | */ | 2316 | */ |
2261 | max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; | 2317 | max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; |
2262 | if (zone->nr_reserved_highatomic >= max_managed) | 2318 | if (zone->nr_reserved_highatomic >= max_managed) |
2263 | return; | 2319 | return; |
2264 | 2320 | ||
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
2375 | * condition simpler. | 2431 | * condition simpler. |
2376 | */ | 2432 | */ |
2377 | static __always_inline bool | 2433 | static __always_inline bool |
2378 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 2434 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
2435 | unsigned int alloc_flags) | ||
2379 | { | 2436 | { |
2380 | struct free_area *area; | 2437 | struct free_area *area; |
2381 | int current_order; | 2438 | int current_order; |
2439 | int min_order = order; | ||
2382 | struct page *page; | 2440 | struct page *page; |
2383 | int fallback_mt; | 2441 | int fallback_mt; |
2384 | bool can_steal; | 2442 | bool can_steal; |
2385 | 2443 | ||
2386 | /* | 2444 | /* |
2445 | * Do not steal pages from freelists belonging to other pageblocks | ||
2446 | * i.e. orders < pageblock_order. If there are no local zones free, | ||
2447 | * the zonelists will be reiterated without ALLOC_NOFRAGMENT. | ||
2448 | */ | ||
2449 | if (alloc_flags & ALLOC_NOFRAGMENT) | ||
2450 | min_order = pageblock_order; | ||
2451 | |||
2452 | /* | ||
2387 | * Find the largest available free page in the other list. This roughly | 2453 | * Find the largest available free page in the other list. This roughly |
2388 | * approximates finding the pageblock with the most free pages, which | 2454 | * approximates finding the pageblock with the most free pages, which |
2389 | * would be too costly to do exactly. | 2455 | * would be too costly to do exactly. |
2390 | */ | 2456 | */ |
2391 | for (current_order = MAX_ORDER - 1; current_order >= order; | 2457 | for (current_order = MAX_ORDER - 1; current_order >= min_order; |
2392 | --current_order) { | 2458 | --current_order) { |
2393 | area = &(zone->free_area[current_order]); | 2459 | area = &(zone->free_area[current_order]); |
2394 | fallback_mt = find_suitable_fallback(area, current_order, | 2460 | fallback_mt = find_suitable_fallback(area, current_order, |
@@ -2433,7 +2499,8 @@ do_steal: | |||
2433 | page = list_first_entry(&area->free_list[fallback_mt], | 2499 | page = list_first_entry(&area->free_list[fallback_mt], |
2434 | struct page, lru); | 2500 | struct page, lru); |
2435 | 2501 | ||
2436 | steal_suitable_fallback(zone, page, start_migratetype, can_steal); | 2502 | steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, |
2503 | can_steal); | ||
2437 | 2504 | ||
2438 | trace_mm_page_alloc_extfrag(page, order, current_order, | 2505 | trace_mm_page_alloc_extfrag(page, order, current_order, |
2439 | start_migratetype, fallback_mt); | 2506 | start_migratetype, fallback_mt); |
@@ -2447,7 +2514,8 @@ do_steal: | |||
2447 | * Call me with the zone->lock already held. | 2514 | * Call me with the zone->lock already held. |
2448 | */ | 2515 | */ |
2449 | static __always_inline struct page * | 2516 | static __always_inline struct page * |
2450 | __rmqueue(struct zone *zone, unsigned int order, int migratetype) | 2517 | __rmqueue(struct zone *zone, unsigned int order, int migratetype, |
2518 | unsigned int alloc_flags) | ||
2451 | { | 2519 | { |
2452 | struct page *page; | 2520 | struct page *page; |
2453 | 2521 | ||
@@ -2457,7 +2525,8 @@ retry: | |||
2457 | if (migratetype == MIGRATE_MOVABLE) | 2525 | if (migratetype == MIGRATE_MOVABLE) |
2458 | page = __rmqueue_cma_fallback(zone, order); | 2526 | page = __rmqueue_cma_fallback(zone, order); |
2459 | 2527 | ||
2460 | if (!page && __rmqueue_fallback(zone, order, migratetype)) | 2528 | if (!page && __rmqueue_fallback(zone, order, migratetype, |
2529 | alloc_flags)) | ||
2461 | goto retry; | 2530 | goto retry; |
2462 | } | 2531 | } |
2463 | 2532 | ||
@@ -2472,13 +2541,14 @@ retry: | |||
2472 | */ | 2541 | */ |
2473 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 2542 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
2474 | unsigned long count, struct list_head *list, | 2543 | unsigned long count, struct list_head *list, |
2475 | int migratetype) | 2544 | int migratetype, unsigned int alloc_flags) |
2476 | { | 2545 | { |
2477 | int i, alloced = 0; | 2546 | int i, alloced = 0; |
2478 | 2547 | ||
2479 | spin_lock(&zone->lock); | 2548 | spin_lock(&zone->lock); |
2480 | for (i = 0; i < count; ++i) { | 2549 | for (i = 0; i < count; ++i) { |
2481 | struct page *page = __rmqueue(zone, order, migratetype); | 2550 | struct page *page = __rmqueue(zone, order, migratetype, |
2551 | alloc_flags); | ||
2482 | if (unlikely(page == NULL)) | 2552 | if (unlikely(page == NULL)) |
2483 | break; | 2553 | break; |
2484 | 2554 | ||
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone) | |||
2592 | 2662 | ||
2593 | static void drain_local_pages_wq(struct work_struct *work) | 2663 | static void drain_local_pages_wq(struct work_struct *work) |
2594 | { | 2664 | { |
2665 | struct pcpu_drain *drain; | ||
2666 | |||
2667 | drain = container_of(work, struct pcpu_drain, work); | ||
2668 | |||
2595 | /* | 2669 | /* |
2596 | * drain_all_pages doesn't use proper cpu hotplug protection so | 2670 | * drain_all_pages doesn't use proper cpu hotplug protection so |
2597 | * we can race with cpu offline when the WQ can move this from | 2671 | * we can race with cpu offline when the WQ can move this from |
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work) | |||
2600 | * a different one. | 2674 | * a different one. |
2601 | */ | 2675 | */ |
2602 | preempt_disable(); | 2676 | preempt_disable(); |
2603 | drain_local_pages(NULL); | 2677 | drain_local_pages(drain->zone); |
2604 | preempt_enable(); | 2678 | preempt_enable(); |
2605 | } | 2679 | } |
2606 | 2680 | ||
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone) | |||
2671 | } | 2745 | } |
2672 | 2746 | ||
2673 | for_each_cpu(cpu, &cpus_with_pcps) { | 2747 | for_each_cpu(cpu, &cpus_with_pcps) { |
2674 | struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); | 2748 | struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); |
2675 | INIT_WORK(work, drain_local_pages_wq); | 2749 | |
2676 | queue_work_on(cpu, mm_percpu_wq, work); | 2750 | drain->zone = zone; |
2751 | INIT_WORK(&drain->work, drain_local_pages_wq); | ||
2752 | queue_work_on(cpu, mm_percpu_wq, &drain->work); | ||
2677 | } | 2753 | } |
2678 | for_each_cpu(cpu, &cpus_with_pcps) | 2754 | for_each_cpu(cpu, &cpus_with_pcps) |
2679 | flush_work(per_cpu_ptr(&pcpu_drain, cpu)); | 2755 | flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); |
2680 | 2756 | ||
2681 | mutex_unlock(&pcpu_drain_mutex); | 2757 | mutex_unlock(&pcpu_drain_mutex); |
2682 | } | 2758 | } |
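
The two hunks above replace the bare per-cpu work_struct with a pcpu_drain wrapper that also carries the target zone, and the work handler recovers the wrapper via container_of(). A minimal user-space sketch of that embed-and-recover pattern; the struct layouts and names below are stand-ins for illustration, not the kernel definitions:

    #include <stddef.h>
    #include <stdio.h>

    struct zone { const char *name; };

    /* Simplified stand-in for the kernel's work item. */
    struct work_struct { void (*func)(struct work_struct *); };

    struct pcpu_drain {
            struct zone *zone;        /* extra context carried with the work item */
            struct work_struct work;  /* embedded member handed to the "workqueue" */
    };

    /* container_of: recover the wrapper from a pointer to its embedded member. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static void drain_handler(struct work_struct *work)
    {
            struct pcpu_drain *drain = container_of(work, struct pcpu_drain, work);

            printf("draining pcp lists for zone %s\n",
                   drain->zone ? drain->zone->name : "(all)");
    }

    int main(void)
    {
            struct zone normal = { "Normal" };
            struct pcpu_drain drain = { .zone = &normal, .work = { drain_handler } };

            /* A real workqueue would invoke this; call it directly for the demo. */
            drain.work.func(&drain.work);
            return 0;
    }
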
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
2934 | 3010 | ||
2935 | /* Remove page from the per-cpu list, caller must protect the list */ | 3011 | /* Remove page from the per-cpu list, caller must protect the list */ |
2936 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | 3012 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, |
3013 | unsigned int alloc_flags, | ||
2937 | struct per_cpu_pages *pcp, | 3014 | struct per_cpu_pages *pcp, |
2938 | struct list_head *list) | 3015 | struct list_head *list) |
2939 | { | 3016 | { |
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | |||
2943 | if (list_empty(list)) { | 3020 | if (list_empty(list)) { |
2944 | pcp->count += rmqueue_bulk(zone, 0, | 3021 | pcp->count += rmqueue_bulk(zone, 0, |
2945 | pcp->batch, list, | 3022 | pcp->batch, list, |
2946 | migratetype); | 3023 | migratetype, alloc_flags); |
2947 | if (unlikely(list_empty(list))) | 3024 | if (unlikely(list_empty(list))) |
2948 | return NULL; | 3025 | return NULL; |
2949 | } | 3026 | } |
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | |||
2959 | /* Lock and remove page from the per-cpu list */ | 3036 | /* Lock and remove page from the per-cpu list */ |
2960 | static struct page *rmqueue_pcplist(struct zone *preferred_zone, | 3037 | static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
2961 | struct zone *zone, unsigned int order, | 3038 | struct zone *zone, unsigned int order, |
2962 | gfp_t gfp_flags, int migratetype) | 3039 | gfp_t gfp_flags, int migratetype, |
3040 | unsigned int alloc_flags) | ||
2963 | { | 3041 | { |
2964 | struct per_cpu_pages *pcp; | 3042 | struct per_cpu_pages *pcp; |
2965 | struct list_head *list; | 3043 | struct list_head *list; |
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, | |||
2969 | local_irq_save(flags); | 3047 | local_irq_save(flags); |
2970 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 3048 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
2971 | list = &pcp->lists[migratetype]; | 3049 | list = &pcp->lists[migratetype]; |
2972 | page = __rmqueue_pcplist(zone, migratetype, pcp, list); | 3050 | page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); |
2973 | if (page) { | 3051 | if (page) { |
2974 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); | 3052 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
2975 | zone_statistics(preferred_zone, zone); | 3053 | zone_statistics(preferred_zone, zone); |
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone, | |||
2992 | 3070 | ||
2993 | if (likely(order == 0)) { | 3071 | if (likely(order == 0)) { |
2994 | page = rmqueue_pcplist(preferred_zone, zone, order, | 3072 | page = rmqueue_pcplist(preferred_zone, zone, order, |
2995 | gfp_flags, migratetype); | 3073 | gfp_flags, migratetype, alloc_flags); |
2996 | goto out; | 3074 | goto out; |
2997 | } | 3075 | } |
2998 | 3076 | ||
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone, | |||
3011 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | 3089 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
3012 | } | 3090 | } |
3013 | if (!page) | 3091 | if (!page) |
3014 | page = __rmqueue(zone, order, migratetype); | 3092 | page = __rmqueue(zone, order, migratetype, alloc_flags); |
3015 | } while (page && check_new_pages(page, order)); | 3093 | } while (page && check_new_pages(page, order)); |
3016 | spin_unlock(&zone->lock); | 3094 | spin_unlock(&zone->lock); |
3017 | if (!page) | 3095 | if (!page) |
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str) | |||
3053 | } | 3131 | } |
3054 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 3132 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
3055 | 3133 | ||
3056 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 3134 | static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
3057 | { | 3135 | { |
3058 | if (order < fail_page_alloc.min_order) | 3136 | if (order < fail_page_alloc.min_order) |
3059 | return false; | 3137 | return false; |
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs); | |||
3103 | 3181 | ||
3104 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 3182 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
3105 | 3183 | ||
3106 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 3184 | static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
3107 | { | 3185 | { |
3108 | return false; | 3186 | return false; |
3109 | } | 3187 | } |
3110 | 3188 | ||
3111 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 3189 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
3112 | 3190 | ||
3191 | static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
3192 | { | ||
3193 | return __should_fail_alloc_page(gfp_mask, order); | ||
3194 | } | ||
3195 | ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); | ||
3196 | |||
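
Splitting should_fail_alloc_page() into a noinline wrapper around __should_fail_alloc_page() gives the error-injection framework a stable, non-inlined function whose return value can be overridden, while the normal policy stays in the inner helper. As a loose analogy only (plain user-space C, invented names, none of the real ALLOW_ERROR_INJECTION machinery), the wrapper-as-hook idea looks like this:

    #include <stdbool.h>
    #include <stdio.h>

    /* The "real" policy check; the kernel's counterpart is __should_fail_alloc_page(). */
    static bool __should_fail(unsigned int order)
    {
            return false;             /* default policy: never fail */
    }

    /* A test harness can force the next N calls through the wrapper to fail. */
    static int force_fail_count;

    static bool should_fail(unsigned int order)
    {
            if (force_fail_count > 0) {
                    force_fail_count--;
                    return true;      /* injected failure */
            }
            return __should_fail(order);
    }

    int main(void)
    {
            force_fail_count = 1;
            printf("first attempt fails: %d\n", should_fail(0));   /* 1 */
            printf("second attempt fails: %d\n", should_fail(0));  /* 0 */
            return 0;
    }
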
3113 | /* | 3197 | /* |
3114 | * Return true if free base pages are above 'mark'. For high-order checks it | 3198 | * Return true if free base pages are above 'mark'. For high-order checks it |
3115 | * will return true if the order-0 watermark is reached and there is at least | 3199 | * will return true if the order-0 watermark is reached and there is at least |
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | |||
3254 | #endif /* CONFIG_NUMA */ | 3338 | #endif /* CONFIG_NUMA */ |
3255 | 3339 | ||
3256 | /* | 3340 | /* |
3341 | * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid | ||
3342 | * fragmentation is subtle. If the preferred zone was HIGHMEM then | ||
3343 | * premature use of a lower zone may cause lowmem pressure problems that | ||
3344 | * are worse than fragmentation. If the next zone is ZONE_DMA then it is | ||
3345 | * probably too small. It only makes sense to spread allocations to avoid | ||
3346 | * fragmentation between the Normal and DMA32 zones. | ||
3347 | */ | ||
3348 | static inline unsigned int | ||
3349 | alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) | ||
3350 | { | ||
3351 | unsigned int alloc_flags = 0; | ||
3352 | |||
3353 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | ||
3354 | alloc_flags |= ALLOC_KSWAPD; | ||
3355 | |||
3356 | #ifdef CONFIG_ZONE_DMA32 | ||
3357 | if (zone_idx(zone) != ZONE_NORMAL) | ||
3358 | goto out; | ||
3359 | |||
3360 | /* | ||
3361 | * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and | ||
3362 | * the pointer is within zone->zone_pgdat->node_zones[]. Also assume | ||
3363 | * on UMA that if Normal is populated then so is DMA32. | ||
3364 | */ | ||
3365 | BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); | ||
3366 | if (nr_online_nodes > 1 && !populated_zone(--zone)) | ||
3367 | goto out; | ||
3368 | |||
3369 | out: | ||
3370 | #endif /* CONFIG_ZONE_DMA32 */ | ||
3371 | return alloc_flags; | ||
3372 | } | ||
3373 | |||
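
The comment and BUILD_BUG_ON in alloc_flags_nofragment() encode an assumption: ZONE_DMA32 sits immediately before ZONE_NORMAL in the node's zone array, so stepping the zone pointer back one slot lands on DMA32. A toy user-space model of that adjacency check (the enum values, the populated flag and the function name are all invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, MAX_NR_ZONES };

    struct zone {
            enum zone_type idx;
            bool populated;
    };

    /* One node's zone array, indexed by zone_type, mirroring node_zones[]. */
    static struct zone node_zones[MAX_NR_ZONES] = {
            [ZONE_DMA]    = { ZONE_DMA,    true },
            [ZONE_DMA32]  = { ZONE_DMA32,  true },
            [ZONE_NORMAL] = { ZONE_NORMAL, true },
    };

    static bool dma32_below_normal(struct zone *zone)
    {
            /* Compile-time guarantee that the decrement lands on ZONE_DMA32. */
            _Static_assert(ZONE_NORMAL - ZONE_DMA32 == 1, "zone order changed");

            if (zone->idx != ZONE_NORMAL)
                    return false;
            return (--zone)->populated;   /* pointer steps back one array slot */
    }

    int main(void)
    {
            printf("DMA32 usable below Normal: %d\n",
                   dma32_below_normal(&node_zones[ZONE_NORMAL]));
            return 0;
    }
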
3374 | /* | ||
3257 | * get_page_from_freelist goes through the zonelist trying to allocate | 3375 | * get_page_from_freelist goes through the zonelist trying to allocate |
3258 | * a page. | 3376 | * a page. |
3259 | */ | 3377 | */ |
@@ -3261,14 +3379,18 @@ static struct page * | |||
3261 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, | 3379 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
3262 | const struct alloc_context *ac) | 3380 | const struct alloc_context *ac) |
3263 | { | 3381 | { |
3264 | struct zoneref *z = ac->preferred_zoneref; | 3382 | struct zoneref *z; |
3265 | struct zone *zone; | 3383 | struct zone *zone; |
3266 | struct pglist_data *last_pgdat_dirty_limit = NULL; | 3384 | struct pglist_data *last_pgdat_dirty_limit = NULL; |
3385 | bool no_fallback; | ||
3267 | 3386 | ||
3387 | retry: | ||
3268 | /* | 3388 | /* |
3269 | * Scan zonelist, looking for a zone with enough free. | 3389 | * Scan zonelist, looking for a zone with enough free. |
3270 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 3390 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
3271 | */ | 3391 | */ |
3392 | no_fallback = alloc_flags & ALLOC_NOFRAGMENT; | ||
3393 | z = ac->preferred_zoneref; | ||
3272 | for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, | 3394 | for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
3273 | ac->nodemask) { | 3395 | ac->nodemask) { |
3274 | struct page *page; | 3396 | struct page *page; |
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, | |||
3307 | } | 3429 | } |
3308 | } | 3430 | } |
3309 | 3431 | ||
3310 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 3432 | if (no_fallback && nr_online_nodes > 1 && |
3433 | zone != ac->preferred_zoneref->zone) { | ||
3434 | int local_nid; | ||
3435 | |||
3436 | /* | ||
3437 | * If moving to a remote node, retry but allow | ||
3438 | * fragmenting fallbacks. Locality is more important | ||
3439 | * than fragmentation avoidance. | ||
3440 | */ | ||
3441 | local_nid = zone_to_nid(ac->preferred_zoneref->zone); | ||
3442 | if (zone_to_nid(zone) != local_nid) { | ||
3443 | alloc_flags &= ~ALLOC_NOFRAGMENT; | ||
3444 | goto retry; | ||
3445 | } | ||
3446 | } | ||
3447 | |||
3448 | mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); | ||
3311 | if (!zone_watermark_fast(zone, order, mark, | 3449 | if (!zone_watermark_fast(zone, order, mark, |
3312 | ac_classzone_idx(ac), alloc_flags)) { | 3450 | ac_classzone_idx(ac), alloc_flags)) { |
3313 | int ret; | 3451 | int ret; |
@@ -3374,6 +3512,15 @@ try_this_zone: | |||
3374 | } | 3512 | } |
3375 | } | 3513 | } |
3376 | 3514 | ||
3515 | /* | ||
3516 | * It's possible on a UMA machine to get through all zones that are | ||
3517 | * fragmented. If avoiding fragmentation, reset and try again. | ||
3518 | */ | ||
3519 | if (no_fallback) { | ||
3520 | alloc_flags &= ~ALLOC_NOFRAGMENT; | ||
3521 | goto retry; | ||
3522 | } | ||
3523 | |||
3377 | return NULL; | 3524 | return NULL; |
3378 | } | 3525 | } |
3379 | 3526 | ||
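
Taken together, the get_page_from_freelist() changes amount to a two-pass scan: a strict pass that refuses fragmenting fallbacks while staying on the local node, then a relaxed retry with ALLOC_NOFRAGMENT cleared. A compact sketch of that control flow under heavy simplification (two fake zones and a made-up predicate, not the real zonelist walk):

    #include <stdbool.h>
    #include <stdio.h>

    #define ALLOC_NOFRAGMENT 0x1u

    /* Pretend "zone has a page of the right migratetype without stealing". */
    static bool zone_has_clean_page(int zone, unsigned int flags)
    {
            /* In this toy setup only a fragmenting fallback would succeed. */
            return !(flags & ALLOC_NOFRAGMENT) && zone == 1;
    }

    static int get_page(unsigned int flags)
    {
    retry:
            for (int zone = 0; zone < 2; zone++)
                    if (zone_has_clean_page(zone, flags))
                            return zone;

            if (flags & ALLOC_NOFRAGMENT) {
                    /* Strict pass failed everywhere: relax and rescan. */
                    flags &= ~ALLOC_NOFRAGMENT;
                    goto retry;
            }
            return -1;
    }

    int main(void)
    {
            printf("allocated from zone %d\n", get_page(ALLOC_NOFRAGMENT));
            return 0;
    }
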
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) | |||
3413 | va_start(args, fmt); | 3560 | va_start(args, fmt); |
3414 | vaf.fmt = fmt; | 3561 | vaf.fmt = fmt; |
3415 | vaf.va = &args; | 3562 | vaf.va = &args; |
3416 | pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", | 3563 | pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", |
3417 | current->comm, &vaf, gfp_mask, &gfp_mask, | 3564 | current->comm, &vaf, gfp_mask, &gfp_mask, |
3418 | nodemask_pr_args(nodemask)); | 3565 | nodemask_pr_args(nodemask)); |
3419 | va_end(args); | 3566 | va_end(args); |
3420 | 3567 | ||
3421 | cpuset_print_current_mems_allowed(); | 3568 | cpuset_print_current_mems_allowed(); |
3422 | 3569 | pr_cont("\n"); | |
3423 | dump_stack(); | 3570 | dump_stack(); |
3424 | warn_alloc_show_mem(gfp_mask, nodemask); | 3571 | warn_alloc_show_mem(gfp_mask, nodemask); |
3425 | } | 3572 | } |
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
3861 | } else if (unlikely(rt_task(current)) && !in_interrupt()) | 4008 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
3862 | alloc_flags |= ALLOC_HARDER; | 4009 | alloc_flags |= ALLOC_HARDER; |
3863 | 4010 | ||
4011 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | ||
4012 | alloc_flags |= ALLOC_KSWAPD; | ||
4013 | |||
3864 | #ifdef CONFIG_CMA | 4014 | #ifdef CONFIG_CMA |
3865 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 4015 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
3866 | alloc_flags |= ALLOC_CMA; | 4016 | alloc_flags |= ALLOC_CMA; |
@@ -4092,7 +4242,7 @@ retry_cpuset: | |||
4092 | if (!ac->preferred_zoneref->zone) | 4242 | if (!ac->preferred_zoneref->zone) |
4093 | goto nopage; | 4243 | goto nopage; |
4094 | 4244 | ||
4095 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 4245 | if (alloc_flags & ALLOC_KSWAPD) |
4096 | wake_all_kswapds(order, gfp_mask, ac); | 4246 | wake_all_kswapds(order, gfp_mask, ac); |
4097 | 4247 | ||
4098 | /* | 4248 | /* |
@@ -4150,7 +4300,7 @@ retry_cpuset: | |||
4150 | 4300 | ||
4151 | retry: | 4301 | retry: |
4152 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ | 4302 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ |
4153 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 4303 | if (alloc_flags & ALLOC_KSWAPD) |
4154 | wake_all_kswapds(order, gfp_mask, ac); | 4304 | wake_all_kswapds(order, gfp_mask, ac); |
4155 | 4305 | ||
4156 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); | 4306 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); |
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, | |||
4369 | 4519 | ||
4370 | finalise_ac(gfp_mask, &ac); | 4520 | finalise_ac(gfp_mask, &ac); |
4371 | 4521 | ||
4522 | /* | ||
4523 | * Forbid the first pass from falling back to types that fragment | ||
4524 | * memory until all local zones are considered. | ||
4525 | */ | ||
4526 | alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); | ||
4527 | |||
4372 | /* First allocation attempt */ | 4528 | /* First allocation attempt */ |
4373 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); | 4529 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
4374 | if (likely(page)) | 4530 | if (likely(page)) |
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) | |||
4427 | } | 4583 | } |
4428 | EXPORT_SYMBOL(get_zeroed_page); | 4584 | EXPORT_SYMBOL(get_zeroed_page); |
4429 | 4585 | ||
4430 | void __free_pages(struct page *page, unsigned int order) | 4586 | static inline void free_the_page(struct page *page, unsigned int order) |
4431 | { | 4587 | { |
4432 | if (put_page_testzero(page)) { | 4588 | if (order == 0) /* Via pcp? */ |
4433 | if (order == 0) | 4589 | free_unref_page(page); |
4434 | free_unref_page(page); | 4590 | else |
4435 | else | 4591 | __free_pages_ok(page, order); |
4436 | __free_pages_ok(page, order); | ||
4437 | } | ||
4438 | } | 4592 | } |
4439 | 4593 | ||
4594 | void __free_pages(struct page *page, unsigned int order) | ||
4595 | { | ||
4596 | if (put_page_testzero(page)) | ||
4597 | free_the_page(page, order); | ||
4598 | } | ||
4440 | EXPORT_SYMBOL(__free_pages); | 4599 | EXPORT_SYMBOL(__free_pages); |
4441 | 4600 | ||
4442 | void free_pages(unsigned long addr, unsigned int order) | 4601 | void free_pages(unsigned long addr, unsigned int order) |
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) | |||
4485 | { | 4644 | { |
4486 | VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); | 4645 | VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); |
4487 | 4646 | ||
4488 | if (page_ref_sub_and_test(page, count)) { | 4647 | if (page_ref_sub_and_test(page, count)) |
4489 | unsigned int order = compound_order(page); | 4648 | free_the_page(page, compound_order(page)); |
4490 | |||
4491 | if (order == 0) | ||
4492 | free_unref_page(page); | ||
4493 | else | ||
4494 | __free_pages_ok(page, order); | ||
4495 | } | ||
4496 | } | 4649 | } |
4497 | EXPORT_SYMBOL(__page_frag_cache_drain); | 4650 | EXPORT_SYMBOL(__page_frag_cache_drain); |
4498 | 4651 | ||
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr) | |||
4558 | struct page *page = virt_to_head_page(addr); | 4711 | struct page *page = virt_to_head_page(addr); |
4559 | 4712 | ||
4560 | if (unlikely(put_page_testzero(page))) | 4713 | if (unlikely(put_page_testzero(page))) |
4561 | __free_pages_ok(page, compound_order(page)); | 4714 | free_the_page(page, compound_order(page)); |
4562 | } | 4715 | } |
4563 | EXPORT_SYMBOL(page_frag_free); | 4716 | EXPORT_SYMBOL(page_frag_free); |
4564 | 4717 | ||
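
The three hunks above funnel __free_pages(), __page_frag_cache_drain() and page_frag_free() through one helper that picks the per-cpu path for order-0 pages and the buddy path otherwise. The shared-dispatch shape, reduced to a throwaway user-space toy with invented function names:

    #include <stdio.h>

    /* Stand-ins for the two real free paths. */
    static void free_via_pcp(void *page)
    {
            printf("pcp free %p\n", page);
    }

    static void free_via_buddy(void *page, unsigned int order)
    {
            printf("buddy free %p order %u\n", page, order);
    }

    /* One place decides which path to take; every caller reuses it. */
    static void free_the_page(void *page, unsigned int order)
    {
            if (order == 0)
                    free_via_pcp(page);
            else
                    free_via_buddy(page, order);
    }

    int main(void)
    {
            int dummy;

            free_the_page(&dummy, 0);
            free_the_page(&dummy, 3);
            return 0;
    }
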
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset) | |||
4660 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 4813 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
4661 | 4814 | ||
4662 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 4815 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
4663 | unsigned long size = zone->managed_pages; | 4816 | unsigned long size = zone_managed_pages(zone); |
4664 | unsigned long high = high_wmark_pages(zone); | 4817 | unsigned long high = high_wmark_pages(zone); |
4665 | if (size > high) | 4818 | if (size > high) |
4666 | sum += size - high; | 4819 | sum += size - high; |
@@ -4712,7 +4865,7 @@ long si_mem_available(void) | |||
4712 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); | 4865 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); |
4713 | 4866 | ||
4714 | for_each_zone(zone) | 4867 | for_each_zone(zone) |
4715 | wmark_low += zone->watermark[WMARK_LOW]; | 4868 | wmark_low += low_wmark_pages(zone); |
4716 | 4869 | ||
4717 | /* | 4870 | /* |
4718 | * Estimate the amount of memory available for userspace allocations, | 4871 | * Estimate the amount of memory available for userspace allocations, |
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); | |||
4746 | 4899 | ||
4747 | void si_meminfo(struct sysinfo *val) | 4900 | void si_meminfo(struct sysinfo *val) |
4748 | { | 4901 | { |
4749 | val->totalram = totalram_pages; | 4902 | val->totalram = totalram_pages(); |
4750 | val->sharedram = global_node_page_state(NR_SHMEM); | 4903 | val->sharedram = global_node_page_state(NR_SHMEM); |
4751 | val->freeram = global_zone_page_state(NR_FREE_PAGES); | 4904 | val->freeram = global_zone_page_state(NR_FREE_PAGES); |
4752 | val->bufferram = nr_blockdev_pages(); | 4905 | val->bufferram = nr_blockdev_pages(); |
4753 | val->totalhigh = totalhigh_pages; | 4906 | val->totalhigh = totalhigh_pages(); |
4754 | val->freehigh = nr_free_highpages(); | 4907 | val->freehigh = nr_free_highpages(); |
4755 | val->mem_unit = PAGE_SIZE; | 4908 | val->mem_unit = PAGE_SIZE; |
4756 | } | 4909 | } |
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
4767 | pg_data_t *pgdat = NODE_DATA(nid); | 4920 | pg_data_t *pgdat = NODE_DATA(nid); |
4768 | 4921 | ||
4769 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) | 4922 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
4770 | managed_pages += pgdat->node_zones[zone_type].managed_pages; | 4923 | managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); |
4771 | val->totalram = managed_pages; | 4924 | val->totalram = managed_pages; |
4772 | val->sharedram = node_page_state(pgdat, NR_SHMEM); | 4925 | val->sharedram = node_page_state(pgdat, NR_SHMEM); |
4773 | val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); | 4926 | val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); |
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
4776 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4929 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4777 | 4930 | ||
4778 | if (is_highmem(zone)) { | 4931 | if (is_highmem(zone)) { |
4779 | managed_highpages += zone->managed_pages; | 4932 | managed_highpages += zone_managed_pages(zone); |
4780 | free_highpages += zone_page_state(zone, NR_FREE_PAGES); | 4933 | free_highpages += zone_page_state(zone, NR_FREE_PAGES); |
4781 | } | 4934 | } |
4782 | } | 4935 | } |
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) | |||
4983 | K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), | 5136 | K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), |
4984 | K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), | 5137 | K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), |
4985 | K(zone->present_pages), | 5138 | K(zone->present_pages), |
4986 | K(zone->managed_pages), | 5139 | K(zone_managed_pages(zone)), |
4987 | K(zone_page_state(zone, NR_MLOCK)), | 5140 | K(zone_page_state(zone, NR_MLOCK)), |
4988 | zone_page_state(zone, NR_KERNEL_STACK_KB), | 5141 | zone_page_state(zone, NR_KERNEL_STACK_KB), |
4989 | K(zone_page_state(zone, NR_PAGETABLE)), | 5142 | K(zone_page_state(zone, NR_PAGETABLE)), |
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone) | |||
5655 | * The per-cpu-pages pools are set to around 1000th of the | 5808 | * The per-cpu-pages pools are set to around 1000th of the |
5656 | * size of the zone. | 5809 | * size of the zone. |
5657 | */ | 5810 | */ |
5658 | batch = zone->managed_pages / 1024; | 5811 | batch = zone_managed_pages(zone) / 1024; |
5659 | /* But no more than a meg. */ | 5812 | /* But no more than a meg. */ |
5660 | if (batch * PAGE_SIZE > 1024 * 1024) | 5813 | if (batch * PAGE_SIZE > 1024 * 1024) |
5661 | batch = (1024 * 1024) / PAGE_SIZE; | 5814 | batch = (1024 * 1024) / PAGE_SIZE; |
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p) | |||
5736 | memset(p, 0, sizeof(*p)); | 5889 | memset(p, 0, sizeof(*p)); |
5737 | 5890 | ||
5738 | pcp = &p->pcp; | 5891 | pcp = &p->pcp; |
5739 | pcp->count = 0; | ||
5740 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) | 5892 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
5741 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | 5893 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
5742 | } | 5894 | } |
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone, | |||
5766 | { | 5918 | { |
5767 | if (percpu_pagelist_fraction) | 5919 | if (percpu_pagelist_fraction) |
5768 | pageset_set_high(pcp, | 5920 | pageset_set_high(pcp, |
5769 | (zone->managed_pages / | 5921 | (zone_managed_pages(zone) / |
5770 | percpu_pagelist_fraction)); | 5922 | percpu_pagelist_fraction)); |
5771 | else | 5923 | else |
5772 | pageset_set_batch(pcp, zone_batchsize(zone)); | 5924 | pageset_set_batch(pcp, zone_batchsize(zone)); |
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
5920 | * with no available memory, a warning is printed and the start and end | 6072 | * with no available memory, a warning is printed and the start and end |
5921 | * PFNs will be 0. | 6073 | * PFNs will be 0. |
5922 | */ | 6074 | */ |
5923 | void __meminit get_pfn_range_for_nid(unsigned int nid, | 6075 | void __init get_pfn_range_for_nid(unsigned int nid, |
5924 | unsigned long *start_pfn, unsigned long *end_pfn) | 6076 | unsigned long *start_pfn, unsigned long *end_pfn) |
5925 | { | 6077 | { |
5926 | unsigned long this_start_pfn, this_end_pfn; | 6078 | unsigned long this_start_pfn, this_end_pfn; |
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void) | |||
5969 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 6121 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
5970 | * zones within a node are in order of monotonically increasing memory addresses | 6122 | * zones within a node are in order of monotonically increasing memory addresses |
5971 | */ | 6123 | */ |
5972 | static void __meminit adjust_zone_range_for_zone_movable(int nid, | 6124 | static void __init adjust_zone_range_for_zone_movable(int nid, |
5973 | unsigned long zone_type, | 6125 | unsigned long zone_type, |
5974 | unsigned long node_start_pfn, | 6126 | unsigned long node_start_pfn, |
5975 | unsigned long node_end_pfn, | 6127 | unsigned long node_end_pfn, |
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
6000 | * Return the number of pages a zone spans in a node, including holes | 6152 | * Return the number of pages a zone spans in a node, including holes |
6001 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 6153 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
6002 | */ | 6154 | */ |
6003 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, | 6155 | static unsigned long __init zone_spanned_pages_in_node(int nid, |
6004 | unsigned long zone_type, | 6156 | unsigned long zone_type, |
6005 | unsigned long node_start_pfn, | 6157 | unsigned long node_start_pfn, |
6006 | unsigned long node_end_pfn, | 6158 | unsigned long node_end_pfn, |
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
6035 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 6187 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
6036 | * then all holes in the requested range will be accounted for. | 6188 | * then all holes in the requested range will be accounted for. |
6037 | */ | 6189 | */ |
6038 | unsigned long __meminit __absent_pages_in_range(int nid, | 6190 | unsigned long __init __absent_pages_in_range(int nid, |
6039 | unsigned long range_start_pfn, | 6191 | unsigned long range_start_pfn, |
6040 | unsigned long range_end_pfn) | 6192 | unsigned long range_end_pfn) |
6041 | { | 6193 | { |
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
6065 | } | 6217 | } |
6066 | 6218 | ||
6067 | /* Return the number of page frames in holes in a zone on a node */ | 6219 | /* Return the number of page frames in holes in a zone on a node */ |
6068 | static unsigned long __meminit zone_absent_pages_in_node(int nid, | 6220 | static unsigned long __init zone_absent_pages_in_node(int nid, |
6069 | unsigned long zone_type, | 6221 | unsigned long zone_type, |
6070 | unsigned long node_start_pfn, | 6222 | unsigned long node_start_pfn, |
6071 | unsigned long node_end_pfn, | 6223 | unsigned long node_end_pfn, |
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
6117 | } | 6269 | } |
6118 | 6270 | ||
6119 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 6271 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
6120 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 6272 | static inline unsigned long __init zone_spanned_pages_in_node(int nid, |
6121 | unsigned long zone_type, | 6273 | unsigned long zone_type, |
6122 | unsigned long node_start_pfn, | 6274 | unsigned long node_start_pfn, |
6123 | unsigned long node_end_pfn, | 6275 | unsigned long node_end_pfn, |
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
6136 | return zones_size[zone_type]; | 6288 | return zones_size[zone_type]; |
6137 | } | 6289 | } |
6138 | 6290 | ||
6139 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | 6291 | static inline unsigned long __init zone_absent_pages_in_node(int nid, |
6140 | unsigned long zone_type, | 6292 | unsigned long zone_type, |
6141 | unsigned long node_start_pfn, | 6293 | unsigned long node_start_pfn, |
6142 | unsigned long node_end_pfn, | 6294 | unsigned long node_end_pfn, |
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
6150 | 6302 | ||
6151 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 6303 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
6152 | 6304 | ||
6153 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 6305 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, |
6154 | unsigned long node_start_pfn, | 6306 | unsigned long node_start_pfn, |
6155 | unsigned long node_end_pfn, | 6307 | unsigned long node_end_pfn, |
6156 | unsigned long *zones_size, | 6308 | unsigned long *zones_size, |
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) | |||
6323 | static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, | 6475 | static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, |
6324 | unsigned long remaining_pages) | 6476 | unsigned long remaining_pages) |
6325 | { | 6477 | { |
6326 | zone->managed_pages = remaining_pages; | 6478 | atomic_long_set(&zone->managed_pages, remaining_pages); |
6327 | zone_set_nid(zone, nid); | 6479 | zone_set_nid(zone, nid); |
6328 | zone->name = zone_names[idx]; | 6480 | zone->name = zone_names[idx]; |
6329 | zone->zone_pgdat = NODE_DATA(nid); | 6481 | zone->zone_pgdat = NODE_DATA(nid); |
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } | |||
6476 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 6628 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
6477 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) | 6629 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) |
6478 | { | 6630 | { |
6479 | /* | ||
6480 | * We start only with one section of pages, more pages are added as | ||
6481 | * needed until the rest of deferred pages are initialized. | ||
6482 | */ | ||
6483 | pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, | ||
6484 | pgdat->node_spanned_pages); | ||
6485 | pgdat->first_deferred_pfn = ULONG_MAX; | 6631 | pgdat->first_deferred_pfn = ULONG_MAX; |
6486 | } | 6632 | } |
6487 | #else | 6633 | #else |
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore); | |||
7075 | 7221 | ||
7076 | void adjust_managed_page_count(struct page *page, long count) | 7222 | void adjust_managed_page_count(struct page *page, long count) |
7077 | { | 7223 | { |
7078 | spin_lock(&managed_page_count_lock); | 7224 | atomic_long_add(count, &page_zone(page)->managed_pages); |
7079 | page_zone(page)->managed_pages += count; | 7225 | totalram_pages_add(count); |
7080 | totalram_pages += count; | ||
7081 | #ifdef CONFIG_HIGHMEM | 7226 | #ifdef CONFIG_HIGHMEM |
7082 | if (PageHighMem(page)) | 7227 | if (PageHighMem(page)) |
7083 | totalhigh_pages += count; | 7228 | totalhigh_pages_add(count); |
7084 | #endif | 7229 | #endif |
7085 | spin_unlock(&managed_page_count_lock); | ||
7086 | } | 7230 | } |
7087 | EXPORT_SYMBOL(adjust_managed_page_count); | 7231 | EXPORT_SYMBOL(adjust_managed_page_count); |
7088 | 7232 | ||
7089 | unsigned long free_reserved_area(void *start, void *end, int poison, char *s) | 7233 | unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) |
7090 | { | 7234 | { |
7091 | void *pos; | 7235 | void *pos; |
7092 | unsigned long pages = 0; | 7236 | unsigned long pages = 0; |
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area); | |||
7123 | void free_highmem_page(struct page *page) | 7267 | void free_highmem_page(struct page *page) |
7124 | { | 7268 | { |
7125 | __free_reserved_page(page); | 7269 | __free_reserved_page(page); |
7126 | totalram_pages++; | 7270 | totalram_pages_inc(); |
7127 | page_zone(page)->managed_pages++; | 7271 | atomic_long_inc(&page_zone(page)->managed_pages); |
7128 | totalhigh_pages++; | 7272 | totalhigh_pages_inc(); |
7129 | } | 7273 | } |
7130 | #endif | 7274 | #endif |
7131 | 7275 | ||
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str) | |||
7174 | physpages << (PAGE_SHIFT - 10), | 7318 | physpages << (PAGE_SHIFT - 10), |
7175 | codesize >> 10, datasize >> 10, rosize >> 10, | 7319 | codesize >> 10, datasize >> 10, rosize >> 10, |
7176 | (init_data_size + init_code_size) >> 10, bss_size >> 10, | 7320 | (init_data_size + init_code_size) >> 10, bss_size >> 10, |
7177 | (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), | 7321 | (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), |
7178 | totalcma_pages << (PAGE_SHIFT - 10), | 7322 | totalcma_pages << (PAGE_SHIFT - 10), |
7179 | #ifdef CONFIG_HIGHMEM | 7323 | #ifdef CONFIG_HIGHMEM |
7180 | totalhigh_pages << (PAGE_SHIFT - 10), | 7324 | totalhigh_pages() << (PAGE_SHIFT - 10), |
7181 | #endif | 7325 | #endif |
7182 | str ? ", " : "", str ? str : ""); | 7326 | str ? ", " : "", str ? str : ""); |
7183 | } | 7327 | } |
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void) | |||
7257 | for (i = 0; i < MAX_NR_ZONES; i++) { | 7401 | for (i = 0; i < MAX_NR_ZONES; i++) { |
7258 | struct zone *zone = pgdat->node_zones + i; | 7402 | struct zone *zone = pgdat->node_zones + i; |
7259 | long max = 0; | 7403 | long max = 0; |
7404 | unsigned long managed_pages = zone_managed_pages(zone); | ||
7260 | 7405 | ||
7261 | /* Find valid and maximum lowmem_reserve in the zone */ | 7406 | /* Find valid and maximum lowmem_reserve in the zone */ |
7262 | for (j = i; j < MAX_NR_ZONES; j++) { | 7407 | for (j = i; j < MAX_NR_ZONES; j++) { |
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void) | |||
7267 | /* we treat the high watermark as reserved pages. */ | 7412 | /* we treat the high watermark as reserved pages. */ |
7268 | max += high_wmark_pages(zone); | 7413 | max += high_wmark_pages(zone); |
7269 | 7414 | ||
7270 | if (max > zone->managed_pages) | 7415 | if (max > managed_pages) |
7271 | max = zone->managed_pages; | 7416 | max = managed_pages; |
7272 | 7417 | ||
7273 | pgdat->totalreserve_pages += max; | 7418 | pgdat->totalreserve_pages += max; |
7274 | 7419 | ||
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
7292 | for_each_online_pgdat(pgdat) { | 7437 | for_each_online_pgdat(pgdat) { |
7293 | for (j = 0; j < MAX_NR_ZONES; j++) { | 7438 | for (j = 0; j < MAX_NR_ZONES; j++) { |
7294 | struct zone *zone = pgdat->node_zones + j; | 7439 | struct zone *zone = pgdat->node_zones + j; |
7295 | unsigned long managed_pages = zone->managed_pages; | 7440 | unsigned long managed_pages = zone_managed_pages(zone); |
7296 | 7441 | ||
7297 | zone->lowmem_reserve[j] = 0; | 7442 | zone->lowmem_reserve[j] = 0; |
7298 | 7443 | ||
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
7310 | lower_zone->lowmem_reserve[j] = | 7455 | lower_zone->lowmem_reserve[j] = |
7311 | managed_pages / sysctl_lowmem_reserve_ratio[idx]; | 7456 | managed_pages / sysctl_lowmem_reserve_ratio[idx]; |
7312 | } | 7457 | } |
7313 | managed_pages += lower_zone->managed_pages; | 7458 | managed_pages += zone_managed_pages(lower_zone); |
7314 | } | 7459 | } |
7315 | } | 7460 | } |
7316 | } | 7461 | } |
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void) | |||
7329 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 7474 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
7330 | for_each_zone(zone) { | 7475 | for_each_zone(zone) { |
7331 | if (!is_highmem(zone)) | 7476 | if (!is_highmem(zone)) |
7332 | lowmem_pages += zone->managed_pages; | 7477 | lowmem_pages += zone_managed_pages(zone); |
7333 | } | 7478 | } |
7334 | 7479 | ||
7335 | for_each_zone(zone) { | 7480 | for_each_zone(zone) { |
7336 | u64 tmp; | 7481 | u64 tmp; |
7337 | 7482 | ||
7338 | spin_lock_irqsave(&zone->lock, flags); | 7483 | spin_lock_irqsave(&zone->lock, flags); |
7339 | tmp = (u64)pages_min * zone->managed_pages; | 7484 | tmp = (u64)pages_min * zone_managed_pages(zone); |
7340 | do_div(tmp, lowmem_pages); | 7485 | do_div(tmp, lowmem_pages); |
7341 | if (is_highmem(zone)) { | 7486 | if (is_highmem(zone)) { |
7342 | /* | 7487 | /* |
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void) | |||
7350 | */ | 7495 | */ |
7351 | unsigned long min_pages; | 7496 | unsigned long min_pages; |
7352 | 7497 | ||
7353 | min_pages = zone->managed_pages / 1024; | 7498 | min_pages = zone_managed_pages(zone) / 1024; |
7354 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); | 7499 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
7355 | zone->watermark[WMARK_MIN] = min_pages; | 7500 | zone->_watermark[WMARK_MIN] = min_pages; |
7356 | } else { | 7501 | } else { |
7357 | /* | 7502 | /* |
7358 | * If it's a lowmem zone, reserve a number of pages | 7503 | * If it's a lowmem zone, reserve a number of pages |
7359 | * proportionate to the zone's size. | 7504 | * proportionate to the zone's size. |
7360 | */ | 7505 | */ |
7361 | zone->watermark[WMARK_MIN] = tmp; | 7506 | zone->_watermark[WMARK_MIN] = tmp; |
7362 | } | 7507 | } |
7363 | 7508 | ||
7364 | /* | 7509 | /* |
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void) | |||
7367 | * ensure a minimum size on small systems. | 7512 | * ensure a minimum size on small systems. |
7368 | */ | 7513 | */ |
7369 | tmp = max_t(u64, tmp >> 2, | 7514 | tmp = max_t(u64, tmp >> 2, |
7370 | mult_frac(zone->managed_pages, | 7515 | mult_frac(zone_managed_pages(zone), |
7371 | watermark_scale_factor, 10000)); | 7516 | watermark_scale_factor, 10000)); |
7372 | 7517 | ||
7373 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; | 7518 | zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; |
7374 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; | 7519 | zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; |
7520 | zone->watermark_boost = 0; | ||
7375 | 7521 | ||
7376 | spin_unlock_irqrestore(&zone->lock, flags); | 7522 | spin_unlock_irqrestore(&zone->lock, flags); |
7377 | } | 7523 | } |
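
Reading the arithmetic in this hunk: each zone's min watermark is its managed-pages share of the global minimum, and the low/high marks sit above it by max(min/4, managed * watermark_scale_factor / 10000) and twice that amount. A standalone calculation mirroring those formulas with sample numbers (highmem clamping ignored; the scale factor of 10 is an assumed default):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t pages_min = 11309;          /* sample global minimum, in pages */
            uint64_t managed = 4 * 1024 * 1024;  /* 16 GB zone at 4K pages, sample value */
            uint64_t lowmem_pages = managed;     /* single-zone example */
            unsigned int watermark_scale_factor = 10;

            uint64_t min = pages_min * managed / lowmem_pages;
            uint64_t step = min / 4;
            uint64_t scaled = managed * watermark_scale_factor / 10000;

            if (scaled > step)
                    step = scaled;

            printf("min=%" PRIu64 " low=%" PRIu64 " high=%" PRIu64 " pages\n",
                   min, min + step, min + 2 * step);
            return 0;
    }

With these inputs the sketch prints min=11309 low=15503 high=19697 pages.
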
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, | |||
7472 | return 0; | 7618 | return 0; |
7473 | } | 7619 | } |
7474 | 7620 | ||
7621 | int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, | ||
7622 | void __user *buffer, size_t *length, loff_t *ppos) | ||
7623 | { | ||
7624 | int rc; | ||
7625 | |||
7626 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
7627 | if (rc) | ||
7628 | return rc; | ||
7629 | |||
7630 | return 0; | ||
7631 | } | ||
7632 | |||
7475 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, | 7633 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, |
7476 | void __user *buffer, size_t *length, loff_t *ppos) | 7634 | void __user *buffer, size_t *length, loff_t *ppos) |
7477 | { | 7635 | { |
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void) | |||
7497 | pgdat->min_unmapped_pages = 0; | 7655 | pgdat->min_unmapped_pages = 0; |
7498 | 7656 | ||
7499 | for_each_zone(zone) | 7657 | for_each_zone(zone) |
7500 | zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * | 7658 | zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * |
7501 | sysctl_min_unmapped_ratio) / 100; | 7659 | sysctl_min_unmapped_ratio) / 100; |
7502 | } | 7660 | } |
7503 | 7661 | ||
7504 | 7662 | ||
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void) | |||
7525 | pgdat->min_slab_pages = 0; | 7683 | pgdat->min_slab_pages = 0; |
7526 | 7684 | ||
7527 | for_each_zone(zone) | 7685 | for_each_zone(zone) |
7528 | zone->zone_pgdat->min_slab_pages += (zone->managed_pages * | 7686 | zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * |
7529 | sysctl_min_slab_ratio) / 100; | 7687 | sysctl_min_slab_ratio) / 100; |
7530 | } | 7688 | } |
7531 | 7689 | ||
7532 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, | 7690 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, |
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
7766 | * race condition. So you can't expect this function to be exact. | 7924 | * race condition. So you can't expect this function to be exact. |
7767 | */ | 7925 | */ |
7768 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | 7926 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
7769 | int migratetype, | 7927 | int migratetype, int flags) |
7770 | bool skip_hwpoisoned_pages) | ||
7771 | { | 7928 | { |
7772 | unsigned long pfn, iter, found; | 7929 | unsigned long pfn, iter, found; |
7773 | 7930 | ||
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
7841 | * The HWPoisoned page may be not in buddy system, and | 7998 | * The HWPoisoned page may be not in buddy system, and |
7842 | * page_count() is not 0. | 7999 | * page_count() is not 0. |
7843 | */ | 8000 | */ |
7844 | if (skip_hwpoisoned_pages && PageHWPoison(page)) | 8001 | if ((flags & SKIP_HWPOISON) && PageHWPoison(page)) |
7845 | continue; | 8002 | continue; |
7846 | 8003 | ||
7847 | if (__PageMovable(page)) | 8004 | if (__PageMovable(page)) |
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
7868 | return false; | 8025 | return false; |
7869 | unmovable: | 8026 | unmovable: |
7870 | WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); | 8027 | WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); |
8028 | if (flags & REPORT_FAILURE) | ||
8029 | dump_page(pfn_to_page(pfn+iter), "unmovable page"); | ||
7871 | return true; | 8030 | return true; |
7872 | } | 8031 | } |
7873 | 8032 | ||
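
This hunk trades a single bool parameter for a flags word so callers can combine behaviours such as skipping hwpoisoned pages and dumping the offending page on failure. The bool-to-bitmask shape in a trivial standalone form, with invented flag semantics:

    #include <stdbool.h>
    #include <stdio.h>

    #define SKIP_HWPOISON  (1 << 0)
    #define REPORT_FAILURE (1 << 1)

    static bool page_is_hwpoison(int pfn) { return pfn == 2; }

    static bool has_unmovable(int start_pfn, int nr, int flags)
    {
            for (int pfn = start_pfn; pfn < start_pfn + nr; pfn++) {
                    if ((flags & SKIP_HWPOISON) && page_is_hwpoison(pfn))
                            continue;
                    if (pfn == 2) {                 /* pretend pfn 2 is unmovable */
                            if (flags & REPORT_FAILURE)
                                    printf("unmovable page at pfn %d\n", pfn);
                            return true;
                    }
            }
            return false;
    }

    int main(void)
    {
            printf("%d\n", has_unmovable(0, 4, SKIP_HWPOISON));    /* skipped: 0 */
            printf("%d\n", has_unmovable(0, 4, REPORT_FAILURE));   /* reported: 1 */
            return 0;
    }
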
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
7994 | */ | 8153 | */ |
7995 | 8154 | ||
7996 | ret = start_isolate_page_range(pfn_max_align_down(start), | 8155 | ret = start_isolate_page_range(pfn_max_align_down(start), |
7997 | pfn_max_align_up(end), migratetype, | 8156 | pfn_max_align_up(end), migratetype, 0); |
7998 | false); | ||
7999 | if (ret) | 8157 | if (ret) |
8000 | return ret; | 8158 | return ret; |
8001 | 8159 | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 43e085608846..ce323e56b34d 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -15,8 +15,7 @@ | |||
15 | #define CREATE_TRACE_POINTS | 15 | #define CREATE_TRACE_POINTS |
16 | #include <trace/events/page_isolation.h> | 16 | #include <trace/events/page_isolation.h> |
17 | 17 | ||
18 | static int set_migratetype_isolate(struct page *page, int migratetype, | 18 | static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) |
19 | bool skip_hwpoisoned_pages) | ||
20 | { | 19 | { |
21 | struct zone *zone; | 20 | struct zone *zone; |
22 | unsigned long flags, pfn; | 21 | unsigned long flags, pfn; |
@@ -60,8 +59,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, | |||
60 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 59 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
61 | * We just check MOVABLE pages. | 60 | * We just check MOVABLE pages. |
62 | */ | 61 | */ |
63 | if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, | 62 | if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags)) |
64 | skip_hwpoisoned_pages)) | ||
65 | ret = 0; | 63 | ret = 0; |
66 | 64 | ||
67 | /* | 65 | /* |
@@ -185,7 +183,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
185 | * prevents two threads from simultaneously working on overlapping ranges. | 183 | * prevents two threads from simultaneously working on overlapping ranges. |
186 | */ | 184 | */ |
187 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 185 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
188 | unsigned migratetype, bool skip_hwpoisoned_pages) | 186 | unsigned migratetype, int flags) |
189 | { | 187 | { |
190 | unsigned long pfn; | 188 | unsigned long pfn; |
191 | unsigned long undo_pfn; | 189 | unsigned long undo_pfn; |
@@ -199,7 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
199 | pfn += pageblock_nr_pages) { | 197 | pfn += pageblock_nr_pages) { |
200 | page = __first_valid_page(pfn, pageblock_nr_pages); | 198 | page = __first_valid_page(pfn, pageblock_nr_pages); |
201 | if (page && | 199 | if (page && |
202 | set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { | 200 | set_migratetype_isolate(page, migratetype, flags)) { |
203 | undo_pfn = pfn; | 201 | undo_pfn = pfn; |
204 | goto undo; | 202 | goto undo; |
205 | } | 203 | } |
diff --git a/mm/page_owner.c b/mm/page_owner.c index 87bc0dfdb52b..28b06524939f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
351 | .skip = 0 | 351 | .skip = 0 |
352 | }; | 352 | }; |
353 | 353 | ||
354 | count = min_t(size_t, count, PAGE_SIZE); | ||
354 | kbuf = kmalloc(count, GFP_KERNEL); | 355 | kbuf = kmalloc(count, GFP_KERNEL); |
355 | if (!kbuf) | 356 | if (!kbuf) |
356 | return -ENOMEM; | 357 | return -ENOMEM; |
diff --git a/mm/readahead.c b/mm/readahead.c index f3d6f9656a3c..1ae16522412a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -270,17 +270,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |||
270 | * return it as the new window size. | 270 | * return it as the new window size. |
271 | */ | 271 | */ |
272 | static unsigned long get_next_ra_size(struct file_ra_state *ra, | 272 | static unsigned long get_next_ra_size(struct file_ra_state *ra, |
273 | unsigned long max) | 273 | unsigned long max) |
274 | { | 274 | { |
275 | unsigned long cur = ra->size; | 275 | unsigned long cur = ra->size; |
276 | unsigned long newsize; | ||
277 | 276 | ||
278 | if (cur < max / 16) | 277 | if (cur < max / 16) |
279 | newsize = 4 * cur; | 278 | return 4 * cur; |
280 | else | 279 | if (cur <= max / 2) |
281 | newsize = 2 * cur; | 280 | return 2 * cur; |
282 | 281 | return max; | |
283 | return min(newsize, max); | ||
284 | } | 282 | } |
285 | 283 | ||
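
The rewritten get_next_ra_size() grows the readahead window 4x while it is small (below max/16), 2x up to half of max, and otherwise pins it at max. A user-space rendering of that policy, stepping a window up from 4 pages against a sample 512-page maximum:

    #include <stdio.h>

    static unsigned long next_ra_size(unsigned long cur, unsigned long max)
    {
            if (cur < max / 16)
                    return 4 * cur;
            if (cur <= max / 2)
                    return 2 * cur;
            return max;
    }

    int main(void)
    {
            unsigned long max = 512;           /* sample maximum window, in pages */

            for (unsigned long ra = 4; ra < max; ra = next_ra_size(ra, max))
                    printf("%lu -> %lu\n", ra, next_ra_size(ra, max));
            return 0;
    }

Running it prints 4 -> 16, 16 -> 64, 64 -> 128, 128 -> 256, 256 -> 512.
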
286 | /* | 284 | /* |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -25,6 +25,7 @@ | |||

25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) | 26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) |
27 | * mapping->i_mmap_rwsem | 27 | * mapping->i_mmap_rwsem |
28 | * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) | ||
28 | * anon_vma->rwsem | 29 | * anon_vma->rwsem |
29 | * mm->page_table_lock or pte_lock | 30 | * mm->page_table_lock or pte_lock |
30 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) | 31 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) |
@@ -889,15 +890,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
889 | .address = address, | 890 | .address = address, |
890 | .flags = PVMW_SYNC, | 891 | .flags = PVMW_SYNC, |
891 | }; | 892 | }; |
892 | unsigned long start = address, end; | 893 | struct mmu_notifier_range range; |
893 | int *cleaned = arg; | 894 | int *cleaned = arg; |
894 | 895 | ||
895 | /* | 896 | /* |
896 | * We have to assume the worse case ie pmd for invalidation. Note that | 897 | * We have to assume the worse case ie pmd for invalidation. Note that |
897 | * the page can not be free from this function. | 898 | * the page can not be free from this function. |
898 | */ | 899 | */ |
899 | end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); | 900 | mmu_notifier_range_init(&range, vma->vm_mm, address, |
900 | mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); | 901 | min(vma->vm_end, address + |
902 | (PAGE_SIZE << compound_order(page)))); | ||
903 | mmu_notifier_invalidate_range_start(&range); | ||
901 | 904 | ||
902 | while (page_vma_mapped_walk(&pvmw)) { | 905 | while (page_vma_mapped_walk(&pvmw)) { |
903 | unsigned long cstart; | 906 | unsigned long cstart; |
@@ -949,7 +952,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
949 | (*cleaned)++; | 952 | (*cleaned)++; |
950 | } | 953 | } |
951 | 954 | ||
952 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); | 955 | mmu_notifier_invalidate_range_end(&range); |
953 | 956 | ||
954 | return true; | 957 | return true; |
955 | } | 958 | } |
@@ -1017,7 +1020,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) | |||
1017 | 1020 | ||
1018 | /** | 1021 | /** |
1019 | * __page_set_anon_rmap - set up new anonymous rmap | 1022 | * __page_set_anon_rmap - set up new anonymous rmap |
1020 | * @page: Page to add to rmap | 1023 | * @page: Page or Hugepage to add to rmap |
1021 | * @vma: VM area to add page to. | 1024 | * @vma: VM area to add page to. |
1022 | * @address: User virtual address of the mapping | 1025 | * @address: User virtual address of the mapping |
1023 | * @exclusive: the page is exclusively owned by the current process | 1026 | * @exclusive: the page is exclusively owned by the current process |
@@ -1345,7 +1348,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1345 | pte_t pteval; | 1348 | pte_t pteval; |
1346 | struct page *subpage; | 1349 | struct page *subpage; |
1347 | bool ret = true; | 1350 | bool ret = true; |
1348 | unsigned long start = address, end; | 1351 | struct mmu_notifier_range range; |
1349 | enum ttu_flags flags = (enum ttu_flags)arg; | 1352 | enum ttu_flags flags = (enum ttu_flags)arg; |
1350 | 1353 | ||
1351 | /* munlock has nothing to gain from examining un-locked vmas */ | 1354 | /* munlock has nothing to gain from examining un-locked vmas */ |
@@ -1369,15 +1372,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1369 | * Note that the page cannot be freed in this function as the caller of | 1372 | * Note that the page cannot be freed in this function as the caller of |
1370 | * try_to_unmap() must hold a reference on the page. | 1373 | * try_to_unmap() must hold a reference on the page. |
1371 | */ | 1374 | */ |
1372 | end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); | 1375 | mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start, |
1376 | min(vma->vm_end, vma->vm_start + | ||
1377 | (PAGE_SIZE << compound_order(page)))); | ||
1373 | if (PageHuge(page)) { | 1378 | if (PageHuge(page)) { |
1374 | /* | 1379 | /* |
1375 | * If sharing is possible, start and end will be adjusted | 1380 | * If sharing is possible, start and end will be adjusted |
1376 | * accordingly. | 1381 | * accordingly. |
1382 | * | ||
1383 | * If called for a huge page, caller must hold i_mmap_rwsem | ||
1384 | * in write mode as it is possible to call huge_pmd_unshare. | ||
1377 | */ | 1385 | */ |
1378 | adjust_range_if_pmd_sharing_possible(vma, &start, &end); | 1386 | adjust_range_if_pmd_sharing_possible(vma, &range.start, |
1387 | &range.end); | ||
1379 | } | 1388 | } |
1380 | mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); | 1389 | mmu_notifier_invalidate_range_start(&range); |
1381 | 1390 | ||
1382 | while (page_vma_mapped_walk(&pvmw)) { | 1391 | while (page_vma_mapped_walk(&pvmw)) { |
1383 | #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION | 1392 | #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION |
@@ -1428,9 +1437,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1428 | * we must flush them all. start/end were | 1437 | * we must flush them all. start/end were |
1429 | * already adjusted above to cover this range. | 1438 | * already adjusted above to cover this range. |
1430 | */ | 1439 | */ |
1431 | flush_cache_range(vma, start, end); | 1440 | flush_cache_range(vma, range.start, range.end); |
1432 | flush_tlb_range(vma, start, end); | 1441 | flush_tlb_range(vma, range.start, range.end); |
1433 | mmu_notifier_invalidate_range(mm, start, end); | 1442 | mmu_notifier_invalidate_range(mm, range.start, |
1443 | range.end); | ||
1434 | 1444 | ||
1435 | /* | 1445 | /* |
1436 | * The ref count of the PMD page was dropped | 1446 | * The ref count of the PMD page was dropped |
@@ -1650,7 +1660,7 @@ discard: | |||
1650 | put_page(page); | 1660 | put_page(page); |
1651 | } | 1661 | } |
1652 | 1662 | ||
1653 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); | 1663 | mmu_notifier_invalidate_range_end(&range); |
1654 | 1664 | ||
1655 | return ret; | 1665 | return ret; |
1656 | } | 1666 | } |
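
The try_to_unmap_one() conversion replaces loose mm/start/end locals with an mmu_notifier_range that is initialized once, optionally widened for shared hugetlb PMDs, and then passed to both notifier calls. A minimal sketch of that bundle-once, pass-by-pointer shape using toy types rather than the real API:

    #include <stdio.h>

    /* Toy stand-in for struct mmu_notifier_range: one bundle instead of three args. */
    struct range {
            void *mm;
            unsigned long start, end;
    };

    static void range_init(struct range *r, void *mm,
                           unsigned long start, unsigned long end)
    {
            r->mm = mm;
            r->start = start;
            r->end = end;
    }

    static void invalidate_start(const struct range *r)
    {
            printf("invalidate start [%#lx, %#lx)\n", r->start, r->end);
    }

    static void invalidate_end(const struct range *r)
    {
            printf("invalidate end   [%#lx, %#lx)\n", r->start, r->end);
    }

    int main(void)
    {
            struct range range;
            int fake_mm;

            range_init(&range, &fake_mm, 0x1000, 0x3000);
            /* A caller may still widen the range before the start call. */
            range.end += 0x1000;

            invalidate_start(&range);
            /* ... unmap work would happen here ... */
            invalidate_end(&range);
            return 0;
    }
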
@@ -1910,27 +1920,10 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) | |||
1910 | 1920 | ||
1911 | #ifdef CONFIG_HUGETLB_PAGE | 1921 | #ifdef CONFIG_HUGETLB_PAGE |
1912 | /* | 1922 | /* |
1913 | * The following three functions are for anonymous (private mapped) hugepages. | 1923 | * The following two functions are for anonymous (private mapped) hugepages. |
1914 | * Unlike common anonymous pages, anonymous hugepages have no accounting code | 1924 | * Unlike common anonymous pages, anonymous hugepages have no accounting code |
1915 | * and no lru code, because we handle hugepages differently from common pages. | 1925 | * and no lru code, because we handle hugepages differently from common pages. |
1916 | */ | 1926 | */ |
1917 | static void __hugepage_set_anon_rmap(struct page *page, | ||
1918 | struct vm_area_struct *vma, unsigned long address, int exclusive) | ||
1919 | { | ||
1920 | struct anon_vma *anon_vma = vma->anon_vma; | ||
1921 | |||
1922 | BUG_ON(!anon_vma); | ||
1923 | |||
1924 | if (PageAnon(page)) | ||
1925 | return; | ||
1926 | if (!exclusive) | ||
1927 | anon_vma = anon_vma->root; | ||
1928 | |||
1929 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
1930 | page->mapping = (struct address_space *) anon_vma; | ||
1931 | page->index = linear_page_index(vma, address); | ||
1932 | } | ||
1933 | |||
1934 | void hugepage_add_anon_rmap(struct page *page, | 1927 | void hugepage_add_anon_rmap(struct page *page, |
1935 | struct vm_area_struct *vma, unsigned long address) | 1928 | struct vm_area_struct *vma, unsigned long address) |
1936 | { | 1929 | { |
@@ -1942,7 +1935,7 @@ void hugepage_add_anon_rmap(struct page *page, | |||
1942 | /* address might be in next vma when migration races vma_adjust */ | 1935 | /* address might be in next vma when migration races vma_adjust */ |
1943 | first = atomic_inc_and_test(compound_mapcount_ptr(page)); | 1936 | first = atomic_inc_and_test(compound_mapcount_ptr(page)); |
1944 | if (first) | 1937 | if (first) |
1945 | __hugepage_set_anon_rmap(page, vma, address, 0); | 1938 | __page_set_anon_rmap(page, vma, address, 0); |
1946 | } | 1939 | } |
1947 | 1940 | ||
1948 | void hugepage_add_new_anon_rmap(struct page *page, | 1941 | void hugepage_add_new_anon_rmap(struct page *page, |
@@ -1950,6 +1943,6 @@ void hugepage_add_new_anon_rmap(struct page *page, | |||
1950 | { | 1943 | { |
1951 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1944 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
1952 | atomic_set(compound_mapcount_ptr(page), 0); | 1945 | atomic_set(compound_mapcount_ptr(page), 0); |
1953 | __hugepage_set_anon_rmap(page, vma, address, 1); | 1946 | __page_set_anon_rmap(page, vma, address, 1); |
1954 | } | 1947 | } |
1955 | #endif /* CONFIG_HUGETLB_PAGE */ | 1948 | #endif /* CONFIG_HUGETLB_PAGE */ |
diff --git a/mm/shmem.c b/mm/shmem.c index 375f3ac19bb8..6ece1e2fe76e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -109,12 +109,14 @@ struct shmem_falloc { | |||
109 | #ifdef CONFIG_TMPFS | 109 | #ifdef CONFIG_TMPFS |
110 | static unsigned long shmem_default_max_blocks(void) | 110 | static unsigned long shmem_default_max_blocks(void) |
111 | { | 111 | { |
112 | return totalram_pages / 2; | 112 | return totalram_pages() / 2; |
113 | } | 113 | } |
114 | 114 | ||
115 | static unsigned long shmem_default_max_inodes(void) | 115 | static unsigned long shmem_default_max_inodes(void) |
116 | { | 116 | { |
117 | return min(totalram_pages - totalhigh_pages, totalram_pages / 2); | 117 | unsigned long nr_pages = totalram_pages(); |
118 | |||
119 | return min(nr_pages - totalhigh_pages(), nr_pages / 2); | ||
118 | } | 120 | } |
119 | #endif | 121 | #endif |
120 | 122 | ||
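
With totalram_pages now behind an accessor, shmem_default_max_inodes() snapshots it once so both operands of the min() see the same value instead of re-reading a counter that may move between reads. The same idea in a small standalone toy built on a C11 atomic (names invented):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_ulong total_pages;      /* stand-in for the managed-page counter */
    static unsigned long high_pages = 64; /* stand-in for totalhigh_pages() */

    static unsigned long total_pages_read(void)
    {
            return atomic_load(&total_pages);
    }

    static unsigned long default_max_inodes(void)
    {
            /* One snapshot, used consistently for both sides of the min(). */
            unsigned long nr_pages = total_pages_read();
            unsigned long a = nr_pages - high_pages;
            unsigned long b = nr_pages / 2;

            return a < b ? a : b;
    }

    int main(void)
    {
            atomic_store(&total_pages, 1024);
            printf("default max inodes: %lu\n", default_max_inodes());
            return 0;
    }
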
@@ -3301,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
3301 | size = memparse(value,&rest); | 3303 | size = memparse(value,&rest); |
3302 | if (*rest == '%') { | 3304 | if (*rest == '%') { |
3303 | size <<= PAGE_SHIFT; | 3305 | size <<= PAGE_SHIFT; |
3304 | size *= totalram_pages; | 3306 | size *= totalram_pages(); |
3305 | do_div(size, 100); | 3307 | do_div(size, 100); |
3306 | rest++; | 3308 | rest++; |
3307 | } | 3309 | } |
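Throughout this series, direct reads of the totalram_pages global are replaced with a totalram_pages() accessor; the underlying counter is converted elsewhere in the merge so that it is read atomically through the helper. A rough userspace analogue of the accessor pattern (illustrative names, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-in for the kernel's managed-page counter. */
    static atomic_long _total_pages = 1048576;

    static long total_pages(void)             /* accessor hides the storage type */
    {
        return atomic_load(&_total_pages);
    }

    static long default_max_blocks(void)      /* callers now use the function form */
    {
        return total_pages() / 2;
    }

    int main(void)
    {
        printf("max blocks = %ld\n", default_max_blocks());
        return 0;
    }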
@@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, | |||
406 | return page->s_mem + cache->size * idx; | 406 | return page->s_mem + cache->size * idx; |
407 | } | 407 | } |
408 | 408 | ||
409 | /* | ||
410 | * We want to avoid an expensive divide : (offset / cache->size) | ||
411 | * Using the fact that size is a constant for a particular cache, | ||
412 | * we can replace (offset / cache->size) by | ||
413 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | ||
414 | */ | ||
415 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | ||
416 | const struct page *page, void *obj) | ||
417 | { | ||
418 | u32 offset = (obj - page->s_mem); | ||
419 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | ||
420 | } | ||
421 | |||
422 | #define BOOT_CPUCACHE_ENTRIES 1 | 409 | #define BOOT_CPUCACHE_ENTRIES 1 |
423 | /* internal cache of cache description objs */ | 410 | /* internal cache of cache description objs */ |
424 | static struct kmem_cache kmem_cache_boot = { | 411 | static struct kmem_cache kmem_cache_boot = { |
@@ -1248,7 +1235,7 @@ void __init kmem_cache_init(void) | |||
1248 | * page orders on machines with more than 32MB of memory if | 1235 | * page orders on machines with more than 32MB of memory if |
1249 | * not overridden on the command line. | 1236 | * not overridden on the command line. |
1250 | */ | 1237 | */ |
1251 | if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) | 1238 | if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) |
1252 | slab_max_order = SLAB_MAX_ORDER_HI; | 1239 | slab_max_order = SLAB_MAX_ORDER_HI; |
1253 | 1240 | ||
1254 | /* Bootstrap is tricky, because several objects are allocated | 1241 | /* Bootstrap is tricky, because several objects are allocated |
@@ -2370,7 +2357,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, | |||
2370 | void *freelist; | 2357 | void *freelist; |
2371 | void *addr = page_address(page); | 2358 | void *addr = page_address(page); |
2372 | 2359 | ||
2373 | page->s_mem = addr + colour_off; | 2360 | page->s_mem = kasan_reset_tag(addr) + colour_off; |
2374 | page->active = 0; | 2361 | page->active = 0; |
2375 | 2362 | ||
2376 | if (OBJFREELIST_SLAB(cachep)) | 2363 | if (OBJFREELIST_SLAB(cachep)) |
@@ -2574,7 +2561,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2574 | 2561 | ||
2575 | for (i = 0; i < cachep->num; i++) { | 2562 | for (i = 0; i < cachep->num; i++) { |
2576 | objp = index_to_obj(cachep, page, i); | 2563 | objp = index_to_obj(cachep, page, i); |
2577 | kasan_init_slab_obj(cachep, objp); | 2564 | objp = kasan_init_slab_obj(cachep, objp); |
2578 | 2565 | ||
2579 | /* constructor could break poison info */ | 2566 | /* constructor could break poison info */ |
2580 | if (DEBUG == 0 && cachep->ctor) { | 2567 | if (DEBUG == 0 && cachep->ctor) { |
@@ -3551,7 +3538,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3551 | { | 3538 | { |
3552 | void *ret = slab_alloc(cachep, flags, _RET_IP_); | 3539 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
3553 | 3540 | ||
3554 | kasan_slab_alloc(cachep, ret, flags); | 3541 | ret = kasan_slab_alloc(cachep, ret, flags); |
3555 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3542 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3556 | cachep->object_size, cachep->size, flags); | 3543 | cachep->object_size, cachep->size, flags); |
3557 | 3544 | ||
@@ -3617,7 +3604,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) | |||
3617 | 3604 | ||
3618 | ret = slab_alloc(cachep, flags, _RET_IP_); | 3605 | ret = slab_alloc(cachep, flags, _RET_IP_); |
3619 | 3606 | ||
3620 | kasan_kmalloc(cachep, ret, size, flags); | 3607 | ret = kasan_kmalloc(cachep, ret, size, flags); |
3621 | trace_kmalloc(_RET_IP_, ret, | 3608 | trace_kmalloc(_RET_IP_, ret, |
3622 | size, cachep->size, flags); | 3609 | size, cachep->size, flags); |
3623 | return ret; | 3610 | return ret; |
@@ -3641,7 +3628,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3641 | { | 3628 | { |
3642 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3629 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3643 | 3630 | ||
3644 | kasan_slab_alloc(cachep, ret, flags); | 3631 | ret = kasan_slab_alloc(cachep, ret, flags); |
3645 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3632 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3646 | cachep->object_size, cachep->size, | 3633 | cachep->object_size, cachep->size, |
3647 | flags, nodeid); | 3634 | flags, nodeid); |
@@ -3660,7 +3647,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, | |||
3660 | 3647 | ||
3661 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3648 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3662 | 3649 | ||
3663 | kasan_kmalloc(cachep, ret, size, flags); | 3650 | ret = kasan_kmalloc(cachep, ret, size, flags); |
3664 | trace_kmalloc_node(_RET_IP_, ret, | 3651 | trace_kmalloc_node(_RET_IP_, ret, |
3665 | size, cachep->size, | 3652 | size, cachep->size, |
3666 | flags, nodeid); | 3653 | flags, nodeid); |
@@ -3681,7 +3668,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | |||
3681 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3668 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3682 | return cachep; | 3669 | return cachep; |
3683 | ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3670 | ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3684 | kasan_kmalloc(cachep, ret, size, flags); | 3671 | ret = kasan_kmalloc(cachep, ret, size, flags); |
3685 | 3672 | ||
3686 | return ret; | 3673 | return ret; |
3687 | } | 3674 | } |
@@ -3719,7 +3706,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3719 | return cachep; | 3706 | return cachep; |
3720 | ret = slab_alloc(cachep, flags, caller); | 3707 | ret = slab_alloc(cachep, flags, caller); |
3721 | 3708 | ||
3722 | kasan_kmalloc(cachep, ret, size, flags); | 3709 | ret = kasan_kmalloc(cachep, ret, size, flags); |
3723 | trace_kmalloc(caller, ret, | 3710 | trace_kmalloc(caller, ret, |
3724 | size, cachep->size, flags); | 3711 | size, cachep->size, flags); |
3725 | 3712 | ||
@@ -441,7 +441,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
441 | 441 | ||
442 | kmemleak_alloc_recursive(object, s->object_size, 1, | 442 | kmemleak_alloc_recursive(object, s->object_size, 1, |
443 | s->flags, flags); | 443 | s->flags, flags); |
444 | kasan_slab_alloc(s, object, flags); | 444 | p[i] = kasan_slab_alloc(s, object, flags); |
445 | } | 445 | } |
446 | 446 | ||
447 | if (memcg_kmem_enabled()) | 447 | if (memcg_kmem_enabled()) |
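The kasan_slab_alloc()/kasan_kmalloc() call sites above are all rewritten to consume the hook's return value: with the software tag-based KASAN mode added in this merge, the hook may hand back the same object under a pointer whose top byte carries a tag. A userspace sketch of that contract, assuming 64-bit pointers (the helpers below are illustrative, not KASAN's API):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define TAG_SHIFT 56                              /* top byte of a 64-bit pointer */
    #define TAG_MASK  ((uintptr_t)0xff << TAG_SHIFT)

    /* Illustrative "hook": returns the pointer with a tag folded into its top byte. */
    static void *tag_pointer(void *p, uint8_t tag)
    {
        return (void *)(((uintptr_t)p & ~TAG_MASK) | ((uintptr_t)tag << TAG_SHIFT));
    }

    static void *untag_pointer(void *p)
    {
        return (void *)((uintptr_t)p & ~TAG_MASK);
    }

    int main(void)
    {
        void *obj = malloc(32);
        /* The caller must carry the returned (tagged) pointer along;
         * ignoring the return value would silently drop the tag. */
        void *tagged = tag_pointer(obj, 0xab);
        printf("tag=0x%02x same_object=%d\n",
               (unsigned)((uintptr_t)tagged >> TAG_SHIFT),
               untag_pointer(tagged) == obj);
        free(obj);
        return 0;
    }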
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9c11e8a937d2..70b0cc85db67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1029,10 +1029,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) | |||
1029 | 1029 | ||
1030 | index = size_index[size_index_elem(size)]; | 1030 | index = size_index[size_index_elem(size)]; |
1031 | } else { | 1031 | } else { |
1032 | if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { | 1032 | if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) |
1033 | WARN_ON(1); | ||
1034 | return NULL; | 1033 | return NULL; |
1035 | } | ||
1036 | index = fls(size - 1); | 1034 | index = fls(size - 1); |
1037 | } | 1035 | } |
1038 | 1036 | ||
@@ -1204,7 +1202,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
1204 | page = alloc_pages(flags, order); | 1202 | page = alloc_pages(flags, order); |
1205 | ret = page ? page_address(page) : NULL; | 1203 | ret = page ? page_address(page) : NULL; |
1206 | kmemleak_alloc(ret, size, 1, flags); | 1204 | kmemleak_alloc(ret, size, 1, flags); |
1207 | kasan_kmalloc_large(ret, size, flags); | 1205 | ret = kasan_kmalloc_large(ret, size, flags); |
1208 | return ret; | 1206 | return ret; |
1209 | } | 1207 | } |
1210 | EXPORT_SYMBOL(kmalloc_order); | 1208 | EXPORT_SYMBOL(kmalloc_order); |
@@ -1482,7 +1480,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
1482 | ks = ksize(p); | 1480 | ks = ksize(p); |
1483 | 1481 | ||
1484 | if (ks >= new_size) { | 1482 | if (ks >= new_size) { |
1485 | kasan_krealloc((void *)p, new_size, flags); | 1483 | p = kasan_krealloc((void *)p, new_size, flags); |
1486 | return (void *)p; | 1484 | return (void *)p; |
1487 | } | 1485 | } |
1488 | 1486 | ||
@@ -1534,7 +1532,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
1534 | } | 1532 | } |
1535 | 1533 | ||
1536 | ret = __do_krealloc(p, new_size, flags); | 1534 | ret = __do_krealloc(p, new_size, flags); |
1537 | if (ret && p != ret) | 1535 | if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) |
1538 | kfree(p); | 1536 | kfree(p); |
1539 | 1537 | ||
1540 | return ret; | 1538 | return ret; |
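The kasan_reset_tag() comparison above matters because, once tags live in the unused high bits of an address, two pointers to the same allocation can compare unequal: with purely illustrative values, 0xab00000000001000 and 0x1200000000001000 both name the object at ...1000, yet p != ret. Comparing the untagged addresses keeps krealloc() from kfree()-ing an object that was in fact reused in place.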
@@ -1372,10 +1372,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, | |||
1372 | * Hooks for other subsystems that check memory allocations. In a typical | 1372 | * Hooks for other subsystems that check memory allocations. In a typical |
1373 | * production configuration these hooks all should produce no code at all. | 1373 | * production configuration these hooks all should produce no code at all. |
1374 | */ | 1374 | */ |
1375 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1375 | static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
1376 | { | 1376 | { |
1377 | kmemleak_alloc(ptr, size, 1, flags); | 1377 | kmemleak_alloc(ptr, size, 1, flags); |
1378 | kasan_kmalloc_large(ptr, size, flags); | 1378 | return kasan_kmalloc_large(ptr, size, flags); |
1379 | } | 1379 | } |
1380 | 1380 | ||
1381 | static __always_inline void kfree_hook(void *x) | 1381 | static __always_inline void kfree_hook(void *x) |
@@ -1451,16 +1451,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, | |||
1451 | #endif | 1451 | #endif |
1452 | } | 1452 | } |
1453 | 1453 | ||
1454 | static void setup_object(struct kmem_cache *s, struct page *page, | 1454 | static void *setup_object(struct kmem_cache *s, struct page *page, |
1455 | void *object) | 1455 | void *object) |
1456 | { | 1456 | { |
1457 | setup_object_debug(s, page, object); | 1457 | setup_object_debug(s, page, object); |
1458 | kasan_init_slab_obj(s, object); | 1458 | object = kasan_init_slab_obj(s, object); |
1459 | if (unlikely(s->ctor)) { | 1459 | if (unlikely(s->ctor)) { |
1460 | kasan_unpoison_object_data(s, object); | 1460 | kasan_unpoison_object_data(s, object); |
1461 | s->ctor(object); | 1461 | s->ctor(object); |
1462 | kasan_poison_object_data(s, object); | 1462 | kasan_poison_object_data(s, object); |
1463 | } | 1463 | } |
1464 | return object; | ||
1464 | } | 1465 | } |
1465 | 1466 | ||
1466 | /* | 1467 | /* |
@@ -1568,16 +1569,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page) | |||
1568 | /* First entry is used as the base of the freelist */ | 1569 | /* First entry is used as the base of the freelist */ |
1569 | cur = next_freelist_entry(s, page, &pos, start, page_limit, | 1570 | cur = next_freelist_entry(s, page, &pos, start, page_limit, |
1570 | freelist_count); | 1571 | freelist_count); |
1572 | cur = setup_object(s, page, cur); | ||
1571 | page->freelist = cur; | 1573 | page->freelist = cur; |
1572 | 1574 | ||
1573 | for (idx = 1; idx < page->objects; idx++) { | 1575 | for (idx = 1; idx < page->objects; idx++) { |
1574 | setup_object(s, page, cur); | ||
1575 | next = next_freelist_entry(s, page, &pos, start, page_limit, | 1576 | next = next_freelist_entry(s, page, &pos, start, page_limit, |
1576 | freelist_count); | 1577 | freelist_count); |
1578 | next = setup_object(s, page, next); | ||
1577 | set_freepointer(s, cur, next); | 1579 | set_freepointer(s, cur, next); |
1578 | cur = next; | 1580 | cur = next; |
1579 | } | 1581 | } |
1580 | setup_object(s, page, cur); | ||
1581 | set_freepointer(s, cur, NULL); | 1582 | set_freepointer(s, cur, NULL); |
1582 | 1583 | ||
1583 | return true; | 1584 | return true; |
@@ -1599,7 +1600,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1599 | struct page *page; | 1600 | struct page *page; |
1600 | struct kmem_cache_order_objects oo = s->oo; | 1601 | struct kmem_cache_order_objects oo = s->oo; |
1601 | gfp_t alloc_gfp; | 1602 | gfp_t alloc_gfp; |
1602 | void *start, *p; | 1603 | void *start, *p, *next; |
1603 | int idx, order; | 1604 | int idx, order; |
1604 | bool shuffle; | 1605 | bool shuffle; |
1605 | 1606 | ||
@@ -1651,13 +1652,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1651 | 1652 | ||
1652 | if (!shuffle) { | 1653 | if (!shuffle) { |
1653 | for_each_object_idx(p, idx, s, start, page->objects) { | 1654 | for_each_object_idx(p, idx, s, start, page->objects) { |
1654 | setup_object(s, page, p); | 1655 | if (likely(idx < page->objects)) { |
1655 | if (likely(idx < page->objects)) | 1656 | next = p + s->size; |
1656 | set_freepointer(s, p, p + s->size); | 1657 | next = setup_object(s, page, next); |
1657 | else | 1658 | set_freepointer(s, p, next); |
1659 | } else | ||
1658 | set_freepointer(s, p, NULL); | 1660 | set_freepointer(s, p, NULL); |
1659 | } | 1661 | } |
1660 | page->freelist = fixup_red_left(s, start); | 1662 | start = fixup_red_left(s, start); |
1663 | start = setup_object(s, page, start); | ||
1664 | page->freelist = start; | ||
1661 | } | 1665 | } |
1662 | 1666 | ||
1663 | page->inuse = page->objects; | 1667 | page->inuse = page->objects; |
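setup_object() now returns the object pointer precisely so the freelist above can be chained from the (possibly re-tagged) value it hands back rather than from the raw address. A compact userspace sketch of building such an intrusive freelist from returned handles (plain malloc'd memory, illustrative sizes):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for setup_object(): may return a different handle for the same
     * object (e.g. re-tagged); callers must chain from what it returns. */
    static void *setup_obj(void *obj)
    {
        memset(obj, 0, sizeof(void *));           /* "initialise" the object */
        return obj;
    }

    int main(void)
    {
        enum { NOBJ = 4, SIZE = 32 };
        char *slab = malloc(NOBJ * SIZE);
        void *freelist = NULL;
        int n = 0;

        /* Chain objects back to front, always using the returned pointer. */
        for (int i = NOBJ - 1; i >= 0; i--) {
            void *obj = setup_obj(slab + i * SIZE);
            *(void **)obj = freelist;             /* set_freepointer() analogue */
            freelist = obj;
        }

        for (void *p = freelist; p; p = *(void **)p)
            n++;
        printf("freelist length = %d\n", n);
        free(slab);
        return 0;
    }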
@@ -2127,26 +2131,15 @@ redo: | |||
2127 | } | 2131 | } |
2128 | 2132 | ||
2129 | if (l != m) { | 2133 | if (l != m) { |
2130 | |||
2131 | if (l == M_PARTIAL) | 2134 | if (l == M_PARTIAL) |
2132 | |||
2133 | remove_partial(n, page); | 2135 | remove_partial(n, page); |
2134 | |||
2135 | else if (l == M_FULL) | 2136 | else if (l == M_FULL) |
2136 | |||
2137 | remove_full(s, n, page); | 2137 | remove_full(s, n, page); |
2138 | 2138 | ||
2139 | if (m == M_PARTIAL) { | 2139 | if (m == M_PARTIAL) |
2140 | |||
2141 | add_partial(n, page, tail); | 2140 | add_partial(n, page, tail); |
2142 | stat(s, tail); | 2141 | else if (m == M_FULL) |
2143 | |||
2144 | } else if (m == M_FULL) { | ||
2145 | |||
2146 | stat(s, DEACTIVATE_FULL); | ||
2147 | add_full(s, n, page); | 2142 | add_full(s, n, page); |
2148 | |||
2149 | } | ||
2150 | } | 2143 | } |
2151 | 2144 | ||
2152 | l = m; | 2145 | l = m; |
@@ -2159,7 +2152,11 @@ redo: | |||
2159 | if (lock) | 2152 | if (lock) |
2160 | spin_unlock(&n->list_lock); | 2153 | spin_unlock(&n->list_lock); |
2161 | 2154 | ||
2162 | if (m == M_FREE) { | 2155 | if (m == M_PARTIAL) |
2156 | stat(s, tail); | ||
2157 | else if (m == M_FULL) | ||
2158 | stat(s, DEACTIVATE_FULL); | ||
2159 | else if (m == M_FREE) { | ||
2163 | stat(s, DEACTIVATE_EMPTY); | 2160 | stat(s, DEACTIVATE_EMPTY); |
2164 | discard_slab(s, page); | 2161 | discard_slab(s, page); |
2165 | stat(s, FREE_SLAB); | 2162 | stat(s, FREE_SLAB); |
@@ -2313,12 +2310,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | |||
2313 | { | 2310 | { |
2314 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 2311 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
2315 | 2312 | ||
2316 | if (likely(c)) { | 2313 | if (c->page) |
2317 | if (c->page) | 2314 | flush_slab(s, c); |
2318 | flush_slab(s, c); | ||
2319 | 2315 | ||
2320 | unfreeze_partials(s, c); | 2316 | unfreeze_partials(s, c); |
2321 | } | ||
2322 | } | 2317 | } |
2323 | 2318 | ||
2324 | static void flush_cpu_slab(void *d) | 2319 | static void flush_cpu_slab(void *d) |
@@ -2367,7 +2362,7 @@ static int slub_cpu_dead(unsigned int cpu) | |||
2367 | static inline int node_match(struct page *page, int node) | 2362 | static inline int node_match(struct page *page, int node) |
2368 | { | 2363 | { |
2369 | #ifdef CONFIG_NUMA | 2364 | #ifdef CONFIG_NUMA |
2370 | if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) | 2365 | if (node != NUMA_NO_NODE && page_to_nid(page) != node) |
2371 | return 0; | 2366 | return 0; |
2372 | #endif | 2367 | #endif |
2373 | return 1; | 2368 | return 1; |
@@ -2768,7 +2763,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
2768 | { | 2763 | { |
2769 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2764 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2770 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2765 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
2771 | kasan_kmalloc(s, ret, size, gfpflags); | 2766 | ret = kasan_kmalloc(s, ret, size, gfpflags); |
2772 | return ret; | 2767 | return ret; |
2773 | } | 2768 | } |
2774 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2769 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
@@ -2796,7 +2791,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
2796 | trace_kmalloc_node(_RET_IP_, ret, | 2791 | trace_kmalloc_node(_RET_IP_, ret, |
2797 | size, s->size, gfpflags, node); | 2792 | size, s->size, gfpflags, node); |
2798 | 2793 | ||
2799 | kasan_kmalloc(s, ret, size, gfpflags); | 2794 | ret = kasan_kmalloc(s, ret, size, gfpflags); |
2800 | return ret; | 2795 | return ret; |
2801 | } | 2796 | } |
2802 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2797 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
@@ -2992,7 +2987,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, | |||
2992 | do_slab_free(s, page, head, tail, cnt, addr); | 2987 | do_slab_free(s, page, head, tail, cnt, addr); |
2993 | } | 2988 | } |
2994 | 2989 | ||
2995 | #ifdef CONFIG_KASAN | 2990 | #ifdef CONFIG_KASAN_GENERIC |
2996 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) | 2991 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) |
2997 | { | 2992 | { |
2998 | do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); | 2993 | do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); |
@@ -3364,16 +3359,16 @@ static void early_kmem_cache_node_alloc(int node) | |||
3364 | 3359 | ||
3365 | n = page->freelist; | 3360 | n = page->freelist; |
3366 | BUG_ON(!n); | 3361 | BUG_ON(!n); |
3367 | page->freelist = get_freepointer(kmem_cache_node, n); | ||
3368 | page->inuse = 1; | ||
3369 | page->frozen = 0; | ||
3370 | kmem_cache_node->node[node] = n; | ||
3371 | #ifdef CONFIG_SLUB_DEBUG | 3362 | #ifdef CONFIG_SLUB_DEBUG |
3372 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 3363 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
3373 | init_tracking(kmem_cache_node, n); | 3364 | init_tracking(kmem_cache_node, n); |
3374 | #endif | 3365 | #endif |
3375 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), | 3366 | n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), |
3376 | GFP_KERNEL); | 3367 | GFP_KERNEL); |
3368 | page->freelist = get_freepointer(kmem_cache_node, n); | ||
3369 | page->inuse = 1; | ||
3370 | page->frozen = 0; | ||
3371 | kmem_cache_node->node[node] = n; | ||
3377 | init_kmem_cache_node(n); | 3372 | init_kmem_cache_node(n); |
3378 | inc_slabs_node(kmem_cache_node, node, page->objects); | 3373 | inc_slabs_node(kmem_cache_node, node, page->objects); |
3379 | 3374 | ||
@@ -3784,7 +3779,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
3784 | 3779 | ||
3785 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3780 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
3786 | 3781 | ||
3787 | kasan_kmalloc(s, ret, size, flags); | 3782 | ret = kasan_kmalloc(s, ret, size, flags); |
3788 | 3783 | ||
3789 | return ret; | 3784 | return ret; |
3790 | } | 3785 | } |
@@ -3801,8 +3796,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3801 | if (page) | 3796 | if (page) |
3802 | ptr = page_address(page); | 3797 | ptr = page_address(page); |
3803 | 3798 | ||
3804 | kmalloc_large_node_hook(ptr, size, flags); | 3799 | return kmalloc_large_node_hook(ptr, size, flags); |
3805 | return ptr; | ||
3806 | } | 3800 | } |
3807 | 3801 | ||
3808 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3802 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
@@ -3829,7 +3823,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3829 | 3823 | ||
3830 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3824 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
3831 | 3825 | ||
3832 | kasan_kmalloc(s, ret, size, flags); | 3826 | ret = kasan_kmalloc(s, ret, size, flags); |
3833 | 3827 | ||
3834 | return ret; | 3828 | return ret; |
3835 | } | 3829 | } |
diff --git a/mm/sparse.c b/mm/sparse.c
index 3abc8cc50201..7ea5dc6c6b19 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -678,25 +678,24 @@ static void free_map_bootmem(struct page *memmap) | |||
678 | * set. If this is <=0, then that means that the passed-in | 678 | * set. If this is <=0, then that means that the passed-in |
679 | * map was not consumed and must be freed. | 679 | * map was not consumed and must be freed. |
680 | */ | 680 | */ |
681 | int __meminit sparse_add_one_section(struct pglist_data *pgdat, | 681 | int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, |
682 | unsigned long start_pfn, struct vmem_altmap *altmap) | 682 | struct vmem_altmap *altmap) |
683 | { | 683 | { |
684 | unsigned long section_nr = pfn_to_section_nr(start_pfn); | 684 | unsigned long section_nr = pfn_to_section_nr(start_pfn); |
685 | struct mem_section *ms; | 685 | struct mem_section *ms; |
686 | struct page *memmap; | 686 | struct page *memmap; |
687 | unsigned long *usemap; | 687 | unsigned long *usemap; |
688 | unsigned long flags; | ||
689 | int ret; | 688 | int ret; |
690 | 689 | ||
691 | /* | 690 | /* |
692 | * no locking for this, because it does its own | 691 | * no locking for this, because it does its own |
693 | * plus, it does a kmalloc | 692 | * plus, it does a kmalloc |
694 | */ | 693 | */ |
695 | ret = sparse_index_init(section_nr, pgdat->node_id); | 694 | ret = sparse_index_init(section_nr, nid); |
696 | if (ret < 0 && ret != -EEXIST) | 695 | if (ret < 0 && ret != -EEXIST) |
697 | return ret; | 696 | return ret; |
698 | ret = 0; | 697 | ret = 0; |
699 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); | 698 | memmap = kmalloc_section_memmap(section_nr, nid, altmap); |
700 | if (!memmap) | 699 | if (!memmap) |
701 | return -ENOMEM; | 700 | return -ENOMEM; |
702 | usemap = __kmalloc_section_usemap(); | 701 | usemap = __kmalloc_section_usemap(); |
@@ -705,8 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, | |||
705 | return -ENOMEM; | 704 | return -ENOMEM; |
706 | } | 705 | } |
707 | 706 | ||
708 | pgdat_resize_lock(pgdat, &flags); | ||
709 | |||
710 | ms = __pfn_to_section(start_pfn); | 707 | ms = __pfn_to_section(start_pfn); |
711 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) { | 708 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) { |
712 | ret = -EEXIST; | 709 | ret = -EEXIST; |
@@ -723,7 +720,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, | |||
723 | sparse_init_one_section(ms, section_nr, memmap, usemap); | 720 | sparse_init_one_section(ms, section_nr, memmap, usemap); |
724 | 721 | ||
725 | out: | 722 | out: |
726 | pgdat_resize_unlock(pgdat, &flags); | ||
727 | if (ret < 0) { | 723 | if (ret < 0) { |
728 | kfree(usemap); | 724 | kfree(usemap); |
729 | __kfree_section_memmap(memmap, altmap); | 725 | __kfree_section_memmap(memmap, altmap); |
@@ -740,6 +736,15 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
740 | if (!memmap) | 736 | if (!memmap) |
741 | return; | 737 | return; |
742 | 738 | ||
739 | /* | ||
740 | * A further optimization is to have per section refcounted | ||
741 | * num_poisoned_pages. But that would need more space per memmap, so | ||
742 | * for now just do a quick global check to speed up this routine in the | ||
743 | * absence of bad pages. | ||
744 | */ | ||
745 | if (atomic_long_read(&num_poisoned_pages) == 0) | ||
746 | return; | ||
747 | |||
743 | for (i = 0; i < nr_pages; i++) { | 748 | for (i = 0; i < nr_pages; i++) { |
744 | if (PageHWPoison(&memmap[i])) { | 749 | if (PageHWPoison(&memmap[i])) { |
745 | atomic_long_sub(1, &num_poisoned_pages); | 750 | atomic_long_sub(1, &num_poisoned_pages); |
@@ -785,10 +790,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, | |||
785 | unsigned long map_offset, struct vmem_altmap *altmap) | 790 | unsigned long map_offset, struct vmem_altmap *altmap) |
786 | { | 791 | { |
787 | struct page *memmap = NULL; | 792 | struct page *memmap = NULL; |
788 | unsigned long *usemap = NULL, flags; | 793 | unsigned long *usemap = NULL; |
789 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
790 | 794 | ||
791 | pgdat_resize_lock(pgdat, &flags); | ||
792 | if (ms->section_mem_map) { | 795 | if (ms->section_mem_map) { |
793 | usemap = ms->pageblock_flags; | 796 | usemap = ms->pageblock_flags; |
794 | memmap = sparse_decode_mem_map(ms->section_mem_map, | 797 | memmap = sparse_decode_mem_map(ms->section_mem_map, |
@@ -796,7 +799,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, | |||
796 | ms->section_mem_map = 0; | 799 | ms->section_mem_map = 0; |
797 | ms->pageblock_flags = NULL; | 800 | ms->pageblock_flags = NULL; |
798 | } | 801 | } |
799 | pgdat_resize_unlock(pgdat, &flags); | ||
800 | 802 | ||
801 | clear_hwpoisoned_pages(memmap + map_offset, | 803 | clear_hwpoisoned_pages(memmap + map_offset, |
802 | PAGES_PER_SECTION - map_offset); | 804 | PAGES_PER_SECTION - map_offset); |
@@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); | |||
1022 | */ | 1022 | */ |
1023 | void __init swap_setup(void) | 1023 | void __init swap_setup(void) |
1024 | { | 1024 | { |
1025 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); | 1025 | unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); |
1026 | 1026 | ||
1027 | /* Use a smaller cluster for small-memory machines */ | 1027 | /* Use a smaller cluster for small-memory machines */ |
1028 | if (megs < 16) | 1028 | if (megs < 16) |
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..dbac1d49469d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
2197 | */ | 2197 | */ |
2198 | if (PageSwapCache(page) && | 2198 | if (PageSwapCache(page) && |
2199 | likely(page_private(page) == entry.val) && | 2199 | likely(page_private(page) == entry.val) && |
2200 | !page_swapped(page)) | 2200 | (!PageTransCompound(page) || |
2201 | !swap_page_trans_huge_swapped(si, entry))) | ||
2201 | delete_from_swap_cache(compound_head(page)); | 2202 | delete_from_swap_cache(compound_head(page)); |
2202 | 2203 | ||
2203 | /* | 2204 | /* |
@@ -2812,8 +2813,9 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2812 | struct swap_info_struct *p; | 2813 | struct swap_info_struct *p; |
2813 | unsigned int type; | 2814 | unsigned int type; |
2814 | int i; | 2815 | int i; |
2816 | int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); | ||
2815 | 2817 | ||
2816 | p = kvzalloc(sizeof(*p), GFP_KERNEL); | 2818 | p = kvzalloc(size, GFP_KERNEL); |
2817 | if (!p) | 2819 | if (!p) |
2818 | return ERR_PTR(-ENOMEM); | 2820 | return ERR_PTR(-ENOMEM); |
2819 | 2821 | ||
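alloc_swap_info() now sizes the allocation as sizeof(*p) plus one plist_node per node, the usual C flexible-array-member pattern for a per-node trailing array. A self-contained sketch of that sizing (struct and field names below are made up for illustration):

    #include <stdio.h>
    #include <stdlib.h>

    struct node_entry { int prio; };

    struct swap_info {                    /* illustrative analogue, not the kernel struct */
        int type;
        struct node_entry avail[];        /* flexible array member, one slot per node */
    };

    int main(void)
    {
        int nr_nodes = 4;                 /* stand-in for nr_node_ids */
        size_t sz = sizeof(struct swap_info) + nr_nodes * sizeof(struct node_entry);
        struct swap_info *p = calloc(1, sz);
        if (!p)
            return 1;
        for (int i = 0; i < nr_nodes; i++)
            p->avail[i].prio = -1;
        printf("allocated %zu bytes for %d nodes\n", sz, nr_nodes);
        free(p);
        return 0;
    }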
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..48368589f519 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,10 +267,14 @@ retry: | |||
267 | VM_BUG_ON(dst_addr & ~huge_page_mask(h)); | 267 | VM_BUG_ON(dst_addr & ~huge_page_mask(h)); |
268 | 268 | ||
269 | /* | 269 | /* |
270 | * Serialize via hugetlb_fault_mutex | 270 | * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. |
271 | * i_mmap_rwsem ensures the dst_pte remains valid even | ||
272 | * in the case of shared pmds. fault mutex prevents | ||
273 | * races with other faulting threads. | ||
271 | */ | 274 | */ |
272 | idx = linear_page_index(dst_vma, dst_addr); | ||
273 | mapping = dst_vma->vm_file->f_mapping; | 275 | mapping = dst_vma->vm_file->f_mapping; |
276 | i_mmap_lock_read(mapping); | ||
277 | idx = linear_page_index(dst_vma, dst_addr); | ||
274 | hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, | 278 | hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, |
275 | idx, dst_addr); | 279 | idx, dst_addr); |
276 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 280 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
@@ -279,6 +283,7 @@ retry: | |||
279 | dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); | 283 | dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); |
280 | if (!dst_pte) { | 284 | if (!dst_pte) { |
281 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 285 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
286 | i_mmap_unlock_read(mapping); | ||
282 | goto out_unlock; | 287 | goto out_unlock; |
283 | } | 288 | } |
284 | 289 | ||
@@ -286,6 +291,7 @@ retry: | |||
286 | dst_pteval = huge_ptep_get(dst_pte); | 291 | dst_pteval = huge_ptep_get(dst_pte); |
287 | if (!huge_pte_none(dst_pteval)) { | 292 | if (!huge_pte_none(dst_pteval)) { |
288 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 293 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
294 | i_mmap_unlock_read(mapping); | ||
289 | goto out_unlock; | 295 | goto out_unlock; |
290 | } | 296 | } |
291 | 297 | ||
@@ -293,6 +299,7 @@ retry: | |||
293 | dst_addr, src_addr, &page); | 299 | dst_addr, src_addr, &page); |
294 | 300 | ||
295 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 301 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
302 | i_mmap_unlock_read(mapping); | ||
296 | vm_alloc_shared = vm_shared; | 303 | vm_alloc_shared = vm_shared; |
297 | 304 | ||
298 | cond_resched(); | 305 | cond_resched(); |
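The hugetlb userfaultfd path above now nests two locks in a fixed order: i_mmap_rwsem is taken for read before the per-hash fault mutex, and every exit path releases them in the reverse order. A minimal pthread sketch of that ordering discipline (the locks and the work are placeholders, not the kernel primitives):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t mapping_lock = PTHREAD_RWLOCK_INITIALIZER; /* i_mmap_rwsem analogue */
    static pthread_mutex_t  fault_mutex  = PTHREAD_MUTEX_INITIALIZER;  /* fault-mutex analogue  */

    static int copy_one_page(void)
    {
        pthread_rwlock_rdlock(&mapping_lock);   /* 1. mapping lock, read side */
        pthread_mutex_lock(&fault_mutex);       /* 2. per-hash fault mutex    */

        int err = 0;                            /* ... allocate pte, copy page ... */

        pthread_mutex_unlock(&fault_mutex);     /* release in reverse order */
        pthread_rwlock_unlock(&mapping_lock);
        return err;
    }

    int main(void)
    {
        printf("copy_one_page() -> %d\n", copy_one_page());
        return 0;
    }

(Build with -lpthread.)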
@@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void) | |||
593 | if (sysctl_overcommit_kbytes) | 593 | if (sysctl_overcommit_kbytes) |
594 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | 594 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); |
595 | else | 595 | else |
596 | allowed = ((totalram_pages - hugetlb_total_pages()) | 596 | allowed = ((totalram_pages() - hugetlb_total_pages()) |
597 | * sysctl_overcommit_ratio / 100); | 597 | * sysctl_overcommit_ratio / 100); |
598 | allowed += total_swap_pages; | 598 | allowed += total_swap_pages; |
599 | 599 | ||
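For a sense of the formula above (illustrative numbers): with 1,048,576 pages of RAM (4 GiB at 4 KiB pages), no hugetlb pages, sysctl_overcommit_ratio = 50 and 262,144 pages of swap, the commit limit is 1,048,576 * 50 / 100 + 262,144 = 786,432 pages, i.e. roughly 3 GiB of committable memory.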
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..871e41c55e23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count, | |||
1634 | 1634 | ||
1635 | might_sleep(); | 1635 | might_sleep(); |
1636 | 1636 | ||
1637 | if (count > totalram_pages) | 1637 | if (count > totalram_pages()) |
1638 | return NULL; | 1638 | return NULL; |
1639 | 1639 | ||
1640 | size = (unsigned long)count << PAGE_SHIFT; | 1640 | size = (unsigned long)count << PAGE_SHIFT; |
@@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1739 | unsigned long real_size = size; | 1739 | unsigned long real_size = size; |
1740 | 1740 | ||
1741 | size = PAGE_ALIGN(size); | 1741 | size = PAGE_ALIGN(size); |
1742 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1742 | if (!size || (size >> PAGE_SHIFT) > totalram_pages()) |
1743 | goto fail; | 1743 | goto fail; |
1744 | 1744 | ||
1745 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | | 1745 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control { | |||
88 | /* Can pages be swapped as part of reclaim? */ | 88 | /* Can pages be swapped as part of reclaim? */ |
89 | unsigned int may_swap:1; | 89 | unsigned int may_swap:1; |
90 | 90 | ||
91 | /* e.g. boosted watermark reclaim leaves slabs alone */ | ||
92 | unsigned int may_shrinkslab:1; | ||
93 | |||
91 | /* | 94 | /* |
92 | * Cgroups are not reclaimed below their configured memory.low, | 95 | * Cgroups are not reclaimed below their configured memory.low, |
93 | * unless we threaten to OOM. If any cgroups are skipped due to | 96 | * unless we threaten to OOM. If any cgroups are skipped due to |
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1457 | count_memcg_page_event(page, PGLAZYFREED); | 1460 | count_memcg_page_event(page, PGLAZYFREED); |
1458 | } else if (!mapping || !__remove_mapping(mapping, page, true)) | 1461 | } else if (!mapping || !__remove_mapping(mapping, page, true)) |
1459 | goto keep_locked; | 1462 | goto keep_locked; |
1460 | /* | 1463 | |
1461 | * At this point, we have no other references and there is | 1464 | unlock_page(page); |
1462 | * no way to pick any more up (removed from LRU, removed | ||
1463 | * from pagecache). Can use non-atomic bitops now (and | ||
1464 | * we obviously don't have to worry about waking up a process | ||
1465 | * waiting on the page lock, because there are no references. | ||
1466 | */ | ||
1467 | __ClearPageLocked(page); | ||
1468 | free_it: | 1465 | free_it: |
1469 | nr_reclaimed++; | 1466 | nr_reclaimed++; |
1470 | 1467 | ||
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2756 | shrink_node_memcg(pgdat, memcg, sc, &lru_pages); | 2753 | shrink_node_memcg(pgdat, memcg, sc, &lru_pages); |
2757 | node_lru_pages += lru_pages; | 2754 | node_lru_pages += lru_pages; |
2758 | 2755 | ||
2759 | shrink_slab(sc->gfp_mask, pgdat->node_id, | 2756 | if (sc->may_shrinkslab) { |
2757 | shrink_slab(sc->gfp_mask, pgdat->node_id, | ||
2760 | memcg, sc->priority); | 2758 | memcg, sc->priority); |
2759 | } | ||
2761 | 2760 | ||
2762 | /* Record the group's reclaim efficiency */ | 2761 | /* Record the group's reclaim efficiency */ |
2763 | vmpressure(sc->gfp_mask, memcg, false, | 2762 | vmpressure(sc->gfp_mask, memcg, false, |
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
3239 | .may_writepage = !laptop_mode, | 3238 | .may_writepage = !laptop_mode, |
3240 | .may_unmap = 1, | 3239 | .may_unmap = 1, |
3241 | .may_swap = 1, | 3240 | .may_swap = 1, |
3241 | .may_shrinkslab = 1, | ||
3242 | }; | 3242 | }; |
3243 | 3243 | ||
3244 | /* | 3244 | /* |
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, | |||
3283 | .may_unmap = 1, | 3283 | .may_unmap = 1, |
3284 | .reclaim_idx = MAX_NR_ZONES - 1, | 3284 | .reclaim_idx = MAX_NR_ZONES - 1, |
3285 | .may_swap = !noswap, | 3285 | .may_swap = !noswap, |
3286 | .may_shrinkslab = 1, | ||
3286 | }; | 3287 | }; |
3287 | unsigned long lru_pages; | 3288 | unsigned long lru_pages; |
3288 | 3289 | ||
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3329 | .may_writepage = !laptop_mode, | 3330 | .may_writepage = !laptop_mode, |
3330 | .may_unmap = 1, | 3331 | .may_unmap = 1, |
3331 | .may_swap = may_swap, | 3332 | .may_swap = may_swap, |
3333 | .may_shrinkslab = 1, | ||
3332 | }; | 3334 | }; |
3333 | 3335 | ||
3334 | /* | 3336 | /* |
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat, | |||
3379 | } while (memcg); | 3381 | } while (memcg); |
3380 | } | 3382 | } |
3381 | 3383 | ||
3384 | static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) | ||
3385 | { | ||
3386 | int i; | ||
3387 | struct zone *zone; | ||
3388 | |||
3389 | /* | ||
3390 | * Check for watermark boosts top-down as the higher zones | ||
3391 | * are more likely to be boosted. Both watermarks and boosts | ||
3392 | * should not be checked at the same time as reclaim would | ||
3393 | * start prematurely when there is no boosting and a lower | ||
3394 | * zone is balanced. | ||
3395 | */ | ||
3396 | for (i = classzone_idx; i >= 0; i--) { | ||
3397 | zone = pgdat->node_zones + i; | ||
3398 | if (!managed_zone(zone)) | ||
3399 | continue; | ||
3400 | |||
3401 | if (zone->watermark_boost) | ||
3402 | return true; | ||
3403 | } | ||
3404 | |||
3405 | return false; | ||
3406 | } | ||
3407 | |||
3382 | /* | 3408 | /* |
3383 | * Returns true if there is an eligible zone balanced for the request order | 3409 | * Returns true if there is an eligible zone balanced for the request order |
3384 | * and classzone_idx | 3410 | * and classzone_idx |
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
3389 | unsigned long mark = -1; | 3415 | unsigned long mark = -1; |
3390 | struct zone *zone; | 3416 | struct zone *zone; |
3391 | 3417 | ||
3418 | /* | ||
3419 | * Check watermarks bottom-up as lower zones are more likely to | ||
3420 | * meet watermarks. | ||
3421 | */ | ||
3392 | for (i = 0; i <= classzone_idx; i++) { | 3422 | for (i = 0; i <= classzone_idx; i++) { |
3393 | zone = pgdat->node_zones + i; | 3423 | zone = pgdat->node_zones + i; |
3394 | 3424 | ||
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3517 | unsigned long nr_soft_reclaimed; | 3547 | unsigned long nr_soft_reclaimed; |
3518 | unsigned long nr_soft_scanned; | 3548 | unsigned long nr_soft_scanned; |
3519 | unsigned long pflags; | 3549 | unsigned long pflags; |
3550 | unsigned long nr_boost_reclaim; | ||
3551 | unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; | ||
3552 | bool boosted; | ||
3520 | struct zone *zone; | 3553 | struct zone *zone; |
3521 | struct scan_control sc = { | 3554 | struct scan_control sc = { |
3522 | .gfp_mask = GFP_KERNEL, | 3555 | .gfp_mask = GFP_KERNEL, |
3523 | .order = order, | 3556 | .order = order, |
3524 | .priority = DEF_PRIORITY, | ||
3525 | .may_writepage = !laptop_mode, | ||
3526 | .may_unmap = 1, | 3557 | .may_unmap = 1, |
3527 | .may_swap = 1, | ||
3528 | }; | 3558 | }; |
3529 | 3559 | ||
3530 | psi_memstall_enter(&pflags); | 3560 | psi_memstall_enter(&pflags); |
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3532 | 3562 | ||
3533 | count_vm_event(PAGEOUTRUN); | 3563 | count_vm_event(PAGEOUTRUN); |
3534 | 3564 | ||
3565 | /* | ||
3566 | * Account for the reclaim boost. Note that the zone boost is left in | ||
3567 | * place so that parallel allocations that are near the watermark will | ||
3568 | * stall or direct reclaim until kswapd is finished. | ||
3569 | */ | ||
3570 | nr_boost_reclaim = 0; | ||
3571 | for (i = 0; i <= classzone_idx; i++) { | ||
3572 | zone = pgdat->node_zones + i; | ||
3573 | if (!managed_zone(zone)) | ||
3574 | continue; | ||
3575 | |||
3576 | nr_boost_reclaim += zone->watermark_boost; | ||
3577 | zone_boosts[i] = zone->watermark_boost; | ||
3578 | } | ||
3579 | boosted = nr_boost_reclaim; | ||
3580 | |||
3581 | restart: | ||
3582 | sc.priority = DEF_PRIORITY; | ||
3535 | do { | 3583 | do { |
3536 | unsigned long nr_reclaimed = sc.nr_reclaimed; | 3584 | unsigned long nr_reclaimed = sc.nr_reclaimed; |
3537 | bool raise_priority = true; | 3585 | bool raise_priority = true; |
3586 | bool balanced; | ||
3538 | bool ret; | 3587 | bool ret; |
3539 | 3588 | ||
3540 | sc.reclaim_idx = classzone_idx; | 3589 | sc.reclaim_idx = classzone_idx; |
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3561 | } | 3610 | } |
3562 | 3611 | ||
3563 | /* | 3612 | /* |
3564 | * Only reclaim if there are no eligible zones. Note that | 3613 | * If the pgdat is imbalanced then ignore boosting and preserve |
3565 | * sc.reclaim_idx is not used as buffer_heads_over_limit may | 3614 | * the watermarks for a later time and restart. Note that the |
3566 | * have adjusted it. | 3615 | * zone watermarks will be still reset at the end of balancing |
3616 | * on the grounds that the normal reclaim should be enough to | ||
3617 | * re-evaluate if boosting is required when kswapd next wakes. | ||
3567 | */ | 3618 | */ |
3568 | if (pgdat_balanced(pgdat, sc.order, classzone_idx)) | 3619 | balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); |
3620 | if (!balanced && nr_boost_reclaim) { | ||
3621 | nr_boost_reclaim = 0; | ||
3622 | goto restart; | ||
3623 | } | ||
3624 | |||
3625 | /* | ||
3626 | * If boosting is not active then only reclaim if there are no | ||
3627 | * eligible zones. Note that sc.reclaim_idx is not used as | ||
3628 | * buffer_heads_over_limit may have adjusted it. | ||
3629 | */ | ||
3630 | if (!nr_boost_reclaim && balanced) | ||
3569 | goto out; | 3631 | goto out; |
3570 | 3632 | ||
3633 | /* Limit the priority of boosting to avoid reclaim writeback */ | ||
3634 | if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) | ||
3635 | raise_priority = false; | ||
3636 | |||
3637 | /* | ||
3638 | * Do not writeback or swap pages for boosted reclaim. The | ||
3639 | * intent is to relieve pressure not issue sub-optimal IO | ||
3640 | * from reclaim context. If no pages are reclaimed, the | ||
3641 | * reclaim will be aborted. | ||
3642 | */ | ||
3643 | sc.may_writepage = !laptop_mode && !nr_boost_reclaim; | ||
3644 | sc.may_swap = !nr_boost_reclaim; | ||
3645 | sc.may_shrinkslab = !nr_boost_reclaim; | ||
3646 | |||
3571 | /* | 3647 | /* |
3572 | * Do some background aging of the anon list, to give | 3648 | * Do some background aging of the anon list, to give |
3573 | * pages a chance to be referenced before reclaiming. All | 3649 | * pages a chance to be referenced before reclaiming. All |
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3619 | * progress in reclaiming pages | 3695 | * progress in reclaiming pages |
3620 | */ | 3696 | */ |
3621 | nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; | 3697 | nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; |
3698 | nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); | ||
3699 | |||
3700 | /* | ||
3701 | * If reclaim made no progress for a boost, stop reclaim as | ||
3702 | * IO cannot be queued and it could be an infinite loop in | ||
3703 | * extreme circumstances. | ||
3704 | */ | ||
3705 | if (nr_boost_reclaim && !nr_reclaimed) | ||
3706 | break; | ||
3707 | |||
3622 | if (raise_priority || !nr_reclaimed) | 3708 | if (raise_priority || !nr_reclaimed) |
3623 | sc.priority--; | 3709 | sc.priority--; |
3624 | } while (sc.priority >= 1); | 3710 | } while (sc.priority >= 1); |
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3627 | pgdat->kswapd_failures++; | 3713 | pgdat->kswapd_failures++; |
3628 | 3714 | ||
3629 | out: | 3715 | out: |
3716 | /* If reclaim was boosted, account for the reclaim done in this pass */ | ||
3717 | if (boosted) { | ||
3718 | unsigned long flags; | ||
3719 | |||
3720 | for (i = 0; i <= classzone_idx; i++) { | ||
3721 | if (!zone_boosts[i]) | ||
3722 | continue; | ||
3723 | |||
3724 | /* Increments are under the zone lock */ | ||
3725 | zone = pgdat->node_zones + i; | ||
3726 | spin_lock_irqsave(&zone->lock, flags); | ||
3727 | zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); | ||
3728 | spin_unlock_irqrestore(&zone->lock, flags); | ||
3729 | } | ||
3730 | |||
3731 | /* | ||
3732 | * As there is now likely space, wake up kcompactd to defragment | ||
3733 | * pageblocks. | ||
3734 | */ | ||
3735 | wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); | ||
3736 | } | ||
3737 | |||
3630 | snapshot_refaults(NULL, pgdat); | 3738 | snapshot_refaults(NULL, pgdat); |
3631 | __fs_reclaim_release(); | 3739 | __fs_reclaim_release(); |
3632 | psi_memstall_leave(&pflags); | 3740 | psi_memstall_leave(&pflags); |
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, | |||
3855 | 3963 | ||
3856 | /* Hopeless node, leave it to direct reclaim if possible */ | 3964 | /* Hopeless node, leave it to direct reclaim if possible */ |
3857 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || | 3965 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || |
3858 | pgdat_balanced(pgdat, order, classzone_idx)) { | 3966 | (pgdat_balanced(pgdat, order, classzone_idx) && |
3967 | !pgdat_watermark_boosted(pgdat, classzone_idx))) { | ||
3859 | /* | 3968 | /* |
3860 | * There may be plenty of free memory available, but it's too | 3969 | * There may be plenty of free memory available, but it's too |
3861 | * fragmented for high-order allocations. Wake up kcompactd | 3970 | * fragmented for high-order allocations. Wake up kcompactd |
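Taken together, the vmscan.c changes above make kswapd retire the watermark boost as it reclaims: the per-zone boosts are summed into nr_boost_reclaim, each pass subtracts what it managed to reclaim, and boosted reclaim is abandoned as soon as a pass makes no progress. A small standalone sketch of just that bookkeeping, with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long zone_boosts[] = { 0, 512, 1024 };       /* per-zone watermark_boost */
        unsigned long reclaimed_per_pass[] = { 700, 500, 0 }; /* reclaim progress per pass */
        unsigned long nr_boost_reclaim = 0;

        for (int i = 0; i < 3; i++)
            nr_boost_reclaim += zone_boosts[i];

        for (int pass = 0; pass < 3 && nr_boost_reclaim; pass++) {
            unsigned long nr_reclaimed = reclaimed_per_pass[pass];
            unsigned long delta = nr_reclaimed < nr_boost_reclaim ?
                                  nr_reclaimed : nr_boost_reclaim;

            nr_boost_reclaim -= delta;
            printf("pass %d: reclaimed %lu, boost left %lu\n",
                   pass, nr_reclaimed, nr_boost_reclaim);
            if (!nr_reclaimed)
                break;      /* no progress while boosting: stop, as kswapd now does */
        }
        return 0;
    }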
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9c624595e904..83b30edc2f7f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone) | |||
227 | * 125 1024 10 16-32 GB 9 | 227 | * 125 1024 10 16-32 GB 9 |
228 | */ | 228 | */ |
229 | 229 | ||
230 | mem = zone->managed_pages >> (27 - PAGE_SHIFT); | 230 | mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); |
231 | 231 | ||
232 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | 232 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); |
233 | 233 | ||
@@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1569 | high_wmark_pages(zone), | 1569 | high_wmark_pages(zone), |
1570 | zone->spanned_pages, | 1570 | zone->spanned_pages, |
1571 | zone->present_pages, | 1571 | zone->present_pages, |
1572 | zone->managed_pages); | 1572 | zone_managed_pages(zone)); |
1573 | 1573 | ||
1574 | seq_printf(m, | 1574 | seq_printf(m, |
1575 | "\n protection: (%ld", | 1575 | "\n protection: (%ld", |
diff --git a/mm/workingset.c b/mm/workingset.c
index d46f8c92aa2f..dcb994f2acc2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -549,7 +549,7 @@ static int __init workingset_init(void) | |||
549 | * double the initial memory by using totalram_pages as-is. | 549 | * double the initial memory by using totalram_pages as-is. |
550 | */ | 550 | */ |
551 | timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; | 551 | timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; |
552 | max_order = fls_long(totalram_pages - 1); | 552 | max_order = fls_long(totalram_pages() - 1); |
553 | if (max_order > timestamp_bits) | 553 | if (max_order > timestamp_bits) |
554 | bucket_order = max_order - timestamp_bits; | 554 | bucket_order = max_order - timestamp_bits; |
555 | pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", | 555 | pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", |
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9d96b8..a4e4d36ec085 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = { | |||
219 | 219 | ||
220 | static bool zswap_is_full(void) | 220 | static bool zswap_is_full(void) |
221 | { | 221 | { |
222 | return totalram_pages * zswap_max_pool_percent / 100 < | 222 | return totalram_pages() * zswap_max_pool_percent / 100 < |
223 | DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); | 223 | DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); |
224 | } | 224 | } |
225 | 225 | ||
226 | static void zswap_update_total_size(void) | 226 | static void zswap_update_total_size(void) |
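As a worked example of the check above (illustrative numbers): with totalram_pages() = 1,048,576 and zswap_max_pool_percent = 20, zswap_is_full() reports the pool as full once zswap_pool_total_size exceeds 209,715 pages' worth of compressed data, roughly 820 MiB at 4 KiB per page.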