author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
commit     271ecc5253e2b317d729d366560789cd7f93836c (patch)
tree       d3a60bc4dfa8245ff934f357f2367db76b59e7cf /mm
parent     aa6865d836418eb2ba888a4cb1318a28e9aa2e0c (diff)
parent     63c06227a22b098a3849c5c99e836aea161ca0d7 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:

 - some misc things
 - ocfs2 updates
 - about half of MM
 - checkpatch updates
 - autofs4 update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  autofs4: fix string.h include in auto_dev-ioctl.h
  autofs4: use pr_xxx() macros directly for logging
  autofs4: change log print macros to not insert newline
  autofs4: make autofs log prints consistent
  autofs4: fix some white space errors
  autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked()
  autofs4: fix coding style line length in autofs4_wait()
  autofs4: fix coding style problem in autofs4_get_set_timeout()
  autofs4: coding style fixes
  autofs: show pipe inode in mount options
  kallsyms: add support for relative offsets in kallsyms address table
  kallsyms: don't overload absolute symbol type for percpu symbols
  x86: kallsyms: disable absolute percpu symbols on !SMP
  checkpatch: fix another left brace warning
  checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses
  checkpatch: warn on bare unsigned or signed declarations without int
  checkpatch: exclude asm volatile from complex macro check
  mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate()
  mm: migrate: consolidate mem_cgroup_migrate() calls
  mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug | 57
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/compaction.c | 93
-rw-r--r--  mm/debug.c | 165
-rw-r--r--  mm/failslab.c | 12
-rw-r--r--  mm/filemap.c | 113
-rw-r--r--  mm/huge_memory.c | 20
-rw-r--r--  mm/internal.h | 18
-rw-r--r--  mm/kmemcheck.c | 3
-rw-r--r--  mm/madvise.c | 19
-rw-r--r--  mm/memblock.c | 8
-rw-r--r--  mm/memcontrol.c | 92
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 7
-rw-r--r--  mm/memory_hotplug.c | 30
-rw-r--r--  mm/mempolicy.c | 4
-rw-r--r--  mm/migrate.c | 23
-rw-r--r--  mm/oom_kill.c | 7
-rw-r--r--  mm/page-writeback.c | 62
-rw-r--r--  mm/page_alloc.c | 295
-rw-r--r--  mm/page_ext.c | 10
-rw-r--r--  mm/page_owner.c | 100
-rw-r--r--  mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67
-rw-r--r--  mm/rmap.c | 16
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 1037
-rw-r--r--  mm/slab.h | 69
-rw-r--r--  mm/slab_common.c | 8
-rw-r--r--  mm/slub.c | 325
-rw-r--r--  mm/truncate.c | 6
-rw-r--r--  mm/vmscan.c | 47
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/workingset.c | 160
33 files changed, 1707 insertions, 1187 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..5c50b238b770 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
 	  Unmap pages from the kernel linear mapping after free_pages().
-	  This results in a large slowdown, but helps to find certain types
-	  of memory corruption.
+	  Depending on runtime enablement, this results in a small or large
+	  slowdown, but helps to find certain types of memory corruption.
 
 	  For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
 	  fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC
 	  that would result in incorrect warnings of memory corruption after
 	  a resume because free pages are not saved to the suspend image.
 
+	  By default this option will have a small overhead, e.g. by not
+	  allowing the kernel mapping to be backed by large pages on some
+	  architectures. Even bigger overhead comes when the debugging is
+	  enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+	  command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+	bool "Enable debug page memory allocations by default?"
+	default n
+	depends on DEBUG_PAGEALLOC
+	---help---
+	  Enable debug page memory allocations by default? This value
+	  can be overridden by debug_pagealloc=off|on.
+
 config PAGE_POISONING
-	bool
+	bool "Poison pages after freeing"
+	select PAGE_EXTENSION
+	select PAGE_POISONING_NO_SANITY if HIBERNATION
+	---help---
+	  Fill the pages with poison patterns after free_pages() and verify
+	  the patterns before alloc_pages. The filling of the memory helps
+	  reduce the risk of information leaks from freed data. This does
+	  have a potential performance impact.
+
+	  Note that "poison" here is not the same thing as the "HWPoison"
+	  for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+	  If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+	depends on PAGE_POISONING
+	bool "Only poison, don't sanity check"
+	---help---
+	  Skip the sanity checking on alloc, only fill the pages with
+	  poison on free. This reduces some of the overhead of the
+	  poisoning feature.
+
+	  If you are only interested in sanitization, say Y. Otherwise
+	  say N.
+
+config PAGE_POISONING_ZERO
+	bool "Use zero for poisoning instead of random data"
+	depends on PAGE_POISONING
+	---help---
+	  Instead of using the existing poison value, fill the pages with
+	  zeros. This makes it harder to detect when errors are occurring
+	  due to sanitization but the zeroing at free means that it is
+	  no longer necessary to write zeros when GFP_ZERO is used on
+	  allocation.
+
+	  Enabling page poisoning with this option will disable hibernation
+
+	  If unsure, say N
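
For orientation only, here is a minimal userspace C sketch of the poison-on-free / verify-on-alloc idea that the PAGE_POISONING options above control. The names and the poison value are illustrative; this is not kernel code from the patch.

/* Hedged sketch: fill a freed page with a poison pattern, verify it on
 * reallocation so any stray write to freed memory is detected. */
#include <string.h>

#define DEMO_POISON	0xaa
#define DEMO_PAGE_SIZE	4096

static void demo_poison(unsigned char *page)
{
	memset(page, DEMO_POISON, DEMO_PAGE_SIZE);	/* on free */
}

static int demo_check(const unsigned char *page)
{
	size_t i;

	for (i = 0; i < DEMO_PAGE_SIZE; i++)		/* on alloc */
		if (page[i] != DEMO_POISON)
			return 1;			/* corruption found */
	return 0;
}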
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..cfdd481d27a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..93f71d968098 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
-				unsigned long end_pfn, struct zone *zone)
-{
-	struct page *start_page;
-	struct page *end_page;
-
-	/* end_pfn is one past the range we are checking */
-	end_pfn--;
-
-	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
-		return NULL;
-
-	start_page = pfn_to_page(start_pfn);
-
-	if (page_zone(start_page) != zone)
-		return NULL;
-
-	end_page = pfn_to_page(end_pfn);
-
-	/* This gives a shorter code than deriving page_zone(end_page) */
-	if (page_zone_id(start_page) != page_zone_id(end_page))
-		return NULL;
-
-	return start_page;
-}
-
 #ifdef CONFIG_COMPACTION
 
 /* Do not skip compaction more than 64 times */
@@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone)
 {
 	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-	zone->compact_cached_free_pfn = zone_end_pfn(zone);
+	zone->compact_cached_free_pfn =
+			round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
 }
 
 /*
@@ -554,13 +512,17 @@ unsigned long
 isolate_freepages_range(struct compact_control *cc,
 			unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long isolated, pfn, block_end_pfn;
+	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 	LIST_HEAD(freelist);
 
 	pfn = start_pfn;
+	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
 	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
 	for (; pfn < end_pfn; pfn += isolated,
+				block_start_pfn = block_end_pfn,
 				block_end_pfn += pageblock_nr_pages) {
 		/* Protect pfn from changing by isolate_freepages_block */
 		unsigned long isolate_start_pfn = pfn;
@@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc,
 		 * scanning range to right one.
 		 */
 		if (pfn >= block_end_pfn) {
+			block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
 			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 			block_end_pfn = min(block_end_pfn, end_pfn);
 		}
 
-		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+		if (!pageblock_pfn_to_page(block_start_pfn,
+					block_end_pfn, cc->zone))
 			break;
 
 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +827,23 @@ unsigned long
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 							unsigned long end_pfn)
 {
-	unsigned long pfn, block_end_pfn;
+	unsigned long pfn, block_start_pfn, block_end_pfn;
 
 	/* Scan block by block. First and last block may be incomplete */
 	pfn = start_pfn;
+	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
 	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
 	for (; pfn < end_pfn; pfn = block_end_pfn,
+				block_start_pfn = block_end_pfn,
 				block_end_pfn += pageblock_nr_pages) {
 
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+		if (!pageblock_pfn_to_page(block_start_pfn,
+						block_end_pfn, cc->zone))
 			continue;
 
 		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
 static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
-	unsigned long low_pfn, end_pfn;
+	unsigned long block_start_pfn;
+	unsigned long block_end_pfn;
+	unsigned long low_pfn;
 	unsigned long isolate_start_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
@@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	 * initialized by compact_zone()
 	 */
 	low_pfn = cc->migrate_pfn;
+	block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < zone->zone_start_pfn)
+		block_start_pfn = zone->zone_start_pfn;
 
 	/* Only scan within a pageblock boundary */
-	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+	block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
 	/*
 	 * Iterate over whole pageblocks until we find the first suitable.
 	 * Do not cross the free scanner.
 	 */
-	for (; end_pfn <= cc->free_pfn;
-			low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+	for (; block_end_pfn <= cc->free_pfn;
+			low_pfn = block_end_pfn,
+			block_start_pfn = block_end_pfn,
+			block_end_pfn += pageblock_nr_pages) {
 
 		/*
 		 * This can potentially iterate a massively long zone with
@@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 						&& compact_should_abort(cc))
 			break;
 
-		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+									zone);
 		if (!page)
 			continue;
 
@@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
 		/* Perform the isolation */
 		isolate_start_pfn = low_pfn;
-		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
-							isolate_mode);
+		low_pfn = isolate_migratepages_block(cc, low_pfn,
+						block_end_pfn, isolate_mode);
 
 		if (!low_pfn || cc->contended) {
 			acct_isolated(zone, cc);
@@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 */
 	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
-	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
-		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+		cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
 		zone->compact_cached_free_pfn = cc->free_pfn;
 	}
-	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
 		cc->migrate_pfn = start_pfn;
 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,75 +9,38 @@
 #include <linux/mm.h>
 #include <linux/trace_events.h>
 #include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
-	{1UL << PG_locked,		"locked"	},
-	{1UL << PG_error,		"error"		},
-	{1UL << PG_referenced,		"referenced"	},
-	{1UL << PG_uptodate,		"uptodate"	},
-	{1UL << PG_dirty,		"dirty"		},
-	{1UL << PG_lru,			"lru"		},
-	{1UL << PG_active,		"active"	},
-	{1UL << PG_slab,		"slab"		},
-	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
-	{1UL << PG_arch_1,		"arch_1"	},
-	{1UL << PG_reserved,		"reserved"	},
-	{1UL << PG_private,		"private"	},
-	{1UL << PG_private_2,		"private_2"	},
-	{1UL << PG_writeback,		"writeback"	},
-	{1UL << PG_head,		"head"		},
-	{1UL << PG_swapcache,		"swapcache"	},
-	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
-	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_swapbacked,		"swapbacked"	},
-	{1UL << PG_unevictable,		"unevictable"	},
-#ifdef CONFIG_MMU
-	{1UL << PG_mlocked,		"mlocked"	},
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-	{1UL << PG_uncached,		"uncached"	},
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-	{1UL << PG_hwpoison,		"hwpoison"	},
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-	{1UL << PG_young,		"young"		},
-	{1UL << PG_idle,		"idle"		},
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+	"compaction",
+	"memory_failure",
+	"memory_hotplug",
+	"syscall_or_cpuset",
+	"mempolicy_mbind",
+	"numa_misplaced",
+	"cma",
 };
 
-static void dump_flags(unsigned long flags,
-			const struct trace_print_flags *names, int count)
-{
-	const char *delim = "";
-	unsigned long mask;
-	int i;
-
-	pr_emerg("flags: %#lx(", flags);
-
-	/* remove zone id */
-	flags &= (1UL << NR_PAGEFLAGS) - 1;
-
-	for (i = 0; i < count && flags; i++) {
-
-		mask = names[i].mask;
-		if ((flags & mask) != mask)
-			continue;
-
-		flags &= ~mask;
-		pr_cont("%s%s", delim, names[i].name);
-		delim = "|";
-	}
+const struct trace_print_flags pageflag_names[] = {
+	__def_pageflag_names,
+	{0, NULL}
+};
 
-	/* check for left over flags */
-	if (flags)
-		pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+	__def_gfpflag_names,
+	{0, NULL}
+};
 
-	pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+	__def_vmaflag_names,
+	{0, NULL}
+};
 
-void dump_page_badflags(struct page *page, const char *reason,
-		unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
 {
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
 		page, atomic_read(&page->_count), page_mapcount(page),
@@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason,
 	if (PageCompound(page))
 		pr_cont(" compound_mapcount: %d", compound_mapcount(page));
 	pr_cont("\n");
-	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-	dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
 	if (reason)
 		pr_alert("page dumped because: %s\n", reason);
-	if (page->flags & badflags) {
-		pr_alert("bad because of flags:\n");
-		dump_flags(page->flags & badflags,
-			pageflag_names, ARRAY_SIZE(pageflag_names));
-	}
+
 #ifdef CONFIG_MEMCG
 	if (page->mem_cgroup)
 		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
 
 void dump_page(struct page *page, const char *reason)
 {
-	dump_page_badflags(page, reason, 0);
+	__dump_page(page, reason);
+	dump_page_owner(page);
 }
 EXPORT_SYMBOL(dump_page);
 
 #ifdef CONFIG_DEBUG_VM
 
-static const struct trace_print_flags vmaflags_names[] = {
-	{VM_READ,			"read"		},
-	{VM_WRITE,			"write"		},
-	{VM_EXEC,			"exec"		},
-	{VM_SHARED,			"shared"	},
-	{VM_MAYREAD,			"mayread"	},
-	{VM_MAYWRITE,			"maywrite"	},
-	{VM_MAYEXEC,			"mayexec"	},
-	{VM_MAYSHARE,			"mayshare"	},
-	{VM_GROWSDOWN,			"growsdown"	},
-	{VM_PFNMAP,			"pfnmap"	},
-	{VM_DENYWRITE,			"denywrite"	},
-	{VM_LOCKONFAULT,		"lockonfault"	},
-	{VM_LOCKED,			"locked"	},
-	{VM_IO,				"io"		},
-	{VM_SEQ_READ,			"seqread"	},
-	{VM_RAND_READ,			"randread"	},
-	{VM_DONTCOPY,			"dontcopy"	},
-	{VM_DONTEXPAND,			"dontexpand"	},
-	{VM_ACCOUNT,			"account"	},
-	{VM_NORESERVE,			"noreserve"	},
-	{VM_HUGETLB,			"hugetlb"	},
-#if defined(CONFIG_X86)
-	{VM_PAT,			"pat"		},
-#elif defined(CONFIG_PPC)
-	{VM_SAO,			"sao"		},
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
-	{VM_GROWSUP,			"growsup"	},
-#elif !defined(CONFIG_MMU)
-	{VM_MAPPED_COPY,		"mappedcopy"	},
-#else
-	{VM_ARCH_1,			"arch_1"	},
-#endif
-	{VM_DONTDUMP,			"dontdump"	},
-#ifdef CONFIG_MEM_SOFT_DIRTY
-	{VM_SOFTDIRTY,			"softdirty"	},
-#endif
-	{VM_MIXEDMAP,			"mixedmap"	},
-	{VM_HUGEPAGE,			"hugepage"	},
-	{VM_NOHUGEPAGE,			"nohugepage"	},
-	{VM_MERGEABLE,			"mergeable"	},
-};
-
 void dump_vma(const struct vm_area_struct *vma)
 {
 	pr_emerg("vma %p start %p end %p\n"
 		"next %p prev %p mm %p\n"
 		"prot %lx anon_vma %p vm_ops %p\n"
-		"pgoff %lx file %p private_data %p\n",
+		"pgoff %lx file %p private_data %p\n"
+		"flags: %#lx(%pGv)\n",
 		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
 		vma->vm_prev, vma->vm_mm,
 		(unsigned long)pgprot_val(vma->vm_page_prot),
 		vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
-		vma->vm_file, vma->vm_private_data);
-	dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+		vma->vm_file, vma->vm_private_data,
+		vma->vm_flags, &vma->vm_flags);
 }
 EXPORT_SYMBOL(dump_vma);
 
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		"tlb_flush_pending %d\n"
 #endif
-		"%s", /* This is here to hold the comma */
+		"def_flags: %#lx(%pGv)\n",
 
 		mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
 #ifdef CONFIG_MMU
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		mm->tlb_flush_pending,
 #endif
-		"" /* This is here to not have a comma! */
+		mm->def_flags, &mm->def_flags
 	);
-
-	dump_flags(mm->def_flags, vmaflags_names,
-			ARRAY_SIZE(vmaflags_names));
 }
 
 #endif /* CONFIG_DEBUG_VM */
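
As a usage note, a hedged sketch (not part of this patch) of how a caller might print flags with the new %pGp/%pGv printk specifiers that replace the old hand-rolled dump_flags() helper:

static void example_dump_flags(struct page *page, struct vm_area_struct *vma)
{
	/* decode page and vma flags symbolically via the exported name tables */
	pr_alert("page flags: %#lx(%pGp)\n", page->flags, &page->flags);
	pr_alert("vma flags:  %#lx(%pGv)\n", vma->vm_flags, &vma->vm_flags);
}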
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
 
 static struct {
 	struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
 	.cache_filter = false,
 };
 
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
 {
+	/* No fault-injection for bootstrap cache */
+	if (unlikely(s == kmem_cache))
+		return false;
+
 	if (gfpflags & __GFP_NOFAIL)
 		return false;
 
 	if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
 		return false;
 
-	if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+	if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
 		return false;
 
-	return should_fail(&failslab.attr, size);
+	return should_fail(&failslab.attr, s->object_size);
 }
 
 static int __init setup_failslab(char *str)
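
A hedged example (not from this patch) of the cache_filter side of the check above: when failslab's cache_filter is enabled, only caches explicitly created with SLAB_FAILSLAB are eligible for injected allocation failures.

static struct kmem_cache *demo_cache;	/* illustrative name */

static int __init demo_failslab_init(void)
{
	/* opt this cache in to fault injection via SLAB_FAILSLAB */
	demo_cache = kmem_cache_create("demo_cache", 128, 0, SLAB_FAILSLAB, NULL);
	return demo_cache ? 0 : -ENOMEM;
}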
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d83de7..61b441b191ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
- *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
+ *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __delete_from_page_cache(struct page *page, void *shadow,
-			      struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping, memcg,
-				     inode_to_wb(mapping->host));
+		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
 }
 
 /**
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-	struct mem_cgroup *memcg;
 	unsigned long flags;
 
 	void (*freepage)(struct page *);
@@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 
-	memcg = mem_cgroup_begin_page_stat(page);
 	spin_lock_irqsave(&mapping->tree_lock, flags);
-	__delete_from_page_cache(page, NULL, memcg);
+	__delete_from_page_cache(page, NULL);
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	mem_cgroup_end_page_stat(memcg);
 
 	if (freepage)
 		freepage(page);
@@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
-		struct mem_cgroup *memcg;
 		unsigned long flags;
 
 		pgoff_t offset = old->index;
@@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->mapping = mapping;
 		new->index = offset;
 
-		memcg = mem_cgroup_begin_page_stat(old);
 		spin_lock_irqsave(&mapping->tree_lock, flags);
-		__delete_from_page_cache(old, NULL, memcg);
+		__delete_from_page_cache(old, NULL);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
@@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		mem_cgroup_end_page_stat(memcg);
-		mem_cgroup_replace_page(old, new);
+		mem_cgroup_migrate(old, new);
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
@@ -1668,6 +1659,15 @@ find_page:
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
+			/*
+			 * See comment in do_read_cache_page on why
+			 * wait_on_page_locked is used to avoid unnecessarily
+			 * serialisations and why it's safe.
+			 */
+			wait_on_page_locked_killable(page);
+			if (PageUptodate(page))
+				goto page_ok;
+
 			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
 					!mapping->a_ops->is_partially_uptodate)
 				goto page_not_up_to_date;
@@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page)
 	return page;
 }
 
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
 				pgoff_t index,
 				int (*filler)(void *, struct page *),
 				void *data,
@@ -2325,53 +2325,74 @@ repeat:
 		/* Presumably ENOMEM for radix tree node */
 		return ERR_PTR(err);
 	}
+
+filler:
 		err = filler(data, page);
 		if (err < 0) {
 			page_cache_release(page);
-			page = ERR_PTR(err);
-		} else {
-			page = wait_on_page_read(page);
+			return ERR_PTR(err);
 		}
-	}
-	return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
-				pgoff_t index,
-				int (*filler)(void *, struct page *),
-				void *data,
-				gfp_t gfp)
 
-{
-	struct page *page;
-	int err;
+		page = wait_on_page_read(page);
+		if (IS_ERR(page))
+			return page;
+		goto out;
+	}
+	if (PageUptodate(page))
+		goto out;
 
-retry:
-	page = __read_cache_page(mapping, index, filler, data, gfp);
-	if (IS_ERR(page))
-		return page;
+	/*
+	 * Page is not up to date and may be locked due one of the following
+	 * case a: Page is being filled and the page lock is held
+	 * case b: Read/write error clearing the page uptodate status
+	 * case c: Truncation in progress (page locked)
+	 * case d: Reclaim in progress
+	 *
+	 * Case a, the page will be up to date when the page is unlocked.
+	 *    There is no need to serialise on the page lock here as the page
+	 *    is pinned so the lock gives no additional protection. Even if the
+	 *    the page is truncated, the data is still valid if PageUptodate as
+	 *    it's a race vs truncate race.
+	 * Case b, the page will not be up to date
+	 * Case c, the page may be truncated but in itself, the data may still
+	 *    be valid after IO completes as it's a read vs truncate race. The
+	 *    operation must restart if the page is not uptodate on unlock but
+	 *    otherwise serialising on page lock to stabilise the mapping gives
+	 *    no additional guarantees to the caller as the page lock is
+	 *    released before return.
+	 * Case d, similar to truncation. If reclaim holds the page lock, it
+	 *    will be a race with remove_mapping that determines if the mapping
+	 *    is valid on unlock but otherwise the data is valid and there is
+	 *    no need to serialise with page lock.
+	 *
+	 * As the page lock gives no additional guarantee, we optimistically
+	 * wait on the page to be unlocked and check if it's up to date and
+	 * use the page if it is. Otherwise, the page lock is required to
+	 * distinguish between the different cases. The motivation is that we
+	 * avoid spurious serialisations and wakeups when multiple processes
+	 * wait on the same page for IO to complete.
+	 */
+	wait_on_page_locked(page);
 	if (PageUptodate(page))
 		goto out;
 
+	/* Distinguish between all the cases under the safety of the lock */
 	lock_page(page);
+
+	/* Case c or d, restart the operation */
 	if (!page->mapping) {
 		unlock_page(page);
 		page_cache_release(page);
-		goto retry;
+		goto repeat;
 	}
+
+	/* Someone else locked and filled the page in a very small window */
 	if (PageUptodate(page)) {
 		unlock_page(page);
 		goto out;
 	}
-	err = filler(data, page);
-	if (err < 0) {
-		page_cache_release(page);
-		return ERR_PTR(err);
-	} else {
-		page = wait_on_page_read(page);
-		if (IS_ERR(page))
-			return page;
-	}
+	goto filler;
+
 out:
 	mark_page_accessed(page);
 	return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e10a4fee88d2..1ea21e203a70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
 	}
 }
 
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
 		struct lruvec *lruvec, struct list_head *list)
 {
-	int mapcount;
 	struct page *page_tail = head + tail;
 
-	mapcount = atomic_read(&page_tail->_mapcount) + 1;
+	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
 
 	/*
 	 * tail_page->_count is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
-	 * tail_page. If we used atomic_set() below instead of atomic_add(), we
+	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
 	 * using locked ops. spin_unlock on x86 sometime uses locked ops
 	 * because of PPro errata 66, 92, so unless somebody can guarantee
 	 * atomic_set() here would be safe on all archs (and not only on x86),
-	 * it's safer to use atomic_add().
+	 * it's safer to use atomic_inc().
 	 */
-	atomic_add(mapcount + 1, &page_tail->_count);
-
+	atomic_inc(&page_tail->_count);
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
@@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
 	page_tail->index = head->index + tail;
 	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 	lru_add_page_tail(head, page_tail, lruvec, list);
-
-	return mapcount;
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
-	int i, tail_mapcount;
+	int i;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	tail_mapcount = 0;
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
-		tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
-	atomic_sub(tail_mapcount, &head->_count);
+		__split_huge_page_tail(head, i, lruvec, list);
 
 	ClearPageCompound(head);
 	spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..ad9400d759c8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
 
 /*
  * The set of flags that only affect watermark checking and reclaim
@@ -131,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
 	return page_idx ^ (1 << order);
 }
 
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	if (zone->contiguous)
+		return pfn_to_page(start_pfn);
+
+	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
 extern int __isolate_free_page(struct page *page, unsigned int order);
 extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
 					unsigned int order);
@@ -466,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void)
 }
 
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
 void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
 			  size_t size)
 {
+	if (unlikely(!object)) /* Skip object if allocation failed */
+		return;
+
 	/*
 	 * Has already been memset(), which initializes the shadow for us
 	 * as well.
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 		}
 		pr_info("Injecting memory failure for page %#lx at %#lx\n",
 		       page_to_pfn(p), start);
-		/* Ignore return value for now */
-		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
  *  some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_FREE - the application marks pages in the given range as lazy free,
+ *		where actual purges are postponed until memory pressure happens.
  *  MADV_REMOVE - the application wants to free up the given range of
  *		pages and associated backing store.
  *  MADV_DONTFORK - omit this area from child's address space when forking:
  *		typically, to avoid COWing pages pinned by get_user_pages().
  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ *  MADV_HWPOISON - trigger memory error handler as if the given memory range
+ *		were corrupted by unrecoverable hardware memory failure.
+ *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
  *		this area with pages of identical content from other such areas.
  *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ *  MADV_HUGEPAGE - the application wants to back the given range by transparent
+ *		huge pages in the future. Existing pages might be coalesced and
+ *		new pages might be allocated as THP.
+ *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ *		transparent huge pages so the existing pages will not be
+ *		coalesced into THP and new pages will not be allocated as THP.
+ *  MADV_DONTDUMP - the application wants to prevent pages in the given range
+ *		from being included in its core dump.
+ *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  *
  * return values:
  *  zero    - success
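
A hedged userspace sketch of MADV_FREE as documented in the comment above: the kernel may reclaim the range lazily under memory pressure, and the contents survive until reclaim actually happens or the pages are written again. The wrapper name is illustrative.

#include <sys/mman.h>
#include <stddef.h>

static void drop_scratch_buffer(void *buf, size_t len)
{
	/* lazy-free hint; not an immediate unmap or zeroing of the range */
	madvise(buf, len, MADV_FREE);
}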
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..fc7824fa1b42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
 				int nid,
 				unsigned long flags)
 {
-	struct memblock_type *type = &memblock.memory;
-
 	memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
 		     flags, (void *)_RET_IP_);
 
-	return memblock_add_range(type, base, size, nid, flags);
+	return memblock_add_range(&memblock.memory, base, size, nid, flags);
 }
 
 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
 				int nid,
 				unsigned long flags)
 {
-	struct memblock_type *type = &memblock.reserved;
-
 	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
 		     flags, (void *)_RET_IP_);
 
-	return memblock_add_range(type, base, size, nid, flags);
+	return memblock_add_range(&memblock.reserved, base, size, nid, flags);
 }
 
 int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..42882c1e7fce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX	USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-	return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
-
-	css = css_from_id(id, &memory_cgrp_subsys);
-	return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -1709,19 +1684,13 @@ cleanup:
 }
 
 /**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
  *
- *   memcg = mem_cgroup_begin_page_stat(page);
- *   if (TestClearPageState(page))
- *     mem_cgroup_update_page_stat(memcg, state, -1);
- *   mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
  */
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
 {
 	struct mem_cgroup *memcg;
 	unsigned long flags;
@@ -1730,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
 	 * The RCU lock is held throughout the transaction. The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-	 *
-	 * The RCU lock also protects the memcg from being freed when
-	 * the page state that is going to change is the only thing
-	 * preventing the page from being uncharged.
-	 * E.g. end-writeback clearing PageWriteback(), which allows
-	 * migration to go ahead and uncharge the page before the
-	 * account transaction might be complete.
 	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return NULL;
+		return;
 again:
 	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
-		return NULL;
+		return;
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return memcg;
+		return;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page->mem_cgroup) {
@@ -1759,21 +1721,23 @@ again:
 	/*
 	 * When charge migration first begins, we can have locked and
 	 * unlocked page stat updates happening concurrently. Track
-	 * the task who has the lock for mem_cgroup_end_page_stat().
+	 * the task who has the lock for unlock_page_memcg().
 	 */
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 
-	return memcg;
+	return;
 }
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
  */
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
 {
+	struct mem_cgroup *memcg = page->mem_cgroup;
+
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
 
@@ -1785,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -4488,7 +4452,7 @@ static int mem_cgroup_move_account(struct page *page,
 	VM_BUG_ON(compound && !PageTransHuge(page));
 
 	/*
-	 * Prevent mem_cgroup_replace_page() from looking at
+	 * Prevent mem_cgroup_migrate() from looking at
 	 * page->mem_cgroup of its source page while we change it.
 	 */
 	ret = -EBUSY;
@@ -4923,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 
 	lru_add_drain_all();
 	/*
-	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
-	 * move_lock while we're moving its pages to another memcg.
-	 * Then wait for already started RCU-only updates to finish.
+	 * Signal lock_page_memcg() to take the memcg's move_lock
+	 * while we're moving its pages to another memcg. Then wait
+	 * for already started RCU-only updates to finish.
 	 */
 	atomic_inc(&mc.from->moving_account);
 	synchronize_rcu();
@@ -5517,16 +5481,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
  *
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
  *
  * Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
  */
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 {
 	struct mem_cgroup *memcg;
 	unsigned int nr_pages;
@@ -5559,7 +5523,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
 		page_counter_charge(&memcg->memsw, nr_pages);
 	css_get_many(&memcg->css, nr_pages);
 
-	commit_charge(newpage, memcg, true);
+	commit_charge(newpage, memcg, false);
 
 	local_irq_disable();
 	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
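
A hedged sketch (not from this patch) of the new API shape: a page-state update is bracketed by lock_page_memcg()/unlock_page_memcg() instead of carrying a struct mem_cgroup pointer around, mirroring the filemap.c changes earlier in this diff. The helper name is illustrative and writeback-lock handling is elided.

static void example_clear_dirty(struct page *page, struct address_space *mapping)
{
	lock_page_memcg(page);		/* stabilize page->mem_cgroup */
	if (TestClearPageDirty(page))
		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
	unlock_page_memcg(page);
}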
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3a95..67c30eb993f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -826,8 +826,6 @@ static struct page_state {
826#undef lru 826#undef lru
827#undef swapbacked 827#undef swapbacked
828#undef head 828#undef head
829#undef tail
830#undef compound
831#undef slab 829#undef slab
832#undef reserved 830#undef reserved
833 831
diff --git a/mm/memory.c b/mm/memory.c
index 906d8e3b42c0..0e247642ed5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1897,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1897 unsigned long end = addr + size; 1897 unsigned long end = addr + size;
1898 int err; 1898 int err;
1899 1899
1900 BUG_ON(addr >= end); 1900 if (WARN_ON(addr >= end))
1901 return -EINVAL;
1902
1901 pgd = pgd_offset(mm, addr); 1903 pgd = pgd_offset(mm, addr);
1902 do { 1904 do {
1903 next = pgd_addr_end(addr, end); 1905 next = pgd_addr_end(addr, end);
@@ -3143,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 unsigned long address, pte_t *page_table, pmd_t *pmd, 3145 unsigned long address, pte_t *page_table, pmd_t *pmd,
3144 unsigned int flags, pte_t orig_pte) 3146 unsigned int flags, pte_t orig_pte)
3145{ 3147{
3146 pgoff_t pgoff = (((address & PAGE_MASK) 3148 pgoff_t pgoff = linear_page_index(vma, address);
3147 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3148 3149
3149 pte_unmap(page_table); 3150 pte_unmap(page_table);
3150 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3151 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
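
In mm/memory.c, do_fault() now derives the file offset with linear_page_index() instead of open-coding the arithmetic, and apply_to_page_range() returns -EINVAL on an empty range rather than hitting a BUG_ON. The standalone userspace model below only illustrates that offset calculation (it assumes 4 KiB pages, PAGE_SHIFT == 12); it is not kernel code from this series.

/* Userspace model of the pgoff arithmetic replaced by linear_page_index(). */
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

static unsigned long linear_page_index_model(unsigned long vm_start,
					      unsigned long vm_pgoff,
					      unsigned long address)
{
	return (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	/* mapping starts at 0x40000000, backed by the file 16 pages in */
	unsigned long pgoff = linear_page_index_model(0x40000000UL, 16,
						      0x40003abcUL);
	printf("pgoff = %lu\n", pgoff);	/* 3 pages into the VMA -> 19 */
	return 0;
}
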
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 979b18cbd343..24ea06393816 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -77,6 +77,9 @@ static struct {
77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
79 79
80bool memhp_auto_online;
81EXPORT_SYMBOL_GPL(memhp_auto_online);
82
80void get_online_mems(void) 83void get_online_mems(void)
81{ 84{
82 might_sleep(); 85 might_sleep();
@@ -509,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
509 int start_sec, end_sec; 512 int start_sec, end_sec;
510 struct vmem_altmap *altmap; 513 struct vmem_altmap *altmap;
511 514
515 clear_zone_contiguous(zone);
516
512 /* during initialize mem_map, align hot-added range to section */ 517 /* during initialize mem_map, align hot-added range to section */
513 start_sec = pfn_to_section_nr(phys_start_pfn); 518 start_sec = pfn_to_section_nr(phys_start_pfn);
514 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 519 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -521,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
521 if (altmap->base_pfn != phys_start_pfn 526 if (altmap->base_pfn != phys_start_pfn
522 || vmem_altmap_offset(altmap) > nr_pages) { 527 || vmem_altmap_offset(altmap) > nr_pages) {
523 pr_warn_once("memory add fail, invalid altmap\n"); 528 pr_warn_once("memory add fail, invalid altmap\n");
524 return -EINVAL; 529 err = -EINVAL;
530 goto out;
525 } 531 }
526 altmap->alloc = 0; 532 altmap->alloc = 0;
527 } 533 }
@@ -539,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
539 err = 0; 545 err = 0;
540 } 546 }
541 vmemmap_populate_print_last(); 547 vmemmap_populate_print_last();
542 548out:
549 set_zone_contiguous(zone);
543 return err; 550 return err;
544} 551}
545EXPORT_SYMBOL_GPL(__add_pages); 552EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
811 } 818 }
812 } 819 }
813 820
821 clear_zone_contiguous(zone);
822
814 /* 823 /*
815 * We can only remove entire sections 824 * We can only remove entire sections
816 */ 825 */
@@ -826,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
826 if (ret) 835 if (ret)
827 break; 836 break;
828 } 837 }
838
839 set_zone_contiguous(zone);
840
829 return ret; 841 return ret;
830} 842}
831EXPORT_SYMBOL_GPL(__remove_pages); 843EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1261,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
1261 return zone_default; 1273 return zone_default;
1262} 1274}
1263 1275
1276static int online_memory_block(struct memory_block *mem, void *arg)
1277{
1278 return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
1279}
1280
1264/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1281/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1265int __ref add_memory_resource(int nid, struct resource *res) 1282int __ref add_memory_resource(int nid, struct resource *res, bool online)
1266{ 1283{
1267 u64 start, size; 1284 u64 start, size;
1268 pg_data_t *pgdat = NULL; 1285 pg_data_t *pgdat = NULL;
@@ -1322,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
1322 /* create new memmap entry */ 1339 /* create new memmap entry */
1323 firmware_map_add_hotplug(start, start + size, "System RAM"); 1340 firmware_map_add_hotplug(start, start + size, "System RAM");
1324 1341
1342 /* online pages if requested */
1343 if (online)
1344 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1345 NULL, online_memory_block);
1346
1325 goto out; 1347 goto out;
1326 1348
1327error: 1349error:
@@ -1345,7 +1367,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
1345 if (IS_ERR(res)) 1367 if (IS_ERR(res))
1346 return PTR_ERR(res); 1368 return PTR_ERR(res);
1347 1369
1348 ret = add_memory_resource(nid, res); 1370 ret = add_memory_resource(nid, res, memhp_auto_online);
1349 if (ret < 0) 1371 if (ret < 0)
1350 release_memory_resource(res); 1372 release_memory_resource(res);
1351 return ret; 1373 return ret;
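
The memory_hotplug.c changes add a memhp_auto_online toggle and a third argument to add_memory_resource(), so hot-added blocks can be onlined as part of the add itself, while __add_pages()/__remove_pages() clear and re-establish the zone's contiguous flag around the operation. A kernel-style sketch of a caller using the new argument follows; probe_example() is a hypothetical hotplug path, not code from this series.

/* Sketch only: probe_example() is hypothetical; the bool argument of
 * add_memory_resource() is the one introduced in the hunks above. */
#include <linux/memory_hotplug.h>
#include <linux/ioport.h>
#include <linux/printk.h>

static int probe_example(int nid, struct resource *res)
{
	/* true: online the new memory blocks immediately, as with
	 * memhp_auto_online; false keeps the old deferred behaviour. */
	int ret = add_memory_resource(nid, res, true);

	if (ret < 0)
		pr_err("hot-add of %pR failed: %d\n", res, ret);
	return ret;
}
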
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a3f6b90e628..8cbc74387df3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
643 643
644 if (flags & MPOL_MF_LAZY) { 644 if (flags & MPOL_MF_LAZY) {
645 /* Similar to task_numa_work, skip inaccessible VMAs */ 645 /* Similar to task_numa_work, skip inaccessible VMAs */
646 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 646 if (!is_vm_hugetlb_page(vma) &&
647 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
648 !(vma->vm_flags & VM_MIXEDMAP))
647 change_prot_numa(vma, start, endvma); 649 change_prot_numa(vma, start, endvma);
648 return 1; 650 return 1;
649 } 651 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ad0fea5c438..568284ec75d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
38#include <linux/balloon_compaction.h> 38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h> 39#include <linux/mmu_notifier.h>
40#include <linux/page_idle.h> 40#include <linux/page_idle.h>
41#include <linux/page_owner.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43 44
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
325 return -EAGAIN; 326 return -EAGAIN;
326 327
327 /* No turning back from here */ 328 /* No turning back from here */
328 set_page_memcg(newpage, page_memcg(page));
329 newpage->index = page->index; 329 newpage->index = page->index;
330 newpage->mapping = page->mapping; 330 newpage->mapping = page->mapping;
331 if (PageSwapBacked(page)) 331 if (PageSwapBacked(page))
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
372 * Now we know that no one else is looking at the page: 372 * Now we know that no one else is looking at the page:
373 * no turning back from here. 373 * no turning back from here.
374 */ 374 */
375 set_page_memcg(newpage, page_memcg(page));
376 newpage->index = page->index; 375 newpage->index = page->index;
377 newpage->mapping = page->mapping; 376 newpage->mapping = page->mapping;
378 if (PageSwapBacked(page)) 377 if (PageSwapBacked(page))
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
457 return -EAGAIN; 456 return -EAGAIN;
458 } 457 }
459 458
460 set_page_memcg(newpage, page_memcg(page));
461 newpage->index = page->index; 459 newpage->index = page->index;
462 newpage->mapping = page->mapping; 460 newpage->mapping = page->mapping;
461
463 get_page(newpage); 462 get_page(newpage);
464 463
465 radix_tree_replace_slot(pslot, newpage); 464 radix_tree_replace_slot(pslot, newpage);
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
467 page_unfreeze_refs(page, expected_count - 1); 466 page_unfreeze_refs(page, expected_count - 1);
468 467
469 spin_unlock_irq(&mapping->tree_lock); 468 spin_unlock_irq(&mapping->tree_lock);
469
470 return MIGRATEPAGE_SUCCESS; 470 return MIGRATEPAGE_SUCCESS;
471} 471}
472 472
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
578 */ 578 */
579 if (PageWriteback(newpage)) 579 if (PageWriteback(newpage))
580 end_page_writeback(newpage); 580 end_page_writeback(newpage);
581
582 copy_page_owner(page, newpage);
583
584 mem_cgroup_migrate(page, newpage);
581} 585}
582 586
583/************************************************************ 587/************************************************************
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
772 * page is freed; but stats require that PageAnon be left as PageAnon. 776 * page is freed; but stats require that PageAnon be left as PageAnon.
773 */ 777 */
774 if (rc == MIGRATEPAGE_SUCCESS) { 778 if (rc == MIGRATEPAGE_SUCCESS) {
775 set_page_memcg(page, NULL);
776 if (!PageAnon(page)) 779 if (!PageAnon(page))
777 page->mapping = NULL; 780 page->mapping = NULL;
778 } 781 }
@@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
952 } 955 }
953 956
954 rc = __unmap_and_move(page, newpage, force, mode); 957 rc = __unmap_and_move(page, newpage, force, mode);
955 if (rc == MIGRATEPAGE_SUCCESS) 958 if (rc == MIGRATEPAGE_SUCCESS) {
956 put_new_page = NULL; 959 put_new_page = NULL;
960 set_page_owner_migrate_reason(newpage, reason);
961 }
957 962
958out: 963out:
959 if (rc != -EAGAIN) { 964 if (rc != -EAGAIN) {
@@ -1018,7 +1023,7 @@ out:
1018static int unmap_and_move_huge_page(new_page_t get_new_page, 1023static int unmap_and_move_huge_page(new_page_t get_new_page,
1019 free_page_t put_new_page, unsigned long private, 1024 free_page_t put_new_page, unsigned long private,
1020 struct page *hpage, int force, 1025 struct page *hpage, int force,
1021 enum migrate_mode mode) 1026 enum migrate_mode mode, int reason)
1022{ 1027{
1023 int rc = -EAGAIN; 1028 int rc = -EAGAIN;
1024 int *result = NULL; 1029 int *result = NULL;
@@ -1076,6 +1081,7 @@ put_anon:
1076 if (rc == MIGRATEPAGE_SUCCESS) { 1081 if (rc == MIGRATEPAGE_SUCCESS) {
1077 hugetlb_cgroup_migrate(hpage, new_hpage); 1082 hugetlb_cgroup_migrate(hpage, new_hpage);
1078 put_new_page = NULL; 1083 put_new_page = NULL;
1084 set_page_owner_migrate_reason(new_hpage, reason);
1079 } 1085 }
1080 1086
1081 unlock_page(hpage); 1087 unlock_page(hpage);
@@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1148 if (PageHuge(page)) 1154 if (PageHuge(page))
1149 rc = unmap_and_move_huge_page(get_new_page, 1155 rc = unmap_and_move_huge_page(get_new_page,
1150 put_new_page, private, page, 1156 put_new_page, private, page,
1151 pass > 2, mode); 1157 pass > 2, mode, reason);
1152 else 1158 else
1153 rc = unmap_and_move(get_new_page, put_new_page, 1159 rc = unmap_and_move(get_new_page, put_new_page,
1154 private, page, pass > 2, mode, 1160 private, page, pass > 2, mode,
@@ -1836,9 +1842,8 @@ fail_putback:
1836 } 1842 }
1837 1843
1838 mlock_migrate_page(new_page, page); 1844 mlock_migrate_page(new_page, page);
1839 set_page_memcg(new_page, page_memcg(page));
1840 set_page_memcg(page, NULL);
1841 page_remove_rmap(page, true); 1845 page_remove_rmap(page, true);
1846 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
1842 1847
1843 spin_unlock(ptl); 1848 spin_unlock(ptl);
1844 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1849 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
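
With mem_cgroup_migrate() and copy_page_owner() folded into migrate_page_copy(), the scattered set_page_memcg() calls above disappear: any migration path that copies page state through migrate_page_copy() gets the memcg charge transfer and the page_owner record for free. A kernel-style sketch of a minimal ->migratepage() callback written against that behaviour follows; the function name is hypothetical and the migrate_page_move_mapping() argument list is assumed from this kernel tree.

/* Sketch only: example_migratepage() is hypothetical. */
#include <linux/migrate.h>
#include <linux/fs.h>

static int example_migratepage(struct address_space *mapping,
			       struct page *newpage, struct page *page,
			       enum migrate_mode mode)
{
	int rc;

	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/* Copies flags and dirty state, and now also transfers the memcg
	 * charge and the page_owner information to newpage. */
	migrate_page_copy(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
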
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c06941b..e97a05d9621f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
386static void dump_header(struct oom_control *oc, struct task_struct *p, 386static void dump_header(struct oom_control *oc, struct task_struct *p,
387 struct mem_cgroup *memcg) 387 struct mem_cgroup *memcg)
388{ 388{
389 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 389 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
390 "oom_score_adj=%hd\n", 390 "oom_score_adj=%hd\n",
391 current->comm, oc->gfp_mask, oc->order, 391 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
392 current->signal->oom_score_adj); 392 current->signal->oom_score_adj);
393
393 cpuset_print_current_mems_allowed(); 394 cpuset_print_current_mems_allowed();
394 dump_stack(); 395 dump_stack();
395 if (memcg) 396 if (memcg)
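
The oom_kill.c hunk switches dump_header() to pr_warn() and to the new %pGg printk extension, which decodes a gfp_t symbolically from a pointer to the mask, so the raw value and the decoded flag names print together. A one-line kernel-style illustration of that format follows; report_alloc() is a hypothetical helper.

/* Sketch only: report_alloc() is hypothetical; %pGg takes &gfp_mask. */
#include <linux/printk.h>
#include <linux/gfp.h>

static void report_alloc(gfp_t gfp_mask, unsigned int order)
{
	pr_warn("allocation attempt: order:%u, mode:%#x(%pGg)\n",
		order, gfp_mask, &gfp_mask);
}
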
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6fe7d15bd1f7..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1169 unsigned long balanced_dirty_ratelimit; 1169 unsigned long balanced_dirty_ratelimit;
1170 unsigned long step; 1170 unsigned long step;
1171 unsigned long x; 1171 unsigned long x;
1172 unsigned long shift;
1172 1173
1173 /* 1174 /*
1174 * The dirty rate will match the writeout rate in long term, except 1175 * The dirty rate will match the writeout rate in long term, except
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1293 * rate itself is constantly fluctuating. So decrease the track speed 1294 * rate itself is constantly fluctuating. So decrease the track speed
1294 * when it gets close to the target. Helps eliminate pointless tremors. 1295 * when it gets close to the target. Helps eliminate pointless tremors.
1295 */ 1296 */
1296 step >>= dirty_ratelimit / (2 * step + 1); 1297 shift = dirty_ratelimit / (2 * step + 1);
1297 /* 1298 if (shift < BITS_PER_LONG)
1298 * Limit the tracking speed to avoid overshooting. 1299 step = DIV_ROUND_UP(step >> shift, 8);
1299 */ 1300 else
1300 step = (step + 7) / 8; 1301 step = 0;
1301 1302
1302 if (dirty_ratelimit < balanced_dirty_ratelimit) 1303 if (dirty_ratelimit < balanced_dirty_ratelimit)
1303 dirty_ratelimit += step; 1304 dirty_ratelimit += step;
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
2409/* 2410/*
2410 * Helper function for set_page_dirty family. 2411 * Helper function for set_page_dirty family.
2411 * 2412 *
2412 * Caller must hold mem_cgroup_begin_page_stat(). 2413 * Caller must hold lock_page_memcg().
2413 * 2414 *
2414 * NOTE: This relies on being atomic wrt interrupts. 2415 * NOTE: This relies on being atomic wrt interrupts.
2415 */ 2416 */
2416void account_page_dirtied(struct page *page, struct address_space *mapping, 2417void account_page_dirtied(struct page *page, struct address_space *mapping)
2417 struct mem_cgroup *memcg)
2418{ 2418{
2419 struct inode *inode = mapping->host; 2419 struct inode *inode = mapping->host;
2420 2420
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
2426 inode_attach_wb(inode, page); 2426 inode_attach_wb(inode, page);
2427 wb = inode_to_wb(inode); 2427 wb = inode_to_wb(inode);
2428 2428
2429 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2429 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2430 __inc_zone_page_state(page, NR_FILE_DIRTY); 2430 __inc_zone_page_state(page, NR_FILE_DIRTY);
2431 __inc_zone_page_state(page, NR_DIRTIED); 2431 __inc_zone_page_state(page, NR_DIRTIED);
2432 __inc_wb_stat(wb, WB_RECLAIMABLE); 2432 __inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
2441/* 2441/*
2442 * Helper function for deaccounting dirty page without writeback. 2442 * Helper function for deaccounting dirty page without writeback.
2443 * 2443 *
2444 * Caller must hold mem_cgroup_begin_page_stat(). 2444 * Caller must hold lock_page_memcg().
2445 */ 2445 */
2446void account_page_cleaned(struct page *page, struct address_space *mapping, 2446void account_page_cleaned(struct page *page, struct address_space *mapping,
2447 struct mem_cgroup *memcg, struct bdi_writeback *wb) 2447 struct bdi_writeback *wb)
2448{ 2448{
2449 if (mapping_cap_account_dirty(mapping)) { 2449 if (mapping_cap_account_dirty(mapping)) {
2450 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2450 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2451 dec_zone_page_state(page, NR_FILE_DIRTY); 2451 dec_zone_page_state(page, NR_FILE_DIRTY);
2452 dec_wb_stat(wb, WB_RECLAIMABLE); 2452 dec_wb_stat(wb, WB_RECLAIMABLE);
2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE); 2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
2468 */ 2468 */
2469int __set_page_dirty_nobuffers(struct page *page) 2469int __set_page_dirty_nobuffers(struct page *page)
2470{ 2470{
2471 struct mem_cgroup *memcg; 2471 lock_page_memcg(page);
2472
2473 memcg = mem_cgroup_begin_page_stat(page);
2474 if (!TestSetPageDirty(page)) { 2472 if (!TestSetPageDirty(page)) {
2475 struct address_space *mapping = page_mapping(page); 2473 struct address_space *mapping = page_mapping(page);
2476 unsigned long flags; 2474 unsigned long flags;
2477 2475
2478 if (!mapping) { 2476 if (!mapping) {
2479 mem_cgroup_end_page_stat(memcg); 2477 unlock_page_memcg(page);
2480 return 1; 2478 return 1;
2481 } 2479 }
2482 2480
2483 spin_lock_irqsave(&mapping->tree_lock, flags); 2481 spin_lock_irqsave(&mapping->tree_lock, flags);
2484 BUG_ON(page_mapping(page) != mapping); 2482 BUG_ON(page_mapping(page) != mapping);
2485 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2483 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2486 account_page_dirtied(page, mapping, memcg); 2484 account_page_dirtied(page, mapping);
2487 radix_tree_tag_set(&mapping->page_tree, page_index(page), 2485 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2488 PAGECACHE_TAG_DIRTY); 2486 PAGECACHE_TAG_DIRTY);
2489 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2487 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2490 mem_cgroup_end_page_stat(memcg); 2488 unlock_page_memcg(page);
2491 2489
2492 if (mapping->host) { 2490 if (mapping->host) {
2493 /* !PageAnon && !swapper_space */ 2491 /* !PageAnon && !swapper_space */
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
2495 } 2493 }
2496 return 1; 2494 return 1;
2497 } 2495 }
2498 mem_cgroup_end_page_stat(memcg); 2496 unlock_page_memcg(page);
2499 return 0; 2497 return 0;
2500} 2498}
2501EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2499EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
2625 if (mapping_cap_account_dirty(mapping)) { 2623 if (mapping_cap_account_dirty(mapping)) {
2626 struct inode *inode = mapping->host; 2624 struct inode *inode = mapping->host;
2627 struct bdi_writeback *wb; 2625 struct bdi_writeback *wb;
2628 struct mem_cgroup *memcg;
2629 bool locked; 2626 bool locked;
2630 2627
2631 memcg = mem_cgroup_begin_page_stat(page); 2628 lock_page_memcg(page);
2632 wb = unlocked_inode_to_wb_begin(inode, &locked); 2629 wb = unlocked_inode_to_wb_begin(inode, &locked);
2633 2630
2634 if (TestClearPageDirty(page)) 2631 if (TestClearPageDirty(page))
2635 account_page_cleaned(page, mapping, memcg, wb); 2632 account_page_cleaned(page, mapping, wb);
2636 2633
2637 unlocked_inode_to_wb_end(inode, locked); 2634 unlocked_inode_to_wb_end(inode, locked);
2638 mem_cgroup_end_page_stat(memcg); 2635 unlock_page_memcg(page);
2639 } else { 2636 } else {
2640 ClearPageDirty(page); 2637 ClearPageDirty(page);
2641 } 2638 }
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
2666 if (mapping && mapping_cap_account_dirty(mapping)) { 2663 if (mapping && mapping_cap_account_dirty(mapping)) {
2667 struct inode *inode = mapping->host; 2664 struct inode *inode = mapping->host;
2668 struct bdi_writeback *wb; 2665 struct bdi_writeback *wb;
2669 struct mem_cgroup *memcg;
2670 bool locked; 2666 bool locked;
2671 2667
2672 /* 2668 /*
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
2704 * always locked coming in here, so we get the desired 2700 * always locked coming in here, so we get the desired
2705 * exclusion. 2701 * exclusion.
2706 */ 2702 */
2707 memcg = mem_cgroup_begin_page_stat(page);
2708 wb = unlocked_inode_to_wb_begin(inode, &locked); 2703 wb = unlocked_inode_to_wb_begin(inode, &locked);
2709 if (TestClearPageDirty(page)) { 2704 if (TestClearPageDirty(page)) {
2710 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2705 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2711 dec_zone_page_state(page, NR_FILE_DIRTY); 2706 dec_zone_page_state(page, NR_FILE_DIRTY);
2712 dec_wb_stat(wb, WB_RECLAIMABLE); 2707 dec_wb_stat(wb, WB_RECLAIMABLE);
2713 ret = 1; 2708 ret = 1;
2714 } 2709 }
2715 unlocked_inode_to_wb_end(inode, locked); 2710 unlocked_inode_to_wb_end(inode, locked);
2716 mem_cgroup_end_page_stat(memcg);
2717 return ret; 2711 return ret;
2718 } 2712 }
2719 return TestClearPageDirty(page); 2713 return TestClearPageDirty(page);
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2723int test_clear_page_writeback(struct page *page) 2717int test_clear_page_writeback(struct page *page)
2724{ 2718{
2725 struct address_space *mapping = page_mapping(page); 2719 struct address_space *mapping = page_mapping(page);
2726 struct mem_cgroup *memcg;
2727 int ret; 2720 int ret;
2728 2721
2729 memcg = mem_cgroup_begin_page_stat(page); 2722 lock_page_memcg(page);
2730 if (mapping) { 2723 if (mapping) {
2731 struct inode *inode = mapping->host; 2724 struct inode *inode = mapping->host;
2732 struct backing_dev_info *bdi = inode_to_bdi(inode); 2725 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
2750 ret = TestClearPageWriteback(page); 2743 ret = TestClearPageWriteback(page);
2751 } 2744 }
2752 if (ret) { 2745 if (ret) {
2753 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2746 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2754 dec_zone_page_state(page, NR_WRITEBACK); 2747 dec_zone_page_state(page, NR_WRITEBACK);
2755 inc_zone_page_state(page, NR_WRITTEN); 2748 inc_zone_page_state(page, NR_WRITTEN);
2756 } 2749 }
2757 mem_cgroup_end_page_stat(memcg); 2750 unlock_page_memcg(page);
2758 return ret; 2751 return ret;
2759} 2752}
2760 2753
2761int __test_set_page_writeback(struct page *page, bool keep_write) 2754int __test_set_page_writeback(struct page *page, bool keep_write)
2762{ 2755{
2763 struct address_space *mapping = page_mapping(page); 2756 struct address_space *mapping = page_mapping(page);
2764 struct mem_cgroup *memcg;
2765 int ret; 2757 int ret;
2766 2758
2767 memcg = mem_cgroup_begin_page_stat(page); 2759 lock_page_memcg(page);
2768 if (mapping) { 2760 if (mapping) {
2769 struct inode *inode = mapping->host; 2761 struct inode *inode = mapping->host;
2770 struct backing_dev_info *bdi = inode_to_bdi(inode); 2762 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2792 ret = TestSetPageWriteback(page); 2784 ret = TestSetPageWriteback(page);
2793 } 2785 }
2794 if (!ret) { 2786 if (!ret) {
2795 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2787 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2796 inc_zone_page_state(page, NR_WRITEBACK); 2788 inc_zone_page_state(page, NR_WRITEBACK);
2797 } 2789 }
2798 mem_cgroup_end_page_stat(memcg); 2790 unlock_page_memcg(page);
2799 return ret; 2791 return ret;
2800 2792
2801} 2793}
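
The wb_update_dirty_ratelimit() change replaces a right shift whose count could reach or exceed BITS_PER_LONG (undefined behaviour in C) with an explicit shift variable that is range-checked before use, folding the old "limit the tracking speed" rounding into the same branch. The standalone userspace model below illustrates that guard with made-up values; it is not the kernel function itself.

/* Userspace model of the guarded shift: shifting by >= the operand width
 * is undefined, so the count is checked against BITS_PER_LONG first. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long damp_step(unsigned long step, unsigned long dirty_ratelimit)
{
	unsigned long shift = dirty_ratelimit / (2 * step + 1);

	if (shift < BITS_PER_LONG)
		return DIV_ROUND_UP(step >> shift, 8);
	return 0;	/* shift would be undefined behaviour; fully damp the step */
}

int main(void)
{
	printf("%lu\n", damp_step(4096, 8192));		/* small ratio: 512 */
	printf("%lu\n", damp_step(1, 1UL << 20));	/* huge ratio: step collapses to 0 */
	return 0;
}
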
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..c46b75d14b6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
223#endif 223#endif
224}; 224};
225 225
226char * const migratetype_names[MIGRATE_TYPES] = {
227 "Unmovable",
228 "Movable",
229 "Reclaimable",
230 "HighAtomic",
231#ifdef CONFIG_CMA
232 "CMA",
233#endif
234#ifdef CONFIG_MEMORY_ISOLATION
235 "Isolate",
236#endif
237};
238
226compound_page_dtor * const compound_page_dtors[] = { 239compound_page_dtor * const compound_page_dtors[] = {
227 NULL, 240 NULL,
228 free_compound_page, 241 free_compound_page,
@@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
247static unsigned long __initdata required_kernelcore; 260static unsigned long __initdata required_kernelcore;
248static unsigned long __initdata required_movablecore; 261static unsigned long __initdata required_movablecore;
249static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 262static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
263static bool mirrored_kernelcore;
250 264
251/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 265/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
252int movable_zone; 266int movable_zone;
@@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason,
416 goto out; 430 goto out;
417 } 431 }
418 if (nr_unshown) { 432 if (nr_unshown) {
419 printk(KERN_ALERT 433 pr_alert(
420 "BUG: Bad page state: %lu messages suppressed\n", 434 "BUG: Bad page state: %lu messages suppressed\n",
421 nr_unshown); 435 nr_unshown);
422 nr_unshown = 0; 436 nr_unshown = 0;
@@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason,
426 if (nr_shown++ == 0) 440 if (nr_shown++ == 0)
427 resume = jiffies + 60 * HZ; 441 resume = jiffies + 60 * HZ;
428 442
429 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 443 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
430 current->comm, page_to_pfn(page)); 444 current->comm, page_to_pfn(page));
431 dump_page_badflags(page, reason, bad_flags); 445 __dump_page(page, reason);
446 bad_flags &= page->flags;
447 if (bad_flags)
448 pr_alert("bad because of flags: %#lx(%pGp)\n",
449 bad_flags, &bad_flags);
450 dump_page_owner(page);
432 451
433 print_modules(); 452 print_modules();
434 dump_stack(); 453 dump_stack();
@@ -477,7 +496,8 @@ void prep_compound_page(struct page *page, unsigned int order)
477 496
478#ifdef CONFIG_DEBUG_PAGEALLOC 497#ifdef CONFIG_DEBUG_PAGEALLOC
479unsigned int _debug_guardpage_minorder; 498unsigned int _debug_guardpage_minorder;
480bool _debug_pagealloc_enabled __read_mostly; 499bool _debug_pagealloc_enabled __read_mostly
500 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
481bool _debug_guardpage_enabled __read_mostly; 501bool _debug_guardpage_enabled __read_mostly;
482 502
483static int __init early_debug_pagealloc(char *buf) 503static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf)
488 if (strcmp(buf, "on") == 0) 508 if (strcmp(buf, "on") == 0)
489 _debug_pagealloc_enabled = true; 509 _debug_pagealloc_enabled = true;
490 510
511 if (strcmp(buf, "off") == 0)
512 _debug_pagealloc_enabled = false;
513
491 return 0; 514 return 0;
492} 515}
493early_param("debug_pagealloc", early_debug_pagealloc); 516early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1002,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
1002 PAGE_SIZE << order); 1025 PAGE_SIZE << order);
1003 } 1026 }
1004 arch_free_page(page, order); 1027 arch_free_page(page, order);
1028 kernel_poison_pages(page, 1 << order, 0);
1005 kernel_map_pages(page, 1 << order, 0); 1029 kernel_map_pages(page, 1 << order, 0);
1006 1030
1007 return true; 1031 return true;
@@ -1104,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1104 return __free_pages_boot_core(page, pfn, order); 1128 return __free_pages_boot_core(page, pfn, order);
1105} 1129}
1106 1130
1131/*
1132 * Check that the whole (or subset of) a pageblock given by the interval of
1133 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1134 * with the migration of free compaction scanner. The scanners then need to
1135 * use only pfn_valid_within() check for arches that allow holes within
1136 * pageblocks.
1137 *
1138 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1139 *
1140 * It's possible on some configurations to have a setup like node0 node1 node0
1141 * i.e. it's possible that all pages within a zones range of pages do not
1142 * belong to a single zone. We assume that a border between node0 and node1
1143 * can occur within a single pageblock, but not a node0 node1 node0
1144 * interleaving within a single pageblock. It is therefore sufficient to check
1145 * the first and last page of a pageblock and avoid checking each individual
1146 * page in a pageblock.
1147 */
1148struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1149 unsigned long end_pfn, struct zone *zone)
1150{
1151 struct page *start_page;
1152 struct page *end_page;
1153
1154 /* end_pfn is one past the range we are checking */
1155 end_pfn--;
1156
1157 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1158 return NULL;
1159
1160 start_page = pfn_to_page(start_pfn);
1161
1162 if (page_zone(start_page) != zone)
1163 return NULL;
1164
1165 end_page = pfn_to_page(end_pfn);
1166
1167 /* This gives a shorter code than deriving page_zone(end_page) */
1168 if (page_zone_id(start_page) != page_zone_id(end_page))
1169 return NULL;
1170
1171 return start_page;
1172}
1173
1174void set_zone_contiguous(struct zone *zone)
1175{
1176 unsigned long block_start_pfn = zone->zone_start_pfn;
1177 unsigned long block_end_pfn;
1178
1179 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1180 for (; block_start_pfn < zone_end_pfn(zone);
1181 block_start_pfn = block_end_pfn,
1182 block_end_pfn += pageblock_nr_pages) {
1183
1184 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1185
1186 if (!__pageblock_pfn_to_page(block_start_pfn,
1187 block_end_pfn, zone))
1188 return;
1189 }
1190
1191 /* We confirm that there is no hole */
1192 zone->contiguous = true;
1193}
1194
1195void clear_zone_contiguous(struct zone *zone)
1196{
1197 zone->contiguous = false;
1198}
1199
1107#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1200#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1108static void __init deferred_free_range(struct page *page, 1201static void __init deferred_free_range(struct page *page,
1109 unsigned long pfn, int nr_pages) 1202 unsigned long pfn, int nr_pages)
@@ -1254,9 +1347,13 @@ free_range:
1254 pgdat_init_report_one_done(); 1347 pgdat_init_report_one_done();
1255 return 0; 1348 return 0;
1256} 1349}
1350#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1257 1351
1258void __init page_alloc_init_late(void) 1352void __init page_alloc_init_late(void)
1259{ 1353{
1354 struct zone *zone;
1355
1356#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1260 int nid; 1357 int nid;
1261 1358
1262 /* There will be num_node_state(N_MEMORY) threads */ 1359 /* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1367,11 @@ void __init page_alloc_init_late(void)
1270 1367
1271 /* Reinit limits that are based on free pages after the kernel is up */ 1368 /* Reinit limits that are based on free pages after the kernel is up */
1272 files_maxfiles_init(); 1369 files_maxfiles_init();
1370#endif
1371
1372 for_each_populated_zone(zone)
1373 set_zone_contiguous(zone);
1273} 1374}
1274#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1275 1375
1276#ifdef CONFIG_CMA 1376#ifdef CONFIG_CMA
1277/* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1377/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1381,15 +1481,24 @@ static inline int check_new_page(struct page *page)
1381 return 0; 1481 return 0;
1382} 1482}
1383 1483
1484static inline bool free_pages_prezeroed(bool poisoned)
1485{
1486 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1487 page_poisoning_enabled() && poisoned;
1488}
1489
1384static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1490static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1385 int alloc_flags) 1491 int alloc_flags)
1386{ 1492{
1387 int i; 1493 int i;
1494 bool poisoned = true;
1388 1495
1389 for (i = 0; i < (1 << order); i++) { 1496 for (i = 0; i < (1 << order); i++) {
1390 struct page *p = page + i; 1497 struct page *p = page + i;
1391 if (unlikely(check_new_page(p))) 1498 if (unlikely(check_new_page(p)))
1392 return 1; 1499 return 1;
1500 if (poisoned)
1501 poisoned &= page_is_poisoned(p);
1393 } 1502 }
1394 1503
1395 set_page_private(page, 0); 1504 set_page_private(page, 0);
@@ -1397,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1397 1506
1398 arch_alloc_page(page, order); 1507 arch_alloc_page(page, order);
1399 kernel_map_pages(page, 1 << order, 1); 1508 kernel_map_pages(page, 1 << order, 1);
1509 kernel_poison_pages(page, 1 << order, 1);
1400 kasan_alloc_pages(page, order); 1510 kasan_alloc_pages(page, order);
1401 1511
1402 if (gfp_flags & __GFP_ZERO) 1512 if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
1403 for (i = 0; i < (1 << order); i++) 1513 for (i = 0; i < (1 << order); i++)
1404 clear_highpage(page + i); 1514 clear_highpage(page + i);
1405 1515
@@ -2690,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
2690 va_end(args); 2800 va_end(args);
2691 } 2801 }
2692 2802
2693 pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", 2803 pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
2694 current->comm, order, gfp_mask); 2804 current->comm, order, gfp_mask, &gfp_mask);
2695
2696 dump_stack(); 2805 dump_stack();
2697 if (!should_suppress_show_mem()) 2806 if (!should_suppress_show_mem())
2698 show_mem(filter); 2807 show_mem(filter);
@@ -4491,6 +4600,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4491 pg_data_t *pgdat = NODE_DATA(nid); 4600 pg_data_t *pgdat = NODE_DATA(nid);
4492 unsigned long pfn; 4601 unsigned long pfn;
4493 unsigned long nr_initialised = 0; 4602 unsigned long nr_initialised = 0;
4603#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4604 struct memblock_region *r = NULL, *tmp;
4605#endif
4494 4606
4495 if (highest_memmap_pfn < end_pfn - 1) 4607 if (highest_memmap_pfn < end_pfn - 1)
4496 highest_memmap_pfn = end_pfn - 1; 4608 highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4616,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4504 4616
4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4617 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4506 /* 4618 /*
4507 * There can be holes in boot-time mem_map[]s 4619 * There can be holes in boot-time mem_map[]s handed to this
4508 * handed to this function. They do not 4620 * function. They do not exist on hotplugged memory.
4509 * exist on hotplugged memory.
4510 */ 4621 */
4511 if (context == MEMMAP_EARLY) { 4622 if (context != MEMMAP_EARLY)
4512 if (!early_pfn_valid(pfn)) 4623 goto not_early;
4624
4625 if (!early_pfn_valid(pfn))
4626 continue;
4627 if (!early_pfn_in_nid(pfn, nid))
4628 continue;
4629 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
4630 break;
4631
4632#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4633 /*
4634 * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
4635 * from zone_movable_pfn[nid] to end of each node should be
4636 * ZONE_MOVABLE not ZONE_NORMAL. skip it.
4637 */
4638 if (!mirrored_kernelcore && zone_movable_pfn[nid])
4639 if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
4513 continue; 4640 continue;
4514 if (!early_pfn_in_nid(pfn, nid)) 4641
4642 /*
4643 * Check given memblock attribute by firmware which can affect
4644 * kernel memory layout. If zone==ZONE_MOVABLE but memory is
4645 * mirrored, it's an overlapped memmap init. skip it.
4646 */
4647 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
4648 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
4649 for_each_memblock(memory, tmp)
4650 if (pfn < memblock_region_memory_end_pfn(tmp))
4651 break;
4652 r = tmp;
4653 }
4654 if (pfn >= memblock_region_memory_base_pfn(r) &&
4655 memblock_is_mirror(r)) {
4656 /* already initialized as NORMAL */
4657 pfn = memblock_region_memory_end_pfn(r);
4515 continue; 4658 continue;
4516 if (!update_defer_init(pgdat, pfn, end_pfn, 4659 }
4517 &nr_initialised))
4518 break;
4519 } 4660 }
4661#endif
4520 4662
4663not_early:
4521 /* 4664 /*
4522 * Mark the block movable so that blocks are reserved for 4665 * Mark the block movable so that blocks are reserved for
4523 * movable at startup. This will force kernel allocations 4666 * movable at startup. This will force kernel allocations
@@ -4934,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4934 *zone_end_pfn = min(node_end_pfn, 5077 *zone_end_pfn = min(node_end_pfn,
4935 arch_zone_highest_possible_pfn[movable_zone]); 5078 arch_zone_highest_possible_pfn[movable_zone]);
4936 5079
4937 /* Adjust for ZONE_MOVABLE starting within this range */
4938 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4939 *zone_end_pfn > zone_movable_pfn[nid]) {
4940 *zone_end_pfn = zone_movable_pfn[nid];
4941
4942 /* Check if this whole range is within ZONE_MOVABLE */ 5080 /* Check if this whole range is within ZONE_MOVABLE */
4943 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5081 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4944 *zone_start_pfn = *zone_end_pfn; 5082 *zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4953 unsigned long zone_type, 5091 unsigned long zone_type,
4954 unsigned long node_start_pfn, 5092 unsigned long node_start_pfn,
4955 unsigned long node_end_pfn, 5093 unsigned long node_end_pfn,
5094 unsigned long *zone_start_pfn,
5095 unsigned long *zone_end_pfn,
4956 unsigned long *ignored) 5096 unsigned long *ignored)
4957{ 5097{
4958 unsigned long zone_start_pfn, zone_end_pfn;
4959
4960 /* When hotadd a new node from cpu_up(), the node should be empty */ 5098 /* When hotadd a new node from cpu_up(), the node should be empty */
4961 if (!node_start_pfn && !node_end_pfn) 5099 if (!node_start_pfn && !node_end_pfn)
4962 return 0; 5100 return 0;
4963 5101
4964 /* Get the start and end of the zone */ 5102 /* Get the start and end of the zone */
4965 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5103 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4966 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5104 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4967 adjust_zone_range_for_zone_movable(nid, zone_type, 5105 adjust_zone_range_for_zone_movable(nid, zone_type,
4968 node_start_pfn, node_end_pfn, 5106 node_start_pfn, node_end_pfn,
4969 &zone_start_pfn, &zone_end_pfn); 5107 zone_start_pfn, zone_end_pfn);
4970 5108
4971 /* Check that this node has pages within the zone's required range */ 5109 /* Check that this node has pages within the zone's required range */
4972 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 5110 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
4973 return 0; 5111 return 0;
4974 5112
4975 /* Move the zone boundaries inside the node if necessary */ 5113 /* Move the zone boundaries inside the node if necessary */
4976 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 5114 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
4977 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 5115 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
4978 5116
4979 /* Return the spanned pages */ 5117 /* Return the spanned pages */
4980 return zone_end_pfn - zone_start_pfn; 5118 return *zone_end_pfn - *zone_start_pfn;
4981} 5119}
4982 5120
4983/* 5121/*
@@ -5023,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5023 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5161 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5024 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5162 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5025 unsigned long zone_start_pfn, zone_end_pfn; 5163 unsigned long zone_start_pfn, zone_end_pfn;
5164 unsigned long nr_absent;
5026 5165
5027 /* When hotadd a new node from cpu_up(), the node should be empty */ 5166 /* When hotadd a new node from cpu_up(), the node should be empty */
5028 if (!node_start_pfn && !node_end_pfn) 5167 if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5034 adjust_zone_range_for_zone_movable(nid, zone_type, 5173 adjust_zone_range_for_zone_movable(nid, zone_type,
5035 node_start_pfn, node_end_pfn, 5174 node_start_pfn, node_end_pfn,
5036 &zone_start_pfn, &zone_end_pfn); 5175 &zone_start_pfn, &zone_end_pfn);
5037 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5176 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5177
5178 /*
5179 * ZONE_MOVABLE handling.
5180 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5181 * and vice versa.
5182 */
5183 if (zone_movable_pfn[nid]) {
5184 if (mirrored_kernelcore) {
5185 unsigned long start_pfn, end_pfn;
5186 struct memblock_region *r;
5187
5188 for_each_memblock(memory, r) {
5189 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5190 zone_start_pfn, zone_end_pfn);
5191 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5192 zone_start_pfn, zone_end_pfn);
5193
5194 if (zone_type == ZONE_MOVABLE &&
5195 memblock_is_mirror(r))
5196 nr_absent += end_pfn - start_pfn;
5197
5198 if (zone_type == ZONE_NORMAL &&
5199 !memblock_is_mirror(r))
5200 nr_absent += end_pfn - start_pfn;
5201 }
5202 } else {
5203 if (zone_type == ZONE_NORMAL)
5204 nr_absent += node_end_pfn - zone_movable_pfn[nid];
5205 }
5206 }
5207
5208 return nr_absent;
5038} 5209}
5039 5210
5040#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5211#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5042 unsigned long zone_type, 5213 unsigned long zone_type,
5043 unsigned long node_start_pfn, 5214 unsigned long node_start_pfn,
5044 unsigned long node_end_pfn, 5215 unsigned long node_end_pfn,
5216 unsigned long *zone_start_pfn,
5217 unsigned long *zone_end_pfn,
5045 unsigned long *zones_size) 5218 unsigned long *zones_size)
5046{ 5219{
5220 unsigned int zone;
5221
5222 *zone_start_pfn = node_start_pfn;
5223 for (zone = 0; zone < zone_type; zone++)
5224 *zone_start_pfn += zones_size[zone];
5225
5226 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5227
5047 return zones_size[zone_type]; 5228 return zones_size[zone_type];
5048} 5229}
5049 5230
@@ -5072,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5072 5253
5073 for (i = 0; i < MAX_NR_ZONES; i++) { 5254 for (i = 0; i < MAX_NR_ZONES; i++) {
5074 struct zone *zone = pgdat->node_zones + i; 5255 struct zone *zone = pgdat->node_zones + i;
5256 unsigned long zone_start_pfn, zone_end_pfn;
5075 unsigned long size, real_size; 5257 unsigned long size, real_size;
5076 5258
5077 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5259 size = zone_spanned_pages_in_node(pgdat->node_id, i,
5078 node_start_pfn, 5260 node_start_pfn,
5079 node_end_pfn, 5261 node_end_pfn,
5262 &zone_start_pfn,
5263 &zone_end_pfn,
5080 zones_size); 5264 zones_size);
5081 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5265 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5082 node_start_pfn, node_end_pfn, 5266 node_start_pfn, node_end_pfn,
5083 zholes_size); 5267 zholes_size);
5268 if (size)
5269 zone->zone_start_pfn = zone_start_pfn;
5270 else
5271 zone->zone_start_pfn = 0;
5084 zone->spanned_pages = size; 5272 zone->spanned_pages = size;
5085 zone->present_pages = real_size; 5273 zone->present_pages = real_size;
5086 5274
@@ -5201,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5201{ 5389{
5202 enum zone_type j; 5390 enum zone_type j;
5203 int nid = pgdat->node_id; 5391 int nid = pgdat->node_id;
5204 unsigned long zone_start_pfn = pgdat->node_start_pfn;
5205 int ret; 5392 int ret;
5206 5393
5207 pgdat_resize_init(pgdat); 5394 pgdat_resize_init(pgdat);
@@ -5222,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5222 for (j = 0; j < MAX_NR_ZONES; j++) { 5409 for (j = 0; j < MAX_NR_ZONES; j++) {
5223 struct zone *zone = pgdat->node_zones + j; 5410 struct zone *zone = pgdat->node_zones + j;
5224 unsigned long size, realsize, freesize, memmap_pages; 5411 unsigned long size, realsize, freesize, memmap_pages;
5412 unsigned long zone_start_pfn = zone->zone_start_pfn;
5225 5413
5226 size = zone->spanned_pages; 5414 size = zone->spanned_pages;
5227 realsize = freesize = zone->present_pages; 5415 realsize = freesize = zone->present_pages;
@@ -5290,7 +5478,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5290 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 5478 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
5291 BUG_ON(ret); 5479 BUG_ON(ret);
5292 memmap_init(size, nid, j, zone_start_pfn); 5480 memmap_init(size, nid, j, zone_start_pfn);
5293 zone_start_pfn += size;
5294 } 5481 }
5295} 5482}
5296 5483
@@ -5358,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5358 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5545 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5359 (u64)start_pfn << PAGE_SHIFT, 5546 (u64)start_pfn << PAGE_SHIFT,
5360 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 5547 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5548#else
5549 start_pfn = node_start_pfn;
5361#endif 5550#endif
5362 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5551 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5363 zones_size, zholes_size); 5552 zones_size, zholes_size);
@@ -5529,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5529 } 5718 }
5530 5719
5531 /* 5720 /*
5721 * If kernelcore=mirror is specified, ignore movablecore option
5722 */
5723 if (mirrored_kernelcore) {
5724 bool mem_below_4gb_not_mirrored = false;
5725
5726 for_each_memblock(memory, r) {
5727 if (memblock_is_mirror(r))
5728 continue;
5729
5730 nid = r->nid;
5731
5732 usable_startpfn = memblock_region_memory_base_pfn(r);
5733
5734 if (usable_startpfn < 0x100000) {
5735 mem_below_4gb_not_mirrored = true;
5736 continue;
5737 }
5738
5739 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5740 min(usable_startpfn, zone_movable_pfn[nid]) :
5741 usable_startpfn;
5742 }
5743
5744 if (mem_below_4gb_not_mirrored)
5745 pr_warn("This configuration results in unmirrored kernel memory.");
5746
5747 goto out2;
5748 }
5749
5750 /*
5532 * If movablecore=nn[KMG] was specified, calculate what size of 5751 * If movablecore=nn[KMG] was specified, calculate what size of
5533 * kernelcore that corresponds so that memory usable for 5752 * kernelcore that corresponds so that memory usable for
5534 * any allocation type is evenly spread. If both kernelcore 5753 * any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
5788 */ 6007 */
5789static int __init cmdline_parse_kernelcore(char *p) 6008static int __init cmdline_parse_kernelcore(char *p)
5790{ 6009{
6010 /* parse kernelcore=mirror */
6011 if (parse_option_str(p, "mirror")) {
6012 mirrored_kernelcore = true;
6013 return 0;
6014 }
6015
5791 return cmdline_parse_core(p, &required_kernelcore); 6016 return cmdline_parse_core(p, &required_kernelcore);
5792} 6017}
5793 6018
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8debd..2d864e64f7fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
106 struct page_ext *base; 106 struct page_ext *base;
107 107
108 base = NODE_DATA(page_to_nid(page))->node_page_ext; 108 base = NODE_DATA(page_to_nid(page))->node_page_ext;
109#ifdef CONFIG_DEBUG_VM 109#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
110 /* 110 /*
111 * The sanity checks the page allocator does upon freeing a 111 * The sanity checks the page allocator does upon freeing a
112 * page can reach here before the page_ext arrays are 112 * page can reach here before the page_ext arrays are
113 * allocated when feeding a range of pages to the allocator 113 * allocated when feeding a range of pages to the allocator
114 * for the first time during bootup or memory hotplug. 114 * for the first time during bootup or memory hotplug.
115 *
116 * This check is also necessary for ensuring page poisoning
117 * works as expected when enabled
115 */ 118 */
116 if (unlikely(!base)) 119 if (unlikely(!base))
117 return NULL; 120 return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
180{ 183{
181 unsigned long pfn = page_to_pfn(page); 184 unsigned long pfn = page_to_pfn(page);
182 struct mem_section *section = __pfn_to_section(pfn); 185 struct mem_section *section = __pfn_to_section(pfn);
183#ifdef CONFIG_DEBUG_VM 186#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
184 /* 187 /*
185 * The sanity checks the page allocator does upon freeing a 188 * The sanity checks the page allocator does upon freeing a
186 * page can reach here before the page_ext arrays are 189 * page can reach here before the page_ext arrays are
187 * allocated when feeding a range of pages to the allocator 190 * allocated when feeding a range of pages to the allocator
188 * for the first time during bootup or memory hotplug. 191 * for the first time during bootup or memory hotplug.
192 *
193 * This check is also necessary for ensuring page poisoning
194 * works as expected when enabled
189 */ 195 */
190 if (!section->page_ext) 196 if (!section->page_ext)
191 return NULL; 197 return NULL;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..44ad1f00c4e1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
5#include <linux/bootmem.h> 5#include <linux/bootmem.h>
6#include <linux/stacktrace.h> 6#include <linux/stacktrace.h>
7#include <linux/page_owner.h> 7#include <linux/page_owner.h>
8#include <linux/jump_label.h>
9#include <linux/migrate.h>
8#include "internal.h" 10#include "internal.h"
9 11
10static bool page_owner_disabled = true; 12static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly; 13DEFINE_STATIC_KEY_FALSE(page_owner_inited);
12 14
13static void init_early_allocated_pages(void); 15static void init_early_allocated_pages(void);
14 16
@@ -37,7 +39,7 @@ static void init_page_owner(void)
37 if (page_owner_disabled) 39 if (page_owner_disabled)
38 return; 40 return;
39 41
40 page_owner_inited = true; 42 static_branch_enable(&page_owner_inited);
41 init_early_allocated_pages(); 43 init_early_allocated_pages();
42} 44}
43 45
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
72 page_ext->order = order; 74 page_ext->order = order;
73 page_ext->gfp_mask = gfp_mask; 75 page_ext->gfp_mask = gfp_mask;
74 page_ext->nr_entries = trace.nr_entries; 76 page_ext->nr_entries = trace.nr_entries;
77 page_ext->last_migrate_reason = -1;
75 78
76 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 79 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
77} 80}
78 81
82void __set_page_owner_migrate_reason(struct page *page, int reason)
83{
84 struct page_ext *page_ext = lookup_page_ext(page);
85
86 page_ext->last_migrate_reason = reason;
87}
88
79gfp_t __get_page_owner_gfp(struct page *page) 89gfp_t __get_page_owner_gfp(struct page *page)
80{ 90{
81 struct page_ext *page_ext = lookup_page_ext(page); 91 struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
83 return page_ext->gfp_mask; 93 return page_ext->gfp_mask;
84} 94}
85 95
96void __copy_page_owner(struct page *oldpage, struct page *newpage)
97{
98 struct page_ext *old_ext = lookup_page_ext(oldpage);
99 struct page_ext *new_ext = lookup_page_ext(newpage);
100 int i;
101
102 new_ext->order = old_ext->order;
103 new_ext->gfp_mask = old_ext->gfp_mask;
104 new_ext->nr_entries = old_ext->nr_entries;
105
106 for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
107 new_ext->trace_entries[i] = old_ext->trace_entries[i];
108
109 /*
110 * We don't clear the bit on the oldpage as it's going to be freed
111 * after migration. Until then, the info can be useful in case of
 112	 * a bug, and the overall stats will be off a bit only temporarily.
113 * Also, migrate_misplaced_transhuge_page() can still fail the
114 * migration and then we want the oldpage to retain the info. But
115 * in that case we also don't need to explicitly clear the info from
116 * the new page, which will be freed.
117 */
118 __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
119}
120
86static ssize_t 121static ssize_t
87print_page_owner(char __user *buf, size_t count, unsigned long pfn, 122print_page_owner(char __user *buf, size_t count, unsigned long pfn,
88 struct page *page, struct page_ext *page_ext) 123 struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
100 return -ENOMEM; 135 return -ENOMEM;
101 136
102 ret = snprintf(kbuf, count, 137 ret = snprintf(kbuf, count,
103 "Page allocated via order %u, mask 0x%x\n", 138 "Page allocated via order %u, mask %#x(%pGg)\n",
104 page_ext->order, page_ext->gfp_mask); 139 page_ext->order, page_ext->gfp_mask,
140 &page_ext->gfp_mask);
105 141
106 if (ret >= count) 142 if (ret >= count)
107 goto err; 143 goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
110 pageblock_mt = get_pfnblock_migratetype(page, pfn); 146 pageblock_mt = get_pfnblock_migratetype(page, pfn);
111 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); 147 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
112 ret += snprintf(kbuf + ret, count - ret, 148 ret += snprintf(kbuf + ret, count - ret,
113 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", 149 "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
114 pfn, 150 pfn,
151 migratetype_names[page_mt],
115 pfn >> pageblock_order, 152 pfn >> pageblock_order,
116 pageblock_mt, 153 migratetype_names[pageblock_mt],
117 pageblock_mt != page_mt ? "Fallback" : " ", 154 page->flags, &page->flags);
118 PageLocked(page) ? "K" : " ",
119 PageError(page) ? "E" : " ",
120 PageReferenced(page) ? "R" : " ",
121 PageUptodate(page) ? "U" : " ",
122 PageDirty(page) ? "D" : " ",
123 PageLRU(page) ? "L" : " ",
124 PageActive(page) ? "A" : " ",
125 PageSlab(page) ? "S" : " ",
126 PageWriteback(page) ? "W" : " ",
127 PageCompound(page) ? "C" : " ",
128 PageSwapCache(page) ? "B" : " ",
129 PageMappedToDisk(page) ? "M" : " ");
130 155
131 if (ret >= count) 156 if (ret >= count)
132 goto err; 157 goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
135 if (ret >= count) 160 if (ret >= count)
136 goto err; 161 goto err;
137 162
163 if (page_ext->last_migrate_reason != -1) {
164 ret += snprintf(kbuf + ret, count - ret,
165 "Page has been migrated, last migrate reason: %s\n",
166 migrate_reason_names[page_ext->last_migrate_reason]);
167 if (ret >= count)
168 goto err;
169 }
170
138 ret += snprintf(kbuf + ret, count - ret, "\n"); 171 ret += snprintf(kbuf + ret, count - ret, "\n");
139 if (ret >= count) 172 if (ret >= count)
140 goto err; 173 goto err;
@@ -150,6 +183,31 @@ err:
150 return -ENOMEM; 183 return -ENOMEM;
151} 184}
152 185
186void __dump_page_owner(struct page *page)
187{
188 struct page_ext *page_ext = lookup_page_ext(page);
189 struct stack_trace trace = {
190 .nr_entries = page_ext->nr_entries,
191 .entries = &page_ext->trace_entries[0],
192 };
193 gfp_t gfp_mask = page_ext->gfp_mask;
194 int mt = gfpflags_to_migratetype(gfp_mask);
195
196 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
197 pr_alert("page_owner info is not active (free page?)\n");
198 return;
199 }
200
201 pr_alert("page allocated via order %u, migratetype %s, "
202 "gfp_mask %#x(%pGg)\n", page_ext->order,
203 migratetype_names[mt], gfp_mask, &gfp_mask);
204 print_stack_trace(&trace, 0);
205
206 if (page_ext->last_migrate_reason != -1)
207 pr_alert("page has been migrated, last migrate reason: %s\n",
208 migrate_reason_names[page_ext->last_migrate_reason]);
209}
210
153static ssize_t 211static ssize_t
154read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) 212read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
155{ 213{
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
157 struct page *page; 215 struct page *page;
158 struct page_ext *page_ext; 216 struct page_ext *page_ext;
159 217
160 if (!page_owner_inited) 218 if (!static_branch_unlikely(&page_owner_inited))
161 return -EINVAL; 219 return -EINVAL;
162 220
163 page = NULL; 221 page = NULL;
@@ -305,7 +363,7 @@ static int __init pageowner_init(void)
305{ 363{
306 struct dentry *dentry; 364 struct dentry *dentry;
307 365
308 if (!page_owner_inited) { 366 if (!static_branch_unlikely(&page_owner_inited)) {
309 pr_info("page_owner is disabled\n"); 367 pr_info("page_owner is disabled\n");
310 return 0; 368 return 0;
311 } 369 }
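
For reference, the records printed by print_page_owner() are consumed from user space through the page_owner debugfs file. A minimal sketch of such a reader follows; it assumes CONFIG_PAGE_OWNER=y, a kernel booted with page_owner=on, debugfs mounted at /sys/kernel/debug, and root privileges, none of which are part of this patch.

/* Minimal sketch: stream page_owner records to stdout. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/page_owner"; /* assumed mount point */
    char buf[4096];
    size_t n;
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return EXIT_FAILURE;
    }
    /* Each record is a small text block; just pass it through. */
    while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
        fwrite(buf, 1, n, stdout);
    fclose(f);
    return 0;
}
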
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce13b..479e7ea2bea6 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
6#include <linux/poison.h> 6#include <linux/poison.h>
7#include <linux/ratelimit.h> 7#include <linux/ratelimit.h>
8 8
9static bool page_poisoning_enabled __read_mostly; 9static bool __page_poisoning_enabled __read_mostly;
10static bool want_page_poisoning __read_mostly;
10 11
11static bool need_page_poisoning(void) 12static int early_page_poison_param(char *buf)
12{ 13{
13 if (!debug_pagealloc_enabled()) 14 if (!buf)
14 return false; 15 return -EINVAL;
16
17 if (strcmp(buf, "on") == 0)
18 want_page_poisoning = true;
19 else if (strcmp(buf, "off") == 0)
20 want_page_poisoning = false;
15 21
16 return true; 22 return 0;
23}
24early_param("page_poison", early_page_poison_param);
25
26bool page_poisoning_enabled(void)
27{
28 return __page_poisoning_enabled;
29}
30
31static bool need_page_poisoning(void)
32{
33 return want_page_poisoning;
17} 34}
18 35
19static void init_page_poisoning(void) 36static void init_page_poisoning(void)
20{ 37{
21 if (!debug_pagealloc_enabled()) 38 /*
22 return; 39 * page poisoning is debug page alloc for some arches. If either
40 * of those options are enabled, enable poisoning
41 */
42 if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
43 if (!want_page_poisoning && !debug_pagealloc_enabled())
44 return;
45 } else {
46 if (!want_page_poisoning)
47 return;
48 }
23 49
24 page_poisoning_enabled = true; 50 __page_poisoning_enabled = true;
25} 51}
26 52
27struct page_ext_operations page_poisoning_ops = { 53struct page_ext_operations page_poisoning_ops = {
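
A minimal stand-alone sketch of the enablement rule added above, with plain booleans standing in for CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC, the page_poison= boot parameter and debug_pagealloc_enabled(); the names are illustrative, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's config/boot state. */
static bool poisoning_wanted(bool arch_supports_debug_pagealloc,
                             bool want_page_poisoning,
                             bool debug_pagealloc)
{
    if (!arch_supports_debug_pagealloc) {
        /* Poisoning stands in for debug_pagealloc on these arches. */
        return want_page_poisoning || debug_pagealloc;
    }
    /* Real debug_pagealloc exists; only honour the explicit request. */
    return want_page_poisoning;
}

int main(void)
{
    printf("%d\n", poisoning_wanted(false, false, true)); /* 1 */
    printf("%d\n", poisoning_wanted(true, false, true));  /* 0 */
    return 0;
}
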
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
45 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 71 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
46} 72}
47 73
48static inline bool page_poison(struct page *page) 74bool page_is_poisoned(struct page *page)
49{ 75{
50 struct page_ext *page_ext; 76 struct page_ext *page_ext;
51 77
52 page_ext = lookup_page_ext(page); 78 page_ext = lookup_page_ext(page);
79 if (!page_ext)
80 return false;
81
53 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 82 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
54} 83}
55 84
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
83 unsigned char *start; 112 unsigned char *start;
84 unsigned char *end; 113 unsigned char *end;
85 114
115 if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
116 return;
117
86 start = memchr_inv(mem, PAGE_POISON, bytes); 118 start = memchr_inv(mem, PAGE_POISON, bytes);
87 if (!start) 119 if (!start)
88 return; 120 return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
95 if (!__ratelimit(&ratelimit)) 127 if (!__ratelimit(&ratelimit))
96 return; 128 return;
97 else if (start == end && single_bit_flip(*start, PAGE_POISON)) 129 else if (start == end && single_bit_flip(*start, PAGE_POISON))
98 printk(KERN_ERR "pagealloc: single bit error\n"); 130 pr_err("pagealloc: single bit error\n");
99 else 131 else
100 printk(KERN_ERR "pagealloc: memory corruption\n"); 132 pr_err("pagealloc: memory corruption\n");
101 133
102 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, 134 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
103 end - start + 1, 1); 135 end - start + 1, 1);
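
The classification done by check_poison_mem() can be reproduced in user space. A sketch follows, assuming the usual PAGE_POISON value of 0xaa from include/linux/poison.h; buffer size and the flipped bit are arbitrary.

#include <stdio.h>
#include <string.h>

#define POISON_BYTE 0xaa    /* mirrors PAGE_POISON */

/* True if got differs from expected in exactly one bit. */
static int single_bit_flip(unsigned char got, unsigned char expected)
{
    unsigned char diff = got ^ expected;

    return diff && !(diff & (diff - 1));
}

static void check_poison(const unsigned char *mem, size_t bytes)
{
    const unsigned char *start = mem, *end = mem + bytes - 1;

    while (start <= end && *start == POISON_BYTE)
        start++;
    if (start > end) {
        puts("still fully poisoned");
        return;
    }
    while (*end == POISON_BYTE)
        end--;
    if (start == end && single_bit_flip(*start, POISON_BYTE))
        puts("single bit error");
    else
        puts("memory corruption");
}

int main(void)
{
    unsigned char page[64];

    memset(page, POISON_BYTE, sizeof(page));
    page[10] ^= 0x08;       /* flip one bit */
    check_poison(page, sizeof(page));
    return 0;
}
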
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
108{ 140{
109 void *addr; 141 void *addr;
110 142
111 if (!page_poison(page)) 143 if (!page_is_poisoned(page))
112 return; 144 return;
113 145
114 addr = kmap_atomic(page); 146 addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
125 unpoison_page(page + i); 157 unpoison_page(page + i);
126} 158}
127 159
128void __kernel_map_pages(struct page *page, int numpages, int enable) 160void kernel_poison_pages(struct page *page, int numpages, int enable)
129{ 161{
130 if (!page_poisoning_enabled) 162 if (!page_poisoning_enabled())
131 return; 163 return;
132 164
133 if (enable) 165 if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
135 else 167 else
136 poison_pages(page, numpages); 168 poison_pages(page, numpages);
137} 169}
170
171#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
172void __kernel_map_pages(struct page *page, int numpages, int enable)
173{
174 /* This function does nothing, all work is done via poison pages */
175}
176#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047f38..02f0bfc3c80a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page,
1287 */ 1287 */
1288void page_add_file_rmap(struct page *page) 1288void page_add_file_rmap(struct page *page)
1289{ 1289{
1290 struct mem_cgroup *memcg; 1290 lock_page_memcg(page);
1291
1292 memcg = mem_cgroup_begin_page_stat(page);
1293 if (atomic_inc_and_test(&page->_mapcount)) { 1291 if (atomic_inc_and_test(&page->_mapcount)) {
1294 __inc_zone_page_state(page, NR_FILE_MAPPED); 1292 __inc_zone_page_state(page, NR_FILE_MAPPED);
1295 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1293 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1296 } 1294 }
1297 mem_cgroup_end_page_stat(memcg); 1295 unlock_page_memcg(page);
1298} 1296}
1299 1297
1300static void page_remove_file_rmap(struct page *page) 1298static void page_remove_file_rmap(struct page *page)
1301{ 1299{
1302 struct mem_cgroup *memcg; 1300 lock_page_memcg(page);
1303
1304 memcg = mem_cgroup_begin_page_stat(page);
1305 1301
1306 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1302 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1307 if (unlikely(PageHuge(page))) { 1303 if (unlikely(PageHuge(page))) {
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page)
1320 * pte lock(a spinlock) is held, which implies preemption disabled. 1316 * pte lock(a spinlock) is held, which implies preemption disabled.
1321 */ 1317 */
1322 __dec_zone_page_state(page, NR_FILE_MAPPED); 1318 __dec_zone_page_state(page, NR_FILE_MAPPED);
1323 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1319 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1324 1320
1325 if (unlikely(PageMlocked(page))) 1321 if (unlikely(PageMlocked(page)))
1326 clear_page_mlock(page); 1322 clear_page_mlock(page);
1327out: 1323out:
1328 mem_cgroup_end_page_stat(memcg); 1324 unlock_page_memcg(page);
1329} 1325}
1330 1326
1331static void page_remove_anon_compound_rmap(struct page *page) 1327static void page_remove_anon_compound_rmap(struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7e6c1c..1acfdbc4bd9e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1116 */ 1116 */
1117 oldpage = newpage; 1117 oldpage = newpage;
1118 } else { 1118 } else {
1119 mem_cgroup_replace_page(oldpage, newpage); 1119 mem_cgroup_migrate(oldpage, newpage);
1120 lru_cache_add_anon(newpage); 1120 lru_cache_add_anon(newpage);
1121 *pagep = newpage; 1121 *pagep = newpage;
1122 } 1122 }
diff --git a/mm/slab.c b/mm/slab.c
index 621fbcb35a36..852fc5c79829 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
170 170
171/* 171/*
172 * true if a page was allocated from pfmemalloc reserves for network-based
173 * swap
174 */
175static bool pfmemalloc_active __read_mostly;
176
177/*
178 * struct array_cache 172 * struct array_cache
179 * 173 *
180 * Purpose: 174 * Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
195 * Must have this definition in here for the proper 189 * Must have this definition in here for the proper
196 * alignment of array_cache. Also simplifies accessing 190 * alignment of array_cache. Also simplifies accessing
197 * the entries. 191 * the entries.
198 *
199 * Entries should not be directly dereferenced as
200 * entries belonging to slabs marked pfmemalloc will
201 * have the lower bits set SLAB_OBJ_PFMEMALLOC
202 */ 192 */
203}; 193};
204 194
@@ -207,33 +197,6 @@ struct alien_cache {
207 struct array_cache ac; 197 struct array_cache ac;
208}; 198};
209 199
210#define SLAB_OBJ_PFMEMALLOC 1
211static inline bool is_obj_pfmemalloc(void *objp)
212{
213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
214}
215
216static inline void set_obj_pfmemalloc(void **objp)
217{
218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
219 return;
220}
221
222static inline void clear_obj_pfmemalloc(void **objp)
223{
224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
225}
226
227/*
228 * bootstrap: The caches do not work without cpuarrays anymore, but the
229 * cpuarrays are allocated from the generic caches...
230 */
231#define BOOT_CPUCACHE_ENTRIES 1
232struct arraycache_init {
233 struct array_cache cache;
234 void *entries[BOOT_CPUCACHE_ENTRIES];
235};
236
237/* 200/*
238 * Need this for bootstrapping a per node allocator. 201 * Need this for bootstrapping a per node allocator.
239 */ 202 */
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
280 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 243 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
281 } while (0) 244 } while (0)
282 245
246#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
283#define CFLGS_OFF_SLAB (0x80000000UL) 247#define CFLGS_OFF_SLAB (0x80000000UL)
248#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
284#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 249#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
285#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
286 250
287#define BATCHREFILL_LIMIT 16 251#define BATCHREFILL_LIMIT 16
288/* 252/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
390 354
391#endif 355#endif
392 356
393#define OBJECT_FREE (0)
394#define OBJECT_ACTIVE (1)
395
396#ifdef CONFIG_DEBUG_SLAB_LEAK 357#ifdef CONFIG_DEBUG_SLAB_LEAK
397 358
398static void set_obj_status(struct page *page, int idx, int val) 359static inline bool is_store_user_clean(struct kmem_cache *cachep)
399{ 360{
400 int freelist_size; 361 return atomic_read(&cachep->store_user_clean) == 1;
401 char *status;
402 struct kmem_cache *cachep = page->slab_cache;
403
404 freelist_size = cachep->num * sizeof(freelist_idx_t);
405 status = (char *)page->freelist + freelist_size;
406 status[idx] = val;
407} 362}
408 363
409static inline unsigned int get_obj_status(struct page *page, int idx) 364static inline void set_store_user_clean(struct kmem_cache *cachep)
410{ 365{
411 int freelist_size; 366 atomic_set(&cachep->store_user_clean, 1);
412 char *status; 367}
413 struct kmem_cache *cachep = page->slab_cache;
414
415 freelist_size = cachep->num * sizeof(freelist_idx_t);
416 status = (char *)page->freelist + freelist_size;
417 368
418 return status[idx]; 369static inline void set_store_user_dirty(struct kmem_cache *cachep)
370{
371 if (is_store_user_clean(cachep))
372 atomic_set(&cachep->store_user_clean, 0);
419} 373}
420 374
421#else 375#else
422static inline void set_obj_status(struct page *page, int idx, int val) {} 376static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
423 377
424#endif 378#endif
425 379
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
457 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 411 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
458} 412}
459 413
414#define BOOT_CPUCACHE_ENTRIES 1
460/* internal cache of cache description objs */ 415/* internal cache of cache description objs */
461static struct kmem_cache kmem_cache_boot = { 416static struct kmem_cache kmem_cache_boot = {
462 .batchcount = 1, 417 .batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
475 return this_cpu_ptr(cachep->cpu_cache); 430 return this_cpu_ptr(cachep->cpu_cache);
476} 431}
477 432
478static size_t calculate_freelist_size(int nr_objs, size_t align)
479{
480 size_t freelist_size;
481
482 freelist_size = nr_objs * sizeof(freelist_idx_t);
483 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
484 freelist_size += nr_objs * sizeof(char);
485
486 if (align)
487 freelist_size = ALIGN(freelist_size, align);
488
489 return freelist_size;
490}
491
492static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
493 size_t idx_size, size_t align)
494{
495 int nr_objs;
496 size_t remained_size;
497 size_t freelist_size;
498 int extra_space = 0;
499
500 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
501 extra_space = sizeof(char);
502 /*
503 * Ignore padding for the initial guess. The padding
504 * is at most @align-1 bytes, and @buffer_size is at
505 * least @align. In the worst case, this result will
506 * be one greater than the number of objects that fit
507 * into the memory allocation when taking the padding
508 * into account.
509 */
510 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
511
512 /*
513 * This calculated number will be either the right
514 * amount, or one greater than what we want.
515 */
516 remained_size = slab_size - nr_objs * buffer_size;
517 freelist_size = calculate_freelist_size(nr_objs, align);
518 if (remained_size < freelist_size)
519 nr_objs--;
520
521 return nr_objs;
522}
523
524/* 433/*
525 * Calculate the number of objects and left-over bytes for a given buffer size. 434 * Calculate the number of objects and left-over bytes for a given buffer size.
526 */ 435 */
527static void cache_estimate(unsigned long gfporder, size_t buffer_size, 436static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
528 size_t align, int flags, size_t *left_over, 437 unsigned long flags, size_t *left_over)
529 unsigned int *num)
530{ 438{
531 int nr_objs; 439 unsigned int num;
532 size_t mgmt_size;
533 size_t slab_size = PAGE_SIZE << gfporder; 440 size_t slab_size = PAGE_SIZE << gfporder;
534 441
535 /* 442 /*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
537 * on it. For the latter case, the memory allocated for a 444 * on it. For the latter case, the memory allocated for a
538 * slab is used for: 445 * slab is used for:
539 * 446 *
540 * - One unsigned int for each object
541 * - Padding to respect alignment of @align
542 * - @buffer_size bytes for each object 447 * - @buffer_size bytes for each object
448 * - One freelist_idx_t for each object
449 *
450 * We don't need to consider alignment of freelist because
451 * freelist will be at the end of slab page. The objects will be
452 * at the correct alignment.
543 * 453 *
544 * If the slab management structure is off the slab, then the 454 * If the slab management structure is off the slab, then the
545 * alignment will already be calculated into the size. Because 455 * alignment will already be calculated into the size. Because
546 * the slabs are all pages aligned, the objects will be at the 456 * the slabs are all pages aligned, the objects will be at the
547 * correct alignment when allocated. 457 * correct alignment when allocated.
548 */ 458 */
549 if (flags & CFLGS_OFF_SLAB) { 459 if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
550 mgmt_size = 0; 460 num = slab_size / buffer_size;
551 nr_objs = slab_size / buffer_size; 461 *left_over = slab_size % buffer_size;
552
553 } else { 462 } else {
554 nr_objs = calculate_nr_objs(slab_size, buffer_size, 463 num = slab_size / (buffer_size + sizeof(freelist_idx_t));
555 sizeof(freelist_idx_t), align); 464 *left_over = slab_size %
556 mgmt_size = calculate_freelist_size(nr_objs, align); 465 (buffer_size + sizeof(freelist_idx_t));
557 } 466 }
558 *num = nr_objs; 467
559 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 468 return num;
560} 469}
561 470
562#if DEBUG 471#if DEBUG
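
A stand-alone sketch of the simplified cache_estimate() arithmetic, assuming a 4 KiB page and a two-byte freelist_idx_t (the type is unsigned short unless the byte-sized index is in use); the numbers in main() are arbitrary.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL    /* assumption for the example */
typedef unsigned short freelist_idx_t;

/* Mirrors the reworked cache_estimate(): one index per object for
 * on-slab freelists, plain division for off-slab/objfreelist. */
static unsigned int estimate(unsigned long gfporder, size_t buffer_size,
                             int freelist_off_slab, size_t *left_over)
{
    size_t slab_size = PAGE_SIZE << gfporder;
    unsigned int num;

    if (freelist_off_slab) {
        num = slab_size / buffer_size;
        *left_over = slab_size % buffer_size;
    } else {
        num = slab_size / (buffer_size + sizeof(freelist_idx_t));
        *left_over = slab_size % (buffer_size + sizeof(freelist_idx_t));
    }
    return num;
}

int main(void)
{
    size_t left;
    unsigned int num = estimate(0, 256, 0, &left);

    printf("order 0, 256-byte objects: %u objects, %zu bytes left\n",
           num, left);
    return 0;
}
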
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
687 return ac; 596 return ac;
688} 597}
689 598
690static inline bool is_slab_pfmemalloc(struct page *page) 599static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
691{ 600 struct page *page, void *objp)
692 return PageSlabPfmemalloc(page);
693}
694
695/* Clears pfmemalloc_active if no slabs have pfmalloc set */
696static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
697 struct array_cache *ac)
698{
699 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
700 struct page *page;
701 unsigned long flags;
702
703 if (!pfmemalloc_active)
704 return;
705
706 spin_lock_irqsave(&n->list_lock, flags);
707 list_for_each_entry(page, &n->slabs_full, lru)
708 if (is_slab_pfmemalloc(page))
709 goto out;
710
711 list_for_each_entry(page, &n->slabs_partial, lru)
712 if (is_slab_pfmemalloc(page))
713 goto out;
714
715 list_for_each_entry(page, &n->slabs_free, lru)
716 if (is_slab_pfmemalloc(page))
717 goto out;
718
719 pfmemalloc_active = false;
720out:
721 spin_unlock_irqrestore(&n->list_lock, flags);
722}
723
724static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
725 gfp_t flags, bool force_refill)
726{ 601{
727 int i; 602 struct kmem_cache_node *n;
728 void *objp = ac->entry[--ac->avail]; 603 int page_node;
729 604 LIST_HEAD(list);
730 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
731 if (unlikely(is_obj_pfmemalloc(objp))) {
732 struct kmem_cache_node *n;
733
734 if (gfp_pfmemalloc_allowed(flags)) {
735 clear_obj_pfmemalloc(&objp);
736 return objp;
737 }
738
739 /* The caller cannot use PFMEMALLOC objects, find another one */
740 for (i = 0; i < ac->avail; i++) {
741 /* If a !PFMEMALLOC object is found, swap them */
742 if (!is_obj_pfmemalloc(ac->entry[i])) {
743 objp = ac->entry[i];
744 ac->entry[i] = ac->entry[ac->avail];
745 ac->entry[ac->avail] = objp;
746 return objp;
747 }
748 }
749
750 /*
751 * If there are empty slabs on the slabs_free list and we are
752 * being forced to refill the cache, mark this one !pfmemalloc.
753 */
754 n = get_node(cachep, numa_mem_id());
755 if (!list_empty(&n->slabs_free) && force_refill) {
756 struct page *page = virt_to_head_page(objp);
757 ClearPageSlabPfmemalloc(page);
758 clear_obj_pfmemalloc(&objp);
759 recheck_pfmemalloc_active(cachep, ac);
760 return objp;
761 }
762
763 /* No !PFMEMALLOC objects available */
764 ac->avail++;
765 objp = NULL;
766 }
767
768 return objp;
769}
770
771static inline void *ac_get_obj(struct kmem_cache *cachep,
772 struct array_cache *ac, gfp_t flags, bool force_refill)
773{
774 void *objp;
775
776 if (unlikely(sk_memalloc_socks()))
777 objp = __ac_get_obj(cachep, ac, flags, force_refill);
778 else
779 objp = ac->entry[--ac->avail];
780
781 return objp;
782}
783
784static noinline void *__ac_put_obj(struct kmem_cache *cachep,
785 struct array_cache *ac, void *objp)
786{
787 if (unlikely(pfmemalloc_active)) {
788 /* Some pfmemalloc slabs exist, check if this is one */
789 struct page *page = virt_to_head_page(objp);
790 if (PageSlabPfmemalloc(page))
791 set_obj_pfmemalloc(&objp);
792 }
793 605
794 return objp; 606 page_node = page_to_nid(page);
795} 607 n = get_node(cachep, page_node);
796 608
797static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 609 spin_lock(&n->list_lock);
798 void *objp) 610 free_block(cachep, &objp, 1, page_node, &list);
799{ 611 spin_unlock(&n->list_lock);
800 if (unlikely(sk_memalloc_socks()))
801 objp = __ac_put_obj(cachep, ac, objp);
802 612
803 ac->entry[ac->avail++] = objp; 613 slabs_destroy(cachep, &list);
804} 614}
805 615
806/* 616/*
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
1003 STATS_INC_ACOVERFLOW(cachep); 813 STATS_INC_ACOVERFLOW(cachep);
1004 __drain_alien_cache(cachep, ac, page_node, &list); 814 __drain_alien_cache(cachep, ac, page_node, &list);
1005 } 815 }
1006 ac_put_obj(cachep, ac, objp); 816 ac->entry[ac->avail++] = objp;
1007 spin_unlock(&alien->lock); 817 spin_unlock(&alien->lock);
1008 slabs_destroy(cachep, &list); 818 slabs_destroy(cachep, &list);
1009 } else { 819 } else {
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1540 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1350 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1541 return; 1351 return;
1542 1352
1543 printk(KERN_WARNING 1353 pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
1544 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1354 nodeid, gfpflags, &gfpflags);
1545 nodeid, gfpflags); 1355 pr_warn(" cache: %s, object size: %d, order: %d\n",
1546 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1547 cachep->name, cachep->size, cachep->gfporder); 1356 cachep->name, cachep->size, cachep->gfporder);
1548 1357
1549 for_each_kmem_cache_node(cachep, node, n) { 1358 for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1567 1376
1568 num_slabs += active_slabs; 1377 num_slabs += active_slabs;
1569 num_objs = num_slabs * cachep->num; 1378 num_objs = num_slabs * cachep->num;
1570 printk(KERN_WARNING 1379 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1571 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1572 node, active_slabs, num_slabs, active_objs, num_objs, 1380 node, active_slabs, num_slabs, active_objs, num_objs,
1573 free_objects); 1381 free_objects);
1574 } 1382 }
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1604 return NULL; 1412 return NULL;
1605 } 1413 }
1606 1414
1607 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1608 if (page_is_pfmemalloc(page))
1609 pfmemalloc_active = true;
1610
1611 nr_pages = (1 << cachep->gfporder); 1415 nr_pages = (1 << cachep->gfporder);
1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1416 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 add_zone_page_state(page_zone(page), 1417 add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1615 else 1419 else
1616 add_zone_page_state(page_zone(page), 1420 add_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_pages); 1421 NR_SLAB_UNRECLAIMABLE, nr_pages);
1422
1618 __SetPageSlab(page); 1423 __SetPageSlab(page);
1619 if (page_is_pfmemalloc(page)) 1424 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1425 if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1620 SetPageSlabPfmemalloc(page); 1426 SetPageSlabPfmemalloc(page);
1621 1427
1622 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1428 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head)
1670} 1476}
1671 1477
1672#if DEBUG 1478#if DEBUG
1479static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1480{
1481 if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
1482 (cachep->size % PAGE_SIZE) == 0)
1483 return true;
1484
1485 return false;
1486}
1673 1487
1674#ifdef CONFIG_DEBUG_PAGEALLOC 1488#ifdef CONFIG_DEBUG_PAGEALLOC
1675static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1489static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1703 } 1517 }
1704 *addr++ = 0x87654321; 1518 *addr++ = 0x87654321;
1705} 1519}
1520
1521static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1522 int map, unsigned long caller)
1523{
1524 if (!is_debug_pagealloc_cache(cachep))
1525 return;
1526
1527 if (caller)
1528 store_stackinfo(cachep, objp, caller);
1529
1530 kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
1531}
1532
1533#else
1534static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1535 int map, unsigned long caller) {}
1536
1706#endif 1537#endif
1707 1538
1708static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1539static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1781 int size, i; 1612 int size, i;
1782 int lines = 0; 1613 int lines = 0;
1783 1614
1615 if (is_debug_pagealloc_cache(cachep))
1616 return;
1617
1784 realobj = (char *)objp + obj_offset(cachep); 1618 realobj = (char *)objp + obj_offset(cachep);
1785 size = cachep->object_size; 1619 size = cachep->object_size;
1786 1620
@@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1842 struct page *page) 1676 struct page *page)
1843{ 1677{
1844 int i; 1678 int i;
1679
1680 if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
1681 poison_obj(cachep, page->freelist - obj_offset(cachep),
1682 POISON_FREE);
1683 }
1684
1845 for (i = 0; i < cachep->num; i++) { 1685 for (i = 0; i < cachep->num; i++) {
1846 void *objp = index_to_obj(cachep, page, i); 1686 void *objp = index_to_obj(cachep, page, i);
1847 1687
1848 if (cachep->flags & SLAB_POISON) { 1688 if (cachep->flags & SLAB_POISON) {
1849#ifdef CONFIG_DEBUG_PAGEALLOC
1850 if (cachep->size % PAGE_SIZE == 0 &&
1851 OFF_SLAB(cachep))
1852 kernel_map_pages(virt_to_page(objp),
1853 cachep->size / PAGE_SIZE, 1);
1854 else
1855 check_poison_obj(cachep, objp);
1856#else
1857 check_poison_obj(cachep, objp); 1689 check_poison_obj(cachep, objp);
1858#endif 1690 slab_kernel_map(cachep, objp, 1, 0);
1859 } 1691 }
1860 if (cachep->flags & SLAB_RED_ZONE) { 1692 if (cachep->flags & SLAB_RED_ZONE) {
1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1693 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1916,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1916 * calculate_slab_order - calculate size (page order) of slabs 1748 * calculate_slab_order - calculate size (page order) of slabs
1917 * @cachep: pointer to the cache that is being created 1749 * @cachep: pointer to the cache that is being created
1918 * @size: size of objects to be created in this cache. 1750 * @size: size of objects to be created in this cache.
1919 * @align: required alignment for the objects.
1920 * @flags: slab allocation flags 1751 * @flags: slab allocation flags
1921 * 1752 *
1922 * Also calculates the number of objects per slab. 1753 * Also calculates the number of objects per slab.
@@ -1926,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1926 * towards high-order requests, this should be changed. 1757 * towards high-order requests, this should be changed.
1927 */ 1758 */
1928static size_t calculate_slab_order(struct kmem_cache *cachep, 1759static size_t calculate_slab_order(struct kmem_cache *cachep,
1929 size_t size, size_t align, unsigned long flags) 1760 size_t size, unsigned long flags)
1930{ 1761{
1931 unsigned long offslab_limit;
1932 size_t left_over = 0; 1762 size_t left_over = 0;
1933 int gfporder; 1763 int gfporder;
1934 1764
@@ -1936,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1936 unsigned int num; 1766 unsigned int num;
1937 size_t remainder; 1767 size_t remainder;
1938 1768
1939 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1769 num = cache_estimate(gfporder, size, flags, &remainder);
1940 if (!num) 1770 if (!num)
1941 continue; 1771 continue;
1942 1772
@@ -1945,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1945 break; 1775 break;
1946 1776
1947 if (flags & CFLGS_OFF_SLAB) { 1777 if (flags & CFLGS_OFF_SLAB) {
1948 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1778 struct kmem_cache *freelist_cache;
1779 size_t freelist_size;
1780
1781 freelist_size = num * sizeof(freelist_idx_t);
1782 freelist_cache = kmalloc_slab(freelist_size, 0u);
1783 if (!freelist_cache)
1784 continue;
1785
1949 /* 1786 /*
1950 * Max number of objs-per-slab for caches which 1787 * Needed to avoid possible looping condition
1951 * use off-slab slabs. Needed to avoid a possible 1788 * in cache_grow()
1952 * looping condition in cache_grow().
1953 */ 1789 */
1954 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 1790 if (OFF_SLAB(freelist_cache))
1955 freelist_size_per_obj += sizeof(char); 1791 continue;
1956 offslab_limit = size;
1957 offslab_limit /= freelist_size_per_obj;
1958 1792
1959 if (num > offslab_limit) 1793 /* check if off slab has enough benefit */
1960 break; 1794 if (freelist_cache->size > cachep->size / 2)
1795 continue;
1961 } 1796 }
1962 1797
1963 /* Found something acceptable - save it away */ 1798 /* Found something acceptable - save it away */
@@ -2075,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2075 return cachep; 1910 return cachep;
2076} 1911}
2077 1912
1913static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1914 size_t size, unsigned long flags)
1915{
1916 size_t left;
1917
1918 cachep->num = 0;
1919
1920 if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
1921 return false;
1922
1923 left = calculate_slab_order(cachep, size,
1924 flags | CFLGS_OBJFREELIST_SLAB);
1925 if (!cachep->num)
1926 return false;
1927
1928 if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
1929 return false;
1930
1931 cachep->colour = left / cachep->colour_off;
1932
1933 return true;
1934}
1935
1936static bool set_off_slab_cache(struct kmem_cache *cachep,
1937 size_t size, unsigned long flags)
1938{
1939 size_t left;
1940
1941 cachep->num = 0;
1942
1943 /*
1944 * Always use on-slab management when SLAB_NOLEAKTRACE
1945 * to avoid recursive calls into kmemleak.
1946 */
1947 if (flags & SLAB_NOLEAKTRACE)
1948 return false;
1949
1950 /*
1951 * Size is large, assume best to place the slab management obj
1952 * off-slab (should allow better packing of objs).
1953 */
1954 left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
1955 if (!cachep->num)
1956 return false;
1957
1958 /*
1959 * If the slab has been placed off-slab, and we have enough space then
1960 * move it on-slab. This is at the expense of any extra colouring.
1961 */
1962 if (left >= cachep->num * sizeof(freelist_idx_t))
1963 return false;
1964
1965 cachep->colour = left / cachep->colour_off;
1966
1967 return true;
1968}
1969
1970static bool set_on_slab_cache(struct kmem_cache *cachep,
1971 size_t size, unsigned long flags)
1972{
1973 size_t left;
1974
1975 cachep->num = 0;
1976
1977 left = calculate_slab_order(cachep, size, flags);
1978 if (!cachep->num)
1979 return false;
1980
1981 cachep->colour = left / cachep->colour_off;
1982
1983 return true;
1984}
1985
2078/** 1986/**
2079 * __kmem_cache_create - Create a cache. 1987 * __kmem_cache_create - Create a cache.
2080 * @cachep: cache management descriptor 1988 * @cachep: cache management descriptor
@@ -2099,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2099int 2007int
2100__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2008__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2101{ 2009{
2102 size_t left_over, freelist_size;
2103 size_t ralign = BYTES_PER_WORD; 2010 size_t ralign = BYTES_PER_WORD;
2104 gfp_t gfp; 2011 gfp_t gfp;
2105 int err; 2012 int err;
@@ -2119,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2119 if (!(flags & SLAB_DESTROY_BY_RCU)) 2026 if (!(flags & SLAB_DESTROY_BY_RCU))
2120 flags |= SLAB_POISON; 2027 flags |= SLAB_POISON;
2121#endif 2028#endif
2122 if (flags & SLAB_DESTROY_BY_RCU)
2123 BUG_ON(flags & SLAB_POISON);
2124#endif 2029#endif
2125 2030
2126 /* 2031 /*
@@ -2152,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2152 * 4) Store it. 2057 * 4) Store it.
2153 */ 2058 */
2154 cachep->align = ralign; 2059 cachep->align = ralign;
2060 cachep->colour_off = cache_line_size();
2061 /* Offset must be a multiple of the alignment. */
2062 if (cachep->colour_off < cachep->align)
2063 cachep->colour_off = cachep->align;
2155 2064
2156 if (slab_is_available()) 2065 if (slab_is_available())
2157 gfp = GFP_KERNEL; 2066 gfp = GFP_KERNEL;
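
The colouring arithmetic reduces to one division once colour_off is fixed. A sketch with assumed numbers (64-byte cache line, 8-byte alignment, 226 leftover bytes):

#include <stdio.h>

int main(void)
{
    unsigned int colour_off = 64;   /* assumed cache_line_size() */
    unsigned int align = 8;         /* assumed object alignment */
    unsigned int left_over = 226;   /* e.g. from the estimate above */

    /* Offset must be a multiple of the alignment. */
    if (colour_off < align)
        colour_off = align;
    /* Number of distinct colour offsets this cache can cycle through. */
    unsigned int colour = left_over / colour_off;

    printf("colour_off=%u, colours=%u\n", colour_off, colour);
    return 0;
}
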
@@ -2179,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2179 else 2088 else
2180 size += BYTES_PER_WORD; 2089 size += BYTES_PER_WORD;
2181 } 2090 }
2182#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2183 /*
2184 * To activate debug pagealloc, off-slab management is necessary
2185 * requirement. In early phase of initialization, small sized slab
2186 * doesn't get initialized so it would not be possible. So, we need
2187 * to check size >= 256. It guarantees that all necessary small
2188 * sized slab is initialized in current slab initialization sequence.
2189 */
2190 if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
2191 size >= 256 && cachep->object_size > cache_line_size() &&
2192 ALIGN(size, cachep->align) < PAGE_SIZE) {
2193 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2194 size = PAGE_SIZE;
2195 }
2196#endif
2197#endif 2091#endif
2198 2092
2199 /*
2200 * Determine if the slab management is 'on' or 'off' slab.
2201 * (bootstrapping cannot cope with offslab caches so don't do
2202 * it too early on. Always use on-slab management when
2203 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2204 */
2205 if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
2206 !(flags & SLAB_NOLEAKTRACE))
2207 /*
2208 * Size is large, assume best to place the slab management obj
2209 * off-slab (should allow better packing of objs).
2210 */
2211 flags |= CFLGS_OFF_SLAB;
2212
2213 size = ALIGN(size, cachep->align); 2093 size = ALIGN(size, cachep->align);
2214 /* 2094 /*
2215 * We should restrict the number of objects in a slab to implement 2095 * We should restrict the number of objects in a slab to implement
@@ -2218,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2218 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2098 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2219 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2099 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2220 2100
2221 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2101#if DEBUG
2222
2223 if (!cachep->num)
2224 return -E2BIG;
2225
2226 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2227
2228 /* 2102 /*
2229 * If the slab has been placed off-slab, and we have enough space then 2103 * To activate debug pagealloc, off-slab management is necessary
2230 * move it on-slab. This is at the expense of any extra colouring. 2104 * requirement. In the early phase of initialization, small sized slabs
 2105 * don't get initialized, so it would not be possible. So we need
 2106 * to check size >= 256. That guarantees that all the necessary small
 2107 * sized slabs are initialized in the current slab initialization sequence.
2231 */ 2108 */
2232 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2109 if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
2233 flags &= ~CFLGS_OFF_SLAB; 2110 size >= 256 && cachep->object_size > cache_line_size()) {
2234 left_over -= freelist_size; 2111 if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
2112 size_t tmp_size = ALIGN(size, PAGE_SIZE);
2113
2114 if (set_off_slab_cache(cachep, tmp_size, flags)) {
2115 flags |= CFLGS_OFF_SLAB;
2116 cachep->obj_offset += tmp_size - size;
2117 size = tmp_size;
2118 goto done;
2119 }
2120 }
2235 } 2121 }
2122#endif
2236 2123
2237 if (flags & CFLGS_OFF_SLAB) { 2124 if (set_objfreelist_slab_cache(cachep, size, flags)) {
2238 /* really off slab. No need for manual alignment */ 2125 flags |= CFLGS_OBJFREELIST_SLAB;
2239 freelist_size = calculate_freelist_size(cachep->num, 0); 2126 goto done;
2127 }
2240 2128
2241#ifdef CONFIG_PAGE_POISONING 2129 if (set_off_slab_cache(cachep, size, flags)) {
2242 /* If we're going to use the generic kernel_map_pages() 2130 flags |= CFLGS_OFF_SLAB;
2243 * poisoning, then it's going to smash the contents of 2131 goto done;
2244 * the redzone and userword anyhow, so switch them off.
2245 */
2246 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2247 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2248#endif
2249 } 2132 }
2250 2133
2251 cachep->colour_off = cache_line_size(); 2134 if (set_on_slab_cache(cachep, size, flags))
2252 /* Offset must be a multiple of the alignment. */ 2135 goto done;
2253 if (cachep->colour_off < cachep->align) 2136
2254 cachep->colour_off = cachep->align; 2137 return -E2BIG;
2255 cachep->colour = left_over / cachep->colour_off; 2138
2256 cachep->freelist_size = freelist_size; 2139done:
2140 cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
2257 cachep->flags = flags; 2141 cachep->flags = flags;
2258 cachep->allocflags = __GFP_COMP; 2142 cachep->allocflags = __GFP_COMP;
2259 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2143 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2261,16 +2145,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2261 cachep->size = size; 2145 cachep->size = size;
2262 cachep->reciprocal_buffer_size = reciprocal_value(size); 2146 cachep->reciprocal_buffer_size = reciprocal_value(size);
2263 2147
2264 if (flags & CFLGS_OFF_SLAB) { 2148#if DEBUG
2265 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2149 /*
2266 /* 2150 * If we're going to use the generic kernel_map_pages()
2267 * This is a possibility for one of the kmalloc_{dma,}_caches. 2151 * poisoning, then it's going to smash the contents of
2268 * But since we go off slab only for object size greater than 2152 * the redzone and userword anyhow, so switch them off.
2269 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created 2153 */
2270 * in ascending order,this should not happen at all. 2154 if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
2271 * But leave a BUG_ON for some lucky dude. 2155 (cachep->flags & SLAB_POISON) &&
2272 */ 2156 is_debug_pagealloc_cache(cachep))
2273 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2157 cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2158#endif
2159
2160 if (OFF_SLAB(cachep)) {
2161 cachep->freelist_cache =
2162 kmalloc_slab(cachep->freelist_size, 0u);
2274 } 2163 }
2275 2164
2276 err = setup_cpu_cache(cachep, gfp); 2165 err = setup_cpu_cache(cachep, gfp);
@@ -2377,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache,
2377 } 2266 }
2378 2267
2379 page = list_entry(p, struct page, lru); 2268 page = list_entry(p, struct page, lru);
2380#if DEBUG
2381 BUG_ON(page->active);
2382#endif
2383 list_del(&page->lru); 2269 list_del(&page->lru);
2384 /* 2270 /*
2385 * Safe to drop the lock. The slab is no longer linked 2271 * Safe to drop the lock. The slab is no longer linked
@@ -2454,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2454 void *freelist; 2340 void *freelist;
2455 void *addr = page_address(page); 2341 void *addr = page_address(page);
2456 2342
2457 if (OFF_SLAB(cachep)) { 2343 page->s_mem = addr + colour_off;
2344 page->active = 0;
2345
2346 if (OBJFREELIST_SLAB(cachep))
2347 freelist = NULL;
2348 else if (OFF_SLAB(cachep)) {
2458 /* Slab management obj is off-slab. */ 2349 /* Slab management obj is off-slab. */
2459 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2350 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2460 local_flags, nodeid); 2351 local_flags, nodeid);
2461 if (!freelist) 2352 if (!freelist)
2462 return NULL; 2353 return NULL;
2463 } else { 2354 } else {
2464 freelist = addr + colour_off; 2355 /* We will use the last bytes of the slab page for the freelist */
2465 colour_off += cachep->freelist_size; 2356 freelist = addr + (PAGE_SIZE << cachep->gfporder) -
2357 cachep->freelist_size;
2466 } 2358 }
2467 page->active = 0; 2359
2468 page->s_mem = addr + colour_off;
2469 return freelist; 2360 return freelist;
2470} 2361}
2471 2362
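
For the plain on-slab case, alloc_slabmgmt() now parks the freelist in the last bytes of the slab page rather than in front of the objects. A short arithmetic sketch with assumed sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL    /* assumption for the example */
typedef unsigned short freelist_idx_t;

int main(void)
{
    unsigned long gfporder = 0;     /* assumed order-0 slab */
    unsigned int num = 15;          /* e.g. 256-byte objects, see estimate above */
    unsigned long slab_bytes = PAGE_SIZE << gfporder;
    unsigned long freelist_size = num * sizeof(freelist_idx_t);

    /* On-slab case: freelist occupies the tail of the slab page,
     * objects start at s_mem near the front. */
    unsigned long freelist_off = slab_bytes - freelist_size;

    printf("freelist at offset %lu, %lu bytes\n", freelist_off, freelist_size);
    return 0;
}
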
@@ -2480,17 +2371,14 @@ static inline void set_free_obj(struct page *page,
2480 ((freelist_idx_t *)(page->freelist))[idx] = val; 2371 ((freelist_idx_t *)(page->freelist))[idx] = val;
2481} 2372}
2482 2373
2483static void cache_init_objs(struct kmem_cache *cachep, 2374static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
2484 struct page *page)
2485{ 2375{
2376#if DEBUG
2486 int i; 2377 int i;
2487 2378
2488 for (i = 0; i < cachep->num; i++) { 2379 for (i = 0; i < cachep->num; i++) {
2489 void *objp = index_to_obj(cachep, page, i); 2380 void *objp = index_to_obj(cachep, page, i);
2490#if DEBUG 2381
2491 /* need to poison the objs? */
2492 if (cachep->flags & SLAB_POISON)
2493 poison_obj(cachep, objp, POISON_FREE);
2494 if (cachep->flags & SLAB_STORE_USER) 2382 if (cachep->flags & SLAB_STORE_USER)
2495 *dbg_userword(cachep, objp) = NULL; 2383 *dbg_userword(cachep, objp) = NULL;
2496 2384
@@ -2514,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep,
2514 slab_error(cachep, "constructor overwrote the" 2402 slab_error(cachep, "constructor overwrote the"
2515 " start of an object"); 2403 " start of an object");
2516 } 2404 }
2517 if ((cachep->size % PAGE_SIZE) == 0 && 2405 /* need to poison the objs? */
2518 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2406 if (cachep->flags & SLAB_POISON) {
2519 kernel_map_pages(virt_to_page(objp), 2407 poison_obj(cachep, objp, POISON_FREE);
2520 cachep->size / PAGE_SIZE, 0); 2408 slab_kernel_map(cachep, objp, 0, 0);
2521#else 2409 }
2522 if (cachep->ctor) 2410 }
2523 cachep->ctor(objp);
2524#endif 2411#endif
2525 set_obj_status(page, i, OBJECT_FREE); 2412}
2413
2414static void cache_init_objs(struct kmem_cache *cachep,
2415 struct page *page)
2416{
2417 int i;
2418
2419 cache_init_objs_debug(cachep, page);
2420
2421 if (OBJFREELIST_SLAB(cachep)) {
2422 page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
2423 obj_offset(cachep);
2424 }
2425
2426 for (i = 0; i < cachep->num; i++) {
2427 /* constructor could break poison info */
2428 if (DEBUG == 0 && cachep->ctor)
2429 cachep->ctor(index_to_obj(cachep, page, i));
2430
2526 set_free_obj(page, i, i); 2431 set_free_obj(page, i, i);
2527 } 2432 }
2528} 2433}
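
A stand-alone simulation of the OBJFREELIST_SLAB layout set up above: the freelist index array is written into the last, still-free object instead of dedicated management space. Object size, page size and a zero obj_offset are assumptions for the example.

#include <stdio.h>
#include <string.h>

#define SLAB_SIZE 4096
#define OBJ_SIZE 256
typedef unsigned short freelist_idx_t;

int main(void)
{
    static unsigned char slab[SLAB_SIZE];
    unsigned int num = SLAB_SIZE / OBJ_SIZE;    /* 16 objects */
    freelist_idx_t *freelist;
    unsigned int i, active = 0;

    /* Objfreelist: reuse the last (currently free) object as the index
     * array, as cache_init_objs() does above; the indices easily fit
     * because num * sizeof(freelist_idx_t) <= OBJ_SIZE. */
    freelist = (freelist_idx_t *)(slab + (num - 1) * OBJ_SIZE);

    /* set_free_obj(page, i, i) */
    for (i = 0; i < num; i++)
        freelist[i] = (freelist_idx_t)i;

    /* slab_get_obj(): hand out the object named by the next free index. */
    unsigned char *obj = slab + freelist[active] * OBJ_SIZE;
    active++;   /* mirrors page->active++ */

    printf("first object at offset %td, %u objects total\n", obj - slab, num);
    return 0;
}
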
@@ -2537,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2537 } 2442 }
2538} 2443}
2539 2444
2540static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2445static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
2541 int nodeid)
2542{ 2446{
2543 void *objp; 2447 void *objp;
2544 2448
2545 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2449 objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2546 page->active++; 2450 page->active++;
2451
2547#if DEBUG 2452#if DEBUG
2548 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2453 if (cachep->flags & SLAB_STORE_USER)
2454 set_store_user_dirty(cachep);
2549#endif 2455#endif
2550 2456
2551 return objp; 2457 return objp;
2552} 2458}
2553 2459
2554static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2460static void slab_put_obj(struct kmem_cache *cachep,
2555 void *objp, int nodeid) 2461 struct page *page, void *objp)
2556{ 2462{
2557 unsigned int objnr = obj_to_index(cachep, page, objp); 2463 unsigned int objnr = obj_to_index(cachep, page, objp);
2558#if DEBUG 2464#if DEBUG
2559 unsigned int i; 2465 unsigned int i;
2560 2466
2561 /* Verify that the slab belongs to the intended node */
2562 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2563
2564 /* Verify double free bug */ 2467 /* Verify double free bug */
2565 for (i = page->active; i < cachep->num; i++) { 2468 for (i = page->active; i < cachep->num; i++) {
2566 if (get_free_obj(page, i) == objnr) { 2469 if (get_free_obj(page, i) == objnr) {
@@ -2571,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2571 } 2474 }
2572#endif 2475#endif
2573 page->active--; 2476 page->active--;
2477 if (!page->freelist)
2478 page->freelist = objp + obj_offset(cachep);
2479
2574 set_free_obj(page, page->active, objnr); 2480 set_free_obj(page, page->active, objnr);
2575} 2481}
2576 2482
@@ -2645,7 +2551,7 @@ static int cache_grow(struct kmem_cache *cachep,
2645 /* Get slab management. */ 2551 /* Get slab management. */
2646 freelist = alloc_slabmgmt(cachep, page, offset, 2552 freelist = alloc_slabmgmt(cachep, page, offset,
2647 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2553 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2648 if (!freelist) 2554 if (OFF_SLAB(cachep) && !freelist)
2649 goto opps1; 2555 goto opps1;
2650 2556
2651 slab_map_pages(cachep, page, freelist); 2557 slab_map_pages(cachep, page, freelist);
@@ -2726,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2726 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2632 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2727 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2633 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2728 } 2634 }
2729 if (cachep->flags & SLAB_STORE_USER) 2635 if (cachep->flags & SLAB_STORE_USER) {
2636 set_store_user_dirty(cachep);
2730 *dbg_userword(cachep, objp) = (void *)caller; 2637 *dbg_userword(cachep, objp) = (void *)caller;
2638 }
2731 2639
2732 objnr = obj_to_index(cachep, page, objp); 2640 objnr = obj_to_index(cachep, page, objp);
2733 2641
2734 BUG_ON(objnr >= cachep->num); 2642 BUG_ON(objnr >= cachep->num);
2735 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2643 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2736 2644
2737 set_obj_status(page, objnr, OBJECT_FREE);
2738 if (cachep->flags & SLAB_POISON) { 2645 if (cachep->flags & SLAB_POISON) {
2739#ifdef CONFIG_DEBUG_PAGEALLOC
2740 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2741 store_stackinfo(cachep, objp, caller);
2742 kernel_map_pages(virt_to_page(objp),
2743 cachep->size / PAGE_SIZE, 0);
2744 } else {
2745 poison_obj(cachep, objp, POISON_FREE);
2746 }
2747#else
2748 poison_obj(cachep, objp, POISON_FREE); 2646 poison_obj(cachep, objp, POISON_FREE);
2749#endif 2647 slab_kernel_map(cachep, objp, 0, caller);
2750 } 2648 }
2751 return objp; 2649 return objp;
2752} 2650}
@@ -2756,7 +2654,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2756#define cache_free_debugcheck(x,objp,z) (objp) 2654#define cache_free_debugcheck(x,objp,z) (objp)
2757#endif 2655#endif
2758 2656
2759static struct page *get_first_slab(struct kmem_cache_node *n) 2657static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
2658 void **list)
2659{
2660#if DEBUG
2661 void *next = *list;
2662 void *objp;
2663
2664 while (next) {
2665 objp = next - obj_offset(cachep);
2666 next = *(void **)next;
2667 poison_obj(cachep, objp, POISON_FREE);
2668 }
2669#endif
2670}
2671
2672static inline void fixup_slab_list(struct kmem_cache *cachep,
2673 struct kmem_cache_node *n, struct page *page,
2674 void **list)
2675{
2676 /* move slabp to correct slabp list: */
2677 list_del(&page->lru);
2678 if (page->active == cachep->num) {
2679 list_add(&page->lru, &n->slabs_full);
2680 if (OBJFREELIST_SLAB(cachep)) {
2681#if DEBUG
2682 /* Poisoning will be done without holding the lock */
2683 if (cachep->flags & SLAB_POISON) {
2684 void **objp = page->freelist;
2685
2686 *objp = *list;
2687 *list = objp;
2688 }
2689#endif
2690 page->freelist = NULL;
2691 }
2692 } else
2693 list_add(&page->lru, &n->slabs_partial);
2694}
2695
2696/* Try to find non-pfmemalloc slab if needed */
2697static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2698 struct page *page, bool pfmemalloc)
2699{
2700 if (!page)
2701 return NULL;
2702
2703 if (pfmemalloc)
2704 return page;
2705
2706 if (!PageSlabPfmemalloc(page))
2707 return page;
2708
2709 /* No need to keep pfmemalloc slab if we have enough free objects */
2710 if (n->free_objects > n->free_limit) {
2711 ClearPageSlabPfmemalloc(page);
2712 return page;
2713 }
2714
2715 /* Move pfmemalloc slab to the end of list to speed up next search */
2716 list_del(&page->lru);
2717 if (!page->active)
2718 list_add_tail(&page->lru, &n->slabs_free);
2719 else
2720 list_add_tail(&page->lru, &n->slabs_partial);
2721
2722 list_for_each_entry(page, &n->slabs_partial, lru) {
2723 if (!PageSlabPfmemalloc(page))
2724 return page;
2725 }
2726
2727 list_for_each_entry(page, &n->slabs_free, lru) {
2728 if (!PageSlabPfmemalloc(page))
2729 return page;
2730 }
2731
2732 return NULL;
2733}
2734
2735static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
2760{ 2736{
2761 struct page *page; 2737 struct page *page;
2762 2738
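
fixup_slab_list() and fixup_objfreelist_debug() above defer poisoning by threading a list through the first word of each affected object while the node lock is held, then walking that list afterwards. A stand-alone sketch of the trick (obj_offset taken as zero, names and sizes illustrative):

#include <stdio.h>
#include <string.h>

#define NOBJ 4
#define OBJ_SIZE 64

int main(void)
{
    static unsigned char objs[NOBJ][OBJ_SIZE];
    void *list = NULL;
    int i;

    /* "fixup": push each object on the list using its own storage. */
    for (i = 0; i < NOBJ; i++) {
        void **objp = (void **)objs[i];

        *objp = list;
        list = objp;
    }

    /* "debug fixup": walk the chain and poison each object. */
    while (list) {
        void *next = *(void **)list;    /* save before overwriting */

        memset(list, 0x6b, OBJ_SIZE);   /* POISON_FREE-style fill */
        list = next;
    }
    printf("poisoned %d objects via the embedded list\n", NOBJ);
    return 0;
}
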
@@ -2768,21 +2744,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
2768 struct page, lru); 2744 struct page, lru);
2769 } 2745 }
2770 2746
2747 if (sk_memalloc_socks())
2748 return get_valid_first_slab(n, page, pfmemalloc);
2749
2771 return page; 2750 return page;
2772} 2751}
2773 2752
2774static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2753static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
2775 bool force_refill) 2754 struct kmem_cache_node *n, gfp_t flags)
2755{
2756 struct page *page;
2757 void *obj;
2758 void *list = NULL;
2759
2760 if (!gfp_pfmemalloc_allowed(flags))
2761 return NULL;
2762
2763 spin_lock(&n->list_lock);
2764 page = get_first_slab(n, true);
2765 if (!page) {
2766 spin_unlock(&n->list_lock);
2767 return NULL;
2768 }
2769
2770 obj = slab_get_obj(cachep, page);
2771 n->free_objects--;
2772
2773 fixup_slab_list(cachep, n, page, &list);
2774
2775 spin_unlock(&n->list_lock);
2776 fixup_objfreelist_debug(cachep, &list);
2777
2778 return obj;
2779}
2780
2781static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2776{ 2782{
2777 int batchcount; 2783 int batchcount;
2778 struct kmem_cache_node *n; 2784 struct kmem_cache_node *n;
2779 struct array_cache *ac; 2785 struct array_cache *ac;
2780 int node; 2786 int node;
2787 void *list = NULL;
2781 2788
2782 check_irq_off(); 2789 check_irq_off();
2783 node = numa_mem_id(); 2790 node = numa_mem_id();
2784 if (unlikely(force_refill)) 2791
2785 goto force_grow;
2786retry: 2792retry:
2787 ac = cpu_cache_get(cachep); 2793 ac = cpu_cache_get(cachep);
2788 batchcount = ac->batchcount; 2794 batchcount = ac->batchcount;
@@ -2808,7 +2814,7 @@ retry:
2808 while (batchcount > 0) { 2814 while (batchcount > 0) {
2809 struct page *page; 2815 struct page *page;
2810 /* Get slab alloc is to come from. */ 2816 /* Get slab alloc is to come from. */
2811 page = get_first_slab(n); 2817 page = get_first_slab(n, false);
2812 if (!page) 2818 if (!page)
2813 goto must_grow; 2819 goto must_grow;
2814 2820
@@ -2826,26 +2832,29 @@ retry:
2826 STATS_INC_ACTIVE(cachep); 2832 STATS_INC_ACTIVE(cachep);
2827 STATS_SET_HIGH(cachep); 2833 STATS_SET_HIGH(cachep);
2828 2834
2829 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2835 ac->entry[ac->avail++] = slab_get_obj(cachep, page);
2830 node));
2831 } 2836 }
2832 2837
2833 /* move slabp to correct slabp list: */ 2838 fixup_slab_list(cachep, n, page, &list);
2834 list_del(&page->lru);
2835 if (page->active == cachep->num)
2836 list_add(&page->lru, &n->slabs_full);
2837 else
2838 list_add(&page->lru, &n->slabs_partial);
2839 } 2839 }
2840 2840
2841must_grow: 2841must_grow:
2842 n->free_objects -= ac->avail; 2842 n->free_objects -= ac->avail;
2843alloc_done: 2843alloc_done:
2844 spin_unlock(&n->list_lock); 2844 spin_unlock(&n->list_lock);
2845 fixup_objfreelist_debug(cachep, &list);
2845 2846
2846 if (unlikely(!ac->avail)) { 2847 if (unlikely(!ac->avail)) {
2847 int x; 2848 int x;
2848force_grow: 2849
2850 /* Check if we can use obj in pfmemalloc slab */
2851 if (sk_memalloc_socks()) {
2852 void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
2853
2854 if (obj)
2855 return obj;
2856 }
2857
2849 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); 2858 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2850 2859
2851 /* cache_grow can reenable interrupts, then ac could change. */ 2860 /* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2862,7 @@ force_grow:
2853 node = numa_mem_id(); 2862 node = numa_mem_id();
2854 2863
2855 /* no objects in sight? abort */ 2864 /* no objects in sight? abort */
2856 if (!x && (ac->avail == 0 || force_refill)) 2865 if (!x && ac->avail == 0)
2857 return NULL; 2866 return NULL;
2858 2867
2859 if (!ac->avail) /* objects refilled by interrupt? */ 2868 if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2870,7 @@ force_grow:
2861 } 2870 }
2862 ac->touched = 1; 2871 ac->touched = 1;
2863 2872
2864 return ac_get_obj(cachep, ac, flags, force_refill); 2873 return ac->entry[--ac->avail];
2865} 2874}
2866 2875
2867static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2876static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2877static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2886static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2878 gfp_t flags, void *objp, unsigned long caller) 2887 gfp_t flags, void *objp, unsigned long caller)
2879{ 2888{
2880 struct page *page;
2881
2882 if (!objp) 2889 if (!objp)
2883 return objp; 2890 return objp;
2884 if (cachep->flags & SLAB_POISON) { 2891 if (cachep->flags & SLAB_POISON) {
2885#ifdef CONFIG_DEBUG_PAGEALLOC
2886 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2887 kernel_map_pages(virt_to_page(objp),
2888 cachep->size / PAGE_SIZE, 1);
2889 else
2890 check_poison_obj(cachep, objp);
2891#else
2892 check_poison_obj(cachep, objp); 2892 check_poison_obj(cachep, objp);
2893#endif 2893 slab_kernel_map(cachep, objp, 1, 0);
2894 poison_obj(cachep, objp, POISON_INUSE); 2894 poison_obj(cachep, objp, POISON_INUSE);
2895 } 2895 }
2896 if (cachep->flags & SLAB_STORE_USER) 2896 if (cachep->flags & SLAB_STORE_USER)
@@ -2910,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2911 } 2911 }
2912 2912
2913 page = virt_to_head_page(objp);
2914 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2915 objp += obj_offset(cachep); 2913 objp += obj_offset(cachep);
2916 if (cachep->ctor && cachep->flags & SLAB_POISON) 2914 if (cachep->ctor && cachep->flags & SLAB_POISON)
2917 cachep->ctor(objp); 2915 cachep->ctor(objp);
@@ -2926,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2926#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2924#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2927#endif 2925#endif
2928 2926
2929static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2930{
2931 if (unlikely(cachep == kmem_cache))
2932 return false;
2933
2934 return should_failslab(cachep->object_size, flags, cachep->flags);
2935}
2936
2937static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2927static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2938{ 2928{
2939 void *objp; 2929 void *objp;
2940 struct array_cache *ac; 2930 struct array_cache *ac;
2941 bool force_refill = false;
2942 2931
2943 check_irq_off(); 2932 check_irq_off();
2944 2933
2945 ac = cpu_cache_get(cachep); 2934 ac = cpu_cache_get(cachep);
2946 if (likely(ac->avail)) { 2935 if (likely(ac->avail)) {
2947 ac->touched = 1; 2936 ac->touched = 1;
2948 objp = ac_get_obj(cachep, ac, flags, false); 2937 objp = ac->entry[--ac->avail];
2949 2938
2950 /* 2939 STATS_INC_ALLOCHIT(cachep);
2951 * Allow for the possibility all avail objects are not allowed 2940 goto out;
2952 * by the current flags
2953 */
2954 if (objp) {
2955 STATS_INC_ALLOCHIT(cachep);
2956 goto out;
2957 }
2958 force_refill = true;
2959 } 2941 }
2960 2942
2961 STATS_INC_ALLOCMISS(cachep); 2943 STATS_INC_ALLOCMISS(cachep);
2962 objp = cache_alloc_refill(cachep, flags, force_refill); 2944 objp = cache_alloc_refill(cachep, flags);
2963 /* 2945 /*
2964 * the 'ac' may be updated by cache_alloc_refill(), 2946 * the 'ac' may be updated by cache_alloc_refill(),
2965 * and kmemleak_erase() requires its correct value. 2947 * and kmemleak_erase() requires its correct value.
@@ -3097,6 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3097 struct page *page; 3079 struct page *page;
3098 struct kmem_cache_node *n; 3080 struct kmem_cache_node *n;
3099 void *obj; 3081 void *obj;
3082 void *list = NULL;
3100 int x; 3083 int x;
3101 3084
3102 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); 3085 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3089,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3106retry: 3089retry:
3107 check_irq_off(); 3090 check_irq_off();
3108 spin_lock(&n->list_lock); 3091 spin_lock(&n->list_lock);
3109 page = get_first_slab(n); 3092 page = get_first_slab(n, false);
3110 if (!page) 3093 if (!page)
3111 goto must_grow; 3094 goto must_grow;
3112 3095
@@ -3118,17 +3101,13 @@ retry:
3118 3101
3119 BUG_ON(page->active == cachep->num); 3102 BUG_ON(page->active == cachep->num);
3120 3103
3121 obj = slab_get_obj(cachep, page, nodeid); 3104 obj = slab_get_obj(cachep, page);
3122 n->free_objects--; 3105 n->free_objects--;
3123 /* move slabp to correct slabp list: */
3124 list_del(&page->lru);
3125 3106
3126 if (page->active == cachep->num) 3107 fixup_slab_list(cachep, n, page, &list);
3127 list_add(&page->lru, &n->slabs_full);
3128 else
3129 list_add(&page->lru, &n->slabs_partial);
3130 3108
3131 spin_unlock(&n->list_lock); 3109 spin_unlock(&n->list_lock);
3110 fixup_objfreelist_debug(cachep, &list);
3132 goto done; 3111 goto done;
3133 3112
3134must_grow: 3113must_grow:
@@ -3152,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3152 int slab_node = numa_mem_id(); 3131 int slab_node = numa_mem_id();
3153 3132
3154 flags &= gfp_allowed_mask; 3133 flags &= gfp_allowed_mask;
3155 3134 cachep = slab_pre_alloc_hook(cachep, flags);
3156 lockdep_trace_alloc(flags); 3135 if (unlikely(!cachep))
3157
3158 if (slab_should_failslab(cachep, flags))
3159 return NULL; 3136 return NULL;
3160 3137
3161 cachep = memcg_kmem_get_cache(cachep, flags);
3162
3163 cache_alloc_debugcheck_before(cachep, flags); 3138 cache_alloc_debugcheck_before(cachep, flags);
3164 local_irq_save(save_flags); 3139 local_irq_save(save_flags);
3165 3140
@@ -3188,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3188 out: 3163 out:
3189 local_irq_restore(save_flags); 3164 local_irq_restore(save_flags);
3190 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3165 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3191 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3192 flags);
3193 3166
3194 if (likely(ptr)) { 3167 if (unlikely(flags & __GFP_ZERO) && ptr)
3195 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3168 memset(ptr, 0, cachep->object_size);
3196 if (unlikely(flags & __GFP_ZERO))
3197 memset(ptr, 0, cachep->object_size);
3198 }
3199 3169
3200 memcg_kmem_put_cache(cachep); 3170 slab_post_alloc_hook(cachep, flags, 1, &ptr);
3201 return ptr; 3171 return ptr;
3202} 3172}
3203 3173
@@ -3240,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3240 void *objp; 3210 void *objp;
3241 3211
3242 flags &= gfp_allowed_mask; 3212 flags &= gfp_allowed_mask;
3243 3213 cachep = slab_pre_alloc_hook(cachep, flags);
3244 lockdep_trace_alloc(flags); 3214 if (unlikely(!cachep))
3245
3246 if (slab_should_failslab(cachep, flags))
3247 return NULL; 3215 return NULL;
3248 3216
3249 cachep = memcg_kmem_get_cache(cachep, flags);
3250
3251 cache_alloc_debugcheck_before(cachep, flags); 3217 cache_alloc_debugcheck_before(cachep, flags);
3252 local_irq_save(save_flags); 3218 local_irq_save(save_flags);
3253 objp = __do_cache_alloc(cachep, flags); 3219 objp = __do_cache_alloc(cachep, flags);
3254 local_irq_restore(save_flags); 3220 local_irq_restore(save_flags);
3255 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3221 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3256 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3257 flags);
3258 prefetchw(objp); 3222 prefetchw(objp);
3259 3223
3260 if (likely(objp)) { 3224 if (unlikely(flags & __GFP_ZERO) && objp)
3261 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3225 memset(objp, 0, cachep->object_size);
3262 if (unlikely(flags & __GFP_ZERO))
3263 memset(objp, 0, cachep->object_size);
3264 }
3265 3226
3266 memcg_kmem_put_cache(cachep); 3227 slab_post_alloc_hook(cachep, flags, 1, &objp);
3267 return objp; 3228 return objp;
3268} 3229}
3269 3230
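
After the hook consolidation, __GFP_ZERO is honoured by a single memset on the returned pointer rather than a branch nested under likely(objp). Callers see no behavioural difference; a minimal usage sketch (the "foo" cache is hypothetical, not part of this patch):

#include <linux/slab.h>

struct foo { int a; void *b; };
static struct kmem_cache *foo_cache;	/* created elsewhere with kmem_cache_create() */

static struct foo *foo_alloc(void)
{
	/* kmem_cache_zalloc() is just kmem_cache_alloc() with __GFP_ZERO
	 * folded into the flags, so both helpers return zeroed objects. */
	return kmem_cache_zalloc(foo_cache, GFP_KERNEL);
}

static struct foo *foo_alloc_atomic(void)
{
	return kmem_cache_alloc(foo_cache, GFP_ATOMIC | __GFP_ZERO);
}
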
@@ -3281,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3281 void *objp; 3242 void *objp;
3282 struct page *page; 3243 struct page *page;
3283 3244
3284 clear_obj_pfmemalloc(&objpp[i]);
3285 objp = objpp[i]; 3245 objp = objpp[i];
3286 3246
3287 page = virt_to_head_page(objp); 3247 page = virt_to_head_page(objp);
3288 list_del(&page->lru); 3248 list_del(&page->lru);
3289 check_spinlock_acquired_node(cachep, node); 3249 check_spinlock_acquired_node(cachep, node);
3290 slab_put_obj(cachep, page, objp, node); 3250 slab_put_obj(cachep, page, objp);
3291 STATS_DEC_ACTIVE(cachep); 3251 STATS_DEC_ACTIVE(cachep);
3292 n->free_objects++; 3252 n->free_objects++;
3293 3253
@@ -3317,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3317 LIST_HEAD(list); 3277 LIST_HEAD(list);
3318 3278
3319 batchcount = ac->batchcount; 3279 batchcount = ac->batchcount;
3320#if DEBUG 3280
3321 BUG_ON(!batchcount || batchcount > ac->avail);
3322#endif
3323 check_irq_off(); 3281 check_irq_off();
3324 n = get_node(cachep, node); 3282 n = get_node(cachep, node);
3325 spin_lock(&n->list_lock); 3283 spin_lock(&n->list_lock);
@@ -3389,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3389 cache_flusharray(cachep, ac); 3347 cache_flusharray(cachep, ac);
3390 } 3348 }
3391 3349
3392 ac_put_obj(cachep, ac, objp); 3350 if (sk_memalloc_socks()) {
3351 struct page *page = virt_to_head_page(objp);
3352
3353 if (unlikely(PageSlabPfmemalloc(page))) {
3354 cache_free_pfmemalloc(cachep, page, objp);
3355 return;
3356 }
3357 }
3358
3359 ac->entry[ac->avail++] = objp;
3393} 3360}
3394 3361
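
The free path now open-codes the pfmemalloc handling: ordinary objects are pushed straight back onto the per-cpu array, while objects backed by an emergency (pfmemalloc) page are diverted before they can be recycled to normal allocations. A toy userspace model of that shape (is_reserve_backed() stands in for PageSlabPfmemalloc() and is an assumption of the sketch, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

struct obj_cache {
	unsigned int avail, limit;
	void *entry[8];
};

/* Fake marker: the low pointer bit plays the role of the pfmemalloc flag. */
static bool is_reserve_backed(void *obj) { return (unsigned long)obj & 1; }
static void free_to_slab(void *obj)      { printf("slow-path free %p\n", obj); }

static void cache_release(struct obj_cache *ac, void *obj)
{
	if (is_reserve_backed(obj)) {
		free_to_slab(obj);		/* never recycle reserve memory */
		return;
	}
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = obj;	/* fast path: LIFO push */
	else
		free_to_slab(obj);		/* array full: flush to slab lists */
}

int main(void)
{
	struct obj_cache ac = { .avail = 0, .limit = 8 };

	cache_release(&ac, (void *)0x1000);	/* cached for reuse */
	cache_release(&ac, (void *)0x1001);	/* "reserve" object, slow path */
	printf("cached objects: %u\n", ac.avail);
	return 0;
}
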
3395/** 3362/**
@@ -3411,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3411} 3378}
3412EXPORT_SYMBOL(kmem_cache_alloc); 3379EXPORT_SYMBOL(kmem_cache_alloc);
3413 3380
3414void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3381static __always_inline void
3382cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
3383 size_t size, void **p, unsigned long caller)
3415{ 3384{
3416 __kmem_cache_free_bulk(s, size, p); 3385 size_t i;
3386
3387 for (i = 0; i < size; i++)
3388 p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
3417} 3389}
3418EXPORT_SYMBOL(kmem_cache_free_bulk);
3419 3390
3420int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3391int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3421 void **p) 3392 void **p)
3422{ 3393{
3423 return __kmem_cache_alloc_bulk(s, flags, size, p); 3394 size_t i;
3395
3396 s = slab_pre_alloc_hook(s, flags);
3397 if (!s)
3398 return 0;
3399
3400 cache_alloc_debugcheck_before(s, flags);
3401
3402 local_irq_disable();
3403 for (i = 0; i < size; i++) {
3404 void *objp = __do_cache_alloc(s, flags);
3405
3406 if (unlikely(!objp))
3407 goto error;
3408 p[i] = objp;
3409 }
3410 local_irq_enable();
3411
3412 cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
3413
3414 /* Clear memory outside IRQ disabled section */
3415 if (unlikely(flags & __GFP_ZERO))
3416 for (i = 0; i < size; i++)
3417 memset(p[i], 0, s->object_size);
3418
3419 slab_post_alloc_hook(s, flags, size, p);
3420 /* FIXME: Trace call missing. Christoph would like a bulk variant */
3421 return size;
3422error:
3423 local_irq_enable();
3424 cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
3425 slab_post_alloc_hook(s, flags, i, p);
3426 __kmem_cache_free_bulk(s, i, p);
3427 return 0;
3424} 3428}
3425EXPORT_SYMBOL(kmem_cache_alloc_bulk); 3429EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3426 3430
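
kmem_cache_alloc_bulk() keeps an all-or-nothing contract: it returns the number of objects placed into the array (here size) or 0, and the error path above has already returned any partially allocated objects. A caller therefore only needs to test the return value; a small usage sketch (the "foo" cache is hypothetical):

#include <linux/errno.h>
#include <linux/slab.h>

#define NR_OBJS 16

static struct kmem_cache *foo_cache;	/* hypothetical example cache */

static int foo_fill(void *objs[NR_OBJS])
{
	/* Either all NR_OBJS pointers are valid afterwards, or none are. */
	if (kmem_cache_alloc_bulk(foo_cache, GFP_KERNEL, NR_OBJS, objs) == 0)
		return -ENOMEM;
	return 0;
}

static void foo_drain(void *objs[NR_OBJS])
{
	kmem_cache_free_bulk(foo_cache, NR_OBJS, objs);
}
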
@@ -3567,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3567} 3571}
3568EXPORT_SYMBOL(kmem_cache_free); 3572EXPORT_SYMBOL(kmem_cache_free);
3569 3573
3574void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
3575{
3576 struct kmem_cache *s;
3577 size_t i;
3578
3579 local_irq_disable();
3580 for (i = 0; i < size; i++) {
3581 void *objp = p[i];
3582
3583 if (!orig_s) /* called via kfree_bulk */
3584 s = virt_to_cache(objp);
3585 else
3586 s = cache_from_obj(orig_s, objp);
3587
3588 debug_check_no_locks_freed(objp, s->object_size);
3589 if (!(s->flags & SLAB_DEBUG_OBJECTS))
3590 debug_check_no_obj_freed(objp, s->object_size);
3591
3592 __cache_free(s, objp, _RET_IP_);
3593 }
3594 local_irq_enable();
3595
3596 /* FIXME: add tracing */
3597}
3598EXPORT_SYMBOL(kmem_cache_free_bulk);
3599
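
This SLAB kmem_cache_free_bulk() also serves as the backend for kfree_bulk(): when it is entered with a NULL cache, each object's cache is looked up with virt_to_cache(), so kmalloc'ed buffers of different sizes can be released in one call. Caller-side sketch (the buffer array is illustrative):

#include <linux/slab.h>

static void drop_buffers(void **bufs, size_t nr)
{
	/* kfree_bulk(nr, p) boils down to kmem_cache_free_bulk(NULL, nr, p);
	 * entries are expected to be pointers previously returned by
	 * kmalloc() or kmem_cache_alloc(). */
	kfree_bulk(nr, bufs);
}
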
3570/** 3600/**
3571 * kfree - free previously allocated memory 3601 * kfree - free previously allocated memory
3572 * @objp: pointer returned by kmalloc. 3602 * @objp: pointer returned by kmalloc.
@@ -4102,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
4102 struct page *page) 4132 struct page *page)
4103{ 4133{
4104 void *p; 4134 void *p;
4105 int i; 4135 int i, j;
4136 unsigned long v;
4106 4137
4107 if (n[0] == n[1]) 4138 if (n[0] == n[1])
4108 return; 4139 return;
4109 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4140 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4110 if (get_obj_status(page, i) != OBJECT_ACTIVE) 4141 bool active = true;
4142
4143 for (j = page->active; j < c->num; j++) {
4144 if (get_free_obj(page, j) == i) {
4145 active = false;
4146 break;
4147 }
4148 }
4149
4150 if (!active)
4151 continue;
4152
4153 /*
4154 * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page table
4155 * mapping is only established while the object is actually
4156 * allocated, so we could otherwise fault on an unmapped object
4157 * still sitting in the cpu cache.
4158 */
4159 if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
4111 continue; 4160 continue;
4112 4161
4113 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4162 if (!add_caller(n, v))
4114 return; 4163 return;
4115 } 4164 }
4116} 4165}
@@ -4146,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p)
4146 if (!(cachep->flags & SLAB_RED_ZONE)) 4195 if (!(cachep->flags & SLAB_RED_ZONE))
4147 return 0; 4196 return 0;
4148 4197
4149 /* OK, we can do it */ 4198 /*
4199 * Set store_user_clean and start to grab stored user information
4200 * for all objects on this cache. If any alloc/free requests come in
4201 * while we are processing, the information would be stale, so restart
4202 * the whole scan.
4203 */
4204 do {
4205 set_store_user_clean(cachep);
4206 drain_cpu_caches(cachep);
4150 4207
4151 x[1] = 0; 4208 x[1] = 0;
4152 4209
4153 for_each_kmem_cache_node(cachep, node, n) { 4210 for_each_kmem_cache_node(cachep, node, n) {
4154 4211
4155 check_irq_on(); 4212 check_irq_on();
4156 spin_lock_irq(&n->list_lock); 4213 spin_lock_irq(&n->list_lock);
4214
4215 list_for_each_entry(page, &n->slabs_full, lru)
4216 handle_slab(x, cachep, page);
4217 list_for_each_entry(page, &n->slabs_partial, lru)
4218 handle_slab(x, cachep, page);
4219 spin_unlock_irq(&n->list_lock);
4220 }
4221 } while (!is_store_user_clean(cachep));
4157 4222
4158 list_for_each_entry(page, &n->slabs_full, lru)
4159 handle_slab(x, cachep, page);
4160 list_for_each_entry(page, &n->slabs_partial, lru)
4161 handle_slab(x, cachep, page);
4162 spin_unlock_irq(&n->list_lock);
4163 }
4164 name = cachep->name; 4223 name = cachep->name;
4165 if (x[0] == x[1]) { 4224 if (x[0] == x[1]) {
4166 /* Increase the buffer size */ 4225 /* Increase the buffer size */
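
Without the per-object OBJECT_ACTIVE status byte (dropped along with set_obj_status()), handle_slab() decides whether object i is live by scanning the slab's free-index array: positions page->active..num-1 name the free slots, so an index that appears there is not active. A standalone sketch of that test:

#include <stdbool.h>
#include <stdio.h>

/* free_idx[active..num-1] holds the indices of the free objects in a
 * slab, mirroring how the freelist array is used above; anything not
 * listed there is currently allocated. */
static bool object_is_active(const unsigned char *free_idx,
			     unsigned int active, unsigned int num,
			     unsigned int i)
{
	for (unsigned int j = active; j < num; j++)
		if (free_idx[j] == i)
			return false;
	return true;
}

int main(void)
{
	/* 4-object slab with 2 objects allocated: slots 3 and 0 are free. */
	unsigned char free_idx[4] = { 1, 2, 3, 0 };
	unsigned int active = 2, num = 4;

	for (unsigned int i = 0; i < num; i++)
		printf("object %u: %s\n", i,
		       object_is_active(free_idx, active, num, i) ? "active" : "free");
	return 0;
}
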
diff --git a/mm/slab.h b/mm/slab.h
index 2eedacea439d..b7934361f026 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
38#endif 38#endif
39 39
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/fault-inject.h>
42#include <linux/kmemcheck.h>
43#include <linux/kasan.h>
44#include <linux/kmemleak.h>
41 45
42/* 46/*
43 * State of the slab allocator. 47 * State of the slab allocator.
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
121#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 125#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
122#elif defined(CONFIG_SLUB_DEBUG) 126#elif defined(CONFIG_SLUB_DEBUG)
123#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 127#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
124 SLAB_TRACE | SLAB_DEBUG_FREE) 128 SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
125#else 129#else
126#define SLAB_DEBUG_FLAGS (0) 130#define SLAB_DEBUG_FLAGS (0)
127#endif 131#endif
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
168/* 172/*
169 * Generic implementation of bulk operations 173 * Generic implementation of bulk operations
170 * These are useful for situations in which the allocator cannot 174 * These are useful for situations in which the allocator cannot
171 * perform optimizations. In that case segments of the objecct listed 175 * perform optimizations. In that case segments of the object listed
172 * may be allocated or freed using these operations. 176 * may be allocated or freed using these operations.
173 */ 177 */
174void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); 178void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -307,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
307 * to not do even the assignment. In that case, slab_equal_or_root 311 * to not do even the assignment. In that case, slab_equal_or_root
308 * will also be a constant. 312 * will also be a constant.
309 */ 313 */
310 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) 314 if (!memcg_kmem_enabled() &&
315 !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
311 return s; 316 return s;
312 317
313 page = virt_to_head_page(x); 318 page = virt_to_head_page(x);
@@ -321,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
321 return s; 326 return s;
322} 327}
323 328
329static inline size_t slab_ksize(const struct kmem_cache *s)
330{
331#ifndef CONFIG_SLUB
332 return s->object_size;
333
334#else /* CONFIG_SLUB */
335# ifdef CONFIG_SLUB_DEBUG
336 /*
337 * Debugging requires use of the padding between object
338 * and whatever may come after it.
339 */
340 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
341 return s->object_size;
342# endif
343 /*
344 * If we have the need to store the freelist pointer
345 * back there or track user information then we can
346 * only use the space before that information.
347 */
348 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
349 return s->inuse;
350 /*
351 * Else we can use all the padding etc for the allocation
352 */
353 return s->size;
354#endif
355}
356
357static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
358 gfp_t flags)
359{
360 flags &= gfp_allowed_mask;
361 lockdep_trace_alloc(flags);
362 might_sleep_if(gfpflags_allow_blocking(flags));
363
364 if (should_failslab(s, flags))
365 return NULL;
366
367 return memcg_kmem_get_cache(s, flags);
368}
369
370static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
371 size_t size, void **p)
372{
373 size_t i;
374
375 flags &= gfp_allowed_mask;
376 for (i = 0; i < size; i++) {
377 void *object = p[i];
378
379 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
380 kmemleak_alloc_recursive(object, s->object_size, 1,
381 s->flags, flags);
382 kasan_slab_alloc(s, object);
383 }
384 memcg_kmem_put_cache(s);
385}
386
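
With slab_pre_alloc_hook()/slab_post_alloc_hook() living in mm/slab.h, SLAB and SLUB now share one definition of the bookkeeping that brackets every allocation. Roughly, an allocation path is expected to look like the sketch below (simplified; do_raw_alloc() is a placeholder for the allocator-specific fast path, not a function in this patch, and the real callers also do debug checks and NUMA placement):

extern void *do_raw_alloc(struct kmem_cache *s, gfp_t flags);	/* placeholder */

static void *example_alloc(struct kmem_cache *s, gfp_t flags)
{
	void *obj;

	s = slab_pre_alloc_hook(s, flags);	/* failslab + memcg cache selection */
	if (unlikely(!s))
		return NULL;

	obj = do_raw_alloc(s, flags);		/* allocator-specific work */

	if (unlikely(flags & __GFP_ZERO) && obj)
		memset(obj, 0, s->object_size);

	slab_post_alloc_hook(s, flags, 1, &obj);	/* kmemcheck, kmemleak, kasan */
	return obj;
}
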
324#ifndef CONFIG_SLOB 387#ifndef CONFIG_SLOB
325/* 388/*
326 * The slab lists for all objects. 389 * The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 065b7bdabdc3..6afb2263a5c5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
109{ 109{
110 size_t i; 110 size_t i;
111 111
112 for (i = 0; i < nr; i++) 112 for (i = 0; i < nr; i++) {
113 kmem_cache_free(s, p[i]); 113 if (s)
114 kmem_cache_free(s, p[i]);
115 else
116 kfree(p[i]);
117 }
114} 118}
115 119
116int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, 120int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
diff --git a/mm/slub.c b/mm/slub.c
index d8fbd4a6ed59..6c91324f9370 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
124#endif 124#endif
125} 125}
126 126
127static inline void *fixup_red_left(struct kmem_cache *s, void *p)
128{
129 if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
130 p += s->red_left_pad;
131
132 return p;
133}
134
127static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 135static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
128{ 136{
129#ifdef CONFIG_SLUB_CPU_PARTIAL 137#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
160 */ 168 */
161#define MAX_PARTIAL 10 169#define MAX_PARTIAL 10
162 170
163#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 171#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
164 SLAB_POISON | SLAB_STORE_USER) 172 SLAB_POISON | SLAB_STORE_USER)
165 173
166/* 174/*
175 * These debug flags cannot use CMPXCHG because there might be consistency
176 * issues when checking or reading debug information
177 */
178#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
179 SLAB_TRACE)
180
181
182/*
167 * Debugging flags that require metadata to be stored in the slab. These get 183 * Debugging flags that require metadata to be stored in the slab. These get
168 * disabled when slub_debug=O is used and a cache's min order increases with 184 * disabled when slub_debug=O is used and a cache's min order increases with
169 * metadata. 185 * metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
224 * Core slab cache functions 240 * Core slab cache functions
225 *******************************************************************/ 241 *******************************************************************/
226 242
227/* Verify that a pointer has an address that is valid within a slab page */
228static inline int check_valid_pointer(struct kmem_cache *s,
229 struct page *page, const void *object)
230{
231 void *base;
232
233 if (!object)
234 return 1;
235
236 base = page_address(page);
237 if (object < base || object >= base + page->objects * s->size ||
238 (object - base) % s->size) {
239 return 0;
240 }
241
242 return 1;
243}
244
245static inline void *get_freepointer(struct kmem_cache *s, void *object) 243static inline void *get_freepointer(struct kmem_cache *s, void *object)
246{ 244{
247 return *(void **)(object + s->offset); 245 return *(void **)(object + s->offset);
@@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
271 269
272/* Loop over all objects in a slab */ 270/* Loop over all objects in a slab */
273#define for_each_object(__p, __s, __addr, __objects) \ 271#define for_each_object(__p, __s, __addr, __objects) \
274 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 272 for (__p = fixup_red_left(__s, __addr); \
275 __p += (__s)->size) 273 __p < (__addr) + (__objects) * (__s)->size; \
274 __p += (__s)->size)
276 275
277#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ 276#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
278 for (__p = (__addr), __idx = 1; __idx <= __objects;\ 277 for (__p = fixup_red_left(__s, __addr), __idx = 1; \
279 __p += (__s)->size, __idx++) 278 __idx <= __objects; \
279 __p += (__s)->size, __idx++)
280 280
281/* Determine object index from a given position */ 281/* Determine object index from a given position */
282static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 282static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
284 return (p - addr) / s->size; 284 return (p - addr) / s->size;
285} 285}
286 286
287static inline size_t slab_ksize(const struct kmem_cache *s)
288{
289#ifdef CONFIG_SLUB_DEBUG
290 /*
291 * Debugging requires use of the padding between object
292 * and whatever may come after it.
293 */
294 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
295 return s->object_size;
296
297#endif
298 /*
299 * If we have the need to store the freelist pointer
300 * back there or track user information then we can
301 * only use the space before that information.
302 */
303 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
304 return s->inuse;
305 /*
306 * Else we can use all the padding etc for the allocation
307 */
308 return s->size;
309}
310
311static inline int order_objects(int order, unsigned long size, int reserved) 287static inline int order_objects(int order, unsigned long size, int reserved)
312{ 288{
313 return ((PAGE_SIZE << order) - reserved) / size; 289 return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
458 set_bit(slab_index(p, s, addr), map); 434 set_bit(slab_index(p, s, addr), map);
459} 435}
460 436
437static inline int size_from_object(struct kmem_cache *s)
438{
439 if (s->flags & SLAB_RED_ZONE)
440 return s->size - s->red_left_pad;
441
442 return s->size;
443}
444
445static inline void *restore_red_left(struct kmem_cache *s, void *p)
446{
447 if (s->flags & SLAB_RED_ZONE)
448 p -= s->red_left_pad;
449
450 return p;
451}
452
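
The new left red zone shifts every object the caller sees red_left_pad bytes into its slot, with poisonable padding on both sides. fixup_red_left() applies the shift when pointers are handed out; restore_red_left() and size_from_object() undo it when the debug code needs the true slot. A small standalone sketch of the arithmetic (pad and slot sizes are illustrative, not the kernel's values):

#include <assert.h>
#include <stddef.h>

/* Slot layout with SLAB_RED_ZONE (simplified):
 *
 *   | left red zone | object ... | right red zone / metadata |
 *   ^ slot start    ^ pointer returned to the caller
 */
#define RED_LEFT_PAD	8
#define SLOT_SIZE	64

static char *fixup_red_left(char *slot)  { return slot + RED_LEFT_PAD; }
static char *restore_red_left(char *obj) { return obj - RED_LEFT_PAD; }
static size_t size_from_object(void)     { return SLOT_SIZE - RED_LEFT_PAD; }

int main(void)
{
	char slab[4 * SLOT_SIZE];
	char *obj = fixup_red_left(slab + SLOT_SIZE);	/* second slot */

	/* Validity checks first strip the pad again, so the pointer is
	 * still slot-aligned from the allocator's point of view. */
	assert(restore_red_left(obj) == slab + SLOT_SIZE);
	assert((size_t)(restore_red_left(obj) - slab) % SLOT_SIZE == 0);
	assert(size_from_object() + RED_LEFT_PAD == SLOT_SIZE);
	return 0;
}
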
461/* 453/*
462 * Debug settings: 454 * Debug settings:
463 */ 455 */
@@ -491,6 +483,26 @@ static inline void metadata_access_disable(void)
491/* 483/*
492 * Object debugging 484 * Object debugging
493 */ 485 */
486
487/* Verify that a pointer has an address that is valid within a slab page */
488static inline int check_valid_pointer(struct kmem_cache *s,
489 struct page *page, void *object)
490{
491 void *base;
492
493 if (!object)
494 return 1;
495
496 base = page_address(page);
497 object = restore_red_left(s, object);
498 if (object < base || object >= base + page->objects * s->size ||
499 (object - base) % s->size) {
500 return 0;
501 }
502
503 return 1;
504}
505
494static void print_section(char *text, u8 *addr, unsigned int length) 506static void print_section(char *text, u8 *addr, unsigned int length)
495{ 507{
496 metadata_access_enable(); 508 metadata_access_enable();
@@ -630,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
630 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 642 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
631 p, p - addr, get_freepointer(s, p)); 643 p, p - addr, get_freepointer(s, p));
632 644
633 if (p > addr + 16) 645 if (s->flags & SLAB_RED_ZONE)
646 print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
647 else if (p > addr + 16)
634 print_section("Bytes b4 ", p - 16, 16); 648 print_section("Bytes b4 ", p - 16, 16);
635 649
636 print_section("Object ", p, min_t(unsigned long, s->object_size, 650 print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
647 if (s->flags & SLAB_STORE_USER) 661 if (s->flags & SLAB_STORE_USER)
648 off += 2 * sizeof(struct track); 662 off += 2 * sizeof(struct track);
649 663
650 if (off != s->size) 664 if (off != size_from_object(s))
651 /* Beginning of the filler is the free pointer */ 665 /* Beginning of the filler is the free pointer */
652 print_section("Padding ", p + off, s->size - off); 666 print_section("Padding ", p + off, size_from_object(s) - off);
653 667
654 dump_stack(); 668 dump_stack();
655} 669}
@@ -679,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
679{ 693{
680 u8 *p = object; 694 u8 *p = object;
681 695
696 if (s->flags & SLAB_RED_ZONE)
697 memset(p - s->red_left_pad, val, s->red_left_pad);
698
682 if (s->flags & __OBJECT_POISON) { 699 if (s->flags & __OBJECT_POISON) {
683 memset(p, POISON_FREE, s->object_size - 1); 700 memset(p, POISON_FREE, s->object_size - 1);
684 p[s->object_size - 1] = POISON_END; 701 p[s->object_size - 1] = POISON_END;
@@ -771,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
771 /* We also have user information there */ 788 /* We also have user information there */
772 off += 2 * sizeof(struct track); 789 off += 2 * sizeof(struct track);
773 790
774 if (s->size == off) 791 if (size_from_object(s) == off)
775 return 1; 792 return 1;
776 793
777 return check_bytes_and_report(s, page, p, "Object padding", 794 return check_bytes_and_report(s, page, p, "Object padding",
778 p + off, POISON_INUSE, s->size - off); 795 p + off, POISON_INUSE, size_from_object(s) - off);
779} 796}
780 797
781/* Check the pad bytes at the end of a slab page */ 798/* Check the pad bytes at the end of a slab page */
@@ -820,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
820 837
821 if (s->flags & SLAB_RED_ZONE) { 838 if (s->flags & SLAB_RED_ZONE) {
822 if (!check_bytes_and_report(s, page, object, "Redzone", 839 if (!check_bytes_and_report(s, page, object, "Redzone",
840 object - s->red_left_pad, val, s->red_left_pad))
841 return 0;
842
843 if (!check_bytes_and_report(s, page, object, "Redzone",
823 endobject, val, s->inuse - s->object_size)) 844 endobject, val, s->inuse - s->object_size))
824 return 0; 845 return 0;
825 } else { 846 } else {
@@ -1031,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1031 init_tracking(s, object); 1052 init_tracking(s, object);
1032} 1053}
1033 1054
1034static noinline int alloc_debug_processing(struct kmem_cache *s, 1055static inline int alloc_consistency_checks(struct kmem_cache *s,
1035 struct page *page, 1056 struct page *page,
1036 void *object, unsigned long addr) 1057 void *object, unsigned long addr)
1037{ 1058{
1038 if (!check_slab(s, page)) 1059 if (!check_slab(s, page))
1039 goto bad; 1060 return 0;
1040 1061
1041 if (!check_valid_pointer(s, page, object)) { 1062 if (!check_valid_pointer(s, page, object)) {
1042 object_err(s, page, object, "Freelist Pointer check fails"); 1063 object_err(s, page, object, "Freelist Pointer check fails");
1043 goto bad; 1064 return 0;
1044 } 1065 }
1045 1066
1046 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1067 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1047 goto bad; 1068 return 0;
1069
1070 return 1;
1071}
1072
1073static noinline int alloc_debug_processing(struct kmem_cache *s,
1074 struct page *page,
1075 void *object, unsigned long addr)
1076{
1077 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1078 if (!alloc_consistency_checks(s, page, object, addr))
1079 goto bad;
1080 }
1048 1081
1049 /* Success perform special debug activities for allocs */ 1082 /* Success perform special debug activities for allocs */
1050 if (s->flags & SLAB_STORE_USER) 1083 if (s->flags & SLAB_STORE_USER)
@@ -1067,37 +1100,21 @@ bad:
1067 return 0; 1100 return 0;
1068} 1101}
1069 1102
1070/* Supports checking bulk free of a constructed freelist */ 1103static inline int free_consistency_checks(struct kmem_cache *s,
1071static noinline struct kmem_cache_node *free_debug_processing( 1104 struct page *page, void *object, unsigned long addr)
1072 struct kmem_cache *s, struct page *page,
1073 void *head, void *tail, int bulk_cnt,
1074 unsigned long addr, unsigned long *flags)
1075{ 1105{
1076 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1077 void *object = head;
1078 int cnt = 0;
1079
1080 spin_lock_irqsave(&n->list_lock, *flags);
1081 slab_lock(page);
1082
1083 if (!check_slab(s, page))
1084 goto fail;
1085
1086next_object:
1087 cnt++;
1088
1089 if (!check_valid_pointer(s, page, object)) { 1106 if (!check_valid_pointer(s, page, object)) {
1090 slab_err(s, page, "Invalid object pointer 0x%p", object); 1107 slab_err(s, page, "Invalid object pointer 0x%p", object);
1091 goto fail; 1108 return 0;
1092 } 1109 }
1093 1110
1094 if (on_freelist(s, page, object)) { 1111 if (on_freelist(s, page, object)) {
1095 object_err(s, page, object, "Object already free"); 1112 object_err(s, page, object, "Object already free");
1096 goto fail; 1113 return 0;
1097 } 1114 }
1098 1115
1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1116 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1100 goto out; 1117 return 0;
1101 1118
1102 if (unlikely(s != page->slab_cache)) { 1119 if (unlikely(s != page->slab_cache)) {
1103 if (!PageSlab(page)) { 1120 if (!PageSlab(page)) {
@@ -1110,7 +1127,37 @@ next_object:
1110 } else 1127 } else
1111 object_err(s, page, object, 1128 object_err(s, page, object,
1112 "page slab pointer corrupt."); 1129 "page slab pointer corrupt.");
1113 goto fail; 1130 return 0;
1131 }
1132 return 1;
1133}
1134
1135/* Supports checking bulk free of a constructed freelist */
1136static noinline int free_debug_processing(
1137 struct kmem_cache *s, struct page *page,
1138 void *head, void *tail, int bulk_cnt,
1139 unsigned long addr)
1140{
1141 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1142 void *object = head;
1143 int cnt = 0;
1144 unsigned long uninitialized_var(flags);
1145 int ret = 0;
1146
1147 spin_lock_irqsave(&n->list_lock, flags);
1148 slab_lock(page);
1149
1150 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1151 if (!check_slab(s, page))
1152 goto out;
1153 }
1154
1155next_object:
1156 cnt++;
1157
1158 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1159 if (!free_consistency_checks(s, page, object, addr))
1160 goto out;
1114 } 1161 }
1115 1162
1116 if (s->flags & SLAB_STORE_USER) 1163 if (s->flags & SLAB_STORE_USER)
@@ -1124,23 +1171,18 @@ next_object:
1124 object = get_freepointer(s, object); 1171 object = get_freepointer(s, object);
1125 goto next_object; 1172 goto next_object;
1126 } 1173 }
1174 ret = 1;
1175
1127out: 1176out:
1128 if (cnt != bulk_cnt) 1177 if (cnt != bulk_cnt)
1129 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", 1178 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1130 bulk_cnt, cnt); 1179 bulk_cnt, cnt);
1131 1180
1132 slab_unlock(page); 1181 slab_unlock(page);
1133 /* 1182 spin_unlock_irqrestore(&n->list_lock, flags);
1134 * Keep node_lock to preserve integrity 1183 if (!ret)
1135 * until the object is actually freed 1184 slab_fix(s, "Object at 0x%p not freed", object);
1136 */ 1185 return ret;
1137 return n;
1138
1139fail:
1140 slab_unlock(page);
1141 spin_unlock_irqrestore(&n->list_lock, *flags);
1142 slab_fix(s, "Object at 0x%p not freed", object);
1143 return NULL;
1144} 1186}
1145 1187
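
free_debug_processing() now returns a plain int and always drops its locks itself instead of handing back a locked node on success, and the consistency checks run only when SLAB_CONSISTENCY_CHECKS is set. The resulting control flow is the familiar gated-checks, single-exit pattern; a self-contained toy model (the types, flag and helpers are illustrative, and a boolean stands in for the real list_lock/slab_lock pair):

#include <stdbool.h>
#include <stdio.h>

struct toy_cache { unsigned int flags; };
struct toy_slab  { bool locked; };
#define TOY_CONSISTENCY_CHECKS 0x1

static bool pointer_is_valid(void *obj) { return obj != NULL; }
static bool object_is_free(void *obj)   { (void)obj; return false; }

static int checked_free(struct toy_cache *s, struct toy_slab *slab, void *obj)
{
	int ret = 0;

	slab->locked = true;			/* take the lock once */

	if (s->flags & TOY_CONSISTENCY_CHECKS) {
		if (!pointer_is_valid(obj) || object_is_free(obj))
			goto out;		/* still unlock below */
	}

	/* ... put the object back on the freelist here ... */
	ret = 1;
out:
	slab->locked = false;			/* unlock on every path */
	if (!ret)
		fprintf(stderr, "object %p not freed\n", obj);
	return ret;
}

int main(void)
{
	struct toy_cache c = { .flags = TOY_CONSISTENCY_CHECKS };
	struct toy_slab sl = { .locked = false };
	int dummy;

	printf("valid free -> %d\n", checked_free(&c, &sl, &dummy));
	printf("NULL free  -> %d\n", checked_free(&c, &sl, NULL));
	return 0;
}
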
1146static int __init setup_slub_debug(char *str) 1188static int __init setup_slub_debug(char *str)
@@ -1172,7 +1214,7 @@ static int __init setup_slub_debug(char *str)
1172 for (; *str && *str != ','; str++) { 1214 for (; *str && *str != ','; str++) {
1173 switch (tolower(*str)) { 1215 switch (tolower(*str)) {
1174 case 'f': 1216 case 'f':
1175 slub_debug |= SLAB_DEBUG_FREE; 1217 slub_debug |= SLAB_CONSISTENCY_CHECKS;
1176 break; 1218 break;
1177 case 'z': 1219 case 'z':
1178 slub_debug |= SLAB_RED_ZONE; 1220 slub_debug |= SLAB_RED_ZONE;
@@ -1231,10 +1273,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
1231static inline int alloc_debug_processing(struct kmem_cache *s, 1273static inline int alloc_debug_processing(struct kmem_cache *s,
1232 struct page *page, void *object, unsigned long addr) { return 0; } 1274 struct page *page, void *object, unsigned long addr) { return 0; }
1233 1275
1234static inline struct kmem_cache_node *free_debug_processing( 1276static inline int free_debug_processing(
1235 struct kmem_cache *s, struct page *page, 1277 struct kmem_cache *s, struct page *page,
1236 void *head, void *tail, int bulk_cnt, 1278 void *head, void *tail, int bulk_cnt,
1237 unsigned long addr, unsigned long *flags) { return NULL; } 1279 unsigned long addr) { return 0; }
1238 1280
1239static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1281static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1240 { return 1; } 1282 { return 1; }
@@ -1281,36 +1323,6 @@ static inline void kfree_hook(const void *x)
1281 kasan_kfree_large(x); 1323 kasan_kfree_large(x);
1282} 1324}
1283 1325
1284static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1285 gfp_t flags)
1286{
1287 flags &= gfp_allowed_mask;
1288 lockdep_trace_alloc(flags);
1289 might_sleep_if(gfpflags_allow_blocking(flags));
1290
1291 if (should_failslab(s->object_size, flags, s->flags))
1292 return NULL;
1293
1294 return memcg_kmem_get_cache(s, flags);
1295}
1296
1297static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1298 size_t size, void **p)
1299{
1300 size_t i;
1301
1302 flags &= gfp_allowed_mask;
1303 for (i = 0; i < size; i++) {
1304 void *object = p[i];
1305
1306 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1307 kmemleak_alloc_recursive(object, s->object_size, 1,
1308 s->flags, flags);
1309 kasan_slab_alloc(s, object);
1310 }
1311 memcg_kmem_put_cache(s);
1312}
1313
1314static inline void slab_free_hook(struct kmem_cache *s, void *x) 1326static inline void slab_free_hook(struct kmem_cache *s, void *x)
1315{ 1327{
1316 kmemleak_free_recursive(x, s->flags); 1328 kmemleak_free_recursive(x, s->flags);
@@ -1470,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1470 set_freepointer(s, p, NULL); 1482 set_freepointer(s, p, NULL);
1471 } 1483 }
1472 1484
1473 page->freelist = start; 1485 page->freelist = fixup_red_left(s, start);
1474 page->inuse = page->objects; 1486 page->inuse = page->objects;
1475 page->frozen = 1; 1487 page->frozen = 1;
1476 1488
@@ -1506,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1506 int order = compound_order(page); 1518 int order = compound_order(page);
1507 int pages = 1 << order; 1519 int pages = 1 << order;
1508 1520
1509 if (kmem_cache_debug(s)) { 1521 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1510 void *p; 1522 void *p;
1511 1523
1512 slab_pad_check(s, page); 1524 slab_pad_check(s, page);
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2224 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 2236 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2225 return; 2237 return;
2226 2238
2227 pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2239 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2228 nid, gfpflags); 2240 nid, gfpflags, &gfpflags);
2229 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", 2241 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2230 s->name, s->object_size, s->size, oo_order(s->oo), 2242 s->name, s->object_size, s->size, oo_order(s->oo),
2231 oo_order(s->min)); 2243 oo_order(s->min));
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2642 stat(s, FREE_SLOWPATH); 2654 stat(s, FREE_SLOWPATH);
2643 2655
2644 if (kmem_cache_debug(s) && 2656 if (kmem_cache_debug(s) &&
2645 !(n = free_debug_processing(s, page, head, tail, cnt, 2657 !free_debug_processing(s, page, head, tail, cnt, addr))
2646 addr, &flags)))
2647 return; 2658 return;
2648 2659
2649 do { 2660 do {
@@ -2815,6 +2826,7 @@ struct detached_freelist {
2815 void *tail; 2826 void *tail;
2816 void *freelist; 2827 void *freelist;
2817 int cnt; 2828 int cnt;
2829 struct kmem_cache *s;
2818}; 2830};
2819 2831
2820/* 2832/*
@@ -2829,26 +2841,45 @@ struct detached_freelist {
2829 * synchronization primitive. Look ahead in the array is limited due 2841 * synchronization primitive. Look ahead in the array is limited due
2830 * to performance reasons. 2842 * to performance reasons.
2831 */ 2843 */
2832static int build_detached_freelist(struct kmem_cache *s, size_t size, 2844static inline
2833 void **p, struct detached_freelist *df) 2845int build_detached_freelist(struct kmem_cache *s, size_t size,
2846 void **p, struct detached_freelist *df)
2834{ 2847{
2835 size_t first_skipped_index = 0; 2848 size_t first_skipped_index = 0;
2836 int lookahead = 3; 2849 int lookahead = 3;
2837 void *object; 2850 void *object;
2851 struct page *page;
2838 2852
2839 /* Always re-init detached_freelist */ 2853 /* Always re-init detached_freelist */
2840 df->page = NULL; 2854 df->page = NULL;
2841 2855
2842 do { 2856 do {
2843 object = p[--size]; 2857 object = p[--size];
2858 /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
2844 } while (!object && size); 2859 } while (!object && size);
2845 2860
2846 if (!object) 2861 if (!object)
2847 return 0; 2862 return 0;
2848 2863
2864 page = virt_to_head_page(object);
2865 if (!s) {
2866 /* Handle kmalloc'ed objects */
2867 if (unlikely(!PageSlab(page))) {
2868 BUG_ON(!PageCompound(page));
2869 kfree_hook(object);
2870 __free_kmem_pages(page, compound_order(page));
2871 p[size] = NULL; /* mark object processed */
2872 return size;
2873 }
2874 /* Derive kmem_cache from object */
2875 df->s = page->slab_cache;
2876 } else {
2877 df->s = cache_from_obj(s, object); /* Support for memcg */
2878 }
2879
2849 /* Start new detached freelist */ 2880 /* Start new detached freelist */
2850 set_freepointer(s, object, NULL); 2881 df->page = page;
2851 df->page = virt_to_head_page(object); 2882 set_freepointer(df->s, object, NULL);
2852 df->tail = object; 2883 df->tail = object;
2853 df->freelist = object; 2884 df->freelist = object;
2854 p[size] = NULL; /* mark object processed */ 2885 p[size] = NULL; /* mark object processed */
@@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2862 /* df->page is always set at this point */ 2893 /* df->page is always set at this point */
2863 if (df->page == virt_to_head_page(object)) { 2894 if (df->page == virt_to_head_page(object)) {
2864 /* Opportunity build freelist */ 2895 /* Opportunity build freelist */
2865 set_freepointer(s, object, df->freelist); 2896 set_freepointer(df->s, object, df->freelist);
2866 df->freelist = object; 2897 df->freelist = object;
2867 df->cnt++; 2898 df->cnt++;
2868 p[size] = NULL; /* mark object processed */ 2899 p[size] = NULL; /* mark object processed */
@@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2881 return first_skipped_index; 2912 return first_skipped_index;
2882} 2913}
2883 2914
2884
2885/* Note that interrupts must be enabled when calling this function. */ 2915/* Note that interrupts must be enabled when calling this function. */
2886void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) 2916void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2887{ 2917{
2888 if (WARN_ON(!size)) 2918 if (WARN_ON(!size))
2889 return; 2919 return;
2890 2920
2891 do { 2921 do {
2892 struct detached_freelist df; 2922 struct detached_freelist df;
2893 struct kmem_cache *s;
2894
2895 /* Support for memcg */
2896 s = cache_from_obj(orig_s, p[size - 1]);
2897 2923
2898 size = build_detached_freelist(s, size, p, &df); 2924 size = build_detached_freelist(s, size, p, &df);
2899 if (unlikely(!df.page)) 2925 if (unlikely(!df.page))
2900 continue; 2926 continue;
2901 2927
2902 slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); 2928 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
2903 } while (likely(size)); 2929 } while (likely(size));
2904} 2930}
2905EXPORT_SYMBOL(kmem_cache_free_bulk); 2931EXPORT_SYMBOL(kmem_cache_free_bulk);
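
The bulk-free path builds a detached freelist: it repeatedly picks an unprocessed object, chains every other object that lives on the same slab page to it, and releases the whole chain with one slow-path call; carrying df->s per group is what allows the NULL-cache (kfree_bulk) and memcg cases to resolve the right cache. A simplified userspace model of just the grouping step (the kernel additionally limits its lookahead and links a real freelist instead of counting):

#include <stddef.h>
#include <stdio.h>

#define PAGE_GROUP(obj) ((unsigned long)(obj) >> 12)	/* stand-in for virt_to_head_page() */

static void bulk_free(void **p, size_t size)
{
	while (size) {
		size_t i = size;
		unsigned long group;
		unsigned int cnt;

		/* pick the last unprocessed object as the group leader */
		while (i && !p[i - 1])
			i--;
		if (!i)
			return;
		i--;
		group = PAGE_GROUP(p[i]);
		p[i] = NULL;		/* mark processed */
		cnt = 1;

		/* sweep the remaining entries for members of the same page */
		for (size_t j = 0; j < i; j++) {
			if (p[j] && PAGE_GROUP(p[j]) == group) {
				p[j] = NULL;
				cnt++;
			}
		}
		printf("free %u object(s) from page group %#lx\n", cnt, group);
		size = i;
	}
}

int main(void)
{
	void *objs[] = { (void *)0x1008, (void *)0x2010, (void *)0x1010,
			 (void *)0x2020, (void *)0x3000 };

	bulk_free(objs, 5);
	return 0;
}
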
@@ -3285,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3285 */ 3311 */
3286 size += 2 * sizeof(struct track); 3312 size += 2 * sizeof(struct track);
3287 3313
3288 if (flags & SLAB_RED_ZONE) 3314 if (flags & SLAB_RED_ZONE) {
3289 /* 3315 /*
3290 * Add some empty padding so that we can catch 3316 * Add some empty padding so that we can catch
3291 * overwrites from earlier objects rather than let 3317 * overwrites from earlier objects rather than let
@@ -3294,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3294 * of the object. 3320 * of the object.
3295 */ 3321 */
3296 size += sizeof(void *); 3322 size += sizeof(void *);
3323
3324 s->red_left_pad = sizeof(void *);
3325 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
3326 size += s->red_left_pad;
3327 }
3297#endif 3328#endif
3298 3329
3299 /* 3330 /*
@@ -3357,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3357 3388
3358#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3389#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3359 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3390 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3360 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3391 if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3361 /* Enable fast mode */ 3392 /* Enable fast mode */
3362 s->flags |= __CMPXCHG_DOUBLE; 3393 s->flags |= __CMPXCHG_DOUBLE;
3363#endif 3394#endif
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
4812 4843
4813static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4844static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4814{ 4845{
4815 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4846 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
4816} 4847}
4817 4848
4818static ssize_t sanity_checks_store(struct kmem_cache *s, 4849static ssize_t sanity_checks_store(struct kmem_cache *s,
4819 const char *buf, size_t length) 4850 const char *buf, size_t length)
4820{ 4851{
4821 s->flags &= ~SLAB_DEBUG_FREE; 4852 s->flags &= ~SLAB_CONSISTENCY_CHECKS;
4822 if (buf[0] == '1') { 4853 if (buf[0] == '1') {
4823 s->flags &= ~__CMPXCHG_DOUBLE; 4854 s->flags &= ~__CMPXCHG_DOUBLE;
4824 s->flags |= SLAB_DEBUG_FREE; 4855 s->flags |= SLAB_CONSISTENCY_CHECKS;
4825 } 4856 }
4826 return length; 4857 return length;
4827} 4858}
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4865 4896
4866 s->flags &= ~SLAB_RED_ZONE; 4897 s->flags &= ~SLAB_RED_ZONE;
4867 if (buf[0] == '1') { 4898 if (buf[0] == '1') {
4868 s->flags &= ~__CMPXCHG_DOUBLE;
4869 s->flags |= SLAB_RED_ZONE; 4899 s->flags |= SLAB_RED_ZONE;
4870 } 4900 }
4871 calculate_sizes(s, -1); 4901 calculate_sizes(s, -1);
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
4886 4916
4887 s->flags &= ~SLAB_POISON; 4917 s->flags &= ~SLAB_POISON;
4888 if (buf[0] == '1') { 4918 if (buf[0] == '1') {
4889 s->flags &= ~__CMPXCHG_DOUBLE;
4890 s->flags |= SLAB_POISON; 4919 s->flags |= SLAB_POISON;
4891 } 4920 }
4892 calculate_sizes(s, -1); 4921 calculate_sizes(s, -1);
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s)
5356 *p++ = 'd'; 5385 *p++ = 'd';
5357 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5386 if (s->flags & SLAB_RECLAIM_ACCOUNT)
5358 *p++ = 'a'; 5387 *p++ = 'a';
5359 if (s->flags & SLAB_DEBUG_FREE) 5388 if (s->flags & SLAB_CONSISTENCY_CHECKS)
5360 *p++ = 'F'; 5389 *p++ = 'F';
5361 if (!(s->flags & SLAB_NOTRACK)) 5390 if (!(s->flags & SLAB_NOTRACK))
5362 *p++ = 't'; 5391 *p++ = 't';
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
519static int 519static int
520invalidate_complete_page2(struct address_space *mapping, struct page *page) 520invalidate_complete_page2(struct address_space *mapping, struct page *page)
521{ 521{
522 struct mem_cgroup *memcg;
523 unsigned long flags; 522 unsigned long flags;
524 523
525 if (page->mapping != mapping) 524 if (page->mapping != mapping)
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
528 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) 527 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
529 return 0; 528 return 0;
530 529
531 memcg = mem_cgroup_begin_page_stat(page);
532 spin_lock_irqsave(&mapping->tree_lock, flags); 530 spin_lock_irqsave(&mapping->tree_lock, flags);
533 if (PageDirty(page)) 531 if (PageDirty(page))
534 goto failed; 532 goto failed;
535 533
536 BUG_ON(page_has_private(page)); 534 BUG_ON(page_has_private(page));
537 __delete_from_page_cache(page, NULL, memcg); 535 __delete_from_page_cache(page, NULL);
538 spin_unlock_irqrestore(&mapping->tree_lock, flags); 536 spin_unlock_irqrestore(&mapping->tree_lock, flags);
539 mem_cgroup_end_page_stat(memcg);
540 537
541 if (mapping->a_ops->freepage) 538 if (mapping->a_ops->freepage)
542 mapping->a_ops->freepage(page); 539 mapping->a_ops->freepage(page);
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
545 return 1; 542 return 1;
546failed: 543failed:
547 spin_unlock_irqrestore(&mapping->tree_lock, flags); 544 spin_unlock_irqrestore(&mapping->tree_lock, flags);
548 mem_cgroup_end_page_stat(memcg);
549 return 0; 545 return 0;
550} 546}
551 547
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71b1c29948db..dd984470248f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
195{ 195{
196 unsigned long nr; 196 unsigned long nr;
197 197
198 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 198 nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
199 zone_page_state(zone, NR_INACTIVE_FILE) + 199 zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
200 zone_page_state(zone, NR_ISOLATED_FILE); 200 zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
201 201
202 if (get_nr_swap_pages() > 0) 202 if (get_nr_swap_pages() > 0)
203 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 203 nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
204 zone_page_state(zone, NR_INACTIVE_ANON) + 204 zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
205 zone_page_state(zone, NR_ISOLATED_ANON); 205 zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
206 206
207 return nr; 207 return nr;
208} 208}
209 209
210bool zone_reclaimable(struct zone *zone) 210bool zone_reclaimable(struct zone *zone)
211{ 211{
212 return zone_page_state(zone, NR_PAGES_SCANNED) < 212 return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
213 zone_reclaimable_pages(zone) * 6; 213 zone_reclaimable_pages(zone) * 6;
214} 214}
215 215
216static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 216unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
217{ 217{
218 if (!mem_cgroup_disabled()) 218 if (!mem_cgroup_disabled())
219 return mem_cgroup_get_lru_size(lruvec, lru); 219 return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
228{ 228{
229 size_t size = sizeof(*shrinker->nr_deferred); 229 size_t size = sizeof(*shrinker->nr_deferred);
230 230
231 /*
232 * If we only have one possible node in the system anyway, save
233 * ourselves the trouble and disable NUMA aware behavior. This way we
234 * will save memory and some small loop time later.
235 */
236 if (nr_node_ids == 1)
237 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
238
239 if (shrinker->flags & SHRINKER_NUMA_AWARE) 231 if (shrinker->flags & SHRINKER_NUMA_AWARE)
240 size *= nr_node_ids; 232 size *= nr_node_ids;
241 233
@@ -611,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
611 bool reclaimed) 603 bool reclaimed)
612{ 604{
613 unsigned long flags; 605 unsigned long flags;
614 struct mem_cgroup *memcg;
615 606
616 BUG_ON(!PageLocked(page)); 607 BUG_ON(!PageLocked(page));
617 BUG_ON(mapping != page_mapping(page)); 608 BUG_ON(mapping != page_mapping(page));
618 609
619 memcg = mem_cgroup_begin_page_stat(page);
620 spin_lock_irqsave(&mapping->tree_lock, flags); 610 spin_lock_irqsave(&mapping->tree_lock, flags);
621 /* 611 /*
622 * The non racy check for a busy page. 612 * The non racy check for a busy page.
@@ -656,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
656 mem_cgroup_swapout(page, swap); 646 mem_cgroup_swapout(page, swap);
657 __delete_from_swap_cache(page); 647 __delete_from_swap_cache(page);
658 spin_unlock_irqrestore(&mapping->tree_lock, flags); 648 spin_unlock_irqrestore(&mapping->tree_lock, flags);
659 mem_cgroup_end_page_stat(memcg);
660 swapcache_free(swap); 649 swapcache_free(swap);
661 } else { 650 } else {
662 void (*freepage)(struct page *); 651 void (*freepage)(struct page *);
@@ -682,9 +671,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
682 if (reclaimed && page_is_file_cache(page) && 671 if (reclaimed && page_is_file_cache(page) &&
683 !mapping_exiting(mapping) && !dax_mapping(mapping)) 672 !mapping_exiting(mapping) && !dax_mapping(mapping))
684 shadow = workingset_eviction(mapping, page); 673 shadow = workingset_eviction(mapping, page);
685 __delete_from_page_cache(page, shadow, memcg); 674 __delete_from_page_cache(page, shadow);
686 spin_unlock_irqrestore(&mapping->tree_lock, flags); 675 spin_unlock_irqrestore(&mapping->tree_lock, flags);
687 mem_cgroup_end_page_stat(memcg);
688 676
689 if (freepage != NULL) 677 if (freepage != NULL)
690 freepage(page); 678 freepage(page);
@@ -694,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
694 682
695cannot_free: 683cannot_free:
696 spin_unlock_irqrestore(&mapping->tree_lock, flags); 684 spin_unlock_irqrestore(&mapping->tree_lock, flags);
697 mem_cgroup_end_page_stat(memcg);
698 return 0; 685 return 0;
699} 686}
700 687
@@ -1931,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
1931 unsigned long inactive; 1918 unsigned long inactive;
1932 unsigned long active; 1919 unsigned long active;
1933 1920
1934 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1921 inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
1935 active = get_lru_size(lruvec, LRU_ACTIVE_FILE); 1922 active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
1936 1923
1937 return active > inactive; 1924 return active > inactive;
1938} 1925}
@@ -2071,7 +2058,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2071 * system is under heavy pressure. 2058 * system is under heavy pressure.
2072 */ 2059 */
2073 if (!inactive_file_is_low(lruvec) && 2060 if (!inactive_file_is_low(lruvec) &&
2074 get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { 2061 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
2075 scan_balance = SCAN_FILE; 2062 scan_balance = SCAN_FILE;
2076 goto out; 2063 goto out;
2077 } 2064 }
@@ -2097,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2097 * anon in [0], file in [1] 2084 * anon in [0], file in [1]
2098 */ 2085 */
2099 2086
2100 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + 2087 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
2101 get_lru_size(lruvec, LRU_INACTIVE_ANON); 2088 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
2102 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 2089 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
2103 get_lru_size(lruvec, LRU_INACTIVE_FILE); 2090 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
2104 2091
2105 spin_lock_irq(&zone->lru_lock); 2092 spin_lock_irq(&zone->lru_lock);
2106 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 2093 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2125,7 @@ out:
2138 unsigned long size; 2125 unsigned long size;
2139 unsigned long scan; 2126 unsigned long scan;
2140 2127
2141 size = get_lru_size(lruvec, lru); 2128 size = lruvec_lru_size(lruvec, lru);
2142 scan = size >> sc->priority; 2129 scan = size >> sc->priority;
2143 2130
2144 if (!scan && pass && force_scan) 2131 if (!scan && pass && force_scan)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b373..69ce64f7b8d7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
924#endif 924#endif
925 925
926#ifdef CONFIG_PROC_FS 926#ifdef CONFIG_PROC_FS
927static char * const migratetype_names[MIGRATE_TYPES] = {
928 "Unmovable",
929 "Movable",
930 "Reclaimable",
931 "HighAtomic",
932#ifdef CONFIG_CMA
933 "CMA",
934#endif
935#ifdef CONFIG_MEMORY_ISOLATION
936 "Isolate",
937#endif
938};
939
940static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 927static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
941 struct zone *zone) 928 struct zone *zone)
942{ 929{
@@ -1133,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1133#ifdef CONFIG_PAGE_OWNER 1120#ifdef CONFIG_PAGE_OWNER
1134 int mtype; 1121 int mtype;
1135 1122
1136 if (!page_owner_inited) 1123 if (!static_branch_unlikely(&page_owner_inited))
1137 return; 1124 return;
1138 1125
1139 drain_all_pages(NULL); 1126 drain_all_pages(NULL);
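
page_owner_inited is now a static key, so the common no-page_owner case costs a patched-out branch instead of a memory load. The usual jump-label pattern, as a hedged sketch of kernel code (the feature name is made up, the API calls are the real ones from linux/jump_label.h):

#include <linux/jump_label.h>

/* Declared false: the branch below is compiled out until enabled. */
static DEFINE_STATIC_KEY_FALSE(myfeature_inited);

static void myfeature_init(void)
{
	/* ... set up whatever state the feature needs ... */
	static_branch_enable(&myfeature_inited);	/* patch the branch in */
}

static void myfeature_hook(void)
{
	if (!static_branch_unlikely(&myfeature_inited))
		return;			/* near-zero cost while disabled */

	/* ... slow, debug-only work ... */
}
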
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..6130ba0b2641 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
152 * refault distance will immediately activate the refaulting page. 152 * refault distance will immediately activate the refaulting page.
153 */ 153 */
154 154
155static void *pack_shadow(unsigned long eviction, struct zone *zone) 155#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
156 ZONES_SHIFT + NODES_SHIFT + \
157 MEM_CGROUP_ID_SHIFT)
158#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
159
160/*
161 * Eviction timestamps need to be able to cover the full range of
162 * actionable refaults. However, bits are tight in the radix tree
163 * entry, and after storing the identifier for the lruvec there might
164 * not be enough left to represent every single actionable refault. In
165 * that case, we have to sacrifice granularity for distance, and group
166 * evictions into coarser buckets by shaving off lower timestamp bits.
167 */
168static unsigned int bucket_order __read_mostly;
169
170static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
156{ 171{
172 eviction >>= bucket_order;
173 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
157 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); 174 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
158 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); 175 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
159 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); 176 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
161 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 178 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
162} 179}
163 180
164static void unpack_shadow(void *shadow, 181static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
165 struct zone **zone, 182 unsigned long *evictionp)
166 unsigned long *distance)
167{ 183{
168 unsigned long entry = (unsigned long)shadow; 184 unsigned long entry = (unsigned long)shadow;
169 unsigned long eviction; 185 int memcgid, nid, zid;
170 unsigned long refault;
171 unsigned long mask;
172 int zid, nid;
173 186
174 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; 187 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
175 zid = entry & ((1UL << ZONES_SHIFT) - 1); 188 zid = entry & ((1UL << ZONES_SHIFT) - 1);
176 entry >>= ZONES_SHIFT; 189 entry >>= ZONES_SHIFT;
177 nid = entry & ((1UL << NODES_SHIFT) - 1); 190 nid = entry & ((1UL << NODES_SHIFT) - 1);
178 entry >>= NODES_SHIFT; 191 entry >>= NODES_SHIFT;
179 eviction = entry; 192 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
193 entry >>= MEM_CGROUP_ID_SHIFT;
180 194
181 *zone = NODE_DATA(nid)->node_zones + zid; 195 *memcgidp = memcgid;
182 196 *zonep = NODE_DATA(nid)->node_zones + zid;
183 refault = atomic_long_read(&(*zone)->inactive_age); 197 *evictionp = entry << bucket_order;
184 mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
185 RADIX_TREE_EXCEPTIONAL_SHIFT);
186 /*
187 * The unsigned subtraction here gives an accurate distance
188 * across inactive_age overflows in most cases.
189 *
190 * There is a special case: usually, shadow entries have a
191 * short lifetime and are either refaulted or reclaimed along
192 * with the inode before they get too old. But it is not
193 * impossible for the inactive_age to lap a shadow entry in
 194 the field, which can then result in a false small
195 * refault distance, leading to a false activation should this
196 * old entry actually refault again. However, earlier kernels
197 * used to deactivate unconditionally with *every* reclaim
198 * invocation for the longest time, so the occasional
199 * inappropriate activation leading to pressure on the active
200 * list is not a problem.
201 */
202 *distance = (refault - eviction) & mask;
203} 198}
204 199
205/** 200/**
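
The new shadow entry packs, from the low bits up: the radix-tree exceptional tag, the zone index, the node id, the memcg id, and finally the bucketed eviction timestamp. A self-contained userspace sketch of that round trip, assuming a 64-bit build and example field widths (the kernel derives ZONES_SHIFT and NODES_SHIFT from the config; MEM_CGROUP_ID_SHIFT is 16 in this series; bucket_order is computed at boot, see workingset_init() below).

/*
 * Standalone model of the shadow entry layout in this hunk.  The widths
 * are examples only; the real values come from the kernel config.
 */
#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_SHIFT  2        /* low tag bits claimed by the radix tree */
#define ZONES_SHIFT        2
#define NODES_SHIFT        6
#define MEMCG_ID_SHIFT     16
static unsigned int bucket_order;   /* stays 0 unless memory is very large */

static unsigned long pack(int memcgid, int nid, int zid, unsigned long eviction)
{
        eviction >>= bucket_order;
        eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | nid;
        eviction = (eviction << ZONES_SHIFT) | zid;
        return (eviction << EXCEPTIONAL_SHIFT) | 0x2;   /* exceptional tag */
}

static void unpack(unsigned long entry, int *memcgid, int *nid, int *zid,
                   unsigned long *eviction)
{
        entry >>= EXCEPTIONAL_SHIFT;
        *zid = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        *nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        *memcgid = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
        entry >>= MEMCG_ID_SHIFT;
        *eviction = entry << bucket_order;
}

int main(void)
{
        int memcgid, nid, zid;
        unsigned long ev;

        unpack(pack(42, 3, 1, 123456), &memcgid, &nid, &zid, &ev);
        assert(memcgid == 42 && nid == 3 && zid == 1 && ev == 123456);
        printf("round-trip ok: memcg=%d nid=%d zid=%d eviction=%lu\n",
               memcgid, nid, zid, ev);
        return 0;
}

With these example widths, EVICTION_SHIFT is 2 + 2 + 6 + 16 = 26, leaving 38 bits of timestamp on a 64-bit build before any bucketing is required.
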
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
212 */ 207 */
213void *workingset_eviction(struct address_space *mapping, struct page *page) 208void *workingset_eviction(struct address_space *mapping, struct page *page)
214{ 209{
210 struct mem_cgroup *memcg = page_memcg(page);
215 struct zone *zone = page_zone(page); 211 struct zone *zone = page_zone(page);
212 int memcgid = mem_cgroup_id(memcg);
216 unsigned long eviction; 213 unsigned long eviction;
214 struct lruvec *lruvec;
217 215
218 eviction = atomic_long_inc_return(&zone->inactive_age); 216 /* Page is fully exclusive and pins page->mem_cgroup */
219 return pack_shadow(eviction, zone); 217 VM_BUG_ON_PAGE(PageLRU(page), page);
218 VM_BUG_ON_PAGE(page_count(page), page);
219 VM_BUG_ON_PAGE(!PageLocked(page), page);
220
221 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
222 eviction = atomic_long_inc_return(&lruvec->inactive_age);
223 return pack_shadow(memcgid, zone, eviction);
220} 224}
221 225
222/** 226/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
231bool workingset_refault(void *shadow) 235bool workingset_refault(void *shadow)
232{ 236{
233 unsigned long refault_distance; 237 unsigned long refault_distance;
238 unsigned long active_file;
239 struct mem_cgroup *memcg;
240 unsigned long eviction;
241 struct lruvec *lruvec;
242 unsigned long refault;
234 struct zone *zone; 243 struct zone *zone;
244 int memcgid;
245
246 unpack_shadow(shadow, &memcgid, &zone, &eviction);
247
248 rcu_read_lock();
249 /*
250 * Look up the memcg associated with the stored ID. It might
251 * have been deleted since the page's eviction.
252 *
253 * Note that in rare events the ID could have been recycled
254 * for a new cgroup that refaults a shared page. This is
255 * impossible to tell from the available data. However, this
256 * should be a rare and limited disturbance, and activations
257 * are always speculative anyway. Ultimately, it's the aging
258 * algorithm's job to shake out the minimum access frequency
259 * for the active cache.
260 *
261 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
262 * would be better if the root_mem_cgroup existed in all
263 * configurations instead.
264 */
265 memcg = mem_cgroup_from_id(memcgid);
266 if (!mem_cgroup_disabled() && !memcg) {
267 rcu_read_unlock();
268 return false;
269 }
270 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
271 refault = atomic_long_read(&lruvec->inactive_age);
272 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
273 rcu_read_unlock();
274
275 /*
276 * The unsigned subtraction here gives an accurate distance
277 * across inactive_age overflows in most cases.
278 *
279 * There is a special case: usually, shadow entries have a
280 * short lifetime and are either refaulted or reclaimed along
281 * with the inode before they get too old. But it is not
282 * impossible for the inactive_age to lap a shadow entry in
 283 the field, which can then result in a false small
284 * refault distance, leading to a false activation should this
285 * old entry actually refault again. However, earlier kernels
286 * used to deactivate unconditionally with *every* reclaim
287 * invocation for the longest time, so the occasional
288 * inappropriate activation leading to pressure on the active
289 * list is not a problem.
290 */
291 refault_distance = (refault - eviction) & EVICTION_MASK;
235 292
236 unpack_shadow(shadow, &zone, &refault_distance);
237 inc_zone_state(zone, WORKINGSET_REFAULT); 293 inc_zone_state(zone, WORKINGSET_REFAULT);
238 294
239 if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { 295 if (refault_distance <= active_file) {
240 inc_zone_state(zone, WORKINGSET_ACTIVATE); 296 inc_zone_state(zone, WORKINGSET_ACTIVATE);
241 return true; 297 return true;
242 } 298 }
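
The activation test above reduces to: how many evictions has this lruvec seen between the page's eviction and its refault, and does that distance fit within the currently active file cache? A toy model of the unsigned-wraparound distance check, reusing the example EVICTION_SHIFT from the earlier sketch and assuming a 64-bit build.

#include <assert.h>

#define EVICTION_SHIFT  26      /* example: tag + zone + node + memcg bits */
#define EVICTION_MASK   (~0UL >> EVICTION_SHIFT)

static int should_activate(unsigned long eviction, unsigned long refault,
                           unsigned long active_file)
{
        /* Unsigned subtraction stays correct across inactive_age wrap. */
        unsigned long distance = (refault - eviction) & EVICTION_MASK;

        return distance <= active_file;
}

int main(void)
{
        /* Recently evicted page: only 100 evictions in between, activate. */
        assert(should_activate(1000, 1100, 4096));
        /* Distance larger than the active list: leave it inactive. */
        assert(!should_activate(1000, 1000000, 4096));
        /* Counter wrapped between eviction and refault: still ~200 apart. */
        assert(should_activate(EVICTION_MASK - 100, 100, 4096));
        return 0;
}
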
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
249 */ 305 */
250void workingset_activation(struct page *page) 306void workingset_activation(struct page *page)
251{ 307{
252 atomic_long_inc(&page_zone(page)->inactive_age); 308 struct lruvec *lruvec;
309
310 lock_page_memcg(page);
311 /*
312 * Filter non-memcg pages here, e.g. unmap can call
313 * mark_page_accessed() on VDSO pages.
314 *
315 * XXX: See workingset_refault() - this should return
316 * root_mem_cgroup even for !CONFIG_MEMCG.
317 */
318 if (!mem_cgroup_disabled() && !page_memcg(page))
319 goto out;
320 lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
321 atomic_long_inc(&lruvec->inactive_age);
322out:
323 unlock_page_memcg(page);
253} 324}
254 325
255/* 326/*
@@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key;
398 469
399static int __init workingset_init(void) 470static int __init workingset_init(void)
400{ 471{
472 unsigned int timestamp_bits;
473 unsigned int max_order;
401 int ret; 474 int ret;
402 475
476 BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
477 /*
478 * Calculate the eviction bucket size to cover the longest
479 * actionable refault distance, which is currently half of
480 * memory (totalram_pages/2). However, memory hotplug may add
481 * some more pages at runtime, so keep working with up to
482 * double the initial memory by using totalram_pages as-is.
483 */
484 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
485 max_order = fls_long(totalram_pages - 1);
486 if (max_order > timestamp_bits)
487 bucket_order = max_order - timestamp_bits;
488 printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
489 timestamp_bits, max_order, bucket_order);
490
403 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); 491 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
404 if (ret) 492 if (ret)
405 goto err; 493 goto err;
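
workingset_init() trades timestamp granularity for range: when the machine has more pages than the timestamp field can count, bucket_order low bits are shaved off so that distances up to roughly total memory stay representable. A standalone sketch of that calculation, with fls_long() open-coded because it is a kernel helper, EVICTION_SHIFT again using the example widths from above, and 64-bit longs assumed; the memory sizes are illustrative (4KB pages).

#include <stdio.h>

#define BITS_PER_LONG   64
#define EVICTION_SHIFT  26      /* example: tag + zone + node + memcg bits */

/* Minimal stand-in for the kernel's fls_long(): position of highest set bit. */
static unsigned int fls_long(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static unsigned int bucket_order_for(unsigned long totalram_pages)
{
        unsigned int timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        unsigned int max_order = fls_long(totalram_pages - 1);

        return max_order > timestamp_bits ? max_order - timestamp_bits : 0;
}

int main(void)
{
        /* 16GB of 4K pages: 4M pages, far below 2^38 -> no bucketing. */
        printf("16GB : bucket_order=%u\n", bucket_order_for(4UL << 20));
        /* A hypothetical 2PB machine: 2^39 pages -> drop one timestamp bit. */
        printf("2PB  : bucket_order=%u\n", bucket_order_for(1UL << 39));
        return 0;
}

With these example widths, the 38 timestamp bits already cover 2^38 evictions, so bucket_order stays 0 on all but enormous 64-bit machines; on 32-bit, where only a few timestamp bits remain after the identifier fields, the bucketing kicks in at ordinary memory sizes.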