author     Linus Torvalds <torvalds@linux-foundation.org>   2016-03-16 14:51:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-03-16 14:51:08 -0400
commit     271ecc5253e2b317d729d366560789cd7f93836c
tree       d3a60bc4dfa8245ff934f357f2367db76b59e7cf /mm
parent     aa6865d836418eb2ba888a4cb1318a28e9aa2e0c
parent     63c06227a22b098a3849c5c99e836aea161ca0d7
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:
- some misc things
- ocfs2 updates
- about half of MM
- checkpatch updates
- autofs4 update
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
autofs4: fix string.h include in auto_dev-ioctl.h
autofs4: use pr_xxx() macros directly for logging
autofs4: change log print macros to not insert newline
autofs4: make autofs log prints consistent
autofs4: fix some white space errors
autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked()
autofs4: fix coding style line length in autofs4_wait()
autofs4: fix coding style problem in autofs4_get_set_timeout()
autofs4: coding style fixes
autofs: show pipe inode in mount options
kallsyms: add support for relative offsets in kallsyms address table
kallsyms: don't overload absolute symbol type for percpu symbols
x86: kallsyms: disable absolute percpu symbols on !SMP
checkpatch: fix another left brace warning
checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses
checkpatch: warn on bare unsigned or signed declarations without int
checkpatch: exclude asm volatile from complex macro check
mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate()
mm: migrate: consolidate mem_cgroup_migrate() calls
mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 57
-rw-r--r-- | mm/Makefile | 2
-rw-r--r-- | mm/compaction.c | 93
-rw-r--r-- | mm/debug.c | 165
-rw-r--r-- | mm/failslab.c | 12
-rw-r--r-- | mm/filemap.c | 113
-rw-r--r-- | mm/huge_memory.c | 20
-rw-r--r-- | mm/internal.h | 18
-rw-r--r-- | mm/kmemcheck.c | 3
-rw-r--r-- | mm/madvise.c | 19
-rw-r--r-- | mm/memblock.c | 8
-rw-r--r-- | mm/memcontrol.c | 92
-rw-r--r-- | mm/memory-failure.c | 2
-rw-r--r-- | mm/memory.c | 7
-rw-r--r-- | mm/memory_hotplug.c | 30
-rw-r--r-- | mm/mempolicy.c | 4
-rw-r--r-- | mm/migrate.c | 23
-rw-r--r-- | mm/oom_kill.c | 7
-rw-r--r-- | mm/page-writeback.c | 62
-rw-r--r-- | mm/page_alloc.c | 295
-rw-r--r-- | mm/page_ext.c | 10
-rw-r--r-- | mm/page_owner.c | 100
-rw-r--r-- | mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67
-rw-r--r-- | mm/rmap.c | 16
-rw-r--r-- | mm/shmem.c | 2
-rw-r--r-- | mm/slab.c | 1037
-rw-r--r-- | mm/slab.h | 69
-rw-r--r-- | mm/slab_common.c | 8
-rw-r--r-- | mm/slub.c | 325
-rw-r--r-- | mm/truncate.c | 6
-rw-r--r-- | mm/vmscan.c | 47
-rw-r--r-- | mm/vmstat.c | 15
-rw-r--r-- | mm/workingset.c | 160
33 files changed, 1707 insertions(+), 1187 deletions(-)
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..5c50b238b770 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC | |||
16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
17 | ---help--- | 17 | ---help--- |
18 | Unmap pages from the kernel linear mapping after free_pages(). | 18 | Unmap pages from the kernel linear mapping after free_pages(). |
19 | This results in a large slowdown, but helps to find certain types | 19 | Depending on runtime enablement, this results in a small or large |
20 | of memory corruption. | 20 | slowdown, but helps to find certain types of memory corruption. |
21 | 21 | ||
22 | For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, | 22 | For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, |
23 | fill the pages with poison patterns after free_pages() and verify | 23 | fill the pages with poison patterns after free_pages() and verify |
@@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC | |||
26 | that would result in incorrect warnings of memory corruption after | 26 | that would result in incorrect warnings of memory corruption after |
27 | a resume because free pages are not saved to the suspend image. | 27 | a resume because free pages are not saved to the suspend image. |
28 | 28 | ||
29 | By default this option will have a small overhead, e.g. by not | ||
30 | allowing the kernel mapping to be backed by large pages on some | ||
31 | architectures. Even bigger overhead comes when the debugging is | ||
32 | enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc | ||
33 | command line parameter. | ||
34 | |||
35 | config DEBUG_PAGEALLOC_ENABLE_DEFAULT | ||
36 | bool "Enable debug page memory allocations by default?" | ||
37 | default n | ||
38 | depends on DEBUG_PAGEALLOC | ||
39 | ---help--- | ||
40 | Enable debug page memory allocations by default? This value | ||
41 | can be overridden by debug_pagealloc=off|on. | ||
42 | |||
29 | config PAGE_POISONING | 43 | config PAGE_POISONING |
30 | bool | 44 | bool "Poison pages after freeing" |
45 | select PAGE_EXTENSION | ||
46 | select PAGE_POISONING_NO_SANITY if HIBERNATION | ||
47 | ---help--- | ||
48 | Fill the pages with poison patterns after free_pages() and verify | ||
49 | the patterns before alloc_pages. The filling of the memory helps | ||
50 | reduce the risk of information leaks from freed data. This does | ||
51 | have a potential performance impact. | ||
52 | |||
53 | Note that "poison" here is not the same thing as the "HWPoison" | ||
54 | for CONFIG_MEMORY_FAILURE. This is software poisoning only. | ||
55 | |||
56 | If unsure, say N | ||
57 | |||
58 | config PAGE_POISONING_NO_SANITY | ||
59 | depends on PAGE_POISONING | ||
60 | bool "Only poison, don't sanity check" | ||
61 | ---help--- | ||
62 | Skip the sanity checking on alloc, only fill the pages with | ||
63 | poison on free. This reduces some of the overhead of the | ||
64 | poisoning feature. | ||
65 | |||
66 | If you are only interested in sanitization, say Y. Otherwise | ||
67 | say N. | ||
68 | |||
69 | config PAGE_POISONING_ZERO | ||
70 | bool "Use zero for poisoning instead of random data" | ||
71 | depends on PAGE_POISONING | ||
72 | ---help--- | ||
73 | Instead of using the existing poison value, fill the pages with | ||
74 | zeros. This makes it harder to detect when errors are occurring | ||
75 | due to sanitization but the zeroing at free means that it is | ||
76 | no longer necessary to write zeros when GFP_ZERO is used on | ||
77 | allocation. | ||
78 | |||
79 | Enabling page poisoning with this option will disable hibernation | ||
80 | |||
81 | If unsure, say N | ||
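The help texts above describe the mechanism in prose: CONFIG_PAGE_POISONING fills pages with a poison pattern when they are freed and verifies the pattern when they are allocated again, with runtime behaviour further controlled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc=off|on command line. As a hedged illustration only, the idea reduces to the userspace sketch below; the buffer size, helper names and flow are stand-ins rather than the kernel's implementation, though 0xaa does mirror the kernel's PAGE_POISON byte.

    /* Hedged userspace sketch of poison-on-free / verify-on-alloc.
     * Everything here is illustrative; only the 0xaa pattern is taken
     * from the kernel's PAGE_POISON definition. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_BYTES  4096
    #define POISON_BYTE 0xaa

    static void poison_page(unsigned char *page)
    {
        memset(page, POISON_BYTE, PAGE_BYTES);      /* done when the page is freed */
    }

    static int check_poison_page(const unsigned char *page)
    {
        size_t i;

        for (i = 0; i < PAGE_BYTES; i++)            /* done when it is handed out again */
            if (page[i] != POISON_BYTE) {
                printf("poison damaged at offset %zu: late write to freed memory?\n", i);
                return 0;
            }
        return 1;
    }

    int main(void)
    {
        unsigned char *page = malloc(PAGE_BYTES);

        if (!page)
            return 1;

        poison_page(page);                  /* "free" the page */
        page[100] = 0;                      /* buggy use-after-free write */

        if (!check_poison_page(page))       /* "allocate" it again */
            puts("page poisoning would have caught this page");

        free(page);
        return 0;
    }

With PAGE_POISONING_ZERO the pattern becomes zero, which is cheaper (GFP_ZERO allocations need no extra clearing) but, as the help text notes, harder to distinguish from legitimate zero writes.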
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..cfdd481d27a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | |||
48 | obj-$(CONFIG_SLOB) += slob.o | 48 | obj-$(CONFIG_SLOB) += slob.o |
49 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 49 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
50 | obj-$(CONFIG_KSM) += ksm.o | 50 | obj-$(CONFIG_KSM) += ksm.o |
51 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 51 | obj-$(CONFIG_PAGE_POISONING) += page_poison.o |
52 | obj-$(CONFIG_SLAB) += slab.o | 52 | obj-$(CONFIG_SLAB) += slab.o |
53 | obj-$(CONFIG_SLUB) += slub.o | 53 | obj-$(CONFIG_SLUB) += slub.o |
54 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | 54 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o |
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..93f71d968098 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype) | |||
71 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 71 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
72 | } | 72 | } |
73 | 73 | ||
74 | /* | ||
75 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
76 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
77 | * with the migration of free compaction scanner. The scanners then need to | ||
78 | * use only pfn_valid_within() check for arches that allow holes within | ||
79 | * pageblocks. | ||
80 | * | ||
81 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
82 | * | ||
83 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
84 | * i.e. it's possible that all pages within a zones range of pages do not | ||
85 | * belong to a single zone. We assume that a border between node0 and node1 | ||
86 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
87 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
88 | * the first and last page of a pageblock and avoid checking each individual | ||
89 | * page in a pageblock. | ||
90 | */ | ||
91 | static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | ||
92 | unsigned long end_pfn, struct zone *zone) | ||
93 | { | ||
94 | struct page *start_page; | ||
95 | struct page *end_page; | ||
96 | |||
97 | /* end_pfn is one past the range we are checking */ | ||
98 | end_pfn--; | ||
99 | |||
100 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
101 | return NULL; | ||
102 | |||
103 | start_page = pfn_to_page(start_pfn); | ||
104 | |||
105 | if (page_zone(start_page) != zone) | ||
106 | return NULL; | ||
107 | |||
108 | end_page = pfn_to_page(end_pfn); | ||
109 | |||
110 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
111 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
112 | return NULL; | ||
113 | |||
114 | return start_page; | ||
115 | } | ||
116 | |||
117 | #ifdef CONFIG_COMPACTION | 74 | #ifdef CONFIG_COMPACTION |
118 | 75 | ||
119 | /* Do not skip compaction more than 64 times */ | 76 | /* Do not skip compaction more than 64 times */ |
@@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone) | |||
200 | { | 157 | { |
201 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; | 158 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; |
202 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; | 159 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; |
203 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | 160 | zone->compact_cached_free_pfn = |
161 | round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); | ||
204 | } | 162 | } |
205 | 163 | ||
206 | /* | 164 | /* |
@@ -554,13 +512,17 @@ unsigned long | |||
554 | isolate_freepages_range(struct compact_control *cc, | 512 | isolate_freepages_range(struct compact_control *cc, |
555 | unsigned long start_pfn, unsigned long end_pfn) | 513 | unsigned long start_pfn, unsigned long end_pfn) |
556 | { | 514 | { |
557 | unsigned long isolated, pfn, block_end_pfn; | 515 | unsigned long isolated, pfn, block_start_pfn, block_end_pfn; |
558 | LIST_HEAD(freelist); | 516 | LIST_HEAD(freelist); |
559 | 517 | ||
560 | pfn = start_pfn; | 518 | pfn = start_pfn; |
519 | block_start_pfn = pfn & ~(pageblock_nr_pages - 1); | ||
520 | if (block_start_pfn < cc->zone->zone_start_pfn) | ||
521 | block_start_pfn = cc->zone->zone_start_pfn; | ||
561 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 522 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
562 | 523 | ||
563 | for (; pfn < end_pfn; pfn += isolated, | 524 | for (; pfn < end_pfn; pfn += isolated, |
525 | block_start_pfn = block_end_pfn, | ||
564 | block_end_pfn += pageblock_nr_pages) { | 526 | block_end_pfn += pageblock_nr_pages) { |
565 | /* Protect pfn from changing by isolate_freepages_block */ | 527 | /* Protect pfn from changing by isolate_freepages_block */ |
566 | unsigned long isolate_start_pfn = pfn; | 528 | unsigned long isolate_start_pfn = pfn; |
@@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc, | |||
573 | * scanning range to right one. | 535 | * scanning range to right one. |
574 | */ | 536 | */ |
575 | if (pfn >= block_end_pfn) { | 537 | if (pfn >= block_end_pfn) { |
538 | block_start_pfn = pfn & ~(pageblock_nr_pages - 1); | ||
576 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 539 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
577 | block_end_pfn = min(block_end_pfn, end_pfn); | 540 | block_end_pfn = min(block_end_pfn, end_pfn); |
578 | } | 541 | } |
579 | 542 | ||
580 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) | 543 | if (!pageblock_pfn_to_page(block_start_pfn, |
544 | block_end_pfn, cc->zone)) | ||
581 | break; | 545 | break; |
582 | 546 | ||
583 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | 547 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, |
@@ -863,18 +827,23 @@ unsigned long | |||
863 | isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | 827 | isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, |
864 | unsigned long end_pfn) | 828 | unsigned long end_pfn) |
865 | { | 829 | { |
866 | unsigned long pfn, block_end_pfn; | 830 | unsigned long pfn, block_start_pfn, block_end_pfn; |
867 | 831 | ||
868 | /* Scan block by block. First and last block may be incomplete */ | 832 | /* Scan block by block. First and last block may be incomplete */ |
869 | pfn = start_pfn; | 833 | pfn = start_pfn; |
834 | block_start_pfn = pfn & ~(pageblock_nr_pages - 1); | ||
835 | if (block_start_pfn < cc->zone->zone_start_pfn) | ||
836 | block_start_pfn = cc->zone->zone_start_pfn; | ||
870 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 837 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
871 | 838 | ||
872 | for (; pfn < end_pfn; pfn = block_end_pfn, | 839 | for (; pfn < end_pfn; pfn = block_end_pfn, |
840 | block_start_pfn = block_end_pfn, | ||
873 | block_end_pfn += pageblock_nr_pages) { | 841 | block_end_pfn += pageblock_nr_pages) { |
874 | 842 | ||
875 | block_end_pfn = min(block_end_pfn, end_pfn); | 843 | block_end_pfn = min(block_end_pfn, end_pfn); |
876 | 844 | ||
877 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) | 845 | if (!pageblock_pfn_to_page(block_start_pfn, |
846 | block_end_pfn, cc->zone)) | ||
878 | continue; | 847 | continue; |
879 | 848 | ||
880 | pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, | 849 | pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, |
@@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; | |||
1103 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 1072 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
1104 | struct compact_control *cc) | 1073 | struct compact_control *cc) |
1105 | { | 1074 | { |
1106 | unsigned long low_pfn, end_pfn; | 1075 | unsigned long block_start_pfn; |
1076 | unsigned long block_end_pfn; | ||
1077 | unsigned long low_pfn; | ||
1107 | unsigned long isolate_start_pfn; | 1078 | unsigned long isolate_start_pfn; |
1108 | struct page *page; | 1079 | struct page *page; |
1109 | const isolate_mode_t isolate_mode = | 1080 | const isolate_mode_t isolate_mode = |
@@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1115 | * initialized by compact_zone() | 1086 | * initialized by compact_zone() |
1116 | */ | 1087 | */ |
1117 | low_pfn = cc->migrate_pfn; | 1088 | low_pfn = cc->migrate_pfn; |
1089 | block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); | ||
1090 | if (block_start_pfn < zone->zone_start_pfn) | ||
1091 | block_start_pfn = zone->zone_start_pfn; | ||
1118 | 1092 | ||
1119 | /* Only scan within a pageblock boundary */ | 1093 | /* Only scan within a pageblock boundary */ |
1120 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); | 1094 | block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
1121 | 1095 | ||
1122 | /* | 1096 | /* |
1123 | * Iterate over whole pageblocks until we find the first suitable. | 1097 | * Iterate over whole pageblocks until we find the first suitable. |
1124 | * Do not cross the free scanner. | 1098 | * Do not cross the free scanner. |
1125 | */ | 1099 | */ |
1126 | for (; end_pfn <= cc->free_pfn; | 1100 | for (; block_end_pfn <= cc->free_pfn; |
1127 | low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { | 1101 | low_pfn = block_end_pfn, |
1102 | block_start_pfn = block_end_pfn, | ||
1103 | block_end_pfn += pageblock_nr_pages) { | ||
1128 | 1104 | ||
1129 | /* | 1105 | /* |
1130 | * This can potentially iterate a massively long zone with | 1106 | * This can potentially iterate a massively long zone with |
@@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1135 | && compact_should_abort(cc)) | 1111 | && compact_should_abort(cc)) |
1136 | break; | 1112 | break; |
1137 | 1113 | ||
1138 | page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); | 1114 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, |
1115 | zone); | ||
1139 | if (!page) | 1116 | if (!page) |
1140 | continue; | 1117 | continue; |
1141 | 1118 | ||
@@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1154 | 1131 | ||
1155 | /* Perform the isolation */ | 1132 | /* Perform the isolation */ |
1156 | isolate_start_pfn = low_pfn; | 1133 | isolate_start_pfn = low_pfn; |
1157 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | 1134 | low_pfn = isolate_migratepages_block(cc, low_pfn, |
1158 | isolate_mode); | 1135 | block_end_pfn, isolate_mode); |
1159 | 1136 | ||
1160 | if (!low_pfn || cc->contended) { | 1137 | if (!low_pfn || cc->contended) { |
1161 | acct_isolated(zone, cc); | 1138 | acct_isolated(zone, cc); |
@@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1371 | */ | 1348 | */ |
1372 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; | 1349 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; |
1373 | cc->free_pfn = zone->compact_cached_free_pfn; | 1350 | cc->free_pfn = zone->compact_cached_free_pfn; |
1374 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { | 1351 | if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { |
1375 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); | 1352 | cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); |
1376 | zone->compact_cached_free_pfn = cc->free_pfn; | 1353 | zone->compact_cached_free_pfn = cc->free_pfn; |
1377 | } | 1354 | } |
1378 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | 1355 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { |
1379 | cc->migrate_pfn = start_pfn; | 1356 | cc->migrate_pfn = start_pfn; |
1380 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; | 1357 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1381 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 1358 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
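Several hunks above repeat the same rounding idiom: pfn & ~(pageblock_nr_pages - 1) yields the first pfn of the pageblock containing pfn, and ALIGN(pfn + 1, pageblock_nr_pages) yields the first pfn of the next pageblock. A small hedged standalone example of the arithmetic, assuming pageblock_nr_pages == 512 (2 MB pageblocks of 4 KB pages):

    /* Hedged illustration of the pageblock rounding used in compaction;
     * pageblock_nr_pages == 512 is an assumption for the example. */
    #include <stdio.h>

    #define pageblock_nr_pages 512UL
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned long pfn = 1000;   /* arbitrary pfn somewhere inside a pageblock */
        unsigned long block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
        unsigned long block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

        /* prints "pageblock [512, 1024)": the half-open block containing pfn 1000 */
        printf("pageblock [%lu, %lu)\n", block_start_pfn, block_end_pfn);
        return 0;
    }

The added clamp against cc->zone->zone_start_pfn matters when a zone itself begins in the middle of a pageblock; without it, block_start_pfn could point below the zone and pageblock_pfn_to_page() would be asked about pfns the zone does not own.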
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,75 +9,38 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/trace_events.h> | 10 | #include <linux/trace_events.h> |
11 | #include <linux/memcontrol.h> | 11 | #include <linux/memcontrol.h> |
12 | 12 | #include <trace/events/mmflags.h> | |
13 | static const struct trace_print_flags pageflag_names[] = { | 13 | #include <linux/migrate.h> |
14 | {1UL << PG_locked, "locked" }, | 14 | #include <linux/page_owner.h> |
15 | {1UL << PG_error, "error" }, | 15 | |
16 | {1UL << PG_referenced, "referenced" }, | 16 | #include "internal.h" |
17 | {1UL << PG_uptodate, "uptodate" }, | 17 | |
18 | {1UL << PG_dirty, "dirty" }, | 18 | char *migrate_reason_names[MR_TYPES] = { |
19 | {1UL << PG_lru, "lru" }, | 19 | "compaction", |
20 | {1UL << PG_active, "active" }, | 20 | "memory_failure", |
21 | {1UL << PG_slab, "slab" }, | 21 | "memory_hotplug", |
22 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | 22 | "syscall_or_cpuset", |
23 | {1UL << PG_arch_1, "arch_1" }, | 23 | "mempolicy_mbind", |
24 | {1UL << PG_reserved, "reserved" }, | 24 | "numa_misplaced", |
25 | {1UL << PG_private, "private" }, | 25 | "cma", |
26 | {1UL << PG_private_2, "private_2" }, | ||
27 | {1UL << PG_writeback, "writeback" }, | ||
28 | {1UL << PG_head, "head" }, | ||
29 | {1UL << PG_swapcache, "swapcache" }, | ||
30 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
31 | {1UL << PG_reclaim, "reclaim" }, | ||
32 | {1UL << PG_swapbacked, "swapbacked" }, | ||
33 | {1UL << PG_unevictable, "unevictable" }, | ||
34 | #ifdef CONFIG_MMU | ||
35 | {1UL << PG_mlocked, "mlocked" }, | ||
36 | #endif | ||
37 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
38 | {1UL << PG_uncached, "uncached" }, | ||
39 | #endif | ||
40 | #ifdef CONFIG_MEMORY_FAILURE | ||
41 | {1UL << PG_hwpoison, "hwpoison" }, | ||
42 | #endif | ||
43 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) | ||
44 | {1UL << PG_young, "young" }, | ||
45 | {1UL << PG_idle, "idle" }, | ||
46 | #endif | ||
47 | }; | 26 | }; |
48 | 27 | ||
49 | static void dump_flags(unsigned long flags, | 28 | const struct trace_print_flags pageflag_names[] = { |
50 | const struct trace_print_flags *names, int count) | 29 | __def_pageflag_names, |
51 | { | 30 | {0, NULL} |
52 | const char *delim = ""; | 31 | }; |
53 | unsigned long mask; | ||
54 | int i; | ||
55 | |||
56 | pr_emerg("flags: %#lx(", flags); | ||
57 | |||
58 | /* remove zone id */ | ||
59 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
60 | |||
61 | for (i = 0; i < count && flags; i++) { | ||
62 | |||
63 | mask = names[i].mask; | ||
64 | if ((flags & mask) != mask) | ||
65 | continue; | ||
66 | |||
67 | flags &= ~mask; | ||
68 | pr_cont("%s%s", delim, names[i].name); | ||
69 | delim = "|"; | ||
70 | } | ||
71 | 32 | ||
72 | /* check for left over flags */ | 33 | const struct trace_print_flags gfpflag_names[] = { |
73 | if (flags) | 34 | __def_gfpflag_names, |
74 | pr_cont("%s%#lx", delim, flags); | 35 | {0, NULL} |
36 | }; | ||
75 | 37 | ||
76 | pr_cont(")\n"); | 38 | const struct trace_print_flags vmaflag_names[] = { |
77 | } | 39 | __def_vmaflag_names, |
40 | {0, NULL} | ||
41 | }; | ||
78 | 42 | ||
79 | void dump_page_badflags(struct page *page, const char *reason, | 43 | void __dump_page(struct page *page, const char *reason) |
80 | unsigned long badflags) | ||
81 | { | 44 | { |
82 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", | 45 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", |
83 | page, atomic_read(&page->_count), page_mapcount(page), | 46 | page, atomic_read(&page->_count), page_mapcount(page), |
@@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason, | |||
85 | if (PageCompound(page)) | 48 | if (PageCompound(page)) |
86 | pr_cont(" compound_mapcount: %d", compound_mapcount(page)); | 49 | pr_cont(" compound_mapcount: %d", compound_mapcount(page)); |
87 | pr_cont("\n"); | 50 | pr_cont("\n"); |
88 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | 51 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); |
89 | dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); | 52 | |
53 | pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); | ||
54 | |||
90 | if (reason) | 55 | if (reason) |
91 | pr_alert("page dumped because: %s\n", reason); | 56 | pr_alert("page dumped because: %s\n", reason); |
92 | if (page->flags & badflags) { | 57 | |
93 | pr_alert("bad because of flags:\n"); | ||
94 | dump_flags(page->flags & badflags, | ||
95 | pageflag_names, ARRAY_SIZE(pageflag_names)); | ||
96 | } | ||
97 | #ifdef CONFIG_MEMCG | 58 | #ifdef CONFIG_MEMCG |
98 | if (page->mem_cgroup) | 59 | if (page->mem_cgroup) |
99 | pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); | 60 | pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); |
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason, | |||
102 | 63 | ||
103 | void dump_page(struct page *page, const char *reason) | 64 | void dump_page(struct page *page, const char *reason) |
104 | { | 65 | { |
105 | dump_page_badflags(page, reason, 0); | 66 | __dump_page(page, reason); |
67 | dump_page_owner(page); | ||
106 | } | 68 | } |
107 | EXPORT_SYMBOL(dump_page); | 69 | EXPORT_SYMBOL(dump_page); |
108 | 70 | ||
109 | #ifdef CONFIG_DEBUG_VM | 71 | #ifdef CONFIG_DEBUG_VM |
110 | 72 | ||
111 | static const struct trace_print_flags vmaflags_names[] = { | ||
112 | {VM_READ, "read" }, | ||
113 | {VM_WRITE, "write" }, | ||
114 | {VM_EXEC, "exec" }, | ||
115 | {VM_SHARED, "shared" }, | ||
116 | {VM_MAYREAD, "mayread" }, | ||
117 | {VM_MAYWRITE, "maywrite" }, | ||
118 | {VM_MAYEXEC, "mayexec" }, | ||
119 | {VM_MAYSHARE, "mayshare" }, | ||
120 | {VM_GROWSDOWN, "growsdown" }, | ||
121 | {VM_PFNMAP, "pfnmap" }, | ||
122 | {VM_DENYWRITE, "denywrite" }, | ||
123 | {VM_LOCKONFAULT, "lockonfault" }, | ||
124 | {VM_LOCKED, "locked" }, | ||
125 | {VM_IO, "io" }, | ||
126 | {VM_SEQ_READ, "seqread" }, | ||
127 | {VM_RAND_READ, "randread" }, | ||
128 | {VM_DONTCOPY, "dontcopy" }, | ||
129 | {VM_DONTEXPAND, "dontexpand" }, | ||
130 | {VM_ACCOUNT, "account" }, | ||
131 | {VM_NORESERVE, "noreserve" }, | ||
132 | {VM_HUGETLB, "hugetlb" }, | ||
133 | #if defined(CONFIG_X86) | ||
134 | {VM_PAT, "pat" }, | ||
135 | #elif defined(CONFIG_PPC) | ||
136 | {VM_SAO, "sao" }, | ||
137 | #elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) | ||
138 | {VM_GROWSUP, "growsup" }, | ||
139 | #elif !defined(CONFIG_MMU) | ||
140 | {VM_MAPPED_COPY, "mappedcopy" }, | ||
141 | #else | ||
142 | {VM_ARCH_1, "arch_1" }, | ||
143 | #endif | ||
144 | {VM_DONTDUMP, "dontdump" }, | ||
145 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
146 | {VM_SOFTDIRTY, "softdirty" }, | ||
147 | #endif | ||
148 | {VM_MIXEDMAP, "mixedmap" }, | ||
149 | {VM_HUGEPAGE, "hugepage" }, | ||
150 | {VM_NOHUGEPAGE, "nohugepage" }, | ||
151 | {VM_MERGEABLE, "mergeable" }, | ||
152 | }; | ||
153 | |||
154 | void dump_vma(const struct vm_area_struct *vma) | 73 | void dump_vma(const struct vm_area_struct *vma) |
155 | { | 74 | { |
156 | pr_emerg("vma %p start %p end %p\n" | 75 | pr_emerg("vma %p start %p end %p\n" |
157 | "next %p prev %p mm %p\n" | 76 | "next %p prev %p mm %p\n" |
158 | "prot %lx anon_vma %p vm_ops %p\n" | 77 | "prot %lx anon_vma %p vm_ops %p\n" |
159 | "pgoff %lx file %p private_data %p\n", | 78 | "pgoff %lx file %p private_data %p\n" |
79 | "flags: %#lx(%pGv)\n", | ||
160 | vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, | 80 | vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, |
161 | vma->vm_prev, vma->vm_mm, | 81 | vma->vm_prev, vma->vm_mm, |
162 | (unsigned long)pgprot_val(vma->vm_page_prot), | 82 | (unsigned long)pgprot_val(vma->vm_page_prot), |
163 | vma->anon_vma, vma->vm_ops, vma->vm_pgoff, | 83 | vma->anon_vma, vma->vm_ops, vma->vm_pgoff, |
164 | vma->vm_file, vma->vm_private_data); | 84 | vma->vm_file, vma->vm_private_data, |
165 | dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); | 85 | vma->vm_flags, &vma->vm_flags); |
166 | } | 86 | } |
167 | EXPORT_SYMBOL(dump_vma); | 87 | EXPORT_SYMBOL(dump_vma); |
168 | 88 | ||
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm) | |||
196 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | 116 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) |
197 | "tlb_flush_pending %d\n" | 117 | "tlb_flush_pending %d\n" |
198 | #endif | 118 | #endif |
199 | "%s", /* This is here to hold the comma */ | 119 | "def_flags: %#lx(%pGv)\n", |
200 | 120 | ||
201 | mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, | 121 | mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, |
202 | #ifdef CONFIG_MMU | 122 | #ifdef CONFIG_MMU |
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm) | |||
230 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | 150 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) |
231 | mm->tlb_flush_pending, | 151 | mm->tlb_flush_pending, |
232 | #endif | 152 | #endif |
233 | "" /* This is here to not have a comma! */ | 153 | mm->def_flags, &mm->def_flags |
234 | ); | 154 | ); |
235 | |||
236 | dump_flags(mm->def_flags, vmaflags_names, | ||
237 | ARRAY_SIZE(vmaflags_names)); | ||
238 | } | 155 | } |
239 | 156 | ||
240 | #endif /* CONFIG_DEBUG_VM */ | 157 | #endif /* CONFIG_DEBUG_VM */ |
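The dump code above drops its private flag tables and the open-coded dump_flags() loop in favour of shared __def_*_names tables and the %pGp/%pGv printk specifiers. To make the removed mechanism concrete, here is a hedged userspace re-creation of that loop; the masks and names are arbitrary examples, and in the kernel this decoding now happens inside vsprintf:

    /* Hedged userspace re-creation of the dump_flags() loop removed above,
     * showing how a trace_print_flags-style {mask, name} table decodes a
     * flags word.  Masks and names are arbitrary stand-ins. */
    #include <stdio.h>

    struct trace_print_flags {
        unsigned long mask;
        const char *name;
    };

    static const struct trace_print_flags names[] = {
        { 1UL << 0, "locked" },
        { 1UL << 4, "dirty"  },
        { 1UL << 5, "lru"    },
        { 0, NULL }                         /* sentinel, like the new tables */
    };

    static void dump_flags(unsigned long flags)
    {
        const char *delim = "";
        int i;

        printf("flags: %#lx(", flags);
        for (i = 0; names[i].name && flags; i++) {
            if ((flags & names[i].mask) != names[i].mask)
                continue;
            flags &= ~names[i].mask;
            printf("%s%s", delim, names[i].name);
            delim = "|";
        }
        if (flags)                          /* bits with no name in the table */
            printf("%s%#lx", delim, flags);
        printf(")\n");
    }

    int main(void)
    {
        /* prints: flags: 0x100021(locked|lru|0x100000) */
        dump_flags((1UL << 0) | (1UL << 5) | (1UL << 20));
        return 0;
    }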
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@ | |||
1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/mm.h> | ||
4 | #include "slab.h" | ||
3 | 5 | ||
4 | static struct { | 6 | static struct { |
5 | struct fault_attr attr; | 7 | struct fault_attr attr; |
@@ -11,18 +13,22 @@ static struct { | |||
11 | .cache_filter = false, | 13 | .cache_filter = false, |
12 | }; | 14 | }; |
13 | 15 | ||
14 | bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) | 16 | bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) |
15 | { | 17 | { |
18 | /* No fault-injection for bootstrap cache */ | ||
19 | if (unlikely(s == kmem_cache)) | ||
20 | return false; | ||
21 | |||
16 | if (gfpflags & __GFP_NOFAIL) | 22 | if (gfpflags & __GFP_NOFAIL) |
17 | return false; | 23 | return false; |
18 | 24 | ||
19 | if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) | 25 | if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) |
20 | return false; | 26 | return false; |
21 | 27 | ||
22 | if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) | 28 | if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) |
23 | return false; | 29 | return false; |
24 | 30 | ||
25 | return should_fail(&failslab.attr, size); | 31 | return should_fail(&failslab.attr, s->object_size); |
26 | } | 32 | } |
27 | 33 | ||
28 | static int __init setup_failslab(char *str) | 34 | static int __init setup_failslab(char *str) |
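With the new signature, should_failslab() receives the kmem_cache itself, so it can consult s->flags and s->object_size directly and exempt the bootstrap cache. The userspace model below mirrors that decision order; every type, value and the always-fail should_fail() stub are simplified stand-ins, not kernel code:

    /* Hedged model of the new should_failslab(s, gfpflags) contract:
     * the cache pointer gives access to object_size and per-cache flags,
     * and the bootstrap cache is exempt.  All definitions are stand-ins. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define __GFP_NOFAIL  0x1u
    #define SLAB_FAILSLAB 0x2u

    struct kmem_cache {
        size_t object_size;
        unsigned long flags;
    };

    static struct kmem_cache boot_cache = { 192, 0 };   /* stands in for the bootstrap cache */
    static struct kmem_cache *kmem_cache = &boot_cache;

    static struct {
        bool cache_filter;
    } failslab = { .cache_filter = true };

    static bool should_fail(size_t size)
    {
        return size > 0;                    /* "always fail" for the demo */
    }

    static bool should_failslab(struct kmem_cache *s, unsigned int gfpflags)
    {
        if (s == kmem_cache)                /* never inject into the bootstrap cache */
            return false;
        if (gfpflags & __GFP_NOFAIL)
            return false;
        if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
            return false;
        return should_fail(s->object_size);
    }

    int main(void)
    {
        struct kmem_cache opted_in = { 256, SLAB_FAILSLAB };
        struct kmem_cache ordinary = { 64, 0 };

        printf("%d %d %d\n",
               should_failslab(kmem_cache, 0),      /* 0: bootstrap cache exempt */
               should_failslab(&ordinary, 0),       /* 0: filtered, no SLAB_FAILSLAB */
               should_failslab(&opted_in, 0));      /* 1: eligible for injection */
        return 0;
    }

Running it prints 0 0 1: while cache_filter is set, only a cache that opted in with SLAB_FAILSLAB is eligible for fault injection.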
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d83de7..61b441b191ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@ | |||
101 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 101 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
102 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) | 102 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
103 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 103 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
104 | * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) | 104 | * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) |
105 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) | 105 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
106 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 106 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
107 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 107 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, | |||
176 | /* | 176 | /* |
177 | * Delete a page from the page cache and free it. Caller has to make | 177 | * Delete a page from the page cache and free it. Caller has to make |
178 | * sure the page is locked and that nobody else uses it - or that usage | 178 | * sure the page is locked and that nobody else uses it - or that usage |
179 | * is safe. The caller must hold the mapping's tree_lock and | 179 | * is safe. The caller must hold the mapping's tree_lock. |
180 | * mem_cgroup_begin_page_stat(). | ||
181 | */ | 180 | */ |
182 | void __delete_from_page_cache(struct page *page, void *shadow, | 181 | void __delete_from_page_cache(struct page *page, void *shadow) |
183 | struct mem_cgroup *memcg) | ||
184 | { | 182 | { |
185 | struct address_space *mapping = page->mapping; | 183 | struct address_space *mapping = page->mapping; |
186 | 184 | ||
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, | |||
239 | * anyway will be cleared before returning page into buddy allocator. | 237 | * anyway will be cleared before returning page into buddy allocator. |
240 | */ | 238 | */ |
241 | if (WARN_ON_ONCE(PageDirty(page))) | 239 | if (WARN_ON_ONCE(PageDirty(page))) |
242 | account_page_cleaned(page, mapping, memcg, | 240 | account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); |
243 | inode_to_wb(mapping->host)); | ||
244 | } | 241 | } |
245 | 242 | ||
246 | /** | 243 | /** |
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, | |||
254 | void delete_from_page_cache(struct page *page) | 251 | void delete_from_page_cache(struct page *page) |
255 | { | 252 | { |
256 | struct address_space *mapping = page->mapping; | 253 | struct address_space *mapping = page->mapping; |
257 | struct mem_cgroup *memcg; | ||
258 | unsigned long flags; | 254 | unsigned long flags; |
259 | 255 | ||
260 | void (*freepage)(struct page *); | 256 | void (*freepage)(struct page *); |
@@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page) | |||
263 | 259 | ||
264 | freepage = mapping->a_ops->freepage; | 260 | freepage = mapping->a_ops->freepage; |
265 | 261 | ||
266 | memcg = mem_cgroup_begin_page_stat(page); | ||
267 | spin_lock_irqsave(&mapping->tree_lock, flags); | 262 | spin_lock_irqsave(&mapping->tree_lock, flags); |
268 | __delete_from_page_cache(page, NULL, memcg); | 263 | __delete_from_page_cache(page, NULL); |
269 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 264 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
270 | mem_cgroup_end_page_stat(memcg); | ||
271 | 265 | ||
272 | if (freepage) | 266 | if (freepage) |
273 | freepage(page); | 267 | freepage(page); |
@@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
551 | if (!error) { | 545 | if (!error) { |
552 | struct address_space *mapping = old->mapping; | 546 | struct address_space *mapping = old->mapping; |
553 | void (*freepage)(struct page *); | 547 | void (*freepage)(struct page *); |
554 | struct mem_cgroup *memcg; | ||
555 | unsigned long flags; | 548 | unsigned long flags; |
556 | 549 | ||
557 | pgoff_t offset = old->index; | 550 | pgoff_t offset = old->index; |
@@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
561 | new->mapping = mapping; | 554 | new->mapping = mapping; |
562 | new->index = offset; | 555 | new->index = offset; |
563 | 556 | ||
564 | memcg = mem_cgroup_begin_page_stat(old); | ||
565 | spin_lock_irqsave(&mapping->tree_lock, flags); | 557 | spin_lock_irqsave(&mapping->tree_lock, flags); |
566 | __delete_from_page_cache(old, NULL, memcg); | 558 | __delete_from_page_cache(old, NULL); |
567 | error = radix_tree_insert(&mapping->page_tree, offset, new); | 559 | error = radix_tree_insert(&mapping->page_tree, offset, new); |
568 | BUG_ON(error); | 560 | BUG_ON(error); |
569 | mapping->nrpages++; | 561 | mapping->nrpages++; |
@@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
576 | if (PageSwapBacked(new)) | 568 | if (PageSwapBacked(new)) |
577 | __inc_zone_page_state(new, NR_SHMEM); | 569 | __inc_zone_page_state(new, NR_SHMEM); |
578 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 570 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
579 | mem_cgroup_end_page_stat(memcg); | 571 | mem_cgroup_migrate(old, new); |
580 | mem_cgroup_replace_page(old, new); | ||
581 | radix_tree_preload_end(); | 572 | radix_tree_preload_end(); |
582 | if (freepage) | 573 | if (freepage) |
583 | freepage(old); | 574 | freepage(old); |
@@ -1668,6 +1659,15 @@ find_page: | |||
1668 | index, last_index - index); | 1659 | index, last_index - index); |
1669 | } | 1660 | } |
1670 | if (!PageUptodate(page)) { | 1661 | if (!PageUptodate(page)) { |
1662 | /* | ||
1663 | * See comment in do_read_cache_page on why | ||
1664 | * wait_on_page_locked is used to avoid unnecessarily | ||
1665 | * serialisations and why it's safe. | ||
1666 | */ | ||
1667 | wait_on_page_locked_killable(page); | ||
1668 | if (PageUptodate(page)) | ||
1669 | goto page_ok; | ||
1670 | |||
1671 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || | 1671 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
1672 | !mapping->a_ops->is_partially_uptodate) | 1672 | !mapping->a_ops->is_partially_uptodate) |
1673 | goto page_not_up_to_date; | 1673 | goto page_not_up_to_date; |
@@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page) | |||
2303 | return page; | 2303 | return page; |
2304 | } | 2304 | } |
2305 | 2305 | ||
2306 | static struct page *__read_cache_page(struct address_space *mapping, | 2306 | static struct page *do_read_cache_page(struct address_space *mapping, |
2307 | pgoff_t index, | 2307 | pgoff_t index, |
2308 | int (*filler)(void *, struct page *), | 2308 | int (*filler)(void *, struct page *), |
2309 | void *data, | 2309 | void *data, |
@@ -2325,53 +2325,74 @@ repeat: | |||
2325 | /* Presumably ENOMEM for radix tree node */ | 2325 | /* Presumably ENOMEM for radix tree node */ |
2326 | return ERR_PTR(err); | 2326 | return ERR_PTR(err); |
2327 | } | 2327 | } |
2328 | |||
2329 | filler: | ||
2328 | err = filler(data, page); | 2330 | err = filler(data, page); |
2329 | if (err < 0) { | 2331 | if (err < 0) { |
2330 | page_cache_release(page); | 2332 | page_cache_release(page); |
2331 | page = ERR_PTR(err); | 2333 | return ERR_PTR(err); |
2332 | } else { | ||
2333 | page = wait_on_page_read(page); | ||
2334 | } | 2334 | } |
2335 | } | ||
2336 | return page; | ||
2337 | } | ||
2338 | |||
2339 | static struct page *do_read_cache_page(struct address_space *mapping, | ||
2340 | pgoff_t index, | ||
2341 | int (*filler)(void *, struct page *), | ||
2342 | void *data, | ||
2343 | gfp_t gfp) | ||
2344 | 2335 | ||
2345 | { | 2336 | page = wait_on_page_read(page); |
2346 | struct page *page; | 2337 | if (IS_ERR(page)) |
2347 | int err; | 2338 | return page; |
2339 | goto out; | ||
2340 | } | ||
2341 | if (PageUptodate(page)) | ||
2342 | goto out; | ||
2348 | 2343 | ||
2349 | retry: | 2344 | /* |
2350 | page = __read_cache_page(mapping, index, filler, data, gfp); | 2345 | * Page is not up to date and may be locked due one of the following |
2351 | if (IS_ERR(page)) | 2346 | * case a: Page is being filled and the page lock is held |
2352 | return page; | 2347 | * case b: Read/write error clearing the page uptodate status |
2348 | * case c: Truncation in progress (page locked) | ||
2349 | * case d: Reclaim in progress | ||
2350 | * | ||
2351 | * Case a, the page will be up to date when the page is unlocked. | ||
2352 | * There is no need to serialise on the page lock here as the page | ||
2353 | * is pinned so the lock gives no additional protection. Even if the | ||
2354 | * the page is truncated, the data is still valid if PageUptodate as | ||
2355 | * it's a race vs truncate race. | ||
2356 | * Case b, the page will not be up to date | ||
2357 | * Case c, the page may be truncated but in itself, the data may still | ||
2358 | * be valid after IO completes as it's a read vs truncate race. The | ||
2359 | * operation must restart if the page is not uptodate on unlock but | ||
2360 | * otherwise serialising on page lock to stabilise the mapping gives | ||
2361 | * no additional guarantees to the caller as the page lock is | ||
2362 | * released before return. | ||
2363 | * Case d, similar to truncation. If reclaim holds the page lock, it | ||
2364 | * will be a race with remove_mapping that determines if the mapping | ||
2365 | * is valid on unlock but otherwise the data is valid and there is | ||
2366 | * no need to serialise with page lock. | ||
2367 | * | ||
2368 | * As the page lock gives no additional guarantee, we optimistically | ||
2369 | * wait on the page to be unlocked and check if it's up to date and | ||
2370 | * use the page if it is. Otherwise, the page lock is required to | ||
2371 | * distinguish between the different cases. The motivation is that we | ||
2372 | * avoid spurious serialisations and wakeups when multiple processes | ||
2373 | * wait on the same page for IO to complete. | ||
2374 | */ | ||
2375 | wait_on_page_locked(page); | ||
2353 | if (PageUptodate(page)) | 2376 | if (PageUptodate(page)) |
2354 | goto out; | 2377 | goto out; |
2355 | 2378 | ||
2379 | /* Distinguish between all the cases under the safety of the lock */ | ||
2356 | lock_page(page); | 2380 | lock_page(page); |
2381 | |||
2382 | /* Case c or d, restart the operation */ | ||
2357 | if (!page->mapping) { | 2383 | if (!page->mapping) { |
2358 | unlock_page(page); | 2384 | unlock_page(page); |
2359 | page_cache_release(page); | 2385 | page_cache_release(page); |
2360 | goto retry; | 2386 | goto repeat; |
2361 | } | 2387 | } |
2388 | |||
2389 | /* Someone else locked and filled the page in a very small window */ | ||
2362 | if (PageUptodate(page)) { | 2390 | if (PageUptodate(page)) { |
2363 | unlock_page(page); | 2391 | unlock_page(page); |
2364 | goto out; | 2392 | goto out; |
2365 | } | 2393 | } |
2366 | err = filler(data, page); | 2394 | goto filler; |
2367 | if (err < 0) { | 2395 | |
2368 | page_cache_release(page); | ||
2369 | return ERR_PTR(err); | ||
2370 | } else { | ||
2371 | page = wait_on_page_read(page); | ||
2372 | if (IS_ERR(page)) | ||
2373 | return page; | ||
2374 | } | ||
2375 | out: | 2396 | out: |
2376 | mark_page_accessed(page); | 2397 | mark_page_accessed(page); |
2377 | return page; | 2398 | return page; |
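The long comment added to do_read_cache_page() argues that a reader usually only needs to observe that the fill completed: it can wait for the page to become unlocked and re-check PageUptodate without taking the page lock itself, falling back to lock_page() only to distinguish a read error from truncation or reclaim. The following is a loose, hedged userspace analogue of that control flow; the mutex/condvar pair stands in for the page lock and its waitqueue and is not how the kernel implements it:

    /* Hedged analogue of "wait for unlock, re-check uptodate, only then
     * lock": a reader that sees a completed fill never serialises on the
     * lock.  Build with -pthread; all primitives are stand-ins. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t page_unlocked = PTHREAD_COND_INITIALIZER;
    static bool page_locked = true;             /* the filler owns the page initially */
    static atomic_bool page_uptodate;

    static void wait_on_page_locked(void)
    {
        pthread_mutex_lock(&m);
        while (page_locked)
            pthread_cond_wait(&page_unlocked, &m);
        pthread_mutex_unlock(&m);               /* the "page lock" itself is never taken */
    }

    static void *filler_thread(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&m);
        atomic_store(&page_uptodate, true);     /* the read I/O succeeded */
        page_locked = false;                    /* unlock_page() */
        pthread_cond_broadcast(&page_unlocked);
        pthread_mutex_unlock(&m);
        return NULL;
    }

    int main(void)
    {
        pthread_t filler;

        pthread_create(&filler, NULL, filler_thread, NULL);

        wait_on_page_locked();
        if (atomic_load(&page_uptodate))
            puts("used the page without ever locking it");
        else
            puts("still ambiguous: would lock_page() to tell an error from truncation");

        pthread_join(filler, NULL);
        return 0;
    }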
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e10a4fee88d2..1ea21e203a70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) | |||
3220 | } | 3220 | } |
3221 | } | 3221 | } |
3222 | 3222 | ||
3223 | static int __split_huge_page_tail(struct page *head, int tail, | 3223 | static void __split_huge_page_tail(struct page *head, int tail, |
3224 | struct lruvec *lruvec, struct list_head *list) | 3224 | struct lruvec *lruvec, struct list_head *list) |
3225 | { | 3225 | { |
3226 | int mapcount; | ||
3227 | struct page *page_tail = head + tail; | 3226 | struct page *page_tail = head + tail; |
3228 | 3227 | ||
3229 | mapcount = atomic_read(&page_tail->_mapcount) + 1; | 3228 | VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); |
3230 | VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); | 3229 | VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); |
3231 | 3230 | ||
3232 | /* | 3231 | /* |
3233 | * tail_page->_count is zero and not changing from under us. But | 3232 | * tail_page->_count is zero and not changing from under us. But |
3234 | * get_page_unless_zero() may be running from under us on the | 3233 | * get_page_unless_zero() may be running from under us on the |
3235 | * tail_page. If we used atomic_set() below instead of atomic_add(), we | 3234 | * tail_page. If we used atomic_set() below instead of atomic_inc(), we |
3236 | * would then run atomic_set() concurrently with | 3235 | * would then run atomic_set() concurrently with |
3237 | * get_page_unless_zero(), and atomic_set() is implemented in C not | 3236 | * get_page_unless_zero(), and atomic_set() is implemented in C not |
3238 | * using locked ops. spin_unlock on x86 sometime uses locked ops | 3237 | * using locked ops. spin_unlock on x86 sometime uses locked ops |
3239 | * because of PPro errata 66, 92, so unless somebody can guarantee | 3238 | * because of PPro errata 66, 92, so unless somebody can guarantee |
3240 | * atomic_set() here would be safe on all archs (and not only on x86), | 3239 | * atomic_set() here would be safe on all archs (and not only on x86), |
3241 | * it's safer to use atomic_add(). | 3240 | * it's safer to use atomic_inc(). |
3242 | */ | 3241 | */ |
3243 | atomic_add(mapcount + 1, &page_tail->_count); | 3242 | atomic_inc(&page_tail->_count); |
3244 | |||
3245 | 3243 | ||
3246 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 3244 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
3247 | page_tail->flags |= (head->flags & | 3245 | page_tail->flags |= (head->flags & |
@@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail, | |||
3275 | page_tail->index = head->index + tail; | 3273 | page_tail->index = head->index + tail; |
3276 | page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); | 3274 | page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); |
3277 | lru_add_page_tail(head, page_tail, lruvec, list); | 3275 | lru_add_page_tail(head, page_tail, lruvec, list); |
3278 | |||
3279 | return mapcount; | ||
3280 | } | 3276 | } |
3281 | 3277 | ||
3282 | static void __split_huge_page(struct page *page, struct list_head *list) | 3278 | static void __split_huge_page(struct page *page, struct list_head *list) |
@@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) | |||
3284 | struct page *head = compound_head(page); | 3280 | struct page *head = compound_head(page); |
3285 | struct zone *zone = page_zone(head); | 3281 | struct zone *zone = page_zone(head); |
3286 | struct lruvec *lruvec; | 3282 | struct lruvec *lruvec; |
3287 | int i, tail_mapcount; | 3283 | int i; |
3288 | 3284 | ||
3289 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 3285 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
3290 | spin_lock_irq(&zone->lru_lock); | 3286 | spin_lock_irq(&zone->lru_lock); |
@@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) | |||
3293 | /* complete memcg works before add pages to LRU */ | 3289 | /* complete memcg works before add pages to LRU */ |
3294 | mem_cgroup_split_huge_fixup(head); | 3290 | mem_cgroup_split_huge_fixup(head); |
3295 | 3291 | ||
3296 | tail_mapcount = 0; | ||
3297 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) | 3292 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) |
3298 | tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); | 3293 | __split_huge_page_tail(head, i, lruvec, list); |
3299 | atomic_sub(tail_mapcount, &head->_count); | ||
3300 | 3294 | ||
3301 | ClearPageCompound(head); | 3295 | ClearPageCompound(head); |
3302 | spin_unlock_irq(&zone->lru_lock); | 3296 | spin_unlock_irq(&zone->lru_lock); |
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..ad9400d759c8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
17 | #include <linux/tracepoint-defs.h> | ||
17 | 18 | ||
18 | /* | 19 | /* |
19 | * The set of flags that only affect watermark checking and reclaim | 20 | * The set of flags that only affect watermark checking and reclaim |
@@ -131,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
131 | return page_idx ^ (1 << order); | 132 | return page_idx ^ (1 << order); |
132 | } | 133 | } |
133 | 134 | ||
135 | extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, | ||
136 | unsigned long end_pfn, struct zone *zone); | ||
137 | |||
138 | static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, | ||
139 | unsigned long end_pfn, struct zone *zone) | ||
140 | { | ||
141 | if (zone->contiguous) | ||
142 | return pfn_to_page(start_pfn); | ||
143 | |||
144 | return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); | ||
145 | } | ||
146 | |||
134 | extern int __isolate_free_page(struct page *page, unsigned int order); | 147 | extern int __isolate_free_page(struct page *page, unsigned int order); |
135 | extern void __free_pages_bootmem(struct page *page, unsigned long pfn, | 148 | extern void __free_pages_bootmem(struct page *page, unsigned long pfn, |
136 | unsigned int order); | 149 | unsigned int order); |
@@ -466,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void) | |||
466 | } | 479 | } |
467 | 480 | ||
468 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ | 481 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ |
482 | |||
483 | extern const struct trace_print_flags pageflag_names[]; | ||
484 | extern const struct trace_print_flags vmaflag_names[]; | ||
485 | extern const struct trace_print_flags gfpflag_names[]; | ||
486 | |||
469 | #endif /* __MM_INTERNAL_H */ | 487 | #endif /* __MM_INTERNAL_H */ |
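The new inline wrapper keeps a fast path for zones known to be contiguous, calling the out-of-line __pageblock_pfn_to_page() (the function moved out of compaction.c above) only when holes or foreign pageblocks are possible. A hedged toy model of the effect, with struct zone and the validation reduced to stand-ins:

    /* Hedged model of the zone->contiguous fast path: once a zone is known
     * to have no holes or foreign pageblocks, per-block validation can be
     * skipped entirely during a scan.  All definitions are stand-ins. */
    #include <stdbool.h>
    #include <stdio.h>

    struct zone { bool contiguous; };

    static unsigned long slow_checks;

    static bool pageblock_valid_slow(unsigned long start_pfn, unsigned long end_pfn)
    {
        slow_checks++;                      /* pfn_valid()/zone-id checks would live here */
        return start_pfn < end_pfn;
    }

    static bool pageblock_valid(unsigned long start_pfn, unsigned long end_pfn,
                                const struct zone *zone)
    {
        if (zone->contiguous)
            return true;                    /* fast path added by this series */
        return pageblock_valid_slow(start_pfn, end_pfn);
    }

    int main(void)
    {
        struct zone z = { .contiguous = true };
        unsigned long pfn, blocks = 0;

        for (pfn = 0; pfn < (1UL << 20); pfn += 512)    /* ~4 GB worth of 2 MB blocks */
            blocks += pageblock_valid(pfn, pfn + 512, &z);

        /* prints "2048 pageblocks scanned, 0 slow validations" */
        printf("%lu pageblocks scanned, %lu slow validations\n", blocks, slow_checks);
        return 0;
    }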
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order) | |||
60 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | 60 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, |
61 | size_t size) | 61 | size_t size) |
62 | { | 62 | { |
63 | if (unlikely(!object)) /* Skip object if allocation failed */ | ||
64 | return; | ||
65 | |||
63 | /* | 66 | /* |
64 | * Has already been memset(), which initializes the shadow for us | 67 | * Has already been memset(), which initializes the shadow for us |
65 | * as well. | 68 | * as well. |
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | |||
555 | } | 555 | } |
556 | pr_info("Injecting memory failure for page %#lx at %#lx\n", | 556 | pr_info("Injecting memory failure for page %#lx at %#lx\n", |
557 | page_to_pfn(p), start); | 557 | page_to_pfn(p), start); |
558 | /* Ignore return value for now */ | 558 | ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
559 | memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); | 559 | if (ret) |
560 | return ret; | ||
560 | } | 561 | } |
561 | return 0; | 562 | return 0; |
562 | } | 563 | } |
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior) | |||
638 | * some pages ahead. | 639 | * some pages ahead. |
639 | * MADV_DONTNEED - the application is finished with the given range, | 640 | * MADV_DONTNEED - the application is finished with the given range, |
640 | * so the kernel can free resources associated with it. | 641 | * so the kernel can free resources associated with it. |
642 | * MADV_FREE - the application marks pages in the given range as lazy free, | ||
643 | * where actual purges are postponed until memory pressure happens. | ||
641 | * MADV_REMOVE - the application wants to free up the given range of | 644 | * MADV_REMOVE - the application wants to free up the given range of |
642 | * pages and associated backing store. | 645 | * pages and associated backing store. |
643 | * MADV_DONTFORK - omit this area from child's address space when forking: | 646 | * MADV_DONTFORK - omit this area from child's address space when forking: |
644 | * typically, to avoid COWing pages pinned by get_user_pages(). | 647 | * typically, to avoid COWing pages pinned by get_user_pages(). |
645 | * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. | 648 | * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. |
649 | * MADV_HWPOISON - trigger memory error handler as if the given memory range | ||
650 | * were corrupted by unrecoverable hardware memory failure. | ||
651 | * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. | ||
646 | * MADV_MERGEABLE - the application recommends that KSM try to merge pages in | 652 | * MADV_MERGEABLE - the application recommends that KSM try to merge pages in |
647 | * this area with pages of identical content from other such areas. | 653 | * this area with pages of identical content from other such areas. |
648 | * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. | 654 | * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. |
655 | * MADV_HUGEPAGE - the application wants to back the given range by transparent | ||
656 | * huge pages in the future. Existing pages might be coalesced and | ||
657 | * new pages might be allocated as THP. | ||
658 | * MADV_NOHUGEPAGE - mark the given range as not worth being backed by | ||
659 | * transparent huge pages so the existing pages will not be | ||
660 | * coalesced into THP and new pages will not be allocated as THP. | ||
661 | * MADV_DONTDUMP - the application wants to prevent pages in the given range | ||
662 | * from being included in its core dump. | ||
663 | * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. | ||
649 | * | 664 | * |
650 | * return values: | 665 | * return values: |
651 | * zero - success | 666 | * zero - success |
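The expanded madvise() documentation above covers MADV_FREE, the hardware-poison hints and the THP/coredump hints. For reference, a hedged userspace example of the MADV_FREE behaviour it describes; it needs a v4.5+ kernel, and the fallback define is only for older userspace headers:

    /* Hedged illustration of MADV_FREE: the range stays mapped, but the
     * kernel may discard the contents lazily under memory pressure; a
     * later write to a page cancels its lazy free. */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MADV_FREE
    #define MADV_FREE 8                     /* value from the Linux uapi headers */
    #endif

    int main(void)
    {
        size_t len = 16 * 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        memset(buf, 0xab, len);             /* dirty the pages */

        if (madvise(buf, len, MADV_FREE) != 0)
            perror("madvise(MADV_FREE)");   /* e.g. a pre-4.5 kernel */

        buf[0] = 1;                         /* writing to a page cancels its lazy free */

        munmap(buf, len);
        return 0;
    }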
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..fc7824fa1b42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, | |||
612 | int nid, | 612 | int nid, |
613 | unsigned long flags) | 613 | unsigned long flags) |
614 | { | 614 | { |
615 | struct memblock_type *type = &memblock.memory; | ||
616 | |||
617 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", | 615 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", |
618 | (unsigned long long)base, | 616 | (unsigned long long)base, |
619 | (unsigned long long)base + size - 1, | 617 | (unsigned long long)base + size - 1, |
620 | flags, (void *)_RET_IP_); | 618 | flags, (void *)_RET_IP_); |
621 | 619 | ||
622 | return memblock_add_range(type, base, size, nid, flags); | 620 | return memblock_add_range(&memblock.memory, base, size, nid, flags); |
623 | } | 621 | } |
624 | 622 | ||
625 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 623 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
@@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, | |||
740 | int nid, | 738 | int nid, |
741 | unsigned long flags) | 739 | unsigned long flags) |
742 | { | 740 | { |
743 | struct memblock_type *type = &memblock.reserved; | ||
744 | |||
745 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", | 741 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
746 | (unsigned long long)base, | 742 | (unsigned long long)base, |
747 | (unsigned long long)base + size - 1, | 743 | (unsigned long long)base + size - 1, |
748 | flags, (void *)_RET_IP_); | 744 | flags, (void *)_RET_IP_); |
749 | 745 | ||
750 | return memblock_add_range(type, base, size, nid, flags); | 746 | return memblock_add_range(&memblock.reserved, base, size, nid, flags); |
751 | } | 747 | } |
752 | 748 | ||
753 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 749 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..42882c1e7fce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | |||
268 | return (memcg == root_mem_cgroup); | 268 | return (memcg == root_mem_cgroup); |
269 | } | 269 | } |
270 | 270 | ||
271 | /* | ||
272 | * We restrict the id in the range of [1, 65535], so it can fit into | ||
273 | * an unsigned short. | ||
274 | */ | ||
275 | #define MEM_CGROUP_ID_MAX USHRT_MAX | ||
276 | |||
277 | static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | ||
278 | { | ||
279 | return memcg->css.id; | ||
280 | } | ||
281 | |||
282 | /* | ||
283 | * A helper function to get mem_cgroup from ID. must be called under | ||
284 | * rcu_read_lock(). The caller is responsible for calling | ||
285 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
286 | * refcnt from swap can be called against removed memcg.) | ||
287 | */ | ||
288 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | ||
289 | { | ||
290 | struct cgroup_subsys_state *css; | ||
291 | |||
292 | css = css_from_id(id, &memory_cgrp_subsys); | ||
293 | return mem_cgroup_from_css(css); | ||
294 | } | ||
295 | |||
296 | #ifndef CONFIG_SLOB | 271 | #ifndef CONFIG_SLOB |
297 | /* | 272 | /* |
298 | * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. | 273 | * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. |
@@ -1709,19 +1684,13 @@ cleanup: | |||
1709 | } | 1684 | } |
1710 | 1685 | ||
1711 | /** | 1686 | /** |
1712 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 1687 | * lock_page_memcg - lock a page->mem_cgroup binding |
1713 | * @page: page that is going to change accounted state | 1688 | * @page: the page |
1714 | * | ||
1715 | * This function must mark the beginning of an accounted page state | ||
1716 | * change to prevent double accounting when the page is concurrently | ||
1717 | * being moved to another memcg: | ||
1718 | * | 1689 | * |
1719 | * memcg = mem_cgroup_begin_page_stat(page); | 1690 | * This function protects unlocked LRU pages from being moved to |
1720 | * if (TestClearPageState(page)) | 1691 | * another cgroup and stabilizes their page->mem_cgroup binding. |
1721 | * mem_cgroup_update_page_stat(memcg, state, -1); | ||
1722 | * mem_cgroup_end_page_stat(memcg); | ||
1723 | */ | 1692 | */ |
1724 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) | 1693 | void lock_page_memcg(struct page *page) |
1725 | { | 1694 | { |
1726 | struct mem_cgroup *memcg; | 1695 | struct mem_cgroup *memcg; |
1727 | unsigned long flags; | 1696 | unsigned long flags; |
@@ -1730,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) | |||
1730 | * The RCU lock is held throughout the transaction. The fast | 1699 | * The RCU lock is held throughout the transaction. The fast |
1731 | * path can get away without acquiring the memcg->move_lock | 1700 | * path can get away without acquiring the memcg->move_lock |
1732 | * because page moving starts with an RCU grace period. | 1701 | * because page moving starts with an RCU grace period. |
1733 | * | ||
1734 | * The RCU lock also protects the memcg from being freed when | ||
1735 | * the page state that is going to change is the only thing | ||
1736 | * preventing the page from being uncharged. | ||
1737 | * E.g. end-writeback clearing PageWriteback(), which allows | ||
1738 | * migration to go ahead and uncharge the page before the | ||
1739 | * account transaction might be complete. | ||
1740 | */ | 1702 | */ |
1741 | rcu_read_lock(); | 1703 | rcu_read_lock(); |
1742 | 1704 | ||
1743 | if (mem_cgroup_disabled()) | 1705 | if (mem_cgroup_disabled()) |
1744 | return NULL; | 1706 | return; |
1745 | again: | 1707 | again: |
1746 | memcg = page->mem_cgroup; | 1708 | memcg = page->mem_cgroup; |
1747 | if (unlikely(!memcg)) | 1709 | if (unlikely(!memcg)) |
1748 | return NULL; | 1710 | return; |
1749 | 1711 | ||
1750 | if (atomic_read(&memcg->moving_account) <= 0) | 1712 | if (atomic_read(&memcg->moving_account) <= 0) |
1751 | return memcg; | 1713 | return; |
1752 | 1714 | ||
1753 | spin_lock_irqsave(&memcg->move_lock, flags); | 1715 | spin_lock_irqsave(&memcg->move_lock, flags); |
1754 | if (memcg != page->mem_cgroup) { | 1716 | if (memcg != page->mem_cgroup) { |
@@ -1759,21 +1721,23 @@ again: | |||
1759 | /* | 1721 | /* |
1760 | * When charge migration first begins, we can have locked and | 1722 | * When charge migration first begins, we can have locked and |
1761 | * unlocked page stat updates happening concurrently. Track | 1723 | * unlocked page stat updates happening concurrently. Track |
1762 | * the task who has the lock for mem_cgroup_end_page_stat(). | 1724 | * the task who has the lock for unlock_page_memcg(). |
1763 | */ | 1725 | */ |
1764 | memcg->move_lock_task = current; | 1726 | memcg->move_lock_task = current; |
1765 | memcg->move_lock_flags = flags; | 1727 | memcg->move_lock_flags = flags; |
1766 | 1728 | ||
1767 | return memcg; | 1729 | return; |
1768 | } | 1730 | } |
1769 | EXPORT_SYMBOL(mem_cgroup_begin_page_stat); | 1731 | EXPORT_SYMBOL(lock_page_memcg); |
1770 | 1732 | ||
1771 | /** | 1733 | /** |
1772 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 1734 | * unlock_page_memcg - unlock a page->mem_cgroup binding |
1773 | * @memcg: the memcg that was accounted against | 1735 | * @page: the page |
1774 | */ | 1736 | */ |
1775 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) | 1737 | void unlock_page_memcg(struct page *page) |
1776 | { | 1738 | { |
1739 | struct mem_cgroup *memcg = page->mem_cgroup; | ||
1740 | |||
1777 | if (memcg && memcg->move_lock_task == current) { | 1741 | if (memcg && memcg->move_lock_task == current) { |
1778 | unsigned long flags = memcg->move_lock_flags; | 1742 | unsigned long flags = memcg->move_lock_flags; |
1779 | 1743 | ||
@@ -1785,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) | |||
1785 | 1749 | ||
1786 | rcu_read_unlock(); | 1750 | rcu_read_unlock(); |
1787 | } | 1751 | } |
1788 | EXPORT_SYMBOL(mem_cgroup_end_page_stat); | 1752 | EXPORT_SYMBOL(unlock_page_memcg); |
1789 | 1753 | ||
1790 | /* | 1754 | /* |
1791 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1755 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
@@ -4488,7 +4452,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
4488 | VM_BUG_ON(compound && !PageTransHuge(page)); | 4452 | VM_BUG_ON(compound && !PageTransHuge(page)); |
4489 | 4453 | ||
4490 | /* | 4454 | /* |
4491 | * Prevent mem_cgroup_replace_page() from looking at | 4455 | * Prevent mem_cgroup_migrate() from looking at |
4492 | * page->mem_cgroup of its source page while we change it. | 4456 | * page->mem_cgroup of its source page while we change it. |
4493 | */ | 4457 | */ |
4494 | ret = -EBUSY; | 4458 | ret = -EBUSY; |
@@ -4923,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4923 | 4887 | ||
4924 | lru_add_drain_all(); | 4888 | lru_add_drain_all(); |
4925 | /* | 4889 | /* |
4926 | * Signal mem_cgroup_begin_page_stat() to take the memcg's | 4890 | * Signal lock_page_memcg() to take the memcg's move_lock |
4927 | * move_lock while we're moving its pages to another memcg. | 4891 | * while we're moving its pages to another memcg. Then wait |
4928 | * Then wait for already started RCU-only updates to finish. | 4892 | * for already started RCU-only updates to finish. |
4929 | */ | 4893 | */ |
4930 | atomic_inc(&mc.from->moving_account); | 4894 | atomic_inc(&mc.from->moving_account); |
4931 | synchronize_rcu(); | 4895 | synchronize_rcu(); |
@@ -5517,16 +5481,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
5517 | } | 5481 | } |
5518 | 5482 | ||
5519 | /** | 5483 | /** |
5520 | * mem_cgroup_replace_page - migrate a charge to another page | 5484 | * mem_cgroup_migrate - charge a page's replacement |
5521 | * @oldpage: currently charged page | 5485 | * @oldpage: currently circulating page |
5522 | * @newpage: page to transfer the charge to | 5486 | * @newpage: replacement page |
5523 | * | 5487 | * |
5524 | * Migrate the charge from @oldpage to @newpage. | 5488 | * Charge @newpage as a replacement page for @oldpage. @oldpage will |
5489 | * be uncharged upon free. | ||
5525 | * | 5490 | * |
5526 | * Both pages must be locked, @newpage->mapping must be set up. | 5491 | * Both pages must be locked, @newpage->mapping must be set up. |
5527 | * Either or both pages might be on the LRU already. | ||
5528 | */ | 5492 | */ |
5529 | void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) | 5493 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) |
5530 | { | 5494 | { |
5531 | struct mem_cgroup *memcg; | 5495 | struct mem_cgroup *memcg; |
5532 | unsigned int nr_pages; | 5496 | unsigned int nr_pages; |
@@ -5559,7 +5523,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) | |||
5559 | page_counter_charge(&memcg->memsw, nr_pages); | 5523 | page_counter_charge(&memcg->memsw, nr_pages); |
5560 | css_get_many(&memcg->css, nr_pages); | 5524 | css_get_many(&memcg->css, nr_pages); |
5561 | 5525 | ||
5562 | commit_charge(newpage, memcg, true); | 5526 | commit_charge(newpage, memcg, false); |
5563 | 5527 | ||
5564 | local_irq_disable(); | 5528 | local_irq_disable(); |
5565 | mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); | 5529 | mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); |
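Read together, the memcontrol.c hunks above replace the old page-stat transaction API (mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat(), which returned and consumed a memcg pointer) with a lock keyed directly by the page, lock_page_memcg()/unlock_page_memcg(). A minimal caller-side sketch of that conversion, modelled on the page-writeback.c hunks further down this page; the function below is hypothetical, assumes the usual memcontrol/page-flags headers, and is not part of the patch:

/* Hypothetical caller, sketching the before/after pattern of this series. */
static void account_example_state(struct page *page)
{
        /*
         * Before: the caller carried the returned memcg around.
         *
         *      struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page);
         *      if (TestClearPageDirty(page))
         *              mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
         *      mem_cgroup_end_page_stat(memcg);
         */

        /* After: the lock and the stat helpers are keyed by the page itself. */
        lock_page_memcg(page);
        if (TestClearPageDirty(page))
                mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
        unlock_page_memcg(page);
}

The practical win is that intermediate helpers such as account_page_cleaned() no longer need a struct mem_cgroup * parameter threaded through them, as the page-writeback.c changes below show.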
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ac595e7a3a95..67c30eb993f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -826,8 +826,6 @@ static struct page_state { | |||
826 | #undef lru | 826 | #undef lru |
827 | #undef swapbacked | 827 | #undef swapbacked |
828 | #undef head | 828 | #undef head |
829 | #undef tail | ||
830 | #undef compound | ||
831 | #undef slab | 829 | #undef slab |
832 | #undef reserved | 830 | #undef reserved |
833 | 831 | ||
diff --git a/mm/memory.c b/mm/memory.c index 906d8e3b42c0..0e247642ed5b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1897,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1897 | unsigned long end = addr + size; | 1897 | unsigned long end = addr + size; |
1898 | int err; | 1898 | int err; |
1899 | 1899 | ||
1900 | BUG_ON(addr >= end); | 1900 | if (WARN_ON(addr >= end)) |
1901 | return -EINVAL; | ||
1902 | |||
1901 | pgd = pgd_offset(mm, addr); | 1903 | pgd = pgd_offset(mm, addr); |
1902 | do { | 1904 | do { |
1903 | next = pgd_addr_end(addr, end); | 1905 | next = pgd_addr_end(addr, end); |
@@ -3143,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3143 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 3145 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3144 | unsigned int flags, pte_t orig_pte) | 3146 | unsigned int flags, pte_t orig_pte) |
3145 | { | 3147 | { |
3146 | pgoff_t pgoff = (((address & PAGE_MASK) | 3148 | pgoff_t pgoff = linear_page_index(vma, address); |
3147 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
3148 | 3149 | ||
3149 | pte_unmap(page_table); | 3150 | pte_unmap(page_table); |
3150 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3151 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
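The do_fault() hunk just above swaps an open-coded pgoff calculation for linear_page_index(). For an ordinary page-aligned, non-hugetlb VMA the two forms are equivalent; a small standalone C model of the arithmetic (PAGE_SHIFT, the struct and the example numbers are assumptions made for this demo, not taken from the patch):

#include <stdio.h>

#define PAGE_SHIFT 12UL                         /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

struct vma { unsigned long vm_start, vm_pgoff; };

/* The open-coded form removed by the patch. */
static unsigned long pgoff_open_coded(const struct vma *vma, unsigned long address)
{
        return (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/* What linear_page_index() boils down to when vm_start is page aligned. */
static unsigned long pgoff_helper(const struct vma *vma, unsigned long address)
{
        return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

int main(void)
{
        struct vma vma = { .vm_start = 0x400000, .vm_pgoff = 16 };
        unsigned long addr = 0x403abc;          /* inside the fourth page of the VMA */

        /* prints "19 19": both forms give file page 16 + 3 */
        printf("%lu %lu\n", pgoff_open_coded(&vma, addr), pgoff_helper(&vma, addr));
        return 0;
}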
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 979b18cbd343..24ea06393816 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -77,6 +77,9 @@ static struct { | |||
77 | #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) | 77 | #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) |
78 | #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) | 78 | #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) |
79 | 79 | ||
80 | bool memhp_auto_online; | ||
81 | EXPORT_SYMBOL_GPL(memhp_auto_online); | ||
82 | |||
80 | void get_online_mems(void) | 83 | void get_online_mems(void) |
81 | { | 84 | { |
82 | might_sleep(); | 85 | might_sleep(); |
@@ -509,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
509 | int start_sec, end_sec; | 512 | int start_sec, end_sec; |
510 | struct vmem_altmap *altmap; | 513 | struct vmem_altmap *altmap; |
511 | 514 | ||
515 | clear_zone_contiguous(zone); | ||
516 | |||
512 | /* during initialize mem_map, align hot-added range to section */ | 517 | /* during initialize mem_map, align hot-added range to section */ |
513 | start_sec = pfn_to_section_nr(phys_start_pfn); | 518 | start_sec = pfn_to_section_nr(phys_start_pfn); |
514 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 519 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
@@ -521,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
521 | if (altmap->base_pfn != phys_start_pfn | 526 | if (altmap->base_pfn != phys_start_pfn |
522 | || vmem_altmap_offset(altmap) > nr_pages) { | 527 | || vmem_altmap_offset(altmap) > nr_pages) { |
523 | pr_warn_once("memory add fail, invalid altmap\n"); | 528 | pr_warn_once("memory add fail, invalid altmap\n"); |
524 | return -EINVAL; | 529 | err = -EINVAL; |
530 | goto out; | ||
525 | } | 531 | } |
526 | altmap->alloc = 0; | 532 | altmap->alloc = 0; |
527 | } | 533 | } |
@@ -539,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
539 | err = 0; | 545 | err = 0; |
540 | } | 546 | } |
541 | vmemmap_populate_print_last(); | 547 | vmemmap_populate_print_last(); |
542 | 548 | out: | |
549 | set_zone_contiguous(zone); | ||
543 | return err; | 550 | return err; |
544 | } | 551 | } |
545 | EXPORT_SYMBOL_GPL(__add_pages); | 552 | EXPORT_SYMBOL_GPL(__add_pages); |
@@ -811,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
811 | } | 818 | } |
812 | } | 819 | } |
813 | 820 | ||
821 | clear_zone_contiguous(zone); | ||
822 | |||
814 | /* | 823 | /* |
815 | * We can only remove entire sections | 824 | * We can only remove entire sections |
816 | */ | 825 | */ |
@@ -826,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
826 | if (ret) | 835 | if (ret) |
827 | break; | 836 | break; |
828 | } | 837 | } |
838 | |||
839 | set_zone_contiguous(zone); | ||
840 | |||
829 | return ret; | 841 | return ret; |
830 | } | 842 | } |
831 | EXPORT_SYMBOL_GPL(__remove_pages); | 843 | EXPORT_SYMBOL_GPL(__remove_pages); |
@@ -1261,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, | |||
1261 | return zone_default; | 1273 | return zone_default; |
1262 | } | 1274 | } |
1263 | 1275 | ||
1276 | static int online_memory_block(struct memory_block *mem, void *arg) | ||
1277 | { | ||
1278 | return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | ||
1279 | } | ||
1280 | |||
1264 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ | 1281 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
1265 | int __ref add_memory_resource(int nid, struct resource *res) | 1282 | int __ref add_memory_resource(int nid, struct resource *res, bool online) |
1266 | { | 1283 | { |
1267 | u64 start, size; | 1284 | u64 start, size; |
1268 | pg_data_t *pgdat = NULL; | 1285 | pg_data_t *pgdat = NULL; |
@@ -1322,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res) | |||
1322 | /* create new memmap entry */ | 1339 | /* create new memmap entry */ |
1323 | firmware_map_add_hotplug(start, start + size, "System RAM"); | 1340 | firmware_map_add_hotplug(start, start + size, "System RAM"); |
1324 | 1341 | ||
1342 | /* online pages if requested */ | ||
1343 | if (online) | ||
1344 | walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), | ||
1345 | NULL, online_memory_block); | ||
1346 | |||
1325 | goto out; | 1347 | goto out; |
1326 | 1348 | ||
1327 | error: | 1349 | error: |
@@ -1345,7 +1367,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1345 | if (IS_ERR(res)) | 1367 | if (IS_ERR(res)) |
1346 | return PTR_ERR(res); | 1368 | return PTR_ERR(res); |
1347 | 1369 | ||
1348 | ret = add_memory_resource(nid, res); | 1370 | ret = add_memory_resource(nid, res, memhp_auto_online); |
1349 | if (ret < 0) | 1371 | if (ret < 0) |
1350 | release_memory_resource(res); | 1372 | release_memory_resource(res); |
1351 | return ret; | 1373 | return ret; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9a3f6b90e628..8cbc74387df3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, | |||
643 | 643 | ||
644 | if (flags & MPOL_MF_LAZY) { | 644 | if (flags & MPOL_MF_LAZY) { |
645 | /* Similar to task_numa_work, skip inaccessible VMAs */ | 645 | /* Similar to task_numa_work, skip inaccessible VMAs */ |
646 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | 646 | if (!is_vm_hugetlb_page(vma) && |
647 | (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && | ||
648 | !(vma->vm_flags & VM_MIXEDMAP)) | ||
647 | change_prot_numa(vma, start, endvma); | 649 | change_prot_numa(vma, start, endvma); |
648 | return 1; | 650 | return 1; |
649 | } | 651 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 3ad0fea5c438..568284ec75d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/balloon_compaction.h> | 38 | #include <linux/balloon_compaction.h> |
39 | #include <linux/mmu_notifier.h> | 39 | #include <linux/mmu_notifier.h> |
40 | #include <linux/page_idle.h> | 40 | #include <linux/page_idle.h> |
41 | #include <linux/page_owner.h> | ||
41 | 42 | ||
42 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> |
43 | 44 | ||
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
325 | return -EAGAIN; | 326 | return -EAGAIN; |
326 | 327 | ||
327 | /* No turning back from here */ | 328 | /* No turning back from here */ |
328 | set_page_memcg(newpage, page_memcg(page)); | ||
329 | newpage->index = page->index; | 329 | newpage->index = page->index; |
330 | newpage->mapping = page->mapping; | 330 | newpage->mapping = page->mapping; |
331 | if (PageSwapBacked(page)) | 331 | if (PageSwapBacked(page)) |
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
372 | * Now we know that no one else is looking at the page: | 372 | * Now we know that no one else is looking at the page: |
373 | * no turning back from here. | 373 | * no turning back from here. |
374 | */ | 374 | */ |
375 | set_page_memcg(newpage, page_memcg(page)); | ||
376 | newpage->index = page->index; | 375 | newpage->index = page->index; |
377 | newpage->mapping = page->mapping; | 376 | newpage->mapping = page->mapping; |
378 | if (PageSwapBacked(page)) | 377 | if (PageSwapBacked(page)) |
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
457 | return -EAGAIN; | 456 | return -EAGAIN; |
458 | } | 457 | } |
459 | 458 | ||
460 | set_page_memcg(newpage, page_memcg(page)); | ||
461 | newpage->index = page->index; | 459 | newpage->index = page->index; |
462 | newpage->mapping = page->mapping; | 460 | newpage->mapping = page->mapping; |
461 | |||
463 | get_page(newpage); | 462 | get_page(newpage); |
464 | 463 | ||
465 | radix_tree_replace_slot(pslot, newpage); | 464 | radix_tree_replace_slot(pslot, newpage); |
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
467 | page_unfreeze_refs(page, expected_count - 1); | 466 | page_unfreeze_refs(page, expected_count - 1); |
468 | 467 | ||
469 | spin_unlock_irq(&mapping->tree_lock); | 468 | spin_unlock_irq(&mapping->tree_lock); |
469 | |||
470 | return MIGRATEPAGE_SUCCESS; | 470 | return MIGRATEPAGE_SUCCESS; |
471 | } | 471 | } |
472 | 472 | ||
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
578 | */ | 578 | */ |
579 | if (PageWriteback(newpage)) | 579 | if (PageWriteback(newpage)) |
580 | end_page_writeback(newpage); | 580 | end_page_writeback(newpage); |
581 | |||
582 | copy_page_owner(page, newpage); | ||
583 | |||
584 | mem_cgroup_migrate(page, newpage); | ||
581 | } | 585 | } |
582 | 586 | ||
583 | /************************************************************ | 587 | /************************************************************ |
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
772 | * page is freed; but stats require that PageAnon be left as PageAnon. | 776 | * page is freed; but stats require that PageAnon be left as PageAnon. |
773 | */ | 777 | */ |
774 | if (rc == MIGRATEPAGE_SUCCESS) { | 778 | if (rc == MIGRATEPAGE_SUCCESS) { |
775 | set_page_memcg(page, NULL); | ||
776 | if (!PageAnon(page)) | 779 | if (!PageAnon(page)) |
777 | page->mapping = NULL; | 780 | page->mapping = NULL; |
778 | } | 781 | } |
@@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
952 | } | 955 | } |
953 | 956 | ||
954 | rc = __unmap_and_move(page, newpage, force, mode); | 957 | rc = __unmap_and_move(page, newpage, force, mode); |
955 | if (rc == MIGRATEPAGE_SUCCESS) | 958 | if (rc == MIGRATEPAGE_SUCCESS) { |
956 | put_new_page = NULL; | 959 | put_new_page = NULL; |
960 | set_page_owner_migrate_reason(newpage, reason); | ||
961 | } | ||
957 | 962 | ||
958 | out: | 963 | out: |
959 | if (rc != -EAGAIN) { | 964 | if (rc != -EAGAIN) { |
@@ -1018,7 +1023,7 @@ out: | |||
1018 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 1023 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
1019 | free_page_t put_new_page, unsigned long private, | 1024 | free_page_t put_new_page, unsigned long private, |
1020 | struct page *hpage, int force, | 1025 | struct page *hpage, int force, |
1021 | enum migrate_mode mode) | 1026 | enum migrate_mode mode, int reason) |
1022 | { | 1027 | { |
1023 | int rc = -EAGAIN; | 1028 | int rc = -EAGAIN; |
1024 | int *result = NULL; | 1029 | int *result = NULL; |
@@ -1076,6 +1081,7 @@ put_anon: | |||
1076 | if (rc == MIGRATEPAGE_SUCCESS) { | 1081 | if (rc == MIGRATEPAGE_SUCCESS) { |
1077 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1082 | hugetlb_cgroup_migrate(hpage, new_hpage); |
1078 | put_new_page = NULL; | 1083 | put_new_page = NULL; |
1084 | set_page_owner_migrate_reason(new_hpage, reason); | ||
1079 | } | 1085 | } |
1080 | 1086 | ||
1081 | unlock_page(hpage); | 1087 | unlock_page(hpage); |
@@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1148 | if (PageHuge(page)) | 1154 | if (PageHuge(page)) |
1149 | rc = unmap_and_move_huge_page(get_new_page, | 1155 | rc = unmap_and_move_huge_page(get_new_page, |
1150 | put_new_page, private, page, | 1156 | put_new_page, private, page, |
1151 | pass > 2, mode); | 1157 | pass > 2, mode, reason); |
1152 | else | 1158 | else |
1153 | rc = unmap_and_move(get_new_page, put_new_page, | 1159 | rc = unmap_and_move(get_new_page, put_new_page, |
1154 | private, page, pass > 2, mode, | 1160 | private, page, pass > 2, mode, |
@@ -1836,9 +1842,8 @@ fail_putback: | |||
1836 | } | 1842 | } |
1837 | 1843 | ||
1838 | mlock_migrate_page(new_page, page); | 1844 | mlock_migrate_page(new_page, page); |
1839 | set_page_memcg(new_page, page_memcg(page)); | ||
1840 | set_page_memcg(page, NULL); | ||
1841 | page_remove_rmap(page, true); | 1845 | page_remove_rmap(page, true); |
1846 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); | ||
1842 | 1847 | ||
1843 | spin_unlock(ptl); | 1848 | spin_unlock(ptl); |
1844 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1849 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dc490c06941b..e97a05d9621f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
386 | static void dump_header(struct oom_control *oc, struct task_struct *p, | 386 | static void dump_header(struct oom_control *oc, struct task_struct *p, |
387 | struct mem_cgroup *memcg) | 387 | struct mem_cgroup *memcg) |
388 | { | 388 | { |
389 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 389 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " |
390 | "oom_score_adj=%hd\n", | 390 | "oom_score_adj=%hd\n", |
391 | current->comm, oc->gfp_mask, oc->order, | 391 | current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, |
392 | current->signal->oom_score_adj); | 392 | current->signal->oom_score_adj); |
393 | |||
393 | cpuset_print_current_mems_allowed(); | 394 | cpuset_print_current_mems_allowed(); |
394 | dump_stack(); | 395 | dump_stack(); |
395 | if (memcg) | 396 | if (memcg) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6fe7d15bd1f7..11ff8f758631 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, | |||
1169 | unsigned long balanced_dirty_ratelimit; | 1169 | unsigned long balanced_dirty_ratelimit; |
1170 | unsigned long step; | 1170 | unsigned long step; |
1171 | unsigned long x; | 1171 | unsigned long x; |
1172 | unsigned long shift; | ||
1172 | 1173 | ||
1173 | /* | 1174 | /* |
1174 | * The dirty rate will match the writeout rate in long term, except | 1175 | * The dirty rate will match the writeout rate in long term, except |
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, | |||
1293 | * rate itself is constantly fluctuating. So decrease the track speed | 1294 | * rate itself is constantly fluctuating. So decrease the track speed |
1294 | * when it gets close to the target. Helps eliminate pointless tremors. | 1295 | * when it gets close to the target. Helps eliminate pointless tremors. |
1295 | */ | 1296 | */ |
1296 | step >>= dirty_ratelimit / (2 * step + 1); | 1297 | shift = dirty_ratelimit / (2 * step + 1); |
1297 | /* | 1298 | if (shift < BITS_PER_LONG) |
1298 | * Limit the tracking speed to avoid overshooting. | 1299 | step = DIV_ROUND_UP(step >> shift, 8); |
1299 | */ | 1300 | else |
1300 | step = (step + 7) / 8; | 1301 | step = 0; |
1301 | 1302 | ||
1302 | if (dirty_ratelimit < balanced_dirty_ratelimit) | 1303 | if (dirty_ratelimit < balanced_dirty_ratelimit) |
1303 | dirty_ratelimit += step; | 1304 | dirty_ratelimit += step; |
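The wb_update_dirty_ratelimit() hunk above exists because the old "step >>= dirty_ratelimit / (2 * step + 1)" could end up shifting by BITS_PER_LONG or more, which is undefined behaviour in C; the new code computes the shift count first and clamps the step to zero when the shift would be too large. A throwaway userspace model of the guarded computation (BITS_PER_LONG and the sample numbers below are local stand-ins, not kernel values):

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG      (sizeof(unsigned long) * CHAR_BIT)
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long tracking_step(unsigned long step, unsigned long dirty_ratelimit)
{
        unsigned long shift = dirty_ratelimit / (2 * step + 1);

        /* "step >>= shift" would be undefined once shift >= BITS_PER_LONG */
        if (shift < BITS_PER_LONG)
                return DIV_ROUND_UP(step >> shift, 8);
        return 0;
}

int main(void)
{
        printf("%lu\n", tracking_step(1024, 3));        /* shift 0: 1024 / 8 = 128 */
        printf("%lu\n", tracking_step(1, 1UL << 20));   /* enormous shift: clamped to 0 */
        return 0;
}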
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page) | |||
2409 | /* | 2410 | /* |
2410 | * Helper function for set_page_dirty family. | 2411 | * Helper function for set_page_dirty family. |
2411 | * | 2412 | * |
2412 | * Caller must hold mem_cgroup_begin_page_stat(). | 2413 | * Caller must hold lock_page_memcg(). |
2413 | * | 2414 | * |
2414 | * NOTE: This relies on being atomic wrt interrupts. | 2415 | * NOTE: This relies on being atomic wrt interrupts. |
2415 | */ | 2416 | */ |
2416 | void account_page_dirtied(struct page *page, struct address_space *mapping, | 2417 | void account_page_dirtied(struct page *page, struct address_space *mapping) |
2417 | struct mem_cgroup *memcg) | ||
2418 | { | 2418 | { |
2419 | struct inode *inode = mapping->host; | 2419 | struct inode *inode = mapping->host; |
2420 | 2420 | ||
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, | |||
2426 | inode_attach_wb(inode, page); | 2426 | inode_attach_wb(inode, page); |
2427 | wb = inode_to_wb(inode); | 2427 | wb = inode_to_wb(inode); |
2428 | 2428 | ||
2429 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); | 2429 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2430 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 2430 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
2431 | __inc_zone_page_state(page, NR_DIRTIED); | 2431 | __inc_zone_page_state(page, NR_DIRTIED); |
2432 | __inc_wb_stat(wb, WB_RECLAIMABLE); | 2432 | __inc_wb_stat(wb, WB_RECLAIMABLE); |
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
2441 | /* | 2441 | /* |
2442 | * Helper function for deaccounting dirty page without writeback. | 2442 | * Helper function for deaccounting dirty page without writeback. |
2443 | * | 2443 | * |
2444 | * Caller must hold mem_cgroup_begin_page_stat(). | 2444 | * Caller must hold lock_page_memcg(). |
2445 | */ | 2445 | */ |
2446 | void account_page_cleaned(struct page *page, struct address_space *mapping, | 2446 | void account_page_cleaned(struct page *page, struct address_space *mapping, |
2447 | struct mem_cgroup *memcg, struct bdi_writeback *wb) | 2447 | struct bdi_writeback *wb) |
2448 | { | 2448 | { |
2449 | if (mapping_cap_account_dirty(mapping)) { | 2449 | if (mapping_cap_account_dirty(mapping)) { |
2450 | mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); | 2450 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2451 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2451 | dec_zone_page_state(page, NR_FILE_DIRTY); |
2452 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2452 | dec_wb_stat(wb, WB_RECLAIMABLE); |
2453 | task_io_account_cancelled_write(PAGE_CACHE_SIZE); | 2453 | task_io_account_cancelled_write(PAGE_CACHE_SIZE); |
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, | |||
2468 | */ | 2468 | */ |
2469 | int __set_page_dirty_nobuffers(struct page *page) | 2469 | int __set_page_dirty_nobuffers(struct page *page) |
2470 | { | 2470 | { |
2471 | struct mem_cgroup *memcg; | 2471 | lock_page_memcg(page); |
2472 | |||
2473 | memcg = mem_cgroup_begin_page_stat(page); | ||
2474 | if (!TestSetPageDirty(page)) { | 2472 | if (!TestSetPageDirty(page)) { |
2475 | struct address_space *mapping = page_mapping(page); | 2473 | struct address_space *mapping = page_mapping(page); |
2476 | unsigned long flags; | 2474 | unsigned long flags; |
2477 | 2475 | ||
2478 | if (!mapping) { | 2476 | if (!mapping) { |
2479 | mem_cgroup_end_page_stat(memcg); | 2477 | unlock_page_memcg(page); |
2480 | return 1; | 2478 | return 1; |
2481 | } | 2479 | } |
2482 | 2480 | ||
2483 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2481 | spin_lock_irqsave(&mapping->tree_lock, flags); |
2484 | BUG_ON(page_mapping(page) != mapping); | 2482 | BUG_ON(page_mapping(page) != mapping); |
2485 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 2483 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
2486 | account_page_dirtied(page, mapping, memcg); | 2484 | account_page_dirtied(page, mapping); |
2487 | radix_tree_tag_set(&mapping->page_tree, page_index(page), | 2485 | radix_tree_tag_set(&mapping->page_tree, page_index(page), |
2488 | PAGECACHE_TAG_DIRTY); | 2486 | PAGECACHE_TAG_DIRTY); |
2489 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 2487 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
2490 | mem_cgroup_end_page_stat(memcg); | 2488 | unlock_page_memcg(page); |
2491 | 2489 | ||
2492 | if (mapping->host) { | 2490 | if (mapping->host) { |
2493 | /* !PageAnon && !swapper_space */ | 2491 | /* !PageAnon && !swapper_space */ |
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
2495 | } | 2493 | } |
2496 | return 1; | 2494 | return 1; |
2497 | } | 2495 | } |
2498 | mem_cgroup_end_page_stat(memcg); | 2496 | unlock_page_memcg(page); |
2499 | return 0; | 2497 | return 0; |
2500 | } | 2498 | } |
2501 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); | 2499 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page) | |||
2625 | if (mapping_cap_account_dirty(mapping)) { | 2623 | if (mapping_cap_account_dirty(mapping)) { |
2626 | struct inode *inode = mapping->host; | 2624 | struct inode *inode = mapping->host; |
2627 | struct bdi_writeback *wb; | 2625 | struct bdi_writeback *wb; |
2628 | struct mem_cgroup *memcg; | ||
2629 | bool locked; | 2626 | bool locked; |
2630 | 2627 | ||
2631 | memcg = mem_cgroup_begin_page_stat(page); | 2628 | lock_page_memcg(page); |
2632 | wb = unlocked_inode_to_wb_begin(inode, &locked); | 2629 | wb = unlocked_inode_to_wb_begin(inode, &locked); |
2633 | 2630 | ||
2634 | if (TestClearPageDirty(page)) | 2631 | if (TestClearPageDirty(page)) |
2635 | account_page_cleaned(page, mapping, memcg, wb); | 2632 | account_page_cleaned(page, mapping, wb); |
2636 | 2633 | ||
2637 | unlocked_inode_to_wb_end(inode, locked); | 2634 | unlocked_inode_to_wb_end(inode, locked); |
2638 | mem_cgroup_end_page_stat(memcg); | 2635 | unlock_page_memcg(page); |
2639 | } else { | 2636 | } else { |
2640 | ClearPageDirty(page); | 2637 | ClearPageDirty(page); |
2641 | } | 2638 | } |
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) | |||
2666 | if (mapping && mapping_cap_account_dirty(mapping)) { | 2663 | if (mapping && mapping_cap_account_dirty(mapping)) { |
2667 | struct inode *inode = mapping->host; | 2664 | struct inode *inode = mapping->host; |
2668 | struct bdi_writeback *wb; | 2665 | struct bdi_writeback *wb; |
2669 | struct mem_cgroup *memcg; | ||
2670 | bool locked; | 2666 | bool locked; |
2671 | 2667 | ||
2672 | /* | 2668 | /* |
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page) | |||
2704 | * always locked coming in here, so we get the desired | 2700 | * always locked coming in here, so we get the desired |
2705 | * exclusion. | 2701 | * exclusion. |
2706 | */ | 2702 | */ |
2707 | memcg = mem_cgroup_begin_page_stat(page); | ||
2708 | wb = unlocked_inode_to_wb_begin(inode, &locked); | 2703 | wb = unlocked_inode_to_wb_begin(inode, &locked); |
2709 | if (TestClearPageDirty(page)) { | 2704 | if (TestClearPageDirty(page)) { |
2710 | mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); | 2705 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2711 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2706 | dec_zone_page_state(page, NR_FILE_DIRTY); |
2712 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2707 | dec_wb_stat(wb, WB_RECLAIMABLE); |
2713 | ret = 1; | 2708 | ret = 1; |
2714 | } | 2709 | } |
2715 | unlocked_inode_to_wb_end(inode, locked); | 2710 | unlocked_inode_to_wb_end(inode, locked); |
2716 | mem_cgroup_end_page_stat(memcg); | ||
2717 | return ret; | 2711 | return ret; |
2718 | } | 2712 | } |
2719 | return TestClearPageDirty(page); | 2713 | return TestClearPageDirty(page); |
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); | |||
2723 | int test_clear_page_writeback(struct page *page) | 2717 | int test_clear_page_writeback(struct page *page) |
2724 | { | 2718 | { |
2725 | struct address_space *mapping = page_mapping(page); | 2719 | struct address_space *mapping = page_mapping(page); |
2726 | struct mem_cgroup *memcg; | ||
2727 | int ret; | 2720 | int ret; |
2728 | 2721 | ||
2729 | memcg = mem_cgroup_begin_page_stat(page); | 2722 | lock_page_memcg(page); |
2730 | if (mapping) { | 2723 | if (mapping) { |
2731 | struct inode *inode = mapping->host; | 2724 | struct inode *inode = mapping->host; |
2732 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 2725 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page) | |||
2750 | ret = TestClearPageWriteback(page); | 2743 | ret = TestClearPageWriteback(page); |
2751 | } | 2744 | } |
2752 | if (ret) { | 2745 | if (ret) { |
2753 | mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2746 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); |
2754 | dec_zone_page_state(page, NR_WRITEBACK); | 2747 | dec_zone_page_state(page, NR_WRITEBACK); |
2755 | inc_zone_page_state(page, NR_WRITTEN); | 2748 | inc_zone_page_state(page, NR_WRITTEN); |
2756 | } | 2749 | } |
2757 | mem_cgroup_end_page_stat(memcg); | 2750 | unlock_page_memcg(page); |
2758 | return ret; | 2751 | return ret; |
2759 | } | 2752 | } |
2760 | 2753 | ||
2761 | int __test_set_page_writeback(struct page *page, bool keep_write) | 2754 | int __test_set_page_writeback(struct page *page, bool keep_write) |
2762 | { | 2755 | { |
2763 | struct address_space *mapping = page_mapping(page); | 2756 | struct address_space *mapping = page_mapping(page); |
2764 | struct mem_cgroup *memcg; | ||
2765 | int ret; | 2757 | int ret; |
2766 | 2758 | ||
2767 | memcg = mem_cgroup_begin_page_stat(page); | 2759 | lock_page_memcg(page); |
2768 | if (mapping) { | 2760 | if (mapping) { |
2769 | struct inode *inode = mapping->host; | 2761 | struct inode *inode = mapping->host; |
2770 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 2762 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2792 | ret = TestSetPageWriteback(page); | 2784 | ret = TestSetPageWriteback(page); |
2793 | } | 2785 | } |
2794 | if (!ret) { | 2786 | if (!ret) { |
2795 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2787 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); |
2796 | inc_zone_page_state(page, NR_WRITEBACK); | 2788 | inc_zone_page_state(page, NR_WRITEBACK); |
2797 | } | 2789 | } |
2798 | mem_cgroup_end_page_stat(memcg); | 2790 | unlock_page_memcg(page); |
2799 | return ret; | 2791 | return ret; |
2800 | 2792 | ||
2801 | } | 2793 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb64f7..c46b75d14b6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
223 | #endif | 223 | #endif |
224 | }; | 224 | }; |
225 | 225 | ||
226 | char * const migratetype_names[MIGRATE_TYPES] = { | ||
227 | "Unmovable", | ||
228 | "Movable", | ||
229 | "Reclaimable", | ||
230 | "HighAtomic", | ||
231 | #ifdef CONFIG_CMA | ||
232 | "CMA", | ||
233 | #endif | ||
234 | #ifdef CONFIG_MEMORY_ISOLATION | ||
235 | "Isolate", | ||
236 | #endif | ||
237 | }; | ||
238 | |||
226 | compound_page_dtor * const compound_page_dtors[] = { | 239 | compound_page_dtor * const compound_page_dtors[] = { |
227 | NULL, | 240 | NULL, |
228 | free_compound_page, | 241 | free_compound_page, |
@@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | |||
247 | static unsigned long __initdata required_kernelcore; | 260 | static unsigned long __initdata required_kernelcore; |
248 | static unsigned long __initdata required_movablecore; | 261 | static unsigned long __initdata required_movablecore; |
249 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 262 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
263 | static bool mirrored_kernelcore; | ||
250 | 264 | ||
251 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 265 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
252 | int movable_zone; | 266 | int movable_zone; |
@@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason, | |||
416 | goto out; | 430 | goto out; |
417 | } | 431 | } |
418 | if (nr_unshown) { | 432 | if (nr_unshown) { |
419 | printk(KERN_ALERT | 433 | pr_alert( |
420 | "BUG: Bad page state: %lu messages suppressed\n", | 434 | "BUG: Bad page state: %lu messages suppressed\n", |
421 | nr_unshown); | 435 | nr_unshown); |
422 | nr_unshown = 0; | 436 | nr_unshown = 0; |
@@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason, | |||
426 | if (nr_shown++ == 0) | 440 | if (nr_shown++ == 0) |
427 | resume = jiffies + 60 * HZ; | 441 | resume = jiffies + 60 * HZ; |
428 | 442 | ||
429 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 443 | pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
430 | current->comm, page_to_pfn(page)); | 444 | current->comm, page_to_pfn(page)); |
431 | dump_page_badflags(page, reason, bad_flags); | 445 | __dump_page(page, reason); |
446 | bad_flags &= page->flags; | ||
447 | if (bad_flags) | ||
448 | pr_alert("bad because of flags: %#lx(%pGp)\n", | ||
449 | bad_flags, &bad_flags); | ||
450 | dump_page_owner(page); | ||
432 | 451 | ||
433 | print_modules(); | 452 | print_modules(); |
434 | dump_stack(); | 453 | dump_stack(); |
@@ -477,7 +496,8 @@ void prep_compound_page(struct page *page, unsigned int order) | |||
477 | 496 | ||
478 | #ifdef CONFIG_DEBUG_PAGEALLOC | 497 | #ifdef CONFIG_DEBUG_PAGEALLOC |
479 | unsigned int _debug_guardpage_minorder; | 498 | unsigned int _debug_guardpage_minorder; |
480 | bool _debug_pagealloc_enabled __read_mostly; | 499 | bool _debug_pagealloc_enabled __read_mostly |
500 | = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); | ||
481 | bool _debug_guardpage_enabled __read_mostly; | 501 | bool _debug_guardpage_enabled __read_mostly; |
482 | 502 | ||
483 | static int __init early_debug_pagealloc(char *buf) | 503 | static int __init early_debug_pagealloc(char *buf) |
@@ -488,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf) | |||
488 | if (strcmp(buf, "on") == 0) | 508 | if (strcmp(buf, "on") == 0) |
489 | _debug_pagealloc_enabled = true; | 509 | _debug_pagealloc_enabled = true; |
490 | 510 | ||
511 | if (strcmp(buf, "off") == 0) | ||
512 | _debug_pagealloc_enabled = false; | ||
513 | |||
491 | return 0; | 514 | return 0; |
492 | } | 515 | } |
493 | early_param("debug_pagealloc", early_debug_pagealloc); | 516 | early_param("debug_pagealloc", early_debug_pagealloc); |
@@ -1002,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
1002 | PAGE_SIZE << order); | 1025 | PAGE_SIZE << order); |
1003 | } | 1026 | } |
1004 | arch_free_page(page, order); | 1027 | arch_free_page(page, order); |
1028 | kernel_poison_pages(page, 1 << order, 0); | ||
1005 | kernel_map_pages(page, 1 << order, 0); | 1029 | kernel_map_pages(page, 1 << order, 0); |
1006 | 1030 | ||
1007 | return true; | 1031 | return true; |
@@ -1104,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, | |||
1104 | return __free_pages_boot_core(page, pfn, order); | 1128 | return __free_pages_boot_core(page, pfn, order); |
1105 | } | 1129 | } |
1106 | 1130 | ||
1131 | /* | ||
1132 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
1133 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
1134 | * with the migration of free compaction scanner. The scanners then need to | ||
1135 | * use only pfn_valid_within() check for arches that allow holes within | ||
1136 | * pageblocks. | ||
1137 | * | ||
1138 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
1139 | * | ||
1140 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
1141 | * i.e. it's possible that all pages within a zones range of pages do not | ||
1142 | * belong to a single zone. We assume that a border between node0 and node1 | ||
1143 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
1144 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
1145 | * the first and last page of a pageblock and avoid checking each individual | ||
1146 | * page in a pageblock. | ||
1147 | */ | ||
1148 | struct page *__pageblock_pfn_to_page(unsigned long start_pfn, | ||
1149 | unsigned long end_pfn, struct zone *zone) | ||
1150 | { | ||
1151 | struct page *start_page; | ||
1152 | struct page *end_page; | ||
1153 | |||
1154 | /* end_pfn is one past the range we are checking */ | ||
1155 | end_pfn--; | ||
1156 | |||
1157 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
1158 | return NULL; | ||
1159 | |||
1160 | start_page = pfn_to_page(start_pfn); | ||
1161 | |||
1162 | if (page_zone(start_page) != zone) | ||
1163 | return NULL; | ||
1164 | |||
1165 | end_page = pfn_to_page(end_pfn); | ||
1166 | |||
1167 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
1168 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
1169 | return NULL; | ||
1170 | |||
1171 | return start_page; | ||
1172 | } | ||
1173 | |||
1174 | void set_zone_contiguous(struct zone *zone) | ||
1175 | { | ||
1176 | unsigned long block_start_pfn = zone->zone_start_pfn; | ||
1177 | unsigned long block_end_pfn; | ||
1178 | |||
1179 | block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); | ||
1180 | for (; block_start_pfn < zone_end_pfn(zone); | ||
1181 | block_start_pfn = block_end_pfn, | ||
1182 | block_end_pfn += pageblock_nr_pages) { | ||
1183 | |||
1184 | block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); | ||
1185 | |||
1186 | if (!__pageblock_pfn_to_page(block_start_pfn, | ||
1187 | block_end_pfn, zone)) | ||
1188 | return; | ||
1189 | } | ||
1190 | |||
1191 | /* We confirm that there is no hole */ | ||
1192 | zone->contiguous = true; | ||
1193 | } | ||
1194 | |||
1195 | void clear_zone_contiguous(struct zone *zone) | ||
1196 | { | ||
1197 | zone->contiguous = false; | ||
1198 | } | ||
1199 | |||
1107 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 1200 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
1108 | static void __init deferred_free_range(struct page *page, | 1201 | static void __init deferred_free_range(struct page *page, |
1109 | unsigned long pfn, int nr_pages) | 1202 | unsigned long pfn, int nr_pages) |
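The block added above lets page_alloc.c record whether a zone's pfn range has holes: set_zone_contiguous() walks the zone one pageblock at a time, and __pageblock_pfn_to_page() probes only the first and last pfn of each block (holes inside a block are left to pfn_valid_within(), per the comment). The resulting zone->contiguous flag presumably lets pageblock_pfn_to_page() callers skip this probing on zones known to be hole-free, and the memory_hotplug.c hunks earlier on this page clear and recompute it around section add/remove. A toy userspace model of the probing, with invented sizes and a plain int array standing in for zones:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 8            /* tiny made-up pageblock for the demo */
#define NR_PFNS 64

static int zone_of[NR_PFNS];            /* -1 marks an invalid (hole) pfn */

static bool pfn_valid(unsigned long pfn)
{
        return pfn < NR_PFNS && zone_of[pfn] >= 0;
}

/* Mirrors __pageblock_pfn_to_page(): only the first and last pfn are probed. */
static bool block_in_zone(unsigned long start, unsigned long end, int zone)
{
        end--;                          /* end is one past the range */
        if (!pfn_valid(start) || !pfn_valid(end))
                return false;
        return zone_of[start] == zone && zone_of[end] == zone;
}

static bool zone_is_contiguous(unsigned long zone_start, unsigned long zone_end, int zone)
{
        unsigned long pfn, block_end;

        for (pfn = zone_start; pfn < zone_end; pfn += PAGEBLOCK_NR_PAGES) {
                block_end = pfn + PAGEBLOCK_NR_PAGES;
                if (block_end > zone_end)
                        block_end = zone_end;
                if (!block_in_zone(pfn, block_end, zone))
                        return false;
        }
        return true;
}

int main(void)
{
        unsigned long pfn;

        for (pfn = 0; pfn < NR_PFNS; pfn++)
                zone_of[pfn] = 0;
        printf("%d\n", zone_is_contiguous(0, NR_PFNS, 0));     /* 1 */

        zone_of[20] = -1;       /* hole inside a block: boundary probes miss it */
        printf("%d\n", zone_is_contiguous(0, NR_PFNS, 0));     /* still 1 */

        zone_of[16] = -1;       /* hole at a block boundary */
        printf("%d\n", zone_is_contiguous(0, NR_PFNS, 0));     /* 0 */
        return 0;
}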
@@ -1254,9 +1347,13 @@ free_range: | |||
1254 | pgdat_init_report_one_done(); | 1347 | pgdat_init_report_one_done(); |
1255 | return 0; | 1348 | return 0; |
1256 | } | 1349 | } |
1350 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1257 | 1351 | ||
1258 | void __init page_alloc_init_late(void) | 1352 | void __init page_alloc_init_late(void) |
1259 | { | 1353 | { |
1354 | struct zone *zone; | ||
1355 | |||
1356 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | ||
1260 | int nid; | 1357 | int nid; |
1261 | 1358 | ||
1262 | /* There will be num_node_state(N_MEMORY) threads */ | 1359 | /* There will be num_node_state(N_MEMORY) threads */ |
@@ -1270,8 +1367,11 @@ void __init page_alloc_init_late(void) | |||
1270 | 1367 | ||
1271 | /* Reinit limits that are based on free pages after the kernel is up */ | 1368 | /* Reinit limits that are based on free pages after the kernel is up */ |
1272 | files_maxfiles_init(); | 1369 | files_maxfiles_init(); |
1370 | #endif | ||
1371 | |||
1372 | for_each_populated_zone(zone) | ||
1373 | set_zone_contiguous(zone); | ||
1273 | } | 1374 | } |
1274 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1275 | 1375 | ||
1276 | #ifdef CONFIG_CMA | 1376 | #ifdef CONFIG_CMA |
1277 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ | 1377 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
@@ -1381,15 +1481,24 @@ static inline int check_new_page(struct page *page) | |||
1381 | return 0; | 1481 | return 0; |
1382 | } | 1482 | } |
1383 | 1483 | ||
1484 | static inline bool free_pages_prezeroed(bool poisoned) | ||
1485 | { | ||
1486 | return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && | ||
1487 | page_poisoning_enabled() && poisoned; | ||
1488 | } | ||
1489 | |||
1384 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | 1490 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
1385 | int alloc_flags) | 1491 | int alloc_flags) |
1386 | { | 1492 | { |
1387 | int i; | 1493 | int i; |
1494 | bool poisoned = true; | ||
1388 | 1495 | ||
1389 | for (i = 0; i < (1 << order); i++) { | 1496 | for (i = 0; i < (1 << order); i++) { |
1390 | struct page *p = page + i; | 1497 | struct page *p = page + i; |
1391 | if (unlikely(check_new_page(p))) | 1498 | if (unlikely(check_new_page(p))) |
1392 | return 1; | 1499 | return 1; |
1500 | if (poisoned) | ||
1501 | poisoned &= page_is_poisoned(p); | ||
1393 | } | 1502 | } |
1394 | 1503 | ||
1395 | set_page_private(page, 0); | 1504 | set_page_private(page, 0); |
@@ -1397,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | |||
1397 | 1506 | ||
1398 | arch_alloc_page(page, order); | 1507 | arch_alloc_page(page, order); |
1399 | kernel_map_pages(page, 1 << order, 1); | 1508 | kernel_map_pages(page, 1 << order, 1); |
1509 | kernel_poison_pages(page, 1 << order, 1); | ||
1400 | kasan_alloc_pages(page, order); | 1510 | kasan_alloc_pages(page, order); |
1401 | 1511 | ||
1402 | if (gfp_flags & __GFP_ZERO) | 1512 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) |
1403 | for (i = 0; i < (1 << order); i++) | 1513 | for (i = 0; i < (1 << order); i++) |
1404 | clear_highpage(page + i); | 1514 | clear_highpage(page + i); |
1405 | 1515 | ||
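With the prep_new_page() change just above, a __GFP_ZERO allocation can skip clear_highpage() when CONFIG_PAGE_POISONING_ZERO poisoning has already left every constituent page full of zeroes. The decision reduces to a small predicate; a throwaway userspace rendering of that logic (the flags here are plain bools, not the kernel's config machinery):

#include <stdbool.h>
#include <stdio.h>

/* Pages count as pre-zeroed only if zero-poisoning is active AND the
 * poison pattern is still intact on every page of the allocation. */
static bool free_pages_prezeroed(bool zero_poisoning, bool poisoned)
{
        return zero_poisoning && poisoned;
}

int main(void)
{
        bool want_zero = true;  /* caller passed __GFP_ZERO */

        /* clearing is only needed when the pages were not pre-zeroed */
        printf("%d\n", want_zero && !free_pages_prezeroed(true, true));   /* 0: skip clearing */
        printf("%d\n", want_zero && !free_pages_prezeroed(true, false));  /* 1: must clear */
        printf("%d\n", want_zero && !free_pages_prezeroed(false, true));  /* 1: must clear */
        return 0;
}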
@@ -2690,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) | |||
2690 | va_end(args); | 2800 | va_end(args); |
2691 | } | 2801 | } |
2692 | 2802 | ||
2693 | pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", | 2803 | pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", |
2694 | current->comm, order, gfp_mask); | 2804 | current->comm, order, gfp_mask, &gfp_mask); |
2695 | |||
2696 | dump_stack(); | 2805 | dump_stack(); |
2697 | if (!should_suppress_show_mem()) | 2806 | if (!should_suppress_show_mem()) |
2698 | show_mem(filter); | 2807 | show_mem(filter); |
@@ -4491,6 +4600,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4491 | pg_data_t *pgdat = NODE_DATA(nid); | 4600 | pg_data_t *pgdat = NODE_DATA(nid); |
4492 | unsigned long pfn; | 4601 | unsigned long pfn; |
4493 | unsigned long nr_initialised = 0; | 4602 | unsigned long nr_initialised = 0; |
4603 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4604 | struct memblock_region *r = NULL, *tmp; | ||
4605 | #endif | ||
4494 | 4606 | ||
4495 | if (highest_memmap_pfn < end_pfn - 1) | 4607 | if (highest_memmap_pfn < end_pfn - 1) |
4496 | highest_memmap_pfn = end_pfn - 1; | 4608 | highest_memmap_pfn = end_pfn - 1; |
@@ -4504,20 +4616,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4504 | 4616 | ||
4505 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 4617 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
4506 | /* | 4618 | /* |
4507 | * There can be holes in boot-time mem_map[]s | 4619 | * There can be holes in boot-time mem_map[]s handed to this |
4508 | * handed to this function. They do not | 4620 | * function. They do not exist on hotplugged memory. |
4509 | * exist on hotplugged memory. | ||
4510 | */ | 4621 | */ |
4511 | if (context == MEMMAP_EARLY) { | 4622 | if (context != MEMMAP_EARLY) |
4512 | if (!early_pfn_valid(pfn)) | 4623 | goto not_early; |
4624 | |||
4625 | if (!early_pfn_valid(pfn)) | ||
4626 | continue; | ||
4627 | if (!early_pfn_in_nid(pfn, nid)) | ||
4628 | continue; | ||
4629 | if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) | ||
4630 | break; | ||
4631 | |||
4632 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4633 | /* | ||
4634 | * If not mirrored_kernelcore and ZONE_MOVABLE exists, range | ||
4635 | * from zone_movable_pfn[nid] to end of each node should be | ||
4636 | * ZONE_MOVABLE not ZONE_NORMAL. skip it. | ||
4637 | */ | ||
4638 | if (!mirrored_kernelcore && zone_movable_pfn[nid]) | ||
4639 | if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) | ||
4513 | continue; | 4640 | continue; |
4514 | if (!early_pfn_in_nid(pfn, nid)) | 4641 | |
4642 | /* | ||
4643 | * Check given memblock attribute by firmware which can affect | ||
4644 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is | ||
4645 | * mirrored, it's an overlapped memmap init. skip it. | ||
4646 | */ | ||
4647 | if (mirrored_kernelcore && zone == ZONE_MOVABLE) { | ||
4648 | if (!r || pfn >= memblock_region_memory_end_pfn(r)) { | ||
4649 | for_each_memblock(memory, tmp) | ||
4650 | if (pfn < memblock_region_memory_end_pfn(tmp)) | ||
4651 | break; | ||
4652 | r = tmp; | ||
4653 | } | ||
4654 | if (pfn >= memblock_region_memory_base_pfn(r) && | ||
4655 | memblock_is_mirror(r)) { | ||
4656 | /* already initialized as NORMAL */ | ||
4657 | pfn = memblock_region_memory_end_pfn(r); | ||
4515 | continue; | 4658 | continue; |
4516 | if (!update_defer_init(pgdat, pfn, end_pfn, | 4659 | } |
4517 | &nr_initialised)) | ||
4518 | break; | ||
4519 | } | 4660 | } |
4661 | #endif | ||
4520 | 4662 | ||
4663 | not_early: | ||
4521 | /* | 4664 | /* |
4522 | * Mark the block movable so that blocks are reserved for | 4665 | * Mark the block movable so that blocks are reserved for |
4523 | * movable at startup. This will force kernel allocations | 4666 | * movable at startup. This will force kernel allocations |
@@ -4934,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
4934 | *zone_end_pfn = min(node_end_pfn, | 5077 | *zone_end_pfn = min(node_end_pfn, |
4935 | arch_zone_highest_possible_pfn[movable_zone]); | 5078 | arch_zone_highest_possible_pfn[movable_zone]); |
4936 | 5079 | ||
4937 | /* Adjust for ZONE_MOVABLE starting within this range */ | ||
4938 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | ||
4939 | *zone_end_pfn > zone_movable_pfn[nid]) { | ||
4940 | *zone_end_pfn = zone_movable_pfn[nid]; | ||
4941 | |||
4942 | /* Check if this whole range is within ZONE_MOVABLE */ | 5080 | /* Check if this whole range is within ZONE_MOVABLE */ |
4943 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 5081 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
4944 | *zone_start_pfn = *zone_end_pfn; | 5082 | *zone_start_pfn = *zone_end_pfn; |
@@ -4953,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
4953 | unsigned long zone_type, | 5091 | unsigned long zone_type, |
4954 | unsigned long node_start_pfn, | 5092 | unsigned long node_start_pfn, |
4955 | unsigned long node_end_pfn, | 5093 | unsigned long node_end_pfn, |
5094 | unsigned long *zone_start_pfn, | ||
5095 | unsigned long *zone_end_pfn, | ||
4956 | unsigned long *ignored) | 5096 | unsigned long *ignored) |
4957 | { | 5097 | { |
4958 | unsigned long zone_start_pfn, zone_end_pfn; | ||
4959 | |||
4960 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5098 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
4961 | if (!node_start_pfn && !node_end_pfn) | 5099 | if (!node_start_pfn && !node_end_pfn) |
4962 | return 0; | 5100 | return 0; |
4963 | 5101 | ||
4964 | /* Get the start and end of the zone */ | 5102 | /* Get the start and end of the zone */ |
4965 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 5103 | *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4966 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 5104 | *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4967 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5105 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4968 | node_start_pfn, node_end_pfn, | 5106 | node_start_pfn, node_end_pfn, |
4969 | &zone_start_pfn, &zone_end_pfn); | 5107 | zone_start_pfn, zone_end_pfn); |
4970 | 5108 | ||
4971 | /* Check that this node has pages within the zone's required range */ | 5109 | /* Check that this node has pages within the zone's required range */ |
4972 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 5110 | if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) |
4973 | return 0; | 5111 | return 0; |
4974 | 5112 | ||
4975 | /* Move the zone boundaries inside the node if necessary */ | 5113 | /* Move the zone boundaries inside the node if necessary */ |
4976 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 5114 | *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); |
4977 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 5115 | *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); |
4978 | 5116 | ||
4979 | /* Return the spanned pages */ | 5117 | /* Return the spanned pages */ |
4980 | return zone_end_pfn - zone_start_pfn; | 5118 | return *zone_end_pfn - *zone_start_pfn; |
4981 | } | 5119 | } |
4982 | 5120 | ||
4983 | /* | 5121 | /* |
@@ -5023,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5023 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 5161 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
5024 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 5162 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
5025 | unsigned long zone_start_pfn, zone_end_pfn; | 5163 | unsigned long zone_start_pfn, zone_end_pfn; |
5164 | unsigned long nr_absent; | ||
5026 | 5165 | ||
5027 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5166 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
5028 | if (!node_start_pfn && !node_end_pfn) | 5167 | if (!node_start_pfn && !node_end_pfn) |
@@ -5034,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5034 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5173 | adjust_zone_range_for_zone_movable(nid, zone_type, |
5035 | node_start_pfn, node_end_pfn, | 5174 | node_start_pfn, node_end_pfn, |
5036 | &zone_start_pfn, &zone_end_pfn); | 5175 | &zone_start_pfn, &zone_end_pfn); |
5037 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 5176 | nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
5177 | |||
5178 | /* | ||
5179 | * ZONE_MOVABLE handling. | ||
5180 | * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages | ||
5181 | * and vice versa. | ||
5182 | */ | ||
5183 | if (zone_movable_pfn[nid]) { | ||
5184 | if (mirrored_kernelcore) { | ||
5185 | unsigned long start_pfn, end_pfn; | ||
5186 | struct memblock_region *r; | ||
5187 | |||
5188 | for_each_memblock(memory, r) { | ||
5189 | start_pfn = clamp(memblock_region_memory_base_pfn(r), | ||
5190 | zone_start_pfn, zone_end_pfn); | ||
5191 | end_pfn = clamp(memblock_region_memory_end_pfn(r), | ||
5192 | zone_start_pfn, zone_end_pfn); | ||
5193 | |||
5194 | if (zone_type == ZONE_MOVABLE && | ||
5195 | memblock_is_mirror(r)) | ||
5196 | nr_absent += end_pfn - start_pfn; | ||
5197 | |||
5198 | if (zone_type == ZONE_NORMAL && | ||
5199 | !memblock_is_mirror(r)) | ||
5200 | nr_absent += end_pfn - start_pfn; | ||
5201 | } | ||
5202 | } else { | ||
5203 | if (zone_type == ZONE_NORMAL) | ||
5204 | nr_absent += node_end_pfn - zone_movable_pfn[nid]; | ||
5205 | } | ||
5206 | } | ||
5207 | |||
5208 | return nr_absent; | ||
5038 | } | 5209 | } |
5039 | 5210 | ||
5040 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5211 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
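For mirrored kernelcore, the zone_absent_pages_in_node() hunk above clamps each memblock region to the zone's span and then attributes it to the zone it will not populate: mirrored ranges are counted absent from ZONE_MOVABLE, non-mirrored ranges absent from ZONE_NORMAL. A toy model of the clamp-and-attribute step for the MOVABLE side only (the region layout and pfn numbers are invented):

#include <stdio.h>

struct region { unsigned long start, end; int mirrored; };

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

int main(void)
{
        /* invented layout: first 512 pfns mirrored, the rest not */
        struct region regions[] = {
                { 0,   512,  1 },
                { 512, 2048, 0 },
        };
        unsigned long zone_start = 0, zone_end = 2048;  /* pretend ZONE_MOVABLE span */
        unsigned long absent_movable = 0;
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = clamp_pfn(regions[i].start, zone_start, zone_end);
                unsigned long e = clamp_pfn(regions[i].end, zone_start, zone_end);

                if (regions[i].mirrored)        /* mirrored memory stays ZONE_NORMAL */
                        absent_movable += e - s;
        }
        printf("absent from ZONE_MOVABLE: %lu pfns\n", absent_movable); /* 512 */
        return 0;
}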
@@ -5042,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
5042 | unsigned long zone_type, | 5213 | unsigned long zone_type, |
5043 | unsigned long node_start_pfn, | 5214 | unsigned long node_start_pfn, |
5044 | unsigned long node_end_pfn, | 5215 | unsigned long node_end_pfn, |
5216 | unsigned long *zone_start_pfn, | ||
5217 | unsigned long *zone_end_pfn, | ||
5045 | unsigned long *zones_size) | 5218 | unsigned long *zones_size) |
5046 | { | 5219 | { |
5220 | unsigned int zone; | ||
5221 | |||
5222 | *zone_start_pfn = node_start_pfn; | ||
5223 | for (zone = 0; zone < zone_type; zone++) | ||
5224 | *zone_start_pfn += zones_size[zone]; | ||
5225 | |||
5226 | *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; | ||
5227 | |||
5047 | return zones_size[zone_type]; | 5228 | return zones_size[zone_type]; |
5048 | } | 5229 | } |
5049 | 5230 | ||
@@ -5072,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
5072 | 5253 | ||
5073 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5254 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5074 | struct zone *zone = pgdat->node_zones + i; | 5255 | struct zone *zone = pgdat->node_zones + i; |
5256 | unsigned long zone_start_pfn, zone_end_pfn; | ||
5075 | unsigned long size, real_size; | 5257 | unsigned long size, real_size; |
5076 | 5258 | ||
5077 | size = zone_spanned_pages_in_node(pgdat->node_id, i, | 5259 | size = zone_spanned_pages_in_node(pgdat->node_id, i, |
5078 | node_start_pfn, | 5260 | node_start_pfn, |
5079 | node_end_pfn, | 5261 | node_end_pfn, |
5262 | &zone_start_pfn, | ||
5263 | &zone_end_pfn, | ||
5080 | zones_size); | 5264 | zones_size); |
5081 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, | 5265 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
5082 | node_start_pfn, node_end_pfn, | 5266 | node_start_pfn, node_end_pfn, |
5083 | zholes_size); | 5267 | zholes_size); |
5268 | if (size) | ||
5269 | zone->zone_start_pfn = zone_start_pfn; | ||
5270 | else | ||
5271 | zone->zone_start_pfn = 0; | ||
5084 | zone->spanned_pages = size; | 5272 | zone->spanned_pages = size; |
5085 | zone->present_pages = real_size; | 5273 | zone->present_pages = real_size; |
5086 | 5274 | ||
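The hunks above change where a zone's first pfn comes from. As a rough userspace model with invented sizes: on the !CONFIG_HAVE_MEMBLOCK_NODE_MAP side the start pfn is the node start plus the sizes of all lower zones, and calculate_node_totalpages() now records it per zone, zeroing it for empty zones.

    #include <stdio.h>

    #define MAX_NR_ZONES 3

    int main(void)
    {
        unsigned long zones_size[MAX_NR_ZONES] = { 0x1000, 0, 0x4000 };  /* made up */
        unsigned long node_start_pfn = 0x100;

        for (int i = 0; i < MAX_NR_ZONES; i++) {
            unsigned long zone_start_pfn = node_start_pfn;

            /* Sum the spans of the lower zones to find where this one begins. */
            for (int z = 0; z < i; z++)
                zone_start_pfn += zones_size[z];

            /* Empty zones keep a zero start, like zone->zone_start_pfn = 0. */
            if (!zones_size[i])
                zone_start_pfn = 0;

            printf("zone %d: start %#lx, spanned %#lx\n",
                   i, zone_start_pfn, zones_size[i]);
        }
        return 0;
    }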
@@ -5201,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5201 | { | 5389 | { |
5202 | enum zone_type j; | 5390 | enum zone_type j; |
5203 | int nid = pgdat->node_id; | 5391 | int nid = pgdat->node_id; |
5204 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | ||
5205 | int ret; | 5392 | int ret; |
5206 | 5393 | ||
5207 | pgdat_resize_init(pgdat); | 5394 | pgdat_resize_init(pgdat); |
@@ -5222,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5222 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5409 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5223 | struct zone *zone = pgdat->node_zones + j; | 5410 | struct zone *zone = pgdat->node_zones + j; |
5224 | unsigned long size, realsize, freesize, memmap_pages; | 5411 | unsigned long size, realsize, freesize, memmap_pages; |
5412 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
5225 | 5413 | ||
5226 | size = zone->spanned_pages; | 5414 | size = zone->spanned_pages; |
5227 | realsize = freesize = zone->present_pages; | 5415 | realsize = freesize = zone->present_pages; |
@@ -5290,7 +5478,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5290 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 5478 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
5291 | BUG_ON(ret); | 5479 | BUG_ON(ret); |
5292 | memmap_init(size, nid, j, zone_start_pfn); | 5480 | memmap_init(size, nid, j, zone_start_pfn); |
5293 | zone_start_pfn += size; | ||
5294 | } | 5481 | } |
5295 | } | 5482 | } |
5296 | 5483 | ||
@@ -5358,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5358 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, | 5545 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5359 | (u64)start_pfn << PAGE_SHIFT, | 5546 | (u64)start_pfn << PAGE_SHIFT, |
5360 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); | 5547 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); |
5548 | #else | ||
5549 | start_pfn = node_start_pfn; | ||
5361 | #endif | 5550 | #endif |
5362 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5551 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5363 | zones_size, zholes_size); | 5552 | zones_size, zholes_size); |
@@ -5529,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5529 | } | 5718 | } |
5530 | 5719 | ||
5531 | /* | 5720 | /* |
5721 | * If kernelcore=mirror is specified, ignore movablecore option | ||
5722 | */ | ||
5723 | if (mirrored_kernelcore) { | ||
5724 | bool mem_below_4gb_not_mirrored = false; | ||
5725 | |||
5726 | for_each_memblock(memory, r) { | ||
5727 | if (memblock_is_mirror(r)) | ||
5728 | continue; | ||
5729 | |||
5730 | nid = r->nid; | ||
5731 | |||
5732 | usable_startpfn = memblock_region_memory_base_pfn(r); | ||
5733 | |||
5734 | if (usable_startpfn < 0x100000) { | ||
5735 | mem_below_4gb_not_mirrored = true; | ||
5736 | continue; | ||
5737 | } | ||
5738 | |||
5739 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5740 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5741 | usable_startpfn; | ||
5742 | } | ||
5743 | |||
5744 | if (mem_below_4gb_not_mirrored) | ||
5745 | pr_warn("This configuration results in unmirrored kernel memory.\n"); | ||
5746 | |||
5747 | goto out2; | ||
5748 | } | ||
5749 | |||
5750 | /* | ||
5532 | * If movablecore=nn[KMG] was specified, calculate what size of | 5751 | * If movablecore=nn[KMG] was specified, calculate what size of |
5533 | * kernelcore that corresponds so that memory usable for | 5752 | * kernelcore that corresponds so that memory usable for |
5534 | * any allocation type is evenly spread. If both kernelcore | 5753 | * any allocation type is evenly spread. If both kernelcore |
@@ -5788,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) | |||
5788 | */ | 6007 | */ |
5789 | static int __init cmdline_parse_kernelcore(char *p) | 6008 | static int __init cmdline_parse_kernelcore(char *p) |
5790 | { | 6009 | { |
6010 | /* parse kernelcore=mirror */ | ||
6011 | if (parse_option_str(p, "mirror")) { | ||
6012 | mirrored_kernelcore = true; | ||
6013 | return 0; | ||
6014 | } | ||
6015 | |||
5791 | return cmdline_parse_core(p, &required_kernelcore); | 6016 | return cmdline_parse_core(p, &required_kernelcore); |
5792 | } | 6017 | } |
5793 | 6018 | ||
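The two hunks above add the kernelcore=mirror policy: ZONE_MOVABLE on each node begins at the lowest non-mirrored region, and non-mirrored memory below 4GB earns a warning. A hedged userspace model follows, with a hypothetical region table and 4KB pages assumed (so pfn 0x100000 is the 4GB boundary):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NUMNODES 2

    struct region { int nid; unsigned long base_pfn; bool mirrored; };

    int main(void)
    {
        struct region regions[] = {
            { 0, 0x000000, true  },    /* mirrored low memory on node 0 */
            { 0, 0x180000, false },    /* plain memory on node 0        */
            { 1, 0x200000, false },    /* plain memory on node 1        */
        };
        unsigned long zone_movable_pfn[MAX_NUMNODES] = { 0 };
        bool mem_below_4gb_not_mirrored = false;

        for (unsigned int i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
            const struct region *r = &regions[i];

            if (r->mirrored)
                continue;                    /* mirrored memory stays kernel-usable */

            if (r->base_pfn < 0x100000) {    /* non-mirrored memory below 4GB */
                mem_below_4gb_not_mirrored = true;
                continue;
            }

            /* Lowest non-mirrored base on this node becomes the movable start. */
            if (!zone_movable_pfn[r->nid] || r->base_pfn < zone_movable_pfn[r->nid])
                zone_movable_pfn[r->nid] = r->base_pfn;
        }

        for (int nid = 0; nid < MAX_NUMNODES; nid++)
            printf("node %d: ZONE_MOVABLE starts at pfn %#lx\n",
                   nid, zone_movable_pfn[nid]);
        if (mem_below_4gb_not_mirrored)
            printf("warning: this configuration leaves unmirrored kernel memory\n");
        return 0;
    }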
diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b8debd..2d864e64f7fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
106 | struct page_ext *base; | 106 | struct page_ext *base; |
107 | 107 | ||
108 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | 108 | base = NODE_DATA(page_to_nid(page))->node_page_ext; |
109 | #ifdef CONFIG_DEBUG_VM | 109 | #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) |
110 | /* | 110 | /* |
111 | * The sanity checks the page allocator does upon freeing a | 111 | * The sanity checks the page allocator does upon freeing a |
112 | * page can reach here before the page_ext arrays are | 112 | * page can reach here before the page_ext arrays are |
113 | * allocated when feeding a range of pages to the allocator | 113 | * allocated when feeding a range of pages to the allocator |
114 | * for the first time during bootup or memory hotplug. | 114 | * for the first time during bootup or memory hotplug. |
115 | * | ||
116 | * This check is also necessary for ensuring page poisoning | ||
117 | * works as expected when enabled | ||
115 | */ | 118 | */ |
116 | if (unlikely(!base)) | 119 | if (unlikely(!base)) |
117 | return NULL; | 120 | return NULL; |
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
180 | { | 183 | { |
181 | unsigned long pfn = page_to_pfn(page); | 184 | unsigned long pfn = page_to_pfn(page); |
182 | struct mem_section *section = __pfn_to_section(pfn); | 185 | struct mem_section *section = __pfn_to_section(pfn); |
183 | #ifdef CONFIG_DEBUG_VM | 186 | #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) |
184 | /* | 187 | /* |
185 | * The sanity checks the page allocator does upon freeing a | 188 | * The sanity checks the page allocator does upon freeing a |
186 | * page can reach here before the page_ext arrays are | 189 | * page can reach here before the page_ext arrays are |
187 | * allocated when feeding a range of pages to the allocator | 190 | * allocated when feeding a range of pages to the allocator |
188 | * for the first time during bootup or memory hotplug. | 191 | * for the first time during bootup or memory hotplug. |
192 | * | ||
193 | * This check is also necessary for ensuring page poisoning | ||
194 | * works as expected when enabled | ||
189 | */ | 195 | */ |
190 | if (!section->page_ext) | 196 | if (!section->page_ext) |
191 | return NULL; | 197 | return NULL; |
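The widened preprocessor condition exists because page poisoning now calls lookup_page_ext() on pages freed before the page_ext arrays are up. A tiny sketch of the calling convention this implies, with lookup_ext() as an invented stand-in rather than the kernel API:

    #include <stdbool.h>
    #include <stddef.h>

    struct ext { unsigned long flags; };

    /* Stand-in lookup: pretend the metadata arrays are not allocated yet. */
    static struct ext *lookup_ext(const void *page)
    {
        (void)page;
        return NULL;
    }

    /* Every consumer must tolerate a NULL result during early boot. */
    static bool page_is_poisoned(const void *page)
    {
        struct ext *e = lookup_ext(page);

        return e ? (e->flags & 1UL) : false;
    }

    int main(void)
    {
        return page_is_poisoned(NULL) ? 1 : 0;
    }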
diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a10fa07..44ad1f00c4e1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -5,10 +5,12 @@ | |||
5 | #include <linux/bootmem.h> | 5 | #include <linux/bootmem.h> |
6 | #include <linux/stacktrace.h> | 6 | #include <linux/stacktrace.h> |
7 | #include <linux/page_owner.h> | 7 | #include <linux/page_owner.h> |
8 | #include <linux/jump_label.h> | ||
9 | #include <linux/migrate.h> | ||
8 | #include "internal.h" | 10 | #include "internal.h" |
9 | 11 | ||
10 | static bool page_owner_disabled = true; | 12 | static bool page_owner_disabled = true; |
11 | bool page_owner_inited __read_mostly; | 13 | DEFINE_STATIC_KEY_FALSE(page_owner_inited); |
12 | 14 | ||
13 | static void init_early_allocated_pages(void); | 15 | static void init_early_allocated_pages(void); |
14 | 16 | ||
@@ -37,7 +39,7 @@ static void init_page_owner(void) | |||
37 | if (page_owner_disabled) | 39 | if (page_owner_disabled) |
38 | return; | 40 | return; |
39 | 41 | ||
40 | page_owner_inited = true; | 42 | static_branch_enable(&page_owner_inited); |
41 | init_early_allocated_pages(); | 43 | init_early_allocated_pages(); |
42 | } | 44 | } |
43 | 45 | ||
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | |||
72 | page_ext->order = order; | 74 | page_ext->order = order; |
73 | page_ext->gfp_mask = gfp_mask; | 75 | page_ext->gfp_mask = gfp_mask; |
74 | page_ext->nr_entries = trace.nr_entries; | 76 | page_ext->nr_entries = trace.nr_entries; |
77 | page_ext->last_migrate_reason = -1; | ||
75 | 78 | ||
76 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 79 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
77 | } | 80 | } |
78 | 81 | ||
82 | void __set_page_owner_migrate_reason(struct page *page, int reason) | ||
83 | { | ||
84 | struct page_ext *page_ext = lookup_page_ext(page); | ||
85 | |||
86 | page_ext->last_migrate_reason = reason; | ||
87 | } | ||
88 | |||
79 | gfp_t __get_page_owner_gfp(struct page *page) | 89 | gfp_t __get_page_owner_gfp(struct page *page) |
80 | { | 90 | { |
81 | struct page_ext *page_ext = lookup_page_ext(page); | 91 | struct page_ext *page_ext = lookup_page_ext(page); |
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page) | |||
83 | return page_ext->gfp_mask; | 93 | return page_ext->gfp_mask; |
84 | } | 94 | } |
85 | 95 | ||
96 | void __copy_page_owner(struct page *oldpage, struct page *newpage) | ||
97 | { | ||
98 | struct page_ext *old_ext = lookup_page_ext(oldpage); | ||
99 | struct page_ext *new_ext = lookup_page_ext(newpage); | ||
100 | int i; | ||
101 | |||
102 | new_ext->order = old_ext->order; | ||
103 | new_ext->gfp_mask = old_ext->gfp_mask; | ||
104 | new_ext->nr_entries = old_ext->nr_entries; | ||
105 | |||
106 | for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) | ||
107 | new_ext->trace_entries[i] = old_ext->trace_entries[i]; | ||
108 | |||
109 | /* | ||
110 | * We don't clear the bit on the oldpage as it's going to be freed | ||
111 | * after migration. Until then, the info can be useful in case of | ||
112 | * a bug, and the overall stats will be off a bit only temporarily. | ||
113 | * Also, migrate_misplaced_transhuge_page() can still fail the | ||
114 | * migration and then we want the oldpage to retain the info. But | ||
115 | * in that case we also don't need to explicitly clear the info from | ||
116 | * the new page, which will be freed. | ||
117 | */ | ||
118 | __set_bit(PAGE_EXT_OWNER, &new_ext->flags); | ||
119 | } | ||
120 | |||
86 | static ssize_t | 121 | static ssize_t |
87 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | 122 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, |
88 | struct page *page, struct page_ext *page_ext) | 123 | struct page *page, struct page_ext *page_ext) |
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
100 | return -ENOMEM; | 135 | return -ENOMEM; |
101 | 136 | ||
102 | ret = snprintf(kbuf, count, | 137 | ret = snprintf(kbuf, count, |
103 | "Page allocated via order %u, mask 0x%x\n", | 138 | "Page allocated via order %u, mask %#x(%pGg)\n", |
104 | page_ext->order, page_ext->gfp_mask); | 139 | page_ext->order, page_ext->gfp_mask, |
140 | &page_ext->gfp_mask); | ||
105 | 141 | ||
106 | if (ret >= count) | 142 | if (ret >= count) |
107 | goto err; | 143 | goto err; |
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
110 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | 146 | pageblock_mt = get_pfnblock_migratetype(page, pfn); |
111 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | 147 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); |
112 | ret += snprintf(kbuf + ret, count - ret, | 148 | ret += snprintf(kbuf + ret, count - ret, |
113 | "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", | 149 | "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", |
114 | pfn, | 150 | pfn, |
151 | migratetype_names[page_mt], | ||
115 | pfn >> pageblock_order, | 152 | pfn >> pageblock_order, |
116 | pageblock_mt, | 153 | migratetype_names[pageblock_mt], |
117 | pageblock_mt != page_mt ? "Fallback" : " ", | 154 | page->flags, &page->flags); |
118 | PageLocked(page) ? "K" : " ", | ||
119 | PageError(page) ? "E" : " ", | ||
120 | PageReferenced(page) ? "R" : " ", | ||
121 | PageUptodate(page) ? "U" : " ", | ||
122 | PageDirty(page) ? "D" : " ", | ||
123 | PageLRU(page) ? "L" : " ", | ||
124 | PageActive(page) ? "A" : " ", | ||
125 | PageSlab(page) ? "S" : " ", | ||
126 | PageWriteback(page) ? "W" : " ", | ||
127 | PageCompound(page) ? "C" : " ", | ||
128 | PageSwapCache(page) ? "B" : " ", | ||
129 | PageMappedToDisk(page) ? "M" : " "); | ||
130 | 155 | ||
131 | if (ret >= count) | 156 | if (ret >= count) |
132 | goto err; | 157 | goto err; |
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
135 | if (ret >= count) | 160 | if (ret >= count) |
136 | goto err; | 161 | goto err; |
137 | 162 | ||
163 | if (page_ext->last_migrate_reason != -1) { | ||
164 | ret += snprintf(kbuf + ret, count - ret, | ||
165 | "Page has been migrated, last migrate reason: %s\n", | ||
166 | migrate_reason_names[page_ext->last_migrate_reason]); | ||
167 | if (ret >= count) | ||
168 | goto err; | ||
169 | } | ||
170 | |||
138 | ret += snprintf(kbuf + ret, count - ret, "\n"); | 171 | ret += snprintf(kbuf + ret, count - ret, "\n"); |
139 | if (ret >= count) | 172 | if (ret >= count) |
140 | goto err; | 173 | goto err; |
@@ -150,6 +183,31 @@ err: | |||
150 | return -ENOMEM; | 183 | return -ENOMEM; |
151 | } | 184 | } |
152 | 185 | ||
186 | void __dump_page_owner(struct page *page) | ||
187 | { | ||
188 | struct page_ext *page_ext = lookup_page_ext(page); | ||
189 | struct stack_trace trace = { | ||
190 | .nr_entries = page_ext->nr_entries, | ||
191 | .entries = &page_ext->trace_entries[0], | ||
192 | }; | ||
193 | gfp_t gfp_mask = page_ext->gfp_mask; | ||
194 | int mt = gfpflags_to_migratetype(gfp_mask); | ||
195 | |||
196 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { | ||
197 | pr_alert("page_owner info is not active (free page?)\n"); | ||
198 | return; | ||
199 | } | ||
200 | |||
201 | pr_alert("page allocated via order %u, migratetype %s, " | ||
202 | "gfp_mask %#x(%pGg)\n", page_ext->order, | ||
203 | migratetype_names[mt], gfp_mask, &gfp_mask); | ||
204 | print_stack_trace(&trace, 0); | ||
205 | |||
206 | if (page_ext->last_migrate_reason != -1) | ||
207 | pr_alert("page has been migrated, last migrate reason: %s\n", | ||
208 | migrate_reason_names[page_ext->last_migrate_reason]); | ||
209 | } | ||
210 | |||
153 | static ssize_t | 211 | static ssize_t |
154 | read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | 212 | read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
155 | { | 213 | { |
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
157 | struct page *page; | 215 | struct page *page; |
158 | struct page_ext *page_ext; | 216 | struct page_ext *page_ext; |
159 | 217 | ||
160 | if (!page_owner_inited) | 218 | if (!static_branch_unlikely(&page_owner_inited)) |
161 | return -EINVAL; | 219 | return -EINVAL; |
162 | 220 | ||
163 | page = NULL; | 221 | page = NULL; |
@@ -305,7 +363,7 @@ static int __init pageowner_init(void) | |||
305 | { | 363 | { |
306 | struct dentry *dentry; | 364 | struct dentry *dentry; |
307 | 365 | ||
308 | if (!page_owner_inited) { | 366 | if (!static_branch_unlikely(&page_owner_inited)) { |
309 | pr_info("page_owner is disabled\n"); | 367 | pr_info("page_owner is disabled\n"); |
310 | return 0; | 368 | return 0; |
311 | } | 369 | } |
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c index 5bf5906ce13b..479e7ea2bea6 100644 --- a/mm/debug-pagealloc.c +++ b/mm/page_poison.c | |||
@@ -6,22 +6,48 @@ | |||
6 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
7 | #include <linux/ratelimit.h> | 7 | #include <linux/ratelimit.h> |
8 | 8 | ||
9 | static bool page_poisoning_enabled __read_mostly; | 9 | static bool __page_poisoning_enabled __read_mostly; |
10 | static bool want_page_poisoning __read_mostly; | ||
10 | 11 | ||
11 | static bool need_page_poisoning(void) | 12 | static int early_page_poison_param(char *buf) |
12 | { | 13 | { |
13 | if (!debug_pagealloc_enabled()) | 14 | if (!buf) |
14 | return false; | 15 | return -EINVAL; |
16 | |||
17 | if (strcmp(buf, "on") == 0) | ||
18 | want_page_poisoning = true; | ||
19 | else if (strcmp(buf, "off") == 0) | ||
20 | want_page_poisoning = false; | ||
15 | 21 | ||
16 | return true; | 22 | return 0; |
23 | } | ||
24 | early_param("page_poison", early_page_poison_param); | ||
25 | |||
26 | bool page_poisoning_enabled(void) | ||
27 | { | ||
28 | return __page_poisoning_enabled; | ||
29 | } | ||
30 | |||
31 | static bool need_page_poisoning(void) | ||
32 | { | ||
33 | return want_page_poisoning; | ||
17 | } | 34 | } |
18 | 35 | ||
19 | static void init_page_poisoning(void) | 36 | static void init_page_poisoning(void) |
20 | { | 37 | { |
21 | if (!debug_pagealloc_enabled()) | 38 | /* |
22 | return; | 39 | * page poisoning is debug page alloc for some arches. If either |
40 | * of those options is enabled, enable poisoning | ||
41 | */ | ||
42 | if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { | ||
43 | if (!want_page_poisoning && !debug_pagealloc_enabled()) | ||
44 | return; | ||
45 | } else { | ||
46 | if (!want_page_poisoning) | ||
47 | return; | ||
48 | } | ||
23 | 49 | ||
24 | page_poisoning_enabled = true; | 50 | __page_poisoning_enabled = true; |
25 | } | 51 | } |
26 | 52 | ||
27 | struct page_ext_operations page_poisoning_ops = { | 53 | struct page_ext_operations page_poisoning_ops = { |
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page) | |||
45 | __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | 71 | __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); |
46 | } | 72 | } |
47 | 73 | ||
48 | static inline bool page_poison(struct page *page) | 74 | bool page_is_poisoned(struct page *page) |
49 | { | 75 | { |
50 | struct page_ext *page_ext; | 76 | struct page_ext *page_ext; |
51 | 77 | ||
52 | page_ext = lookup_page_ext(page); | 78 | page_ext = lookup_page_ext(page); |
79 | if (!page_ext) | ||
80 | return false; | ||
81 | |||
53 | return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | 82 | return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); |
54 | } | 83 | } |
55 | 84 | ||
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
83 | unsigned char *start; | 112 | unsigned char *start; |
84 | unsigned char *end; | 113 | unsigned char *end; |
85 | 114 | ||
115 | if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) | ||
116 | return; | ||
117 | |||
86 | start = memchr_inv(mem, PAGE_POISON, bytes); | 118 | start = memchr_inv(mem, PAGE_POISON, bytes); |
87 | if (!start) | 119 | if (!start) |
88 | return; | 120 | return; |
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
95 | if (!__ratelimit(&ratelimit)) | 127 | if (!__ratelimit(&ratelimit)) |
96 | return; | 128 | return; |
97 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) | 129 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) |
98 | printk(KERN_ERR "pagealloc: single bit error\n"); | 130 | pr_err("pagealloc: single bit error\n"); |
99 | else | 131 | else |
100 | printk(KERN_ERR "pagealloc: memory corruption\n"); | 132 | pr_err("pagealloc: memory corruption\n"); |
101 | 133 | ||
102 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, | 134 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, |
103 | end - start + 1, 1); | 135 | end - start + 1, 1); |
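As a compile-and-run illustration of the check above (not the kernel code; 0xaa stands in for PAGE_POISON), the scan distinguishes a lone single-bit mismatch from broader corruption by whether the first and last bad byte coincide and differ from the pattern in exactly one bit:

    #include <stdio.h>
    #include <string.h>

    #define POISON 0xaa

    static int single_bit_flip(unsigned char a, unsigned char b)
    {
        unsigned char diff = a ^ b;

        return diff && !(diff & (diff - 1));    /* exactly one bit differs */
    }

    static void check_poison(const unsigned char *mem, size_t bytes)
    {
        const unsigned char *start = NULL, *end = NULL;

        for (size_t i = 0; i < bytes; i++) {
            if (mem[i] != POISON) {
                if (!start)
                    start = &mem[i];
                end = &mem[i];
            }
        }

        if (!start)
            return;                             /* poison pattern intact */
        if (start == end && single_bit_flip(*start, POISON))
            printf("pagealloc: single bit error at byte %zu\n",
                   (size_t)(start - mem));
        else
            printf("pagealloc: memory corruption over %zu byte(s)\n",
                   (size_t)(end - start + 1));
    }

    int main(void)
    {
        unsigned char page[64];

        memset(page, POISON, sizeof(page));
        page[13] ^= 0x08;                       /* flip a single bit */
        check_poison(page, sizeof(page));
        return 0;
    }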
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page) | |||
108 | { | 140 | { |
109 | void *addr; | 141 | void *addr; |
110 | 142 | ||
111 | if (!page_poison(page)) | 143 | if (!page_is_poisoned(page)) |
112 | return; | 144 | return; |
113 | 145 | ||
114 | addr = kmap_atomic(page); | 146 | addr = kmap_atomic(page); |
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n) | |||
125 | unpoison_page(page + i); | 157 | unpoison_page(page + i); |
126 | } | 158 | } |
127 | 159 | ||
128 | void __kernel_map_pages(struct page *page, int numpages, int enable) | 160 | void kernel_poison_pages(struct page *page, int numpages, int enable) |
129 | { | 161 | { |
130 | if (!page_poisoning_enabled) | 162 | if (!page_poisoning_enabled()) |
131 | return; | 163 | return; |
132 | 164 | ||
133 | if (enable) | 165 | if (enable) |
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) | |||
135 | else | 167 | else |
136 | poison_pages(page, numpages); | 168 | poison_pages(page, numpages); |
137 | } | 169 | } |
170 | |||
171 | #ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
172 | void __kernel_map_pages(struct page *page, int numpages, int enable) | ||
173 | { | ||
174 | /* This function does nothing, all work is done via poison pages */ | ||
175 | } | ||
176 | #endif | ||
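Taken together, the renamed file wires poisoning to a page_poison=on/off boot option and a single kernel_poison_pages() entry point. A standalone sketch of that control flow, with simplified parsing and the corruption classification shown earlier omitted:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define POISON 0xaa

    static bool want_page_poisoning;

    static int parse_page_poison(const char *arg)       /* "on" or "off" */
    {
        if (!arg)
            return -1;
        if (strcmp(arg, "on") == 0)
            want_page_poisoning = true;
        else if (strcmp(arg, "off") == 0)
            want_page_poisoning = false;
        return 0;
    }

    static void poison_pages(unsigned char *mem, size_t bytes)
    {
        memset(mem, POISON, bytes);                     /* page is being freed */
    }

    static void unpoison_pages(unsigned char *mem, size_t bytes)
    {
        for (size_t i = 0; i < bytes; i++)              /* page is handed out */
            if (mem[i] != POISON)
                printf("corruption at byte %zu\n", i);
        memset(mem, 0, bytes);
    }

    static void kernel_poison_pages(unsigned char *mem, size_t bytes, int enable)
    {
        if (!want_page_poisoning)
            return;
        if (enable)
            unpoison_pages(mem, bytes);
        else
            poison_pages(mem, bytes);
    }

    int main(void)
    {
        unsigned char page[32];

        parse_page_poison("on");
        kernel_poison_pages(page, sizeof(page), 0);     /* free: fill with poison */
        page[5] = 0x00;                                 /* simulate corruption    */
        kernel_poison_pages(page, sizeof(page), 1);     /* alloc: verify + clear  */
        return 0;
    }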
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page, | |||
1287 | */ | 1287 | */ |
1288 | void page_add_file_rmap(struct page *page) | 1288 | void page_add_file_rmap(struct page *page) |
1289 | { | 1289 | { |
1290 | struct mem_cgroup *memcg; | 1290 | lock_page_memcg(page); |
1291 | |||
1292 | memcg = mem_cgroup_begin_page_stat(page); | ||
1293 | if (atomic_inc_and_test(&page->_mapcount)) { | 1291 | if (atomic_inc_and_test(&page->_mapcount)) { |
1294 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1292 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1295 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1293 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1296 | } | 1294 | } |
1297 | mem_cgroup_end_page_stat(memcg); | 1295 | unlock_page_memcg(page); |
1298 | } | 1296 | } |
1299 | 1297 | ||
1300 | static void page_remove_file_rmap(struct page *page) | 1298 | static void page_remove_file_rmap(struct page *page) |
1301 | { | 1299 | { |
1302 | struct mem_cgroup *memcg; | 1300 | lock_page_memcg(page); |
1303 | |||
1304 | memcg = mem_cgroup_begin_page_stat(page); | ||
1305 | 1301 | ||
1306 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ | 1302 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ |
1307 | if (unlikely(PageHuge(page))) { | 1303 | if (unlikely(PageHuge(page))) { |
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page) | |||
1320 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1316 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1321 | */ | 1317 | */ |
1322 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1318 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1323 | mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1319 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1324 | 1320 | ||
1325 | if (unlikely(PageMlocked(page))) | 1321 | if (unlikely(PageMlocked(page))) |
1326 | clear_page_mlock(page); | 1322 | clear_page_mlock(page); |
1327 | out: | 1323 | out: |
1328 | mem_cgroup_end_page_stat(memcg); | 1324 | unlock_page_memcg(page); |
1329 | } | 1325 | } |
1330 | 1326 | ||
1331 | static void page_remove_anon_compound_rmap(struct page *page) | 1327 | static void page_remove_anon_compound_rmap(struct page *page) |
diff --git a/mm/shmem.c b/mm/shmem.c index 440e2a7e6c1c..1acfdbc4bd9e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1116 | */ | 1116 | */ |
1117 | oldpage = newpage; | 1117 | oldpage = newpage; |
1118 | } else { | 1118 | } else { |
1119 | mem_cgroup_replace_page(oldpage, newpage); | 1119 | mem_cgroup_migrate(oldpage, newpage); |
1120 | lru_cache_add_anon(newpage); | 1120 | lru_cache_add_anon(newpage); |
1121 | *pagep = newpage; | 1121 | *pagep = newpage; |
1122 | } | 1122 | } |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; | |||
169 | #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) | 169 | #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) |
170 | 170 | ||
171 | /* | 171 | /* |
172 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
173 | * swap | ||
174 | */ | ||
175 | static bool pfmemalloc_active __read_mostly; | ||
176 | |||
177 | /* | ||
178 | * struct array_cache | 172 | * struct array_cache |
179 | * | 173 | * |
180 | * Purpose: | 174 | * Purpose: |
@@ -195,10 +189,6 @@ struct array_cache { | |||
195 | * Must have this definition in here for the proper | 189 | * Must have this definition in here for the proper |
196 | * alignment of array_cache. Also simplifies accessing | 190 | * alignment of array_cache. Also simplifies accessing |
197 | * the entries. | 191 | * the entries. |
198 | * | ||
199 | * Entries should not be directly dereferenced as | ||
200 | * entries belonging to slabs marked pfmemalloc will | ||
201 | * have the lower bits set SLAB_OBJ_PFMEMALLOC | ||
202 | */ | 192 | */ |
203 | }; | 193 | }; |
204 | 194 | ||
@@ -207,33 +197,6 @@ struct alien_cache { | |||
207 | struct array_cache ac; | 197 | struct array_cache ac; |
208 | }; | 198 | }; |
209 | 199 | ||
210 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
211 | static inline bool is_obj_pfmemalloc(void *objp) | ||
212 | { | ||
213 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
214 | } | ||
215 | |||
216 | static inline void set_obj_pfmemalloc(void **objp) | ||
217 | { | ||
218 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
219 | return; | ||
220 | } | ||
221 | |||
222 | static inline void clear_obj_pfmemalloc(void **objp) | ||
223 | { | ||
224 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * bootstrap: The caches do not work without cpuarrays anymore, but the | ||
229 | * cpuarrays are allocated from the generic caches... | ||
230 | */ | ||
231 | #define BOOT_CPUCACHE_ENTRIES 1 | ||
232 | struct arraycache_init { | ||
233 | struct array_cache cache; | ||
234 | void *entries[BOOT_CPUCACHE_ENTRIES]; | ||
235 | }; | ||
236 | |||
237 | /* | 200 | /* |
238 | * Need this for bootstrapping a per node allocator. | 201 | * Need this for bootstrapping a per node allocator. |
239 | */ | 202 | */ |
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) | |||
280 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 243 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
281 | } while (0) | 244 | } while (0) |
282 | 245 | ||
246 | #define CFLGS_OBJFREELIST_SLAB (0x40000000UL) | ||
283 | #define CFLGS_OFF_SLAB (0x80000000UL) | 247 | #define CFLGS_OFF_SLAB (0x80000000UL) |
248 | #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) | ||
284 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 249 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
285 | #define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) | ||
286 | 250 | ||
287 | #define BATCHREFILL_LIMIT 16 | 251 | #define BATCHREFILL_LIMIT 16 |
288 | /* | 252 | /* |
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
390 | 354 | ||
391 | #endif | 355 | #endif |
392 | 356 | ||
393 | #define OBJECT_FREE (0) | ||
394 | #define OBJECT_ACTIVE (1) | ||
395 | |||
396 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 357 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
397 | 358 | ||
398 | static void set_obj_status(struct page *page, int idx, int val) | 359 | static inline bool is_store_user_clean(struct kmem_cache *cachep) |
399 | { | 360 | { |
400 | int freelist_size; | 361 | return atomic_read(&cachep->store_user_clean) == 1; |
401 | char *status; | ||
402 | struct kmem_cache *cachep = page->slab_cache; | ||
403 | |||
404 | freelist_size = cachep->num * sizeof(freelist_idx_t); | ||
405 | status = (char *)page->freelist + freelist_size; | ||
406 | status[idx] = val; | ||
407 | } | 362 | } |
408 | 363 | ||
409 | static inline unsigned int get_obj_status(struct page *page, int idx) | 364 | static inline void set_store_user_clean(struct kmem_cache *cachep) |
410 | { | 365 | { |
411 | int freelist_size; | 366 | atomic_set(&cachep->store_user_clean, 1); |
412 | char *status; | 367 | } |
413 | struct kmem_cache *cachep = page->slab_cache; | ||
414 | |||
415 | freelist_size = cachep->num * sizeof(freelist_idx_t); | ||
416 | status = (char *)page->freelist + freelist_size; | ||
417 | 368 | ||
418 | return status[idx]; | 369 | static inline void set_store_user_dirty(struct kmem_cache *cachep) |
370 | { | ||
371 | if (is_store_user_clean(cachep)) | ||
372 | atomic_set(&cachep->store_user_clean, 0); | ||
419 | } | 373 | } |
420 | 374 | ||
421 | #else | 375 | #else |
422 | static inline void set_obj_status(struct page *page, int idx, int val) {} | 376 | static inline void set_store_user_dirty(struct kmem_cache *cachep) {} |
423 | 377 | ||
424 | #endif | 378 | #endif |
425 | 379 | ||
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, | |||
457 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | 411 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); |
458 | } | 412 | } |
459 | 413 | ||
414 | #define BOOT_CPUCACHE_ENTRIES 1 | ||
460 | /* internal cache of cache description objs */ | 415 | /* internal cache of cache description objs */ |
461 | static struct kmem_cache kmem_cache_boot = { | 416 | static struct kmem_cache kmem_cache_boot = { |
462 | .batchcount = 1, | 417 | .batchcount = 1, |
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
475 | return this_cpu_ptr(cachep->cpu_cache); | 430 | return this_cpu_ptr(cachep->cpu_cache); |
476 | } | 431 | } |
477 | 432 | ||
478 | static size_t calculate_freelist_size(int nr_objs, size_t align) | ||
479 | { | ||
480 | size_t freelist_size; | ||
481 | |||
482 | freelist_size = nr_objs * sizeof(freelist_idx_t); | ||
483 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | ||
484 | freelist_size += nr_objs * sizeof(char); | ||
485 | |||
486 | if (align) | ||
487 | freelist_size = ALIGN(freelist_size, align); | ||
488 | |||
489 | return freelist_size; | ||
490 | } | ||
491 | |||
492 | static int calculate_nr_objs(size_t slab_size, size_t buffer_size, | ||
493 | size_t idx_size, size_t align) | ||
494 | { | ||
495 | int nr_objs; | ||
496 | size_t remained_size; | ||
497 | size_t freelist_size; | ||
498 | int extra_space = 0; | ||
499 | |||
500 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | ||
501 | extra_space = sizeof(char); | ||
502 | /* | ||
503 | * Ignore padding for the initial guess. The padding | ||
504 | * is at most @align-1 bytes, and @buffer_size is at | ||
505 | * least @align. In the worst case, this result will | ||
506 | * be one greater than the number of objects that fit | ||
507 | * into the memory allocation when taking the padding | ||
508 | * into account. | ||
509 | */ | ||
510 | nr_objs = slab_size / (buffer_size + idx_size + extra_space); | ||
511 | |||
512 | /* | ||
513 | * This calculated number will be either the right | ||
514 | * amount, or one greater than what we want. | ||
515 | */ | ||
516 | remained_size = slab_size - nr_objs * buffer_size; | ||
517 | freelist_size = calculate_freelist_size(nr_objs, align); | ||
518 | if (remained_size < freelist_size) | ||
519 | nr_objs--; | ||
520 | |||
521 | return nr_objs; | ||
522 | } | ||
523 | |||
524 | /* | 433 | /* |
525 | * Calculate the number of objects and left-over bytes for a given buffer size. | 434 | * Calculate the number of objects and left-over bytes for a given buffer size. |
526 | */ | 435 | */ |
527 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, | 436 | static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, |
528 | size_t align, int flags, size_t *left_over, | 437 | unsigned long flags, size_t *left_over) |
529 | unsigned int *num) | ||
530 | { | 438 | { |
531 | int nr_objs; | 439 | unsigned int num; |
532 | size_t mgmt_size; | ||
533 | size_t slab_size = PAGE_SIZE << gfporder; | 440 | size_t slab_size = PAGE_SIZE << gfporder; |
534 | 441 | ||
535 | /* | 442 | /* |
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
537 | * on it. For the latter case, the memory allocated for a | 444 | * on it. For the latter case, the memory allocated for a |
538 | * slab is used for: | 445 | * slab is used for: |
539 | * | 446 | * |
540 | * - One unsigned int for each object | ||
541 | * - Padding to respect alignment of @align | ||
542 | * - @buffer_size bytes for each object | 447 | * - @buffer_size bytes for each object |
448 | * - One freelist_idx_t for each object | ||
449 | * | ||
450 | * We don't need to consider alignment of freelist because | ||
451 | * freelist will be at the end of slab page. The objects will be | ||
452 | * at the correct alignment. | ||
543 | * | 453 | * |
544 | * If the slab management structure is off the slab, then the | 454 | * If the slab management structure is off the slab, then the |
545 | * alignment will already be calculated into the size. Because | 455 | * alignment will already be calculated into the size. Because |
546 | * the slabs are all pages aligned, the objects will be at the | 456 | * the slabs are all pages aligned, the objects will be at the |
547 | * correct alignment when allocated. | 457 | * correct alignment when allocated. |
548 | */ | 458 | */ |
549 | if (flags & CFLGS_OFF_SLAB) { | 459 | if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { |
550 | mgmt_size = 0; | 460 | num = slab_size / buffer_size; |
551 | nr_objs = slab_size / buffer_size; | 461 | *left_over = slab_size % buffer_size; |
552 | |||
553 | } else { | 462 | } else { |
554 | nr_objs = calculate_nr_objs(slab_size, buffer_size, | 463 | num = slab_size / (buffer_size + sizeof(freelist_idx_t)); |
555 | sizeof(freelist_idx_t), align); | 464 | *left_over = slab_size % |
556 | mgmt_size = calculate_freelist_size(nr_objs, align); | 465 | (buffer_size + sizeof(freelist_idx_t)); |
557 | } | 466 | } |
558 | *num = nr_objs; | 467 | |
559 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; | 468 | return num; |
560 | } | 469 | } |
561 | 470 | ||
562 | #if DEBUG | 471 | #if DEBUG |
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
687 | return ac; | 596 | return ac; |
688 | } | 597 | } |
689 | 598 | ||
690 | static inline bool is_slab_pfmemalloc(struct page *page) | 599 | static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, |
691 | { | 600 | struct page *page, void *objp) |
692 | return PageSlabPfmemalloc(page); | ||
693 | } | ||
694 | |||
695 | /* Clears pfmemalloc_active if no slabs have pfmalloc set */ | ||
696 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
697 | struct array_cache *ac) | ||
698 | { | ||
699 | struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); | ||
700 | struct page *page; | ||
701 | unsigned long flags; | ||
702 | |||
703 | if (!pfmemalloc_active) | ||
704 | return; | ||
705 | |||
706 | spin_lock_irqsave(&n->list_lock, flags); | ||
707 | list_for_each_entry(page, &n->slabs_full, lru) | ||
708 | if (is_slab_pfmemalloc(page)) | ||
709 | goto out; | ||
710 | |||
711 | list_for_each_entry(page, &n->slabs_partial, lru) | ||
712 | if (is_slab_pfmemalloc(page)) | ||
713 | goto out; | ||
714 | |||
715 | list_for_each_entry(page, &n->slabs_free, lru) | ||
716 | if (is_slab_pfmemalloc(page)) | ||
717 | goto out; | ||
718 | |||
719 | pfmemalloc_active = false; | ||
720 | out: | ||
721 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
722 | } | ||
723 | |||
724 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
725 | gfp_t flags, bool force_refill) | ||
726 | { | 601 | { |
727 | int i; | 602 | struct kmem_cache_node *n; |
728 | void *objp = ac->entry[--ac->avail]; | 603 | int page_node; |
729 | 604 | LIST_HEAD(list); | |
730 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
731 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
732 | struct kmem_cache_node *n; | ||
733 | |||
734 | if (gfp_pfmemalloc_allowed(flags)) { | ||
735 | clear_obj_pfmemalloc(&objp); | ||
736 | return objp; | ||
737 | } | ||
738 | |||
739 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
740 | for (i = 0; i < ac->avail; i++) { | ||
741 | /* If a !PFMEMALLOC object is found, swap them */ | ||
742 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
743 | objp = ac->entry[i]; | ||
744 | ac->entry[i] = ac->entry[ac->avail]; | ||
745 | ac->entry[ac->avail] = objp; | ||
746 | return objp; | ||
747 | } | ||
748 | } | ||
749 | |||
750 | /* | ||
751 | * If there are empty slabs on the slabs_free list and we are | ||
752 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
753 | */ | ||
754 | n = get_node(cachep, numa_mem_id()); | ||
755 | if (!list_empty(&n->slabs_free) && force_refill) { | ||
756 | struct page *page = virt_to_head_page(objp); | ||
757 | ClearPageSlabPfmemalloc(page); | ||
758 | clear_obj_pfmemalloc(&objp); | ||
759 | recheck_pfmemalloc_active(cachep, ac); | ||
760 | return objp; | ||
761 | } | ||
762 | |||
763 | /* No !PFMEMALLOC objects available */ | ||
764 | ac->avail++; | ||
765 | objp = NULL; | ||
766 | } | ||
767 | |||
768 | return objp; | ||
769 | } | ||
770 | |||
771 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
772 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
773 | { | ||
774 | void *objp; | ||
775 | |||
776 | if (unlikely(sk_memalloc_socks())) | ||
777 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
778 | else | ||
779 | objp = ac->entry[--ac->avail]; | ||
780 | |||
781 | return objp; | ||
782 | } | ||
783 | |||
784 | static noinline void *__ac_put_obj(struct kmem_cache *cachep, | ||
785 | struct array_cache *ac, void *objp) | ||
786 | { | ||
787 | if (unlikely(pfmemalloc_active)) { | ||
788 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
789 | struct page *page = virt_to_head_page(objp); | ||
790 | if (PageSlabPfmemalloc(page)) | ||
791 | set_obj_pfmemalloc(&objp); | ||
792 | } | ||
793 | 605 | ||
794 | return objp; | 606 | page_node = page_to_nid(page); |
795 | } | 607 | n = get_node(cachep, page_node); |
796 | 608 | ||
797 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | 609 | spin_lock(&n->list_lock); |
798 | void *objp) | 610 | free_block(cachep, &objp, 1, page_node, &list); |
799 | { | 611 | spin_unlock(&n->list_lock); |
800 | if (unlikely(sk_memalloc_socks())) | ||
801 | objp = __ac_put_obj(cachep, ac, objp); | ||
802 | 612 | ||
803 | ac->entry[ac->avail++] = objp; | 613 | slabs_destroy(cachep, &list); |
804 | } | 614 | } |
805 | 615 | ||
806 | /* | 616 | /* |
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, | |||
1003 | STATS_INC_ACOVERFLOW(cachep); | 813 | STATS_INC_ACOVERFLOW(cachep); |
1004 | __drain_alien_cache(cachep, ac, page_node, &list); | 814 | __drain_alien_cache(cachep, ac, page_node, &list); |
1005 | } | 815 | } |
1006 | ac_put_obj(cachep, ac, objp); | 816 | ac->entry[ac->avail++] = objp; |
1007 | spin_unlock(&alien->lock); | 817 | spin_unlock(&alien->lock); |
1008 | slabs_destroy(cachep, &list); | 818 | slabs_destroy(cachep, &list); |
1009 | } else { | 819 | } else { |
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1540 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) | 1350 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) |
1541 | return; | 1351 | return; |
1542 | 1352 | ||
1543 | printk(KERN_WARNING | 1353 | pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", |
1544 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1354 | nodeid, gfpflags, &gfpflags); |
1545 | nodeid, gfpflags); | 1355 | pr_warn(" cache: %s, object size: %d, order: %d\n", |
1546 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | ||
1547 | cachep->name, cachep->size, cachep->gfporder); | 1356 | cachep->name, cachep->size, cachep->gfporder); |
1548 | 1357 | ||
1549 | for_each_kmem_cache_node(cachep, node, n) { | 1358 | for_each_kmem_cache_node(cachep, node, n) { |
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1567 | 1376 | ||
1568 | num_slabs += active_slabs; | 1377 | num_slabs += active_slabs; |
1569 | num_objs = num_slabs * cachep->num; | 1378 | num_objs = num_slabs * cachep->num; |
1570 | printk(KERN_WARNING | 1379 | pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", |
1571 | " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", | ||
1572 | node, active_slabs, num_slabs, active_objs, num_objs, | 1380 | node, active_slabs, num_slabs, active_objs, num_objs, |
1573 | free_objects); | 1381 | free_objects); |
1574 | } | 1382 | } |
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1604 | return NULL; | 1412 | return NULL; |
1605 | } | 1413 | } |
1606 | 1414 | ||
1607 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1608 | if (page_is_pfmemalloc(page)) | ||
1609 | pfmemalloc_active = true; | ||
1610 | |||
1611 | nr_pages = (1 << cachep->gfporder); | 1415 | nr_pages = (1 << cachep->gfporder); |
1612 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1416 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1613 | add_zone_page_state(page_zone(page), | 1417 | add_zone_page_state(page_zone(page), |
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1615 | else | 1419 | else |
1616 | add_zone_page_state(page_zone(page), | 1420 | add_zone_page_state(page_zone(page), |
1617 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1421 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1422 | |||
1618 | __SetPageSlab(page); | 1423 | __SetPageSlab(page); |
1619 | if (page_is_pfmemalloc(page)) | 1424 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ |
1425 | if (sk_memalloc_socks() && page_is_pfmemalloc(page)) | ||
1620 | SetPageSlabPfmemalloc(page); | 1426 | SetPageSlabPfmemalloc(page); |
1621 | 1427 | ||
1622 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1428 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
@@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
1670 | } | 1476 | } |
1671 | 1477 | ||
1672 | #if DEBUG | 1478 | #if DEBUG |
1479 | static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) | ||
1480 | { | ||
1481 | if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && | ||
1482 | (cachep->size % PAGE_SIZE) == 0) | ||
1483 | return true; | ||
1484 | |||
1485 | return false; | ||
1486 | } | ||
1673 | 1487 | ||
1674 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1488 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1675 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | 1489 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, |
@@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | |||
1703 | } | 1517 | } |
1704 | *addr++ = 0x87654321; | 1518 | *addr++ = 0x87654321; |
1705 | } | 1519 | } |
1520 | |||
1521 | static void slab_kernel_map(struct kmem_cache *cachep, void *objp, | ||
1522 | int map, unsigned long caller) | ||
1523 | { | ||
1524 | if (!is_debug_pagealloc_cache(cachep)) | ||
1525 | return; | ||
1526 | |||
1527 | if (caller) | ||
1528 | store_stackinfo(cachep, objp, caller); | ||
1529 | |||
1530 | kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); | ||
1531 | } | ||
1532 | |||
1533 | #else | ||
1534 | static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, | ||
1535 | int map, unsigned long caller) {} | ||
1536 | |||
1706 | #endif | 1537 | #endif |
1707 | 1538 | ||
1708 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | 1539 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) |
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1781 | int size, i; | 1612 | int size, i; |
1782 | int lines = 0; | 1613 | int lines = 0; |
1783 | 1614 | ||
1615 | if (is_debug_pagealloc_cache(cachep)) | ||
1616 | return; | ||
1617 | |||
1784 | realobj = (char *)objp + obj_offset(cachep); | 1618 | realobj = (char *)objp + obj_offset(cachep); |
1785 | size = cachep->object_size; | 1619 | size = cachep->object_size; |
1786 | 1620 | ||
@@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, | |||
1842 | struct page *page) | 1676 | struct page *page) |
1843 | { | 1677 | { |
1844 | int i; | 1678 | int i; |
1679 | |||
1680 | if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { | ||
1681 | poison_obj(cachep, page->freelist - obj_offset(cachep), | ||
1682 | POISON_FREE); | ||
1683 | } | ||
1684 | |||
1845 | for (i = 0; i < cachep->num; i++) { | 1685 | for (i = 0; i < cachep->num; i++) { |
1846 | void *objp = index_to_obj(cachep, page, i); | 1686 | void *objp = index_to_obj(cachep, page, i); |
1847 | 1687 | ||
1848 | if (cachep->flags & SLAB_POISON) { | 1688 | if (cachep->flags & SLAB_POISON) { |
1849 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
1850 | if (cachep->size % PAGE_SIZE == 0 && | ||
1851 | OFF_SLAB(cachep)) | ||
1852 | kernel_map_pages(virt_to_page(objp), | ||
1853 | cachep->size / PAGE_SIZE, 1); | ||
1854 | else | ||
1855 | check_poison_obj(cachep, objp); | ||
1856 | #else | ||
1857 | check_poison_obj(cachep, objp); | 1689 | check_poison_obj(cachep, objp); |
1858 | #endif | 1690 | slab_kernel_map(cachep, objp, 1, 0); |
1859 | } | 1691 | } |
1860 | if (cachep->flags & SLAB_RED_ZONE) { | 1692 | if (cachep->flags & SLAB_RED_ZONE) { |
1861 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 1693 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
@@ -1916,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | |||
1916 | * calculate_slab_order - calculate size (page order) of slabs | 1748 | * calculate_slab_order - calculate size (page order) of slabs |
1917 | * @cachep: pointer to the cache that is being created | 1749 | * @cachep: pointer to the cache that is being created |
1918 | * @size: size of objects to be created in this cache. | 1750 | * @size: size of objects to be created in this cache. |
1919 | * @align: required alignment for the objects. | ||
1920 | * @flags: slab allocation flags | 1751 | * @flags: slab allocation flags |
1921 | * | 1752 | * |
1922 | * Also calculates the number of objects per slab. | 1753 | * Also calculates the number of objects per slab. |
@@ -1926,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | |||
1926 | * towards high-order requests, this should be changed. | 1757 | * towards high-order requests, this should be changed. |
1927 | */ | 1758 | */ |
1928 | static size_t calculate_slab_order(struct kmem_cache *cachep, | 1759 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1929 | size_t size, size_t align, unsigned long flags) | 1760 | size_t size, unsigned long flags) |
1930 | { | 1761 | { |
1931 | unsigned long offslab_limit; | ||
1932 | size_t left_over = 0; | 1762 | size_t left_over = 0; |
1933 | int gfporder; | 1763 | int gfporder; |
1934 | 1764 | ||
@@ -1936,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1936 | unsigned int num; | 1766 | unsigned int num; |
1937 | size_t remainder; | 1767 | size_t remainder; |
1938 | 1768 | ||
1939 | cache_estimate(gfporder, size, align, flags, &remainder, &num); | 1769 | num = cache_estimate(gfporder, size, flags, &remainder); |
1940 | if (!num) | 1770 | if (!num) |
1941 | continue; | 1771 | continue; |
1942 | 1772 | ||
@@ -1945,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1945 | break; | 1775 | break; |
1946 | 1776 | ||
1947 | if (flags & CFLGS_OFF_SLAB) { | 1777 | if (flags & CFLGS_OFF_SLAB) { |
1948 | size_t freelist_size_per_obj = sizeof(freelist_idx_t); | 1778 | struct kmem_cache *freelist_cache; |
1779 | size_t freelist_size; | ||
1780 | |||
1781 | freelist_size = num * sizeof(freelist_idx_t); | ||
1782 | freelist_cache = kmalloc_slab(freelist_size, 0u); | ||
1783 | if (!freelist_cache) | ||
1784 | continue; | ||
1785 | |||
1949 | /* | 1786 | /* |
1950 | * Max number of objs-per-slab for caches which | 1787 | * Needed to avoid possible looping condition |
1951 | * use off-slab slabs. Needed to avoid a possible | 1788 | * in cache_grow() |
1952 | * looping condition in cache_grow(). | ||
1953 | */ | 1789 | */ |
1954 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | 1790 | if (OFF_SLAB(freelist_cache)) |
1955 | freelist_size_per_obj += sizeof(char); | 1791 | continue; |
1956 | offslab_limit = size; | ||
1957 | offslab_limit /= freelist_size_per_obj; | ||
1958 | 1792 | ||
1959 | if (num > offslab_limit) | 1793 | /* check if off slab has enough benefit */ |
1960 | break; | 1794 | if (freelist_cache->size > cachep->size / 2) |
1795 | continue; | ||
1961 | } | 1796 | } |
1962 | 1797 | ||
1963 | /* Found something acceptable - save it away */ | 1798 | /* Found something acceptable - save it away */ |
@@ -2075,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
2075 | return cachep; | 1910 | return cachep; |
2076 | } | 1911 | } |
2077 | 1912 | ||
1913 | static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, | ||
1914 | size_t size, unsigned long flags) | ||
1915 | { | ||
1916 | size_t left; | ||
1917 | |||
1918 | cachep->num = 0; | ||
1919 | |||
1920 | if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) | ||
1921 | return false; | ||
1922 | |||
1923 | left = calculate_slab_order(cachep, size, | ||
1924 | flags | CFLGS_OBJFREELIST_SLAB); | ||
1925 | if (!cachep->num) | ||
1926 | return false; | ||
1927 | |||
1928 | if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) | ||
1929 | return false; | ||
1930 | |||
1931 | cachep->colour = left / cachep->colour_off; | ||
1932 | |||
1933 | return true; | ||
1934 | } | ||
1935 | |||
1936 | static bool set_off_slab_cache(struct kmem_cache *cachep, | ||
1937 | size_t size, unsigned long flags) | ||
1938 | { | ||
1939 | size_t left; | ||
1940 | |||
1941 | cachep->num = 0; | ||
1942 | |||
1943 | /* | ||
1944 | * Always use on-slab management when SLAB_NOLEAKTRACE | ||
1945 | * to avoid recursive calls into kmemleak. | ||
1946 | */ | ||
1947 | if (flags & SLAB_NOLEAKTRACE) | ||
1948 | return false; | ||
1949 | |||
1950 | /* | ||
1951 | * Size is large, assume best to place the slab management obj | ||
1952 | * off-slab (should allow better packing of objs). | ||
1953 | */ | ||
1954 | left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); | ||
1955 | if (!cachep->num) | ||
1956 | return false; | ||
1957 | |||
1958 | /* | ||
1959 | * If the slab has been placed off-slab, and we have enough space then | ||
1960 | * move it on-slab. This is at the expense of any extra colouring. | ||
1961 | */ | ||
1962 | if (left >= cachep->num * sizeof(freelist_idx_t)) | ||
1963 | return false; | ||
1964 | |||
1965 | cachep->colour = left / cachep->colour_off; | ||
1966 | |||
1967 | return true; | ||
1968 | } | ||
1969 | |||
1970 | static bool set_on_slab_cache(struct kmem_cache *cachep, | ||
1971 | size_t size, unsigned long flags) | ||
1972 | { | ||
1973 | size_t left; | ||
1974 | |||
1975 | cachep->num = 0; | ||
1976 | |||
1977 | left = calculate_slab_order(cachep, size, flags); | ||
1978 | if (!cachep->num) | ||
1979 | return false; | ||
1980 | |||
1981 | cachep->colour = left / cachep->colour_off; | ||
1982 | |||
1983 | return true; | ||
1984 | } | ||
1985 | |||
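The three helpers above encode a placement policy that __kmem_cache_create() tries in order. A loose, illustrative model of that decision follows; the thresholds track the hunks, but the function and its parameters are inventions for the example, not kernel API:

    #include <stdio.h>

    enum layout { OBJFREELIST, OFF_SLAB, ON_SLAB };

    static enum layout pick_layout(unsigned long obj_size, unsigned int num,
                                   unsigned long left_over, int has_ctor_or_rcu,
                                   unsigned long idx_size)
    {
        unsigned long freelist_size = num * idx_size;

        /* Reuse a free object for the freelist: needs no ctor/RCU and must fit. */
        if (!has_ctor_or_rcu && freelist_size <= obj_size)
            return OBJFREELIST;

        /* Off-slab only when the leftover space cannot hold the freelist. */
        if (left_over < freelist_size)
            return OFF_SLAB;

        return ON_SLAB;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               pick_layout(256, 15, 80, 0, 1),     /* freelist fits in an object  */
               pick_layout(8, 400, 100, 1, 1),     /* ctor + no room: off-slab    */
               pick_layout(64, 60, 200, 1, 1));    /* leftover holds the freelist */
        return 0;
    }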
2078 | /** | 1986 | /** |
2079 | * __kmem_cache_create - Create a cache. | 1987 | * __kmem_cache_create - Create a cache. |
2080 | * @cachep: cache management descriptor | 1988 | * @cachep: cache management descriptor |
@@ -2099,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
2099 | int | 2007 | int |
2100 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | 2008 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) |
2101 | { | 2009 | { |
2102 | size_t left_over, freelist_size; | ||
2103 | size_t ralign = BYTES_PER_WORD; | 2010 | size_t ralign = BYTES_PER_WORD; |
2104 | gfp_t gfp; | 2011 | gfp_t gfp; |
2105 | int err; | 2012 | int err; |
@@ -2119,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2119 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 2026 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
2120 | flags |= SLAB_POISON; | 2027 | flags |= SLAB_POISON; |
2121 | #endif | 2028 | #endif |
2122 | if (flags & SLAB_DESTROY_BY_RCU) | ||
2123 | BUG_ON(flags & SLAB_POISON); | ||
2124 | #endif | 2029 | #endif |
2125 | 2030 | ||
2126 | /* | 2031 | /* |
@@ -2152,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2152 | * 4) Store it. | 2057 | * 4) Store it. |
2153 | */ | 2058 | */ |
2154 | cachep->align = ralign; | 2059 | cachep->align = ralign; |
2060 | cachep->colour_off = cache_line_size(); | ||
2061 | /* Offset must be a multiple of the alignment. */ | ||
2062 | if (cachep->colour_off < cachep->align) | ||
2063 | cachep->colour_off = cachep->align; | ||
2155 | 2064 | ||
2156 | if (slab_is_available()) | 2065 | if (slab_is_available()) |
2157 | gfp = GFP_KERNEL; | 2066 | gfp = GFP_KERNEL; |
@@ -2179,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2179 | else | 2088 | else |
2180 | size += BYTES_PER_WORD; | 2089 | size += BYTES_PER_WORD; |
2181 | } | 2090 | } |
2182 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | ||
2183 | /* | ||
2184 | * To activate debug pagealloc, off-slab management is necessary | ||
2185 | * requirement. In early phase of initialization, small sized slab | ||
2186 | * doesn't get initialized so it would not be possible. So, we need | ||
2187 | * to check size >= 256. It guarantees that all necessary small | ||
2188 | * sized slab is initialized in current slab initialization sequence. | ||
2189 | */ | ||
2190 | if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && | ||
2191 | size >= 256 && cachep->object_size > cache_line_size() && | ||
2192 | ALIGN(size, cachep->align) < PAGE_SIZE) { | ||
2193 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); | ||
2194 | size = PAGE_SIZE; | ||
2195 | } | ||
2196 | #endif | ||
2197 | #endif | 2091 | #endif |
2198 | 2092 | ||
2199 | /* | ||
2200 | * Determine if the slab management is 'on' or 'off' slab. | ||
2201 | * (bootstrapping cannot cope with offslab caches so don't do | ||
2202 | * it too early on. Always use on-slab management when | ||
2203 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | ||
2204 | */ | ||
2205 | if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && | ||
2206 | !(flags & SLAB_NOLEAKTRACE)) | ||
2207 | /* | ||
2208 | * Size is large, assume best to place the slab management obj | ||
2209 | * off-slab (should allow better packing of objs). | ||
2210 | */ | ||
2211 | flags |= CFLGS_OFF_SLAB; | ||
2212 | |||
2213 | size = ALIGN(size, cachep->align); | 2093 | size = ALIGN(size, cachep->align); |
2214 | /* | 2094 | /* |
2215 | * We should restrict the number of objects in a slab to implement | 2095 | * We should restrict the number of objects in a slab to implement |
@@ -2218,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2218 | if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) | 2098 | if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) |
2219 | size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); | 2099 | size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); |
2220 | 2100 | ||
2221 | left_over = calculate_slab_order(cachep, size, cachep->align, flags); | 2101 | #if DEBUG |
2222 | |||
2223 | if (!cachep->num) | ||
2224 | return -E2BIG; | ||
2225 | |||
2226 | freelist_size = calculate_freelist_size(cachep->num, cachep->align); | ||
2227 | |||
2228 | /* | 2102 | /* |
2229 | * If the slab has been placed off-slab, and we have enough space then | 2103 | * To activate debug pagealloc, off-slab management is a necessary |
2230 | * move it on-slab. This is at the expense of any extra colouring. | 2104 | * requirement. In the early phase of initialization, small sized |
2105 | * slabs don't get initialized so it would not be possible. So, we need | ||
2106 | * to check size >= 256. It guarantees that all necessary small | ||
2107 | * sized slabs are initialized in the current slab initialization sequence. | ||
2231 | */ | 2108 | */ |
2232 | if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { | 2109 | if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && |
2233 | flags &= ~CFLGS_OFF_SLAB; | 2110 | size >= 256 && cachep->object_size > cache_line_size()) { |
2234 | left_over -= freelist_size; | 2111 | if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { |
2112 | size_t tmp_size = ALIGN(size, PAGE_SIZE); | ||
2113 | |||
2114 | if (set_off_slab_cache(cachep, tmp_size, flags)) { | ||
2115 | flags |= CFLGS_OFF_SLAB; | ||
2116 | cachep->obj_offset += tmp_size - size; | ||
2117 | size = tmp_size; | ||
2118 | goto done; | ||
2119 | } | ||
2120 | } | ||
2235 | } | 2121 | } |
2122 | #endif | ||
2236 | 2123 | ||
2237 | if (flags & CFLGS_OFF_SLAB) { | 2124 | if (set_objfreelist_slab_cache(cachep, size, flags)) { |
2238 | /* really off slab. No need for manual alignment */ | 2125 | flags |= CFLGS_OBJFREELIST_SLAB; |
2239 | freelist_size = calculate_freelist_size(cachep->num, 0); | 2126 | goto done; |
2127 | } | ||
2240 | 2128 | ||
2241 | #ifdef CONFIG_PAGE_POISONING | 2129 | if (set_off_slab_cache(cachep, size, flags)) { |
2242 | /* If we're going to use the generic kernel_map_pages() | 2130 | flags |= CFLGS_OFF_SLAB; |
2243 | * poisoning, then it's going to smash the contents of | 2131 | goto done; |
2244 | * the redzone and userword anyhow, so switch them off. | ||
2245 | */ | ||
2246 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) | ||
2247 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2248 | #endif | ||
2249 | } | 2132 | } |
2250 | 2133 | ||
2251 | cachep->colour_off = cache_line_size(); | 2134 | if (set_on_slab_cache(cachep, size, flags)) |
2252 | /* Offset must be a multiple of the alignment. */ | 2135 | goto done; |
2253 | if (cachep->colour_off < cachep->align) | 2136 | |
2254 | cachep->colour_off = cachep->align; | 2137 | return -E2BIG; |
2255 | cachep->colour = left_over / cachep->colour_off; | 2138 | |
2256 | cachep->freelist_size = freelist_size; | 2139 | done: |
2140 | cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); | ||
2257 | cachep->flags = flags; | 2141 | cachep->flags = flags; |
2258 | cachep->allocflags = __GFP_COMP; | 2142 | cachep->allocflags = __GFP_COMP; |
2259 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) | 2143 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) |
@@ -2261,16 +2145,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2261 | cachep->size = size; | 2145 | cachep->size = size; |
2262 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2146 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2263 | 2147 | ||
2264 | if (flags & CFLGS_OFF_SLAB) { | 2148 | #if DEBUG |
2265 | cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); | 2149 | /* |
2266 | /* | 2150 | * If we're going to use the generic kernel_map_pages() |
2267 | * This is a possibility for one of the kmalloc_{dma,}_caches. | 2151 | * poisoning, then it's going to smash the contents of |
2268 | * But since we go off slab only for object size greater than | 2152 | * the redzone and userword anyhow, so switch them off. |
2269 | * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created | 2153 | */ |
2270 | * in ascending order, this should not happen at all. | 2154 | if (IS_ENABLED(CONFIG_PAGE_POISONING) && |
2271 | * But leave a BUG_ON for some lucky dude. | 2155 | (cachep->flags & SLAB_POISON) && |
2272 | */ | 2156 | is_debug_pagealloc_cache(cachep)) |
2273 | BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); | 2157 | cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2158 | #endif | ||
2159 | |||
2160 | if (OFF_SLAB(cachep)) { | ||
2161 | cachep->freelist_cache = | ||
2162 | kmalloc_slab(cachep->freelist_size, 0u); | ||
2274 | } | 2163 | } |
2275 | 2164 | ||
2276 | err = setup_cpu_cache(cachep, gfp); | 2165 | err = setup_cpu_cache(cachep, gfp); |
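The rewritten __kmem_cache_create() above replaces the old single off-slab heuristic with a three-step fallback: keep the freelist inside a free object (OBJFREELIST), else in a separate kmalloc'd buffer (OFF_SLAB), else at the tail of the slab itself, and only then give up with -E2BIG. A minimal user-space sketch of that decision order; the helper names and size thresholds here are hypothetical stand-ins, not the kernel's set_objfreelist_slab_cache()/set_off_slab_cache()/set_on_slab_cache().

#include <stdbool.h>
#include <stdio.h>

#define CFLGS_OBJFREELIST_SLAB 0x1UL   /* freelist stored in an unused object */
#define CFLGS_OFF_SLAB         0x2UL   /* freelist in a separate kmalloc buffer */

struct toy_cache {
	unsigned long flags;
	size_t size;
};

/* Hypothetical stand-ins for the real set_*_cache() helpers. */
static bool fits_objfreelist(struct toy_cache *c) { return c->size >= 64; }
static bool fits_off_slab(struct toy_cache *c)    { return c->size >= 512; }
static bool fits_on_slab(struct toy_cache *c)     { return c->size <= 4096; }

static int pick_layout(struct toy_cache *c)
{
	if (fits_objfreelist(c)) {
		c->flags |= CFLGS_OBJFREELIST_SLAB;
		return 0;
	}
	if (fits_off_slab(c)) {
		c->flags |= CFLGS_OFF_SLAB;
		return 0;
	}
	if (fits_on_slab(c))
		return 0;
	return -7;	/* the real code returns -E2BIG here */
}

int main(void)
{
	struct toy_cache c = { .flags = 0, .size = 192 };

	printf("ret=%d flags=%#lx\n", pick_layout(&c), c.flags);
	return 0;
}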
@@ -2377,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache, | |||
2377 | } | 2266 | } |
2378 | 2267 | ||
2379 | page = list_entry(p, struct page, lru); | 2268 | page = list_entry(p, struct page, lru); |
2380 | #if DEBUG | ||
2381 | BUG_ON(page->active); | ||
2382 | #endif | ||
2383 | list_del(&page->lru); | 2269 | list_del(&page->lru); |
2384 | /* | 2270 | /* |
2385 | * Safe to drop the lock. The slab is no longer linked | 2271 | * Safe to drop the lock. The slab is no longer linked |
@@ -2454,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, | |||
2454 | void *freelist; | 2340 | void *freelist; |
2455 | void *addr = page_address(page); | 2341 | void *addr = page_address(page); |
2456 | 2342 | ||
2457 | if (OFF_SLAB(cachep)) { | 2343 | page->s_mem = addr + colour_off; |
2344 | page->active = 0; | ||
2345 | |||
2346 | if (OBJFREELIST_SLAB(cachep)) | ||
2347 | freelist = NULL; | ||
2348 | else if (OFF_SLAB(cachep)) { | ||
2458 | /* Slab management obj is off-slab. */ | 2349 | /* Slab management obj is off-slab. */ |
2459 | freelist = kmem_cache_alloc_node(cachep->freelist_cache, | 2350 | freelist = kmem_cache_alloc_node(cachep->freelist_cache, |
2460 | local_flags, nodeid); | 2351 | local_flags, nodeid); |
2461 | if (!freelist) | 2352 | if (!freelist) |
2462 | return NULL; | 2353 | return NULL; |
2463 | } else { | 2354 | } else { |
2464 | freelist = addr + colour_off; | 2355 | /* We will use the last bytes of the slab for the freelist */ |
2465 | colour_off += cachep->freelist_size; | 2356 | freelist = addr + (PAGE_SIZE << cachep->gfporder) - |
2357 | cachep->freelist_size; | ||
2466 | } | 2358 | } |
2467 | page->active = 0; | 2359 | |
2468 | page->s_mem = addr + colour_off; | ||
2469 | return freelist; | 2360 | return freelist; |
2470 | } | 2361 | } |
2471 | 2362 | ||
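alloc_slabmgmt() now always anchors s_mem at the colour offset and, for plain on-slab caches, places the freelist at the very end of the slab instead of in front of the objects. A hedged arithmetic sketch of that placement; page_address() and the slab order are mocked and the numbers are illustrative only.

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long addr = 0x100000;          /* pretend page_address(page) */
	unsigned int gfporder = 1;              /* two pages per slab */
	unsigned long colour_off = 64;          /* cache colour offset */
	unsigned long freelist_size = 32;       /* num * sizeof(freelist_idx_t) */

	unsigned long s_mem = addr + colour_off;
	/* on-slab case: the freelist occupies the last bytes of the slab */
	unsigned long freelist = addr + (PAGE_SIZE << gfporder) - freelist_size;

	printf("objects start at %#lx, freelist at %#lx..%#lx\n",
	       s_mem, freelist, addr + (PAGE_SIZE << gfporder));
	return 0;
}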
@@ -2480,17 +2371,14 @@ static inline void set_free_obj(struct page *page, | |||
2480 | ((freelist_idx_t *)(page->freelist))[idx] = val; | 2371 | ((freelist_idx_t *)(page->freelist))[idx] = val; |
2481 | } | 2372 | } |
2482 | 2373 | ||
2483 | static void cache_init_objs(struct kmem_cache *cachep, | 2374 | static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) |
2484 | struct page *page) | ||
2485 | { | 2375 | { |
2376 | #if DEBUG | ||
2486 | int i; | 2377 | int i; |
2487 | 2378 | ||
2488 | for (i = 0; i < cachep->num; i++) { | 2379 | for (i = 0; i < cachep->num; i++) { |
2489 | void *objp = index_to_obj(cachep, page, i); | 2380 | void *objp = index_to_obj(cachep, page, i); |
2490 | #if DEBUG | 2381 | |
2491 | /* need to poison the objs? */ | ||
2492 | if (cachep->flags & SLAB_POISON) | ||
2493 | poison_obj(cachep, objp, POISON_FREE); | ||
2494 | if (cachep->flags & SLAB_STORE_USER) | 2382 | if (cachep->flags & SLAB_STORE_USER) |
2495 | *dbg_userword(cachep, objp) = NULL; | 2383 | *dbg_userword(cachep, objp) = NULL; |
2496 | 2384 | ||
@@ -2514,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2514 | slab_error(cachep, "constructor overwrote the" | 2402 | slab_error(cachep, "constructor overwrote the" |
2515 | " start of an object"); | 2403 | " start of an object"); |
2516 | } | 2404 | } |
2517 | if ((cachep->size % PAGE_SIZE) == 0 && | 2405 | /* need to poison the objs? */ |
2518 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2406 | if (cachep->flags & SLAB_POISON) { |
2519 | kernel_map_pages(virt_to_page(objp), | 2407 | poison_obj(cachep, objp, POISON_FREE); |
2520 | cachep->size / PAGE_SIZE, 0); | 2408 | slab_kernel_map(cachep, objp, 0, 0); |
2521 | #else | 2409 | } |
2522 | if (cachep->ctor) | 2410 | } |
2523 | cachep->ctor(objp); | ||
2524 | #endif | 2411 | #endif |
2525 | set_obj_status(page, i, OBJECT_FREE); | 2412 | } |
2413 | |||
2414 | static void cache_init_objs(struct kmem_cache *cachep, | ||
2415 | struct page *page) | ||
2416 | { | ||
2417 | int i; | ||
2418 | |||
2419 | cache_init_objs_debug(cachep, page); | ||
2420 | |||
2421 | if (OBJFREELIST_SLAB(cachep)) { | ||
2422 | page->freelist = index_to_obj(cachep, page, cachep->num - 1) + | ||
2423 | obj_offset(cachep); | ||
2424 | } | ||
2425 | |||
2426 | for (i = 0; i < cachep->num; i++) { | ||
2427 | /* constructor could break poison info */ | ||
2428 | if (DEBUG == 0 && cachep->ctor) | ||
2429 | cachep->ctor(index_to_obj(cachep, page, i)); | ||
2430 | |||
2526 | set_free_obj(page, i, i); | 2431 | set_free_obj(page, i, i); |
2527 | } | 2432 | } |
2528 | } | 2433 | } |
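For OBJFREELIST caches, cache_init_objs() points page->freelist into the last object of the slab, so the index array costs no extra space while every object is free; the indices themselves are still initialised as the identity permutation. A small sketch of that initialisation using plain arrays instead of struct page; index_to_obj() and set_free_obj() are approximated by array accesses.

#include <stdio.h>

#define NUM_OBJS 8
typedef unsigned char freelist_idx_t;

int main(void)
{
	char objects[NUM_OBJS][32];             /* fake slab payload */
	freelist_idx_t *freelist;
	int i;

	/* OBJFREELIST: the index array lives inside the last (free) object */
	freelist = (freelist_idx_t *)objects[NUM_OBJS - 1];

	/* identity permutation: free slot i initially refers to object i */
	for (i = 0; i < NUM_OBJS; i++)
		freelist[i] = i;

	for (i = 0; i < NUM_OBJS; i++)
		printf("free slot %d -> object %u\n", i, (unsigned)freelist[i]);
	return 0;
}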
@@ -2537,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | |||
2537 | } | 2442 | } |
2538 | } | 2443 | } |
2539 | 2444 | ||
2540 | static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, | 2445 | static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) |
2541 | int nodeid) | ||
2542 | { | 2446 | { |
2543 | void *objp; | 2447 | void *objp; |
2544 | 2448 | ||
2545 | objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); | 2449 | objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); |
2546 | page->active++; | 2450 | page->active++; |
2451 | |||
2547 | #if DEBUG | 2452 | #if DEBUG |
2548 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); | 2453 | if (cachep->flags & SLAB_STORE_USER) |
2454 | set_store_user_dirty(cachep); | ||
2549 | #endif | 2455 | #endif |
2550 | 2456 | ||
2551 | return objp; | 2457 | return objp; |
2552 | } | 2458 | } |
2553 | 2459 | ||
2554 | static void slab_put_obj(struct kmem_cache *cachep, struct page *page, | 2460 | static void slab_put_obj(struct kmem_cache *cachep, |
2555 | void *objp, int nodeid) | 2461 | struct page *page, void *objp) |
2556 | { | 2462 | { |
2557 | unsigned int objnr = obj_to_index(cachep, page, objp); | 2463 | unsigned int objnr = obj_to_index(cachep, page, objp); |
2558 | #if DEBUG | 2464 | #if DEBUG |
2559 | unsigned int i; | 2465 | unsigned int i; |
2560 | 2466 | ||
2561 | /* Verify that the slab belongs to the intended node */ | ||
2562 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); | ||
2563 | |||
2564 | /* Verify double free bug */ | 2467 | /* Verify double free bug */ |
2565 | for (i = page->active; i < cachep->num; i++) { | 2468 | for (i = page->active; i < cachep->num; i++) { |
2566 | if (get_free_obj(page, i) == objnr) { | 2469 | if (get_free_obj(page, i) == objnr) { |
@@ -2571,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, | |||
2571 | } | 2474 | } |
2572 | #endif | 2475 | #endif |
2573 | page->active--; | 2476 | page->active--; |
2477 | if (!page->freelist) | ||
2478 | page->freelist = objp + obj_offset(cachep); | ||
2479 | |||
2574 | set_free_obj(page, page->active, objnr); | 2480 | set_free_obj(page, page->active, objnr); |
2575 | } | 2481 | } |
2576 | 2482 | ||
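slab_get_obj() and slab_put_obj() no longer take a node id; they treat the per-slab index array as a LIFO stack whose depth is page->active, and on free they also re-point page->freelist at the returned object so an OBJFREELIST slab regains a home for its index array. A compact stack model of that behaviour, with get_free_obj()/set_free_obj() reduced to array accesses.

#include <assert.h>
#include <stdio.h>

#define NUM_OBJS 4
typedef unsigned char freelist_idx_t;

static freelist_idx_t freelist[NUM_OBJS] = { 0, 1, 2, 3 };
static unsigned int active;                     /* page->active */

static unsigned int get_obj(void)
{
	unsigned int objnr = freelist[active];  /* get_free_obj(page, active) */

	active++;
	return objnr;
}

static void put_obj(unsigned int objnr)
{
	assert(active > 0);
	active--;
	freelist[active] = objnr;               /* set_free_obj(page, active, objnr) */
}

int main(void)
{
	unsigned int a = get_obj(), b = get_obj();

	printf("allocated %u and %u, active=%u\n", a, b, active);
	put_obj(a);
	printf("freed %u, next alloc would be %u\n", a, (unsigned)freelist[active]);
	return 0;
}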
@@ -2645,7 +2551,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2645 | /* Get slab management. */ | 2551 | /* Get slab management. */ |
2646 | freelist = alloc_slabmgmt(cachep, page, offset, | 2552 | freelist = alloc_slabmgmt(cachep, page, offset, |
2647 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); | 2553 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
2648 | if (!freelist) | 2554 | if (OFF_SLAB(cachep) && !freelist) |
2649 | goto opps1; | 2555 | goto opps1; |
2650 | 2556 | ||
2651 | slab_map_pages(cachep, page, freelist); | 2557 | slab_map_pages(cachep, page, freelist); |
@@ -2726,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2726 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2632 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2727 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2633 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2728 | } | 2634 | } |
2729 | if (cachep->flags & SLAB_STORE_USER) | 2635 | if (cachep->flags & SLAB_STORE_USER) { |
2636 | set_store_user_dirty(cachep); | ||
2730 | *dbg_userword(cachep, objp) = (void *)caller; | 2637 | *dbg_userword(cachep, objp) = (void *)caller; |
2638 | } | ||
2731 | 2639 | ||
2732 | objnr = obj_to_index(cachep, page, objp); | 2640 | objnr = obj_to_index(cachep, page, objp); |
2733 | 2641 | ||
2734 | BUG_ON(objnr >= cachep->num); | 2642 | BUG_ON(objnr >= cachep->num); |
2735 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); | 2643 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); |
2736 | 2644 | ||
2737 | set_obj_status(page, objnr, OBJECT_FREE); | ||
2738 | if (cachep->flags & SLAB_POISON) { | 2645 | if (cachep->flags & SLAB_POISON) { |
2739 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
2740 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | ||
2741 | store_stackinfo(cachep, objp, caller); | ||
2742 | kernel_map_pages(virt_to_page(objp), | ||
2743 | cachep->size / PAGE_SIZE, 0); | ||
2744 | } else { | ||
2745 | poison_obj(cachep, objp, POISON_FREE); | ||
2746 | } | ||
2747 | #else | ||
2748 | poison_obj(cachep, objp, POISON_FREE); | 2646 | poison_obj(cachep, objp, POISON_FREE); |
2749 | #endif | 2647 | slab_kernel_map(cachep, objp, 0, caller); |
2750 | } | 2648 | } |
2751 | return objp; | 2649 | return objp; |
2752 | } | 2650 | } |
@@ -2756,7 +2654,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2756 | #define cache_free_debugcheck(x,objp,z) (objp) | 2654 | #define cache_free_debugcheck(x,objp,z) (objp) |
2757 | #endif | 2655 | #endif |
2758 | 2656 | ||
2759 | static struct page *get_first_slab(struct kmem_cache_node *n) | 2657 | static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, |
2658 | void **list) | ||
2659 | { | ||
2660 | #if DEBUG | ||
2661 | void *next = *list; | ||
2662 | void *objp; | ||
2663 | |||
2664 | while (next) { | ||
2665 | objp = next - obj_offset(cachep); | ||
2666 | next = *(void **)next; | ||
2667 | poison_obj(cachep, objp, POISON_FREE); | ||
2668 | } | ||
2669 | #endif | ||
2670 | } | ||
2671 | |||
2672 | static inline void fixup_slab_list(struct kmem_cache *cachep, | ||
2673 | struct kmem_cache_node *n, struct page *page, | ||
2674 | void **list) | ||
2675 | { | ||
2676 | /* move slabp to correct slabp list: */ | ||
2677 | list_del(&page->lru); | ||
2678 | if (page->active == cachep->num) { | ||
2679 | list_add(&page->lru, &n->slabs_full); | ||
2680 | if (OBJFREELIST_SLAB(cachep)) { | ||
2681 | #if DEBUG | ||
2682 | /* Poisoning will be done without holding the lock */ | ||
2683 | if (cachep->flags & SLAB_POISON) { | ||
2684 | void **objp = page->freelist; | ||
2685 | |||
2686 | *objp = *list; | ||
2687 | *list = objp; | ||
2688 | } | ||
2689 | #endif | ||
2690 | page->freelist = NULL; | ||
2691 | } | ||
2692 | } else | ||
2693 | list_add(&page->lru, &n->slabs_partial); | ||
2694 | } | ||
2695 | |||
2696 | /* Try to find non-pfmemalloc slab if needed */ | ||
2697 | static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, | ||
2698 | struct page *page, bool pfmemalloc) | ||
2699 | { | ||
2700 | if (!page) | ||
2701 | return NULL; | ||
2702 | |||
2703 | if (pfmemalloc) | ||
2704 | return page; | ||
2705 | |||
2706 | if (!PageSlabPfmemalloc(page)) | ||
2707 | return page; | ||
2708 | |||
2709 | /* No need to keep pfmemalloc slab if we have enough free objects */ | ||
2710 | if (n->free_objects > n->free_limit) { | ||
2711 | ClearPageSlabPfmemalloc(page); | ||
2712 | return page; | ||
2713 | } | ||
2714 | |||
2715 | /* Move pfmemalloc slab to the end of list to speed up next search */ | ||
2716 | list_del(&page->lru); | ||
2717 | if (!page->active) | ||
2718 | list_add_tail(&page->lru, &n->slabs_free); | ||
2719 | else | ||
2720 | list_add_tail(&page->lru, &n->slabs_partial); | ||
2721 | |||
2722 | list_for_each_entry(page, &n->slabs_partial, lru) { | ||
2723 | if (!PageSlabPfmemalloc(page)) | ||
2724 | return page; | ||
2725 | } | ||
2726 | |||
2727 | list_for_each_entry(page, &n->slabs_free, lru) { | ||
2728 | if (!PageSlabPfmemalloc(page)) | ||
2729 | return page; | ||
2730 | } | ||
2731 | |||
2732 | return NULL; | ||
2733 | } | ||
2734 | |||
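get_valid_first_slab() replaces the old per-object pfmemalloc markers: a reserve-backed slab is handed out to a pfmemalloc request immediately, cleared if the node already has plenty of free objects, and otherwise pushed to the tail of its list while the partial and free lists are scanned for an ordinary slab. A hedged, list-free restatement of that policy; the struct and helper below are stand-ins, not kernel types.

#include <stdbool.h>
#include <stdio.h>

struct toy_slab {
	bool pfmemalloc;        /* PageSlabPfmemalloc() */
};

/* Decide whether this slab may satisfy the current allocation. */
static bool slab_usable(struct toy_slab *slab, bool pfmemalloc_alloc,
			unsigned long free_objects, unsigned long free_limit)
{
	if (pfmemalloc_alloc)           /* reserve-backed request: anything goes */
		return true;
	if (!slab->pfmemalloc)          /* ordinary slab: fine for everyone */
		return true;
	if (free_objects > free_limit) {/* plenty spare: stop protecting it */
		slab->pfmemalloc = false;
		return true;
	}
	return false;                   /* keep it for pfmemalloc users, look on */
}

int main(void)
{
	struct toy_slab s = { .pfmemalloc = true };

	printf("normal alloc usable: %d\n", slab_usable(&s, false, 10, 100));
	printf("pfmemalloc alloc usable: %d\n", slab_usable(&s, true, 10, 100));
	return 0;
}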
2735 | static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) | ||
2760 | { | 2736 | { |
2761 | struct page *page; | 2737 | struct page *page; |
2762 | 2738 | ||
@@ -2768,21 +2744,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n) | |||
2768 | struct page, lru); | 2744 | struct page, lru); |
2769 | } | 2745 | } |
2770 | 2746 | ||
2747 | if (sk_memalloc_socks()) | ||
2748 | return get_valid_first_slab(n, page, pfmemalloc); | ||
2749 | |||
2771 | return page; | 2750 | return page; |
2772 | } | 2751 | } |
2773 | 2752 | ||
2774 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, | 2753 | static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, |
2775 | bool force_refill) | 2754 | struct kmem_cache_node *n, gfp_t flags) |
2755 | { | ||
2756 | struct page *page; | ||
2757 | void *obj; | ||
2758 | void *list = NULL; | ||
2759 | |||
2760 | if (!gfp_pfmemalloc_allowed(flags)) | ||
2761 | return NULL; | ||
2762 | |||
2763 | spin_lock(&n->list_lock); | ||
2764 | page = get_first_slab(n, true); | ||
2765 | if (!page) { | ||
2766 | spin_unlock(&n->list_lock); | ||
2767 | return NULL; | ||
2768 | } | ||
2769 | |||
2770 | obj = slab_get_obj(cachep, page); | ||
2771 | n->free_objects--; | ||
2772 | |||
2773 | fixup_slab_list(cachep, n, page, &list); | ||
2774 | |||
2775 | spin_unlock(&n->list_lock); | ||
2776 | fixup_objfreelist_debug(cachep, &list); | ||
2777 | |||
2778 | return obj; | ||
2779 | } | ||
2780 | |||
2781 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | ||
2776 | { | 2782 | { |
2777 | int batchcount; | 2783 | int batchcount; |
2778 | struct kmem_cache_node *n; | 2784 | struct kmem_cache_node *n; |
2779 | struct array_cache *ac; | 2785 | struct array_cache *ac; |
2780 | int node; | 2786 | int node; |
2787 | void *list = NULL; | ||
2781 | 2788 | ||
2782 | check_irq_off(); | 2789 | check_irq_off(); |
2783 | node = numa_mem_id(); | 2790 | node = numa_mem_id(); |
2784 | if (unlikely(force_refill)) | 2791 | |
2785 | goto force_grow; | ||
2786 | retry: | 2792 | retry: |
2787 | ac = cpu_cache_get(cachep); | 2793 | ac = cpu_cache_get(cachep); |
2788 | batchcount = ac->batchcount; | 2794 | batchcount = ac->batchcount; |
@@ -2808,7 +2814,7 @@ retry: | |||
2808 | while (batchcount > 0) { | 2814 | while (batchcount > 0) { |
2809 | struct page *page; | 2815 | struct page *page; |
2810 | /* Get the slab the alloc is to come from. */ | 2816 | /* Get the slab the alloc is to come from. */ |
2811 | page = get_first_slab(n); | 2817 | page = get_first_slab(n, false); |
2812 | if (!page) | 2818 | if (!page) |
2813 | goto must_grow; | 2819 | goto must_grow; |
2814 | 2820 | ||
@@ -2826,26 +2832,29 @@ retry: | |||
2826 | STATS_INC_ACTIVE(cachep); | 2832 | STATS_INC_ACTIVE(cachep); |
2827 | STATS_SET_HIGH(cachep); | 2833 | STATS_SET_HIGH(cachep); |
2828 | 2834 | ||
2829 | ac_put_obj(cachep, ac, slab_get_obj(cachep, page, | 2835 | ac->entry[ac->avail++] = slab_get_obj(cachep, page); |
2830 | node)); | ||
2831 | } | 2836 | } |
2832 | 2837 | ||
2833 | /* move slabp to correct slabp list: */ | 2838 | fixup_slab_list(cachep, n, page, &list); |
2834 | list_del(&page->lru); | ||
2835 | if (page->active == cachep->num) | ||
2836 | list_add(&page->lru, &n->slabs_full); | ||
2837 | else | ||
2838 | list_add(&page->lru, &n->slabs_partial); | ||
2839 | } | 2839 | } |
2840 | 2840 | ||
2841 | must_grow: | 2841 | must_grow: |
2842 | n->free_objects -= ac->avail; | 2842 | n->free_objects -= ac->avail; |
2843 | alloc_done: | 2843 | alloc_done: |
2844 | spin_unlock(&n->list_lock); | 2844 | spin_unlock(&n->list_lock); |
2845 | fixup_objfreelist_debug(cachep, &list); | ||
2845 | 2846 | ||
2846 | if (unlikely(!ac->avail)) { | 2847 | if (unlikely(!ac->avail)) { |
2847 | int x; | 2848 | int x; |
2848 | force_grow: | 2849 | |
2850 | /* Check if we can use obj in pfmemalloc slab */ | ||
2851 | if (sk_memalloc_socks()) { | ||
2852 | void *obj = cache_alloc_pfmemalloc(cachep, n, flags); | ||
2853 | |||
2854 | if (obj) | ||
2855 | return obj; | ||
2856 | } | ||
2857 | |||
2849 | x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); | 2858 | x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); |
2850 | 2859 | ||
2851 | /* cache_grow can reenable interrupts, then ac could change. */ | 2860 | /* cache_grow can reenable interrupts, then ac could change. */ |
@@ -2853,7 +2862,7 @@ force_grow: | |||
2853 | node = numa_mem_id(); | 2862 | node = numa_mem_id(); |
2854 | 2863 | ||
2855 | /* no objects in sight? abort */ | 2864 | /* no objects in sight? abort */ |
2856 | if (!x && (ac->avail == 0 || force_refill)) | 2865 | if (!x && ac->avail == 0) |
2857 | return NULL; | 2866 | return NULL; |
2858 | 2867 | ||
2859 | if (!ac->avail) /* objects refilled by interrupt? */ | 2868 | if (!ac->avail) /* objects refilled by interrupt? */ |
@@ -2861,7 +2870,7 @@ force_grow: | |||
2861 | } | 2870 | } |
2862 | ac->touched = 1; | 2871 | ac->touched = 1; |
2863 | 2872 | ||
2864 | return ac_get_obj(cachep, ac, flags, force_refill); | 2873 | return ac->entry[--ac->avail]; |
2865 | } | 2874 | } |
2866 | 2875 | ||
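With the pfmemalloc checks moved to the slab level, the per-CPU array cache goes back to being a plain LIFO: allocation is ac->entry[--ac->avail], free is ac->entry[ac->avail++] = objp, and a refill or flush only happens on underflow or overflow. A toy model of that fast path; the array_cache fields are mimicked and the sizes are arbitrary.

#include <stdio.h>

#define AC_LIMIT 16

struct toy_array_cache {
	unsigned int avail;
	unsigned int limit;
	void *entry[AC_LIMIT];
};

static void *ac_alloc(struct toy_array_cache *ac)
{
	if (!ac->avail)
		return NULL;            /* the real code calls cache_alloc_refill() */
	return ac->entry[--ac->avail];
}

static int ac_free(struct toy_array_cache *ac, void *objp)
{
	if (ac->avail == ac->limit)
		return -1;              /* the real code calls cache_flusharray() */
	ac->entry[ac->avail++] = objp;
	return 0;
}

int main(void)
{
	static int obj;
	struct toy_array_cache ac = { .avail = 0, .limit = AC_LIMIT };

	ac_free(&ac, &obj);
	printf("alloc hit: %p (avail now %u)\n", ac_alloc(&ac), ac.avail);
	printf("alloc miss: %p\n", ac_alloc(&ac));
	return 0;
}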
2867 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 2876 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -2877,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | |||
2877 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | 2886 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
2878 | gfp_t flags, void *objp, unsigned long caller) | 2887 | gfp_t flags, void *objp, unsigned long caller) |
2879 | { | 2888 | { |
2880 | struct page *page; | ||
2881 | |||
2882 | if (!objp) | 2889 | if (!objp) |
2883 | return objp; | 2890 | return objp; |
2884 | if (cachep->flags & SLAB_POISON) { | 2891 | if (cachep->flags & SLAB_POISON) { |
2885 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
2886 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | ||
2887 | kernel_map_pages(virt_to_page(objp), | ||
2888 | cachep->size / PAGE_SIZE, 1); | ||
2889 | else | ||
2890 | check_poison_obj(cachep, objp); | ||
2891 | #else | ||
2892 | check_poison_obj(cachep, objp); | 2892 | check_poison_obj(cachep, objp); |
2893 | #endif | 2893 | slab_kernel_map(cachep, objp, 1, 0); |
2894 | poison_obj(cachep, objp, POISON_INUSE); | 2894 | poison_obj(cachep, objp, POISON_INUSE); |
2895 | } | 2895 | } |
2896 | if (cachep->flags & SLAB_STORE_USER) | 2896 | if (cachep->flags & SLAB_STORE_USER) |
@@ -2910,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
2910 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2910 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2911 | } | 2911 | } |
2912 | 2912 | ||
2913 | page = virt_to_head_page(objp); | ||
2914 | set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); | ||
2915 | objp += obj_offset(cachep); | 2913 | objp += obj_offset(cachep); |
2916 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 2914 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
2917 | cachep->ctor(objp); | 2915 | cachep->ctor(objp); |
@@ -2926,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
2926 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) | 2924 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) |
2927 | #endif | 2925 | #endif |
2928 | 2926 | ||
2929 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
2930 | { | ||
2931 | if (unlikely(cachep == kmem_cache)) | ||
2932 | return false; | ||
2933 | |||
2934 | return should_failslab(cachep->object_size, flags, cachep->flags); | ||
2935 | } | ||
2936 | |||
2937 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 2927 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
2938 | { | 2928 | { |
2939 | void *objp; | 2929 | void *objp; |
2940 | struct array_cache *ac; | 2930 | struct array_cache *ac; |
2941 | bool force_refill = false; | ||
2942 | 2931 | ||
2943 | check_irq_off(); | 2932 | check_irq_off(); |
2944 | 2933 | ||
2945 | ac = cpu_cache_get(cachep); | 2934 | ac = cpu_cache_get(cachep); |
2946 | if (likely(ac->avail)) { | 2935 | if (likely(ac->avail)) { |
2947 | ac->touched = 1; | 2936 | ac->touched = 1; |
2948 | objp = ac_get_obj(cachep, ac, flags, false); | 2937 | objp = ac->entry[--ac->avail]; |
2949 | 2938 | ||
2950 | /* | 2939 | STATS_INC_ALLOCHIT(cachep); |
2951 | * Allow for the possibility all avail objects are not allowed | 2940 | goto out; |
2952 | * by the current flags | ||
2953 | */ | ||
2954 | if (objp) { | ||
2955 | STATS_INC_ALLOCHIT(cachep); | ||
2956 | goto out; | ||
2957 | } | ||
2958 | force_refill = true; | ||
2959 | } | 2941 | } |
2960 | 2942 | ||
2961 | STATS_INC_ALLOCMISS(cachep); | 2943 | STATS_INC_ALLOCMISS(cachep); |
2962 | objp = cache_alloc_refill(cachep, flags, force_refill); | 2944 | objp = cache_alloc_refill(cachep, flags); |
2963 | /* | 2945 | /* |
2964 | * the 'ac' may be updated by cache_alloc_refill(), | 2946 | * the 'ac' may be updated by cache_alloc_refill(), |
2965 | * and kmemleak_erase() requires its correct value. | 2947 | * and kmemleak_erase() requires its correct value. |
@@ -3097,6 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | |||
3097 | struct page *page; | 3079 | struct page *page; |
3098 | struct kmem_cache_node *n; | 3080 | struct kmem_cache_node *n; |
3099 | void *obj; | 3081 | void *obj; |
3082 | void *list = NULL; | ||
3100 | int x; | 3083 | int x; |
3101 | 3084 | ||
3102 | VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); | 3085 | VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); |
@@ -3106,7 +3089,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | |||
3106 | retry: | 3089 | retry: |
3107 | check_irq_off(); | 3090 | check_irq_off(); |
3108 | spin_lock(&n->list_lock); | 3091 | spin_lock(&n->list_lock); |
3109 | page = get_first_slab(n); | 3092 | page = get_first_slab(n, false); |
3110 | if (!page) | 3093 | if (!page) |
3111 | goto must_grow; | 3094 | goto must_grow; |
3112 | 3095 | ||
@@ -3118,17 +3101,13 @@ retry: | |||
3118 | 3101 | ||
3119 | BUG_ON(page->active == cachep->num); | 3102 | BUG_ON(page->active == cachep->num); |
3120 | 3103 | ||
3121 | obj = slab_get_obj(cachep, page, nodeid); | 3104 | obj = slab_get_obj(cachep, page); |
3122 | n->free_objects--; | 3105 | n->free_objects--; |
3123 | /* move slabp to correct slabp list: */ | ||
3124 | list_del(&page->lru); | ||
3125 | 3106 | ||
3126 | if (page->active == cachep->num) | 3107 | fixup_slab_list(cachep, n, page, &list); |
3127 | list_add(&page->lru, &n->slabs_full); | ||
3128 | else | ||
3129 | list_add(&page->lru, &n->slabs_partial); | ||
3130 | 3108 | ||
3131 | spin_unlock(&n->list_lock); | 3109 | spin_unlock(&n->list_lock); |
3110 | fixup_objfreelist_debug(cachep, &list); | ||
3132 | goto done; | 3111 | goto done; |
3133 | 3112 | ||
3134 | must_grow: | 3113 | must_grow: |
@@ -3152,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3152 | int slab_node = numa_mem_id(); | 3131 | int slab_node = numa_mem_id(); |
3153 | 3132 | ||
3154 | flags &= gfp_allowed_mask; | 3133 | flags &= gfp_allowed_mask; |
3155 | 3134 | cachep = slab_pre_alloc_hook(cachep, flags); | |
3156 | lockdep_trace_alloc(flags); | 3135 | if (unlikely(!cachep)) |
3157 | |||
3158 | if (slab_should_failslab(cachep, flags)) | ||
3159 | return NULL; | 3136 | return NULL; |
3160 | 3137 | ||
3161 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3162 | |||
3163 | cache_alloc_debugcheck_before(cachep, flags); | 3138 | cache_alloc_debugcheck_before(cachep, flags); |
3164 | local_irq_save(save_flags); | 3139 | local_irq_save(save_flags); |
3165 | 3140 | ||
@@ -3188,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3188 | out: | 3163 | out: |
3189 | local_irq_restore(save_flags); | 3164 | local_irq_restore(save_flags); |
3190 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3165 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3191 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, | ||
3192 | flags); | ||
3193 | 3166 | ||
3194 | if (likely(ptr)) { | 3167 | if (unlikely(flags & __GFP_ZERO) && ptr) |
3195 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); | 3168 | memset(ptr, 0, cachep->object_size); |
3196 | if (unlikely(flags & __GFP_ZERO)) | ||
3197 | memset(ptr, 0, cachep->object_size); | ||
3198 | } | ||
3199 | 3169 | ||
3200 | memcg_kmem_put_cache(cachep); | 3170 | slab_post_alloc_hook(cachep, flags, 1, &ptr); |
3201 | return ptr; | 3171 | return ptr; |
3202 | } | 3172 | } |
3203 | 3173 | ||
@@ -3240,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3240 | void *objp; | 3210 | void *objp; |
3241 | 3211 | ||
3242 | flags &= gfp_allowed_mask; | 3212 | flags &= gfp_allowed_mask; |
3243 | 3213 | cachep = slab_pre_alloc_hook(cachep, flags); | |
3244 | lockdep_trace_alloc(flags); | 3214 | if (unlikely(!cachep)) |
3245 | |||
3246 | if (slab_should_failslab(cachep, flags)) | ||
3247 | return NULL; | 3215 | return NULL; |
3248 | 3216 | ||
3249 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3250 | |||
3251 | cache_alloc_debugcheck_before(cachep, flags); | 3217 | cache_alloc_debugcheck_before(cachep, flags); |
3252 | local_irq_save(save_flags); | 3218 | local_irq_save(save_flags); |
3253 | objp = __do_cache_alloc(cachep, flags); | 3219 | objp = __do_cache_alloc(cachep, flags); |
3254 | local_irq_restore(save_flags); | 3220 | local_irq_restore(save_flags); |
3255 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3221 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3256 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, | ||
3257 | flags); | ||
3258 | prefetchw(objp); | 3222 | prefetchw(objp); |
3259 | 3223 | ||
3260 | if (likely(objp)) { | 3224 | if (unlikely(flags & __GFP_ZERO) && objp) |
3261 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); | 3225 | memset(objp, 0, cachep->object_size); |
3262 | if (unlikely(flags & __GFP_ZERO)) | ||
3263 | memset(objp, 0, cachep->object_size); | ||
3264 | } | ||
3265 | 3226 | ||
3266 | memcg_kmem_put_cache(cachep); | 3227 | slab_post_alloc_hook(cachep, flags, 1, &objp); |
3267 | return objp; | 3228 | return objp; |
3268 | } | 3229 | } |
3269 | 3230 | ||
@@ -3281,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, | |||
3281 | void *objp; | 3242 | void *objp; |
3282 | struct page *page; | 3243 | struct page *page; |
3283 | 3244 | ||
3284 | clear_obj_pfmemalloc(&objpp[i]); | ||
3285 | objp = objpp[i]; | 3245 | objp = objpp[i]; |
3286 | 3246 | ||
3287 | page = virt_to_head_page(objp); | 3247 | page = virt_to_head_page(objp); |
3288 | list_del(&page->lru); | 3248 | list_del(&page->lru); |
3289 | check_spinlock_acquired_node(cachep, node); | 3249 | check_spinlock_acquired_node(cachep, node); |
3290 | slab_put_obj(cachep, page, objp, node); | 3250 | slab_put_obj(cachep, page, objp); |
3291 | STATS_DEC_ACTIVE(cachep); | 3251 | STATS_DEC_ACTIVE(cachep); |
3292 | n->free_objects++; | 3252 | n->free_objects++; |
3293 | 3253 | ||
@@ -3317,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
3317 | LIST_HEAD(list); | 3277 | LIST_HEAD(list); |
3318 | 3278 | ||
3319 | batchcount = ac->batchcount; | 3279 | batchcount = ac->batchcount; |
3320 | #if DEBUG | 3280 | |
3321 | BUG_ON(!batchcount || batchcount > ac->avail); | ||
3322 | #endif | ||
3323 | check_irq_off(); | 3281 | check_irq_off(); |
3324 | n = get_node(cachep, node); | 3282 | n = get_node(cachep, node); |
3325 | spin_lock(&n->list_lock); | 3283 | spin_lock(&n->list_lock); |
@@ -3389,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3389 | cache_flusharray(cachep, ac); | 3347 | cache_flusharray(cachep, ac); |
3390 | } | 3348 | } |
3391 | 3349 | ||
3392 | ac_put_obj(cachep, ac, objp); | 3350 | if (sk_memalloc_socks()) { |
3351 | struct page *page = virt_to_head_page(objp); | ||
3352 | |||
3353 | if (unlikely(PageSlabPfmemalloc(page))) { | ||
3354 | cache_free_pfmemalloc(cachep, page, objp); | ||
3355 | return; | ||
3356 | } | ||
3357 | } | ||
3358 | |||
3359 | ac->entry[ac->avail++] = objp; | ||
3393 | } | 3360 | } |
3394 | 3361 | ||
3395 | /** | 3362 | /** |
@@ -3411,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3411 | } | 3378 | } |
3412 | EXPORT_SYMBOL(kmem_cache_alloc); | 3379 | EXPORT_SYMBOL(kmem_cache_alloc); |
3413 | 3380 | ||
3414 | void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) | 3381 | static __always_inline void |
3382 | cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, | ||
3383 | size_t size, void **p, unsigned long caller) | ||
3415 | { | 3384 | { |
3416 | __kmem_cache_free_bulk(s, size, p); | 3385 | size_t i; |
3386 | |||
3387 | for (i = 0; i < size; i++) | ||
3388 | p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); | ||
3417 | } | 3389 | } |
3418 | EXPORT_SYMBOL(kmem_cache_free_bulk); | ||
3419 | 3390 | ||
3420 | int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, | 3391 | int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, |
3421 | void **p) | 3392 | void **p) |
3422 | { | 3393 | { |
3423 | return __kmem_cache_alloc_bulk(s, flags, size, p); | 3394 | size_t i; |
3395 | |||
3396 | s = slab_pre_alloc_hook(s, flags); | ||
3397 | if (!s) | ||
3398 | return 0; | ||
3399 | |||
3400 | cache_alloc_debugcheck_before(s, flags); | ||
3401 | |||
3402 | local_irq_disable(); | ||
3403 | for (i = 0; i < size; i++) { | ||
3404 | void *objp = __do_cache_alloc(s, flags); | ||
3405 | |||
3406 | if (unlikely(!objp)) | ||
3407 | goto error; | ||
3408 | p[i] = objp; | ||
3409 | } | ||
3410 | local_irq_enable(); | ||
3411 | |||
3412 | cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); | ||
3413 | |||
3414 | /* Clear memory outside IRQ disabled section */ | ||
3415 | if (unlikely(flags & __GFP_ZERO)) | ||
3416 | for (i = 0; i < size; i++) | ||
3417 | memset(p[i], 0, s->object_size); | ||
3418 | |||
3419 | slab_post_alloc_hook(s, flags, size, p); | ||
3420 | /* FIXME: Trace call missing. Christoph would like a bulk variant */ | ||
3421 | return size; | ||
3422 | error: | ||
3423 | local_irq_enable(); | ||
3424 | cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); | ||
3425 | slab_post_alloc_hook(s, flags, i, p); | ||
3426 | __kmem_cache_free_bulk(s, i, p); | ||
3427 | return 0; | ||
3424 | } | 3428 | } |
3425 | EXPORT_SYMBOL(kmem_cache_alloc_bulk); | 3429 | EXPORT_SYMBOL(kmem_cache_alloc_bulk); |
3426 | 3430 | ||
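kmem_cache_alloc_bulk() now has a native SLAB implementation: one pre-alloc hook for the whole batch, one IRQ-off section looping over __do_cache_alloc(), zeroing done afterwards with interrupts enabled, and a rollback through __kmem_cache_free_bulk() on failure. A hedged user-space skeleton of the same all-or-nothing shape, with malloc() standing in for the per-object allocation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Bulk-allocate @nr objects of @size bytes; all-or-nothing, like the kernel API. */
static size_t bulk_alloc(size_t size, size_t nr, void **p, int zero)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		p[i] = malloc(size);            /* __do_cache_alloc() stand-in */
		if (!p[i])
			goto error;
	}

	if (zero)                               /* done outside the hot loop */
		for (i = 0; i < nr; i++)
			memset(p[i], 0, size);
	return nr;

error:
	while (i--)                             /* roll back what we grabbed */
		free(p[i]);
	return 0;
}

int main(void)
{
	void *objs[4];
	size_t got = bulk_alloc(64, 4, objs, 1);

	printf("got %zu objects\n", got);
	while (got--)
		free(objs[got]);
	return 0;
}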
@@ -3567,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3567 | } | 3571 | } |
3568 | EXPORT_SYMBOL(kmem_cache_free); | 3572 | EXPORT_SYMBOL(kmem_cache_free); |
3569 | 3573 | ||
3574 | void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) | ||
3575 | { | ||
3576 | struct kmem_cache *s; | ||
3577 | size_t i; | ||
3578 | |||
3579 | local_irq_disable(); | ||
3580 | for (i = 0; i < size; i++) { | ||
3581 | void *objp = p[i]; | ||
3582 | |||
3583 | if (!orig_s) /* called via kfree_bulk */ | ||
3584 | s = virt_to_cache(objp); | ||
3585 | else | ||
3586 | s = cache_from_obj(orig_s, objp); | ||
3587 | |||
3588 | debug_check_no_locks_freed(objp, s->object_size); | ||
3589 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | ||
3590 | debug_check_no_obj_freed(objp, s->object_size); | ||
3591 | |||
3592 | __cache_free(s, objp, _RET_IP_); | ||
3593 | } | ||
3594 | local_irq_enable(); | ||
3595 | |||
3596 | /* FIXME: add tracing */ | ||
3597 | } | ||
3598 | EXPORT_SYMBOL(kmem_cache_free_bulk); | ||
3599 | |||
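The matching kmem_cache_free_bulk() disables interrupts once for the whole batch and, when called with a NULL cache (the kfree_bulk() case), looks the owning cache up from each object before freeing it. A sketch of that dispatch; lookup_owner() is a hypothetical stand-in for virt_to_cache()/cache_from_obj(), and free() replaces __cache_free().

#include <stdio.h>
#include <stdlib.h>

struct toy_cache { const char *name; };

/* Hypothetical stand-in for virt_to_cache(): derive the owning cache
 * from the object itself when the caller did not pass one. */
static struct toy_cache *lookup_owner(void *objp)
{
	static struct toy_cache generic = { "looked-up" };

	(void)objp;
	return &generic;
}

static void bulk_free(struct toy_cache *orig, size_t nr, void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		struct toy_cache *s = orig ? orig : lookup_owner(p[i]);

		printf("freeing %p via cache '%s'\n", p[i], s->name);
		free(p[i]);
	}
}

int main(void)
{
	void *objs[2] = { malloc(16), malloc(16) };

	bulk_free(NULL, 2, objs);       /* the kfree_bulk()-style path */
	return 0;
}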
3570 | /** | 3600 | /** |
3571 | * kfree - free previously allocated memory | 3601 | * kfree - free previously allocated memory |
3572 | * @objp: pointer returned by kmalloc. | 3602 | * @objp: pointer returned by kmalloc. |
@@ -4102,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, | |||
4102 | struct page *page) | 4132 | struct page *page) |
4103 | { | 4133 | { |
4104 | void *p; | 4134 | void *p; |
4105 | int i; | 4135 | int i, j; |
4136 | unsigned long v; | ||
4106 | 4137 | ||
4107 | if (n[0] == n[1]) | 4138 | if (n[0] == n[1]) |
4108 | return; | 4139 | return; |
4109 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { | 4140 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { |
4110 | if (get_obj_status(page, i) != OBJECT_ACTIVE) | 4141 | bool active = true; |
4142 | |||
4143 | for (j = page->active; j < c->num; j++) { | ||
4144 | if (get_free_obj(page, j) == i) { | ||
4145 | active = false; | ||
4146 | break; | ||
4147 | } | ||
4148 | } | ||
4149 | |||
4150 | if (!active) | ||
4151 | continue; | ||
4152 | |||
4153 | /* | ||
4154 | * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page table | ||
4155 | * mapping is only established at actual object allocation, so we | ||
4156 | * could otherwise mistakenly access an unmapped object in the cpu | ||
4157 | * cache. | ||
4158 | */ | ||
4159 | if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) | ||
4111 | continue; | 4160 | continue; |
4112 | 4161 | ||
4113 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4162 | if (!add_caller(n, v)) |
4114 | return; | 4163 | return; |
4115 | } | 4164 | } |
4116 | } | 4165 | } |
@@ -4146,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4146 | if (!(cachep->flags & SLAB_RED_ZONE)) | 4195 | if (!(cachep->flags & SLAB_RED_ZONE)) |
4147 | return 0; | 4196 | return 0; |
4148 | 4197 | ||
4149 | /* OK, we can do it */ | 4198 | /* |
4199 | * Set store_user_clean and start to grab stored user information | ||
4200 | * for all objects in this cache. If any alloc/free request comes in | ||
4201 | * during the processing, the information would be wrong, so restart | ||
4202 | * the whole processing. | ||
4203 | */ | ||
4204 | do { | ||
4205 | set_store_user_clean(cachep); | ||
4206 | drain_cpu_caches(cachep); | ||
4150 | 4207 | ||
4151 | x[1] = 0; | 4208 | x[1] = 0; |
4152 | 4209 | ||
4153 | for_each_kmem_cache_node(cachep, node, n) { | 4210 | for_each_kmem_cache_node(cachep, node, n) { |
4154 | 4211 | ||
4155 | check_irq_on(); | 4212 | check_irq_on(); |
4156 | spin_lock_irq(&n->list_lock); | 4213 | spin_lock_irq(&n->list_lock); |
4214 | |||
4215 | list_for_each_entry(page, &n->slabs_full, lru) | ||
4216 | handle_slab(x, cachep, page); | ||
4217 | list_for_each_entry(page, &n->slabs_partial, lru) | ||
4218 | handle_slab(x, cachep, page); | ||
4219 | spin_unlock_irq(&n->list_lock); | ||
4220 | } | ||
4221 | } while (!is_store_user_clean(cachep)); | ||
4157 | 4222 | ||
4158 | list_for_each_entry(page, &n->slabs_full, lru) | ||
4159 | handle_slab(x, cachep, page); | ||
4160 | list_for_each_entry(page, &n->slabs_partial, lru) | ||
4161 | handle_slab(x, cachep, page); | ||
4162 | spin_unlock_irq(&n->list_lock); | ||
4163 | } | ||
4164 | name = cachep->name; | 4223 | name = cachep->name; |
4165 | if (x[0] == x[1]) { | 4224 | if (x[0] == x[1]) { |
4166 | /* Increase the buffer size */ | 4225 | /* Increase the buffer size */ |
@@ -38,6 +38,10 @@ struct kmem_cache { | |||
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | #include <linux/memcontrol.h> | 40 | #include <linux/memcontrol.h> |
41 | #include <linux/fault-inject.h> | ||
42 | #include <linux/kmemcheck.h> | ||
43 | #include <linux/kasan.h> | ||
44 | #include <linux/kmemleak.h> | ||
41 | 45 | ||
42 | /* | 46 | /* |
43 | * State of the slab allocator. | 47 | * State of the slab allocator. |
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
121 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | 125 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) |
122 | #elif defined(CONFIG_SLUB_DEBUG) | 126 | #elif defined(CONFIG_SLUB_DEBUG) |
123 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 127 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
124 | SLAB_TRACE | SLAB_DEBUG_FREE) | 128 | SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) |
125 | #else | 129 | #else |
126 | #define SLAB_DEBUG_FLAGS (0) | 130 | #define SLAB_DEBUG_FLAGS (0) |
127 | #endif | 131 | #endif |
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
168 | /* | 172 | /* |
169 | * Generic implementation of bulk operations | 173 | * Generic implementation of bulk operations |
170 | * These are useful for situations in which the allocator cannot | 174 | * These are useful for situations in which the allocator cannot |
171 | * perform optimizations. In that case segments of the objecct listed | 175 | * perform optimizations. In that case segments of the object listed |
172 | * may be allocated or freed using these operations. | 176 | * may be allocated or freed using these operations. |
173 | */ | 177 | */ |
174 | void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); | 178 | void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); |
@@ -307,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | |||
307 | * to not do even the assignment. In that case, slab_equal_or_root | 311 | * to not do even the assignment. In that case, slab_equal_or_root |
308 | * will also be a constant. | 312 | * will also be a constant. |
309 | */ | 313 | */ |
310 | if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) | 314 | if (!memcg_kmem_enabled() && |
315 | !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) | ||
311 | return s; | 316 | return s; |
312 | 317 | ||
313 | page = virt_to_head_page(x); | 318 | page = virt_to_head_page(x); |
@@ -321,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | |||
321 | return s; | 326 | return s; |
322 | } | 327 | } |
323 | 328 | ||
329 | static inline size_t slab_ksize(const struct kmem_cache *s) | ||
330 | { | ||
331 | #ifndef CONFIG_SLUB | ||
332 | return s->object_size; | ||
333 | |||
334 | #else /* CONFIG_SLUB */ | ||
335 | # ifdef CONFIG_SLUB_DEBUG | ||
336 | /* | ||
337 | * Debugging requires use of the padding between object | ||
338 | * and whatever may come after it. | ||
339 | */ | ||
340 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | ||
341 | return s->object_size; | ||
342 | # endif | ||
343 | /* | ||
344 | * If we have the need to store the freelist pointer | ||
345 | * back there or track user information then we can | ||
346 | * only use the space before that information. | ||
347 | */ | ||
348 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | ||
349 | return s->inuse; | ||
350 | /* | ||
351 | * Else we can use all the padding etc for the allocation | ||
352 | */ | ||
353 | return s->size; | ||
354 | #endif | ||
355 | } | ||
356 | |||
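The slab_ksize() moved into mm/slab.h encodes how much of an object a caller may actually use: with red-zoning or poisoning the debug code owns the padding, with RCU or user tracking the metadata right after the object caps the usable bytes, otherwise the whole stride is fair game. A toy restatement of those rules; the struct fields and values are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct toy_cache {
	size_t object_size;     /* what the caller asked for */
	size_t inuse;           /* offset of freelist pointer / tracking data */
	size_t size;            /* full per-object stride including padding */
	bool red_zone_or_poison;
	bool rcu_or_store_user;
};

/* How many bytes of an object the caller may actually use. */
static size_t toy_ksize(const struct toy_cache *s)
{
	if (s->red_zone_or_poison)      /* debugging owns the padding */
		return s->object_size;
	if (s->rcu_or_store_user)       /* metadata sits right after the object */
		return s->inuse;
	return s->size;                 /* otherwise the whole stride is usable */
}

int main(void)
{
	struct toy_cache c = {
		.object_size = 96, .inuse = 104, .size = 128,
		.red_zone_or_poison = false, .rcu_or_store_user = true,
	};

	printf("usable bytes: %zu\n", toy_ksize(&c));
	return 0;
}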
357 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | ||
358 | gfp_t flags) | ||
359 | { | ||
360 | flags &= gfp_allowed_mask; | ||
361 | lockdep_trace_alloc(flags); | ||
362 | might_sleep_if(gfpflags_allow_blocking(flags)); | ||
363 | |||
364 | if (should_failslab(s, flags)) | ||
365 | return NULL; | ||
366 | |||
367 | return memcg_kmem_get_cache(s, flags); | ||
368 | } | ||
369 | |||
370 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | ||
371 | size_t size, void **p) | ||
372 | { | ||
373 | size_t i; | ||
374 | |||
375 | flags &= gfp_allowed_mask; | ||
376 | for (i = 0; i < size; i++) { | ||
377 | void *object = p[i]; | ||
378 | |||
379 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | ||
380 | kmemleak_alloc_recursive(object, s->object_size, 1, | ||
381 | s->flags, flags); | ||
382 | kasan_slab_alloc(s, object); | ||
383 | } | ||
384 | memcg_kmem_put_cache(s); | ||
385 | } | ||
386 | |||
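slab_pre_alloc_hook() and slab_post_alloc_hook() let SLAB and SLUB share the failslab, memcg, kmemcheck, kmemleak and kasan plumbing instead of open-coding it per allocator: one veto point before the batch, one bookkeeping pass over every returned object after it. A generic before/after-hook skeleton showing that calling convention; the hook bodies below are placeholders, not the kernel ones.

#include <stdio.h>
#include <stdlib.h>

/* Placeholder hooks: the real ones run failslab, memcg, kmemleak, kasan. */
static int pre_alloc_hook(size_t size)
{
	if (size == 0)
		return -1;              /* veto, like should_failslab() */
	return 0;
}

static void post_alloc_hook(void **p, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)        /* per-object bookkeeping goes here */
		printf("tracked object %zu at %p\n", i, p[i]);
}

static size_t hooked_bulk_alloc(size_t size, size_t nr, void **p)
{
	size_t i;

	if (pre_alloc_hook(size))       /* one veto point for the whole batch */
		return 0;

	for (i = 0; i < nr; i++)
		p[i] = malloc(size);

	post_alloc_hook(p, nr);         /* one bookkeeping pass for the batch */
	return nr;
}

int main(void)
{
	void *objs[2];
	size_t n = hooked_bulk_alloc(32, 2, objs);

	while (n--)
		free(objs[n]);
	return 0;
}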
324 | #ifndef CONFIG_SLOB | 387 | #ifndef CONFIG_SLOB |
325 | /* | 388 | /* |
326 | * The slab lists for all objects. | 389 | * The slab lists for all objects. |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 065b7bdabdc3..6afb2263a5c5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) | |||
109 | { | 109 | { |
110 | size_t i; | 110 | size_t i; |
111 | 111 | ||
112 | for (i = 0; i < nr; i++) | 112 | for (i = 0; i < nr; i++) { |
113 | kmem_cache_free(s, p[i]); | 113 | if (s) |
114 | kmem_cache_free(s, p[i]); | ||
115 | else | ||
116 | kfree(p[i]); | ||
117 | } | ||
114 | } | 118 | } |
115 | 119 | ||
116 | int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, | 120 | int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, |
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
124 | #endif | 124 | #endif |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline void *fixup_red_left(struct kmem_cache *s, void *p) | ||
128 | { | ||
129 | if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) | ||
130 | p += s->red_left_pad; | ||
131 | |||
132 | return p; | ||
133 | } | ||
134 | |||
127 | static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | 135 | static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) |
128 | { | 136 | { |
129 | #ifdef CONFIG_SLUB_CPU_PARTIAL | 137 | #ifdef CONFIG_SLUB_CPU_PARTIAL |
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | |||
160 | */ | 168 | */ |
161 | #define MAX_PARTIAL 10 | 169 | #define MAX_PARTIAL 10 |
162 | 170 | ||
163 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | 171 | #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ |
164 | SLAB_POISON | SLAB_STORE_USER) | 172 | SLAB_POISON | SLAB_STORE_USER) |
165 | 173 | ||
166 | /* | 174 | /* |
175 | * These debug flags cannot use CMPXCHG because there might be consistency | ||
176 | * issues when checking or reading debug information | ||
177 | */ | ||
178 | #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ | ||
179 | SLAB_TRACE) | ||
180 | |||
181 | |||
182 | /* | ||
167 | * Debugging flags that require metadata to be stored in the slab. These get | 183 | * Debugging flags that require metadata to be stored in the slab. These get |
168 | * disabled when slub_debug=O is used and a cache's min order increases with | 184 | * disabled when slub_debug=O is used and a cache's min order increases with |
169 | * metadata. | 185 | * metadata. |
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
224 | * Core slab cache functions | 240 | * Core slab cache functions |
225 | *******************************************************************/ | 241 | *******************************************************************/ |
226 | 242 | ||
227 | /* Verify that a pointer has an address that is valid within a slab page */ | ||
228 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
229 | struct page *page, const void *object) | ||
230 | { | ||
231 | void *base; | ||
232 | |||
233 | if (!object) | ||
234 | return 1; | ||
235 | |||
236 | base = page_address(page); | ||
237 | if (object < base || object >= base + page->objects * s->size || | ||
238 | (object - base) % s->size) { | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | return 1; | ||
243 | } | ||
244 | |||
245 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 243 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
246 | { | 244 | { |
247 | return *(void **)(object + s->offset); | 245 | return *(void **)(object + s->offset); |
@@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | |||
271 | 269 | ||
272 | /* Loop over all objects in a slab */ | 270 | /* Loop over all objects in a slab */ |
273 | #define for_each_object(__p, __s, __addr, __objects) \ | 271 | #define for_each_object(__p, __s, __addr, __objects) \ |
274 | for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ | 272 | for (__p = fixup_red_left(__s, __addr); \ |
275 | __p += (__s)->size) | 273 | __p < (__addr) + (__objects) * (__s)->size; \ |
274 | __p += (__s)->size) | ||
276 | 275 | ||
277 | #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ | 276 | #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ |
278 | for (__p = (__addr), __idx = 1; __idx <= __objects;\ | 277 | for (__p = fixup_red_left(__s, __addr), __idx = 1; \ |
279 | __p += (__s)->size, __idx++) | 278 | __idx <= __objects; \ |
279 | __p += (__s)->size, __idx++) | ||
280 | 280 | ||
281 | /* Determine object index from a given position */ | 281 | /* Determine object index from a given position */ |
282 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | 282 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) |
@@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | |||
284 | return (p - addr) / s->size; | 284 | return (p - addr) / s->size; |
285 | } | 285 | } |
286 | 286 | ||
287 | static inline size_t slab_ksize(const struct kmem_cache *s) | ||
288 | { | ||
289 | #ifdef CONFIG_SLUB_DEBUG | ||
290 | /* | ||
291 | * Debugging requires use of the padding between object | ||
292 | * and whatever may come after it. | ||
293 | */ | ||
294 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | ||
295 | return s->object_size; | ||
296 | |||
297 | #endif | ||
298 | /* | ||
299 | * If we have the need to store the freelist pointer | ||
300 | * back there or track user information then we can | ||
301 | * only use the space before that information. | ||
302 | */ | ||
303 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | ||
304 | return s->inuse; | ||
305 | /* | ||
306 | * Else we can use all the padding etc for the allocation | ||
307 | */ | ||
308 | return s->size; | ||
309 | } | ||
310 | |||
311 | static inline int order_objects(int order, unsigned long size, int reserved) | 287 | static inline int order_objects(int order, unsigned long size, int reserved) |
312 | { | 288 | { |
313 | return ((PAGE_SIZE << order) - reserved) / size; | 289 | return ((PAGE_SIZE << order) - reserved) / size; |
@@ -458,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | |||
458 | set_bit(slab_index(p, s, addr), map); | 434 | set_bit(slab_index(p, s, addr), map); |
459 | } | 435 | } |
460 | 436 | ||
437 | static inline int size_from_object(struct kmem_cache *s) | ||
438 | { | ||
439 | if (s->flags & SLAB_RED_ZONE) | ||
440 | return s->size - s->red_left_pad; | ||
441 | |||
442 | return s->size; | ||
443 | } | ||
444 | |||
445 | static inline void *restore_red_left(struct kmem_cache *s, void *p) | ||
446 | { | ||
447 | if (s->flags & SLAB_RED_ZONE) | ||
448 | p -= s->red_left_pad; | ||
449 | |||
450 | return p; | ||
451 | } | ||
452 | |||
461 | /* | 453 | /* |
462 | * Debug settings: | 454 | * Debug settings: |
463 | */ | 455 | */ |
@@ -491,6 +483,26 @@ static inline void metadata_access_disable(void) | |||
491 | /* | 483 | /* |
492 | * Object debugging | 484 | * Object debugging |
493 | */ | 485 | */ |
486 | |||
487 | /* Verify that a pointer has an address that is valid within a slab page */ | ||
488 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
489 | struct page *page, void *object) | ||
490 | { | ||
491 | void *base; | ||
492 | |||
493 | if (!object) | ||
494 | return 1; | ||
495 | |||
496 | base = page_address(page); | ||
497 | object = restore_red_left(s, object); | ||
498 | if (object < base || object >= base + page->objects * s->size || | ||
499 | (object - base) % s->size) { | ||
500 | return 0; | ||
501 | } | ||
502 | |||
503 | return 1; | ||
504 | } | ||
505 | |||
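On the SLUB side the new red_left_pad puts a redzone before each object as well, so helpers like fixup_red_left()/restore_red_left() shift pointers across that pad while check_valid_pointer() validates the unshifted slot address. A small pointer-arithmetic sketch of the pad handling; the pad and slot sizes are invented for the example.

#include <stdio.h>

#define RED_LEFT_PAD 16         /* invented pad size for the example */

/* Each stored slot is [left redzone][object ...]; callers see the object. */
static void *fixup_red_left(void *slot)  { return (char *)slot + RED_LEFT_PAD; }
static void *restore_red_left(void *obj) { return (char *)obj - RED_LEFT_PAD; }

int main(void)
{
	unsigned char slab[4][64];              /* four 64-byte slots */
	void *obj = fixup_red_left(slab[1]);    /* what the allocator hands out */
	void *slot = restore_red_left(obj);     /* what validity checks use */

	printf("slot %p, object %p, pad %td bytes\n",
	       (void *)slab[1], obj, (char *)obj - (char *)slot);
	/* A valid object sits on a slot boundary once the pad is removed. */
	printf("aligned to slot: %d\n",
	       ((char *)slot - (char *)slab[0]) % 64 == 0);
	return 0;
}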
494 | static void print_section(char *text, u8 *addr, unsigned int length) | 506 | static void print_section(char *text, u8 *addr, unsigned int length) |
495 | { | 507 | { |
496 | metadata_access_enable(); | 508 | metadata_access_enable(); |
@@ -630,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
630 | pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", | 642 | pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", |
631 | p, p - addr, get_freepointer(s, p)); | 643 | p, p - addr, get_freepointer(s, p)); |
632 | 644 | ||
633 | if (p > addr + 16) | 645 | if (s->flags & SLAB_RED_ZONE) |
646 | print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); | ||
647 | else if (p > addr + 16) | ||
634 | print_section("Bytes b4 ", p - 16, 16); | 648 | print_section("Bytes b4 ", p - 16, 16); |
635 | 649 | ||
636 | print_section("Object ", p, min_t(unsigned long, s->object_size, | 650 | print_section("Object ", p, min_t(unsigned long, s->object_size, |
@@ -647,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
647 | if (s->flags & SLAB_STORE_USER) | 661 | if (s->flags & SLAB_STORE_USER) |
648 | off += 2 * sizeof(struct track); | 662 | off += 2 * sizeof(struct track); |
649 | 663 | ||
650 | if (off != s->size) | 664 | if (off != size_from_object(s)) |
651 | /* Beginning of the filler is the free pointer */ | 665 | /* Beginning of the filler is the free pointer */ |
652 | print_section("Padding ", p + off, s->size - off); | 666 | print_section("Padding ", p + off, size_from_object(s) - off); |
653 | 667 | ||
654 | dump_stack(); | 668 | dump_stack(); |
655 | } | 669 | } |
@@ -679,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
679 | { | 693 | { |
680 | u8 *p = object; | 694 | u8 *p = object; |
681 | 695 | ||
696 | if (s->flags & SLAB_RED_ZONE) | ||
697 | memset(p - s->red_left_pad, val, s->red_left_pad); | ||
698 | |||
682 | if (s->flags & __OBJECT_POISON) { | 699 | if (s->flags & __OBJECT_POISON) { |
683 | memset(p, POISON_FREE, s->object_size - 1); | 700 | memset(p, POISON_FREE, s->object_size - 1); |
684 | p[s->object_size - 1] = POISON_END; | 701 | p[s->object_size - 1] = POISON_END; |
@@ -771,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) | |||
771 | /* We also have user information there */ | 788 | /* We also have user information there */ |
772 | off += 2 * sizeof(struct track); | 789 | off += 2 * sizeof(struct track); |
773 | 790 | ||
774 | if (s->size == off) | 791 | if (size_from_object(s) == off) |
775 | return 1; | 792 | return 1; |
776 | 793 | ||
777 | return check_bytes_and_report(s, page, p, "Object padding", | 794 | return check_bytes_and_report(s, page, p, "Object padding", |
778 | p + off, POISON_INUSE, s->size - off); | 795 | p + off, POISON_INUSE, size_from_object(s) - off); |
779 | } | 796 | } |
780 | 797 | ||
781 | /* Check the pad bytes at the end of a slab page */ | 798 | /* Check the pad bytes at the end of a slab page */ |
@@ -820,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
820 | 837 | ||
821 | if (s->flags & SLAB_RED_ZONE) { | 838 | if (s->flags & SLAB_RED_ZONE) { |
822 | if (!check_bytes_and_report(s, page, object, "Redzone", | 839 | if (!check_bytes_and_report(s, page, object, "Redzone", |
840 | object - s->red_left_pad, val, s->red_left_pad)) | ||
841 | return 0; | ||
842 | |||
843 | if (!check_bytes_and_report(s, page, object, "Redzone", | ||
823 | endobject, val, s->inuse - s->object_size)) | 844 | endobject, val, s->inuse - s->object_size)) |
824 | return 0; | 845 | return 0; |
825 | } else { | 846 | } else { |
@@ -1031,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, | |||
1031 | init_tracking(s, object); | 1052 | init_tracking(s, object); |
1032 | } | 1053 | } |
1033 | 1054 | ||
1034 | static noinline int alloc_debug_processing(struct kmem_cache *s, | 1055 | static inline int alloc_consistency_checks(struct kmem_cache *s, |
1035 | struct page *page, | 1056 | struct page *page, |
1036 | void *object, unsigned long addr) | 1057 | void *object, unsigned long addr) |
1037 | { | 1058 | { |
1038 | if (!check_slab(s, page)) | 1059 | if (!check_slab(s, page)) |
1039 | goto bad; | 1060 | return 0; |
1040 | 1061 | ||
1041 | if (!check_valid_pointer(s, page, object)) { | 1062 | if (!check_valid_pointer(s, page, object)) { |
1042 | object_err(s, page, object, "Freelist Pointer check fails"); | 1063 | object_err(s, page, object, "Freelist Pointer check fails"); |
1043 | goto bad; | 1064 | return 0; |
1044 | } | 1065 | } |
1045 | 1066 | ||
1046 | if (!check_object(s, page, object, SLUB_RED_INACTIVE)) | 1067 | if (!check_object(s, page, object, SLUB_RED_INACTIVE)) |
1047 | goto bad; | 1068 | return 0; |
1069 | |||
1070 | return 1; | ||
1071 | } | ||
1072 | |||
1073 | static noinline int alloc_debug_processing(struct kmem_cache *s, | ||
1074 | struct page *page, | ||
1075 | void *object, unsigned long addr) | ||
1076 | { | ||
1077 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { | ||
1078 | if (!alloc_consistency_checks(s, page, object, addr)) | ||
1079 | goto bad; | ||
1080 | } | ||
1048 | 1081 | ||
1049 | /* Success perform special debug activities for allocs */ | 1082 | /* Success perform special debug activities for allocs */ |
1050 | if (s->flags & SLAB_STORE_USER) | 1083 | if (s->flags & SLAB_STORE_USER) |
@@ -1067,37 +1100,21 @@ bad: | |||
1067 | return 0; | 1100 | return 0; |
1068 | } | 1101 | } |
1069 | 1102 | ||
1070 | /* Supports checking bulk free of a constructed freelist */ | 1103 | static inline int free_consistency_checks(struct kmem_cache *s, |
1071 | static noinline struct kmem_cache_node *free_debug_processing( | 1104 | struct page *page, void *object, unsigned long addr) |
1072 | struct kmem_cache *s, struct page *page, | ||
1073 | void *head, void *tail, int bulk_cnt, | ||
1074 | unsigned long addr, unsigned long *flags) | ||
1075 | { | 1105 | { |
1076 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1077 | void *object = head; | ||
1078 | int cnt = 0; | ||
1079 | |||
1080 | spin_lock_irqsave(&n->list_lock, *flags); | ||
1081 | slab_lock(page); | ||
1082 | |||
1083 | if (!check_slab(s, page)) | ||
1084 | goto fail; | ||
1085 | |||
1086 | next_object: | ||
1087 | cnt++; | ||
1088 | |||
1089 | if (!check_valid_pointer(s, page, object)) { | 1106 | if (!check_valid_pointer(s, page, object)) { |
1090 | slab_err(s, page, "Invalid object pointer 0x%p", object); | 1107 | slab_err(s, page, "Invalid object pointer 0x%p", object); |
1091 | goto fail; | 1108 | return 0; |
1092 | } | 1109 | } |
1093 | 1110 | ||
1094 | if (on_freelist(s, page, object)) { | 1111 | if (on_freelist(s, page, object)) { |
1095 | object_err(s, page, object, "Object already free"); | 1112 | object_err(s, page, object, "Object already free"); |
1096 | goto fail; | 1113 | return 0; |
1097 | } | 1114 | } |
1098 | 1115 | ||
1099 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1116 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1100 | goto out; | 1117 | return 0; |
1101 | 1118 | ||
1102 | if (unlikely(s != page->slab_cache)) { | 1119 | if (unlikely(s != page->slab_cache)) { |
1103 | if (!PageSlab(page)) { | 1120 | if (!PageSlab(page)) { |
@@ -1110,7 +1127,37 @@ next_object: | |||
1110 | } else | 1127 | } else |
1111 | object_err(s, page, object, | 1128 | object_err(s, page, object, |
1112 | "page slab pointer corrupt."); | 1129 | "page slab pointer corrupt."); |
1113 | goto fail; | 1130 | return 0; |
1131 | } | ||
1132 | return 1; | ||
1133 | } | ||
1134 | |||
1135 | /* Supports checking bulk free of a constructed freelist */ | ||
1136 | static noinline int free_debug_processing( | ||
1137 | struct kmem_cache *s, struct page *page, | ||
1138 | void *head, void *tail, int bulk_cnt, | ||
1139 | unsigned long addr) | ||
1140 | { | ||
1141 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1142 | void *object = head; | ||
1143 | int cnt = 0; | ||
1144 | unsigned long uninitialized_var(flags); | ||
1145 | int ret = 0; | ||
1146 | |||
1147 | spin_lock_irqsave(&n->list_lock, flags); | ||
1148 | slab_lock(page); | ||
1149 | |||
1150 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { | ||
1151 | if (!check_slab(s, page)) | ||
1152 | goto out; | ||
1153 | } | ||
1154 | |||
1155 | next_object: | ||
1156 | cnt++; | ||
1157 | |||
1158 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { | ||
1159 | if (!free_consistency_checks(s, page, object, addr)) | ||
1160 | goto out; | ||
1114 | } | 1161 | } |
1115 | 1162 | ||
1116 | if (s->flags & SLAB_STORE_USER) | 1163 | if (s->flags & SLAB_STORE_USER) |
@@ -1124,23 +1171,18 @@ next_object: | |||
1124 | object = get_freepointer(s, object); | 1171 | object = get_freepointer(s, object); |
1125 | goto next_object; | 1172 | goto next_object; |
1126 | } | 1173 | } |
1174 | ret = 1; | ||
1175 | |||
1127 | out: | 1176 | out: |
1128 | if (cnt != bulk_cnt) | 1177 | if (cnt != bulk_cnt) |
1129 | slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", | 1178 | slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", |
1130 | bulk_cnt, cnt); | 1179 | bulk_cnt, cnt); |
1131 | 1180 | ||
1132 | slab_unlock(page); | 1181 | slab_unlock(page); |
1133 | /* | 1182 | spin_unlock_irqrestore(&n->list_lock, flags); |
1134 | * Keep node_lock to preserve integrity | 1183 | if (!ret) |
1135 | * until the object is actually freed | 1184 | slab_fix(s, "Object at 0x%p not freed", object); |
1136 | */ | 1185 | return ret; |
1137 | return n; | ||
1138 | |||
1139 | fail: | ||
1140 | slab_unlock(page); | ||
1141 | spin_unlock_irqrestore(&n->list_lock, *flags); | ||
1142 | slab_fix(s, "Object at 0x%p not freed", object); | ||
1143 | return NULL; | ||
1144 | } | 1186 | } |
1145 | 1187 | ||
1146 | static int __init setup_slub_debug(char *str) | 1188 | static int __init setup_slub_debug(char *str) |
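The reworked free_debug_processing() now takes and releases the node's list_lock itself and reports success as a plain int instead of returning the locked node. Stripped of locking, user tracking and tracing, the walk over the detached freelist in the hunk above reduces to roughly the following (a sketch, not a drop-in replacement):

/* Count head..tail and consistency-check each object when enabled. */
static int bulk_free_walk(struct kmem_cache *s, struct page *page,
			  void *head, void *tail, int bulk_cnt,
			  unsigned long addr)
{
	void *object = head;
	int cnt = 0;

	for (;;) {
		cnt++;
		if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
		    !free_consistency_checks(s, page, object, addr))
			return 0;
		if (object == tail)
			break;
		object = get_freepointer(s, object);
	}
	if (cnt != bulk_cnt)
		slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
			 bulk_cnt, cnt);
	return 1;
}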
@@ -1172,7 +1214,7 @@ static int __init setup_slub_debug(char *str) | |||
1172 | for (; *str && *str != ','; str++) { | 1214 | for (; *str && *str != ','; str++) { |
1173 | switch (tolower(*str)) { | 1215 | switch (tolower(*str)) { |
1174 | case 'f': | 1216 | case 'f': |
1175 | slub_debug |= SLAB_DEBUG_FREE; | 1217 | slub_debug |= SLAB_CONSISTENCY_CHECKS; |
1176 | break; | 1218 | break; |
1177 | case 'z': | 1219 | case 'z': |
1178 | slub_debug |= SLAB_RED_ZONE; | 1220 | slub_debug |= SLAB_RED_ZONE; |
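These single-character switches are the ones parsed from the slub_debug= boot parameter, so renaming SLAB_DEBUG_FREE to SLAB_CONSISTENCY_CHECKS changes only which flag bit 'f' sets, not the user interface. Illustrative usage, per the SLUB documentation of this era:

    slub_debug=FZ,dentry
    slub_debug=FZPU

The first line enables consistency checks and red zoning for the dentry cache only; the second enables checks, red zoning, poisoning and user tracking for every cache.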
@@ -1231,10 +1273,10 @@ static inline void setup_object_debug(struct kmem_cache *s, | |||
1231 | static inline int alloc_debug_processing(struct kmem_cache *s, | 1273 | static inline int alloc_debug_processing(struct kmem_cache *s, |
1232 | struct page *page, void *object, unsigned long addr) { return 0; } | 1274 | struct page *page, void *object, unsigned long addr) { return 0; } |
1233 | 1275 | ||
1234 | static inline struct kmem_cache_node *free_debug_processing( | 1276 | static inline int free_debug_processing( |
1235 | struct kmem_cache *s, struct page *page, | 1277 | struct kmem_cache *s, struct page *page, |
1236 | void *head, void *tail, int bulk_cnt, | 1278 | void *head, void *tail, int bulk_cnt, |
1237 | unsigned long addr, unsigned long *flags) { return NULL; } | 1279 | unsigned long addr) { return 0; } |
1238 | 1280 | ||
1239 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | 1281 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) |
1240 | { return 1; } | 1282 | { return 1; } |
@@ -1281,36 +1323,6 @@ static inline void kfree_hook(const void *x) | |||
1281 | kasan_kfree_large(x); | 1323 | kasan_kfree_large(x); |
1282 | } | 1324 | } |
1283 | 1325 | ||
1284 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | ||
1285 | gfp_t flags) | ||
1286 | { | ||
1287 | flags &= gfp_allowed_mask; | ||
1288 | lockdep_trace_alloc(flags); | ||
1289 | might_sleep_if(gfpflags_allow_blocking(flags)); | ||
1290 | |||
1291 | if (should_failslab(s->object_size, flags, s->flags)) | ||
1292 | return NULL; | ||
1293 | |||
1294 | return memcg_kmem_get_cache(s, flags); | ||
1295 | } | ||
1296 | |||
1297 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | ||
1298 | size_t size, void **p) | ||
1299 | { | ||
1300 | size_t i; | ||
1301 | |||
1302 | flags &= gfp_allowed_mask; | ||
1303 | for (i = 0; i < size; i++) { | ||
1304 | void *object = p[i]; | ||
1305 | |||
1306 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | ||
1307 | kmemleak_alloc_recursive(object, s->object_size, 1, | ||
1308 | s->flags, flags); | ||
1309 | kasan_slab_alloc(s, object); | ||
1310 | } | ||
1311 | memcg_kmem_put_cache(s); | ||
1312 | } | ||
1313 | |||
1314 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1326 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
1315 | { | 1327 | { |
1316 | kmemleak_free_recursive(x, s->flags); | 1328 | kmemleak_free_recursive(x, s->flags); |
@@ -1470,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1470 | set_freepointer(s, p, NULL); | 1482 | set_freepointer(s, p, NULL); |
1471 | } | 1483 | } |
1472 | 1484 | ||
1473 | page->freelist = start; | 1485 | page->freelist = fixup_red_left(s, start); |
1474 | page->inuse = page->objects; | 1486 | page->inuse = page->objects; |
1475 | page->frozen = 1; | 1487 | page->frozen = 1; |
1476 | 1488 | ||
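With a left red zone in place, the first usable object no longer starts at the beginning of the slab, so the initial freelist pointer has to be nudged past the pad; that is the whole job of fixup_red_left(). A sketch of what that helper must look like, reconstructed from its call sites rather than quoted from the patch:

static inline void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug(s) && (s->flags & SLAB_RED_ZONE))
		p += s->red_left_pad;	/* skip the left red zone */

	return p;
}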
@@ -1506,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1506 | int order = compound_order(page); | 1518 | int order = compound_order(page); |
1507 | int pages = 1 << order; | 1519 | int pages = 1 << order; |
1508 | 1520 | ||
1509 | if (kmem_cache_debug(s)) { | 1521 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { |
1510 | void *p; | 1522 | void *p; |
1511 | 1523 | ||
1512 | slab_pad_check(s, page); | 1524 | slab_pad_check(s, page); |
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2224 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) | 2236 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) |
2225 | return; | 2237 | return; |
2226 | 2238 | ||
2227 | pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2239 | pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", |
2228 | nid, gfpflags); | 2240 | nid, gfpflags, &gfpflags); |
2229 | pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", | 2241 | pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", |
2230 | s->name, s->object_size, s->size, oo_order(s->oo), | 2242 | s->name, s->object_size, s->size, oo_order(s->oo), |
2231 | oo_order(s->min)); | 2243 | oo_order(s->min)); |
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2642 | stat(s, FREE_SLOWPATH); | 2654 | stat(s, FREE_SLOWPATH); |
2643 | 2655 | ||
2644 | if (kmem_cache_debug(s) && | 2656 | if (kmem_cache_debug(s) && |
2645 | !(n = free_debug_processing(s, page, head, tail, cnt, | 2657 | !free_debug_processing(s, page, head, tail, cnt, addr)) |
2646 | addr, &flags))) | ||
2647 | return; | 2658 | return; |
2648 | 2659 | ||
2649 | do { | 2660 | do { |
@@ -2815,6 +2826,7 @@ struct detached_freelist { | |||
2815 | void *tail; | 2826 | void *tail; |
2816 | void *freelist; | 2827 | void *freelist; |
2817 | int cnt; | 2828 | int cnt; |
2829 | struct kmem_cache *s; | ||
2818 | }; | 2830 | }; |
2819 | 2831 | ||
2820 | /* | 2832 | /* |
@@ -2829,26 +2841,45 @@ struct detached_freelist { | |||
2829 | * synchronization primitive. Look ahead in the array is limited due | 2841 | * synchronization primitive. Look ahead in the array is limited due |
2830 | * to performance reasons. | 2842 | * to performance reasons. |
2831 | */ | 2843 | */ |
2832 | static int build_detached_freelist(struct kmem_cache *s, size_t size, | 2844 | static inline |
2833 | void **p, struct detached_freelist *df) | 2845 | int build_detached_freelist(struct kmem_cache *s, size_t size, |
2846 | void **p, struct detached_freelist *df) | ||
2834 | { | 2847 | { |
2835 | size_t first_skipped_index = 0; | 2848 | size_t first_skipped_index = 0; |
2836 | int lookahead = 3; | 2849 | int lookahead = 3; |
2837 | void *object; | 2850 | void *object; |
2851 | struct page *page; | ||
2838 | 2852 | ||
2839 | /* Always re-init detached_freelist */ | 2853 | /* Always re-init detached_freelist */ |
2840 | df->page = NULL; | 2854 | df->page = NULL; |
2841 | 2855 | ||
2842 | do { | 2856 | do { |
2843 | object = p[--size]; | 2857 | object = p[--size]; |
2858 | /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ | ||
2844 | } while (!object && size); | 2859 | } while (!object && size); |
2845 | 2860 | ||
2846 | if (!object) | 2861 | if (!object) |
2847 | return 0; | 2862 | return 0; |
2848 | 2863 | ||
2864 | page = virt_to_head_page(object); | ||
2865 | if (!s) { | ||
2866 | /* Handle kmalloc'ed objects */ | ||
2867 | if (unlikely(!PageSlab(page))) { | ||
2868 | BUG_ON(!PageCompound(page)); | ||
2869 | kfree_hook(object); | ||
2870 | __free_kmem_pages(page, compound_order(page)); | ||
2871 | p[size] = NULL; /* mark object processed */ | ||
2872 | return size; | ||
2873 | } | ||
2874 | /* Derive kmem_cache from object */ | ||
2875 | df->s = page->slab_cache; | ||
2876 | } else { | ||
2877 | df->s = cache_from_obj(s, object); /* Support for memcg */ | ||
2878 | } | ||
2879 | |||
2849 | /* Start new detached freelist */ | 2880 | /* Start new detached freelist */ |
2850 | set_freepointer(s, object, NULL); | 2881 | df->page = page; |
2851 | df->page = virt_to_head_page(object); | 2882 | set_freepointer(df->s, object, NULL); |
2852 | df->tail = object; | 2883 | df->tail = object; |
2853 | df->freelist = object; | 2884 | df->freelist = object; |
2854 | p[size] = NULL; /* mark object processed */ | 2885 | p[size] = NULL; /* mark object processed */ |
@@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, | |||
2862 | /* df->page is always set at this point */ | 2893 | /* df->page is always set at this point */ |
2863 | if (df->page == virt_to_head_page(object)) { | 2894 | if (df->page == virt_to_head_page(object)) { |
2864 | /* Opportunity build freelist */ | 2895 | /* Opportunity build freelist */ |
2865 | set_freepointer(s, object, df->freelist); | 2896 | set_freepointer(df->s, object, df->freelist); |
2866 | df->freelist = object; | 2897 | df->freelist = object; |
2867 | df->cnt++; | 2898 | df->cnt++; |
2868 | p[size] = NULL; /* mark object processed */ | 2899 | p[size] = NULL; /* mark object processed */ |
@@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, | |||
2881 | return first_skipped_index; | 2912 | return first_skipped_index; |
2882 | } | 2913 | } |
2883 | 2914 | ||
2884 | |||
2885 | /* Note that interrupts must be enabled when calling this function. */ | 2915 | /* Note that interrupts must be enabled when calling this function. */ |
2886 | void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) | 2916 | void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) |
2887 | { | 2917 | { |
2888 | if (WARN_ON(!size)) | 2918 | if (WARN_ON(!size)) |
2889 | return; | 2919 | return; |
2890 | 2920 | ||
2891 | do { | 2921 | do { |
2892 | struct detached_freelist df; | 2922 | struct detached_freelist df; |
2893 | struct kmem_cache *s; | ||
2894 | |||
2895 | /* Support for memcg */ | ||
2896 | s = cache_from_obj(orig_s, p[size - 1]); | ||
2897 | 2923 | ||
2898 | size = build_detached_freelist(s, size, p, &df); | 2924 | size = build_detached_freelist(s, size, p, &df); |
2899 | if (unlikely(!df.page)) | 2925 | if (unlikely(!df.page)) |
2900 | continue; | 2926 | continue; |
2901 | 2927 | ||
2902 | slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); | 2928 | slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); |
2903 | } while (likely(size)); | 2929 | } while (likely(size)); |
2904 | } | 2930 | } |
2905 | EXPORT_SYMBOL(kmem_cache_free_bulk); | 2931 | EXPORT_SYMBOL(kmem_cache_free_bulk); |
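Moving the cache lookup into build_detached_freelist() means the cache is now derived per object, which is what lets callers hand kmem_cache_free_bulk() a NULL cache (and therefore mixed kmalloc allocations) and still have each object routed to the right place. A minimal usage sketch of the bulk API touched here; my_cache is a stand-in name and error handling is elided:

void *objs[16];
int filled;

/* kmem_cache_alloc_bulk() is all-or-nothing: non-zero means every slot was filled. */
filled = kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
if (filled)
	kmem_cache_free_bulk(my_cache, ARRAY_SIZE(objs), objs);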
@@ -3285,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3285 | */ | 3311 | */ |
3286 | size += 2 * sizeof(struct track); | 3312 | size += 2 * sizeof(struct track); |
3287 | 3313 | ||
3288 | if (flags & SLAB_RED_ZONE) | 3314 | if (flags & SLAB_RED_ZONE) { |
3289 | /* | 3315 | /* |
3290 | * Add some empty padding so that we can catch | 3316 | * Add some empty padding so that we can catch |
3291 | * overwrites from earlier objects rather than let | 3317 | * overwrites from earlier objects rather than let |
@@ -3294,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3294 | * of the object. | 3320 | * of the object. |
3295 | */ | 3321 | */ |
3296 | size += sizeof(void *); | 3322 | size += sizeof(void *); |
3323 | |||
3324 | s->red_left_pad = sizeof(void *); | ||
3325 | s->red_left_pad = ALIGN(s->red_left_pad, s->align); | ||
3326 | size += s->red_left_pad; | ||
3327 | } | ||
3297 | #endif | 3328 | #endif |
3298 | 3329 | ||
3299 | /* | 3330 | /* |
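The cost of the new left pad is one pointer-sized stripe rounded up to the cache's alignment, on top of the pointer already added for the right-hand red zone. For a feel of the numbers (illustrative, not taken from the patch):

/*
 * red_left_pad = ALIGN(sizeof(void *), s->align), so on a 64-bit build:
 *   s->align =  8  ->  8 extra bytes per object
 *   s->align = 64  -> 64 extra bytes per object (cacheline-aligned caches)
 */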
@@ -3357,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3357 | 3388 | ||
3358 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ | 3389 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ |
3359 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | 3390 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) |
3360 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | 3391 | if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) |
3361 | /* Enable fast mode */ | 3392 | /* Enable fast mode */ |
3362 | s->flags |= __CMPXCHG_DOUBLE; | 3393 | s->flags |= __CMPXCHG_DOUBLE; |
3363 | #endif | 3394 | #endif |
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects); | |||
4812 | 4843 | ||
4813 | static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) | 4844 | static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) |
4814 | { | 4845 | { |
4815 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); | 4846 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); |
4816 | } | 4847 | } |
4817 | 4848 | ||
4818 | static ssize_t sanity_checks_store(struct kmem_cache *s, | 4849 | static ssize_t sanity_checks_store(struct kmem_cache *s, |
4819 | const char *buf, size_t length) | 4850 | const char *buf, size_t length) |
4820 | { | 4851 | { |
4821 | s->flags &= ~SLAB_DEBUG_FREE; | 4852 | s->flags &= ~SLAB_CONSISTENCY_CHECKS; |
4822 | if (buf[0] == '1') { | 4853 | if (buf[0] == '1') { |
4823 | s->flags &= ~__CMPXCHG_DOUBLE; | 4854 | s->flags &= ~__CMPXCHG_DOUBLE; |
4824 | s->flags |= SLAB_DEBUG_FREE; | 4855 | s->flags |= SLAB_CONSISTENCY_CHECKS; |
4825 | } | 4856 | } |
4826 | return length; | 4857 | return length; |
4827 | } | 4858 | } |
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4865 | 4896 | ||
4866 | s->flags &= ~SLAB_RED_ZONE; | 4897 | s->flags &= ~SLAB_RED_ZONE; |
4867 | if (buf[0] == '1') { | 4898 | if (buf[0] == '1') { |
4868 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4869 | s->flags |= SLAB_RED_ZONE; | 4899 | s->flags |= SLAB_RED_ZONE; |
4870 | } | 4900 | } |
4871 | calculate_sizes(s, -1); | 4901 | calculate_sizes(s, -1); |
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4886 | 4916 | ||
4887 | s->flags &= ~SLAB_POISON; | 4917 | s->flags &= ~SLAB_POISON; |
4888 | if (buf[0] == '1') { | 4918 | if (buf[0] == '1') { |
4889 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4890 | s->flags |= SLAB_POISON; | 4919 | s->flags |= SLAB_POISON; |
4891 | } | 4920 | } |
4892 | calculate_sizes(s, -1); | 4921 | calculate_sizes(s, -1); |
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5356 | *p++ = 'd'; | 5385 | *p++ = 'd'; |
5357 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | 5386 | if (s->flags & SLAB_RECLAIM_ACCOUNT) |
5358 | *p++ = 'a'; | 5387 | *p++ = 'a'; |
5359 | if (s->flags & SLAB_DEBUG_FREE) | 5388 | if (s->flags & SLAB_CONSISTENCY_CHECKS) |
5360 | *p++ = 'F'; | 5389 | *p++ = 'F'; |
5361 | if (!(s->flags & SLAB_NOTRACK)) | 5390 | if (!(s->flags & SLAB_NOTRACK)) |
5362 | *p++ = 't'; | 5391 | *p++ = 't'; |
diff --git a/mm/truncate.c b/mm/truncate.c index e3ee0e27cd17..7598b552ae03 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); | |||
519 | static int | 519 | static int |
520 | invalidate_complete_page2(struct address_space *mapping, struct page *page) | 520 | invalidate_complete_page2(struct address_space *mapping, struct page *page) |
521 | { | 521 | { |
522 | struct mem_cgroup *memcg; | ||
523 | unsigned long flags; | 522 | unsigned long flags; |
524 | 523 | ||
525 | if (page->mapping != mapping) | 524 | if (page->mapping != mapping) |
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
528 | if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) | 527 | if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) |
529 | return 0; | 528 | return 0; |
530 | 529 | ||
531 | memcg = mem_cgroup_begin_page_stat(page); | ||
532 | spin_lock_irqsave(&mapping->tree_lock, flags); | 530 | spin_lock_irqsave(&mapping->tree_lock, flags); |
533 | if (PageDirty(page)) | 531 | if (PageDirty(page)) |
534 | goto failed; | 532 | goto failed; |
535 | 533 | ||
536 | BUG_ON(page_has_private(page)); | 534 | BUG_ON(page_has_private(page)); |
537 | __delete_from_page_cache(page, NULL, memcg); | 535 | __delete_from_page_cache(page, NULL); |
538 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 536 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
539 | mem_cgroup_end_page_stat(memcg); | ||
540 | 537 | ||
541 | if (mapping->a_ops->freepage) | 538 | if (mapping->a_ops->freepage) |
542 | mapping->a_ops->freepage(page); | 539 | mapping->a_ops->freepage(page); |
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
545 | return 1; | 542 | return 1; |
546 | failed: | 543 | failed: |
547 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 544 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
548 | mem_cgroup_end_page_stat(memcg); | ||
549 | return 0; | 545 | return 0; |
550 | } | 546 | } |
551 | 547 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 71b1c29948db..dd984470248f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) | |||
195 | { | 195 | { |
196 | unsigned long nr; | 196 | unsigned long nr; |
197 | 197 | ||
198 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 198 | nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + |
199 | zone_page_state(zone, NR_INACTIVE_FILE) + | 199 | zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + |
200 | zone_page_state(zone, NR_ISOLATED_FILE); | 200 | zone_page_state_snapshot(zone, NR_ISOLATED_FILE); |
201 | 201 | ||
202 | if (get_nr_swap_pages() > 0) | 202 | if (get_nr_swap_pages() > 0) |
203 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 203 | nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + |
204 | zone_page_state(zone, NR_INACTIVE_ANON) + | 204 | zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + |
205 | zone_page_state(zone, NR_ISOLATED_ANON); | 205 | zone_page_state_snapshot(zone, NR_ISOLATED_ANON); |
206 | 206 | ||
207 | return nr; | 207 | return nr; |
208 | } | 208 | } |
209 | 209 | ||
210 | bool zone_reclaimable(struct zone *zone) | 210 | bool zone_reclaimable(struct zone *zone) |
211 | { | 211 | { |
212 | return zone_page_state(zone, NR_PAGES_SCANNED) < | 212 | return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < |
213 | zone_reclaimable_pages(zone) * 6; | 213 | zone_reclaimable_pages(zone) * 6; |
214 | } | 214 | } |
215 | 215 | ||
216 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 216 | unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) |
217 | { | 217 | { |
218 | if (!mem_cgroup_disabled()) | 218 | if (!mem_cgroup_disabled()) |
219 | return mem_cgroup_get_lru_size(lruvec, lru); | 219 | return mem_cgroup_get_lru_size(lruvec, lru); |
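Switching to zone_page_state_snapshot() matters because the plain per-zone counters can lag behind by the per-cpu deltas, which is enough to mis-judge whether a nearly-drained zone is still reclaimable. The snapshot variant folds those deltas back in, roughly as below (reconstructed from include/linux/vmstat.h of this era; treat it as a sketch, not a quotation):

static inline unsigned long zone_page_state_snapshot(struct zone *zone,
						     enum zone_stat_item item)
{
	long x = atomic_long_read(&zone->vm_stat[item]);
#ifdef CONFIG_SMP
	int cpu;

	for_each_online_cpu(cpu)
		x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];

	if (x < 0)
		x = 0;
#endif
	return x;
}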
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) | |||
228 | { | 228 | { |
229 | size_t size = sizeof(*shrinker->nr_deferred); | 229 | size_t size = sizeof(*shrinker->nr_deferred); |
230 | 230 | ||
231 | /* | ||
232 | * If we only have one possible node in the system anyway, save | ||
233 | * ourselves the trouble and disable NUMA aware behavior. This way we | ||
234 | * will save memory and some small loop time later. | ||
235 | */ | ||
236 | if (nr_node_ids == 1) | ||
237 | shrinker->flags &= ~SHRINKER_NUMA_AWARE; | ||
238 | |||
239 | if (shrinker->flags & SHRINKER_NUMA_AWARE) | 231 | if (shrinker->flags & SHRINKER_NUMA_AWARE) |
240 | size *= nr_node_ids; | 232 | size *= nr_node_ids; |
241 | 233 | ||
@@ -611,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
611 | bool reclaimed) | 603 | bool reclaimed) |
612 | { | 604 | { |
613 | unsigned long flags; | 605 | unsigned long flags; |
614 | struct mem_cgroup *memcg; | ||
615 | 606 | ||
616 | BUG_ON(!PageLocked(page)); | 607 | BUG_ON(!PageLocked(page)); |
617 | BUG_ON(mapping != page_mapping(page)); | 608 | BUG_ON(mapping != page_mapping(page)); |
618 | 609 | ||
619 | memcg = mem_cgroup_begin_page_stat(page); | ||
620 | spin_lock_irqsave(&mapping->tree_lock, flags); | 610 | spin_lock_irqsave(&mapping->tree_lock, flags); |
621 | /* | 611 | /* |
622 | * The non racy check for a busy page. | 612 | * The non racy check for a busy page. |
@@ -656,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
656 | mem_cgroup_swapout(page, swap); | 646 | mem_cgroup_swapout(page, swap); |
657 | __delete_from_swap_cache(page); | 647 | __delete_from_swap_cache(page); |
658 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 648 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
659 | mem_cgroup_end_page_stat(memcg); | ||
660 | swapcache_free(swap); | 649 | swapcache_free(swap); |
661 | } else { | 650 | } else { |
662 | void (*freepage)(struct page *); | 651 | void (*freepage)(struct page *); |
@@ -682,9 +671,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
682 | if (reclaimed && page_is_file_cache(page) && | 671 | if (reclaimed && page_is_file_cache(page) && |
683 | !mapping_exiting(mapping) && !dax_mapping(mapping)) | 672 | !mapping_exiting(mapping) && !dax_mapping(mapping)) |
684 | shadow = workingset_eviction(mapping, page); | 673 | shadow = workingset_eviction(mapping, page); |
685 | __delete_from_page_cache(page, shadow, memcg); | 674 | __delete_from_page_cache(page, shadow); |
686 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 675 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
687 | mem_cgroup_end_page_stat(memcg); | ||
688 | 676 | ||
689 | if (freepage != NULL) | 677 | if (freepage != NULL) |
690 | freepage(page); | 678 | freepage(page); |
@@ -694,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
694 | 682 | ||
695 | cannot_free: | 683 | cannot_free: |
696 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 684 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
697 | mem_cgroup_end_page_stat(memcg); | ||
698 | return 0; | 685 | return 0; |
699 | } | 686 | } |
700 | 687 | ||
@@ -1931,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) | |||
1931 | unsigned long inactive; | 1918 | unsigned long inactive; |
1932 | unsigned long active; | 1919 | unsigned long active; |
1933 | 1920 | ||
1934 | inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1921 | inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); |
1935 | active = get_lru_size(lruvec, LRU_ACTIVE_FILE); | 1922 | active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); |
1936 | 1923 | ||
1937 | return active > inactive; | 1924 | return active > inactive; |
1938 | } | 1925 | } |
@@ -2071,7 +2058,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2071 | * system is under heavy pressure. | 2058 | * system is under heavy pressure. |
2072 | */ | 2059 | */ |
2073 | if (!inactive_file_is_low(lruvec) && | 2060 | if (!inactive_file_is_low(lruvec) && |
2074 | get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { | 2061 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { |
2075 | scan_balance = SCAN_FILE; | 2062 | scan_balance = SCAN_FILE; |
2076 | goto out; | 2063 | goto out; |
2077 | } | 2064 | } |
@@ -2097,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2097 | * anon in [0], file in [1] | 2084 | * anon in [0], file in [1] |
2098 | */ | 2085 | */ |
2099 | 2086 | ||
2100 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + | 2087 | anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + |
2101 | get_lru_size(lruvec, LRU_INACTIVE_ANON); | 2088 | lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); |
2102 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | 2089 | file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + |
2103 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | 2090 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); |
2104 | 2091 | ||
2105 | spin_lock_irq(&zone->lru_lock); | 2092 | spin_lock_irq(&zone->lru_lock); |
2106 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 2093 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
@@ -2138,7 +2125,7 @@ out: | |||
2138 | unsigned long size; | 2125 | unsigned long size; |
2139 | unsigned long scan; | 2126 | unsigned long scan; |
2140 | 2127 | ||
2141 | size = get_lru_size(lruvec, lru); | 2128 | size = lruvec_lru_size(lruvec, lru); |
2142 | scan = size >> sc->priority; | 2129 | scan = size >> sc->priority; |
2143 | 2130 | ||
2144 | if (!scan && pass && force_scan) | 2131 | if (!scan && pass && force_scan) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 084c6725b373..69ce64f7b8d7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
924 | #endif | 924 | #endif |
925 | 925 | ||
926 | #ifdef CONFIG_PROC_FS | 926 | #ifdef CONFIG_PROC_FS |
927 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
928 | "Unmovable", | ||
929 | "Movable", | ||
930 | "Reclaimable", | ||
931 | "HighAtomic", | ||
932 | #ifdef CONFIG_CMA | ||
933 | "CMA", | ||
934 | #endif | ||
935 | #ifdef CONFIG_MEMORY_ISOLATION | ||
936 | "Isolate", | ||
937 | #endif | ||
938 | }; | ||
939 | |||
940 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 927 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
941 | struct zone *zone) | 928 | struct zone *zone) |
942 | { | 929 | { |
@@ -1133,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) | |||
1133 | #ifdef CONFIG_PAGE_OWNER | 1120 | #ifdef CONFIG_PAGE_OWNER |
1134 | int mtype; | 1121 | int mtype; |
1135 | 1122 | ||
1136 | if (!page_owner_inited) | 1123 | if (!static_branch_unlikely(&page_owner_inited)) |
1137 | return; | 1124 | return; |
1138 | 1125 | ||
1139 | drain_all_pages(NULL); | 1126 | drain_all_pages(NULL); |
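page_owner_inited is now a static key rather than a plain flag, so this early return costs a patched-out branch instead of a memory load in the common disabled case. A hedged sketch of the static-key idiom involved (the define/enable sites live in mm/page_owner.c and are paraphrased here, not quoted):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static void page_owner_example_init(void)
{
	/* Flipped once, after the backing page_ext storage is ready. */
	static_branch_enable(&page_owner_inited);
}

static void page_owner_example_fastpath(void)
{
	/* Compiles to a no-op branch while the key is false. */
	if (!static_branch_unlikely(&page_owner_inited))
		return;
	/* ... slow path ... */
}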
diff --git a/mm/workingset.c b/mm/workingset.c index 61ead9e5549d..6130ba0b2641 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -152,8 +152,25 @@ | |||
152 | * refault distance will immediately activate the refaulting page. | 152 | * refault distance will immediately activate the refaulting page. |
153 | */ | 153 | */ |
154 | 154 | ||
155 | static void *pack_shadow(unsigned long eviction, struct zone *zone) | 155 | #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ |
156 | ZONES_SHIFT + NODES_SHIFT + \ | ||
157 | MEM_CGROUP_ID_SHIFT) | ||
158 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) | ||
159 | |||
160 | /* | ||
161 | * Eviction timestamps need to be able to cover the full range of | ||
162 | * actionable refaults. However, bits are tight in the radix tree | ||
163 | * entry, and after storing the identifier for the lruvec there might | ||
164 | * not be enough left to represent every single actionable refault. In | ||
165 | * that case, we have to sacrifice granularity for distance, and group | ||
166 | * evictions into coarser buckets by shaving off lower timestamp bits. | ||
167 | */ | ||
168 | static unsigned int bucket_order __read_mostly; | ||
169 | |||
170 | static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) | ||
156 | { | 171 | { |
172 | eviction >>= bucket_order; | ||
173 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; | ||
157 | eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); | 174 | eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); |
158 | eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); | 175 | eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); |
159 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); | 176 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); |
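After this change a shadow entry packs four fields plus the radix-tree exceptional tag into a single unsigned long. Reading the shifts in pack_shadow() from the low bits upward, the layout is, schematically (field widths are Kconfig-dependent):

/*
 * Shadow entry layout, low bits first:
 *
 *   [ exceptional tag | zone idx    | node id     | memcg id            | eviction >> bucket_order ]
 *       2 bits          ZONES_SHIFT   NODES_SHIFT   MEM_CGROUP_ID_SHIFT   remaining bits
 */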
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) | |||
161 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); | 178 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); |
162 | } | 179 | } |
163 | 180 | ||
164 | static void unpack_shadow(void *shadow, | 181 | static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, |
165 | struct zone **zone, | 182 | unsigned long *evictionp) |
166 | unsigned long *distance) | ||
167 | { | 183 | { |
168 | unsigned long entry = (unsigned long)shadow; | 184 | unsigned long entry = (unsigned long)shadow; |
169 | unsigned long eviction; | 185 | int memcgid, nid, zid; |
170 | unsigned long refault; | ||
171 | unsigned long mask; | ||
172 | int zid, nid; | ||
173 | 186 | ||
174 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; | 187 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; |
175 | zid = entry & ((1UL << ZONES_SHIFT) - 1); | 188 | zid = entry & ((1UL << ZONES_SHIFT) - 1); |
176 | entry >>= ZONES_SHIFT; | 189 | entry >>= ZONES_SHIFT; |
177 | nid = entry & ((1UL << NODES_SHIFT) - 1); | 190 | nid = entry & ((1UL << NODES_SHIFT) - 1); |
178 | entry >>= NODES_SHIFT; | 191 | entry >>= NODES_SHIFT; |
179 | eviction = entry; | 192 | memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); |
193 | entry >>= MEM_CGROUP_ID_SHIFT; | ||
180 | 194 | ||
181 | *zone = NODE_DATA(nid)->node_zones + zid; | 195 | *memcgidp = memcgid; |
182 | 196 | *zonep = NODE_DATA(nid)->node_zones + zid; | |
183 | refault = atomic_long_read(&(*zone)->inactive_age); | 197 | *evictionp = entry << bucket_order; |
184 | mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + | ||
185 | RADIX_TREE_EXCEPTIONAL_SHIFT); | ||
186 | /* | ||
187 | * The unsigned subtraction here gives an accurate distance | ||
188 | * across inactive_age overflows in most cases. | ||
189 | * | ||
190 | * There is a special case: usually, shadow entries have a | ||
191 | * short lifetime and are either refaulted or reclaimed along | ||
192 | * with the inode before they get too old. But it is not | ||
193 | * impossible for the inactive_age to lap a shadow entry in | ||
194 | * the field, which can then result in a false small | ||
195 | * refault distance, leading to a false activation should this | ||
196 | * old entry actually refault again. However, earlier kernels | ||
197 | * used to deactivate unconditionally with *every* reclaim | ||
198 | * invocation for the longest time, so the occasional | ||
199 | * inappropriate activation leading to pressure on the active | ||
200 | * list is not a problem. | ||
201 | */ | ||
202 | *distance = (refault - eviction) & mask; | ||
203 | } | 198 | } |
204 | 199 | ||
205 | /** | 200 | /** |
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow, | |||
212 | */ | 207 | */ |
213 | void *workingset_eviction(struct address_space *mapping, struct page *page) | 208 | void *workingset_eviction(struct address_space *mapping, struct page *page) |
214 | { | 209 | { |
210 | struct mem_cgroup *memcg = page_memcg(page); | ||
215 | struct zone *zone = page_zone(page); | 211 | struct zone *zone = page_zone(page); |
212 | int memcgid = mem_cgroup_id(memcg); | ||
216 | unsigned long eviction; | 213 | unsigned long eviction; |
214 | struct lruvec *lruvec; | ||
217 | 215 | ||
218 | eviction = atomic_long_inc_return(&zone->inactive_age); | 216 | /* Page is fully exclusive and pins page->mem_cgroup */ |
219 | return pack_shadow(eviction, zone); | 217 | VM_BUG_ON_PAGE(PageLRU(page), page); |
218 | VM_BUG_ON_PAGE(page_count(page), page); | ||
219 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
220 | |||
221 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
222 | eviction = atomic_long_inc_return(&lruvec->inactive_age); | ||
223 | return pack_shadow(memcgid, zone, eviction); | ||
220 | } | 224 | } |
221 | 225 | ||
222 | /** | 226 | /** |
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) | |||
231 | bool workingset_refault(void *shadow) | 235 | bool workingset_refault(void *shadow) |
232 | { | 236 | { |
233 | unsigned long refault_distance; | 237 | unsigned long refault_distance; |
238 | unsigned long active_file; | ||
239 | struct mem_cgroup *memcg; | ||
240 | unsigned long eviction; | ||
241 | struct lruvec *lruvec; | ||
242 | unsigned long refault; | ||
234 | struct zone *zone; | 243 | struct zone *zone; |
244 | int memcgid; | ||
245 | |||
246 | unpack_shadow(shadow, &memcgid, &zone, &eviction); | ||
247 | |||
248 | rcu_read_lock(); | ||
249 | /* | ||
250 | * Look up the memcg associated with the stored ID. It might | ||
251 | * have been deleted since the page's eviction. | ||
252 | * | ||
253 | * Note that in rare events the ID could have been recycled | ||
254 | * for a new cgroup that refaults a shared page. This is | ||
255 | * impossible to tell from the available data. However, this | ||
256 | * should be a rare and limited disturbance, and activations | ||
257 | * are always speculative anyway. Ultimately, it's the aging | ||
258 | * algorithm's job to shake out the minimum access frequency | ||
259 | * for the active cache. | ||
260 | * | ||
261 | * XXX: On !CONFIG_MEMCG, this will always return NULL; it | ||
262 | * would be better if the root_mem_cgroup existed in all | ||
263 | * configurations instead. | ||
264 | */ | ||
265 | memcg = mem_cgroup_from_id(memcgid); | ||
266 | if (!mem_cgroup_disabled() && !memcg) { | ||
267 | rcu_read_unlock(); | ||
268 | return false; | ||
269 | } | ||
270 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
271 | refault = atomic_long_read(&lruvec->inactive_age); | ||
272 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
273 | rcu_read_unlock(); | ||
274 | |||
275 | /* | ||
276 | * The unsigned subtraction here gives an accurate distance | ||
277 | * across inactive_age overflows in most cases. | ||
278 | * | ||
279 | * There is a special case: usually, shadow entries have a | ||
280 | * short lifetime and are either refaulted or reclaimed along | ||
281 | * with the inode before they get too old. But it is not | ||
282 | * impossible for the inactive_age to lap a shadow entry in | ||
283 | * the field, which can then result in a false small | ||
284 | * refault distance, leading to a false activation should this | ||
285 | * old entry actually refault again. However, earlier kernels | ||
286 | * used to deactivate unconditionally with *every* reclaim | ||
287 | * invocation for the longest time, so the occasional | ||
288 | * inappropriate activation leading to pressure on the active | ||
289 | * list is not a problem. | ||
290 | */ | ||
291 | refault_distance = (refault - eviction) & EVICTION_MASK; | ||
235 | 292 | ||
236 | unpack_shadow(shadow, &zone, &refault_distance); | ||
237 | inc_zone_state(zone, WORKINGSET_REFAULT); | 293 | inc_zone_state(zone, WORKINGSET_REFAULT); |
238 | 294 | ||
239 | if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { | 295 | if (refault_distance <= active_file) { |
240 | inc_zone_state(zone, WORKINGSET_ACTIVATE); | 296 | inc_zone_state(zone, WORKINGSET_ACTIVATE); |
241 | return true; | 297 | return true; |
242 | } | 298 | } |
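The wraparound argument in the comment above is easier to see with small numbers; an illustrative 8-bit version of the same masked subtraction:

/*
 * Pretend inactive_age were only 8 bits wide (so EVICTION_MASK == 0xff):
 *
 *   eviction = 0xf0   counter value when the page was evicted
 *   refault  = 0x10   counter has since wrapped past 0xff
 *
 *   (0x10 - 0xf0) & 0xff == 0x20, i.e. 32 evictions in between, as expected.
 */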
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow) | |||
249 | */ | 305 | */ |
250 | void workingset_activation(struct page *page) | 306 | void workingset_activation(struct page *page) |
251 | { | 307 | { |
252 | atomic_long_inc(&page_zone(page)->inactive_age); | 308 | struct lruvec *lruvec; |
309 | |||
310 | lock_page_memcg(page); | ||
311 | /* | ||
312 | * Filter non-memcg pages here, e.g. unmap can call | ||
313 | * mark_page_accessed() on VDSO pages. | ||
314 | * | ||
315 | * XXX: See workingset_refault() - this should return | ||
316 | * root_mem_cgroup even for !CONFIG_MEMCG. | ||
317 | */ | ||
318 | if (!mem_cgroup_disabled() && !page_memcg(page)) | ||
319 | goto out; | ||
320 | lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); | ||
321 | atomic_long_inc(&lruvec->inactive_age); | ||
322 | out: | ||
323 | unlock_page_memcg(page); | ||
253 | } | 324 | } |
254 | 325 | ||
255 | /* | 326 | /* |
@@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key; | |||
398 | 469 | ||
399 | static int __init workingset_init(void) | 470 | static int __init workingset_init(void) |
400 | { | 471 | { |
472 | unsigned int timestamp_bits; | ||
473 | unsigned int max_order; | ||
401 | int ret; | 474 | int ret; |
402 | 475 | ||
476 | BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); | ||
477 | /* | ||
478 | * Calculate the eviction bucket size to cover the longest | ||
479 | * actionable refault distance, which is currently half of | ||
480 | * memory (totalram_pages/2). However, memory hotplug may add | ||
481 | * some more pages at runtime, so keep working with up to | ||
482 | * double the initial memory by using totalram_pages as-is. | ||
483 | */ | ||
484 | timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; | ||
485 | max_order = fls_long(totalram_pages - 1); | ||
486 | if (max_order > timestamp_bits) | ||
487 | bucket_order = max_order - timestamp_bits; | ||
488 | printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", | ||
489 | timestamp_bits, max_order, bucket_order); | ||
490 | |||
403 | ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); | 491 | ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); |
404 | if (ret) | 492 | if (ret) |
405 | goto err; | 493 | goto err; |
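To put illustrative numbers on the calculation above (a plausible configuration, not one stated by the patch): with 64-bit longs, two radix-tree tag bits, ZONES_SHIFT = 2, NODES_SHIFT = 10 and MEM_CGROUP_ID_SHIFT = 16, the shadow entry spends 30 bits on identification and keeps 34 for the timestamp. Sixteen gigabytes of 4 KB pages is about 2^22 pages, so max_order = 22, which fits comfortably and leaves bucket_order at 0; only when max_order exceeds timestamp_bits do evictions start being grouped into 2^bucket_order-page buckets.

/*
 * Example with the illustrative values above:
 *   EVICTION_SHIFT = 2 + 2 + 10 + 16     = 30
 *   timestamp_bits = 64 - 30             = 34
 *   max_order      = fls_long(2^22 - 1)  = 22   (16 GB of 4 KB pages)
 *   bucket_order   = 0                          (22 <= 34, full granularity kept)
 */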