author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
commit     271ecc5253e2b317d729d366560789cd7f93836c (patch)
tree       d3a60bc4dfa8245ff934f357f2367db76b59e7cf
parent     aa6865d836418eb2ba888a4cb1318a28e9aa2e0c (diff)
parent     63c06227a22b098a3849c5c99e836aea161ca0d7 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:
 - some misc things
 - ocfs2 updates
 - about half of MM
 - checkpatch updates
 - autofs4 update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  autofs4: fix string.h include in auto_dev-ioctl.h
  autofs4: use pr_xxx() macros directly for logging
  autofs4: change log print macros to not insert newline
  autofs4: make autofs log prints consistent
  autofs4: fix some white space errors
  autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked()
  autofs4: fix coding style line length in autofs4_wait()
  autofs4: fix coding style problem in autofs4_get_set_timeout()
  autofs4: coding style fixes
  autofs: show pipe inode in mount options
  kallsyms: add support for relative offsets in kallsyms address table
  kallsyms: don't overload absolute symbol type for percpu symbols
  x86: kallsyms: disable absolute percpu symbols on !SMP
  checkpatch: fix another left brace warning
  checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses
  checkpatch: warn on bare unsigned or signed declarations without int
  checkpatch: exclude asm volatile from complex macro check
  mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate()
  mm: migrate: consolidate mem_cgroup_migrate() calls
  mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous
  ...
-rw-r--r--  Documentation/kernel-parameters.txt | 17
-rw-r--r--  Documentation/memory-hotplug.txt | 23
-rw-r--r--  Documentation/printk-formats.txt | 18
-rw-r--r--  Documentation/vm/page_owner.txt | 9
-rw-r--r--  Documentation/vm/slub.txt | 4
-rw-r--r--  arch/blackfin/include/asm/pgtable.h | 2
-rw-r--r--  arch/m32r/mm/init.c | 27
-rw-r--r--  arch/s390/kernel/dumpstack.c | 6
-rw-r--r--  arch/s390/mm/vmem.c | 10
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/mm/init.c | 36
-rw-r--r--  arch/x86/mm/pageattr.c | 14
-rw-r--r--  block/partition-generic.c | 11
-rw-r--r--  drivers/acpi/processor_idle.c | 4
-rw-r--r--  drivers/base/memory.c | 34
-rw-r--r--  drivers/block/paride/pd.c | 4
-rw-r--r--  drivers/block/paride/pt.c | 4
-rw-r--r--  drivers/xen/Kconfig | 23
-rw-r--r--  drivers/xen/balloon.c | 11
-rw-r--r--  drivers/xen/events/events_2l.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 72
-rw-r--r--  fs/autofs4/dev-ioctl.c | 57
-rw-r--r--  fs/autofs4/expire.c | 84
-rw-r--r--  fs/autofs4/init.c | 10
-rw-r--r--  fs/autofs4/inode.c | 52
-rw-r--r--  fs/autofs4/root.c | 163
-rw-r--r--  fs/autofs4/symlink.c | 11
-rw-r--r--  fs/autofs4/waitq.c | 78
-rw-r--r--  fs/buffer.c | 24
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 10
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 26
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 13
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 127
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 40
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 13
-rw-r--r--  fs/ocfs2/super.c | 5
-rw-r--r--  fs/xfs/xfs_aops.c | 11
-rw-r--r--  include/linux/auto_dev-ioctl.h | 6
-rw-r--r--  include/linux/auto_fs.h | 10
-rw-r--r--  include/linux/fault-inject.h | 5
-rw-r--r--  include/linux/gfp.h | 12
-rw-r--r--  include/linux/memcontrol.h | 103
-rw-r--r--  include/linux/memory.h | 3
-rw-r--r--  include/linux/memory_hotplug.h | 7
-rw-r--r--  include/linux/migrate.h | 6
-rw-r--r--  include/linux/mm.h | 37
-rw-r--r--  include/linux/mmdebug.h | 3
-rw-r--r--  include/linux/mmzone.h | 18
-rw-r--r--  include/linux/page_ext.h | 1
-rw-r--r--  include/linux/page_owner.h | 50
-rw-r--r--  include/linux/pagemap.h | 3
-rw-r--r--  include/linux/poison.h | 4
-rw-r--r--  include/linux/slab.h | 13
-rw-r--r--  include/linux/slab_def.h | 3
-rw-r--r--  include/linux/slub_def.h | 1
-rw-r--r--  include/linux/trace_events.h | 10
-rw-r--r--  include/linux/tracepoint-defs.h | 14
-rw-r--r--  include/trace/events/btrfs.h | 2
-rw-r--r--  include/trace/events/compaction.h | 2
-rw-r--r--  include/trace/events/gfpflags.h | 43
-rw-r--r--  include/trace/events/huge_memory.h | 2
-rw-r--r--  include/trace/events/kmem.h | 2
-rw-r--r--  include/trace/events/mmflags.h | 164
-rw-r--r--  include/trace/events/vmscan.h | 2
-rw-r--r--  include/uapi/linux/auto_fs.h | 21
-rw-r--r--  include/uapi/linux/auto_fs4.h | 17
-rw-r--r--  init/Kconfig | 22
-rw-r--r--  init/main.c | 4
-rw-r--r--  kernel/kallsyms.c | 42
-rw-r--r--  kernel/locking/lockdep.c | 3
-rw-r--r--  kernel/memremap.c | 4
-rw-r--r--  kernel/power/hibernate.c | 17
-rw-r--r--  kernel/rcu/rcutorture.c | 6
-rw-r--r--  kernel/workqueue.c | 3
-rw-r--r--  lib/test_printf.c | 53
-rw-r--r--  lib/vsprintf.c | 75
-rw-r--r--  mm/Kconfig.debug | 57
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/compaction.c | 93
-rw-r--r--  mm/debug.c | 165
-rw-r--r--  mm/failslab.c | 12
-rw-r--r--  mm/filemap.c | 113
-rw-r--r--  mm/huge_memory.c | 20
-rw-r--r--  mm/internal.h | 18
-rw-r--r--  mm/kmemcheck.c | 3
-rw-r--r--  mm/madvise.c | 19
-rw-r--r--  mm/memblock.c | 8
-rw-r--r--  mm/memcontrol.c | 92
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 7
-rw-r--r--  mm/memory_hotplug.c | 30
-rw-r--r--  mm/mempolicy.c | 4
-rw-r--r--  mm/migrate.c | 23
-rw-r--r--  mm/oom_kill.c | 7
-rw-r--r--  mm/page-writeback.c | 62
-rw-r--r--  mm/page_alloc.c | 295
-rw-r--r--  mm/page_ext.c | 10
-rw-r--r--  mm/page_owner.c | 100
-rw-r--r--  mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67
-rw-r--r--  mm/rmap.c | 16
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 1037
-rw-r--r--  mm/slab.h | 69
-rw-r--r--  mm/slab_common.c | 8
-rw-r--r--  mm/slub.c | 325
-rw-r--r--  mm/truncate.c | 6
-rw-r--r--  mm/vmscan.c | 47
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/workingset.c | 160
-rw-r--r--  net/ipv4/syncookies.c | 3
-rw-r--r--  net/ipv6/syncookies.c | 3
-rw-r--r--  net/rds/page.c | 4
-rwxr-xr-x  scripts/checkpatch.pl | 29
-rw-r--r--  scripts/kallsyms.c | 93
-rwxr-xr-x  scripts/link-vmlinux.sh | 6
-rwxr-xr-x  scripts/namespace.pl | 2
-rw-r--r--  tools/perf/builtin-kmem.c | 49
-rw-r--r--  tools/vm/slabinfo.c | 2
119 files changed, 3139 insertions, 1805 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4d9ca7d92a20..5b47acb86111 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1759,7 +1759,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1759 1759
1760 keepinitrd [HW,ARM] 1760 keepinitrd [HW,ARM]
1761 1761
1762 kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter 1762 kernelcore= [KNL,X86,IA-64,PPC]
1763 Format: nn[KMGTPE] | "mirror"
1764 This parameter
1763 specifies the amount of memory usable by the kernel 1765 specifies the amount of memory usable by the kernel
1764 for non-movable allocations. The requested amount is 1766 for non-movable allocations. The requested amount is
1765 spread evenly throughout all nodes in the system. The 1767 spread evenly throughout all nodes in the system. The
@@ -1775,6 +1777,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1775 use the HighMem zone if it exists, and the Normal 1777 use the HighMem zone if it exists, and the Normal
1776 zone if it does not. 1778 zone if it does not.
1777 1779
1780 Instead of specifying the amount of memory (nn[KMGTPE]),
1781 you can specify "mirror" option. In case "mirror"
1782 option is specified, mirrored (reliable) memory is used
1783 for non-movable allocations and remaining memory is used
1784 for Movable pages. nn[KMGTPE] and "mirror" are exclusive,
1785 so you can NOT specify nn[KMGTPE] and "mirror" at the same
1786 time.
1787
1778 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. 1788 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
1779 Format: <Controller#>[,poll interval] 1789 Format: <Controller#>[,poll interval]
1780 The controller # is the number of the ehci usb debug 1790 The controller # is the number of the ehci usb debug
@@ -2732,6 +2742,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2732 we can turn it on. 2742 we can turn it on.
2733 on: enable the feature 2743 on: enable the feature
2734 2744
2745 page_poison= [KNL] Boot-time parameter changing the state of
2746 poisoning on the buddy allocator.
2747 off: turn off poisoning
2748 on: turn on poisoning
2749
2735 panic= [KNL] Kernel behaviour on panic: delay <timeout> 2750 panic= [KNL] Kernel behaviour on panic: delay <timeout>
2736 timeout > 0: seconds before rebooting 2751 timeout > 0: seconds before rebooting
2737 timeout = 0: wait forever 2752 timeout = 0: wait forever
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index ce2cfcf35c27..443f4b44ad97 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -256,10 +256,27 @@ If the memory block is offline, you'll read "offline".
256 256
2575.2. How to online memory 2575.2. How to online memory
258------------ 258------------
259Even if the memory is hot-added, it is not at ready-to-use state. 259When the memory is hot-added, the kernel decides whether or not to "online"
260For using newly added memory, you have to "online" the memory block. 260it according to the policy which can be read from "auto_online_blocks" file:
261 261
262For onlining, you have to write "online" to the memory block's state file as: 262% cat /sys/devices/system/memory/auto_online_blocks
263
264The default is "offline" which means the newly added memory is not in a
265ready-to-use state and you have to "online" the newly added memory blocks
266manually. Automatic onlining can be requested by writing "online" to
267"auto_online_blocks" file:
268
269% echo online > /sys/devices/system/memory/auto_online_blocks
270
271This sets a global policy and impacts all memory blocks that will subsequently
272be hotplugged. Currently offline blocks keep their state. It is possible, under
273certain circumstances, that some memory blocks will be added but will fail to
274online. User space tools can check their "state" files
275(/sys/devices/system/memory/memoryXXX/state) and try to online them manually.
276
277If the automatic onlining wasn't requested, failed, or some memory block was
278offlined it is possible to change the individual block's state by writing to the
279"state" file:
263 280
264% echo online > /sys/devices/system/memory/memoryXXX/state 281% echo online > /sys/devices/system/memory/memoryXXX/state
265 282
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 5d1128bf0282..5962949944fd 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -298,6 +298,24 @@ bitmap and its derivatives such as cpumask and nodemask:
298 298
299 Passed by reference. 299 Passed by reference.
300 300
301Flags bitfields such as page flags, gfp_flags:
302
303 %pGp referenced|uptodate|lru|active|private
304 %pGg GFP_USER|GFP_DMA32|GFP_NOWARN
305 %pGv read|exec|mayread|maywrite|mayexec|denywrite
306
307 For printing flags bitfields as a collection of symbolic constants that
308 would construct the value. The type of flags is given by the third
309 character. Currently supported are [p]age flags, [v]ma_flags (both
310 expect unsigned long *) and [g]fp_flags (expects gfp_t *). The flag
311 names and print order depends on the particular type.
312
313 Note that this format should not be used directly in TP_printk() part
314 of a tracepoint. Instead, use the show_*_flags() functions from
315 <trace/events/mmflags.h>.
316
317 Passed by reference.
318
301Network device features: 319Network device features:
302 320
303 %pNF 0x000000000000c000 321 %pNF 0x000000000000c000
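As documented above, the new %pG specifiers take the flags word by reference. A minimal sketch of a caller, assuming a kernel with this series applied (the helper name dump_alloc_context() and its messages are invented for illustration):

#include <linux/printk.h>
#include <linux/mm_types.h>	/* struct page */
#include <linux/gfp.h>		/* gfp_t */

/*
 * Illustrative helper, not part of this series: print a page's flags and the
 * gfp mask used to allocate it with the new %pG specifiers.  Both specifiers
 * take a pointer to the flags, not the value itself.
 */
static void dump_alloc_context(struct page *page, gfp_t gfp_mask)
{
	pr_info("page flags: %pGp\n", &page->flags);
	pr_info("gfp flags:  %pGg\n", &gfp_mask);
}
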
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt
index 8f3ce9b3aa11..ffff1439076a 100644
--- a/Documentation/vm/page_owner.txt
+++ b/Documentation/vm/page_owner.txt
@@ -28,10 +28,11 @@ with page owner and page owner is disabled in runtime due to no enabling
28boot option, runtime overhead is marginal. If disabled in runtime, it 28boot option, runtime overhead is marginal. If disabled in runtime, it
29doesn't require memory to store owner information, so there is no runtime 29doesn't require memory to store owner information, so there is no runtime
30memory overhead. And, page owner inserts just two unlikely branches into 30memory overhead. And, page owner inserts just two unlikely branches into
31the page allocator hotpath and if it returns false then allocation is 31the page allocator hotpath and if not enabled, then allocation is done
32done like as the kernel without page owner. These two unlikely branches 32like as the kernel without page owner. These two unlikely branches should
33would not affect to allocation performance. Following is the kernel's 33not affect to allocation performance, especially if the static keys jump
34code size change due to this facility. 34label patching functionality is available. Following is the kernel's code
35size change due to this facility.
35 36
36- Without page owner 37- Without page owner
37 text data bss dec hex filename 38 text data bss dec hex filename
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index f0d340959319..84652419bff2 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -35,8 +35,8 @@ slub_debug=<Debug-Options>,<slab name>
35 Enable options only for select slabs 35 Enable options only for select slabs
36 36
37Possible debug options are 37Possible debug options are
38 F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry 38 F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
39 SLAB legacy issues) 39 Sorry SLAB legacy issues)
40 Z Red zoning 40 Z Red zoning
41 P Poisoning (object and padding) 41 P Poisoning (object and padding)
42 U User tracking (free and alloc) 42 U User tracking (free and alloc)
diff --git a/arch/blackfin/include/asm/pgtable.h b/arch/blackfin/include/asm/pgtable.h
index b88a1558b0b9..c1ee3d6533fb 100644
--- a/arch/blackfin/include/asm/pgtable.h
+++ b/arch/blackfin/include/asm/pgtable.h
@@ -97,6 +97,8 @@ extern unsigned long get_fb_unmapped_area(struct file *filp, unsigned long,
97 unsigned long); 97 unsigned long);
98#define HAVE_ARCH_FB_UNMAPPED_AREA 98#define HAVE_ARCH_FB_UNMAPPED_AREA
99 99
100#define pgprot_writecombine pgprot_noncached
101
100#include <asm-generic/pgtable.h> 102#include <asm-generic/pgtable.h>
101 103
102#endif /* _BLACKFIN_PGTABLE_H */ 104#endif /* _BLACKFIN_PGTABLE_H */
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 0d4146f644dc..11fa717d93b1 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -59,21 +59,24 @@ void free_initrd_mem(unsigned long, unsigned long);
59void __init zone_sizes_init(void) 59void __init zone_sizes_init(void)
60{ 60{
61 unsigned long zones_size[MAX_NR_ZONES] = {0, }; 61 unsigned long zones_size[MAX_NR_ZONES] = {0, };
62 unsigned long max_dma;
63 unsigned long low;
64 unsigned long start_pfn; 62 unsigned long start_pfn;
65 63
66#ifdef CONFIG_MMU 64#ifdef CONFIG_MMU
67 start_pfn = START_PFN(0); 65 {
68 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 66 unsigned long low;
69 low = MAX_LOW_PFN(0); 67 unsigned long max_dma;
70 68
71 if (low < max_dma){ 69 start_pfn = START_PFN(0);
72 zones_size[ZONE_DMA] = low - start_pfn; 70 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
73 zones_size[ZONE_NORMAL] = 0; 71 low = MAX_LOW_PFN(0);
74 } else { 72
75 zones_size[ZONE_DMA] = low - start_pfn; 73 if (low < max_dma) {
76 zones_size[ZONE_NORMAL] = low - max_dma; 74 zones_size[ZONE_DMA] = low - start_pfn;
75 zones_size[ZONE_NORMAL] = 0;
76 } else {
77 zones_size[ZONE_DMA] = low - start_pfn;
78 zones_size[ZONE_NORMAL] = low - max_dma;
79 }
77 } 80 }
78#else 81#else
79 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; 82 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 2150b0139a0b..1b6081c0aff9 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -11,6 +11,7 @@
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/kdebug.h> 12#include <linux/kdebug.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/mm.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
@@ -189,9 +190,8 @@ void die(struct pt_regs *regs, const char *str)
189#ifdef CONFIG_SMP 190#ifdef CONFIG_SMP
190 printk("SMP "); 191 printk("SMP ");
191#endif 192#endif
192#ifdef CONFIG_DEBUG_PAGEALLOC 193 if (debug_pagealloc_enabled())
193 printk("DEBUG_PAGEALLOC"); 194 printk("DEBUG_PAGEALLOC");
194#endif
195 printk("\n"); 195 printk("\n");
196 notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV); 196 notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV);
197 print_modules(); 197 print_modules();
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ef7d6c8fea66..d27fccbad7c1 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -94,16 +94,15 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
94 pgd_populate(&init_mm, pg_dir, pu_dir); 94 pgd_populate(&init_mm, pg_dir, pu_dir);
95 } 95 }
96 pu_dir = pud_offset(pg_dir, address); 96 pu_dir = pud_offset(pg_dir, address);
97#ifndef CONFIG_DEBUG_PAGEALLOC
98 if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && 97 if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
99 !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) { 98 !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
99 !debug_pagealloc_enabled()) {
100 pud_val(*pu_dir) = __pa(address) | 100 pud_val(*pu_dir) = __pa(address) |
101 _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE | 101 _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
102 (ro ? _REGION_ENTRY_PROTECT : 0); 102 (ro ? _REGION_ENTRY_PROTECT : 0);
103 address += PUD_SIZE; 103 address += PUD_SIZE;
104 continue; 104 continue;
105 } 105 }
106#endif
107 if (pud_none(*pu_dir)) { 106 if (pud_none(*pu_dir)) {
108 pm_dir = vmem_pmd_alloc(); 107 pm_dir = vmem_pmd_alloc();
109 if (!pm_dir) 108 if (!pm_dir)
@@ -111,9 +110,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
111 pud_populate(&init_mm, pu_dir, pm_dir); 110 pud_populate(&init_mm, pu_dir, pm_dir);
112 } 111 }
113 pm_dir = pmd_offset(pu_dir, address); 112 pm_dir = pmd_offset(pu_dir, address);
114#ifndef CONFIG_DEBUG_PAGEALLOC
115 if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && 113 if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
116 !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) { 114 !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
115 !debug_pagealloc_enabled()) {
117 pmd_val(*pm_dir) = __pa(address) | 116 pmd_val(*pm_dir) = __pa(address) |
118 _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | 117 _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
119 _SEGMENT_ENTRY_YOUNG | 118 _SEGMENT_ENTRY_YOUNG |
@@ -121,7 +120,6 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
121 address += PMD_SIZE; 120 address += PMD_SIZE;
122 continue; 121 continue;
123 } 122 }
124#endif
125 if (pmd_none(*pm_dir)) { 123 if (pmd_none(*pm_dir)) {
126 pt_dir = vmem_pte_alloc(address); 124 pt_dir = vmem_pte_alloc(address);
127 if (!pt_dir) 125 if (!pt_dir)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9c30acfadae2..32e5699eadfe 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,9 +265,8 @@ int __die(const char *str, struct pt_regs *regs, long err)
265#ifdef CONFIG_SMP 265#ifdef CONFIG_SMP
266 printk("SMP "); 266 printk("SMP ");
267#endif 267#endif
268#ifdef CONFIG_DEBUG_PAGEALLOC 268 if (debug_pagealloc_enabled())
269 printk("DEBUG_PAGEALLOC "); 269 printk("DEBUG_PAGEALLOC ");
270#endif
271#ifdef CONFIG_KASAN 270#ifdef CONFIG_KASAN
272 printk("KASAN"); 271 printk("KASAN");
273#endif 272#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 493f54172b4a..9d56f271d519 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -150,13 +150,14 @@ static int page_size_mask;
150 150
151static void __init probe_page_size_mask(void) 151static void __init probe_page_size_mask(void)
152{ 152{
153#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) 153#if !defined(CONFIG_KMEMCHECK)
154 /* 154 /*
155 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 155 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
156 * use small pages.
156 * This will simplify cpa(), which otherwise needs to support splitting 157 * This will simplify cpa(), which otherwise needs to support splitting
157 * large pages into small in interrupt context, etc. 158 * large pages into small in interrupt context, etc.
158 */ 159 */
159 if (cpu_has_pse) 160 if (cpu_has_pse && !debug_pagealloc_enabled())
160 page_size_mask |= 1 << PG_LEVEL_2M; 161 page_size_mask |= 1 << PG_LEVEL_2M;
161#endif 162#endif
162 163
@@ -666,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
666 * mark them not present - any buggy init-section access will 667 * mark them not present - any buggy init-section access will
667 * create a kernel page fault: 668 * create a kernel page fault:
668 */ 669 */
669#ifdef CONFIG_DEBUG_PAGEALLOC 670 if (debug_pagealloc_enabled()) {
670 printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", 671 pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
671 begin, end - 1); 672 begin, end - 1);
672 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 673 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
673#else 674 } else {
674 /* 675 /*
675 * We just marked the kernel text read only above, now that 676 * We just marked the kernel text read only above, now that
676 * we are going to free part of that, we need to make that 677 * we are going to free part of that, we need to make that
677 * writeable and non-executable first. 678 * writeable and non-executable first.
678 */ 679 */
679 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); 680 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
680 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 681 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
681 682
682 free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); 683 free_reserved_area((void *)begin, (void *)end,
683#endif 684 POISON_FREE_INITMEM, what);
685 }
684} 686}
685 687
686void free_initmem(void) 688void free_initmem(void)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 007ebe2d8157..4d0b26253042 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void)
106 106
107#endif 107#endif
108 108
109#ifdef CONFIG_DEBUG_PAGEALLOC
110# define debug_pagealloc 1
111#else
112# define debug_pagealloc 0
113#endif
114
115static inline int 109static inline int
116within(unsigned long addr, unsigned long start, unsigned long end) 110within(unsigned long addr, unsigned long start, unsigned long end)
117{ 111{
@@ -714,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
714{ 708{
715 struct page *base; 709 struct page *base;
716 710
717 if (!debug_pagealloc) 711 if (!debug_pagealloc_enabled())
718 spin_unlock(&cpa_lock); 712 spin_unlock(&cpa_lock);
719 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 713 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
720 if (!debug_pagealloc) 714 if (!debug_pagealloc_enabled())
721 spin_lock(&cpa_lock); 715 spin_lock(&cpa_lock);
722 if (!base) 716 if (!base)
723 return -ENOMEM; 717 return -ENOMEM;
@@ -1339,10 +1333,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1339 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1333 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1340 cpa->numpages = 1; 1334 cpa->numpages = 1;
1341 1335
1342 if (!debug_pagealloc) 1336 if (!debug_pagealloc_enabled())
1343 spin_lock(&cpa_lock); 1337 spin_lock(&cpa_lock);
1344 ret = __change_page_attr(cpa, checkalias); 1338 ret = __change_page_attr(cpa, checkalias);
1345 if (!debug_pagealloc) 1339 if (!debug_pagealloc_enabled())
1346 spin_unlock(&cpa_lock); 1340 spin_unlock(&cpa_lock);
1347 if (ret) 1341 if (ret)
1348 return ret; 1342 return ret;
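The arch hunks above all follow the same conversion: a compile-time #ifdef CONFIG_DEBUG_PAGEALLOC block becomes a runtime debug_pagealloc_enabled() test (declared in <linux/mm.h>, which is why s390's dumpstack.c gains that include), so one kernel image can toggle the behaviour with the debug_pagealloc= boot option. A minimal sketch of the pattern, with a hypothetical function name:

#include <linux/mm.h>		/* debug_pagealloc_enabled() */
#include <linux/printk.h>

/*
 * Hypothetical example of the conversion pattern used in the hunks above: the
 * branch is no longer compiled in or out with #ifdef CONFIG_DEBUG_PAGEALLOC,
 * it is kept and tested at runtime.  debug_pagealloc_enabled() returns false
 * both when the option is compiled out and when it was not enabled on the
 * kernel command line.
 */
static void example_report(void)
{
	if (debug_pagealloc_enabled())
		pr_info("DEBUG_PAGEALLOC is active\n");
}
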
diff --git a/block/partition-generic.c b/block/partition-generic.c
index fefd01b496a0..5d8701941054 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -217,10 +217,21 @@ static void part_release(struct device *dev)
217 kfree(p); 217 kfree(p);
218} 218}
219 219
220static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
221{
222 struct hd_struct *part = dev_to_part(dev);
223
224 add_uevent_var(env, "PARTN=%u", part->partno);
225 if (part->info && part->info->volname[0])
226 add_uevent_var(env, "PARTNAME=%s", part->info->volname);
227 return 0;
228}
229
220struct device_type part_type = { 230struct device_type part_type = {
221 .name = "partition", 231 .name = "partition",
222 .groups = part_attr_groups, 232 .groups = part_attr_groups,
223 .release = part_release, 233 .release = part_release,
234 .uevent = part_uevent,
224}; 235};
225 236
226static void delete_partition_rcu_cb(struct rcu_head *head) 237static void delete_partition_rcu_cb(struct rcu_head *head)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 175c86bee3a9..9ca2b2fefd76 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -61,8 +61,8 @@ module_param(latency_factor, uint, 0644);
61 61
62static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); 62static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
63 63
64static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], 64static
65 acpi_cstate); 65DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate);
66 66
67static int disabled_by_idle_boot_param(void) 67static int disabled_by_idle_boot_param(void)
68{ 68{
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 213456c2b123..f46dba8b7092 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -251,7 +251,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
251 return ret; 251 return ret;
252} 252}
253 253
254static int memory_block_change_state(struct memory_block *mem, 254int memory_block_change_state(struct memory_block *mem,
255 unsigned long to_state, unsigned long from_state_req) 255 unsigned long to_state, unsigned long from_state_req)
256{ 256{
257 int ret = 0; 257 int ret = 0;
@@ -439,6 +439,37 @@ print_block_size(struct device *dev, struct device_attribute *attr,
439static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL); 439static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
440 440
441/* 441/*
442 * Memory auto online policy.
443 */
444
445static ssize_t
446show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
447 char *buf)
448{
449 if (memhp_auto_online)
450 return sprintf(buf, "online\n");
451 else
452 return sprintf(buf, "offline\n");
453}
454
455static ssize_t
456store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
457 const char *buf, size_t count)
458{
459 if (sysfs_streq(buf, "online"))
460 memhp_auto_online = true;
461 else if (sysfs_streq(buf, "offline"))
462 memhp_auto_online = false;
463 else
464 return -EINVAL;
465
466 return count;
467}
468
469static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
470 store_auto_online_blocks);
471
472/*
442 * Some architectures will have custom drivers to do this, and 473 * Some architectures will have custom drivers to do this, and
443 * will not need to do it from userspace. The fake hot-add code 474 * will not need to do it from userspace. The fake hot-add code
444 * as well as ppc64 will do all of their discovery in userspace 475 * as well as ppc64 will do all of their discovery in userspace
@@ -746,6 +777,7 @@ static struct attribute *memory_root_attrs[] = {
746#endif 777#endif
747 778
748 &dev_attr_block_size_bytes.attr, 779 &dev_attr_block_size_bytes.attr,
780 &dev_attr_auto_online_blocks.attr,
749 NULL 781 NULL
750}; 782};
751 783
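Since auto_online_blocks is an ordinary text sysfs attribute, the policy can also be read from user space. A small standalone sketch, assuming a kernel that carries this change (the program itself is not part of the series):

/*
 * Userspace sketch, not part of the series: report whether hot-added memory
 * blocks will be onlined automatically by reading the sysfs attribute added
 * above.
 */
#include <stdio.h>

int main(void)
{
	char buf[16] = "";
	FILE *f = fopen("/sys/devices/system/memory/auto_online_blocks", "r");

	if (!f) {
		perror("auto_online_blocks");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("hot-added memory defaults to: %s", buf);	/* "online" or "offline" */
	fclose(f);
	return 0;
}
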
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 562b5a4ca7b7..78a39f736c64 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -126,7 +126,7 @@
126*/ 126*/
127#include <linux/types.h> 127#include <linux/types.h>
128 128
129static bool verbose = 0; 129static int verbose = 0;
130static int major = PD_MAJOR; 130static int major = PD_MAJOR;
131static char *name = PD_NAME; 131static char *name = PD_NAME;
132static int cluster = 64; 132static int cluster = 64;
@@ -161,7 +161,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
161static DEFINE_MUTEX(pd_mutex); 161static DEFINE_MUTEX(pd_mutex);
162static DEFINE_SPINLOCK(pd_lock); 162static DEFINE_SPINLOCK(pd_lock);
163 163
164module_param(verbose, bool, 0); 164module_param(verbose, int, 0);
165module_param(major, int, 0); 165module_param(major, int, 0);
166module_param(name, charp, 0); 166module_param(name, charp, 0);
167module_param(cluster, int, 0); 167module_param(cluster, int, 0);
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 1740d75e8a32..216a94fed5b4 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -117,7 +117,7 @@
117 117
118*/ 118*/
119 119
120static bool verbose = 0; 120static int verbose = 0;
121static int major = PT_MAJOR; 121static int major = PT_MAJOR;
122static char *name = PT_NAME; 122static char *name = PT_NAME;
123static int disable = 0; 123static int disable = 0;
@@ -152,7 +152,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
152 152
153#include <asm/uaccess.h> 153#include <asm/uaccess.h>
154 154
155module_param(verbose, bool, 0); 155module_param(verbose, int, 0);
156module_param(major, int, 0); 156module_param(major, int, 0);
157module_param(name, charp, 0); 157module_param(name, charp, 0);
158module_param_array(drive0, int, NULL, 0); 158module_param_array(drive0, int, NULL, 0);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 73708acce3ca..979a8317204f 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -37,23 +37,30 @@ config XEN_BALLOON_MEMORY_HOTPLUG
37 37
38 Memory could be hotplugged in following steps: 38 Memory could be hotplugged in following steps:
39 39
40 1) dom0: xl mem-max <domU> <maxmem> 40 1) target domain: ensure that memory auto online policy is in
41 effect by checking /sys/devices/system/memory/auto_online_blocks
42 file (should be 'online').
43
44 2) control domain: xl mem-max <target-domain> <maxmem>
41 where <maxmem> is >= requested memory size, 45 where <maxmem> is >= requested memory size,
42 46
43 2) dom0: xl mem-set <domU> <memory> 47 3) control domain: xl mem-set <target-domain> <memory>
44 where <memory> is requested memory size; alternatively memory 48 where <memory> is requested memory size; alternatively memory
45 could be added by writing proper value to 49 could be added by writing proper value to
46 /sys/devices/system/xen_memory/xen_memory0/target or 50 /sys/devices/system/xen_memory/xen_memory0/target or
47 /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU, 51 /sys/devices/system/xen_memory/xen_memory0/target_kb on the
52 target domain.
48 53
49 3) domU: for i in /sys/devices/system/memory/memory*/state; do \ 54 Alternatively, if memory auto onlining was not requested at step 1
50 [ "`cat "$i"`" = offline ] && echo online > "$i"; done 55 the newly added memory can be manually onlined in the target domain
56 by doing the following:
51 57
52 Memory could be onlined automatically on domU by adding following line to udev rules: 58 for i in /sys/devices/system/memory/memory*/state; do \
59 [ "`cat "$i"`" = offline ] && echo online > "$i"; done
53 60
54 SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" 61 or by adding the following line to udev rules:
55 62
56 In that case step 3 should be omitted. 63 SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
57 64
58config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT 65config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
59 int "Hotplugged memory limit (in GiB) for a PV guest" 66 int "Hotplugged memory limit (in GiB) for a PV guest"
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index dc4305b407bf..7c8a2cf16f58 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -338,7 +338,16 @@ static enum bp_state reserve_additional_memory(void)
338 } 338 }
339#endif 339#endif
340 340
341 rc = add_memory_resource(nid, resource); 341 /*
342 * add_memory_resource() will call online_pages() which in its turn
343 * will call xen_online_page() callback causing deadlock if we don't
344 * release balloon_mutex here. Unlocking here is safe because the
345 * callers drop the mutex before trying again.
346 */
347 mutex_unlock(&balloon_mutex);
348 rc = add_memory_resource(nid, resource, memhp_auto_online);
349 mutex_lock(&balloon_mutex);
350
342 if (rc) { 351 if (rc) {
343 pr_warn("Cannot add additional memory (%i)\n", rc); 352 pr_warn("Cannot add additional memory (%i)\n", rc);
344 goto err; 353 goto err;
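The comment added in balloon.c describes a re-entrancy workaround: the mutex is dropped around a call whose callback path takes the same mutex. A generic sketch of that pattern, with made-up names (example_mutex, do_add), not the actual Xen code:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);

/*
 * Generic illustration only (not the Xen balloon code): called with
 * example_mutex held, drop it around an operation whose callback path
 * re-enters code that takes example_mutex, then re-acquire it.  Only safe
 * when callers tolerate the protected state changing while the lock is
 * dropped, which is the guarantee the balloon comment above relies on.
 */
static int add_with_callback(int (*do_add)(void))
{
	int rc;

	mutex_unlock(&example_mutex);
	rc = do_add();		/* may call back into code taking example_mutex */
	mutex_lock(&example_mutex);

	return rc;
}
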
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
index 7dd46312c180..403fe3955393 100644
--- a/drivers/xen/events/events_2l.c
+++ b/drivers/xen/events/events_2l.c
@@ -38,8 +38,9 @@
38/* Find the first set bit in a evtchn mask */ 38/* Find the first set bit in a evtchn mask */
39#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) 39#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
40 40
41static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], 41#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)
42 cpu_evtchn_mask); 42
43static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask);
43 44
44static unsigned evtchn_2l_max_channels(void) 45static unsigned evtchn_2l_max_channels(void)
45{ 46{
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
1/* -*- c -*- ------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
3 * linux/fs/autofs/autofs_i.h 3 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
4 *
5 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
6 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
7 * 4 *
8 * This file is part of the Linux kernel and is made available under 5 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 6 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 7 * option, any later version, incorporated herein by reference.
11 * 8 */
12 * ----------------------------------------------------------------------- */
13 9
14/* Internal header file for autofs */ 10/* Internal header file for autofs */
15 11
@@ -35,28 +31,23 @@
35#include <linux/mount.h> 31#include <linux/mount.h>
36#include <linux/namei.h> 32#include <linux/namei.h>
37#include <asm/current.h> 33#include <asm/current.h>
38#include <asm/uaccess.h> 34#include <linux/uaccess.h>
39 35
40/* #define DEBUG */ 36/* #define DEBUG */
41 37
42#define DPRINTK(fmt, ...) \ 38#ifdef pr_fmt
43 pr_debug("pid %d: %s: " fmt "\n", \ 39#undef pr_fmt
44 current->pid, __func__, ##__VA_ARGS__) 40#endif
45 41#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
46#define AUTOFS_WARN(fmt, ...) \ 42
47 printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ 43/*
48 current->pid, __func__, ##__VA_ARGS__) 44 * Unified info structure. This is pointed to by both the dentry and
49 45 * inode structures. Each file in the filesystem has an instance of this
50#define AUTOFS_ERROR(fmt, ...) \ 46 * structure. It holds a reference to the dentry, so dentries are never
51 printk(KERN_ERR "pid %d: %s: " fmt "\n", \ 47 * flushed while the file exists. All name lookups are dealt with at the
52 current->pid, __func__, ##__VA_ARGS__) 48 * dentry level, although the filesystem can interfere in the validation
53 49 * process. Readdir is implemented by traversing the dentry lists.
54/* Unified info structure. This is pointed to by both the dentry and 50 */
55 inode structures. Each file in the filesystem has an instance of this
56 structure. It holds a reference to the dentry, so dentries are never
57 flushed while the file exists. All name lookups are dealt with at the
58 dentry level, although the filesystem can interfere in the validation
59 process. Readdir is implemented by traversing the dentry lists. */
60struct autofs_info { 51struct autofs_info {
61 struct dentry *dentry; 52 struct dentry *dentry;
62 struct inode *inode; 53 struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
78 kgid_t gid; 69 kgid_t gid;
79}; 70};
80 71
81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 72#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
82#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered 73#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
83 * for expiry, so RCU_walk is 74 * for expiry, so RCU_walk is
84 * not permitted 75 * not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
140} 131}
141 132
142/* autofs4_oz_mode(): do we see the man behind the curtain? (The 133/* autofs4_oz_mode(): do we see the man behind the curtain? (The
143 processes which do manipulations for us in user space sees the raw 134 * processes which do manipulations for us in user space sees the raw
144 filesystem without "magic".) */ 135 * filesystem without "magic".)
145 136 */
146static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { 137static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
138{
147 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; 139 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
148} 140}
149 141
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
154int is_autofs4_dentry(struct dentry *); 146int is_autofs4_dentry(struct dentry *);
155int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); 147int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
156int autofs4_expire_run(struct super_block *, struct vfsmount *, 148int autofs4_expire_run(struct super_block *, struct vfsmount *,
157 struct autofs_sb_info *, 149 struct autofs_sb_info *,
158 struct autofs_packet_expire __user *); 150 struct autofs_packet_expire __user *);
159int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, 151int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
160 struct autofs_sb_info *sbi, int when); 152 struct autofs_sb_info *sbi, int when);
161int autofs4_expire_multi(struct super_block *, struct vfsmount *, 153int autofs4_expire_multi(struct super_block *, struct vfsmount *,
162 struct autofs_sb_info *, int __user *); 154 struct autofs_sb_info *, int __user *);
163struct dentry *autofs4_expire_direct(struct super_block *sb, 155struct dentry *autofs4_expire_direct(struct super_block *sb,
164 struct vfsmount *mnt, 156 struct vfsmount *mnt,
165 struct autofs_sb_info *sbi, int how); 157 struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
224 216
225/* Queue management functions */ 217/* Queue management functions */
226 218
227int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); 219int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
228int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 220int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
229void autofs4_catatonic_mode(struct autofs_sb_info *); 221void autofs4_catatonic_mode(struct autofs_sb_info *);
230 222
231static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) 223static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
242{ 234{
243 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 235 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
244 struct autofs_info *ino = autofs4_dentry_ino(dentry); 236 struct autofs_info *ino = autofs4_dentry_ino(dentry);
237
245 if (ino) { 238 if (ino) {
246 if (list_empty(&ino->expiring)) 239 if (list_empty(&ino->expiring))
247 list_add(&ino->expiring, &sbi->expiring_list); 240 list_add(&ino->expiring, &sbi->expiring_list);
248 } 241 }
249 return;
250} 242}
251 243
252static inline void autofs4_add_expiring(struct dentry *dentry) 244static inline void autofs4_add_expiring(struct dentry *dentry)
253{ 245{
254 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 246 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
255 struct autofs_info *ino = autofs4_dentry_ino(dentry); 247 struct autofs_info *ino = autofs4_dentry_ino(dentry);
248
256 if (ino) { 249 if (ino) {
257 spin_lock(&sbi->lookup_lock); 250 spin_lock(&sbi->lookup_lock);
258 if (list_empty(&ino->expiring)) 251 if (list_empty(&ino->expiring))
259 list_add(&ino->expiring, &sbi->expiring_list); 252 list_add(&ino->expiring, &sbi->expiring_list);
260 spin_unlock(&sbi->lookup_lock); 253 spin_unlock(&sbi->lookup_lock);
261 } 254 }
262 return;
263} 255}
264 256
265static inline void autofs4_del_expiring(struct dentry *dentry) 257static inline void autofs4_del_expiring(struct dentry *dentry)
266{ 258{
267 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 259 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
268 struct autofs_info *ino = autofs4_dentry_ino(dentry); 260 struct autofs_info *ino = autofs4_dentry_ino(dentry);
261
269 if (ino) { 262 if (ino) {
270 spin_lock(&sbi->lookup_lock); 263 spin_lock(&sbi->lookup_lock);
271 if (!list_empty(&ino->expiring)) 264 if (!list_empty(&ino->expiring))
272 list_del_init(&ino->expiring); 265 list_del_init(&ino->expiring);
273 spin_unlock(&sbi->lookup_lock); 266 spin_unlock(&sbi->lookup_lock);
274 } 267 }
275 return;
276} 268}
277 269
278extern void autofs4_kill_sb(struct super_block *); 270extern void autofs4_kill_sb(struct super_block *);
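With the pr_fmt() defined in autofs_i.h above, plain pr_*() calls already carry the module name, current pid and calling function, which is what lets the old DPRINTK()/AUTOFS_WARN()/AUTOFS_ERROR() wrappers go away. A minimal illustrative call site (the function name and message are invented):

/*
 * Illustrative only: with this pr_fmt, plain pr_*() calls already carry the
 * module name, current pid and function.  KBUILD_MODNAME is supplied by the
 * kernel build system.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__

#include <linux/printk.h>
#include <linux/sched.h>	/* current */

static void example_call_site(void)
{
	/* Prints e.g. "autofs4:pid:1234:example_call_site: mount pending" */
	pr_warn("mount pending\n");
}
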
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
72{ 72{
73 int err = 0; 73 int err = 0;
74 74
75 if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || 75 if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
76 (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { 76 (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
77 AUTOFS_WARN("ioctl control interface version mismatch: " 77 pr_warn("ioctl control interface version mismatch: "
78 "kernel(%u.%u), user(%u.%u), cmd(%d)", 78 "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
79 AUTOFS_DEV_IOCTL_VERSION_MAJOR, 79 AUTOFS_DEV_IOCTL_VERSION_MAJOR,
80 AUTOFS_DEV_IOCTL_VERSION_MINOR, 80 AUTOFS_DEV_IOCTL_VERSION_MINOR,
81 param->ver_major, param->ver_minor, cmd); 81 param->ver_major, param->ver_minor, cmd);
82 err = -EINVAL; 82 err = -EINVAL;
83 } 83 }
84 84
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
93 * Copy parameter control struct, including a possible path allocated 93 * Copy parameter control struct, including a possible path allocated
94 * at the end of the struct. 94 * at the end of the struct.
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *
97 copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 98{
98 struct autofs_dev_ioctl tmp, *res; 99 struct autofs_dev_ioctl tmp, *res;
99 100
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
116static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 117static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
117{ 118{
118 kfree(param); 119 kfree(param);
119 return;
120} 120}
121 121
122/* 122/*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
129 129
130 err = check_dev_ioctl_version(cmd, param); 130 err = check_dev_ioctl_version(cmd, param);
131 if (err) { 131 if (err) {
132 AUTOFS_WARN("invalid device control module version " 132 pr_warn("invalid device control module version "
133 "supplied for cmd(0x%08x)", cmd); 133 "supplied for cmd(0x%08x)\n", cmd);
134 goto out; 134 goto out;
135 } 135 }
136 136
137 if (param->size > sizeof(*param)) { 137 if (param->size > sizeof(*param)) {
138 err = invalid_str(param->path, param->size - sizeof(*param)); 138 err = invalid_str(param->path, param->size - sizeof(*param));
139 if (err) { 139 if (err) {
140 AUTOFS_WARN( 140 pr_warn(
141 "path string terminator missing for cmd(0x%08x)", 141 "path string terminator missing for cmd(0x%08x)\n",
142 cmd); 142 cmd);
143 goto out; 143 goto out;
144 } 144 }
145 145
146 err = check_name(param->path); 146 err = check_name(param->path);
147 if (err) { 147 if (err) {
148 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", 148 pr_warn("invalid path supplied for cmd(0x%08x)\n",
149 cmd); 149 cmd);
150 goto out; 150 goto out;
151 } 151 }
152 } 152 }
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
197 void *data) 197 void *data)
198{ 198{
199 struct path path; 199 struct path path;
200 int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); 200 int err;
201
202 err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
201 if (err) 203 if (err)
202 return err; 204 return err;
203 err = -ENOENT; 205 err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
225static int test_by_type(struct path *path, void *p) 227static int test_by_type(struct path *path, void *p)
226{ 228{
227 struct autofs_info *ino = autofs4_dentry_ino(path->dentry); 229 struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
230
228 return ino && ino->sbi->type & *(unsigned *)p; 231 return ino && ino->sbi->type & *(unsigned *)p;
229} 232}
230 233
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
370 new_pid = get_task_pid(current, PIDTYPE_PGID); 373 new_pid = get_task_pid(current, PIDTYPE_PGID);
371 374
372 if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { 375 if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
373 AUTOFS_WARN("Not allowed to change PID namespace"); 376 pr_warn("not allowed to change PID namespace\n");
374 err = -EINVAL; 377 err = -EINVAL;
375 goto out; 378 goto out;
376 } 379 }
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
456 err = 0; 459 err = 0;
457 autofs4_expire_wait(path.dentry, 0); 460 autofs4_expire_wait(path.dentry, 0);
458 spin_lock(&sbi->fs_lock); 461 spin_lock(&sbi->fs_lock);
459 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); 462 param->requester.uid =
460 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); 463 from_kuid_munged(current_user_ns(), ino->uid);
464 param->requester.gid =
465 from_kgid_munged(current_user_ns(), ino->gid);
461 spin_unlock(&sbi->fs_lock); 466 spin_unlock(&sbi->fs_lock);
462 } 467 }
463 path_put(&path); 468 path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
619} 624}
620 625
621/* ioctl dispatcher */ 626/* ioctl dispatcher */
622static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) 627static int _autofs_dev_ioctl(unsigned int command,
628 struct autofs_dev_ioctl __user *user)
623{ 629{
624 struct autofs_dev_ioctl *param; 630 struct autofs_dev_ioctl *param;
625 struct file *fp; 631 struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
655 661
656 fn = lookup_dev_ioctl(cmd); 662 fn = lookup_dev_ioctl(cmd);
657 if (!fn) { 663 if (!fn) {
658 AUTOFS_WARN("unknown command 0x%08x", command); 664 pr_warn("unknown command 0x%08x\n", command);
659 return -ENOTTY; 665 return -ENOTTY;
660 } 666 }
661 667
@@ -711,6 +717,7 @@ out:
711static long autofs_dev_ioctl(struct file *file, uint command, ulong u) 717static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
712{ 718{
713 int err; 719 int err;
720
714 err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); 721 err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
715 return (long) err; 722 return (long) err;
716} 723}
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
733 740
734static struct miscdevice _autofs_dev_ioctl_misc = { 741static struct miscdevice _autofs_dev_ioctl_misc = {
735 .minor = AUTOFS_MINOR, 742 .minor = AUTOFS_MINOR,
736 .name = AUTOFS_DEVICE_NAME, 743 .name = AUTOFS_DEVICE_NAME,
737 .fops = &_dev_ioctl_fops 744 .fops = &_dev_ioctl_fops
738}; 745};
739 746
740MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); 747MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
747 754
748 r = misc_register(&_autofs_dev_ioctl_misc); 755 r = misc_register(&_autofs_dev_ioctl_misc);
749 if (r) { 756 if (r) {
750 AUTOFS_ERROR("misc_register failed for control device"); 757 pr_err("misc_register failed for control device\n");
751 return r; 758 return r;
752 } 759 }
753 760
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
757void autofs_dev_ioctl_exit(void) 764void autofs_dev_ioctl_exit(void)
758{ 765{
759 misc_deregister(&_autofs_dev_ioctl_misc); 766 misc_deregister(&_autofs_dev_ioctl_misc);
760 return;
761} 767}
762
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/expire.c 3 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
4 * 4 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
7 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
8 * 5 *
9 * This file is part of the Linux kernel and is made available under 6 * This file is part of the Linux kernel and is made available under
10 * the terms of the GNU General Public License, version 2, or at your 7 * the terms of the GNU General Public License, version 2, or at your
11 * option, any later version, incorporated herein by reference. 8 * option, any later version, incorporated herein by reference.
12 * 9 */
13 * ------------------------------------------------------------------------- */
14 10
15#include "autofs_i.h" 11#include "autofs_i.h"
16 12
@@ -18,7 +14,7 @@ static unsigned long now;
18 14
19/* Check if a dentry can be expired */ 15/* Check if a dentry can be expired */
20static inline int autofs4_can_expire(struct dentry *dentry, 16static inline int autofs4_can_expire(struct dentry *dentry,
21 unsigned long timeout, int do_now) 17 unsigned long timeout, int do_now)
22{ 18{
23 struct autofs_info *ino = autofs4_dentry_ino(dentry); 19 struct autofs_info *ino = autofs4_dentry_ino(dentry);
24 20
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
41 struct path path = {.mnt = mnt, .dentry = dentry}; 37 struct path path = {.mnt = mnt, .dentry = dentry};
42 int status = 1; 38 int status = 1;
43 39
44 DPRINTK("dentry %p %pd", dentry, dentry); 40 pr_debug("dentry %p %pd\n", dentry, dentry);
45 41
46 path_get(&path); 42 path_get(&path);
47 43
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
58 54
59 /* Update the expiry counter if fs is busy */ 55 /* Update the expiry counter if fs is busy */
60 if (!may_umount_tree(path.mnt)) { 56 if (!may_umount_tree(path.mnt)) {
61 struct autofs_info *ino = autofs4_dentry_ino(top); 57 struct autofs_info *ino;
58
59 ino = autofs4_dentry_ino(top);
62 ino->last_used = jiffies; 60 ino->last_used = jiffies;
63 goto done; 61 goto done;
64 } 62 }
65 63
66 status = 0; 64 status = 0;
67done: 65done:
68 DPRINTK("returning = %d", status); 66 pr_debug("returning = %d\n", status);
69 path_put(&path); 67 path_put(&path);
70 return status; 68 return status;
71} 69}
@@ -74,7 +72,7 @@ done:
74 * Calculate and dget next entry in the subdirs list under root. 72 * Calculate and dget next entry in the subdirs list under root.
75 */ 73 */
76static struct dentry *get_next_positive_subdir(struct dentry *prev, 74static struct dentry *get_next_positive_subdir(struct dentry *prev,
77 struct dentry *root) 75 struct dentry *root)
78{ 76{
79 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); 77 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
80 struct list_head *next; 78 struct list_head *next;
@@ -121,7 +119,7 @@ cont:
121 * Calculate and dget next entry in top down tree traversal. 119 * Calculate and dget next entry in top down tree traversal.
122 */ 120 */
123static struct dentry *get_next_positive_dentry(struct dentry *prev, 121static struct dentry *get_next_positive_dentry(struct dentry *prev,
124 struct dentry *root) 122 struct dentry *root)
125{ 123{
126 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); 124 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
127 struct list_head *next; 125 struct list_head *next;
@@ -187,15 +185,17 @@ again:
187 * autofs submounts. 185 * autofs submounts.
188 */ 186 */
189static int autofs4_direct_busy(struct vfsmount *mnt, 187static int autofs4_direct_busy(struct vfsmount *mnt,
190 struct dentry *top, 188 struct dentry *top,
191 unsigned long timeout, 189 unsigned long timeout,
192 int do_now) 190 int do_now)
193{ 191{
194 DPRINTK("top %p %pd", top, top); 192 pr_debug("top %p %pd\n", top, top);
195 193
196 /* If it's busy update the expiry counters */ 194 /* If it's busy update the expiry counters */
197 if (!may_umount_tree(mnt)) { 195 if (!may_umount_tree(mnt)) {
198 struct autofs_info *ino = autofs4_dentry_ino(top); 196 struct autofs_info *ino;
197
198 ino = autofs4_dentry_ino(top);
199 if (ino) 199 if (ino)
200 ino->last_used = jiffies; 200 ino->last_used = jiffies;
201 return 1; 201 return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
208 return 0; 208 return 0;
209} 209}
210 210
211/* Check a directory tree of mount points for busyness 211/*
212 * Check a directory tree of mount points for busyness
212 * The tree is not busy iff no mountpoints are busy 213 * The tree is not busy iff no mountpoints are busy
213 */ 214 */
214static int autofs4_tree_busy(struct vfsmount *mnt, 215static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
219 struct autofs_info *top_ino = autofs4_dentry_ino(top); 220 struct autofs_info *top_ino = autofs4_dentry_ino(top);
220 struct dentry *p; 221 struct dentry *p;
221 222
222 DPRINTK("top %p %pd", top, top); 223 pr_debug("top %p %pd\n", top, top);
223 224
224 /* Negative dentry - give up */ 225 /* Negative dentry - give up */
225 if (!simple_positive(top)) 226 if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
227 228
228 p = NULL; 229 p = NULL;
229 while ((p = get_next_positive_dentry(p, top))) { 230 while ((p = get_next_positive_dentry(p, top))) {
230 DPRINTK("dentry %p %pd", p, p); 231 pr_debug("dentry %p %pd\n", p, p);
231 232
232 /* 233 /*
233 * Is someone visiting anywhere in the subtree ? 234 * Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
273{ 274{
274 struct dentry *p; 275 struct dentry *p;
275 276
276 DPRINTK("parent %p %pd", parent, parent); 277 pr_debug("parent %p %pd\n", parent, parent);
277 278
278 p = NULL; 279 p = NULL;
279 while ((p = get_next_positive_dentry(p, parent))) { 280 while ((p = get_next_positive_dentry(p, parent))) {
280 DPRINTK("dentry %p %pd", p, p); 281 pr_debug("dentry %p %pd\n", p, p);
281 282
282 if (d_mountpoint(p)) { 283 if (d_mountpoint(p)) {
283 /* Can we umount this guy */ 284 /* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
362 * offset (autofs-5.0+). 363 * offset (autofs-5.0+).
363 */ 364 */
364 if (d_mountpoint(dentry)) { 365 if (d_mountpoint(dentry)) {
365 DPRINTK("checking mountpoint %p %pd", dentry, dentry); 366 pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
366 367
367 /* Can we umount this guy */ 368 /* Can we umount this guy */
368 if (autofs4_mount_busy(mnt, dentry)) 369 if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
375 } 376 }
376 377
377 if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { 378 if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
378 DPRINTK("checking symlink %p %pd", dentry, dentry); 379 pr_debug("checking symlink %p %pd\n", dentry, dentry);
379 /* 380 /*
380 * A symlink can't be "busy" in the usual sense so 381 * A symlink can't be "busy" in the usual sense so
381 * just check last used for expire timeout. 382 * just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
404 } else { 405 } else {
405 /* Path walk currently on this dentry? */ 406 /* Path walk currently on this dentry? */
406 struct dentry *expired; 407 struct dentry *expired;
408
407 ino_count = atomic_read(&ino->count) + 1; 409 ino_count = atomic_read(&ino->count) + 1;
408 if (d_count(dentry) > ino_count) 410 if (d_count(dentry) > ino_count)
409 return NULL; 411 return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
471 return NULL; 473 return NULL;
472 474
473found: 475found:
474 DPRINTK("returning %p %pd", expired, expired); 476 pr_debug("returning %p %pd\n", expired, expired);
475 ino->flags |= AUTOFS_INF_EXPIRING; 477 ino->flags |= AUTOFS_INF_EXPIRING;
476 smp_mb(); 478 smp_mb();
477 ino->flags &= ~AUTOFS_INF_NO_RCU; 479 ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
503 if (ino->flags & AUTOFS_INF_EXPIRING) { 505 if (ino->flags & AUTOFS_INF_EXPIRING) {
504 spin_unlock(&sbi->fs_lock); 506 spin_unlock(&sbi->fs_lock);
505 507
506 DPRINTK("waiting for expire %p name=%pd", dentry, dentry); 508 pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
507 509
508 status = autofs4_wait(sbi, dentry, NFY_NONE); 510 status = autofs4_wait(sbi, dentry, NFY_NONE);
509 wait_for_completion(&ino->expire_complete); 511 wait_for_completion(&ino->expire_complete);
510 512
511 DPRINTK("expire done status=%d", status); 513 pr_debug("expire done status=%d\n", status);
512 514
513 if (d_unhashed(dentry)) 515 if (d_unhashed(dentry))
514 return -EAGAIN; 516 return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
522 524
523/* Perform an expiry operation */ 525/* Perform an expiry operation */
524int autofs4_expire_run(struct super_block *sb, 526int autofs4_expire_run(struct super_block *sb,
525 struct vfsmount *mnt, 527 struct vfsmount *mnt,
526 struct autofs_sb_info *sbi, 528 struct autofs_sb_info *sbi,
527 struct autofs_packet_expire __user *pkt_p) 529 struct autofs_packet_expire __user *pkt_p)
528{ 530{
529 struct autofs_packet_expire pkt; 531 struct autofs_packet_expire pkt;
530 struct autofs_info *ino; 532 struct autofs_info *ino;
531 struct dentry *dentry; 533 struct dentry *dentry;
532 int ret = 0; 534 int ret = 0;
533 535
534 memset(&pkt,0,sizeof pkt); 536 memset(&pkt, 0, sizeof(pkt));
535 537
536 pkt.hdr.proto_version = sbi->version; 538 pkt.hdr.proto_version = sbi->version;
537 pkt.hdr.type = autofs_ptype_expire; 539 pkt.hdr.type = autofs_ptype_expire;
538 540
539 if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) 541 dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
542 if (!dentry)
540 return -EAGAIN; 543 return -EAGAIN;
541 544
542 pkt.len = dentry->d_name.len; 545 pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
544 pkt.name[pkt.len] = '\0'; 547 pkt.name[pkt.len] = '\0';
545 dput(dentry); 548 dput(dentry);
546 549
547 if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) 550 if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
548 ret = -EFAULT; 551 ret = -EFAULT;
549 552
550 spin_lock(&sbi->fs_lock); 553 spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
573 struct autofs_info *ino = autofs4_dentry_ino(dentry); 576 struct autofs_info *ino = autofs4_dentry_ino(dentry);
574 577
575 /* This is synchronous because it makes the daemon a 578 /* This is synchronous because it makes the daemon a
576 little easier */ 579 * little easier
580 */
577 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 581 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
578 582
579 spin_lock(&sbi->fs_lock); 583 spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
588 return ret; 592 return ret;
589} 593}
590 594
591/* Call repeatedly until it returns -EAGAIN, meaning there's nothing 595/*
592 more to be done */ 596 * Call repeatedly until it returns -EAGAIN, meaning there's nothing
597 * more to be done.
598 */
593int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, 599int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
594 struct autofs_sb_info *sbi, int __user *arg) 600 struct autofs_sb_info *sbi, int __user *arg)
595{ 601{
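
The dominant change in expire.c is mechanical: every DPRINTK() call becomes pr_debug() with an explicit trailing newline, and the subsystem prefix moves out of each message into a central pr_fmt() definition. A minimal userspace analogue of that pairing is sketched below; the real macros live in the kernel's printk.h and the autofs headers, and the prefix string used here is only an assumption for illustration.

#include <stdio.h>

/* Userspace stand-ins for the kernel's pr_fmt()/pr_debug() pairing. */
#define pr_fmt(fmt) "autofs4: " fmt                 /* assumed prefix */
#define pr_debug(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	int pipefd = 42;                             /* placeholder value */

	/* Callers now pass a bare format string plus "\n"; the subsystem
	 * prefix is supplied centrally by pr_fmt(), matching the
	 * DPRINTK() -> pr_debug() conversion in the hunks above. */
	pr_debug("pipe fd = %d\n", pipefd);
	return 0;
}
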
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/init.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your 5 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference. 6 * option, any later version, incorporated herein by reference.
10 * 7 */
11 * ------------------------------------------------------------------------- */
12 8
13#include <linux/module.h> 9#include <linux/module.h>
14#include <linux/init.h> 10#include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/inode.c 3 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
7 * 4 *
8 * This file is part of the Linux kernel and is made available under 5 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 6 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 7 * option, any later version, incorporated herein by reference.
11 * 8 */
12 * ------------------------------------------------------------------------- */
13 9
14#include <linux/kernel.h> 10#include <linux/kernel.h>
15#include <linux/slab.h> 11#include <linux/slab.h>
@@ -24,7 +20,9 @@
24 20
25struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) 21struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
26{ 22{
27 struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); 23 struct autofs_info *ino;
24
25 ino = kzalloc(sizeof(*ino), GFP_KERNEL);
28 if (ino) { 26 if (ino) {
29 INIT_LIST_HEAD(&ino->active); 27 INIT_LIST_HEAD(&ino->active);
30 INIT_LIST_HEAD(&ino->expiring); 28 INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
62 put_pid(sbi->oz_pgrp); 60 put_pid(sbi->oz_pgrp);
63 } 61 }
64 62
65 DPRINTK("shutting down"); 63 pr_debug("shutting down\n");
66 kill_litter_super(sb); 64 kill_litter_super(sb);
67 if (sbi) 65 if (sbi)
68 kfree_rcu(sbi, rcu); 66 kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
94 seq_printf(m, ",direct"); 92 seq_printf(m, ",direct");
95 else 93 else
96 seq_printf(m, ",indirect"); 94 seq_printf(m, ",indirect");
97 95#ifdef CONFIG_CHECKPOINT_RESTORE
96 if (sbi->pipe)
97 seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
98 else
99 seq_printf(m, ",pipe_ino=-1");
100#endif
98 return 0; 101 return 0;
99} 102}
100 103
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
147 150
148 while ((p = strsep(&options, ",")) != NULL) { 151 while ((p = strsep(&options, ",")) != NULL) {
149 int token; 152 int token;
153
150 if (!*p) 154 if (!*p)
151 continue; 155 continue;
152 156
@@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
204 208
205int autofs4_fill_super(struct super_block *s, void *data, int silent) 209int autofs4_fill_super(struct super_block *s, void *data, int silent)
206{ 210{
207 struct inode * root_inode; 211 struct inode *root_inode;
208 struct dentry * root; 212 struct dentry *root;
209 struct file * pipe; 213 struct file *pipe;
210 int pipefd; 214 int pipefd;
211 struct autofs_sb_info *sbi; 215 struct autofs_sb_info *sbi;
212 struct autofs_info *ino; 216 struct autofs_info *ino;
@@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
217 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 221 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
218 if (!sbi) 222 if (!sbi)
219 return -ENOMEM; 223 return -ENOMEM;
220 DPRINTK("starting up, sbi = %p",sbi); 224 pr_debug("starting up, sbi = %p\n", sbi);
221 225
222 s->s_fs_info = sbi; 226 s->s_fs_info = sbi;
223 sbi->magic = AUTOFS_SBI_MAGIC; 227 sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
266 if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, 270 if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
267 &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, 271 &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
268 &sbi->max_proto)) { 272 &sbi->max_proto)) {
269 printk("autofs: called with bogus options\n"); 273 pr_err("called with bogus options\n");
270 goto fail_dput; 274 goto fail_dput;
271 } 275 }
272 276
273 if (pgrp_set) { 277 if (pgrp_set) {
274 sbi->oz_pgrp = find_get_pid(pgrp); 278 sbi->oz_pgrp = find_get_pid(pgrp);
275 if (!sbi->oz_pgrp) { 279 if (!sbi->oz_pgrp) {
276 pr_warn("autofs: could not find process group %d\n", 280 pr_err("could not find process group %d\n",
277 pgrp); 281 pgrp);
278 goto fail_dput; 282 goto fail_dput;
279 } 283 }
@@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
290 /* Couldn't this be tested earlier? */ 294 /* Couldn't this be tested earlier? */
291 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || 295 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
292 sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { 296 sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
293 printk("autofs: kernel does not match daemon version " 297 pr_err("kernel does not match daemon version "
294 "daemon (%d, %d) kernel (%d, %d)\n", 298 "daemon (%d, %d) kernel (%d, %d)\n",
295 sbi->min_proto, sbi->max_proto, 299 sbi->min_proto, sbi->max_proto,
296 AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); 300 AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
297 goto fail_dput; 301 goto fail_dput;
298 } 302 }
299 303
@@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
304 sbi->version = sbi->max_proto; 308 sbi->version = sbi->max_proto;
305 sbi->sub_version = AUTOFS_PROTO_SUBVERSION; 309 sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
306 310
307 DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); 311 pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
308 pipe = fget(pipefd); 312 pipe = fget(pipefd);
309 313
310 if (!pipe) { 314 if (!pipe) {
311 printk("autofs: could not open pipe file descriptor\n"); 315 pr_err("could not open pipe file descriptor\n");
312 goto fail_dput; 316 goto fail_dput;
313 } 317 }
314 ret = autofs_prepare_pipe(pipe); 318 ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
323 */ 327 */
324 s->s_root = root; 328 s->s_root = root;
325 return 0; 329 return 0;
326 330
327 /* 331 /*
328 * Failure ... clean up. 332 * Failure ... clean up.
329 */ 333 */
330fail_fput: 334fail_fput:
331 printk("autofs: pipe file descriptor does not contain proper ops\n"); 335 pr_err("pipe file descriptor does not contain proper ops\n");
332 fput(pipe); 336 fput(pipe);
333 /* fall through */ 337 /* fall through */
334fail_dput: 338fail_dput:
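
The functional change in inode.c is the new ",pipe_ino=" entry emitted by autofs4_show_options() under CONFIG_CHECKPOINT_RESTORE. A consumer such as a checkpoint/restore tool would read that value back out of the mount options; the small parser below is an illustrative sketch, not part of the patch, and the surrounding option names in the sample string are assumed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pull the pipe inode number out of an autofs mount-options string. */
static long parse_pipe_ino(const char *opts)
{
	const char *p = strstr(opts, "pipe_ino=");

	if (!p)
		return -1;                           /* option not present */
	return strtol(p + strlen("pipe_ino="), NULL, 10);
}

int main(void)
{
	/* Example options string in the shape emitted by
	 * autofs4_show_options() after this patch. */
	const char *opts = "fd=5,pgrp=123,timeout=300,minproto=5,"
			   "maxproto=5,indirect,pipe_ino=98765";

	printf("pipe inode: %ld\n", parse_pipe_ino(opts));
	return 0;
}
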
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..9328b5861c7a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/root.c 3 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
4 * 4 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
7 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
8 * 5 *
9 * This file is part of the Linux kernel and is made available under 6 * This file is part of the Linux kernel and is made available under
10 * the terms of the GNU General Public License, version 2, or at your 7 * the terms of the GNU General Public License, version 2, or at your
11 * option, any later version, incorporated herein by reference. 8 * option, any later version, incorporated herein by reference.
12 * 9 */
13 * ------------------------------------------------------------------------- */
14 10
15#include <linux/capability.h> 11#include <linux/capability.h>
16#include <linux/errno.h> 12#include <linux/errno.h>
@@ -23,16 +19,18 @@
23 19
24#include "autofs_i.h" 20#include "autofs_i.h"
25 21
26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 22static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
27static int autofs4_dir_unlink(struct inode *,struct dentry *); 23static int autofs4_dir_unlink(struct inode *, struct dentry *);
28static int autofs4_dir_rmdir(struct inode *,struct dentry *); 24static int autofs4_dir_rmdir(struct inode *, struct dentry *);
29static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); 25static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); 26static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
31#ifdef CONFIG_COMPAT 27#ifdef CONFIG_COMPAT
32static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); 28static long autofs4_root_compat_ioctl(struct file *,
29 unsigned int, unsigned long);
33#endif 30#endif
34static int autofs4_dir_open(struct inode *inode, struct file *file); 31static int autofs4_dir_open(struct inode *inode, struct file *file);
35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); 32static struct dentry *autofs4_lookup(struct inode *,
33 struct dentry *, unsigned int);
36static struct vfsmount *autofs4_d_automount(struct path *); 34static struct vfsmount *autofs4_d_automount(struct path *);
37static int autofs4_d_manage(struct dentry *, bool); 35static int autofs4_d_manage(struct dentry *, bool);
38static void autofs4_dentry_release(struct dentry *); 36static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
74static void autofs4_add_active(struct dentry *dentry) 72static void autofs4_add_active(struct dentry *dentry)
75{ 73{
76 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 74 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
77 struct autofs_info *ino = autofs4_dentry_ino(dentry); 75 struct autofs_info *ino;
76
77 ino = autofs4_dentry_ino(dentry);
78 if (ino) { 78 if (ino) {
79 spin_lock(&sbi->lookup_lock); 79 spin_lock(&sbi->lookup_lock);
80 if (!ino->active_count) { 80 if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
84 ino->active_count++; 84 ino->active_count++;
85 spin_unlock(&sbi->lookup_lock); 85 spin_unlock(&sbi->lookup_lock);
86 } 86 }
87 return;
88} 87}
89 88
90static void autofs4_del_active(struct dentry *dentry) 89static void autofs4_del_active(struct dentry *dentry)
91{ 90{
92 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 91 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
93 struct autofs_info *ino = autofs4_dentry_ino(dentry); 92 struct autofs_info *ino;
93
94 ino = autofs4_dentry_ino(dentry);
94 if (ino) { 95 if (ino) {
95 spin_lock(&sbi->lookup_lock); 96 spin_lock(&sbi->lookup_lock);
96 ino->active_count--; 97 ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
100 } 101 }
101 spin_unlock(&sbi->lookup_lock); 102 spin_unlock(&sbi->lookup_lock);
102 } 103 }
103 return;
104} 104}
105 105
106static int autofs4_dir_open(struct inode *inode, struct file *file) 106static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
108 struct dentry *dentry = file->f_path.dentry; 108 struct dentry *dentry = file->f_path.dentry;
109 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 109 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
110 110
111 DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); 111 pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
112 112
113 if (autofs4_oz_mode(sbi)) 113 if (autofs4_oz_mode(sbi))
114 goto out; 114 goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
138 struct autofs_info *ino = autofs4_dentry_ino(de); 138 struct autofs_info *ino = autofs4_dentry_ino(de);
139 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); 139 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
140 140
141 DPRINTK("releasing %p", de); 141 pr_debug("releasing %p\n", de);
142 142
143 if (!ino) 143 if (!ino)
144 return; 144 return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
278 if (ino->flags & AUTOFS_INF_PENDING) { 278 if (ino->flags & AUTOFS_INF_PENDING) {
279 if (rcu_walk) 279 if (rcu_walk)
280 return -ECHILD; 280 return -ECHILD;
281 DPRINTK("waiting for mount name=%pd", dentry); 281 pr_debug("waiting for mount name=%pd\n", dentry);
282 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 282 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
283 DPRINTK("mount wait done status=%d", status); 283 pr_debug("mount wait done status=%d\n", status);
284 } 284 }
285 ino->last_used = jiffies; 285 ino->last_used = jiffies;
286 return status; 286 return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent; 321 struct dentry *parent = dentry->d_parent;
322 struct autofs_info *ino; 322 struct autofs_info *ino;
323 struct dentry *new = d_lookup(parent, &dentry->d_name); 323 struct dentry *new;
324
325 new = d_lookup(parent, &dentry->d_name);
324 if (!new) 326 if (!new)
325 return NULL; 327 return NULL;
326 ino = autofs4_dentry_ino(new); 328 ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
338 struct autofs_info *ino = autofs4_dentry_ino(dentry); 340 struct autofs_info *ino = autofs4_dentry_ino(dentry);
339 int status; 341 int status;
340 342
341 DPRINTK("dentry=%p %pd", dentry, dentry); 343 pr_debug("dentry=%p %pd\n", dentry, dentry);
342 344
343 /* The daemon never triggers a mount. */ 345 /* The daemon never triggers a mount. */
344 if (autofs4_oz_mode(sbi)) 346 if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
425 struct autofs_info *ino = autofs4_dentry_ino(dentry); 427 struct autofs_info *ino = autofs4_dentry_ino(dentry);
426 int status; 428 int status;
427 429
428 DPRINTK("dentry=%p %pd", dentry, dentry); 430 pr_debug("dentry=%p %pd\n", dentry, dentry);
429 431
430 /* The daemon never waits. */ 432 /* The daemon never waits. */
431 if (autofs4_oz_mode(sbi)) { 433 if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
455 * a mount-trap. 457 * a mount-trap.
456 */ 458 */
457 struct inode *inode; 459 struct inode *inode;
460
458 if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) 461 if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
459 return 0; 462 return 0;
460 if (d_mountpoint(dentry)) 463 if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
494} 497}
495 498
496/* Lookups in the root directory */ 499/* Lookups in the root directory */
497static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 500static struct dentry *autofs4_lookup(struct inode *dir,
501 struct dentry *dentry, unsigned int flags)
498{ 502{
499 struct autofs_sb_info *sbi; 503 struct autofs_sb_info *sbi;
500 struct autofs_info *ino; 504 struct autofs_info *ino;
501 struct dentry *active; 505 struct dentry *active;
502 506
503 DPRINTK("name = %pd", dentry); 507 pr_debug("name = %pd\n", dentry);
504 508
505 /* File name too long to exist */ 509 /* File name too long to exist */
506 if (dentry->d_name.len > NAME_MAX) 510 if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
508 512
509 sbi = autofs4_sbi(dir->i_sb); 513 sbi = autofs4_sbi(dir->i_sb);
510 514
511 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 515 pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
512 current->pid, task_pgrp_nr(current), sbi->catatonic, 516 current->pid, task_pgrp_nr(current), sbi->catatonic,
513 autofs4_oz_mode(sbi)); 517 autofs4_oz_mode(sbi));
514 518
515 active = autofs4_lookup_active(dentry); 519 active = autofs4_lookup_active(dentry);
516 if (active) { 520 if (active)
517 return active; 521 return active;
518 } else { 522 else {
519 /* 523 /*
520 * A dentry that is not within the root can never trigger a 524 * A dentry that is not within the root can never trigger a
521 * mount operation, unless the directory already exists, so we 525 * mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
526 return ERR_PTR(-ENOENT); 530 return ERR_PTR(-ENOENT);
527 531
528 /* Mark entries in the root as mount triggers */ 532 /* Mark entries in the root as mount triggers */
529 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) 533 if (IS_ROOT(dentry->d_parent) &&
534 autofs_type_indirect(sbi->type))
530 __managed_dentry_set_managed(dentry); 535 __managed_dentry_set_managed(dentry);
531 536
532 ino = autofs4_new_ino(sbi); 537 ino = autofs4_new_ino(sbi);
@@ -554,7 +559,7 @@ static int autofs4_dir_symlink(struct inode *dir,
554 size_t size = strlen(symname); 559 size_t size = strlen(symname);
555 char *cp; 560 char *cp;
556 561
557 DPRINTK("%s <- %pd", symname, dentry); 562 pr_debug("%s <- %pd\n", symname, dentry);
558 563
559 if (!autofs4_oz_mode(sbi)) 564 if (!autofs4_oz_mode(sbi))
560 return -EACCES; 565 return -EACCES;
@@ -613,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
613 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 618 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
614 struct autofs_info *ino = autofs4_dentry_ino(dentry); 619 struct autofs_info *ino = autofs4_dentry_ino(dentry);
615 struct autofs_info *p_ino; 620 struct autofs_info *p_ino;
616 621
617 /* This allows root to remove symlinks */ 622 /* This allows root to remove symlinks */
618 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 623 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
619 return -EPERM; 624 return -EPERM;
@@ -664,7 +669,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
664 if (IS_ROOT(parent->d_parent)) 669 if (IS_ROOT(parent->d_parent))
665 return; 670 return;
666 managed_dentry_clear_managed(parent); 671 managed_dentry_clear_managed(parent);
667 return;
668} 672}
669 673
670static void autofs_clear_leaf_automount_flags(struct dentry *dentry) 674static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +691,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
687 if (d_child->next == &parent->d_subdirs && 691 if (d_child->next == &parent->d_subdirs &&
688 d_child->prev == &parent->d_subdirs) 692 d_child->prev == &parent->d_subdirs)
689 managed_dentry_set_managed(parent); 693 managed_dentry_set_managed(parent);
690 return;
691} 694}
692 695
693static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 696static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +698,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
695 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 698 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
696 struct autofs_info *ino = autofs4_dentry_ino(dentry); 699 struct autofs_info *ino = autofs4_dentry_ino(dentry);
697 struct autofs_info *p_ino; 700 struct autofs_info *p_ino;
698 701
699 DPRINTK("dentry %p, removing %pd", dentry, dentry); 702 pr_debug("dentry %p, removing %pd\n", dentry, dentry);
700 703
701 if (!autofs4_oz_mode(sbi)) 704 if (!autofs4_oz_mode(sbi))
702 return -EACCES; 705 return -EACCES;
@@ -728,7 +731,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
728 return 0; 731 return 0;
729} 732}
730 733
731static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 734static int autofs4_dir_mkdir(struct inode *dir,
735 struct dentry *dentry, umode_t mode)
732{ 736{
733 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 737 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
734 struct autofs_info *ino = autofs4_dentry_ino(dentry); 738 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +742,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
738 if (!autofs4_oz_mode(sbi)) 742 if (!autofs4_oz_mode(sbi))
739 return -EACCES; 743 return -EACCES;
740 744
741 DPRINTK("dentry %p, creating %pd", dentry, dentry); 745 pr_debug("dentry %p, creating %pd\n", dentry, dentry);
742 746
743 BUG_ON(!ino); 747 BUG_ON(!ino);
744 748
@@ -768,14 +772,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
768/* Get/set timeout ioctl() operation */ 772/* Get/set timeout ioctl() operation */
769#ifdef CONFIG_COMPAT 773#ifdef CONFIG_COMPAT
770static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, 774static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
771 compat_ulong_t __user *p) 775 compat_ulong_t __user *p)
772{ 776{
773 int rv;
774 unsigned long ntimeout; 777 unsigned long ntimeout;
778 int rv;
775 779
776 if ((rv = get_user(ntimeout, p)) || 780 rv = get_user(ntimeout, p);
777 (rv = put_user(sbi->exp_timeout/HZ, p))) 781 if (rv)
778 return rv; 782 goto error;
783
784 rv = put_user(sbi->exp_timeout/HZ, p);
785 if (rv)
786 goto error;
779 787
780 if (ntimeout > UINT_MAX/HZ) 788 if (ntimeout > UINT_MAX/HZ)
781 sbi->exp_timeout = 0; 789 sbi->exp_timeout = 0;
@@ -783,18 +791,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
783 sbi->exp_timeout = ntimeout * HZ; 791 sbi->exp_timeout = ntimeout * HZ;
784 792
785 return 0; 793 return 0;
794error:
795 return rv;
786} 796}
787#endif 797#endif
788 798
789static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, 799static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
790 unsigned long __user *p) 800 unsigned long __user *p)
791{ 801{
792 int rv;
793 unsigned long ntimeout; 802 unsigned long ntimeout;
803 int rv;
794 804
795 if ((rv = get_user(ntimeout, p)) || 805 rv = get_user(ntimeout, p);
796 (rv = put_user(sbi->exp_timeout/HZ, p))) 806 if (rv)
797 return rv; 807 goto error;
808
809 rv = put_user(sbi->exp_timeout/HZ, p);
810 if (rv)
811 goto error;
798 812
799 if (ntimeout > ULONG_MAX/HZ) 813 if (ntimeout > ULONG_MAX/HZ)
800 sbi->exp_timeout = 0; 814 sbi->exp_timeout = 0;
@@ -802,16 +816,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
802 sbi->exp_timeout = ntimeout * HZ; 816 sbi->exp_timeout = ntimeout * HZ;
803 817
804 return 0; 818 return 0;
819error:
820 return rv;
805} 821}
806 822
807/* Return protocol version */ 823/* Return protocol version */
808static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) 824static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
825 int __user *p)
809{ 826{
810 return put_user(sbi->version, p); 827 return put_user(sbi->version, p);
811} 828}
812 829
813/* Return protocol sub version */ 830/* Return protocol sub version */
814static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) 831static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
832 int __user *p)
815{ 833{
816 return put_user(sbi->sub_version, p); 834 return put_user(sbi->sub_version, p);
817} 835}
@@ -826,7 +844,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
826 if (may_umount(mnt)) 844 if (may_umount(mnt))
827 status = 1; 845 status = 1;
828 846
829 DPRINTK("returning %d", status); 847 pr_debug("returning %d\n", status);
830 848
831 status = put_user(status, p); 849 status = put_user(status, p);
832 850
@@ -834,9 +852,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
834} 852}
835 853
836/* Identify autofs4_dentries - this is so we can tell if there's 854/* Identify autofs4_dentries - this is so we can tell if there's
837 an extra dentry refcount or not. We only hold a refcount on the 855 * an extra dentry refcount or not. We only hold a refcount on the
838 dentry if its non-negative (ie, d_inode != NULL) 856 * dentry if its non-negative (ie, d_inode != NULL)
839*/ 857 */
840int is_autofs4_dentry(struct dentry *dentry) 858int is_autofs4_dentry(struct dentry *dentry)
841{ 859{
842 return dentry && d_really_is_positive(dentry) && 860 return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +872,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
854 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 872 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
855 void __user *p = (void __user *)arg; 873 void __user *p = (void __user *)arg;
856 874
857 DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", 875 pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
858 cmd,arg,sbi,task_pgrp_nr(current)); 876 cmd, arg, sbi, task_pgrp_nr(current));
859 877
860 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || 878 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
861 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) 879 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
862 return -ENOTTY; 880 return -ENOTTY;
863 881
864 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 882 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
865 return -EPERM; 883 return -EPERM;
866 884
867 switch(cmd) { 885 switch (cmd) {
868 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ 886 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
869 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); 887 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
870 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ 888 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
871 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); 889 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
872 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ 890 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
873 autofs4_catatonic_mode(sbi); 891 autofs4_catatonic_mode(sbi);
874 return 0; 892 return 0;
@@ -888,13 +906,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
888 906
889 /* return a single thing to expire */ 907 /* return a single thing to expire */
890 case AUTOFS_IOC_EXPIRE: 908 case AUTOFS_IOC_EXPIRE:
891 return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); 909 return autofs4_expire_run(inode->i_sb,
910 filp->f_path.mnt, sbi, p);
892 /* same as above, but can send multiple expires through pipe */ 911 /* same as above, but can send multiple expires through pipe */
893 case AUTOFS_IOC_EXPIRE_MULTI: 912 case AUTOFS_IOC_EXPIRE_MULTI:
894 return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); 913 return autofs4_expire_multi(inode->i_sb,
914 filp->f_path.mnt, sbi, p);
895 915
896 default: 916 default:
897 return -ENOSYS; 917 return -EINVAL;
898 } 918 }
899} 919}
900 920
@@ -902,12 +922,13 @@ static long autofs4_root_ioctl(struct file *filp,
902 unsigned int cmd, unsigned long arg) 922 unsigned int cmd, unsigned long arg)
903{ 923{
904 struct inode *inode = file_inode(filp); 924 struct inode *inode = file_inode(filp);
925
905 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 926 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
906} 927}
907 928
908#ifdef CONFIG_COMPAT 929#ifdef CONFIG_COMPAT
909static long autofs4_root_compat_ioctl(struct file *filp, 930static long autofs4_root_compat_ioctl(struct file *filp,
910 unsigned int cmd, unsigned long arg) 931 unsigned int cmd, unsigned long arg)
911{ 932{
912 struct inode *inode = file_inode(filp); 933 struct inode *inode = file_inode(filp);
913 int ret; 934 int ret;
@@ -916,7 +937,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
916 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 937 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
917 else 938 else
918 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 939 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
919 (unsigned long)compat_ptr(arg)); 940 (unsigned long) compat_ptr(arg));
920 941
921 return ret; 942 return ret;
922} 943}
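
Among the root.c cleanups, autofs4_get_set_timeout() and its compat variant drop the assignment-inside-condition idiom in favour of explicit calls with a goto-based error path. The sketch below mirrors that control-flow shape with userspace stand-ins; fetch_timeout()/store_timeout() are not kernel APIs, and the overflow clamping done by the real helpers is omitted.

#include <stdio.h>

static int fetch_timeout(unsigned long *out) { *out = 300; return 0; }
static int store_timeout(unsigned long val)  { (void)val; return 0; }

static int get_set_timeout(unsigned long *timeout)
{
	unsigned long ntimeout;
	int rv;

	rv = fetch_timeout(&ntimeout);       /* was: (rv = get_user(...)) || */
	if (rv)
		goto error;

	rv = store_timeout(*timeout);        /* was: (rv = put_user(...)) */
	if (rv)
		goto error;

	*timeout = ntimeout;
	return 0;
error:
	return rv;
}

int main(void)
{
	unsigned long t = 600;

	printf("rv=%d timeout=%lu\n", get_set_timeout(&t), t);
	return 0;
}
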
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/symlink.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your 5 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference. 6 * option, any later version, incorporated herein by reference.
10 * 7 */
11 * ------------------------------------------------------------------------- */
12 8
13#include "autofs_i.h" 9#include "autofs_i.h"
14 10
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
18{ 14{
19 struct autofs_sb_info *sbi; 15 struct autofs_sb_info *sbi;
20 struct autofs_info *ino; 16 struct autofs_info *ino;
17
21 if (!dentry) 18 if (!dentry)
22 return ERR_PTR(-ECHILD); 19 return ERR_PTR(-ECHILD);
23 sbi = autofs4_sbi(dentry->d_sb); 20 sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/waitq.c 3 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
7 * 4 *
8 * This file is part of the Linux kernel and is made available under 5 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 6 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 7 * option, any later version, incorporated herein by reference.
11 * 8 */
12 * ------------------------------------------------------------------------- */
13 9
14#include <linux/slab.h> 10#include <linux/slab.h>
15#include <linux/time.h> 11#include <linux/time.h>
@@ -18,7 +14,8 @@
18#include "autofs_i.h" 14#include "autofs_i.h"
19 15
20/* We make this a static variable rather than a part of the superblock; it 16/* We make this a static variable rather than a part of the superblock; it
21 is better if we don't reassign numbers easily even across filesystems */ 17 * is better if we don't reassign numbers easily even across filesystems
18 */
22static autofs_wqt_t autofs4_next_wait_queue = 1; 19static autofs_wqt_t autofs4_next_wait_queue = 1;
23 20
24/* These are the signals we allow interrupting a pending mount */ 21/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
34 return; 31 return;
35 } 32 }
36 33
37 DPRINTK("entering catatonic mode"); 34 pr_debug("entering catatonic mode\n");
38 35
39 sbi->catatonic = 1; 36 sbi->catatonic = 1;
40 wq = sbi->queues; 37 wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
69 set_fs(KERNEL_DS); 66 set_fs(KERNEL_DS);
70 67
71 mutex_lock(&sbi->pipe_mutex); 68 mutex_lock(&sbi->pipe_mutex);
72 while (bytes && 69 wr = __vfs_write(file, data, bytes, &file->f_pos);
73 (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { 70 while (bytes && wr) {
74 data += wr; 71 data += wr;
75 bytes -= wr; 72 bytes -= wr;
73 wr = __vfs_write(file, data, bytes, &file->f_pos);
76 } 74 }
77 mutex_unlock(&sbi->pipe_mutex); 75 mutex_unlock(&sbi->pipe_mutex);
78 76
79 set_fs(fs); 77 set_fs(fs);
80 78
81 /* Keep the currently executing process from receiving a 79 /* Keep the currently executing process from receiving a
82 SIGPIPE unless it was already supposed to get one */ 80 * SIGPIPE unless it was already supposed to get one
81 */
83 if (wr == -EPIPE && !sigpipe) { 82 if (wr == -EPIPE && !sigpipe) {
84 spin_lock_irqsave(&current->sighand->siglock, flags); 83 spin_lock_irqsave(&current->sighand->siglock, flags);
85 sigdelset(&current->pending.signal, SIGPIPE); 84 sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
89 88
90 return (bytes > 0); 89 return (bytes > 0);
91} 90}
92 91
93static void autofs4_notify_daemon(struct autofs_sb_info *sbi, 92static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
94 struct autofs_wait_queue *wq, 93 struct autofs_wait_queue *wq,
95 int type) 94 int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
102 struct file *pipe = NULL; 101 struct file *pipe = NULL;
103 size_t pktsz; 102 size_t pktsz;
104 103
105 DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", 104 pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
106 (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); 105 (unsigned long) wq->wait_queue_token,
106 wq->name.len, wq->name.name, type);
107 107
108 memset(&pkt,0,sizeof pkt); /* For security reasons */ 108 memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
109 109
110 pkt.hdr.proto_version = sbi->version; 110 pkt.hdr.proto_version = sbi->version;
111 pkt.hdr.type = type; 111 pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
126 } 126 }
127 case autofs_ptype_expire_multi: 127 case autofs_ptype_expire_multi:
128 { 128 {
129 struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; 129 struct autofs_packet_expire_multi *ep =
130 &pkt.v4_pkt.expire_multi;
130 131
131 pktsz = sizeof(*ep); 132 pktsz = sizeof(*ep);
132 133
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 break; 164 break;
164 } 165 }
165 default: 166 default:
166 printk("autofs4_notify_daemon: bad type %d!\n", type); 167 pr_warn("bad type %d!\n", type);
167 mutex_unlock(&sbi->wq_mutex); 168 mutex_unlock(&sbi->wq_mutex);
168 return; 169 return;
169 } 170 }
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
231 if (wq->name.hash == qstr->hash && 232 if (wq->name.hash == qstr->hash &&
232 wq->name.len == qstr->len && 233 wq->name.len == qstr->len &&
233 wq->name.name && 234 wq->name.name &&
234 !memcmp(wq->name.name, qstr->name, qstr->len)) 235 !memcmp(wq->name.name, qstr->name, qstr->len))
235 break; 236 break;
236 } 237 }
237 return wq; 238 return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
248static int validate_request(struct autofs_wait_queue **wait, 249static int validate_request(struct autofs_wait_queue **wait,
249 struct autofs_sb_info *sbi, 250 struct autofs_sb_info *sbi,
250 struct qstr *qstr, 251 struct qstr *qstr,
251 struct dentry*dentry, enum autofs_notify notify) 252 struct dentry *dentry, enum autofs_notify notify)
252{ 253{
253 struct autofs_wait_queue *wq; 254 struct autofs_wait_queue *wq;
254 struct autofs_info *ino; 255 struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
322 * continue on and create a new request. 323 * continue on and create a new request.
323 */ 324 */
324 if (!IS_ROOT(dentry)) { 325 if (!IS_ROOT(dentry)) {
325 if (d_really_is_positive(dentry) && d_unhashed(dentry)) { 326 if (d_unhashed(dentry) &&
327 d_really_is_positive(dentry)) {
326 struct dentry *parent = dentry->d_parent; 328 struct dentry *parent = dentry->d_parent;
329
327 new = d_lookup(parent, &dentry->d_name); 330 new = d_lookup(parent, &dentry->d_name);
328 if (new) 331 if (new)
329 dentry = new; 332 dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
340 return 1; 343 return 1;
341} 344}
342 345
343int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, 346int autofs4_wait(struct autofs_sb_info *sbi,
344 enum autofs_notify notify) 347 struct dentry *dentry, enum autofs_notify notify)
345{ 348{
346 struct autofs_wait_queue *wq; 349 struct autofs_wait_queue *wq;
347 struct qstr qstr; 350 struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
411 414
412 if (!wq) { 415 if (!wq) {
413 /* Create a new wait queue */ 416 /* Create a new wait queue */
414 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); 417 wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
415 if (!wq) { 418 if (!wq) {
416 kfree(qstr.name); 419 kfree(qstr.name);
417 mutex_unlock(&sbi->wq_mutex); 420 mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
450 autofs_ptype_expire_indirect; 453 autofs_ptype_expire_indirect;
451 } 454 }
452 455
453 DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", 456 pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
454 (unsigned long) wq->wait_queue_token, wq->name.len, 457 (unsigned long) wq->wait_queue_token, wq->name.len,
455 wq->name.name, notify); 458 wq->name.name, notify);
456 459
457 /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ 460 /*
461 * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
462 */
458 autofs4_notify_daemon(sbi, wq, type); 463 autofs4_notify_daemon(sbi, wq, type);
459 } else { 464 } else {
460 wq->wait_ctr++; 465 wq->wait_ctr++;
461 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", 466 pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
462 (unsigned long) wq->wait_queue_token, wq->name.len, 467 (unsigned long) wq->wait_queue_token, wq->name.len,
463 wq->name.name, notify); 468 wq->name.name, notify);
464 mutex_unlock(&sbi->wq_mutex); 469 mutex_unlock(&sbi->wq_mutex);
465 kfree(qstr.name); 470 kfree(qstr.name);
466 } 471 }
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
471 */ 476 */
472 if (wq->name.name) { 477 if (wq->name.name) {
473 /* Block all but "shutdown" signals while waiting */ 478 /* Block all but "shutdown" signals while waiting */
474 sigset_t oldset; 479 unsigned long shutdown_sigs_mask;
475 unsigned long irqflags; 480 unsigned long irqflags;
481 sigset_t oldset;
476 482
477 spin_lock_irqsave(&current->sighand->siglock, irqflags); 483 spin_lock_irqsave(&current->sighand->siglock, irqflags);
478 oldset = current->blocked; 484 oldset = current->blocked;
479 siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); 485 shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
486 siginitsetinv(&current->blocked, shutdown_sigs_mask);
480 recalc_sigpending(); 487 recalc_sigpending();
481 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
482 489
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
487 recalc_sigpending(); 494 recalc_sigpending();
488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 495 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
489 } else { 496 } else {
490 DPRINTK("skipped sleeping"); 497 pr_debug("skipped sleeping\n");
491 } 498 }
492 499
493 status = wq->status; 500 status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
562 569
563 return 0; 570 return 0;
564} 571}
565
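
In waitq.c the patch moves the SHUTDOWN_SIGS mask computation in autofs4_wait() onto its own line before siginitsetinv(). The userspace analogue below shows the same block-everything-except-shutdown-signals pattern using sigprocmask(); the particular signal set chosen here is an assumption for illustration only.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	sigset_t block, oldset;

	/* Block everything except the "shutdown" signals that should
	 * still interrupt the wait (illustrative choice: INT, TERM, QUIT). */
	sigfillset(&block);
	sigdelset(&block, SIGINT);
	sigdelset(&block, SIGTERM);
	sigdelset(&block, SIGQUIT);

	sigprocmask(SIG_SETMASK, &block, &oldset);
	printf("waiting with most signals blocked...\n");
	sleep(1);                                /* stand-in for the wait */
	sigprocmask(SIG_SETMASK, &oldset, NULL); /* restore original mask */

	printf("done\n");
	return 0;
}
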
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
621 * If warn is true, then emit a warning if the page is not uptodate and has 621 * If warn is true, then emit a warning if the page is not uptodate and has
622 * not been truncated. 622 * not been truncated.
623 * 623 *
624 * The caller must hold mem_cgroup_begin_page_stat() lock. 624 * The caller must hold lock_page_memcg().
625 */ 625 */
626static void __set_page_dirty(struct page *page, struct address_space *mapping, 626static void __set_page_dirty(struct page *page, struct address_space *mapping,
627 struct mem_cgroup *memcg, int warn) 627 int warn)
628{ 628{
629 unsigned long flags; 629 unsigned long flags;
630 630
631 spin_lock_irqsave(&mapping->tree_lock, flags); 631 spin_lock_irqsave(&mapping->tree_lock, flags);
632 if (page->mapping) { /* Race with truncate? */ 632 if (page->mapping) { /* Race with truncate? */
633 WARN_ON_ONCE(warn && !PageUptodate(page)); 633 WARN_ON_ONCE(warn && !PageUptodate(page));
634 account_page_dirtied(page, mapping, memcg); 634 account_page_dirtied(page, mapping);
635 radix_tree_tag_set(&mapping->page_tree, 635 radix_tree_tag_set(&mapping->page_tree,
636 page_index(page), PAGECACHE_TAG_DIRTY); 636 page_index(page), PAGECACHE_TAG_DIRTY);
637 } 637 }
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
666int __set_page_dirty_buffers(struct page *page) 666int __set_page_dirty_buffers(struct page *page)
667{ 667{
668 int newly_dirty; 668 int newly_dirty;
669 struct mem_cgroup *memcg;
670 struct address_space *mapping = page_mapping(page); 669 struct address_space *mapping = page_mapping(page);
671 670
672 if (unlikely(!mapping)) 671 if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
683 } while (bh != head); 682 } while (bh != head);
684 } 683 }
685 /* 684 /*
686 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 685 * Lock out page->mem_cgroup migration to keep PageDirty
687 * per-memcg dirty page counters. 686 * synchronized with per-memcg dirty page counters.
688 */ 687 */
689 memcg = mem_cgroup_begin_page_stat(page); 688 lock_page_memcg(page);
690 newly_dirty = !TestSetPageDirty(page); 689 newly_dirty = !TestSetPageDirty(page);
691 spin_unlock(&mapping->private_lock); 690 spin_unlock(&mapping->private_lock);
692 691
693 if (newly_dirty) 692 if (newly_dirty)
694 __set_page_dirty(page, mapping, memcg, 1); 693 __set_page_dirty(page, mapping, 1);
695 694
696 mem_cgroup_end_page_stat(memcg); 695 unlock_page_memcg(page);
697 696
698 if (newly_dirty) 697 if (newly_dirty)
699 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 698 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
1167 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1168 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1169 struct address_space *mapping = NULL; 1168 struct address_space *mapping = NULL;
1170 struct mem_cgroup *memcg;
1171 1169
1172 memcg = mem_cgroup_begin_page_stat(page); 1170 lock_page_memcg(page);
1173 if (!TestSetPageDirty(page)) { 1171 if (!TestSetPageDirty(page)) {
1174 mapping = page_mapping(page); 1172 mapping = page_mapping(page);
1175 if (mapping) 1173 if (mapping)
1176 __set_page_dirty(page, mapping, memcg, 0); 1174 __set_page_dirty(page, mapping, 0);
1177 } 1175 }
1178 mem_cgroup_end_page_stat(memcg); 1176 unlock_page_memcg(page);
1179 if (mapping) 1177 if (mapping)
1180 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1178 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1181 } 1179 }
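
The buffer.c hunks track a memcg API change made elsewhere in this series: mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat(), which returned and consumed a memcg handle, become lock_page_memcg()/unlock_page_memcg(), which key purely off the page. The sketch below shows only that interface shape, with a pthread mutex standing in for the real mechanism; the struct and locking are illustrative, not the kernel implementation.

#include <pthread.h>
#include <stdio.h>

struct page { int id; pthread_mutex_t memcg_lock; };

static void lock_page_memcg(struct page *page)
{
	pthread_mutex_lock(&page->memcg_lock);    /* no handle returned */
}

static void unlock_page_memcg(struct page *page)
{
	pthread_mutex_unlock(&page->memcg_lock);  /* no handle consumed */
}

int main(void)
{
	struct page p = { .id = 1 };

	pthread_mutex_init(&p.memcg_lock, NULL);
	lock_page_memcg(&p);
	printf("page %d dirtied while its memcg is pinned\n", p.id);
	unlock_page_memcg(&p);
	pthread_mutex_destroy(&p.memcg_lock);
	return 0;
}
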
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/prefetch.h> 25#include <linux/prefetch.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/mm_inline.h>
27#include <linux/writeback.h> 28#include <linux/writeback.h>
28#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
366 map_bh.b_state = 0; 367 map_bh.b_state = 0;
367 map_bh.b_size = 0; 368 map_bh.b_size = 0;
368 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 369 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
369 struct page *page = list_entry(pages->prev, struct page, lru); 370 struct page *page = lru_to_page(pages);
370 371
371 prefetchw(&page->flags); 372 prefetchw(&page->flags);
372 list_del(&page->lru); 373 list_del(&page->lru);
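
The mpage.c change replaces an open-coded list_entry(pages->prev, struct page, lru) with the lru_to_page() helper from mm_inline.h. A self-contained sketch of that helper over a simplified list follows; the list_head and container_of() definitions are trimmed stand-ins for the kernel versions.

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct page { int index; struct list_head lru; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Equivalent of the kernel's lru_to_page(): last entry on the list. */
static struct page *lru_to_page(struct list_head *head)
{
	return container_of(head->prev, struct page, lru);
}

int main(void)
{
	struct list_head pages = { &pages, &pages };
	struct page p = { .index = 7 };

	/* Single-element list: p.lru is both first and last entry. */
	p.lru.next = &pages; p.lru.prev = &pages;
	pages.next = &p.lru; pages.prev = &p.lru;

	printf("next page to read: index %d\n", lru_to_page(&pages)->index);
	return 0;
}
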
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a76b9ea7722e..ef6a2ec494de 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
287static void o2hb_write_timeout(struct work_struct *work) 287static void o2hb_write_timeout(struct work_struct *work)
288{ 288{
289 int failed, quorum; 289 int failed, quorum;
290 unsigned long flags;
291 struct o2hb_region *reg = 290 struct o2hb_region *reg =
292 container_of(work, struct o2hb_region, 291 container_of(work, struct o2hb_region,
293 hr_write_timeout_work.work); 292 hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
297 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 296 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
298 297
299 if (o2hb_global_heartbeat_active()) { 298 if (o2hb_global_heartbeat_active()) {
300 spin_lock_irqsave(&o2hb_live_lock, flags); 299 spin_lock(&o2hb_live_lock);
301 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 300 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
302 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); 301 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
303 failed = bitmap_weight(o2hb_failed_region_bitmap, 302 failed = bitmap_weight(o2hb_failed_region_bitmap,
304 O2NM_MAX_REGIONS); 303 O2NM_MAX_REGIONS);
305 quorum = bitmap_weight(o2hb_quorum_region_bitmap, 304 quorum = bitmap_weight(o2hb_quorum_region_bitmap,
306 O2NM_MAX_REGIONS); 305 O2NM_MAX_REGIONS);
307 spin_unlock_irqrestore(&o2hb_live_lock, flags); 306 spin_unlock(&o2hb_live_lock);
308 307
309 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", 308 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
310 quorum, failed); 309 quorum, failed);
@@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2425int o2hb_check_node_heartbeating_no_sem(u8 node_num) 2424int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2426{ 2425{
2427 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2426 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2428 unsigned long flags;
2429 2427
2430 spin_lock_irqsave(&o2hb_live_lock, flags); 2428 spin_lock(&o2hb_live_lock);
2431 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); 2429 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2432 spin_unlock_irqrestore(&o2hb_live_lock, flags); 2430 spin_unlock(&o2hb_live_lock);
2433 if (!test_bit(node_num, testing_map)) { 2431 if (!test_bit(node_num, testing_map)) {
2434 mlog(ML_HEARTBEAT, 2432 mlog(ML_HEARTBEAT,
2435 "node (%u) does not have heartbeating enabled.\n", 2433 "node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
282#define DLM_LOCK_RES_DROPPING_REF 0x00000040 282#define DLM_LOCK_RES_DROPPING_REF 0x00000040
283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
285#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
285 286
286/* max milliseconds to wait to sync up a network failure with a node death */ 287/* max milliseconds to wait to sync up a network failure with a node death */
287#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) 288#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -451,6 +452,7 @@ enum {
451 DLM_QUERY_REGION = 519, 452 DLM_QUERY_REGION = 519,
452 DLM_QUERY_NODEINFO = 520, 453 DLM_QUERY_NODEINFO = 520,
453 DLM_BEGIN_EXIT_DOMAIN_MSG = 521, 454 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
455 DLM_DEREF_LOCKRES_DONE = 522,
454}; 456};
455 457
456struct dlm_reco_node_data 458struct dlm_reco_node_data
@@ -545,7 +547,7 @@ struct dlm_master_requery
545 * }; 547 * };
546 * 548 *
547 * from ../cluster/tcp.h 549 * from ../cluster/tcp.h
548 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) 550 * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
549 * (roughly 4080 bytes) 551 * (roughly 4080 bytes)
550 * and sizeof(dlm_migratable_lockres) = 112 bytes 552 * and sizeof(dlm_migratable_lockres) = 112 bytes
551 * and sizeof(dlm_migratable_lock) = 16 bytes 553 * and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +588,7 @@ struct dlm_migratable_lockres
586 588
587/* from above, 128 bytes 589/* from above, 128 bytes
588 * for some undetermined future use */ 590 * for some undetermined future use */
589#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ 591#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
590 DLM_MIG_LOCKRES_MAX_LEN) 592 DLM_MIG_LOCKRES_MAX_LEN)
591 593
592struct dlm_create_lock 594struct dlm_create_lock
@@ -782,6 +784,20 @@ struct dlm_deref_lockres
782 u8 name[O2NM_MAX_NAME_LEN]; 784 u8 name[O2NM_MAX_NAME_LEN];
783}; 785};
784 786
787enum {
788 DLM_DEREF_RESPONSE_DONE = 0,
789 DLM_DEREF_RESPONSE_INPROG = 1,
790};
791
792struct dlm_deref_lockres_done {
793 u32 pad1;
794 u16 pad2;
795 u8 node_idx;
796 u8 namelen;
797
798 u8 name[O2NM_MAX_NAME_LEN];
799};
800
785static inline enum dlm_status 801static inline enum dlm_status
786__dlm_lockres_state_to_status(struct dlm_lock_resource *res) 802__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
787{ 803{
@@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
789 805
790 assert_spin_locked(&res->spinlock); 806 assert_spin_locked(&res->spinlock);
791 807
792 if (res->state & DLM_LOCK_RES_RECOVERING) 808 if (res->state & (DLM_LOCK_RES_RECOVERING|
809 DLM_LOCK_RES_RECOVERY_WAITING))
793 status = DLM_RECOVERING; 810 status = DLM_RECOVERING;
794 else if (res->state & DLM_LOCK_RES_MIGRATING) 811 else if (res->state & DLM_LOCK_RES_MIGRATING)
795 status = DLM_MIGRATING; 812 status = DLM_MIGRATING;
@@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
968void dlm_assert_master_post_handler(int status, void *data, void *ret_data); 985void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
969int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 986int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
970 void **ret_data); 987 void **ret_data);
988int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
989 void **ret_data);
971int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 990int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
972 void **ret_data); 991 void **ret_data);
973int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 992int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1009{ 1028{
1010 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| 1029 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
1011 DLM_LOCK_RES_RECOVERING| 1030 DLM_LOCK_RES_RECOVERING|
1031 DLM_LOCK_RES_RECOVERY_WAITING|
1012 DLM_LOCK_RES_MIGRATING)); 1032 DLM_LOCK_RES_MIGRATING));
1013} 1033}
1014 1034
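
The dlmcommon.h hunk above adds both the DLM_LOCK_RES_RECOVERY_WAITING state bit and the on-wire layout for the new DLM_DEREF_LOCKRES_DONE message (type 522). As a rough, stand-alone sanity check of that layout, the sketch below mirrors the struct with fixed-width types and prints its size; O2NM_MAX_NAME_LEN is assumed to be 64 here (its real definition lives in the o2 cluster headers, not in this hunk), so the exact number is illustrative only.

    #include <stdint.h>
    #include <stdio.h>

    #define O2NM_MAX_NAME_LEN 64    /* assumed; defined in the o2 cluster headers */

    /* Userspace mirror of struct dlm_deref_lockres_done from the hunk above. */
    struct dlm_deref_lockres_done {
        uint32_t pad1;
        uint16_t pad2;
        uint8_t  node_idx;
        uint8_t  namelen;
        uint8_t  name[O2NM_MAX_NAME_LEN];
    };

    /* Response codes a master returns for a DEREF request (also added above). */
    enum {
        DLM_DEREF_RESPONSE_DONE   = 0,
        DLM_DEREF_RESPONSE_INPROG = 1,
    };

    int main(void)
    {
        /* The comment block in dlmcommon.h budgets roughly 4080 bytes of
         * o2net payload per message; the DEREF DONE packet is far below that. */
        printf("sizeof(struct dlm_deref_lockres_done) = %zu bytes\n",
               sizeof(struct dlm_deref_lockres_done));
        return 0;
    }
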
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * - Message DLM_QUERY_NODEINFO added to allow online node removes 132 * - Message DLM_QUERY_NODEINFO added to allow online node removes
133 * New in version 1.2: 133 * New in version 1.2:
134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain 134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 * New in version 1.3:
136 * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
137 * refmap is cleared
135 */ 138 */
136static const struct dlm_protocol_version dlm_protocol = { 139static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 140 .pv_major = 1,
138 .pv_minor = 2, 141 .pv_minor = 3,
139}; 142};
140 143
141#define DLM_DOMAIN_BACKOFF_MS 200 144#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
1396 unsigned int map_size) 1399 unsigned int map_size)
1397{ 1400{
1398 int status, tmpstat; 1401 int status, tmpstat;
1399 unsigned int node; 1402 int node;
1400 1403
1401 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * 1404 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
1402 sizeof(unsigned long))) { 1405 sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1853 sizeof(struct dlm_exit_domain), 1856 sizeof(struct dlm_exit_domain),
1854 dlm_begin_exit_domain_handler, 1857 dlm_begin_exit_domain_handler,
1855 dlm, NULL, &dlm->dlm_domain_handlers); 1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1856 1861
1862 status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
1863 sizeof(struct dlm_deref_lockres_done),
1864 dlm_deref_lockres_done_handler,
1865 dlm, NULL, &dlm->dlm_domain_handlers);
1857bail: 1866bail:
1858 if (status) 1867 if (status)
1859 dlm_unregister_domain_handlers(dlm); 1868 dlm_unregister_domain_handlers(dlm);
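
The dlmdomain.c hunk bumps the DLM protocol minor version to 1.3 and registers a handler for the new DLM_DEREF_LOCKRES_DONE message; it also adds the missing status check so the next registration is skipped when the previous one failed. The stand-alone model below only illustrates that register-then-bail error pattern; register_handler() and unregister_all() are invented stand-ins, not the real o2net_register_handler() API.

    #include <stdio.h>

    /* Minimal model of the "register a chain of handlers, roll everything back
     * on the first failure" pattern used by dlm_register_domain_handlers(). */

    static int register_handler(const char *name, int fail)
    {
        if (fail) {
            printf("registering %s: failed\n", name);
            return -1;
        }
        printf("registering %s: ok\n", name);
        return 0;
    }

    static void unregister_all(void)
    {
        printf("unregistering every handler registered so far\n");
    }

    static int register_domain_handlers(void)
    {
        int status;

        status = register_handler("DLM_BEGIN_EXIT_DOMAIN_MSG", 0);
        if (status)
            goto bail;      /* the check this hunk adds */

        status = register_handler("DLM_DEREF_LOCKRES_DONE", 0);
    bail:
        if (status)
            unregister_all();   /* partial registrations are rolled back */
        return status;
    }

    int main(void)
    {
        return register_domain_handlers() ? 1 : 0;
    }
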
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2278 dlm_print_one_lock_resource(res); 2278 dlm_print_one_lock_resource(res);
2279 BUG(); 2279 BUG();
2280 } 2280 }
2281 return ret; 2281 return ret ? ret : r;
2282} 2282}
2283 2283
2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2345 res->lockname.len, res->lockname.name, node); 2345 res->lockname.len, res->lockname.name, node);
2346 dlm_print_one_lock_resource(res); 2346 dlm_print_one_lock_resource(res);
2347 } 2347 }
2348 ret = 0; 2348 ret = DLM_DEREF_RESPONSE_DONE;
2349 goto done; 2349 goto done;
2350 } 2350 }
2351 2351
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2365 spin_unlock(&dlm->work_lock); 2365 spin_unlock(&dlm->work_lock);
2366 2366
2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2368 return 0; 2368 return DLM_DEREF_RESPONSE_INPROG;
2369 2369
2370done: 2370done:
2371 if (res) 2371 if (res)
@@ -2375,6 +2375,122 @@ done:
2375 return ret; 2375 return ret;
2376} 2376}
2377 2377
2378int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
2379 void **ret_data)
2380{
2381 struct dlm_ctxt *dlm = data;
2382 struct dlm_deref_lockres_done *deref
2383 = (struct dlm_deref_lockres_done *)msg->buf;
2384 struct dlm_lock_resource *res = NULL;
2385 char *name;
2386 unsigned int namelen;
2387 int ret = -EINVAL;
2388 u8 node;
2389 unsigned int hash;
2390
2391 if (!dlm_grab(dlm))
2392 return 0;
2393
2394 name = deref->name;
2395 namelen = deref->namelen;
2396 node = deref->node_idx;
2397
2398 if (namelen > DLM_LOCKID_NAME_MAX) {
2399 mlog(ML_ERROR, "Invalid name length!");
2400 goto done;
2401 }
2402 if (deref->node_idx >= O2NM_MAX_NODES) {
2403 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2404 goto done;
2405 }
2406
2407 hash = dlm_lockid_hash(name, namelen);
2408
2409 spin_lock(&dlm->spinlock);
2410 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2411 if (!res) {
2412 spin_unlock(&dlm->spinlock);
2413 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2414 dlm->name, namelen, name);
2415 goto done;
2416 }
2417
2418 spin_lock(&res->spinlock);
2419 BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
2420 if (!list_empty(&res->purge)) {
2421 mlog(0, "%s: Removing res %.*s from purgelist\n",
2422 dlm->name, res->lockname.len, res->lockname.name);
2423 list_del_init(&res->purge);
2424 dlm_lockres_put(res);
2425 dlm->purge_count--;
2426 }
2427
2428 if (!__dlm_lockres_unused(res)) {
2429 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
2430 dlm->name, res->lockname.len, res->lockname.name);
2431 __dlm_print_one_lock_resource(res);
2432 BUG();
2433 }
2434
2435 __dlm_unhash_lockres(dlm, res);
2436
2437 spin_lock(&dlm->track_lock);
2438 if (!list_empty(&res->tracking))
2439 list_del_init(&res->tracking);
2440 else {
2441 mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
2442 dlm->name, res->lockname.len, res->lockname.name);
2443 __dlm_print_one_lock_resource(res);
2444 }
2445 spin_unlock(&dlm->track_lock);
2446
2447 /* lockres is not in the hash now. drop the flag and wake up
2448 * any processes waiting in dlm_get_lock_resource.
2449 */
2450 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2451 spin_unlock(&res->spinlock);
2452 wake_up(&res->wq);
2453
2454 dlm_lockres_put(res);
2455
2456 spin_unlock(&dlm->spinlock);
2457
2458done:
2459 dlm_put(dlm);
2460 return ret;
2461}
2462
2463static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
2464 struct dlm_lock_resource *res, u8 node)
2465{
2466 struct dlm_deref_lockres_done deref;
2467 int ret = 0, r;
2468 const char *lockname;
2469 unsigned int namelen;
2470
2471 lockname = res->lockname.name;
2472 namelen = res->lockname.len;
2473 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2474
2475 memset(&deref, 0, sizeof(deref));
2476 deref.node_idx = dlm->node_num;
2477 deref.namelen = namelen;
2478 memcpy(deref.name, lockname, namelen);
2479
2480 ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
2481 &deref, sizeof(deref), node, &r);
2482 if (ret < 0) {
2483 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
2484 " to node %u\n", dlm->name, namelen,
2485 lockname, ret, node);
2486 } else if (r < 0) {
2487 /* ignore the error */
2488 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2489 dlm->name, namelen, lockname, node, r);
2490 dlm_print_one_lock_resource(res);
2491 }
2492}
2493
2378static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2494static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2379{ 2495{
2380 struct dlm_ctxt *dlm; 2496 struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2395 } 2511 }
2396 spin_unlock(&res->spinlock); 2512 spin_unlock(&res->spinlock);
2397 2513
2514 dlm_drop_lockres_ref_done(dlm, res, node);
2515
2398 if (cleared) { 2516 if (cleared) {
2399 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2517 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2400 dlm->name, res->lockname.len, res->lockname.name, node); 2518 dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2432 return 0; 2550 return 0;
2433 2551
2434 /* delay migration when the lockres is in RECOCERING state */ 2552 /* delay migration when the lockres is in RECOCERING state */
2435 if (res->state & DLM_LOCK_RES_RECOVERING) 2553 if (res->state & (DLM_LOCK_RES_RECOVERING|
2554 DLM_LOCK_RES_RECOVERY_WAITING))
2436 return 0; 2555 return 0;
2437 2556
2438 if (res->owner != dlm->node_num) 2557 if (res->owner != dlm->node_num)
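
Together with the dlmrecovery.c and dlmthread.c hunks further down, the dlmmaster.c changes turn lockres dereferencing into a two-step handshake: the master now answers a DEREF request with DLM_DEREF_RESPONSE_DONE or DLM_DEREF_RESPONSE_INPROG, and once the refmap bit is cleared from the deferred worker it sends DLM_DEREF_LOCKRES_DONE back, at which point the non-master finally unhashes the lockres, clears DLM_LOCK_RES_DROPPING_REF and wakes any waiters. The sketch below is a stand-alone model of that flow; the response codes mirror dlmcommon.h, while the struct and helper names are invented for illustration.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-alone model of the DEREF / DEREF_DONE handshake added by this series. */

    enum {
        DLM_DEREF_RESPONSE_DONE   = 0,  /* master cleared the refmap inline */
        DLM_DEREF_RESPONSE_INPROG = 1,  /* master deferred it to a worker   */
    };

    struct lockres_model {
        bool dropping_ref;  /* models DLM_LOCK_RES_DROPPING_REF */
        bool hashed;        /* still visible in the hash        */
        bool master_busy;   /* forces the deferred path         */
    };

    /* Master side: handle an incoming DEREF and report how it was handled. */
    static int master_handle_deref(const struct lockres_model *res)
    {
        return res->master_busy ? DLM_DEREF_RESPONSE_INPROG
                                : DLM_DEREF_RESPONSE_DONE;
    }

    /* Master side, worker context: refmap bit cleared, tell the non-master. */
    static void master_send_deref_done(void)
    {
        printf("master: refmap bit cleared, sending DEREF_DONE\n");
    }

    /* Non-master side: the DEREF_DONE handler may finally drop the lockres. */
    static void nonmaster_handle_deref_done(struct lockres_model *res)
    {
        res->hashed = false;        /* __dlm_unhash_lockres()              */
        res->dropping_ref = false;  /* clear DROPPING_REF and wake waiters */
        printf("non-master: lockres unhashed and released\n");
    }

    int main(void)
    {
        struct lockres_model res = {
            .dropping_ref = true, .hashed = true, .master_busy = true,
        };

        if (master_handle_deref(&res) == DLM_DEREF_RESPONSE_INPROG) {
            /* dlm_purge_lockres() now returns here without freeing and
             * keeps the lockres until the DONE message arrives. */
            printf("non-master: deref in progress, keeping lockres\n");
            master_send_deref_done();
        }
        nonmaster_handle_deref_done(&res);
        return 0;
    }
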
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425f0175..cd38488a10fc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1403 * and RECOVERY flag changed when it completes. */ 1403 * and RECOVERY flag changed when it completes. */
1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); 1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
1405 spin_lock(&dlm->spinlock); 1405 spin_lock(&dlm->spinlock);
1406 res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, 1406 res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
1407 hash); 1407 hash);
1408 if (res) { 1408 if (res) {
1409 /* this will get a ref on res */ 1409 /* this will get a ref on res */
1410 /* mark it as recovering/migrating and hash it */ 1410 /* mark it as recovering/migrating and hash it */
1411 spin_lock(&res->spinlock); 1411 spin_lock(&res->spinlock);
1412 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
1413 mlog(0, "%s: node is attempting to migrate "
1414 "lockres %.*s, but marked as dropping "
1415 " ref!\n", dlm->name,
1416 mres->lockname_len, mres->lockname);
1417 ret = -EINVAL;
1418 spin_unlock(&res->spinlock);
1419 spin_unlock(&dlm->spinlock);
1420 dlm_lockres_put(res);
1421 goto leave;
1422 }
1423
1412 if (mres->flags & DLM_MRES_RECOVERY) { 1424 if (mres->flags & DLM_MRES_RECOVERY) {
1413 res->state |= DLM_LOCK_RES_RECOVERING; 1425 res->state |= DLM_LOCK_RES_RECOVERING;
1414 } else { 1426 } else {
@@ -2163,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2163 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2175 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2164 bucket = dlm_lockres_hash(dlm, i); 2176 bucket = dlm_lockres_hash(dlm, i);
2165 hlist_for_each_entry(res, bucket, hash_node) { 2177 hlist_for_each_entry(res, bucket, hash_node) {
2178 if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
2179 spin_lock(&res->spinlock);
2180 res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
2181 spin_unlock(&res->spinlock);
2182 wake_up(&res->wq);
2183 }
2184
2166 if (!(res->state & DLM_LOCK_RES_RECOVERING)) 2185 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2167 continue; 2186 continue;
2168 2187
@@ -2300,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2300 res->lockname.len, res->lockname.name, freed, dead_node); 2319 res->lockname.len, res->lockname.name, freed, dead_node);
2301 __dlm_print_one_lock_resource(res); 2320 __dlm_print_one_lock_resource(res);
2302 } 2321 }
2322 res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
2303 dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2323 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2304 } else if (test_bit(dead_node, res->refmap)) { 2324 } else if (test_bit(dead_node, res->refmap)) {
2305 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2325 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2377,14 +2397,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2377 dlm_revalidate_lvb(dlm, res, dead_node); 2397 dlm_revalidate_lvb(dlm, res, dead_node);
2378 if (res->owner == dead_node) { 2398 if (res->owner == dead_node) {
2379 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2399 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2380 mlog(ML_NOTICE, "%s: res %.*s, Skip " 2400 mlog(0, "%s:%.*s: owned by "
2381 "recovery as it is being freed\n", 2401 "dead node %u, this node was "
2382 dlm->name, res->lockname.len, 2402 "dropping its ref when it died. "
2383 res->lockname.name); 2403 "continue, dropping the flag.\n",
2384 } else 2404 dlm->name, res->lockname.len,
2385 dlm_move_lockres_to_recovery_list(dlm, 2405 res->lockname.name, dead_node);
2386 res); 2406 }
2387 2407 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2408 dlm_move_lockres_to_recovery_list(dlm,
2409 res);
2388 } else if (res->owner == dlm->node_num) { 2410 } else if (res->owner == dlm->node_num) {
2389 dlm_free_dead_locks(dlm, res, dead_node); 2411 dlm_free_dead_locks(dlm, res, dead_node);
2390 __dlm_lockres_calc_usage(dlm, res); 2412 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
107 return 0; 107 return 0;
108 108
109 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & (DLM_LOCK_RES_RECOVERING|
110 DLM_LOCK_RES_RECOVERY_WAITING))
110 return 0; 111 return 0;
111 112
112 /* Another node has this resource with this node as the master */ 113 /* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
202 dlm->purge_count--; 203 dlm->purge_count--;
203 } 204 }
204 205
206 if (!master && ret != 0) {
207 mlog(0, "%s: deref %.*s in progress or master goes down\n",
208 dlm->name, res->lockname.len, res->lockname.name);
209 spin_unlock(&res->spinlock);
210 return;
211 }
212
205 if (!__dlm_lockres_unused(res)) { 213 if (!__dlm_lockres_unused(res)) {
206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n", 214 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
207 dlm->name, res->lockname.len, res->lockname.name); 215 dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
700 * dirty for a short while. */ 708 * dirty for a short while. */
701 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 709 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
702 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 710 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
703 DLM_LOCK_RES_RECOVERING)) { 711 DLM_LOCK_RES_RECOVERING |
712 DLM_LOCK_RES_RECOVERY_WAITING)) {
704 /* move it to the tail and keep going */ 713 /* move it to the tail and keep going */
705 res->state &= ~DLM_LOCK_RES_DIRTY; 714 res->state &= ~DLM_LOCK_RES_DIRTY;
706 spin_unlock(&res->spinlock); 715 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..302854ee0985 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -236,6 +236,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
236 struct ocfs2_recovery_map *rm = osb->recovery_map; 236 struct ocfs2_recovery_map *rm = osb->recovery_map;
237 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; 237 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
238 int i, out = 0; 238 int i, out = 0;
239 unsigned long flags;
239 240
240 out += snprintf(buf + out, len - out, 241 out += snprintf(buf + out, len - out,
241 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 242 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +272,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
271 cconn->cc_version.pv_minor); 272 cconn->cc_version.pv_minor);
272 } 273 }
273 274
274 spin_lock(&osb->dc_task_lock); 275 spin_lock_irqsave(&osb->dc_task_lock, flags);
275 out += snprintf(buf + out, len - out, 276 out += snprintf(buf + out, len - out,
276 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 277 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
277 "WorkSeq: %lu\n", "DownCnvt", 278 "WorkSeq: %lu\n", "DownCnvt",
278 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), 279 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
279 osb->blocked_lock_count, osb->dc_wake_sequence, 280 osb->blocked_lock_count, osb->dc_wake_sequence,
280 osb->dc_work_sequence); 281 osb->dc_work_sequence);
281 spin_unlock(&osb->dc_task_lock); 282 spin_unlock_irqrestore(&osb->dc_task_lock, flags);
282 283
283 spin_lock(&osb->osb_lock); 284 spin_lock(&osb->osb_lock);
284 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", 285 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..5c57b7b40728 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1957,7 +1957,6 @@ xfs_vm_set_page_dirty(
1957 loff_t end_offset; 1957 loff_t end_offset;
1958 loff_t offset; 1958 loff_t offset;
1959 int newly_dirty; 1959 int newly_dirty;
1960 struct mem_cgroup *memcg;
1961 1960
1962 if (unlikely(!mapping)) 1961 if (unlikely(!mapping))
1963 return !TestSetPageDirty(page); 1962 return !TestSetPageDirty(page);
@@ -1978,10 +1977,10 @@ xfs_vm_set_page_dirty(
1978 } while (bh != head); 1977 } while (bh != head);
1979 } 1978 }
1980 /* 1979 /*
1981 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1980 * Lock out page->mem_cgroup migration to keep PageDirty
1982 * per-memcg dirty page counters. 1981 * synchronized with per-memcg dirty page counters.
1983 */ 1982 */
1984 memcg = mem_cgroup_begin_page_stat(page); 1983 lock_page_memcg(page);
1985 newly_dirty = !TestSetPageDirty(page); 1984 newly_dirty = !TestSetPageDirty(page);
1986 spin_unlock(&mapping->private_lock); 1985 spin_unlock(&mapping->private_lock);
1987 1986
@@ -1992,13 +1991,13 @@ xfs_vm_set_page_dirty(
1992 spin_lock_irqsave(&mapping->tree_lock, flags); 1991 spin_lock_irqsave(&mapping->tree_lock, flags);
1993 if (page->mapping) { /* Race with truncate? */ 1992 if (page->mapping) { /* Race with truncate? */
1994 WARN_ON_ONCE(!PageUptodate(page)); 1993 WARN_ON_ONCE(!PageUptodate(page));
1995 account_page_dirtied(page, mapping, memcg); 1994 account_page_dirtied(page, mapping);
1996 radix_tree_tag_set(&mapping->page_tree, 1995 radix_tree_tag_set(&mapping->page_tree,
1997 page_index(page), PAGECACHE_TAG_DIRTY); 1996 page_index(page), PAGECACHE_TAG_DIRTY);
1998 } 1997 }
1999 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1998 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2000 } 1999 }
2001 mem_cgroup_end_page_stat(memcg); 2000 unlock_page_memcg(page);
2002 if (newly_dirty) 2001 if (newly_dirty)
2003 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 2002 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2004 return newly_dirty; 2003 return newly_dirty;
diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index 850f39b33e74..7caaf298f539 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -11,12 +11,7 @@
11#define _LINUX_AUTO_DEV_IOCTL_H 11#define _LINUX_AUTO_DEV_IOCTL_H
12 12
13#include <linux/auto_fs.h> 13#include <linux/auto_fs.h>
14
15#ifdef __KERNEL__
16#include <linux/string.h> 14#include <linux/string.h>
17#else
18#include <string.h>
19#endif /* __KERNEL__ */
20 15
21#define AUTOFS_DEVICE_NAME "autofs" 16#define AUTOFS_DEVICE_NAME "autofs"
22 17
@@ -125,7 +120,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
125 in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; 120 in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
126 in->size = sizeof(struct autofs_dev_ioctl); 121 in->size = sizeof(struct autofs_dev_ioctl);
127 in->ioctlfd = -1; 122 in->ioctlfd = -1;
128 return;
129} 123}
130 124
131/* 125/*
diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index fcd704d354c4..b4066bb89083 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -1,14 +1,10 @@
1/* -*- linux-c -*- ------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997 Transmeta Corporation - All Rights Reserved
3 * linux/include/linux/auto_fs.h
4 *
5 * Copyright 1997 Transmeta Corporation - All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your 5 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference. 6 * option, any later version, incorporated herein by reference.
10 * 7 */
11 * ----------------------------------------------------------------------- */
12 8
13#ifndef _LINUX_AUTO_FS_H 9#ifndef _LINUX_AUTO_FS_H
14#define _LINUX_AUTO_FS_H 10#define _LINUX_AUTO_FS_H
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 3159a7dba034..9f4956d8601c 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -62,10 +62,9 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
62#endif /* CONFIG_FAULT_INJECTION */ 62#endif /* CONFIG_FAULT_INJECTION */
63 63
64#ifdef CONFIG_FAILSLAB 64#ifdef CONFIG_FAILSLAB
65extern bool should_failslab(size_t size, gfp_t gfpflags, unsigned long flags); 65extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags);
66#else 66#else
67static inline bool should_failslab(size_t size, gfp_t gfpflags, 67static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
68 unsigned long flags)
69{ 68{
70 return false; 69 return false;
71} 70}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index af1f2b24bbe4..bb16dfeb917e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -9,6 +9,11 @@
9 9
10struct vm_area_struct; 10struct vm_area_struct;
11 11
12/*
13 * In case of changes, please don't forget to update
14 * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
15 */
16
12/* Plain integer GFP bitmasks. Do not use this directly. */ 17/* Plain integer GFP bitmasks. Do not use this directly. */
13#define ___GFP_DMA 0x01u 18#define ___GFP_DMA 0x01u
14#define ___GFP_HIGHMEM 0x02u 19#define ___GFP_HIGHMEM 0x02u
@@ -48,7 +53,6 @@ struct vm_area_struct;
48#define __GFP_DMA ((__force gfp_t)___GFP_DMA) 53#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
49#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) 54#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
50#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) 55#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
51#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */
52#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ 56#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
53#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) 57#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
54 58
@@ -515,13 +519,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
515void drain_all_pages(struct zone *zone); 519void drain_all_pages(struct zone *zone);
516void drain_local_pages(struct zone *zone); 520void drain_local_pages(struct zone *zone);
517 521
518#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
519void page_alloc_init_late(void); 522void page_alloc_init_late(void);
520#else
521static inline void page_alloc_init_late(void)
522{
523}
524#endif
525 523
526/* 524/*
527 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what 525 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 792c8981e633..f0c4bec6565b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -28,6 +28,7 @@
28#include <linux/eventfd.h> 28#include <linux/eventfd.h>
29#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/page-flags.h>
31 32
32struct mem_cgroup; 33struct mem_cgroup;
33struct page; 34struct page;
@@ -89,6 +90,10 @@ enum mem_cgroup_events_target {
89}; 90};
90 91
91#ifdef CONFIG_MEMCG 92#ifdef CONFIG_MEMCG
93
94#define MEM_CGROUP_ID_SHIFT 16
95#define MEM_CGROUP_ID_MAX USHRT_MAX
96
92struct mem_cgroup_stat_cpu { 97struct mem_cgroup_stat_cpu {
93 long count[MEMCG_NR_STAT]; 98 long count[MEMCG_NR_STAT];
94 unsigned long events[MEMCG_NR_EVENTS]; 99 unsigned long events[MEMCG_NR_EVENTS];
@@ -265,6 +270,11 @@ struct mem_cgroup {
265 270
266extern struct mem_cgroup *root_mem_cgroup; 271extern struct mem_cgroup *root_mem_cgroup;
267 272
273static inline bool mem_cgroup_disabled(void)
274{
275 return !cgroup_subsys_enabled(memory_cgrp_subsys);
276}
277
268/** 278/**
269 * mem_cgroup_events - count memory events against a cgroup 279 * mem_cgroup_events - count memory events against a cgroup
270 * @memcg: the memory cgroup 280 * @memcg: the memory cgroup
@@ -291,7 +301,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
291void mem_cgroup_uncharge(struct page *page); 301void mem_cgroup_uncharge(struct page *page);
292void mem_cgroup_uncharge_list(struct list_head *page_list); 302void mem_cgroup_uncharge_list(struct list_head *page_list);
293 303
294void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage); 304void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
295 305
296struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 306struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
297struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 307struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -312,6 +322,28 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
312 struct mem_cgroup_reclaim_cookie *); 322 struct mem_cgroup_reclaim_cookie *);
313void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 323void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
314 324
325static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
326{
327 if (mem_cgroup_disabled())
328 return 0;
329
330 return memcg->css.id;
331}
332
333/**
334 * mem_cgroup_from_id - look up a memcg from an id
335 * @id: the id to look up
336 *
337 * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
338 */
339static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
340{
341 struct cgroup_subsys_state *css;
342
343 css = css_from_id(id, &memory_cgrp_subsys);
344 return mem_cgroup_from_css(css);
345}
346
315/** 347/**
316 * parent_mem_cgroup - find the accounting parent of a memcg 348 * parent_mem_cgroup - find the accounting parent of a memcg
317 * @memcg: memcg whose parent to find 349 * @memcg: memcg whose parent to find
@@ -353,11 +385,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
353struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); 385struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
354ino_t page_cgroup_ino(struct page *page); 386ino_t page_cgroup_ino(struct page *page);
355 387
356static inline bool mem_cgroup_disabled(void)
357{
358 return !cgroup_subsys_enabled(memory_cgrp_subsys);
359}
360
361static inline bool mem_cgroup_online(struct mem_cgroup *memcg) 388static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
362{ 389{
363 if (mem_cgroup_disabled()) 390 if (mem_cgroup_disabled())
@@ -429,36 +456,43 @@ bool mem_cgroup_oom_synchronize(bool wait);
429extern int do_swap_account; 456extern int do_swap_account;
430#endif 457#endif
431 458
432struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); 459void lock_page_memcg(struct page *page);
433void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); 460void unlock_page_memcg(struct page *page);
434 461
435/** 462/**
436 * mem_cgroup_update_page_stat - update page state statistics 463 * mem_cgroup_update_page_stat - update page state statistics
437 * @memcg: memcg to account against 464 * @page: the page
438 * @idx: page state item to account 465 * @idx: page state item to account
439 * @val: number of pages (positive or negative) 466 * @val: number of pages (positive or negative)
440 * 467 *
441 * See mem_cgroup_begin_page_stat() for locking requirements. 468 * The @page must be locked or the caller must use lock_page_memcg()
469 * to prevent double accounting when the page is concurrently being
470 * moved to another memcg:
471 *
472 * lock_page(page) or lock_page_memcg(page)
473 * if (TestClearPageState(page))
474 * mem_cgroup_update_page_stat(page, state, -1);
475 * unlock_page(page) or unlock_page_memcg(page)
442 */ 476 */
443static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 477static inline void mem_cgroup_update_page_stat(struct page *page,
444 enum mem_cgroup_stat_index idx, int val) 478 enum mem_cgroup_stat_index idx, int val)
445{ 479{
446 VM_BUG_ON(!rcu_read_lock_held()); 480 VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
447 481
448 if (memcg) 482 if (page->mem_cgroup)
449 this_cpu_add(memcg->stat->count[idx], val); 483 this_cpu_add(page->mem_cgroup->stat->count[idx], val);
450} 484}
451 485
452static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, 486static inline void mem_cgroup_inc_page_stat(struct page *page,
453 enum mem_cgroup_stat_index idx) 487 enum mem_cgroup_stat_index idx)
454{ 488{
455 mem_cgroup_update_page_stat(memcg, idx, 1); 489 mem_cgroup_update_page_stat(page, idx, 1);
456} 490}
457 491
458static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, 492static inline void mem_cgroup_dec_page_stat(struct page *page,
459 enum mem_cgroup_stat_index idx) 493 enum mem_cgroup_stat_index idx)
460{ 494{
461 mem_cgroup_update_page_stat(memcg, idx, -1); 495 mem_cgroup_update_page_stat(page, idx, -1);
462} 496}
463 497
464unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 498unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
@@ -496,8 +530,17 @@ void mem_cgroup_split_huge_fixup(struct page *head);
496#endif 530#endif
497 531
498#else /* CONFIG_MEMCG */ 532#else /* CONFIG_MEMCG */
533
534#define MEM_CGROUP_ID_SHIFT 0
535#define MEM_CGROUP_ID_MAX 0
536
499struct mem_cgroup; 537struct mem_cgroup;
500 538
539static inline bool mem_cgroup_disabled(void)
540{
541 return true;
542}
543
501static inline void mem_cgroup_events(struct mem_cgroup *memcg, 544static inline void mem_cgroup_events(struct mem_cgroup *memcg,
502 enum mem_cgroup_events_index idx, 545 enum mem_cgroup_events_index idx,
503 unsigned int nr) 546 unsigned int nr)
@@ -539,7 +582,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
539{ 582{
540} 583}
541 584
542static inline void mem_cgroup_replace_page(struct page *old, struct page *new) 585static inline void mem_cgroup_migrate(struct page *old, struct page *new)
543{ 586{
544} 587}
545 588
@@ -580,9 +623,16 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
580{ 623{
581} 624}
582 625
583static inline bool mem_cgroup_disabled(void) 626static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
584{ 627{
585 return true; 628 return 0;
629}
630
631static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
632{
633 WARN_ON_ONCE(id);
634 /* XXX: This should always return root_mem_cgroup */
635 return NULL;
586} 636}
587 637
588static inline bool mem_cgroup_online(struct mem_cgroup *memcg) 638static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -613,12 +663,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
613{ 663{
614} 664}
615 665
616static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) 666static inline void lock_page_memcg(struct page *page)
617{ 667{
618 return NULL;
619} 668}
620 669
621static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) 670static inline void unlock_page_memcg(struct page *page)
622{ 671{
623} 672}
624 673
@@ -644,12 +693,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
644 return false; 693 return false;
645} 694}
646 695
647static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, 696static inline void mem_cgroup_inc_page_stat(struct page *page,
648 enum mem_cgroup_stat_index idx) 697 enum mem_cgroup_stat_index idx)
649{ 698{
650} 699}
651 700
652static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, 701static inline void mem_cgroup_dec_page_stat(struct page *page,
653 enum mem_cgroup_stat_index idx) 702 enum mem_cgroup_stat_index idx)
654{ 703{
655} 704}
@@ -765,7 +814,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
765void __memcg_kmem_uncharge(struct page *page, int order); 814void __memcg_kmem_uncharge(struct page *page, int order);
766 815
767/* 816/*
768 * helper for acessing a memcg's index. It will be used as an index in the 817 * helper for accessing a memcg's index. It will be used as an index in the
769 * child cache array in kmem_cache, and also to derive its name. This function 818 * child cache array in kmem_cache, and also to derive its name. This function
770 * will return -1 when this is not a kmem-limited memcg. 819 * will return -1 when this is not a kmem-limited memcg.
771 */ 820 */
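
The memcontrol.h hunk replaces the mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() pair with lock_page_memcg()/unlock_page_memcg(), and the page-state helpers now take the page itself instead of an opaque memcg pointer. The toy program below models only that interface shape (struct memcg, struct page_m and the function bodies are stand-ins); the "was:" comments show the old calling convention that the xfs_aops.c hunk above converts away from.

    #include <stdio.h>

    /* Toy model: callers used to carry an opaque memcg pointer from
     * mem_cgroup_begin_page_stat() into every stat helper; now they bracket the
     * section with lock_page_memcg()/unlock_page_memcg() and the helpers read
     * page->mem_cgroup themselves. */

    struct memcg  { long dirty; };
    struct page_m { struct memcg *mem_cgroup; };

    static void lock_page_memcg(struct page_m *page)   { (void)page; /* pins page->mem_cgroup */ }
    static void unlock_page_memcg(struct page_m *page) { (void)page; }

    /* New-style helper: derives the memcg from the page itself. */
    static void mem_cgroup_update_page_stat(struct page_m *page, int delta)
    {
        if (page->mem_cgroup)
            page->mem_cgroup->dirty += delta;
    }

    int main(void)
    {
        struct memcg  mc   = { .dirty = 0 };
        struct page_m page = { .mem_cgroup = &mc };

        lock_page_memcg(&page);                 /* was: memcg = mem_cgroup_begin_page_stat(page) */
        mem_cgroup_update_page_stat(&page, 1);  /* was: mem_cgroup_update_page_stat(memcg, idx, 1) */
        unlock_page_memcg(&page);               /* was: mem_cgroup_end_page_stat(memcg) */

        printf("dirty pages accounted: %ld\n", mc.dirty);
        return 0;
    }
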
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 8b8d8d12348e..82730adba950 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -109,6 +109,9 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
109extern int register_memory_isolate_notifier(struct notifier_block *nb); 109extern int register_memory_isolate_notifier(struct notifier_block *nb);
110extern void unregister_memory_isolate_notifier(struct notifier_block *nb); 110extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
111extern int register_new_memory(int, struct mem_section *); 111extern int register_new_memory(int, struct mem_section *);
112extern int memory_block_change_state(struct memory_block *mem,
113 unsigned long to_state,
114 unsigned long from_state_req);
112#ifdef CONFIG_MEMORY_HOTREMOVE 115#ifdef CONFIG_MEMORY_HOTREMOVE
113extern int unregister_memory_section(struct mem_section *); 116extern int unregister_memory_section(struct mem_section *);
114#endif 117#endif
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 43405992d027..adbef586e696 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -99,6 +99,8 @@ extern void __online_page_free(struct page *page);
99 99
100extern int try_online_node(int nid); 100extern int try_online_node(int nid);
101 101
102extern bool memhp_auto_online;
103
102#ifdef CONFIG_MEMORY_HOTREMOVE 104#ifdef CONFIG_MEMORY_HOTREMOVE
103extern bool is_pageblock_removable_nolock(struct page *page); 105extern bool is_pageblock_removable_nolock(struct page *page);
104extern int arch_remove_memory(u64 start, u64 size); 106extern int arch_remove_memory(u64 start, u64 size);
@@ -196,6 +198,9 @@ void put_online_mems(void);
196void mem_hotplug_begin(void); 198void mem_hotplug_begin(void);
197void mem_hotplug_done(void); 199void mem_hotplug_done(void);
198 200
201extern void set_zone_contiguous(struct zone *zone);
202extern void clear_zone_contiguous(struct zone *zone);
203
199#else /* ! CONFIG_MEMORY_HOTPLUG */ 204#else /* ! CONFIG_MEMORY_HOTPLUG */
200/* 205/*
201 * Stub functions for when hotplug is off 206 * Stub functions for when hotplug is off
@@ -267,7 +272,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
267extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 272extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
268 void *arg, int (*func)(struct memory_block *, void *)); 273 void *arg, int (*func)(struct memory_block *, void *));
269extern int add_memory(int nid, u64 start, u64 size); 274extern int add_memory(int nid, u64 start, u64 size);
270extern int add_memory_resource(int nid, struct resource *resource); 275extern int add_memory_resource(int nid, struct resource *resource, bool online);
271extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, 276extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
272 bool for_device); 277 bool for_device);
273extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); 278extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index cac1c0904d5f..9b50325e4ddf 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,9 +23,13 @@ enum migrate_reason {
23 MR_SYSCALL, /* also applies to cpusets */ 23 MR_SYSCALL, /* also applies to cpusets */
24 MR_MEMPOLICY_MBIND, 24 MR_MEMPOLICY_MBIND,
25 MR_NUMA_MISPLACED, 25 MR_NUMA_MISPLACED,
26 MR_CMA 26 MR_CMA,
27 MR_TYPES
27}; 28};
28 29
30/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
31extern char *migrate_reason_names[MR_TYPES];
32
29#ifdef CONFIG_MIGRATION 33#ifdef CONFIG_MIGRATION
30 34
31extern void putback_movable_pages(struct list_head *l); 35extern void putback_movable_pages(struct list_head *l);
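
The migrate.h hunk terminates enum migrate_reason with an MR_TYPES sentinel and declares a matching migrate_reason_names[] table kept in mm/debug.c, with a comment asking that the two stay in sync. The stand-alone sketch below shows the usual way to enforce that: size the table from its initializer and let a static assertion compare it with the sentinel. The first three enum entries and the name strings are recalled from the surrounding kernel sources rather than shown in this hunk, so treat them as illustrative.

    #include <stdio.h>

    /* Sentinel-plus-name-table pattern from the hunk above. */

    enum migrate_reason {
        MR_COMPACTION,
        MR_MEMORY_FAILURE,
        MR_MEMORY_HOTPLUG,
        MR_SYSCALL,             /* also applies to cpusets */
        MR_MEMPOLICY_MBIND,
        MR_NUMA_MISPLACED,
        MR_CMA,
        MR_TYPES                /* sentinel: number of reasons */
    };

    static const char * const migrate_reason_names[] = {
        "compaction",
        "memory_failure",
        "memory_hotplug",
        "syscall_or_cpuset",
        "mempolicy_mbind",
        "numa_misplaced",
        "cma",
    };

    /* A reason added to the enum but not to the table breaks the build here. */
    _Static_assert(sizeof(migrate_reason_names) / sizeof(migrate_reason_names[0])
                   == MR_TYPES, "migrate_reason_names out of sync");

    int main(void)
    {
        for (int i = 0; i < MR_TYPES; i++)
            printf("%d -> %s\n", i, migrate_reason_names[i]);
        return 0;
    }
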
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3579d1e2fe3a..dbf1eddab964 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -905,20 +905,11 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
905{ 905{
906 return page->mem_cgroup; 906 return page->mem_cgroup;
907} 907}
908
909static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
910{
911 page->mem_cgroup = memcg;
912}
913#else 908#else
914static inline struct mem_cgroup *page_memcg(struct page *page) 909static inline struct mem_cgroup *page_memcg(struct page *page)
915{ 910{
916 return NULL; 911 return NULL;
917} 912}
918
919static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
920{
921}
922#endif 913#endif
923 914
924/* 915/*
@@ -1300,10 +1291,9 @@ int __set_page_dirty_nobuffers(struct page *page);
1300int __set_page_dirty_no_writeback(struct page *page); 1291int __set_page_dirty_no_writeback(struct page *page);
1301int redirty_page_for_writepage(struct writeback_control *wbc, 1292int redirty_page_for_writepage(struct writeback_control *wbc,
1302 struct page *page); 1293 struct page *page);
1303void account_page_dirtied(struct page *page, struct address_space *mapping, 1294void account_page_dirtied(struct page *page, struct address_space *mapping);
1304 struct mem_cgroup *memcg);
1305void account_page_cleaned(struct page *page, struct address_space *mapping, 1295void account_page_cleaned(struct page *page, struct address_space *mapping,
1306 struct mem_cgroup *memcg, struct bdi_writeback *wb); 1296 struct bdi_writeback *wb);
1307int set_page_dirty(struct page *page); 1297int set_page_dirty(struct page *page);
1308int set_page_dirty_lock(struct page *page); 1298int set_page_dirty_lock(struct page *page);
1309void cancel_dirty_page(struct page *page); 1299void cancel_dirty_page(struct page *page);
@@ -2178,6 +2168,17 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
2178 unsigned long size, pte_fn_t fn, void *data); 2168 unsigned long size, pte_fn_t fn, void *data);
2179 2169
2180 2170
2171#ifdef CONFIG_PAGE_POISONING
2172extern bool page_poisoning_enabled(void);
2173extern void kernel_poison_pages(struct page *page, int numpages, int enable);
2174extern bool page_is_poisoned(struct page *page);
2175#else
2176static inline bool page_poisoning_enabled(void) { return false; }
2177static inline void kernel_poison_pages(struct page *page, int numpages,
2178 int enable) { }
2179static inline bool page_is_poisoned(struct page *page) { return false; }
2180#endif
2181
2181#ifdef CONFIG_DEBUG_PAGEALLOC 2182#ifdef CONFIG_DEBUG_PAGEALLOC
2182extern bool _debug_pagealloc_enabled; 2183extern bool _debug_pagealloc_enabled;
2183extern void __kernel_map_pages(struct page *page, int numpages, int enable); 2184extern void __kernel_map_pages(struct page *page, int numpages, int enable);
@@ -2197,14 +2198,18 @@ kernel_map_pages(struct page *page, int numpages, int enable)
2197} 2198}
2198#ifdef CONFIG_HIBERNATION 2199#ifdef CONFIG_HIBERNATION
2199extern bool kernel_page_present(struct page *page); 2200extern bool kernel_page_present(struct page *page);
2200#endif /* CONFIG_HIBERNATION */ 2201#endif /* CONFIG_HIBERNATION */
2201#else 2202#else /* CONFIG_DEBUG_PAGEALLOC */
2202static inline void 2203static inline void
2203kernel_map_pages(struct page *page, int numpages, int enable) {} 2204kernel_map_pages(struct page *page, int numpages, int enable) {}
2204#ifdef CONFIG_HIBERNATION 2205#ifdef CONFIG_HIBERNATION
2205static inline bool kernel_page_present(struct page *page) { return true; } 2206static inline bool kernel_page_present(struct page *page) { return true; }
2206#endif /* CONFIG_HIBERNATION */ 2207#endif /* CONFIG_HIBERNATION */
2207#endif 2208static inline bool debug_pagealloc_enabled(void)
2209{
2210 return false;
2211}
2212#endif /* CONFIG_DEBUG_PAGEALLOC */
2208 2213
2209#ifdef __HAVE_ARCH_GATE_AREA 2214#ifdef __HAVE_ARCH_GATE_AREA
2210extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); 2215extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 053824b0a412..de7be78c6f0e 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -9,8 +9,7 @@ struct vm_area_struct;
9struct mm_struct; 9struct mm_struct;
10 10
11extern void dump_page(struct page *page, const char *reason); 11extern void dump_page(struct page *page, const char *reason);
12extern void dump_page_badflags(struct page *page, const char *reason, 12extern void __dump_page(struct page *page, const char *reason);
13 unsigned long badflags);
14void dump_vma(const struct vm_area_struct *vma); 13void dump_vma(const struct vm_area_struct *vma);
15void dump_mm(const struct mm_struct *mm); 14void dump_mm(const struct mm_struct *mm);
16 15
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7b6c2cfee390..6de02ac378a0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,9 @@ enum {
63 MIGRATE_TYPES 63 MIGRATE_TYPES
64}; 64};
65 65
66/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
67extern char * const migratetype_names[MIGRATE_TYPES];
68
66#ifdef CONFIG_CMA 69#ifdef CONFIG_CMA
67# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) 70# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
68#else 71#else
@@ -209,10 +212,12 @@ struct zone_reclaim_stat {
209}; 212};
210 213
211struct lruvec { 214struct lruvec {
212 struct list_head lists[NR_LRU_LISTS]; 215 struct list_head lists[NR_LRU_LISTS];
213 struct zone_reclaim_stat reclaim_stat; 216 struct zone_reclaim_stat reclaim_stat;
217 /* Evictions & activations on the inactive file list */
218 atomic_long_t inactive_age;
214#ifdef CONFIG_MEMCG 219#ifdef CONFIG_MEMCG
215 struct zone *zone; 220 struct zone *zone;
216#endif 221#endif
217}; 222};
218 223
@@ -487,9 +492,6 @@ struct zone {
487 spinlock_t lru_lock; 492 spinlock_t lru_lock;
488 struct lruvec lruvec; 493 struct lruvec lruvec;
489 494
490 /* Evictions & activations on the inactive file list */
491 atomic_long_t inactive_age;
492
493 /* 495 /*
494 * When free pages are below this point, additional steps are taken 496 * When free pages are below this point, additional steps are taken
495 * when reading the number of free pages to avoid per-cpu counter 497 * when reading the number of free pages to avoid per-cpu counter
@@ -520,6 +522,8 @@ struct zone {
520 bool compact_blockskip_flush; 522 bool compact_blockskip_flush;
521#endif 523#endif
522 524
525 bool contiguous;
526
523 ZONE_PADDING(_pad3_) 527 ZONE_PADDING(_pad3_)
524 /* Zone statistics */ 528 /* Zone statistics */
525 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 529 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
@@ -758,6 +762,8 @@ static inline struct zone *lruvec_zone(struct lruvec *lruvec)
758#endif 762#endif
759} 763}
760 764
765extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
766
761#ifdef CONFIG_HAVE_MEMORY_PRESENT 767#ifdef CONFIG_HAVE_MEMORY_PRESENT
762void memory_present(int nid, unsigned long start, unsigned long end); 768void memory_present(int nid, unsigned long start, unsigned long end);
763#else 769#else
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 17f118a82854..e1fe7cf5bddf 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -45,6 +45,7 @@ struct page_ext {
45 unsigned int order; 45 unsigned int order;
46 gfp_t gfp_mask; 46 gfp_t gfp_mask;
47 unsigned int nr_entries; 47 unsigned int nr_entries;
48 int last_migrate_reason;
48 unsigned long trace_entries[8]; 49 unsigned long trace_entries[8];
49#endif 50#endif
50}; 51};
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index cacaabea8a09..46f1b939948c 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -1,38 +1,54 @@
1#ifndef __LINUX_PAGE_OWNER_H 1#ifndef __LINUX_PAGE_OWNER_H
2#define __LINUX_PAGE_OWNER_H 2#define __LINUX_PAGE_OWNER_H
3 3
4#include <linux/jump_label.h>
5
4#ifdef CONFIG_PAGE_OWNER 6#ifdef CONFIG_PAGE_OWNER
5extern bool page_owner_inited; 7extern struct static_key_false page_owner_inited;
6extern struct page_ext_operations page_owner_ops; 8extern struct page_ext_operations page_owner_ops;
7 9
8extern void __reset_page_owner(struct page *page, unsigned int order); 10extern void __reset_page_owner(struct page *page, unsigned int order);
9extern void __set_page_owner(struct page *page, 11extern void __set_page_owner(struct page *page,
10 unsigned int order, gfp_t gfp_mask); 12 unsigned int order, gfp_t gfp_mask);
11extern gfp_t __get_page_owner_gfp(struct page *page); 13extern gfp_t __get_page_owner_gfp(struct page *page);
14extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
15extern void __set_page_owner_migrate_reason(struct page *page, int reason);
16extern void __dump_page_owner(struct page *page);
12 17
13static inline void reset_page_owner(struct page *page, unsigned int order) 18static inline void reset_page_owner(struct page *page, unsigned int order)
14{ 19{
15 if (likely(!page_owner_inited)) 20 if (static_branch_unlikely(&page_owner_inited))
16 return; 21 __reset_page_owner(page, order);
17
18 __reset_page_owner(page, order);
19} 22}
20 23
21static inline void set_page_owner(struct page *page, 24static inline void set_page_owner(struct page *page,
22 unsigned int order, gfp_t gfp_mask) 25 unsigned int order, gfp_t gfp_mask)
23{ 26{
24 if (likely(!page_owner_inited)) 27 if (static_branch_unlikely(&page_owner_inited))
25 return; 28 __set_page_owner(page, order, gfp_mask);
26
27 __set_page_owner(page, order, gfp_mask);
28} 29}
29 30
30static inline gfp_t get_page_owner_gfp(struct page *page) 31static inline gfp_t get_page_owner_gfp(struct page *page)
31{ 32{
32 if (likely(!page_owner_inited)) 33 if (static_branch_unlikely(&page_owner_inited))
34 return __get_page_owner_gfp(page);
35 else
33 return 0; 36 return 0;
34 37}
35 return __get_page_owner_gfp(page); 38static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
39{
40 if (static_branch_unlikely(&page_owner_inited))
41 __copy_page_owner(oldpage, newpage);
42}
43static inline void set_page_owner_migrate_reason(struct page *page, int reason)
44{
45 if (static_branch_unlikely(&page_owner_inited))
46 __set_page_owner_migrate_reason(page, reason);
47}
48static inline void dump_page_owner(struct page *page)
49{
50 if (static_branch_unlikely(&page_owner_inited))
51 __dump_page_owner(page);
36} 52}
37#else 53#else
38static inline void reset_page_owner(struct page *page, unsigned int order) 54static inline void reset_page_owner(struct page *page, unsigned int order)
@@ -46,6 +62,14 @@ static inline gfp_t get_page_owner_gfp(struct page *page)
46{ 62{
47 return 0; 63 return 0;
48} 64}
49 65static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
66{
67}
68static inline void set_page_owner_migrate_reason(struct page *page, int reason)
69{
70}
71static inline void dump_page_owner(struct page *page)
72{
73}
50#endif /* CONFIG_PAGE_OWNER */ 74#endif /* CONFIG_PAGE_OWNER */
51#endif /* __LINUX_PAGE_OWNER_H */ 75#endif /* __LINUX_PAGE_OWNER_H */
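
The page_owner.h hunk converts page_owner_inited from a plain bool into a static key, so the inline wrappers now read "call the out-of-line helper only when static_branch_unlikely() fires" instead of "return early unless the bool is set". The userspace approximation below exercises that inverted control flow; the static_branch_unlikely() macro here is just a bool read standing in for the kernel's patched jump label, and the page/order arguments are simplified.

    #include <stdbool.h>
    #include <stdio.h>

    /* Userspace approximation: in the kernel, static_branch_unlikely() compiles
     * to a patched no-op until the key is enabled; here it is a bool read so the
     * control flow can actually be run. */

    static bool page_owner_inited;  /* struct static_key_false in the real code */

    #define static_branch_unlikely(key) (*(key))    /* illustration only */

    static void __set_page_owner(int page, unsigned int order)
    {
        printf("recording owner of page %d (order %u)\n", page, order);
    }

    /* Old shape: if (likely(!page_owner_inited)) return; __set_page_owner(...);
     * New shape: call out of line only when the key is enabled. */
    static void set_page_owner(int page, unsigned int order)
    {
        if (static_branch_unlikely(&page_owner_inited))
            __set_page_owner(page, order);
    }

    int main(void)
    {
        set_page_owner(1, 0);       /* key off: nothing happens        */
        page_owner_inited = true;   /* models static_branch_enable()   */
        set_page_owner(2, 0);       /* key on: slow path actually runs */
        return 0;
    }
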
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 92395a0a7dc5..183b15ea052b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -663,8 +663,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
663int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 663int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
664 pgoff_t index, gfp_t gfp_mask); 664 pgoff_t index, gfp_t gfp_mask);
665extern void delete_from_page_cache(struct page *page); 665extern void delete_from_page_cache(struct page *page);
666extern void __delete_from_page_cache(struct page *page, void *shadow, 666extern void __delete_from_page_cache(struct page *page, void *shadow);
667 struct mem_cgroup *memcg);
668int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); 667int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
669 668
670/* 669/*
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 4a27153574e2..51334edec506 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -30,7 +30,11 @@
30#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) 30#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
31 31
32/********** mm/debug-pagealloc.c **********/ 32/********** mm/debug-pagealloc.c **********/
33#ifdef CONFIG_PAGE_POISONING_ZERO
34#define PAGE_POISON 0x00
35#else
33#define PAGE_POISON 0xaa 36#define PAGE_POISON 0xaa
37#endif
34 38
35/********** mm/page_alloc.c ************/ 39/********** mm/page_alloc.c ************/
36 40
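
The poison.h hunk lets PAGE_POISON be 0x00 when CONFIG_PAGE_POISONING_ZERO is selected (so a freed-and-verified page should not need to be cleared again before reuse) and keeps the traditional 0xaa pattern otherwise. The small demonstration below, an illustration rather than kernel code, shows what the value is used for: fill a "freed" page with the pattern and flag any byte that changed before the next "allocation".

    #include <stdio.h>
    #include <string.h>

    #ifdef PAGE_POISONING_ZERO      /* models CONFIG_PAGE_POISONING_ZERO */
    #define PAGE_POISON 0x00
    #else
    #define PAGE_POISON 0xaa
    #endif

    #define MODEL_PAGE_SIZE 4096

    static unsigned char page[MODEL_PAGE_SIZE];

    static void poison_page(void)
    {
        memset(page, PAGE_POISON, sizeof(page));
    }

    static int check_poison(void)
    {
        for (size_t i = 0; i < sizeof(page); i++)
            if (page[i] != PAGE_POISON)
                return -1;      /* someone wrote to a freed page */
        return 0;
    }

    int main(void)
    {
        poison_page();
        page[42] = 0x5a;        /* simulated use-after-free scribble */
        printf("poison check: %s\n",
               check_poison() ? "corruption detected" : "clean");
        return 0;
    }
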
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3627d5c1bc47..e4b568738ca3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -20,7 +20,7 @@
20 * Flags to pass to kmem_cache_create(). 20 * Flags to pass to kmem_cache_create().
21 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. 21 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
22 */ 22 */
23#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ 23#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */
24#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ 24#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
25#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ 25#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
26#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ 26#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
@@ -314,7 +314,7 @@ void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment
314void kmem_cache_free(struct kmem_cache *, void *); 314void kmem_cache_free(struct kmem_cache *, void *);
315 315
316/* 316/*
317 * Bulk allocation and freeing operations. These are accellerated in an 317 * Bulk allocation and freeing operations. These are accelerated in an
318 * allocator specific way to avoid taking locks repeatedly or building 318 * allocator specific way to avoid taking locks repeatedly or building
319 * metadata structures unnecessarily. 319 * metadata structures unnecessarily.
320 * 320 *
@@ -323,6 +323,15 @@ void kmem_cache_free(struct kmem_cache *, void *);
323void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); 323void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
324int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); 324int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
325 325
326/*
327 * Caller must not use kfree_bulk() on memory not originally allocated
328 * by kmalloc(), because the SLOB allocator cannot handle this.
329 */
330static __always_inline void kfree_bulk(size_t size, void **p)
331{
332 kmem_cache_free_bulk(NULL, size, p);
333}
334
326#ifdef CONFIG_NUMA 335#ifdef CONFIG_NUMA
327void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; 336void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
328void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; 337void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
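
The slab.h hunk adds kfree_bulk(), a thin wrapper that passes a NULL cache to kmem_cache_free_bulk() and is therefore only valid for kmalloc()-backed memory, as its comment warns. The fragment below is a hedged, kernel-context sketch of how a caller might use it; demo_bulk_free() and the sizes are made up, and it is not a stand-alone program.

    #include <linux/errno.h>
    #include <linux/slab.h>

    #define NR_OBJS 16

    static int demo_bulk_free(void)
    {
        void *objs[NR_OBJS];
        int i;

        for (i = 0; i < NR_OBJS; i++) {
            objs[i] = kmalloc(128, GFP_KERNEL);
            if (!objs[i]) {
                if (i)
                    kfree_bulk(i, objs);    /* free what was allocated so far */
                return -ENOMEM;
            }
        }

        /* ... use the objects ... */

        /* One call instead of NR_OBJS kfree() calls; valid only because every
         * pointer came from kmalloc(), as the new comment in slab.h requires. */
        kfree_bulk(NR_OBJS, objs);
        return 0;
    }
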
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index cf139d3fa513..e878ba35ae91 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -60,6 +60,9 @@ struct kmem_cache {
60 atomic_t allocmiss; 60 atomic_t allocmiss;
61 atomic_t freehit; 61 atomic_t freehit;
62 atomic_t freemiss; 62 atomic_t freemiss;
63#ifdef CONFIG_DEBUG_SLAB_LEAK
64 atomic_t store_user_clean;
65#endif
63 66
64 /* 67 /*
65 * If debugging is enabled, then the allocator can add additional 68 * If debugging is enabled, then the allocator can add additional
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b7e57927f521..ac5143f95ee6 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,6 +81,7 @@ struct kmem_cache {
81 int reserved; /* Reserved bytes at the end of slabs */ 81 int reserved; /* Reserved bytes at the end of slabs */
82 const char *name; /* Name (only for display!) */ 82 const char *name; /* Name (only for display!) */
83 struct list_head list; /* List of slab caches */ 83 struct list_head list; /* List of slab caches */
84 int red_left_pad; /* Left redzone padding size */
84#ifdef CONFIG_SYSFS 85#ifdef CONFIG_SYSFS
85 struct kobject kobj; /* For sysfs */ 86 struct kobject kobj; /* For sysfs */
86#endif 87#endif
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 925730bc9fc1..705df7db4482 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -15,16 +15,6 @@ struct tracer;
15struct dentry; 15struct dentry;
16struct bpf_prog; 16struct bpf_prog;
17 17
18struct trace_print_flags {
19 unsigned long mask;
20 const char *name;
21};
22
23struct trace_print_flags_u64 {
24 unsigned long long mask;
25 const char *name;
26};
27
28const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, 18const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
29 unsigned long flags, 19 unsigned long flags,
30 const struct trace_print_flags *flag_array); 20 const struct trace_print_flags *flag_array);
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index e1ee97c713bf..4ac89acb6136 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -3,13 +3,23 @@
3 3
4/* 4/*
5 * File can be included directly by headers who only want to access 5 * File can be included directly by headers who only want to access
6 * tracepoint->key to guard out of line trace calls. Otherwise 6 * tracepoint->key to guard out of line trace calls, or the definition of
7 * linux/tracepoint.h should be used. 7 * trace_print_flags{_u64}. Otherwise linux/tracepoint.h should be used.
8 */ 8 */
9 9
10#include <linux/atomic.h> 10#include <linux/atomic.h>
11#include <linux/static_key.h> 11#include <linux/static_key.h>
12 12
13struct trace_print_flags {
14 unsigned long mask;
15 const char *name;
16};
17
18struct trace_print_flags_u64 {
19 unsigned long long mask;
20 const char *name;
21};
22
13struct tracepoint_func { 23struct tracepoint_func {
14 void *func; 24 void *func;
15 void *data; 25 void *data;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d866f21efbbf..677807f29a1c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -6,7 +6,7 @@
6 6
7#include <linux/writeback.h> 7#include <linux/writeback.h>
8#include <linux/tracepoint.h> 8#include <linux/tracepoint.h>
9#include <trace/events/gfpflags.h> 9#include <trace/events/mmflags.h>
10 10
11struct btrfs_root; 11struct btrfs_root;
12struct btrfs_fs_info; 12struct btrfs_fs_info;
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index c92d1e1cbad9..111e5666e5eb 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -7,7 +7,7 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/list.h> 8#include <linux/list.h>
9#include <linux/tracepoint.h> 9#include <linux/tracepoint.h>
10#include <trace/events/gfpflags.h> 10#include <trace/events/mmflags.h>
11 11
12#define COMPACTION_STATUS \ 12#define COMPACTION_STATUS \
13 EM( COMPACT_DEFERRED, "deferred") \ 13 EM( COMPACT_DEFERRED, "deferred") \
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
deleted file mode 100644
index dde6bf092c8a..000000000000
--- a/include/trace/events/gfpflags.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * The order of these masks is important. Matching masks will be seen
3 * first and the left over flags will end up showing by themselves.
4 *
5 * For example, if we have GFP_KERNEL before GFP_USER we will get:
6 *
7 * GFP_KERNEL|GFP_HARDWALL
8 *
9 * Thus most bits set go first.
10 */
11#define show_gfp_flags(flags) \
12 (flags) ? __print_flags(flags, "|", \
13 {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
14 {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
15 {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
16 {(unsigned long)GFP_USER, "GFP_USER"}, \
17 {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
18 {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
19 {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
20 {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
21 {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
22 {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
23 {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \
24 {(unsigned long)__GFP_IO, "GFP_IO"}, \
25 {(unsigned long)__GFP_COLD, "GFP_COLD"}, \
26 {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
27 {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
28 {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \
29 {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \
30 {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
31 {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
32 {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
33 {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \
34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
39 {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \
40 {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \
41 {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
42 ) : "GFP_NOWAIT"
43
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 47c6212d8f3c..551ba4acde4d 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -6,8 +6,6 @@
6 6
7#include <linux/tracepoint.h> 7#include <linux/tracepoint.h>
8 8
9#include <trace/events/gfpflags.h>
10
11#define SCAN_STATUS \ 9#define SCAN_STATUS \
12 EM( SCAN_FAIL, "failed") \ 10 EM( SCAN_FAIL, "failed") \
13 EM( SCAN_SUCCEED, "succeeded") \ 11 EM( SCAN_SUCCEED, "succeeded") \
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f7554fd7fc62..ca7217389067 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -6,7 +6,7 @@
6 6
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/tracepoint.h> 8#include <linux/tracepoint.h>
9#include <trace/events/gfpflags.h> 9#include <trace/events/mmflags.h>
10 10
11DECLARE_EVENT_CLASS(kmem_alloc, 11DECLARE_EVENT_CLASS(kmem_alloc,
12 12
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
new file mode 100644
index 000000000000..a849185c82f0
--- /dev/null
+++ b/include/trace/events/mmflags.h
@@ -0,0 +1,164 @@
1/*
2 * The order of these masks is important. Matching masks will be seen
3 * first and the left over flags will end up showing by themselves.
4 *
5 * For example, if we have GFP_KERNEL before GFP_USER we will get:
6 *
7 * GFP_KERNEL|GFP_HARDWALL
8 *
9 * Thus most bits set go first.
10 */
11
12#define __def_gfpflag_names \
13 {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
14 {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
15 {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
16 {(unsigned long)GFP_USER, "GFP_USER"}, \
17 {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
18 {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \
19 {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
20 {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
21 {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
22 {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
23 {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \
24 {(unsigned long)GFP_DMA, "GFP_DMA"}, \
25 {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \
26 {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \
27 {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \
28 {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \
29 {(unsigned long)__GFP_IO, "__GFP_IO"}, \
30 {(unsigned long)__GFP_FS, "__GFP_FS"}, \
31 {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \
32 {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \
33 {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \
34 {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \
35 {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \
36 {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \
37 {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \
38 {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \
39 {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \
40 {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \
41 {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \
42 {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \
43 {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \
44 {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \
45 {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \
46 {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
47 {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
48 {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
49 {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\
50 {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \
51
52#define show_gfp_flags(flags) \
53 (flags) ? __print_flags(flags, "|", \
54 __def_gfpflag_names \
55 ) : "none"
56
57#ifdef CONFIG_MMU
58#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
59#else
60#define IF_HAVE_PG_MLOCK(flag,string)
61#endif
62
63#ifdef CONFIG_ARCH_USES_PG_UNCACHED
64#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string}
65#else
66#define IF_HAVE_PG_UNCACHED(flag,string)
67#endif
68
69#ifdef CONFIG_MEMORY_FAILURE
70#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string}
71#else
72#define IF_HAVE_PG_HWPOISON(flag,string)
73#endif
74
75#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
76#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
77#else
78#define IF_HAVE_PG_IDLE(flag,string)
79#endif
80
81#define __def_pageflag_names \
82 {1UL << PG_locked, "locked" }, \
83 {1UL << PG_error, "error" }, \
84 {1UL << PG_referenced, "referenced" }, \
85 {1UL << PG_uptodate, "uptodate" }, \
86 {1UL << PG_dirty, "dirty" }, \
87 {1UL << PG_lru, "lru" }, \
88 {1UL << PG_active, "active" }, \
89 {1UL << PG_slab, "slab" }, \
90 {1UL << PG_owner_priv_1, "owner_priv_1" }, \
91 {1UL << PG_arch_1, "arch_1" }, \
92 {1UL << PG_reserved, "reserved" }, \
93 {1UL << PG_private, "private" }, \
94 {1UL << PG_private_2, "private_2" }, \
95 {1UL << PG_writeback, "writeback" }, \
96 {1UL << PG_head, "head" }, \
97 {1UL << PG_swapcache, "swapcache" }, \
98 {1UL << PG_mappedtodisk, "mappedtodisk" }, \
99 {1UL << PG_reclaim, "reclaim" }, \
100 {1UL << PG_swapbacked, "swapbacked" }, \
101 {1UL << PG_unevictable, "unevictable" } \
102IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
103IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
104IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
105IF_HAVE_PG_IDLE(PG_young, "young" ) \
106IF_HAVE_PG_IDLE(PG_idle, "idle" )
107
108#define show_page_flags(flags) \
109 (flags) ? __print_flags(flags, "|", \
110 __def_pageflag_names \
111 ) : "none"
112
113#if defined(CONFIG_X86)
114#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" }
115#elif defined(CONFIG_PPC)
116#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" }
117#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
118#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" }
119#elif !defined(CONFIG_MMU)
120#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" }
121#else
122#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" }
123#endif
124
125#ifdef CONFIG_MEM_SOFT_DIRTY
126#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
127#else
128#define IF_HAVE_VM_SOFTDIRTY(flag,name)
129#endif
130
131#define __def_vmaflag_names \
132 {VM_READ, "read" }, \
133 {VM_WRITE, "write" }, \
134 {VM_EXEC, "exec" }, \
135 {VM_SHARED, "shared" }, \
136 {VM_MAYREAD, "mayread" }, \
137 {VM_MAYWRITE, "maywrite" }, \
138 {VM_MAYEXEC, "mayexec" }, \
139 {VM_MAYSHARE, "mayshare" }, \
140 {VM_GROWSDOWN, "growsdown" }, \
141 {VM_PFNMAP, "pfnmap" }, \
142 {VM_DENYWRITE, "denywrite" }, \
143 {VM_LOCKONFAULT, "lockonfault" }, \
144 {VM_LOCKED, "locked" }, \
145 {VM_IO, "io" }, \
146 {VM_SEQ_READ, "seqread" }, \
147 {VM_RAND_READ, "randread" }, \
148 {VM_DONTCOPY, "dontcopy" }, \
149 {VM_DONTEXPAND, "dontexpand" }, \
150 {VM_ACCOUNT, "account" }, \
151 {VM_NORESERVE, "noreserve" }, \
152 {VM_HUGETLB, "hugetlb" }, \
153 __VM_ARCH_SPECIFIC , \
154 {VM_DONTDUMP, "dontdump" }, \
155IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
156 {VM_MIXEDMAP, "mixedmap" }, \
157 {VM_HUGEPAGE, "hugepage" }, \
158 {VM_NOHUGEPAGE, "nohugepage" }, \
159 {VM_MERGEABLE, "mergeable" } \
160
161#define show_vma_flags(flags) \
162 (flags) ? __print_flags(flags, "|", \
163 __def_vmaflag_names \
164 ) : "none"
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 31763dd8db1c..0101ef37f1ee 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -8,7 +8,7 @@
8#include <linux/tracepoint.h> 8#include <linux/tracepoint.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/memcontrol.h> 10#include <linux/memcontrol.h>
11#include <trace/events/gfpflags.h> 11#include <trace/events/mmflags.h>
12 12
13#define RECLAIM_WB_ANON 0x0001u 13#define RECLAIM_WB_ANON 0x0001u
14#define RECLAIM_WB_FILE 0x0002u 14#define RECLAIM_WB_FILE 0x0002u
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index bb991dfe134f..9175a1b4dc69 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -1,7 +1,4 @@
1/* -*- linux-c -*- ------------------------------------------------------- * 1/*
2 *
3 * linux/include/linux/auto_fs.h
4 *
5 * Copyright 1997 Transmeta Corporation - All Rights Reserved 2 * Copyright 1997 Transmeta Corporation - All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
@@ -51,7 +48,7 @@ struct autofs_packet_hdr {
51 48
52struct autofs_packet_missing { 49struct autofs_packet_missing {
53 struct autofs_packet_hdr hdr; 50 struct autofs_packet_hdr hdr;
54 autofs_wqt_t wait_queue_token; 51 autofs_wqt_t wait_queue_token;
55 int len; 52 int len;
56 char name[NAME_MAX+1]; 53 char name[NAME_MAX+1];
57}; 54};
@@ -63,12 +60,12 @@ struct autofs_packet_expire {
63 char name[NAME_MAX+1]; 60 char name[NAME_MAX+1];
64}; 61};
65 62
66#define AUTOFS_IOC_READY _IO(0x93,0x60) 63#define AUTOFS_IOC_READY _IO(0x93, 0x60)
67#define AUTOFS_IOC_FAIL _IO(0x93,0x61) 64#define AUTOFS_IOC_FAIL _IO(0x93, 0x61)
68#define AUTOFS_IOC_CATATONIC _IO(0x93,0x62) 65#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62)
69#define AUTOFS_IOC_PROTOVER _IOR(0x93,0x63,int) 66#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int)
70#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t) 67#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t)
71#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long) 68#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long)
72#define AUTOFS_IOC_EXPIRE _IOR(0x93,0x65,struct autofs_packet_expire) 69#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire)
73 70
74#endif /* _UAPI_LINUX_AUTO_FS_H */ 71#endif /* _UAPI_LINUX_AUTO_FS_H */
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index e02982fa2953..8f8f1bdcca8c 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -1,6 +1,4 @@
1/* -*- c -*- 1/*
2 * linux/include/linux/auto_fs4.h
3 *
4 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> 2 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
5 * 3 *
6 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
@@ -38,7 +36,6 @@
38static inline void set_autofs_type_indirect(unsigned int *type) 36static inline void set_autofs_type_indirect(unsigned int *type)
39{ 37{
40 *type = AUTOFS_TYPE_INDIRECT; 38 *type = AUTOFS_TYPE_INDIRECT;
41 return;
42} 39}
43 40
44static inline unsigned int autofs_type_indirect(unsigned int type) 41static inline unsigned int autofs_type_indirect(unsigned int type)
@@ -49,7 +46,6 @@ static inline unsigned int autofs_type_indirect(unsigned int type)
49static inline void set_autofs_type_direct(unsigned int *type) 46static inline void set_autofs_type_direct(unsigned int *type)
50{ 47{
51 *type = AUTOFS_TYPE_DIRECT; 48 *type = AUTOFS_TYPE_DIRECT;
52 return;
53} 49}
54 50
55static inline unsigned int autofs_type_direct(unsigned int type) 51static inline unsigned int autofs_type_direct(unsigned int type)
@@ -60,7 +56,6 @@ static inline unsigned int autofs_type_direct(unsigned int type)
60static inline void set_autofs_type_offset(unsigned int *type) 56static inline void set_autofs_type_offset(unsigned int *type)
61{ 57{
62 *type = AUTOFS_TYPE_OFFSET; 58 *type = AUTOFS_TYPE_OFFSET;
63 return;
64} 59}
65 60
66static inline unsigned int autofs_type_offset(unsigned int type) 61static inline unsigned int autofs_type_offset(unsigned int type)
@@ -81,7 +76,6 @@ static inline unsigned int autofs_type_trigger(unsigned int type)
81static inline void set_autofs_type_any(unsigned int *type) 76static inline void set_autofs_type_any(unsigned int *type)
82{ 77{
83 *type = AUTOFS_TYPE_ANY; 78 *type = AUTOFS_TYPE_ANY;
84 return;
85} 79}
86 80
87static inline unsigned int autofs_type_any(unsigned int type) 81static inline unsigned int autofs_type_any(unsigned int type)
@@ -114,7 +108,7 @@ enum autofs_notify {
114/* v4 multi expire (via pipe) */ 108/* v4 multi expire (via pipe) */
115struct autofs_packet_expire_multi { 109struct autofs_packet_expire_multi {
116 struct autofs_packet_hdr hdr; 110 struct autofs_packet_hdr hdr;
117 autofs_wqt_t wait_queue_token; 111 autofs_wqt_t wait_queue_token;
118 int len; 112 int len;
119 char name[NAME_MAX+1]; 113 char name[NAME_MAX+1];
120}; 114};
@@ -154,11 +148,10 @@ union autofs_v5_packet_union {
154 autofs_packet_expire_direct_t expire_direct; 148 autofs_packet_expire_direct_t expire_direct;
155}; 149};
156 150
157#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) 151#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int)
158#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI 152#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
159#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI 153#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI
160#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) 154#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int)
161#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int) 155#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int)
162
163 156
164#endif /* _LINUX_AUTO_FS4_H */ 157#endif /* _LINUX_AUTO_FS4_H */
diff --git a/init/Kconfig b/init/Kconfig
index 22320804fbaf..fd664b3ab99e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1420,6 +1420,28 @@ config KALLSYMS_ALL
1420 1420
1421 Say N unless you really need all symbols. 1421 Say N unless you really need all symbols.
1422 1422
1423config KALLSYMS_ABSOLUTE_PERCPU
1424 bool
1425 default X86_64 && SMP
1426
1427config KALLSYMS_BASE_RELATIVE
1428 bool
1429 depends on KALLSYMS
1430 default !IA64 && !(TILE && 64BIT)
1431 help
1432 Instead of emitting them as absolute values in the native word size,
1433 emit the symbol references in the kallsyms table as 32-bit entries,
1434 each containing a relative value in the range [base, base + U32_MAX]
1435 or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either
1436 an absolute value in the range [0, S32_MAX] or a relative value in the
1437 range [base, base + S32_MAX], where base is the lowest relative symbol
1438 address encountered in the image.
1439
1440 On 64-bit builds, this reduces the size of the address table by 50%,
1441 but more importantly, it results in entries whose values are build
1442 time constants, and no relocation pass is required at runtime to fix
1443 up the entries based on the runtime load address of the kernel.
1444
1423config PRINTK 1445config PRINTK
1424 default y 1446 default y
1425 bool "Enable support for printk" if EXPERT 1447 bool "Enable support for printk" if EXPERT
diff --git a/init/main.c b/init/main.c
index 8dc93df20f7f..b3c6e363ae18 100644
--- a/init/main.c
+++ b/init/main.c
@@ -705,7 +705,6 @@ static int __init initcall_blacklist(char *str)
705 705
706static bool __init_or_module initcall_blacklisted(initcall_t fn) 706static bool __init_or_module initcall_blacklisted(initcall_t fn)
707{ 707{
708 struct list_head *tmp;
709 struct blacklist_entry *entry; 708 struct blacklist_entry *entry;
710 char *fn_name; 709 char *fn_name;
711 710
@@ -713,8 +712,7 @@ static bool __init_or_module initcall_blacklisted(initcall_t fn)
713 if (!fn_name) 712 if (!fn_name)
714 return false; 713 return false;
715 714
716 list_for_each(tmp, &blacklisted_initcalls) { 715 list_for_each_entry(entry, &blacklisted_initcalls, next) {
717 entry = list_entry(tmp, struct blacklist_entry, next);
718 if (!strcmp(fn_name, entry->buf)) { 716 if (!strcmp(fn_name, entry->buf)) {
719 pr_debug("initcall %s blacklisted\n", fn_name); 717 pr_debug("initcall %s blacklisted\n", fn_name);
720 kfree(fn_name); 718 kfree(fn_name);
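
The initcall_blacklisted() hunk above is the usual conversion from list_for_each() plus list_entry() to list_for_each_entry(), which drops the separate struct list_head cursor. A hedged sketch of the resulting idiom follows; the structure names are taken from the hunk, everything else is simplified for illustration.

/* Sketch only: mirrors the shape of the converted loop, not the full file. */
#include <linux/list.h>
#include <linux/string.h>
#include <linux/types.h>

struct blacklist_entry {
	struct list_head next;
	char *buf;
};

static LIST_HEAD(blacklisted_initcalls);

static bool demo_is_blacklisted(const char *fn_name)
{
	struct blacklist_entry *entry;

	/* the iterator is typed; list_entry() and the tmp cursor are gone */
	list_for_each_entry(entry, &blacklisted_initcalls, next) {
		if (!strcmp(fn_name, entry->buf))
			return true;
	}
	return false;
}
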
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5c5987f10819..fafd1a3ef0da 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -38,6 +38,7 @@
38 * during the second link stage. 38 * during the second link stage.
39 */ 39 */
40extern const unsigned long kallsyms_addresses[] __weak; 40extern const unsigned long kallsyms_addresses[] __weak;
41extern const int kallsyms_offsets[] __weak;
41extern const u8 kallsyms_names[] __weak; 42extern const u8 kallsyms_names[] __weak;
42 43
43/* 44/*
@@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak;
47extern const unsigned long kallsyms_num_syms 48extern const unsigned long kallsyms_num_syms
48__attribute__((weak, section(".rodata"))); 49__attribute__((weak, section(".rodata")));
49 50
51extern const unsigned long kallsyms_relative_base
52__attribute__((weak, section(".rodata")));
53
50extern const u8 kallsyms_token_table[] __weak; 54extern const u8 kallsyms_token_table[] __weak;
51extern const u16 kallsyms_token_index[] __weak; 55extern const u16 kallsyms_token_index[] __weak;
52 56
@@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
176 return name - kallsyms_names; 180 return name - kallsyms_names;
177} 181}
178 182
183static unsigned long kallsyms_sym_address(int idx)
184{
185 if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
186 return kallsyms_addresses[idx];
187
188 /* values are unsigned offsets if --absolute-percpu is not in effect */
189 if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
190 return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
191
192 /* ...otherwise, positive offsets are absolute values */
193 if (kallsyms_offsets[idx] >= 0)
194 return kallsyms_offsets[idx];
195
196 /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
197 return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
198}
199
179/* Lookup the address for this symbol. Returns 0 if not found. */ 200/* Lookup the address for this symbol. Returns 0 if not found. */
180unsigned long kallsyms_lookup_name(const char *name) 201unsigned long kallsyms_lookup_name(const char *name)
181{ 202{
@@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name)
187 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); 208 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
188 209
189 if (strcmp(namebuf, name) == 0) 210 if (strcmp(namebuf, name) == 0)
190 return kallsyms_addresses[i]; 211 return kallsyms_sym_address(i);
191 } 212 }
192 return module_kallsyms_lookup_name(name); 213 return module_kallsyms_lookup_name(name);
193} 214}
@@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
204 225
205 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 226 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
206 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); 227 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
207 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); 228 ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
208 if (ret != 0) 229 if (ret != 0)
209 return ret; 230 return ret;
210 } 231 }
@@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
220 unsigned long i, low, high, mid; 241 unsigned long i, low, high, mid;
221 242
222 /* This kernel should never had been booted. */ 243 /* This kernel should never had been booted. */
223 BUG_ON(!kallsyms_addresses); 244 if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
245 BUG_ON(!kallsyms_addresses);
246 else
247 BUG_ON(!kallsyms_offsets);
224 248
225 /* Do a binary search on the sorted kallsyms_addresses array. */ 249 /* Do a binary search on the sorted kallsyms_addresses array. */
226 low = 0; 250 low = 0;
@@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
228 252
229 while (high - low > 1) { 253 while (high - low > 1) {
230 mid = low + (high - low) / 2; 254 mid = low + (high - low) / 2;
231 if (kallsyms_addresses[mid] <= addr) 255 if (kallsyms_sym_address(mid) <= addr)
232 low = mid; 256 low = mid;
233 else 257 else
234 high = mid; 258 high = mid;
@@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
238 * Search for the first aliased symbol. Aliased 262 * Search for the first aliased symbol. Aliased
239 * symbols are symbols with the same address. 263 * symbols are symbols with the same address.
240 */ 264 */
241 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 265 while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low))
242 --low; 266 --low;
243 267
244 symbol_start = kallsyms_addresses[low]; 268 symbol_start = kallsyms_sym_address(low);
245 269
246 /* Search for next non-aliased symbol. */ 270 /* Search for next non-aliased symbol. */
247 for (i = low + 1; i < kallsyms_num_syms; i++) { 271 for (i = low + 1; i < kallsyms_num_syms; i++) {
248 if (kallsyms_addresses[i] > symbol_start) { 272 if (kallsyms_sym_address(i) > symbol_start) {
249 symbol_end = kallsyms_addresses[i]; 273 symbol_end = kallsyms_sym_address(i);
250 break; 274 break;
251 } 275 }
252 } 276 }
@@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
470 unsigned off = iter->nameoff; 494 unsigned off = iter->nameoff;
471 495
472 iter->module_name[0] = '\0'; 496 iter->module_name[0] = '\0';
473 iter->value = kallsyms_addresses[iter->pos]; 497 iter->value = kallsyms_sym_address(iter->pos);
474 498
475 iter->type = kallsyms_get_symbol_type(off); 499 iter->type = kallsyms_get_symbol_type(off);
476 500
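
kallsyms_sym_address() above implements the layout described by the KALLSYMS_BASE_RELATIVE help text: plain 32-bit offsets from a single base, or, with KALLSYMS_ABSOLUTE_PERCPU, non-negative entries kept as absolute values and negative entries decoded relative to base - 1. The standalone sketch below round-trips that arithmetic with illustrative numbers; encode() is just the mathematical inverse of the decode shown in the hunk, not the scripts/kallsyms.c implementation.

/* Round-trip check of the absolute-percpu encoding; all values are made up. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static const uint64_t base = 0xffffffff81000000ULL;	/* lowest relative symbol */

static int32_t encode(uint64_t addr, int is_percpu)
{
	if (is_percpu)
		return (int32_t)addr;			/* absolute, stays >= 0 */
	return -(int32_t)(addr - (base - 1));		/* relative, always < 0 */
}

static uint64_t decode(int32_t off)
{
	if (off >= 0)
		return off;				/* absolute value */
	return base - 1 - off;				/* relative to base - 1 */
}

int main(void)
{
	uint64_t text_sym = base + 0x123456;	/* ordinary kernel text symbol */
	uint64_t pcpu_sym = 0x16000;		/* small per-CPU address, kept absolute */

	assert(decode(encode(text_sym, 0)) == text_sym);
	assert(decode(encode(pcpu_sym, 1)) == pcpu_sym);
	printf("text %#llx pcpu %#llx\n",
	       (unsigned long long)decode(encode(text_sym, 0)),
	       (unsigned long long)decode(encode(pcpu_sym, 1)));
	return 0;
}
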
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f894a2cd9b2a..53ab2f85d77e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
148} 148}
149 149
150#ifdef CONFIG_LOCK_STAT 150#ifdef CONFIG_LOCK_STAT
151static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], 151static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
152 cpu_lock_stats);
153 152
154static inline u64 lockstat_clock(void) 153static inline u64 lockstat_clock(void)
155{ 154{
diff --git a/kernel/memremap.c b/kernel/memremap.c
index fb9b88787ebc..584febd13e2e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -391,7 +391,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
391 /* 391 /*
392 * 'memmap_start' is the virtual address for the first "struct 392 * 'memmap_start' is the virtual address for the first "struct
393 * page" in this range of the vmemmap array. In the case of 393 * page" in this range of the vmemmap array. In the case of
394 * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple 394 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
395 * pointer arithmetic, so we can perform this to_vmem_altmap() 395 * pointer arithmetic, so we can perform this to_vmem_altmap()
396 * conversion without concern for the initialization state of 396 * conversion without concern for the initialization state of
397 * the struct page fields. 397 * the struct page fields.
@@ -400,7 +400,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
400 struct dev_pagemap *pgmap; 400 struct dev_pagemap *pgmap;
401 401
402 /* 402 /*
403 * Uncoditionally retrieve a dev_pagemap associated with the 403 * Unconditionally retrieve a dev_pagemap associated with the
404 * given physical address, this is only for use in the 404 * given physical address, this is only for use in the
405 * arch_{add|remove}_memory() for setting up and tearing down 405 * arch_{add|remove}_memory() for setting up and tearing down
406 * the memmap. 406 * the memmap.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7342a24f559..aa0f26b58426 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1158,6 +1158,22 @@ static int __init kaslr_nohibernate_setup(char *str)
1158 return nohibernate_setup(str); 1158 return nohibernate_setup(str);
1159} 1159}
1160 1160
1161static int __init page_poison_nohibernate_setup(char *str)
1162{
1163#ifdef CONFIG_PAGE_POISONING_ZERO
1164 /*
1165 * The zeroing option for page poison skips the checks on alloc.
1166	 * Since hibernation doesn't save free pages, there's no way to
1167 * guarantee the pages will still be zeroed.
1168 */
1169 if (!strcmp(str, "on")) {
1170 pr_info("Disabling hibernation due to page poisoning\n");
1171 return nohibernate_setup(str);
1172 }
1173#endif
1174 return 1;
1175}
1176
1161__setup("noresume", noresume_setup); 1177__setup("noresume", noresume_setup);
1162__setup("resume_offset=", resume_offset_setup); 1178__setup("resume_offset=", resume_offset_setup);
1163__setup("resume=", resume_setup); 1179__setup("resume=", resume_setup);
@@ -1166,3 +1182,4 @@ __setup("resumewait", resumewait_setup);
1166__setup("resumedelay=", resumedelay_setup); 1182__setup("resumedelay=", resumedelay_setup);
1167__setup("nohibernate", nohibernate_setup); 1183__setup("nohibernate", nohibernate_setup);
1168__setup("kaslr", kaslr_nohibernate_setup); 1184__setup("kaslr", kaslr_nohibernate_setup);
1185__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 65ae0e5c35da..250ea67c1615 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
130static unsigned long rcu_torture_current_version; 130static unsigned long rcu_torture_current_version;
131static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 131static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
132static DEFINE_SPINLOCK(rcu_torture_lock); 132static DEFINE_SPINLOCK(rcu_torture_lock);
133static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], 133static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
134 rcu_torture_count) = { 0 }; 134static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
135static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
136 rcu_torture_batch) = { 0 };
137static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 135static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
138static atomic_t n_rcu_torture_alloc; 136static atomic_t n_rcu_torture_alloc;
139static atomic_t n_rcu_torture_alloc_fail; 137static atomic_t n_rcu_torture_alloc_fail;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7ff5dc7d2ac5..16e13d8628a3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false;
320module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); 320module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
321 321
322/* the per-cpu worker pools */ 322/* the per-cpu worker pools */
323static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], 323static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
324 cpu_worker_pools);
325 324
326static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ 325static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
327 326
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 4f6ae60433bc..563f10e6876a 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -17,6 +17,9 @@
17#include <linux/socket.h> 17#include <linux/socket.h>
18#include <linux/in.h> 18#include <linux/in.h>
19 19
20#include <linux/gfp.h>
21#include <linux/mm.h>
22
20#define BUF_SIZE 256 23#define BUF_SIZE 256
21#define PAD_SIZE 16 24#define PAD_SIZE 16
22#define FILL_CHAR '$' 25#define FILL_CHAR '$'
@@ -411,6 +414,55 @@ netdev_features(void)
411} 414}
412 415
413static void __init 416static void __init
417flags(void)
418{
419 unsigned long flags;
420 gfp_t gfp;
421 char *cmp_buffer;
422
423 flags = 0;
424 test("", "%pGp", &flags);
425
426 /* Page flags should filter the zone id */
427 flags = 1UL << NR_PAGEFLAGS;
428 test("", "%pGp", &flags);
429
430 flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru
431 | 1UL << PG_active | 1UL << PG_swapbacked;
432 test("uptodate|dirty|lru|active|swapbacked", "%pGp", &flags);
433
434
435 flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC
436 | VM_DENYWRITE;
437 test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags);
438
439 gfp = GFP_TRANSHUGE;
440 test("GFP_TRANSHUGE", "%pGg", &gfp);
441
442 gfp = GFP_ATOMIC|__GFP_DMA;
443 test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
444
445 gfp = __GFP_ATOMIC;
446 test("__GFP_ATOMIC", "%pGg", &gfp);
447
448 cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
449 if (!cmp_buffer)
450 return;
451
452 /* Any flags not translated by the table should remain numeric */
453 gfp = ~__GFP_BITS_MASK;
454 snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
455 test(cmp_buffer, "%pGg", &gfp);
456
457 snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
458 (unsigned long) gfp);
459 gfp |= __GFP_ATOMIC;
460 test(cmp_buffer, "%pGg", &gfp);
461
462 kfree(cmp_buffer);
463}
464
465static void __init
414test_pointer(void) 466test_pointer(void)
415{ 467{
416 plain(); 468 plain();
@@ -428,6 +480,7 @@ test_pointer(void)
428 struct_clk(); 480 struct_clk();
429 bitmap(); 481 bitmap();
430 netdev_features(); 482 netdev_features();
483 flags();
431} 484}
432 485
433static int __init 486static int __init
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index f44e178e6ede..525c8e19bda2 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -35,6 +35,8 @@
35#include <linux/blkdev.h> 35#include <linux/blkdev.h>
36#endif 36#endif
37 37
38#include "../mm/internal.h" /* For the trace_print_flags arrays */
39
38#include <asm/page.h> /* for PAGE_SIZE */ 40#include <asm/page.h> /* for PAGE_SIZE */
39#include <asm/sections.h> /* for dereference_function_descriptor() */ 41#include <asm/sections.h> /* for dereference_function_descriptor() */
40#include <asm/byteorder.h> /* cpu_to_le16 */ 42#include <asm/byteorder.h> /* cpu_to_le16 */
@@ -1407,6 +1409,72 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
1407 } 1409 }
1408} 1410}
1409 1411
1412static
1413char *format_flags(char *buf, char *end, unsigned long flags,
1414 const struct trace_print_flags *names)
1415{
1416 unsigned long mask;
1417 const struct printf_spec strspec = {
1418 .field_width = -1,
1419 .precision = -1,
1420 };
1421 const struct printf_spec numspec = {
1422 .flags = SPECIAL|SMALL,
1423 .field_width = -1,
1424 .precision = -1,
1425 .base = 16,
1426 };
1427
1428 for ( ; flags && names->name; names++) {
1429 mask = names->mask;
1430 if ((flags & mask) != mask)
1431 continue;
1432
1433 buf = string(buf, end, names->name, strspec);
1434
1435 flags &= ~mask;
1436 if (flags) {
1437 if (buf < end)
1438 *buf = '|';
1439 buf++;
1440 }
1441 }
1442
1443 if (flags)
1444 buf = number(buf, end, flags, numspec);
1445
1446 return buf;
1447}
1448
1449static noinline_for_stack
1450char *flags_string(char *buf, char *end, void *flags_ptr, const char *fmt)
1451{
1452 unsigned long flags;
1453 const struct trace_print_flags *names;
1454
1455 switch (fmt[1]) {
1456 case 'p':
1457 flags = *(unsigned long *)flags_ptr;
1458 /* Remove zone id */
1459 flags &= (1UL << NR_PAGEFLAGS) - 1;
1460 names = pageflag_names;
1461 break;
1462 case 'v':
1463 flags = *(unsigned long *)flags_ptr;
1464 names = vmaflag_names;
1465 break;
1466 case 'g':
1467 flags = *(gfp_t *)flags_ptr;
1468 names = gfpflag_names;
1469 break;
1470 default:
1471 WARN_ONCE(1, "Unsupported flags modifier: %c\n", fmt[1]);
1472 return buf;
1473 }
1474
1475 return format_flags(buf, end, flags, names);
1476}
1477
1410int kptr_restrict __read_mostly; 1478int kptr_restrict __read_mostly;
1411 1479
1412/* 1480/*
@@ -1495,6 +1563,11 @@ int kptr_restrict __read_mostly;
1495 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address 1563 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
1496 * (legacy clock framework) of the clock 1564 * (legacy clock framework) of the clock
1497 * - 'Cr' For a clock, it prints the current rate of the clock 1565 * - 'Cr' For a clock, it prints the current rate of the clock
1566 * - 'G' For flags to be printed as a collection of symbolic strings that would
1567 * construct the specific value. Supported flags given by option:
1568 * p page flags (see struct page) given as pointer to unsigned long
1569 * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
1570 * v vma flags (VM_*) given as pointer to unsigned long
1498 * 1571 *
1499 * ** Please update also Documentation/printk-formats.txt when making changes ** 1572 * ** Please update also Documentation/printk-formats.txt when making changes **
1500 * 1573 *
@@ -1648,6 +1721,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
1648 return bdev_name(buf, end, ptr, spec, fmt); 1721 return bdev_name(buf, end, ptr, spec, fmt);
1649#endif 1722#endif
1650 1723
1724 case 'G':
1725 return flags_string(buf, end, ptr, fmt);
1651 } 1726 }
1652 spec.flags |= SMALL; 1727 spec.flags |= SMALL;
1653 if (spec.field_width == -1) { 1728 if (spec.field_width == -1) {
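
The documentation hunk above notes that the new 'G' conversions take a pointer to the flags word rather than the value itself. A hedged sketch of typical call sites follows; the surrounding function and message text are illustrative, not taken from the patch.

/* Illustrative printk users of %pGp / %pGg / %pGv; kernel context assumed. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void demo_flag_formats(struct page *page, struct vm_area_struct *vma)
{
	gfp_t gfp = GFP_KERNEL | __GFP_ZERO;

	/* note the & in every case: the specifier dereferences the pointer */
	pr_info("gfp:  %pGg\n", &gfp);		/* e.g. "GFP_KERNEL|__GFP_ZERO" */
	pr_info("page: %pGp\n", &page->flags);	/* page flags, zone id filtered out */
	pr_info("vma:  %pGv\n", &vma->vm_flags);	/* e.g. "read|write|mayread|..." */
}
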
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..5c50b238b770 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 ---help--- 17 ---help---
18 Unmap pages from the kernel linear mapping after free_pages(). 18 Unmap pages from the kernel linear mapping after free_pages().
19 This results in a large slowdown, but helps to find certain types 19 Depending on runtime enablement, this results in a small or large
20 of memory corruption. 20 slowdown, but helps to find certain types of memory corruption.
21 21
22 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, 22 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
23 fill the pages with poison patterns after free_pages() and verify 23 fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC
26 that would result in incorrect warnings of memory corruption after 26 that would result in incorrect warnings of memory corruption after
27 a resume because free pages are not saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
28 28
29 By default this option will have a small overhead, e.g. by not
30 allowing the kernel mapping to be backed by large pages on some
31 architectures. Even bigger overhead comes when the debugging is
32 enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
33 command line parameter.
34
35config DEBUG_PAGEALLOC_ENABLE_DEFAULT
36 bool "Enable debug page memory allocations by default?"
37 default n
38 depends on DEBUG_PAGEALLOC
39 ---help---
40 Enable debug page memory allocations by default? This value
41 can be overridden by debug_pagealloc=off|on.
42
29config PAGE_POISONING 43config PAGE_POISONING
30 bool 44 bool "Poison pages after freeing"
45 select PAGE_EXTENSION
46 select PAGE_POISONING_NO_SANITY if HIBERNATION
47 ---help---
48 Fill the pages with poison patterns after free_pages() and verify
49 the patterns before alloc_pages. The filling of the memory helps
50 reduce the risk of information leaks from freed data. This does
51 have a potential performance impact.
52
53 Note that "poison" here is not the same thing as the "HWPoison"
54 for CONFIG_MEMORY_FAILURE. This is software poisoning only.
55
56 If unsure, say N
57
58config PAGE_POISONING_NO_SANITY
59 depends on PAGE_POISONING
60 bool "Only poison, don't sanity check"
61 ---help---
62 Skip the sanity checking on alloc, only fill the pages with
63 poison on free. This reduces some of the overhead of the
64 poisoning feature.
65
66 If you are only interested in sanitization, say Y. Otherwise
67 say N.
68
69config PAGE_POISONING_ZERO
70 bool "Use zero for poisoning instead of random data"
71 depends on PAGE_POISONING
72 ---help---
73 Instead of using the existing poison value, fill the pages with
74 zeros. This makes it harder to detect when errors are occurring
75 due to sanitization but the zeroing at free means that it is
76 no longer necessary to write zeros when GFP_ZERO is used on
77 allocation.
78
79 Enabling page poisoning with this option will disable hibernation
80
81 If unsure, say N
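
The three options above boil down to: fill freed pages with a pattern (or with zeros), and optionally verify that pattern again before handing a page back out. A toy user-space sketch of that check follows, with an arbitrary poison byte standing in for the kernel's value; it models the idea only, not the kernel implementation.

/* Toy model of poison-on-free / verify-on-alloc; not the kernel code. */
#include <stdio.h>
#include <string.h>

#define POISON_BYTE 0xaa	/* illustrative; PAGE_POISONING_ZERO would use 0x00 */
#define PAGE_BYTES 4096

static unsigned char page[PAGE_BYTES];

static void poison_on_free(void)
{
	memset(page, POISON_BYTE, PAGE_BYTES);	/* hides stale data from leaks */
}

static int verify_on_alloc(void)
{
	size_t i;

	for (i = 0; i < PAGE_BYTES; i++)	/* skipped with PAGE_POISONING_NO_SANITY */
		if (page[i] != POISON_BYTE)
			return -1;		/* someone wrote to a free page */
	return 0;
}

int main(void)
{
	poison_on_free();
	page[100] = 0x42;			/* simulate a use-after-free store */
	printf("alloc check: %s\n",
	       verify_on_alloc() ? "corruption detected" : "clean");
	return 0;
}
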
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..cfdd481d27a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
48obj-$(CONFIG_SLOB) += slob.o 48obj-$(CONFIG_SLOB) += slob.o
49obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 49obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
50obj-$(CONFIG_KSM) += ksm.o 50obj-$(CONFIG_KSM) += ksm.o
51obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 51obj-$(CONFIG_PAGE_POISONING) += page_poison.o
52obj-$(CONFIG_SLAB) += slab.o 52obj-$(CONFIG_SLAB) += slab.o
53obj-$(CONFIG_SLUB) += slub.o 53obj-$(CONFIG_SLUB) += slub.o
54obj-$(CONFIG_KMEMCHECK) += kmemcheck.o 54obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..93f71d968098 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype)
71 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 71 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
72} 72}
73 73
74/*
75 * Check that the whole (or subset of) a pageblock given by the interval of
76 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
77 * with the migration of free compaction scanner. The scanners then need to
78 * use only pfn_valid_within() check for arches that allow holes within
79 * pageblocks.
80 *
81 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
82 *
83 * It's possible on some configurations to have a setup like node0 node1 node0
84 * i.e. it's possible that all pages within a zones range of pages do not
85 * belong to a single zone. We assume that a border between node0 and node1
86 * can occur within a single pageblock, but not a node0 node1 node0
87 * interleaving within a single pageblock. It is therefore sufficient to check
88 * the first and last page of a pageblock and avoid checking each individual
89 * page in a pageblock.
90 */
91static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
92 unsigned long end_pfn, struct zone *zone)
93{
94 struct page *start_page;
95 struct page *end_page;
96
97 /* end_pfn is one past the range we are checking */
98 end_pfn--;
99
100 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
101 return NULL;
102
103 start_page = pfn_to_page(start_pfn);
104
105 if (page_zone(start_page) != zone)
106 return NULL;
107
108 end_page = pfn_to_page(end_pfn);
109
110 /* This gives a shorter code than deriving page_zone(end_page) */
111 if (page_zone_id(start_page) != page_zone_id(end_page))
112 return NULL;
113
114 return start_page;
115}
116
117#ifdef CONFIG_COMPACTION 74#ifdef CONFIG_COMPACTION
118 75
119/* Do not skip compaction more than 64 times */ 76/* Do not skip compaction more than 64 times */
@@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone)
200{ 157{
201 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; 158 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
202 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; 159 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
203 zone->compact_cached_free_pfn = zone_end_pfn(zone); 160 zone->compact_cached_free_pfn =
161 round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
204} 162}
205 163
206/* 164/*
@@ -554,13 +512,17 @@ unsigned long
554isolate_freepages_range(struct compact_control *cc, 512isolate_freepages_range(struct compact_control *cc,
555 unsigned long start_pfn, unsigned long end_pfn) 513 unsigned long start_pfn, unsigned long end_pfn)
556{ 514{
557 unsigned long isolated, pfn, block_end_pfn; 515 unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
558 LIST_HEAD(freelist); 516 LIST_HEAD(freelist);
559 517
560 pfn = start_pfn; 518 pfn = start_pfn;
519 block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
520 if (block_start_pfn < cc->zone->zone_start_pfn)
521 block_start_pfn = cc->zone->zone_start_pfn;
561 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 522 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
562 523
563 for (; pfn < end_pfn; pfn += isolated, 524 for (; pfn < end_pfn; pfn += isolated,
525 block_start_pfn = block_end_pfn,
564 block_end_pfn += pageblock_nr_pages) { 526 block_end_pfn += pageblock_nr_pages) {
565 /* Protect pfn from changing by isolate_freepages_block */ 527 /* Protect pfn from changing by isolate_freepages_block */
566 unsigned long isolate_start_pfn = pfn; 528 unsigned long isolate_start_pfn = pfn;
@@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc,
573 * scanning range to right one. 535 * scanning range to right one.
574 */ 536 */
575 if (pfn >= block_end_pfn) { 537 if (pfn >= block_end_pfn) {
538 block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
576 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 539 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
577 block_end_pfn = min(block_end_pfn, end_pfn); 540 block_end_pfn = min(block_end_pfn, end_pfn);
578 } 541 }
579 542
580 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) 543 if (!pageblock_pfn_to_page(block_start_pfn,
544 block_end_pfn, cc->zone))
581 break; 545 break;
582 546
583 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 547 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +827,23 @@ unsigned long
863isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, 827isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
864 unsigned long end_pfn) 828 unsigned long end_pfn)
865{ 829{
866 unsigned long pfn, block_end_pfn; 830 unsigned long pfn, block_start_pfn, block_end_pfn;
867 831
868 /* Scan block by block. First and last block may be incomplete */ 832 /* Scan block by block. First and last block may be incomplete */
869 pfn = start_pfn; 833 pfn = start_pfn;
834 block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
835 if (block_start_pfn < cc->zone->zone_start_pfn)
836 block_start_pfn = cc->zone->zone_start_pfn;
870 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 837 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
871 838
872 for (; pfn < end_pfn; pfn = block_end_pfn, 839 for (; pfn < end_pfn; pfn = block_end_pfn,
840 block_start_pfn = block_end_pfn,
873 block_end_pfn += pageblock_nr_pages) { 841 block_end_pfn += pageblock_nr_pages) {
874 842
875 block_end_pfn = min(block_end_pfn, end_pfn); 843 block_end_pfn = min(block_end_pfn, end_pfn);
876 844
877 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) 845 if (!pageblock_pfn_to_page(block_start_pfn,
846 block_end_pfn, cc->zone))
878 continue; 847 continue;
879 848
880 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, 849 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
1103static isolate_migrate_t isolate_migratepages(struct zone *zone, 1072static isolate_migrate_t isolate_migratepages(struct zone *zone,
1104 struct compact_control *cc) 1073 struct compact_control *cc)
1105{ 1074{
1106 unsigned long low_pfn, end_pfn; 1075 unsigned long block_start_pfn;
1076 unsigned long block_end_pfn;
1077 unsigned long low_pfn;
1107 unsigned long isolate_start_pfn; 1078 unsigned long isolate_start_pfn;
1108 struct page *page; 1079 struct page *page;
1109 const isolate_mode_t isolate_mode = 1080 const isolate_mode_t isolate_mode =
@@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1115 * initialized by compact_zone() 1086 * initialized by compact_zone()
1116 */ 1087 */
1117 low_pfn = cc->migrate_pfn; 1088 low_pfn = cc->migrate_pfn;
1089 block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
1090 if (block_start_pfn < zone->zone_start_pfn)
1091 block_start_pfn = zone->zone_start_pfn;
1118 1092
1119 /* Only scan within a pageblock boundary */ 1093 /* Only scan within a pageblock boundary */
1120 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); 1094 block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
1121 1095
1122 /* 1096 /*
1123 * Iterate over whole pageblocks until we find the first suitable. 1097 * Iterate over whole pageblocks until we find the first suitable.
1124 * Do not cross the free scanner. 1098 * Do not cross the free scanner.
1125 */ 1099 */
1126 for (; end_pfn <= cc->free_pfn; 1100 for (; block_end_pfn <= cc->free_pfn;
1127 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { 1101 low_pfn = block_end_pfn,
1102 block_start_pfn = block_end_pfn,
1103 block_end_pfn += pageblock_nr_pages) {
1128 1104
1129 /* 1105 /*
1130 * This can potentially iterate a massively long zone with 1106 * This can potentially iterate a massively long zone with
@@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1135 && compact_should_abort(cc)) 1111 && compact_should_abort(cc))
1136 break; 1112 break;
1137 1113
1138 page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); 1114 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1115 zone);
1139 if (!page) 1116 if (!page)
1140 continue; 1117 continue;
1141 1118
@@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1154 1131
1155 /* Perform the isolation */ 1132 /* Perform the isolation */
1156 isolate_start_pfn = low_pfn; 1133 isolate_start_pfn = low_pfn;
1157 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1134 low_pfn = isolate_migratepages_block(cc, low_pfn,
1158 isolate_mode); 1135 block_end_pfn, isolate_mode);
1159 1136
1160 if (!low_pfn || cc->contended) { 1137 if (!low_pfn || cc->contended) {
1161 acct_isolated(zone, cc); 1138 acct_isolated(zone, cc);
@@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1371 */ 1348 */
1372 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; 1349 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1373 cc->free_pfn = zone->compact_cached_free_pfn; 1350 cc->free_pfn = zone->compact_cached_free_pfn;
1374 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { 1351 if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1375 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); 1352 cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
1376 zone->compact_cached_free_pfn = cc->free_pfn; 1353 zone->compact_cached_free_pfn = cc->free_pfn;
1377 } 1354 }
1378 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { 1355 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1379 cc->migrate_pfn = start_pfn; 1356 cc->migrate_pfn = start_pfn;
1380 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 1357 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1381 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1358 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
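
Every caller touched above now derives the pageblock containing a pfn with the same two expressions, so pageblock_pfn_to_page() is always handed block-aligned bounds even when start_pfn itself is mid-block. Here is a small worked example of that arithmetic; pageblock_nr_pages is assumed to be 512 (the usual 2MB-on-4KB value), and the pfn is arbitrary.

/* Worked example of the pageblock-bound arithmetic used by the callers above. */
#include <assert.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL		/* assumed: order-9 blocks */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long pfn = 0x12345;		/* arbitrary pfn inside some block */
	unsigned long block_start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long block_end_pfn = ALIGN_UP(pfn + 1, PAGEBLOCK_NR_PAGES);

	/* the pfn lands inside exactly one full pageblock */
	assert(block_start_pfn <= pfn && pfn < block_end_pfn);
	assert(block_end_pfn - block_start_pfn == PAGEBLOCK_NR_PAGES);

	printf("pfn %#lx -> pageblock [%#lx, %#lx)\n",
	       pfn, block_start_pfn, block_end_pfn);
	return 0;
}
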
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,75 +9,38 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/trace_events.h> 10#include <linux/trace_events.h>
11#include <linux/memcontrol.h> 11#include <linux/memcontrol.h>
12 12#include <trace/events/mmflags.h>
13static const struct trace_print_flags pageflag_names[] = { 13#include <linux/migrate.h>
14 {1UL << PG_locked, "locked" }, 14#include <linux/page_owner.h>
15 {1UL << PG_error, "error" }, 15
16 {1UL << PG_referenced, "referenced" }, 16#include "internal.h"
17 {1UL << PG_uptodate, "uptodate" }, 17
18 {1UL << PG_dirty, "dirty" }, 18char *migrate_reason_names[MR_TYPES] = {
19 {1UL << PG_lru, "lru" }, 19 "compaction",
20 {1UL << PG_active, "active" }, 20 "memory_failure",
21 {1UL << PG_slab, "slab" }, 21 "memory_hotplug",
22 {1UL << PG_owner_priv_1, "owner_priv_1" }, 22 "syscall_or_cpuset",
23 {1UL << PG_arch_1, "arch_1" }, 23 "mempolicy_mbind",
24 {1UL << PG_reserved, "reserved" }, 24 "numa_misplaced",
25 {1UL << PG_private, "private" }, 25 "cma",
26 {1UL << PG_private_2, "private_2" },
27 {1UL << PG_writeback, "writeback" },
28 {1UL << PG_head, "head" },
29 {1UL << PG_swapcache, "swapcache" },
30 {1UL << PG_mappedtodisk, "mappedtodisk" },
31 {1UL << PG_reclaim, "reclaim" },
32 {1UL << PG_swapbacked, "swapbacked" },
33 {1UL << PG_unevictable, "unevictable" },
34#ifdef CONFIG_MMU
35 {1UL << PG_mlocked, "mlocked" },
36#endif
37#ifdef CONFIG_ARCH_USES_PG_UNCACHED
38 {1UL << PG_uncached, "uncached" },
39#endif
40#ifdef CONFIG_MEMORY_FAILURE
41 {1UL << PG_hwpoison, "hwpoison" },
42#endif
43#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
44 {1UL << PG_young, "young" },
45 {1UL << PG_idle, "idle" },
46#endif
47}; 26};
48 27
49static void dump_flags(unsigned long flags, 28const struct trace_print_flags pageflag_names[] = {
50 const struct trace_print_flags *names, int count) 29 __def_pageflag_names,
51{ 30 {0, NULL}
52 const char *delim = ""; 31};
53 unsigned long mask;
54 int i;
55
56 pr_emerg("flags: %#lx(", flags);
57
58 /* remove zone id */
59 flags &= (1UL << NR_PAGEFLAGS) - 1;
60
61 for (i = 0; i < count && flags; i++) {
62
63 mask = names[i].mask;
64 if ((flags & mask) != mask)
65 continue;
66
67 flags &= ~mask;
68 pr_cont("%s%s", delim, names[i].name);
69 delim = "|";
70 }
71 32
72 /* check for left over flags */ 33const struct trace_print_flags gfpflag_names[] = {
73 if (flags) 34 __def_gfpflag_names,
74 pr_cont("%s%#lx", delim, flags); 35 {0, NULL}
36};
75 37
76 pr_cont(")\n"); 38const struct trace_print_flags vmaflag_names[] = {
77} 39 __def_vmaflag_names,
40 {0, NULL}
41};
78 42
79void dump_page_badflags(struct page *page, const char *reason, 43void __dump_page(struct page *page, const char *reason)
80 unsigned long badflags)
81{ 44{
82 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", 45 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
83 page, atomic_read(&page->_count), page_mapcount(page), 46 page, atomic_read(&page->_count), page_mapcount(page),
@@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason,
85 if (PageCompound(page)) 48 if (PageCompound(page))
86 pr_cont(" compound_mapcount: %d", compound_mapcount(page)); 49 pr_cont(" compound_mapcount: %d", compound_mapcount(page));
87 pr_cont("\n"); 50 pr_cont("\n");
88 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 51 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
89 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); 52
53 pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
54
90 if (reason) 55 if (reason)
91 pr_alert("page dumped because: %s\n", reason); 56 pr_alert("page dumped because: %s\n", reason);
92 if (page->flags & badflags) { 57
93 pr_alert("bad because of flags:\n");
94 dump_flags(page->flags & badflags,
95 pageflag_names, ARRAY_SIZE(pageflag_names));
96 }
97#ifdef CONFIG_MEMCG 58#ifdef CONFIG_MEMCG
98 if (page->mem_cgroup) 59 if (page->mem_cgroup)
99 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); 60 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
102 63
103void dump_page(struct page *page, const char *reason) 64void dump_page(struct page *page, const char *reason)
104{ 65{
105 dump_page_badflags(page, reason, 0); 66 __dump_page(page, reason);
67 dump_page_owner(page);
106} 68}
107EXPORT_SYMBOL(dump_page); 69EXPORT_SYMBOL(dump_page);
108 70
109#ifdef CONFIG_DEBUG_VM 71#ifdef CONFIG_DEBUG_VM
110 72
111static const struct trace_print_flags vmaflags_names[] = {
112 {VM_READ, "read" },
113 {VM_WRITE, "write" },
114 {VM_EXEC, "exec" },
115 {VM_SHARED, "shared" },
116 {VM_MAYREAD, "mayread" },
117 {VM_MAYWRITE, "maywrite" },
118 {VM_MAYEXEC, "mayexec" },
119 {VM_MAYSHARE, "mayshare" },
120 {VM_GROWSDOWN, "growsdown" },
121 {VM_PFNMAP, "pfnmap" },
122 {VM_DENYWRITE, "denywrite" },
123 {VM_LOCKONFAULT, "lockonfault" },
124 {VM_LOCKED, "locked" },
125 {VM_IO, "io" },
126 {VM_SEQ_READ, "seqread" },
127 {VM_RAND_READ, "randread" },
128 {VM_DONTCOPY, "dontcopy" },
129 {VM_DONTEXPAND, "dontexpand" },
130 {VM_ACCOUNT, "account" },
131 {VM_NORESERVE, "noreserve" },
132 {VM_HUGETLB, "hugetlb" },
133#if defined(CONFIG_X86)
134 {VM_PAT, "pat" },
135#elif defined(CONFIG_PPC)
136 {VM_SAO, "sao" },
137#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
138 {VM_GROWSUP, "growsup" },
139#elif !defined(CONFIG_MMU)
140 {VM_MAPPED_COPY, "mappedcopy" },
141#else
142 {VM_ARCH_1, "arch_1" },
143#endif
144 {VM_DONTDUMP, "dontdump" },
145#ifdef CONFIG_MEM_SOFT_DIRTY
146 {VM_SOFTDIRTY, "softdirty" },
147#endif
148 {VM_MIXEDMAP, "mixedmap" },
149 {VM_HUGEPAGE, "hugepage" },
150 {VM_NOHUGEPAGE, "nohugepage" },
151 {VM_MERGEABLE, "mergeable" },
152};
153
154void dump_vma(const struct vm_area_struct *vma) 73void dump_vma(const struct vm_area_struct *vma)
155{ 74{
156 pr_emerg("vma %p start %p end %p\n" 75 pr_emerg("vma %p start %p end %p\n"
157 "next %p prev %p mm %p\n" 76 "next %p prev %p mm %p\n"
158 "prot %lx anon_vma %p vm_ops %p\n" 77 "prot %lx anon_vma %p vm_ops %p\n"
159 "pgoff %lx file %p private_data %p\n", 78 "pgoff %lx file %p private_data %p\n"
79 "flags: %#lx(%pGv)\n",
160 vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, 80 vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
161 vma->vm_prev, vma->vm_mm, 81 vma->vm_prev, vma->vm_mm,
162 (unsigned long)pgprot_val(vma->vm_page_prot), 82 (unsigned long)pgprot_val(vma->vm_page_prot),
163 vma->anon_vma, vma->vm_ops, vma->vm_pgoff, 83 vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
164 vma->vm_file, vma->vm_private_data); 84 vma->vm_file, vma->vm_private_data,
165 dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); 85 vma->vm_flags, &vma->vm_flags);
166} 86}
167EXPORT_SYMBOL(dump_vma); 87EXPORT_SYMBOL(dump_vma);
168 88
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
196#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) 116#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
197 "tlb_flush_pending %d\n" 117 "tlb_flush_pending %d\n"
198#endif 118#endif
199 "%s", /* This is here to hold the comma */ 119 "def_flags: %#lx(%pGv)\n",
200 120
201 mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, 121 mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
202#ifdef CONFIG_MMU 122#ifdef CONFIG_MMU
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
230#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) 150#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
231 mm->tlb_flush_pending, 151 mm->tlb_flush_pending,
232#endif 152#endif
233 "" /* This is here to not have a comma! */ 153 mm->def_flags, &mm->def_flags
234 ); 154 );
235
236 dump_flags(mm->def_flags, vmaflags_names,
237 ARRAY_SIZE(vmaflags_names));
238} 155}
239 156
240#endif /* CONFIG_DEBUG_VM */ 157#endif /* CONFIG_DEBUG_VM */
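
The conversions above replace the open-coded dump_flags()/vmaflags_names tables with the new %pGp/%pGv/%pGg printk specifiers. A minimal kernel-style sketch of the calling convention they rely on (illustrative only, not part of the patch): the raw value goes through %#lx/%#x while the symbolic decoder takes a pointer to the flags word.

#include <linux/mm.h>
#include <linux/printk.h>

/* Illustrative helper: print a flags word both raw and decoded. */
static void example_dump_flags(struct page *page,
			       struct vm_area_struct *vma, gfp_t gfp)
{
	pr_alert("page flags: %#lx(%pGp)\n", page->flags, &page->flags);
	pr_alert("vma flags:  %#lx(%pGv)\n", vma->vm_flags, &vma->vm_flags);
	pr_alert("gfp flags:  %#x(%pGg)\n", gfp, &gfp);
}
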
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/mm.h>
4#include "slab.h"
3 5
4static struct { 6static struct {
5 struct fault_attr attr; 7 struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
11 .cache_filter = false, 13 .cache_filter = false,
12}; 14};
13 15
14bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) 16bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
15{ 17{
18 /* No fault-injection for bootstrap cache */
19 if (unlikely(s == kmem_cache))
20 return false;
21
16 if (gfpflags & __GFP_NOFAIL) 22 if (gfpflags & __GFP_NOFAIL)
17 return false; 23 return false;
18 24
19 if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) 25 if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
20 return false; 26 return false;
21 27
22 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) 28 if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
23 return false; 29 return false;
24 30
25 return should_fail(&failslab.attr, size); 31 return should_fail(&failslab.attr, s->object_size);
26} 32}
27 33
28static int __init setup_failslab(char *str) 34static int __init setup_failslab(char *str)
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d83de7..61b441b191ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) 104 * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
105 * bdi.wb->list_lock (zap_pte_range->set_page_dirty) 105 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
106 * ->inode->i_lock (zap_pte_range->set_page_dirty) 106 * ->inode->i_lock (zap_pte_range->set_page_dirty)
107 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 107 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
176/* 176/*
177 * Delete a page from the page cache and free it. Caller has to make 177 * Delete a page from the page cache and free it. Caller has to make
178 * sure the page is locked and that nobody else uses it - or that usage 178 * sure the page is locked and that nobody else uses it - or that usage
179 * is safe. The caller must hold the mapping's tree_lock and 179 * is safe. The caller must hold the mapping's tree_lock.
180 * mem_cgroup_begin_page_stat().
181 */ 180 */
182void __delete_from_page_cache(struct page *page, void *shadow, 181void __delete_from_page_cache(struct page *page, void *shadow)
183 struct mem_cgroup *memcg)
184{ 182{
185 struct address_space *mapping = page->mapping; 183 struct address_space *mapping = page->mapping;
186 184
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
239 * anyway will be cleared before returning page into buddy allocator. 237 * anyway will be cleared before returning page into buddy allocator.
240 */ 238 */
241 if (WARN_ON_ONCE(PageDirty(page))) 239 if (WARN_ON_ONCE(PageDirty(page)))
242 account_page_cleaned(page, mapping, memcg, 240 account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
243 inode_to_wb(mapping->host));
244} 241}
245 242
246/** 243/**
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
254void delete_from_page_cache(struct page *page) 251void delete_from_page_cache(struct page *page)
255{ 252{
256 struct address_space *mapping = page->mapping; 253 struct address_space *mapping = page->mapping;
257 struct mem_cgroup *memcg;
258 unsigned long flags; 254 unsigned long flags;
259 255
260 void (*freepage)(struct page *); 256 void (*freepage)(struct page *);
@@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page)
263 259
264 freepage = mapping->a_ops->freepage; 260 freepage = mapping->a_ops->freepage;
265 261
266 memcg = mem_cgroup_begin_page_stat(page);
267 spin_lock_irqsave(&mapping->tree_lock, flags); 262 spin_lock_irqsave(&mapping->tree_lock, flags);
268 __delete_from_page_cache(page, NULL, memcg); 263 __delete_from_page_cache(page, NULL);
269 spin_unlock_irqrestore(&mapping->tree_lock, flags); 264 spin_unlock_irqrestore(&mapping->tree_lock, flags);
270 mem_cgroup_end_page_stat(memcg);
271 265
272 if (freepage) 266 if (freepage)
273 freepage(page); 267 freepage(page);
@@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
551 if (!error) { 545 if (!error) {
552 struct address_space *mapping = old->mapping; 546 struct address_space *mapping = old->mapping;
553 void (*freepage)(struct page *); 547 void (*freepage)(struct page *);
554 struct mem_cgroup *memcg;
555 unsigned long flags; 548 unsigned long flags;
556 549
557 pgoff_t offset = old->index; 550 pgoff_t offset = old->index;
@@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
561 new->mapping = mapping; 554 new->mapping = mapping;
562 new->index = offset; 555 new->index = offset;
563 556
564 memcg = mem_cgroup_begin_page_stat(old);
565 spin_lock_irqsave(&mapping->tree_lock, flags); 557 spin_lock_irqsave(&mapping->tree_lock, flags);
566 __delete_from_page_cache(old, NULL, memcg); 558 __delete_from_page_cache(old, NULL);
567 error = radix_tree_insert(&mapping->page_tree, offset, new); 559 error = radix_tree_insert(&mapping->page_tree, offset, new);
568 BUG_ON(error); 560 BUG_ON(error);
569 mapping->nrpages++; 561 mapping->nrpages++;
@@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
576 if (PageSwapBacked(new)) 568 if (PageSwapBacked(new))
577 __inc_zone_page_state(new, NR_SHMEM); 569 __inc_zone_page_state(new, NR_SHMEM);
578 spin_unlock_irqrestore(&mapping->tree_lock, flags); 570 spin_unlock_irqrestore(&mapping->tree_lock, flags);
579 mem_cgroup_end_page_stat(memcg); 571 mem_cgroup_migrate(old, new);
580 mem_cgroup_replace_page(old, new);
581 radix_tree_preload_end(); 572 radix_tree_preload_end();
582 if (freepage) 573 if (freepage)
583 freepage(old); 574 freepage(old);
@@ -1668,6 +1659,15 @@ find_page:
1668 index, last_index - index); 1659 index, last_index - index);
1669 } 1660 }
1670 if (!PageUptodate(page)) { 1661 if (!PageUptodate(page)) {
1662 /*
1663 * See comment in do_read_cache_page on why
1664 * wait_on_page_locked is used to avoid unnecessarily
1665 * serialisations and why it's safe.
1666 */
1667 wait_on_page_locked_killable(page);
1668 if (PageUptodate(page))
1669 goto page_ok;
1670
1671 if (inode->i_blkbits == PAGE_CACHE_SHIFT || 1671 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1672 !mapping->a_ops->is_partially_uptodate) 1672 !mapping->a_ops->is_partially_uptodate)
1673 goto page_not_up_to_date; 1673 goto page_not_up_to_date;
@@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page)
2303 return page; 2303 return page;
2304} 2304}
2305 2305
2306static struct page *__read_cache_page(struct address_space *mapping, 2306static struct page *do_read_cache_page(struct address_space *mapping,
2307 pgoff_t index, 2307 pgoff_t index,
2308 int (*filler)(void *, struct page *), 2308 int (*filler)(void *, struct page *),
2309 void *data, 2309 void *data,
@@ -2325,53 +2325,74 @@ repeat:
2325 /* Presumably ENOMEM for radix tree node */ 2325 /* Presumably ENOMEM for radix tree node */
2326 return ERR_PTR(err); 2326 return ERR_PTR(err);
2327 } 2327 }
2328
2329filler:
2328 err = filler(data, page); 2330 err = filler(data, page);
2329 if (err < 0) { 2331 if (err < 0) {
2330 page_cache_release(page); 2332 page_cache_release(page);
2331 page = ERR_PTR(err); 2333 return ERR_PTR(err);
2332 } else {
2333 page = wait_on_page_read(page);
2334 } 2334 }
2335 }
2336 return page;
2337}
2338
2339static struct page *do_read_cache_page(struct address_space *mapping,
2340 pgoff_t index,
2341 int (*filler)(void *, struct page *),
2342 void *data,
2343 gfp_t gfp)
2344 2335
2345{ 2336 page = wait_on_page_read(page);
2346 struct page *page; 2337 if (IS_ERR(page))
2347 int err; 2338 return page;
2339 goto out;
2340 }
2341 if (PageUptodate(page))
2342 goto out;
2348 2343
2349retry: 2344 /*
2350 page = __read_cache_page(mapping, index, filler, data, gfp); 2345 * Page is not up to date and may be locked due to one of the following
2351 if (IS_ERR(page)) 2346 * case a: Page is being filled and the page lock is held
2352 return page; 2347 * case b: Read/write error clearing the page uptodate status
2348 * case c: Truncation in progress (page locked)
2349 * case d: Reclaim in progress
2350 *
2351 * Case a, the page will be up to date when the page is unlocked.
2352 * There is no need to serialise on the page lock here as the page
2353 * is pinned so the lock gives no additional protection. Even if
2354 * the page is truncated, the data is still valid if PageUptodate as
2355 * it's a read vs truncate race.
2356 * Case b, the page will not be up to date
2357 * Case c, the page may be truncated but in itself, the data may still
2358 * be valid after IO completes as it's a read vs truncate race. The
2359 * operation must restart if the page is not uptodate on unlock but
2360 * otherwise serialising on page lock to stabilise the mapping gives
2361 * no additional guarantees to the caller as the page lock is
2362 * released before return.
2363 * Case d, similar to truncation. If reclaim holds the page lock, it
2364 * will be a race with remove_mapping that determines if the mapping
2365 * is valid on unlock but otherwise the data is valid and there is
2366 * no need to serialise with page lock.
2367 *
2368 * As the page lock gives no additional guarantee, we optimistically
2369 * wait on the page to be unlocked and check if it's up to date and
2370 * use the page if it is. Otherwise, the page lock is required to
2371 * distinguish between the different cases. The motivation is that we
2372 * avoid spurious serialisations and wakeups when multiple processes
2373 * wait on the same page for IO to complete.
2374 */
2375 wait_on_page_locked(page);
2353 if (PageUptodate(page)) 2376 if (PageUptodate(page))
2354 goto out; 2377 goto out;
2355 2378
2379 /* Distinguish between all the cases under the safety of the lock */
2356 lock_page(page); 2380 lock_page(page);
2381
2382 /* Case c or d, restart the operation */
2357 if (!page->mapping) { 2383 if (!page->mapping) {
2358 unlock_page(page); 2384 unlock_page(page);
2359 page_cache_release(page); 2385 page_cache_release(page);
2360 goto retry; 2386 goto repeat;
2361 } 2387 }
2388
2389 /* Someone else locked and filled the page in a very small window */
2362 if (PageUptodate(page)) { 2390 if (PageUptodate(page)) {
2363 unlock_page(page); 2391 unlock_page(page);
2364 goto out; 2392 goto out;
2365 } 2393 }
2366 err = filler(data, page); 2394 goto filler;
2367 if (err < 0) { 2395
2368 page_cache_release(page);
2369 return ERR_PTR(err);
2370 } else {
2371 page = wait_on_page_read(page);
2372 if (IS_ERR(page))
2373 return page;
2374 }
2375out: 2396out:
2376 mark_page_accessed(page); 2397 mark_page_accessed(page);
2377 return page; 2398 return page;
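
For context, the refactor above only changes how do_read_cache_page() synchronises internally; callers still go through read_cache_page()/read_mapping_page() and get back either an uptodate page or an ERR_PTR. A hedged caller-side sketch (the helper name is made up for illustration):

#include <linux/pagemap.h>
#include <linux/err.h>

static struct page *example_read_index(struct address_space *mapping,
				       pgoff_t index)
{
	struct page *page = read_mapping_page(mapping, index, NULL);

	if (IS_ERR(page))
		return page;	/* filler failed, e.g. -EIO */

	/*
	 * PageUptodate() is guaranteed here; with the change above a
	 * reader racing with an in-flight fill waits for the unlock
	 * instead of taking the page lock itself.  Drop the reference
	 * with page_cache_release() when done.
	 */
	return page;
}
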
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e10a4fee88d2..1ea21e203a70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
3220 } 3220 }
3221} 3221}
3222 3222
3223static int __split_huge_page_tail(struct page *head, int tail, 3223static void __split_huge_page_tail(struct page *head, int tail,
3224 struct lruvec *lruvec, struct list_head *list) 3224 struct lruvec *lruvec, struct list_head *list)
3225{ 3225{
3226 int mapcount;
3227 struct page *page_tail = head + tail; 3226 struct page *page_tail = head + tail;
3228 3227
3229 mapcount = atomic_read(&page_tail->_mapcount) + 1; 3228 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
3230 VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); 3229 VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
3231 3230
3232 /* 3231 /*
3233 * tail_page->_count is zero and not changing from under us. But 3232 * tail_page->_count is zero and not changing from under us. But
3234 * get_page_unless_zero() may be running from under us on the 3233 * get_page_unless_zero() may be running from under us on the
3235 * tail_page. If we used atomic_set() below instead of atomic_add(), we 3234 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
3236 * would then run atomic_set() concurrently with 3235 * would then run atomic_set() concurrently with
3237 * get_page_unless_zero(), and atomic_set() is implemented in C not 3236 * get_page_unless_zero(), and atomic_set() is implemented in C not
3238 * using locked ops. spin_unlock on x86 sometimes uses locked ops 3237 * using locked ops. spin_unlock on x86 sometimes uses locked ops
3239 * because of PPro errata 66, 92, so unless somebody can guarantee 3238 * because of PPro errata 66, 92, so unless somebody can guarantee
3240 * atomic_set() here would be safe on all archs (and not only on x86), 3239 * atomic_set() here would be safe on all archs (and not only on x86),
3241 * it's safer to use atomic_add(). 3240 * it's safer to use atomic_inc().
3242 */ 3241 */
3243 atomic_add(mapcount + 1, &page_tail->_count); 3242 atomic_inc(&page_tail->_count);
3244
3245 3243
3246 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3244 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3247 page_tail->flags |= (head->flags & 3245 page_tail->flags |= (head->flags &
@@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
3275 page_tail->index = head->index + tail; 3273 page_tail->index = head->index + tail;
3276 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 3274 page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3277 lru_add_page_tail(head, page_tail, lruvec, list); 3275 lru_add_page_tail(head, page_tail, lruvec, list);
3278
3279 return mapcount;
3280} 3276}
3281 3277
3282static void __split_huge_page(struct page *page, struct list_head *list) 3278static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
3284 struct page *head = compound_head(page); 3280 struct page *head = compound_head(page);
3285 struct zone *zone = page_zone(head); 3281 struct zone *zone = page_zone(head);
3286 struct lruvec *lruvec; 3282 struct lruvec *lruvec;
3287 int i, tail_mapcount; 3283 int i;
3288 3284
3289 /* prevent PageLRU to go away from under us, and freeze lru stats */ 3285 /* prevent PageLRU to go away from under us, and freeze lru stats */
3290 spin_lock_irq(&zone->lru_lock); 3286 spin_lock_irq(&zone->lru_lock);
@@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
3293 /* complete memcg works before add pages to LRU */ 3289 /* complete memcg works before add pages to LRU */
3294 mem_cgroup_split_huge_fixup(head); 3290 mem_cgroup_split_huge_fixup(head);
3295 3291
3296 tail_mapcount = 0;
3297 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) 3292 for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
3298 tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); 3293 __split_huge_page_tail(head, i, lruvec, list);
3299 atomic_sub(tail_mapcount, &head->_count);
3300 3294
3301 ClearPageCompound(head); 3295 ClearPageCompound(head);
3302 spin_unlock_irq(&zone->lru_lock); 3296 spin_unlock_irq(&zone->lru_lock);
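
The atomic_inc() reasoning in the comment above boils down to the fact that two read-modify-write operations compose, while a plain store can lose a concurrent increment. An illustrative interleaving, written against the fields used in this hunk (not code from the patch):

#include <linux/mm.h>

static void split_side(struct page *tail)
{
	atomic_inc(&tail->_count);		/* RMW: composes with ... */
}

static bool speculative_side(struct page *tail)
{
	return get_page_unless_zero(tail);	/* ... this concurrent RMW */
}

/*
 * If split_side() used atomic_set(&tail->_count, 1) instead, a
 * get_page_unless_zero() that slipped in between would have its
 * increment overwritten, leaking or double-dropping a reference.
 */
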
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..ad9400d759c8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/tracepoint-defs.h>
17 18
18/* 19/*
19 * The set of flags that only affect watermark checking and reclaim 20 * The set of flags that only affect watermark checking and reclaim
@@ -131,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
131 return page_idx ^ (1 << order); 132 return page_idx ^ (1 << order);
132} 133}
133 134
135extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
136 unsigned long end_pfn, struct zone *zone);
137
138static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
139 unsigned long end_pfn, struct zone *zone)
140{
141 if (zone->contiguous)
142 return pfn_to_page(start_pfn);
143
144 return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
145}
146
134extern int __isolate_free_page(struct page *page, unsigned int order); 147extern int __isolate_free_page(struct page *page, unsigned int order);
135extern void __free_pages_bootmem(struct page *page, unsigned long pfn, 148extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
136 unsigned int order); 149 unsigned int order);
@@ -466,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void)
466} 479}
467 480
468#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 481#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
482
483extern const struct trace_print_flags pageflag_names[];
484extern const struct trace_print_flags vmaflag_names[];
485extern const struct trace_print_flags gfpflag_names[];
486
469#endif /* __MM_INTERNAL_H */ 487#endif /* __MM_INTERNAL_H */
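
The new pageblock_pfn_to_page() fast path is aimed at scanners that walk a zone pageblock by pageblock: when the zone is known to be contiguous the cheap pfn_to_page() suffices, otherwise the hole/zone-ownership checks run in __pageblock_pfn_to_page(). A hedged sketch of the call pattern (the function name and loop body are illustrative):

#include <linux/mmzone.h>
#include "internal.h"

static void example_scan_pageblocks(struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page = pageblock_pfn_to_page(pfn,
					pfn + pageblock_nr_pages, zone);

		if (!page)	/* hole, or block straddling two zones */
			continue;
		/* ... isolate/inspect pages in this block ... */
	}
}
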
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
60void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, 60void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
61 size_t size) 61 size_t size)
62{ 62{
63 if (unlikely(!object)) /* Skip object if allocation failed */
64 return;
65
63 /* 66 /*
64 * Has already been memset(), which initializes the shadow for us 67 * Has already been memset(), which initializes the shadow for us
65 * as well. 68 * as well.
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
555 } 555 }
556 pr_info("Injecting memory failure for page %#lx at %#lx\n", 556 pr_info("Injecting memory failure for page %#lx at %#lx\n",
557 page_to_pfn(p), start); 557 page_to_pfn(p), start);
558 /* Ignore return value for now */ 558 ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
559 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 559 if (ret)
560 return ret;
560 } 561 }
561 return 0; 562 return 0;
562} 563}
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
638 * some pages ahead. 639 * some pages ahead.
639 * MADV_DONTNEED - the application is finished with the given range, 640 * MADV_DONTNEED - the application is finished with the given range,
640 * so the kernel can free resources associated with it. 641 * so the kernel can free resources associated with it.
642 * MADV_FREE - the application marks pages in the given range as lazy free,
643 * where actual purges are postponed until memory pressure happens.
641 * MADV_REMOVE - the application wants to free up the given range of 644 * MADV_REMOVE - the application wants to free up the given range of
642 * pages and associated backing store. 645 * pages and associated backing store.
643 * MADV_DONTFORK - omit this area from child's address space when forking: 646 * MADV_DONTFORK - omit this area from child's address space when forking:
644 * typically, to avoid COWing pages pinned by get_user_pages(). 647 * typically, to avoid COWing pages pinned by get_user_pages().
645 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 648 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
649 * MADV_HWPOISON - trigger memory error handler as if the given memory range
650 * were corrupted by unrecoverable hardware memory failure.
651 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
646 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 652 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
647 * this area with pages of identical content from other such areas. 653 * this area with pages of identical content from other such areas.
648 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 654 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
655 * MADV_HUGEPAGE - the application wants to back the given range by transparent
656 * huge pages in the future. Existing pages might be coalesced and
657 * new pages might be allocated as THP.
658 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
659 * transparent huge pages so the existing pages will not be
660 * coalesced into THP and new pages will not be allocated as THP.
661 * MADV_DONTDUMP - the application wants to prevent pages in the given range
662 * from being included in its core dump.
663 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
649 * 664 *
650 * return values: 665 * return values:
651 * zero - success 666 * zero - success
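
The newly documented MADV_FREE hint is the user-visible side of this series. A small userspace sketch (assumes the libc headers already carry the constant; the fallback define uses the uapi value and is an assumption about your toolchain):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* uapi value; only needed if libc headers lack it */
#endif

/*
 * Mark a buffer as lazily freeable: contents survive until memory
 * pressure, and a later write to a page cancels the lazy free.
 */
static int release_lazily(void *buf, size_t len)
{
	return madvise(buf, len, MADV_FREE);
}
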
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..fc7824fa1b42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
612 int nid, 612 int nid,
613 unsigned long flags) 613 unsigned long flags)
614{ 614{
615 struct memblock_type *type = &memblock.memory;
616
617 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", 615 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
618 (unsigned long long)base, 616 (unsigned long long)base,
619 (unsigned long long)base + size - 1, 617 (unsigned long long)base + size - 1,
620 flags, (void *)_RET_IP_); 618 flags, (void *)_RET_IP_);
621 619
622 return memblock_add_range(type, base, size, nid, flags); 620 return memblock_add_range(&memblock.memory, base, size, nid, flags);
623} 621}
624 622
625int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 623int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
740 int nid, 738 int nid,
741 unsigned long flags) 739 unsigned long flags)
742{ 740{
743 struct memblock_type *type = &memblock.reserved;
744
745 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", 741 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
746 (unsigned long long)base, 742 (unsigned long long)base,
747 (unsigned long long)base + size - 1, 743 (unsigned long long)base + size - 1,
748 flags, (void *)_RET_IP_); 744 flags, (void *)_RET_IP_);
749 745
750 return memblock_add_range(type, base, size, nid, flags); 746 return memblock_add_range(&memblock.reserved, base, size, nid, flags);
751} 747}
752 748
753int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 749int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..42882c1e7fce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
268 return (memcg == root_mem_cgroup); 268 return (memcg == root_mem_cgroup);
269} 269}
270 270
271/*
272 * We restrict the id in the range of [1, 65535], so it can fit into
273 * an unsigned short.
274 */
275#define MEM_CGROUP_ID_MAX USHRT_MAX
276
277static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
278{
279 return memcg->css.id;
280}
281
282/*
283 * A helper function to get mem_cgroup from ID. must be called under
284 * rcu_read_lock(). The caller is responsible for calling
285 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
286 * refcnt from swap can be called against removed memcg.)
287 */
288static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
289{
290 struct cgroup_subsys_state *css;
291
292 css = css_from_id(id, &memory_cgrp_subsys);
293 return mem_cgroup_from_css(css);
294}
295
296#ifndef CONFIG_SLOB 271#ifndef CONFIG_SLOB
297/* 272/*
298 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 273 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -1709,19 +1684,13 @@ cleanup:
1709} 1684}
1710 1685
1711/** 1686/**
1712 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1687 * lock_page_memcg - lock a page->mem_cgroup binding
1713 * @page: page that is going to change accounted state 1688 * @page: the page
1714 *
1715 * This function must mark the beginning of an accounted page state
1716 * change to prevent double accounting when the page is concurrently
1717 * being moved to another memcg:
1718 * 1689 *
1719 * memcg = mem_cgroup_begin_page_stat(page); 1690 * This function protects unlocked LRU pages from being moved to
1720 * if (TestClearPageState(page)) 1691 * another cgroup and stabilizes their page->mem_cgroup binding.
1721 * mem_cgroup_update_page_stat(memcg, state, -1);
1722 * mem_cgroup_end_page_stat(memcg);
1723 */ 1692 */
1724struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) 1693void lock_page_memcg(struct page *page)
1725{ 1694{
1726 struct mem_cgroup *memcg; 1695 struct mem_cgroup *memcg;
1727 unsigned long flags; 1696 unsigned long flags;
@@ -1730,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
1730 * The RCU lock is held throughout the transaction. The fast 1699 * The RCU lock is held throughout the transaction. The fast
1731 * path can get away without acquiring the memcg->move_lock 1700 * path can get away without acquiring the memcg->move_lock
1732 * because page moving starts with an RCU grace period. 1701 * because page moving starts with an RCU grace period.
1733 *
1734 * The RCU lock also protects the memcg from being freed when
1735 * the page state that is going to change is the only thing
1736 * preventing the page from being uncharged.
1737 * E.g. end-writeback clearing PageWriteback(), which allows
1738 * migration to go ahead and uncharge the page before the
1739 * account transaction might be complete.
1740 */ 1702 */
1741 rcu_read_lock(); 1703 rcu_read_lock();
1742 1704
1743 if (mem_cgroup_disabled()) 1705 if (mem_cgroup_disabled())
1744 return NULL; 1706 return;
1745again: 1707again:
1746 memcg = page->mem_cgroup; 1708 memcg = page->mem_cgroup;
1747 if (unlikely(!memcg)) 1709 if (unlikely(!memcg))
1748 return NULL; 1710 return;
1749 1711
1750 if (atomic_read(&memcg->moving_account) <= 0) 1712 if (atomic_read(&memcg->moving_account) <= 0)
1751 return memcg; 1713 return;
1752 1714
1753 spin_lock_irqsave(&memcg->move_lock, flags); 1715 spin_lock_irqsave(&memcg->move_lock, flags);
1754 if (memcg != page->mem_cgroup) { 1716 if (memcg != page->mem_cgroup) {
@@ -1759,21 +1721,23 @@ again:
1759 /* 1721 /*
1760 * When charge migration first begins, we can have locked and 1722 * When charge migration first begins, we can have locked and
1761 * unlocked page stat updates happening concurrently. Track 1723 * unlocked page stat updates happening concurrently. Track
1762 * the task who has the lock for mem_cgroup_end_page_stat(). 1724 * the task who has the lock for unlock_page_memcg().
1763 */ 1725 */
1764 memcg->move_lock_task = current; 1726 memcg->move_lock_task = current;
1765 memcg->move_lock_flags = flags; 1727 memcg->move_lock_flags = flags;
1766 1728
1767 return memcg; 1729 return;
1768} 1730}
1769EXPORT_SYMBOL(mem_cgroup_begin_page_stat); 1731EXPORT_SYMBOL(lock_page_memcg);
1770 1732
1771/** 1733/**
1772 * mem_cgroup_end_page_stat - finish a page state statistics transaction 1734 * unlock_page_memcg - unlock a page->mem_cgroup binding
1773 * @memcg: the memcg that was accounted against 1735 * @page: the page
1774 */ 1736 */
1775void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) 1737void unlock_page_memcg(struct page *page)
1776{ 1738{
1739 struct mem_cgroup *memcg = page->mem_cgroup;
1740
1777 if (memcg && memcg->move_lock_task == current) { 1741 if (memcg && memcg->move_lock_task == current) {
1778 unsigned long flags = memcg->move_lock_flags; 1742 unsigned long flags = memcg->move_lock_flags;
1779 1743
@@ -1785,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
1785 1749
1786 rcu_read_unlock(); 1750 rcu_read_unlock();
1787} 1751}
1788EXPORT_SYMBOL(mem_cgroup_end_page_stat); 1752EXPORT_SYMBOL(unlock_page_memcg);
1789 1753
1790/* 1754/*
1791 * size of first charge trial. "32" comes from vmscan.c's magic value. 1755 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -4488,7 +4452,7 @@ static int mem_cgroup_move_account(struct page *page,
4488 VM_BUG_ON(compound && !PageTransHuge(page)); 4452 VM_BUG_ON(compound && !PageTransHuge(page));
4489 4453
4490 /* 4454 /*
4491 * Prevent mem_cgroup_replace_page() from looking at 4455 * Prevent mem_cgroup_migrate() from looking at
4492 * page->mem_cgroup of its source page while we change it. 4456 * page->mem_cgroup of its source page while we change it.
4493 */ 4457 */
4494 ret = -EBUSY; 4458 ret = -EBUSY;
@@ -4923,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4923 4887
4924 lru_add_drain_all(); 4888 lru_add_drain_all();
4925 /* 4889 /*
4926 * Signal mem_cgroup_begin_page_stat() to take the memcg's 4890 * Signal lock_page_memcg() to take the memcg's move_lock
4927 * move_lock while we're moving its pages to another memcg. 4891 * while we're moving its pages to another memcg. Then wait
4928 * Then wait for already started RCU-only updates to finish. 4892 * for already started RCU-only updates to finish.
4929 */ 4893 */
4930 atomic_inc(&mc.from->moving_account); 4894 atomic_inc(&mc.from->moving_account);
4931 synchronize_rcu(); 4895 synchronize_rcu();
@@ -5517,16 +5481,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
5517} 5481}
5518 5482
5519/** 5483/**
5520 * mem_cgroup_replace_page - migrate a charge to another page 5484 * mem_cgroup_migrate - charge a page's replacement
5521 * @oldpage: currently charged page 5485 * @oldpage: currently circulating page
5522 * @newpage: page to transfer the charge to 5486 * @newpage: replacement page
5523 * 5487 *
5524 * Migrate the charge from @oldpage to @newpage. 5488 * Charge @newpage as a replacement page for @oldpage. @oldpage will
5489 * be uncharged upon free.
5525 * 5490 *
5526 * Both pages must be locked, @newpage->mapping must be set up. 5491 * Both pages must be locked, @newpage->mapping must be set up.
5527 * Either or both pages might be on the LRU already.
5528 */ 5492 */
5529void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) 5493void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5530{ 5494{
5531 struct mem_cgroup *memcg; 5495 struct mem_cgroup *memcg;
5532 unsigned int nr_pages; 5496 unsigned int nr_pages;
@@ -5559,7 +5523,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
5559 page_counter_charge(&memcg->memsw, nr_pages); 5523 page_counter_charge(&memcg->memsw, nr_pages);
5560 css_get_many(&memcg->css, nr_pages); 5524 css_get_many(&memcg->css, nr_pages);
5561 5525
5562 commit_charge(newpage, memcg, true); 5526 commit_charge(newpage, memcg, false);
5563 5527
5564 local_irq_disable(); 5528 local_irq_disable();
5565 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); 5529 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
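
After the rename, page-state transactions no longer pass a struct mem_cgroup around; they bracket the update with the page itself, as the page-writeback.c conversions later in this patch do. A hedged sketch of the pattern (the helper is illustrative, modelled loosely on __set_page_dirty_nobuffers()):

#include <linux/memcontrol.h>
#include <linux/page-flags.h>

static void example_account_dirty(struct page *page)
{
	lock_page_memcg(page);		/* stabilise page->mem_cgroup */
	if (!TestSetPageDirty(page))
		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
	unlock_page_memcg(page);
}
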
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3a95..67c30eb993f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -826,8 +826,6 @@ static struct page_state {
826#undef lru 826#undef lru
827#undef swapbacked 827#undef swapbacked
828#undef head 828#undef head
829#undef tail
830#undef compound
831#undef slab 829#undef slab
832#undef reserved 830#undef reserved
833 831
diff --git a/mm/memory.c b/mm/memory.c
index 906d8e3b42c0..0e247642ed5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1897,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1897 unsigned long end = addr + size; 1897 unsigned long end = addr + size;
1898 int err; 1898 int err;
1899 1899
1900 BUG_ON(addr >= end); 1900 if (WARN_ON(addr >= end))
1901 return -EINVAL;
1902
1901 pgd = pgd_offset(mm, addr); 1903 pgd = pgd_offset(mm, addr);
1902 do { 1904 do {
1903 next = pgd_addr_end(addr, end); 1905 next = pgd_addr_end(addr, end);
@@ -3143,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 unsigned long address, pte_t *page_table, pmd_t *pmd, 3145 unsigned long address, pte_t *page_table, pmd_t *pmd,
3144 unsigned int flags, pte_t orig_pte) 3146 unsigned int flags, pte_t orig_pte)
3145{ 3147{
3146 pgoff_t pgoff = (((address & PAGE_MASK) 3148 pgoff_t pgoff = linear_page_index(vma, address);
3147 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3148 3149
3149 pte_unmap(page_table); 3150 pte_unmap(page_table);
3150 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3151 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 979b18cbd343..24ea06393816 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -77,6 +77,9 @@ static struct {
77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
79 79
80bool memhp_auto_online;
81EXPORT_SYMBOL_GPL(memhp_auto_online);
82
80void get_online_mems(void) 83void get_online_mems(void)
81{ 84{
82 might_sleep(); 85 might_sleep();
@@ -509,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
509 int start_sec, end_sec; 512 int start_sec, end_sec;
510 struct vmem_altmap *altmap; 513 struct vmem_altmap *altmap;
511 514
515 clear_zone_contiguous(zone);
516
512 /* during initialize mem_map, align hot-added range to section */ 517 /* during initialize mem_map, align hot-added range to section */
513 start_sec = pfn_to_section_nr(phys_start_pfn); 518 start_sec = pfn_to_section_nr(phys_start_pfn);
514 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 519 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -521,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
521 if (altmap->base_pfn != phys_start_pfn 526 if (altmap->base_pfn != phys_start_pfn
522 || vmem_altmap_offset(altmap) > nr_pages) { 527 || vmem_altmap_offset(altmap) > nr_pages) {
523 pr_warn_once("memory add fail, invalid altmap\n"); 528 pr_warn_once("memory add fail, invalid altmap\n");
524 return -EINVAL; 529 err = -EINVAL;
530 goto out;
525 } 531 }
526 altmap->alloc = 0; 532 altmap->alloc = 0;
527 } 533 }
@@ -539,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
539 err = 0; 545 err = 0;
540 } 546 }
541 vmemmap_populate_print_last(); 547 vmemmap_populate_print_last();
542 548out:
549 set_zone_contiguous(zone);
543 return err; 550 return err;
544} 551}
545EXPORT_SYMBOL_GPL(__add_pages); 552EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
811 } 818 }
812 } 819 }
813 820
821 clear_zone_contiguous(zone);
822
814 /* 823 /*
815 * We can only remove entire sections 824 * We can only remove entire sections
816 */ 825 */
@@ -826,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
826 if (ret) 835 if (ret)
827 break; 836 break;
828 } 837 }
838
839 set_zone_contiguous(zone);
840
829 return ret; 841 return ret;
830} 842}
831EXPORT_SYMBOL_GPL(__remove_pages); 843EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1261,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
1261 return zone_default; 1273 return zone_default;
1262} 1274}
1263 1275
1276static int online_memory_block(struct memory_block *mem, void *arg)
1277{
1278 return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
1279}
1280
1264/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1281/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1265int __ref add_memory_resource(int nid, struct resource *res) 1282int __ref add_memory_resource(int nid, struct resource *res, bool online)
1266{ 1283{
1267 u64 start, size; 1284 u64 start, size;
1268 pg_data_t *pgdat = NULL; 1285 pg_data_t *pgdat = NULL;
@@ -1322,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
1322 /* create new memmap entry */ 1339 /* create new memmap entry */
1323 firmware_map_add_hotplug(start, start + size, "System RAM"); 1340 firmware_map_add_hotplug(start, start + size, "System RAM");
1324 1341
1342 /* online pages if requested */
1343 if (online)
1344 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1345 NULL, online_memory_block);
1346
1325 goto out; 1347 goto out;
1326 1348
1327error: 1349error:
@@ -1345,7 +1367,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
1345 if (IS_ERR(res)) 1367 if (IS_ERR(res))
1346 return PTR_ERR(res); 1368 return PTR_ERR(res);
1347 1369
1348 ret = add_memory_resource(nid, res); 1370 ret = add_memory_resource(nid, res, memhp_auto_online);
1349 if (ret < 0) 1371 if (ret < 0)
1350 release_memory_resource(res); 1372 release_memory_resource(res);
1351 return ret; 1373 return ret;
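
With the extra parameter, callers that manage their own resource decide the onlining policy explicitly, while plain add_memory() now follows the memhp_auto_online knob. A hedged caller sketch (the wrapper name is illustrative):

#include <linux/memory_hotplug.h>
#include <linux/ioport.h>

static int example_hotplug(int nid, struct resource *res, bool online_now)
{
	/*
	 * online_now == true mirrors what memhp_auto_online does for
	 * add_memory(): each new memory block is walked and switched
	 * MEM_OFFLINE -> MEM_ONLINE right after being added.
	 */
	return add_memory_resource(nid, res, online_now);
}
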
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a3f6b90e628..8cbc74387df3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
643 643
644 if (flags & MPOL_MF_LAZY) { 644 if (flags & MPOL_MF_LAZY) {
645 /* Similar to task_numa_work, skip inaccessible VMAs */ 645 /* Similar to task_numa_work, skip inaccessible VMAs */
646 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 646 if (!is_vm_hugetlb_page(vma) &&
647 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
648 !(vma->vm_flags & VM_MIXEDMAP))
647 change_prot_numa(vma, start, endvma); 649 change_prot_numa(vma, start, endvma);
648 return 1; 650 return 1;
649 } 651 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ad0fea5c438..568284ec75d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
38#include <linux/balloon_compaction.h> 38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h> 39#include <linux/mmu_notifier.h>
40#include <linux/page_idle.h> 40#include <linux/page_idle.h>
41#include <linux/page_owner.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43 44
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
325 return -EAGAIN; 326 return -EAGAIN;
326 327
327 /* No turning back from here */ 328 /* No turning back from here */
328 set_page_memcg(newpage, page_memcg(page));
329 newpage->index = page->index; 329 newpage->index = page->index;
330 newpage->mapping = page->mapping; 330 newpage->mapping = page->mapping;
331 if (PageSwapBacked(page)) 331 if (PageSwapBacked(page))
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
372 * Now we know that no one else is looking at the page: 372 * Now we know that no one else is looking at the page:
373 * no turning back from here. 373 * no turning back from here.
374 */ 374 */
375 set_page_memcg(newpage, page_memcg(page));
376 newpage->index = page->index; 375 newpage->index = page->index;
377 newpage->mapping = page->mapping; 376 newpage->mapping = page->mapping;
378 if (PageSwapBacked(page)) 377 if (PageSwapBacked(page))
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
457 return -EAGAIN; 456 return -EAGAIN;
458 } 457 }
459 458
460 set_page_memcg(newpage, page_memcg(page));
461 newpage->index = page->index; 459 newpage->index = page->index;
462 newpage->mapping = page->mapping; 460 newpage->mapping = page->mapping;
461
463 get_page(newpage); 462 get_page(newpage);
464 463
465 radix_tree_replace_slot(pslot, newpage); 464 radix_tree_replace_slot(pslot, newpage);
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
467 page_unfreeze_refs(page, expected_count - 1); 466 page_unfreeze_refs(page, expected_count - 1);
468 467
469 spin_unlock_irq(&mapping->tree_lock); 468 spin_unlock_irq(&mapping->tree_lock);
469
470 return MIGRATEPAGE_SUCCESS; 470 return MIGRATEPAGE_SUCCESS;
471} 471}
472 472
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
578 */ 578 */
579 if (PageWriteback(newpage)) 579 if (PageWriteback(newpage))
580 end_page_writeback(newpage); 580 end_page_writeback(newpage);
581
582 copy_page_owner(page, newpage);
583
584 mem_cgroup_migrate(page, newpage);
581} 585}
582 586
583/************************************************************ 587/************************************************************
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
772 * page is freed; but stats require that PageAnon be left as PageAnon. 776 * page is freed; but stats require that PageAnon be left as PageAnon.
773 */ 777 */
774 if (rc == MIGRATEPAGE_SUCCESS) { 778 if (rc == MIGRATEPAGE_SUCCESS) {
775 set_page_memcg(page, NULL);
776 if (!PageAnon(page)) 779 if (!PageAnon(page))
777 page->mapping = NULL; 780 page->mapping = NULL;
778 } 781 }
@@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
952 } 955 }
953 956
954 rc = __unmap_and_move(page, newpage, force, mode); 957 rc = __unmap_and_move(page, newpage, force, mode);
955 if (rc == MIGRATEPAGE_SUCCESS) 958 if (rc == MIGRATEPAGE_SUCCESS) {
956 put_new_page = NULL; 959 put_new_page = NULL;
960 set_page_owner_migrate_reason(newpage, reason);
961 }
957 962
958out: 963out:
959 if (rc != -EAGAIN) { 964 if (rc != -EAGAIN) {
@@ -1018,7 +1023,7 @@ out:
1018static int unmap_and_move_huge_page(new_page_t get_new_page, 1023static int unmap_and_move_huge_page(new_page_t get_new_page,
1019 free_page_t put_new_page, unsigned long private, 1024 free_page_t put_new_page, unsigned long private,
1020 struct page *hpage, int force, 1025 struct page *hpage, int force,
1021 enum migrate_mode mode) 1026 enum migrate_mode mode, int reason)
1022{ 1027{
1023 int rc = -EAGAIN; 1028 int rc = -EAGAIN;
1024 int *result = NULL; 1029 int *result = NULL;
@@ -1076,6 +1081,7 @@ put_anon:
1076 if (rc == MIGRATEPAGE_SUCCESS) { 1081 if (rc == MIGRATEPAGE_SUCCESS) {
1077 hugetlb_cgroup_migrate(hpage, new_hpage); 1082 hugetlb_cgroup_migrate(hpage, new_hpage);
1078 put_new_page = NULL; 1083 put_new_page = NULL;
1084 set_page_owner_migrate_reason(new_hpage, reason);
1079 } 1085 }
1080 1086
1081 unlock_page(hpage); 1087 unlock_page(hpage);
@@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1148 if (PageHuge(page)) 1154 if (PageHuge(page))
1149 rc = unmap_and_move_huge_page(get_new_page, 1155 rc = unmap_and_move_huge_page(get_new_page,
1150 put_new_page, private, page, 1156 put_new_page, private, page,
1151 pass > 2, mode); 1157 pass > 2, mode, reason);
1152 else 1158 else
1153 rc = unmap_and_move(get_new_page, put_new_page, 1159 rc = unmap_and_move(get_new_page, put_new_page,
1154 private, page, pass > 2, mode, 1160 private, page, pass > 2, mode,
@@ -1836,9 +1842,8 @@ fail_putback:
1836 } 1842 }
1837 1843
1838 mlock_migrate_page(new_page, page); 1844 mlock_migrate_page(new_page, page);
1839 set_page_memcg(new_page, page_memcg(page));
1840 set_page_memcg(page, NULL);
1841 page_remove_rmap(page, true); 1845 page_remove_rmap(page, true);
1846 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
1842 1847
1843 spin_unlock(ptl); 1848 spin_unlock(ptl);
1844 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1849 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c06941b..e97a05d9621f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
386static void dump_header(struct oom_control *oc, struct task_struct *p, 386static void dump_header(struct oom_control *oc, struct task_struct *p,
387 struct mem_cgroup *memcg) 387 struct mem_cgroup *memcg)
388{ 388{
389 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 389 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
390 "oom_score_adj=%hd\n", 390 "oom_score_adj=%hd\n",
391 current->comm, oc->gfp_mask, oc->order, 391 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
392 current->signal->oom_score_adj); 392 current->signal->oom_score_adj);
393
393 cpuset_print_current_mems_allowed(); 394 cpuset_print_current_mems_allowed();
394 dump_stack(); 395 dump_stack();
395 if (memcg) 396 if (memcg)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6fe7d15bd1f7..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1169 unsigned long balanced_dirty_ratelimit; 1169 unsigned long balanced_dirty_ratelimit;
1170 unsigned long step; 1170 unsigned long step;
1171 unsigned long x; 1171 unsigned long x;
1172 unsigned long shift;
1172 1173
1173 /* 1174 /*
1174 * The dirty rate will match the writeout rate in long term, except 1175 * The dirty rate will match the writeout rate in long term, except
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1293 * rate itself is constantly fluctuating. So decrease the track speed 1294 * rate itself is constantly fluctuating. So decrease the track speed
1294 * when it gets close to the target. Helps eliminate pointless tremors. 1295 * when it gets close to the target. Helps eliminate pointless tremors.
1295 */ 1296 */
1296 step >>= dirty_ratelimit / (2 * step + 1); 1297 shift = dirty_ratelimit / (2 * step + 1);
1297 /* 1298 if (shift < BITS_PER_LONG)
1298 * Limit the tracking speed to avoid overshooting. 1299 step = DIV_ROUND_UP(step >> shift, 8);
1299 */ 1300 else
1300 step = (step + 7) / 8; 1301 step = 0;
1301 1302
1302 if (dirty_ratelimit < balanced_dirty_ratelimit) 1303 if (dirty_ratelimit < balanced_dirty_ratelimit)
1303 dirty_ratelimit += step; 1304 dirty_ratelimit += step;
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
2409/* 2410/*
2410 * Helper function for set_page_dirty family. 2411 * Helper function for set_page_dirty family.
2411 * 2412 *
2412 * Caller must hold mem_cgroup_begin_page_stat(). 2413 * Caller must hold lock_page_memcg().
2413 * 2414 *
2414 * NOTE: This relies on being atomic wrt interrupts. 2415 * NOTE: This relies on being atomic wrt interrupts.
2415 */ 2416 */
2416void account_page_dirtied(struct page *page, struct address_space *mapping, 2417void account_page_dirtied(struct page *page, struct address_space *mapping)
2417 struct mem_cgroup *memcg)
2418{ 2418{
2419 struct inode *inode = mapping->host; 2419 struct inode *inode = mapping->host;
2420 2420
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
2426 inode_attach_wb(inode, page); 2426 inode_attach_wb(inode, page);
2427 wb = inode_to_wb(inode); 2427 wb = inode_to_wb(inode);
2428 2428
2429 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2429 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2430 __inc_zone_page_state(page, NR_FILE_DIRTY); 2430 __inc_zone_page_state(page, NR_FILE_DIRTY);
2431 __inc_zone_page_state(page, NR_DIRTIED); 2431 __inc_zone_page_state(page, NR_DIRTIED);
2432 __inc_wb_stat(wb, WB_RECLAIMABLE); 2432 __inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
2441/* 2441/*
2442 * Helper function for deaccounting dirty page without writeback. 2442 * Helper function for deaccounting dirty page without writeback.
2443 * 2443 *
2444 * Caller must hold mem_cgroup_begin_page_stat(). 2444 * Caller must hold lock_page_memcg().
2445 */ 2445 */
2446void account_page_cleaned(struct page *page, struct address_space *mapping, 2446void account_page_cleaned(struct page *page, struct address_space *mapping,
2447 struct mem_cgroup *memcg, struct bdi_writeback *wb) 2447 struct bdi_writeback *wb)
2448{ 2448{
2449 if (mapping_cap_account_dirty(mapping)) { 2449 if (mapping_cap_account_dirty(mapping)) {
2450 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2450 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2451 dec_zone_page_state(page, NR_FILE_DIRTY); 2451 dec_zone_page_state(page, NR_FILE_DIRTY);
2452 dec_wb_stat(wb, WB_RECLAIMABLE); 2452 dec_wb_stat(wb, WB_RECLAIMABLE);
2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE); 2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
2468 */ 2468 */
2469int __set_page_dirty_nobuffers(struct page *page) 2469int __set_page_dirty_nobuffers(struct page *page)
2470{ 2470{
2471 struct mem_cgroup *memcg; 2471 lock_page_memcg(page);
2472
2473 memcg = mem_cgroup_begin_page_stat(page);
2474 if (!TestSetPageDirty(page)) { 2472 if (!TestSetPageDirty(page)) {
2475 struct address_space *mapping = page_mapping(page); 2473 struct address_space *mapping = page_mapping(page);
2476 unsigned long flags; 2474 unsigned long flags;
2477 2475
2478 if (!mapping) { 2476 if (!mapping) {
2479 mem_cgroup_end_page_stat(memcg); 2477 unlock_page_memcg(page);
2480 return 1; 2478 return 1;
2481 } 2479 }
2482 2480
2483 spin_lock_irqsave(&mapping->tree_lock, flags); 2481 spin_lock_irqsave(&mapping->tree_lock, flags);
2484 BUG_ON(page_mapping(page) != mapping); 2482 BUG_ON(page_mapping(page) != mapping);
2485 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2483 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2486 account_page_dirtied(page, mapping, memcg); 2484 account_page_dirtied(page, mapping);
2487 radix_tree_tag_set(&mapping->page_tree, page_index(page), 2485 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2488 PAGECACHE_TAG_DIRTY); 2486 PAGECACHE_TAG_DIRTY);
2489 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2487 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2490 mem_cgroup_end_page_stat(memcg); 2488 unlock_page_memcg(page);
2491 2489
2492 if (mapping->host) { 2490 if (mapping->host) {
2493 /* !PageAnon && !swapper_space */ 2491 /* !PageAnon && !swapper_space */
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
2495 } 2493 }
2496 return 1; 2494 return 1;
2497 } 2495 }
2498 mem_cgroup_end_page_stat(memcg); 2496 unlock_page_memcg(page);
2499 return 0; 2497 return 0;
2500} 2498}
2501EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2499EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
2625 if (mapping_cap_account_dirty(mapping)) { 2623 if (mapping_cap_account_dirty(mapping)) {
2626 struct inode *inode = mapping->host; 2624 struct inode *inode = mapping->host;
2627 struct bdi_writeback *wb; 2625 struct bdi_writeback *wb;
2628 struct mem_cgroup *memcg;
2629 bool locked; 2626 bool locked;
2630 2627
2631 memcg = mem_cgroup_begin_page_stat(page); 2628 lock_page_memcg(page);
2632 wb = unlocked_inode_to_wb_begin(inode, &locked); 2629 wb = unlocked_inode_to_wb_begin(inode, &locked);
2633 2630
2634 if (TestClearPageDirty(page)) 2631 if (TestClearPageDirty(page))
2635 account_page_cleaned(page, mapping, memcg, wb); 2632 account_page_cleaned(page, mapping, wb);
2636 2633
2637 unlocked_inode_to_wb_end(inode, locked); 2634 unlocked_inode_to_wb_end(inode, locked);
2638 mem_cgroup_end_page_stat(memcg); 2635 unlock_page_memcg(page);
2639 } else { 2636 } else {
2640 ClearPageDirty(page); 2637 ClearPageDirty(page);
2641 } 2638 }
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
2666 if (mapping && mapping_cap_account_dirty(mapping)) { 2663 if (mapping && mapping_cap_account_dirty(mapping)) {
2667 struct inode *inode = mapping->host; 2664 struct inode *inode = mapping->host;
2668 struct bdi_writeback *wb; 2665 struct bdi_writeback *wb;
2669 struct mem_cgroup *memcg;
2670 bool locked; 2666 bool locked;
2671 2667
2672 /* 2668 /*
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
2704 * always locked coming in here, so we get the desired 2700 * always locked coming in here, so we get the desired
2705 * exclusion. 2701 * exclusion.
2706 */ 2702 */
2707 memcg = mem_cgroup_begin_page_stat(page);
2708 wb = unlocked_inode_to_wb_begin(inode, &locked); 2703 wb = unlocked_inode_to_wb_begin(inode, &locked);
2709 if (TestClearPageDirty(page)) { 2704 if (TestClearPageDirty(page)) {
2710 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2705 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2711 dec_zone_page_state(page, NR_FILE_DIRTY); 2706 dec_zone_page_state(page, NR_FILE_DIRTY);
2712 dec_wb_stat(wb, WB_RECLAIMABLE); 2707 dec_wb_stat(wb, WB_RECLAIMABLE);
2713 ret = 1; 2708 ret = 1;
2714 } 2709 }
2715 unlocked_inode_to_wb_end(inode, locked); 2710 unlocked_inode_to_wb_end(inode, locked);
2716 mem_cgroup_end_page_stat(memcg);
2717 return ret; 2711 return ret;
2718 } 2712 }
2719 return TestClearPageDirty(page); 2713 return TestClearPageDirty(page);
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2723int test_clear_page_writeback(struct page *page) 2717int test_clear_page_writeback(struct page *page)
2724{ 2718{
2725 struct address_space *mapping = page_mapping(page); 2719 struct address_space *mapping = page_mapping(page);
2726 struct mem_cgroup *memcg;
2727 int ret; 2720 int ret;
2728 2721
2729 memcg = mem_cgroup_begin_page_stat(page); 2722 lock_page_memcg(page);
2730 if (mapping) { 2723 if (mapping) {
2731 struct inode *inode = mapping->host; 2724 struct inode *inode = mapping->host;
2732 struct backing_dev_info *bdi = inode_to_bdi(inode); 2725 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
2750 ret = TestClearPageWriteback(page); 2743 ret = TestClearPageWriteback(page);
2751 } 2744 }
2752 if (ret) { 2745 if (ret) {
2753 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2746 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2754 dec_zone_page_state(page, NR_WRITEBACK); 2747 dec_zone_page_state(page, NR_WRITEBACK);
2755 inc_zone_page_state(page, NR_WRITTEN); 2748 inc_zone_page_state(page, NR_WRITTEN);
2756 } 2749 }
2757 mem_cgroup_end_page_stat(memcg); 2750 unlock_page_memcg(page);
2758 return ret; 2751 return ret;
2759} 2752}
2760 2753
2761int __test_set_page_writeback(struct page *page, bool keep_write) 2754int __test_set_page_writeback(struct page *page, bool keep_write)
2762{ 2755{
2763 struct address_space *mapping = page_mapping(page); 2756 struct address_space *mapping = page_mapping(page);
2764 struct mem_cgroup *memcg;
2765 int ret; 2757 int ret;
2766 2758
2767 memcg = mem_cgroup_begin_page_stat(page); 2759 lock_page_memcg(page);
2768 if (mapping) { 2760 if (mapping) {
2769 struct inode *inode = mapping->host; 2761 struct inode *inode = mapping->host;
2770 struct backing_dev_info *bdi = inode_to_bdi(inode); 2762 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2792 ret = TestSetPageWriteback(page); 2784 ret = TestSetPageWriteback(page);
2793 } 2785 }
2794 if (!ret) { 2786 if (!ret) {
2795 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2787 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2796 inc_zone_page_state(page, NR_WRITEBACK); 2788 inc_zone_page_state(page, NR_WRITEBACK);
2797 } 2789 }
2798 mem_cgroup_end_page_stat(memcg); 2790 unlock_page_memcg(page);
2799 return ret; 2791 return ret;
2800 2792
2801} 2793}
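The hunks above are part of the memcg locking conversion that runs through this series: the mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() pair returning a memcg pointer becomes lock_page_memcg()/unlock_page_memcg() keyed on the page, and the per-page stat helpers now take the page directly. A minimal sketch of the resulting pattern (the function name is illustrative and the usual mm headers are assumed; this is not code from the patch):

static int clear_dirty_sketch(struct page *page, struct bdi_writeback *wb)
{
	int ret = 0;

	lock_page_memcg(page);		/* was: memcg = mem_cgroup_begin_page_stat(page) */
	if (TestClearPageDirty(page)) {
		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_wb_stat(wb, WB_RECLAIMABLE);
		ret = 1;
	}
	unlock_page_memcg(page);	/* was: mem_cgroup_end_page_stat(memcg) */
	return ret;
}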
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..c46b75d14b6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
223#endif 223#endif
224}; 224};
225 225
226char * const migratetype_names[MIGRATE_TYPES] = {
227 "Unmovable",
228 "Movable",
229 "Reclaimable",
230 "HighAtomic",
231#ifdef CONFIG_CMA
232 "CMA",
233#endif
234#ifdef CONFIG_MEMORY_ISOLATION
235 "Isolate",
236#endif
237};
238
226compound_page_dtor * const compound_page_dtors[] = { 239compound_page_dtor * const compound_page_dtors[] = {
227 NULL, 240 NULL,
228 free_compound_page, 241 free_compound_page,
@@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
247static unsigned long __initdata required_kernelcore; 260static unsigned long __initdata required_kernelcore;
248static unsigned long __initdata required_movablecore; 261static unsigned long __initdata required_movablecore;
249static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 262static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
263static bool mirrored_kernelcore;
250 264
251/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 265/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
252int movable_zone; 266int movable_zone;
@@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason,
416 goto out; 430 goto out;
417 } 431 }
418 if (nr_unshown) { 432 if (nr_unshown) {
419 printk(KERN_ALERT 433 pr_alert(
420 "BUG: Bad page state: %lu messages suppressed\n", 434 "BUG: Bad page state: %lu messages suppressed\n",
421 nr_unshown); 435 nr_unshown);
422 nr_unshown = 0; 436 nr_unshown = 0;
@@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason,
426 if (nr_shown++ == 0) 440 if (nr_shown++ == 0)
427 resume = jiffies + 60 * HZ; 441 resume = jiffies + 60 * HZ;
428 442
429 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 443 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
430 current->comm, page_to_pfn(page)); 444 current->comm, page_to_pfn(page));
431 dump_page_badflags(page, reason, bad_flags); 445 __dump_page(page, reason);
446 bad_flags &= page->flags;
447 if (bad_flags)
448 pr_alert("bad because of flags: %#lx(%pGp)\n",
449 bad_flags, &bad_flags);
450 dump_page_owner(page);
432 451
433 print_modules(); 452 print_modules();
434 dump_stack(); 453 dump_stack();
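bad_page() now goes through __dump_page() plus the new %pGp printk extension, and %pGg is used further down for gfp masks (both appear in the Documentation/printk-formats.txt update in this series). A two-line sketch of the specifiers, wrapped in an illustrative helper:

static void dump_flags_sketch(struct page *page, gfp_t gfp_mask)
{
	/* Both %pG* specifiers take a pointer to the flags value and decode it symbolically. */
	pr_alert("page flags: %#lx(%pGp)\n", page->flags, &page->flags);
	pr_alert("gfp mask: %#x(%pGg)\n", gfp_mask, &gfp_mask);
}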
@@ -477,7 +496,8 @@ void prep_compound_page(struct page *page, unsigned int order)
477 496
478#ifdef CONFIG_DEBUG_PAGEALLOC 497#ifdef CONFIG_DEBUG_PAGEALLOC
479unsigned int _debug_guardpage_minorder; 498unsigned int _debug_guardpage_minorder;
480bool _debug_pagealloc_enabled __read_mostly; 499bool _debug_pagealloc_enabled __read_mostly
500 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
481bool _debug_guardpage_enabled __read_mostly; 501bool _debug_guardpage_enabled __read_mostly;
482 502
483static int __init early_debug_pagealloc(char *buf) 503static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf)
488 if (strcmp(buf, "on") == 0) 508 if (strcmp(buf, "on") == 0)
489 _debug_pagealloc_enabled = true; 509 _debug_pagealloc_enabled = true;
490 510
511 if (strcmp(buf, "off") == 0)
512 _debug_pagealloc_enabled = false;
513
491 return 0; 514 return 0;
492} 515}
493early_param("debug_pagealloc", early_debug_pagealloc); 516early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1002,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
1002 PAGE_SIZE << order); 1025 PAGE_SIZE << order);
1003 } 1026 }
1004 arch_free_page(page, order); 1027 arch_free_page(page, order);
1028 kernel_poison_pages(page, 1 << order, 0);
1005 kernel_map_pages(page, 1 << order, 0); 1029 kernel_map_pages(page, 1 << order, 0);
1006 1030
1007 return true; 1031 return true;
@@ -1104,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1104 return __free_pages_boot_core(page, pfn, order); 1128 return __free_pages_boot_core(page, pfn, order);
1105} 1129}
1106 1130
1131/*
1132 * Check that the whole (or subset of) a pageblock given by the interval of
1133 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1134 * with the migration or free compaction scanner. The scanners then need to
1135 * use only pfn_valid_within() check for arches that allow holes within
1136 * pageblocks.
1137 *
1138 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1139 *
1140 * It's possible on some configurations to have a setup like node0 node1 node0
1141 * i.e. it's possible that all pages within a zone's range of pages do not
1142 * belong to a single zone. We assume that a border between node0 and node1
1143 * can occur within a single pageblock, but not a node0 node1 node0
1144 * interleaving within a single pageblock. It is therefore sufficient to check
1145 * the first and last page of a pageblock and avoid checking each individual
1146 * page in a pageblock.
1147 */
1148struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1149 unsigned long end_pfn, struct zone *zone)
1150{
1151 struct page *start_page;
1152 struct page *end_page;
1153
1154 /* end_pfn is one past the range we are checking */
1155 end_pfn--;
1156
1157 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1158 return NULL;
1159
1160 start_page = pfn_to_page(start_pfn);
1161
1162 if (page_zone(start_page) != zone)
1163 return NULL;
1164
1165 end_page = pfn_to_page(end_pfn);
1166
1167 /* This gives a shorter code than deriving page_zone(end_page) */
1168 if (page_zone_id(start_page) != page_zone_id(end_page))
1169 return NULL;
1170
1171 return start_page;
1172}
1173
1174void set_zone_contiguous(struct zone *zone)
1175{
1176 unsigned long block_start_pfn = zone->zone_start_pfn;
1177 unsigned long block_end_pfn;
1178
1179 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1180 for (; block_start_pfn < zone_end_pfn(zone);
1181 block_start_pfn = block_end_pfn,
1182 block_end_pfn += pageblock_nr_pages) {
1183
1184 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1185
1186 if (!__pageblock_pfn_to_page(block_start_pfn,
1187 block_end_pfn, zone))
1188 return;
1189 }
1190
1191 /* We confirm that there is no hole */
1192 zone->contiguous = true;
1193}
1194
1195void clear_zone_contiguous(struct zone *zone)
1196{
1197 zone->contiguous = false;
1198}
1199
1107#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1200#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1108static void __init deferred_free_range(struct page *page, 1201static void __init deferred_free_range(struct page *page,
1109 unsigned long pfn, int nr_pages) 1202 unsigned long pfn, int nr_pages)
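set_zone_contiguous() records at init time that no pageblock in the zone straddles a hole or a zone boundary, so hot-path callers can skip the pfn_valid()/zone checks done by __pageblock_pfn_to_page(). A hedged sketch of the intended fast path (the wrapper name is an assumption; the real caller lives in the compaction code changed elsewhere in this series):

static struct page *pageblock_pfn_to_page_sketch(unsigned long start_pfn,
						 unsigned long end_pfn,
						 struct zone *zone)
{
	if (zone->contiguous)
		/* Zone verified hole-free at init: a plain pfn_to_page() is enough. */
		return pfn_to_page(start_pfn);

	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}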
@@ -1254,9 +1347,13 @@ free_range:
1254 pgdat_init_report_one_done(); 1347 pgdat_init_report_one_done();
1255 return 0; 1348 return 0;
1256} 1349}
1350#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1257 1351
1258void __init page_alloc_init_late(void) 1352void __init page_alloc_init_late(void)
1259{ 1353{
1354 struct zone *zone;
1355
1356#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1260 int nid; 1357 int nid;
1261 1358
1262 /* There will be num_node_state(N_MEMORY) threads */ 1359 /* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1367,11 @@ void __init page_alloc_init_late(void)
1270 1367
1271 /* Reinit limits that are based on free pages after the kernel is up */ 1368 /* Reinit limits that are based on free pages after the kernel is up */
1272 files_maxfiles_init(); 1369 files_maxfiles_init();
1370#endif
1371
1372 for_each_populated_zone(zone)
1373 set_zone_contiguous(zone);
1273} 1374}
1274#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1275 1375
1276#ifdef CONFIG_CMA 1376#ifdef CONFIG_CMA
1277/* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1377/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1381,15 +1481,24 @@ static inline int check_new_page(struct page *page)
1381 return 0; 1481 return 0;
1382} 1482}
1383 1483
1484static inline bool free_pages_prezeroed(bool poisoned)
1485{
1486 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1487 page_poisoning_enabled() && poisoned;
1488}
1489
1384static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1490static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1385 int alloc_flags) 1491 int alloc_flags)
1386{ 1492{
1387 int i; 1493 int i;
1494 bool poisoned = true;
1388 1495
1389 for (i = 0; i < (1 << order); i++) { 1496 for (i = 0; i < (1 << order); i++) {
1390 struct page *p = page + i; 1497 struct page *p = page + i;
1391 if (unlikely(check_new_page(p))) 1498 if (unlikely(check_new_page(p)))
1392 return 1; 1499 return 1;
1500 if (poisoned)
1501 poisoned &= page_is_poisoned(p);
1393 } 1502 }
1394 1503
1395 set_page_private(page, 0); 1504 set_page_private(page, 0);
@@ -1397,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1397 1506
1398 arch_alloc_page(page, order); 1507 arch_alloc_page(page, order);
1399 kernel_map_pages(page, 1 << order, 1); 1508 kernel_map_pages(page, 1 << order, 1);
1509 kernel_poison_pages(page, 1 << order, 1);
1400 kasan_alloc_pages(page, order); 1510 kasan_alloc_pages(page, order);
1401 1511
1402 if (gfp_flags & __GFP_ZERO) 1512 if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
1403 for (i = 0; i < (1 << order); i++) 1513 for (i = 0; i < (1 << order); i++)
1404 clear_highpage(page + i); 1514 clear_highpage(page + i);
1405 1515
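The prep_new_page() change ties into the page poisoning rework further down: pages are poisoned on free via kernel_poison_pages(), and when CONFIG_PAGE_POISONING_ZERO makes the poison pattern all-zeroes, a page whose poison is still intact need not be cleared again for __GFP_ZERO. A condensed sketch of that decision (helper name is hypothetical, surrounding allocator code elided):

static bool want_explicit_zeroing(gfp_t gfp_flags, bool poisoned)
{
	/* Zero-poisoning already guarantees zeroed contents, so skip clear_highpage(). */
	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
	    page_poisoning_enabled() && poisoned)
		return false;

	return gfp_flags & __GFP_ZERO;
}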
@@ -2690,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
2690 va_end(args); 2800 va_end(args);
2691 } 2801 }
2692 2802
2693 pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", 2803 pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
2694 current->comm, order, gfp_mask); 2804 current->comm, order, gfp_mask, &gfp_mask);
2695
2696 dump_stack(); 2805 dump_stack();
2697 if (!should_suppress_show_mem()) 2806 if (!should_suppress_show_mem())
2698 show_mem(filter); 2807 show_mem(filter);
@@ -4491,6 +4600,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4491 pg_data_t *pgdat = NODE_DATA(nid); 4600 pg_data_t *pgdat = NODE_DATA(nid);
4492 unsigned long pfn; 4601 unsigned long pfn;
4493 unsigned long nr_initialised = 0; 4602 unsigned long nr_initialised = 0;
4603#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4604 struct memblock_region *r = NULL, *tmp;
4605#endif
4494 4606
4495 if (highest_memmap_pfn < end_pfn - 1) 4607 if (highest_memmap_pfn < end_pfn - 1)
4496 highest_memmap_pfn = end_pfn - 1; 4608 highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4616,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4504 4616
4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4617 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4506 /* 4618 /*
4507 * There can be holes in boot-time mem_map[]s 4619 * There can be holes in boot-time mem_map[]s handed to this
4508 * handed to this function. They do not 4620 * function. They do not exist on hotplugged memory.
4509 * exist on hotplugged memory.
4510 */ 4621 */
4511 if (context == MEMMAP_EARLY) { 4622 if (context != MEMMAP_EARLY)
4512 if (!early_pfn_valid(pfn)) 4623 goto not_early;
4624
4625 if (!early_pfn_valid(pfn))
4626 continue;
4627 if (!early_pfn_in_nid(pfn, nid))
4628 continue;
4629 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
4630 break;
4631
4632#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4633 /*
4634 * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
4635 * from zone_movable_pfn[nid] to end of each node should be
4636 * ZONE_MOVABLE not ZONE_NORMAL. skip it.
4637 */
4638 if (!mirrored_kernelcore && zone_movable_pfn[nid])
4639 if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
4513 continue; 4640 continue;
4514 if (!early_pfn_in_nid(pfn, nid)) 4641
4642 /*
4643 * Check given memblock attribute by firmware which can affect
4644 * kernel memory layout. If zone==ZONE_MOVABLE but memory is
4645 * mirrored, it's an overlapped memmap init. skip it.
4646 */
4647 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
4648 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
4649 for_each_memblock(memory, tmp)
4650 if (pfn < memblock_region_memory_end_pfn(tmp))
4651 break;
4652 r = tmp;
4653 }
4654 if (pfn >= memblock_region_memory_base_pfn(r) &&
4655 memblock_is_mirror(r)) {
4656 /* already initialized as NORMAL */
4657 pfn = memblock_region_memory_end_pfn(r);
4515 continue; 4658 continue;
4516 if (!update_defer_init(pgdat, pfn, end_pfn, 4659 }
4517 &nr_initialised))
4518 break;
4519 } 4660 }
4661#endif
4520 4662
4663not_early:
4521 /* 4664 /*
4522 * Mark the block movable so that blocks are reserved for 4665 * Mark the block movable so that blocks are reserved for
4523 * movable at startup. This will force kernel allocations 4666 * movable at startup. This will force kernel allocations
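The restructured MEMMAP_EARLY path above adds two skip rules on top of the existing early_pfn checks; summarised here as a comment only, paraphrasing the hunk rather than adding code:

/*
 *  - kernelcore=mirror not set: while initialising ZONE_NORMAL, pfns at or
 *    above zone_movable_pfn[nid] are skipped -- they belong to ZONE_MOVABLE.
 *  - kernelcore=mirror set: while initialising ZONE_MOVABLE, pfns inside a
 *    mirrored memblock region are skipped -- they were already initialised
 *    as ZONE_NORMAL, so touching them again would be an overlapping init.
 */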
@@ -4934,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4934 *zone_end_pfn = min(node_end_pfn, 5077 *zone_end_pfn = min(node_end_pfn,
4935 arch_zone_highest_possible_pfn[movable_zone]); 5078 arch_zone_highest_possible_pfn[movable_zone]);
4936 5079
4937 /* Adjust for ZONE_MOVABLE starting within this range */
4938 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4939 *zone_end_pfn > zone_movable_pfn[nid]) {
4940 *zone_end_pfn = zone_movable_pfn[nid];
4941
4942 /* Check if this whole range is within ZONE_MOVABLE */ 5080 /* Check if this whole range is within ZONE_MOVABLE */
4943 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5081 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4944 *zone_start_pfn = *zone_end_pfn; 5082 *zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4953 unsigned long zone_type, 5091 unsigned long zone_type,
4954 unsigned long node_start_pfn, 5092 unsigned long node_start_pfn,
4955 unsigned long node_end_pfn, 5093 unsigned long node_end_pfn,
5094 unsigned long *zone_start_pfn,
5095 unsigned long *zone_end_pfn,
4956 unsigned long *ignored) 5096 unsigned long *ignored)
4957{ 5097{
4958 unsigned long zone_start_pfn, zone_end_pfn;
4959
4960 /* When hotadd a new node from cpu_up(), the node should be empty */ 5098 /* When hotadd a new node from cpu_up(), the node should be empty */
4961 if (!node_start_pfn && !node_end_pfn) 5099 if (!node_start_pfn && !node_end_pfn)
4962 return 0; 5100 return 0;
4963 5101
4964 /* Get the start and end of the zone */ 5102 /* Get the start and end of the zone */
4965 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5103 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4966 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5104 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4967 adjust_zone_range_for_zone_movable(nid, zone_type, 5105 adjust_zone_range_for_zone_movable(nid, zone_type,
4968 node_start_pfn, node_end_pfn, 5106 node_start_pfn, node_end_pfn,
4969 &zone_start_pfn, &zone_end_pfn); 5107 zone_start_pfn, zone_end_pfn);
4970 5108
4971 /* Check that this node has pages within the zone's required range */ 5109 /* Check that this node has pages within the zone's required range */
4972 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 5110 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
4973 return 0; 5111 return 0;
4974 5112
4975 /* Move the zone boundaries inside the node if necessary */ 5113 /* Move the zone boundaries inside the node if necessary */
4976 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 5114 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
4977 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 5115 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
4978 5116
4979 /* Return the spanned pages */ 5117 /* Return the spanned pages */
4980 return zone_end_pfn - zone_start_pfn; 5118 return *zone_end_pfn - *zone_start_pfn;
4981} 5119}
4982 5120
4983/* 5121/*
@@ -5023,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5023 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5161 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5024 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5162 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5025 unsigned long zone_start_pfn, zone_end_pfn; 5163 unsigned long zone_start_pfn, zone_end_pfn;
5164 unsigned long nr_absent;
5026 5165
5027 /* When hotadd a new node from cpu_up(), the node should be empty */ 5166 /* When hotadd a new node from cpu_up(), the node should be empty */
5028 if (!node_start_pfn && !node_end_pfn) 5167 if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5034 adjust_zone_range_for_zone_movable(nid, zone_type, 5173 adjust_zone_range_for_zone_movable(nid, zone_type,
5035 node_start_pfn, node_end_pfn, 5174 node_start_pfn, node_end_pfn,
5036 &zone_start_pfn, &zone_end_pfn); 5175 &zone_start_pfn, &zone_end_pfn);
5037 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5176 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5177
5178 /*
5179 * ZONE_MOVABLE handling.
5180 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5181 * and vice versa.
5182 */
5183 if (zone_movable_pfn[nid]) {
5184 if (mirrored_kernelcore) {
5185 unsigned long start_pfn, end_pfn;
5186 struct memblock_region *r;
5187
5188 for_each_memblock(memory, r) {
5189 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5190 zone_start_pfn, zone_end_pfn);
5191 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5192 zone_start_pfn, zone_end_pfn);
5193
5194 if (zone_type == ZONE_MOVABLE &&
5195 memblock_is_mirror(r))
5196 nr_absent += end_pfn - start_pfn;
5197
5198 if (zone_type == ZONE_NORMAL &&
5199 !memblock_is_mirror(r))
5200 nr_absent += end_pfn - start_pfn;
5201 }
5202 } else {
5203 if (zone_type == ZONE_NORMAL)
5204 nr_absent += node_end_pfn - zone_movable_pfn[nid];
5205 }
5206 }
5207
5208 return nr_absent;
5038} 5209}
5039 5210
5040#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5211#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5042 unsigned long zone_type, 5213 unsigned long zone_type,
5043 unsigned long node_start_pfn, 5214 unsigned long node_start_pfn,
5044 unsigned long node_end_pfn, 5215 unsigned long node_end_pfn,
5216 unsigned long *zone_start_pfn,
5217 unsigned long *zone_end_pfn,
5045 unsigned long *zones_size) 5218 unsigned long *zones_size)
5046{ 5219{
5220 unsigned int zone;
5221
5222 *zone_start_pfn = node_start_pfn;
5223 for (zone = 0; zone < zone_type; zone++)
5224 *zone_start_pfn += zones_size[zone];
5225
5226 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5227
5047 return zones_size[zone_type]; 5228 return zones_size[zone_type];
5048} 5229}
5049 5230
@@ -5072,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5072 5253
5073 for (i = 0; i < MAX_NR_ZONES; i++) { 5254 for (i = 0; i < MAX_NR_ZONES; i++) {
5074 struct zone *zone = pgdat->node_zones + i; 5255 struct zone *zone = pgdat->node_zones + i;
5256 unsigned long zone_start_pfn, zone_end_pfn;
5075 unsigned long size, real_size; 5257 unsigned long size, real_size;
5076 5258
5077 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5259 size = zone_spanned_pages_in_node(pgdat->node_id, i,
5078 node_start_pfn, 5260 node_start_pfn,
5079 node_end_pfn, 5261 node_end_pfn,
5262 &zone_start_pfn,
5263 &zone_end_pfn,
5080 zones_size); 5264 zones_size);
5081 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5265 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5082 node_start_pfn, node_end_pfn, 5266 node_start_pfn, node_end_pfn,
5083 zholes_size); 5267 zholes_size);
5268 if (size)
5269 zone->zone_start_pfn = zone_start_pfn;
5270 else
5271 zone->zone_start_pfn = 0;
5084 zone->spanned_pages = size; 5272 zone->spanned_pages = size;
5085 zone->present_pages = real_size; 5273 zone->present_pages = real_size;
5086 5274
@@ -5201,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5201{ 5389{
5202 enum zone_type j; 5390 enum zone_type j;
5203 int nid = pgdat->node_id; 5391 int nid = pgdat->node_id;
5204 unsigned long zone_start_pfn = pgdat->node_start_pfn;
5205 int ret; 5392 int ret;
5206 5393
5207 pgdat_resize_init(pgdat); 5394 pgdat_resize_init(pgdat);
@@ -5222,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5222 for (j = 0; j < MAX_NR_ZONES; j++) { 5409 for (j = 0; j < MAX_NR_ZONES; j++) {
5223 struct zone *zone = pgdat->node_zones + j; 5410 struct zone *zone = pgdat->node_zones + j;
5224 unsigned long size, realsize, freesize, memmap_pages; 5411 unsigned long size, realsize, freesize, memmap_pages;
5412 unsigned long zone_start_pfn = zone->zone_start_pfn;
5225 5413
5226 size = zone->spanned_pages; 5414 size = zone->spanned_pages;
5227 realsize = freesize = zone->present_pages; 5415 realsize = freesize = zone->present_pages;
@@ -5290,7 +5478,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5290 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 5478 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
5291 BUG_ON(ret); 5479 BUG_ON(ret);
5292 memmap_init(size, nid, j, zone_start_pfn); 5480 memmap_init(size, nid, j, zone_start_pfn);
5293 zone_start_pfn += size;
5294 } 5481 }
5295} 5482}
5296 5483
@@ -5358,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5358 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5545 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5359 (u64)start_pfn << PAGE_SHIFT, 5546 (u64)start_pfn << PAGE_SHIFT,
5360 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 5547 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5548#else
5549 start_pfn = node_start_pfn;
5361#endif 5550#endif
5362 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5551 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5363 zones_size, zholes_size); 5552 zones_size, zholes_size);
@@ -5529,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5529 } 5718 }
5530 5719
5531 /* 5720 /*
5721 * If kernelcore=mirror is specified, ignore movablecore option
5722 */
5723 if (mirrored_kernelcore) {
5724 bool mem_below_4gb_not_mirrored = false;
5725
5726 for_each_memblock(memory, r) {
5727 if (memblock_is_mirror(r))
5728 continue;
5729
5730 nid = r->nid;
5731
5732 usable_startpfn = memblock_region_memory_base_pfn(r);
5733
5734 if (usable_startpfn < 0x100000) {
5735 mem_below_4gb_not_mirrored = true;
5736 continue;
5737 }
5738
5739 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5740 min(usable_startpfn, zone_movable_pfn[nid]) :
5741 usable_startpfn;
5742 }
5743
5744 if (mem_below_4gb_not_mirrored)
5745 pr_warn("This configuration results in unmirrored kernel memory.\n");
5746
5747 goto out2;
5748 }
5749
5750 /*
5532 * If movablecore=nn[KMG] was specified, calculate what size of 5751 * If movablecore=nn[KMG] was specified, calculate what size of
5533 * kernelcore that corresponds so that memory usable for 5752 * kernelcore that corresponds so that memory usable for
5534 * any allocation type is evenly spread. If both kernelcore 5753 * any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
5788 */ 6007 */
5789static int __init cmdline_parse_kernelcore(char *p) 6008static int __init cmdline_parse_kernelcore(char *p)
5790{ 6009{
6010 /* parse kernelcore=mirror */
6011 if (parse_option_str(p, "mirror")) {
6012 mirrored_kernelcore = true;
6013 return 0;
6014 }
6015
5791 return cmdline_parse_core(p, &required_kernelcore); 6016 return cmdline_parse_core(p, &required_kernelcore);
5792} 6017}
5793 6018
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8debd..2d864e64f7fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
106 struct page_ext *base; 106 struct page_ext *base;
107 107
108 base = NODE_DATA(page_to_nid(page))->node_page_ext; 108 base = NODE_DATA(page_to_nid(page))->node_page_ext;
109#ifdef CONFIG_DEBUG_VM 109#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
110 /* 110 /*
111 * The sanity checks the page allocator does upon freeing a 111 * The sanity checks the page allocator does upon freeing a
112 * page can reach here before the page_ext arrays are 112 * page can reach here before the page_ext arrays are
113 * allocated when feeding a range of pages to the allocator 113 * allocated when feeding a range of pages to the allocator
114 * for the first time during bootup or memory hotplug. 114 * for the first time during bootup or memory hotplug.
115 *
116 * This check is also necessary for ensuring page poisoning
117 * works as expected when enabled
115 */ 118 */
116 if (unlikely(!base)) 119 if (unlikely(!base))
117 return NULL; 120 return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
180{ 183{
181 unsigned long pfn = page_to_pfn(page); 184 unsigned long pfn = page_to_pfn(page);
182 struct mem_section *section = __pfn_to_section(pfn); 185 struct mem_section *section = __pfn_to_section(pfn);
183#ifdef CONFIG_DEBUG_VM 186#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
184 /* 187 /*
185 * The sanity checks the page allocator does upon freeing a 188 * The sanity checks the page allocator does upon freeing a
186 * page can reach here before the page_ext arrays are 189 * page can reach here before the page_ext arrays are
187 * allocated when feeding a range of pages to the allocator 190 * allocated when feeding a range of pages to the allocator
188 * for the first time during bootup or memory hotplug. 191 * for the first time during bootup or memory hotplug.
192 *
193 * This check is also necessary for ensuring page poisoning
194 * works as expected when enabled
189 */ 195 */
190 if (!section->page_ext) 196 if (!section->page_ext)
191 return NULL; 197 return NULL;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..44ad1f00c4e1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
5#include <linux/bootmem.h> 5#include <linux/bootmem.h>
6#include <linux/stacktrace.h> 6#include <linux/stacktrace.h>
7#include <linux/page_owner.h> 7#include <linux/page_owner.h>
8#include <linux/jump_label.h>
9#include <linux/migrate.h>
8#include "internal.h" 10#include "internal.h"
9 11
10static bool page_owner_disabled = true; 12static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly; 13DEFINE_STATIC_KEY_FALSE(page_owner_inited);
12 14
13static void init_early_allocated_pages(void); 15static void init_early_allocated_pages(void);
14 16
@@ -37,7 +39,7 @@ static void init_page_owner(void)
37 if (page_owner_disabled) 39 if (page_owner_disabled)
38 return; 40 return;
39 41
40 page_owner_inited = true; 42 static_branch_enable(&page_owner_inited);
41 init_early_allocated_pages(); 43 init_early_allocated_pages();
42} 44}
43 45
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
72 page_ext->order = order; 74 page_ext->order = order;
73 page_ext->gfp_mask = gfp_mask; 75 page_ext->gfp_mask = gfp_mask;
74 page_ext->nr_entries = trace.nr_entries; 76 page_ext->nr_entries = trace.nr_entries;
77 page_ext->last_migrate_reason = -1;
75 78
76 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 79 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
77} 80}
78 81
82void __set_page_owner_migrate_reason(struct page *page, int reason)
83{
84 struct page_ext *page_ext = lookup_page_ext(page);
85
86 page_ext->last_migrate_reason = reason;
87}
88
79gfp_t __get_page_owner_gfp(struct page *page) 89gfp_t __get_page_owner_gfp(struct page *page)
80{ 90{
81 struct page_ext *page_ext = lookup_page_ext(page); 91 struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
83 return page_ext->gfp_mask; 93 return page_ext->gfp_mask;
84} 94}
85 95
96void __copy_page_owner(struct page *oldpage, struct page *newpage)
97{
98 struct page_ext *old_ext = lookup_page_ext(oldpage);
99 struct page_ext *new_ext = lookup_page_ext(newpage);
100 int i;
101
102 new_ext->order = old_ext->order;
103 new_ext->gfp_mask = old_ext->gfp_mask;
104 new_ext->nr_entries = old_ext->nr_entries;
105
106 for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
107 new_ext->trace_entries[i] = old_ext->trace_entries[i];
108
109 /*
110 * We don't clear the bit on the oldpage as it's going to be freed
111 * after migration. Until then, the info can be useful in case of
112 * a bug, and the overall stats will be off a bit only temporarily.
113 * Also, migrate_misplaced_transhuge_page() can still fail the
114 * migration and then we want the oldpage to retain the info. But
115 * in that case we also don't need to explicitly clear the info from
116 * the new page, which will be freed.
117 */
118 __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
119}
120
86static ssize_t 121static ssize_t
87print_page_owner(char __user *buf, size_t count, unsigned long pfn, 122print_page_owner(char __user *buf, size_t count, unsigned long pfn,
88 struct page *page, struct page_ext *page_ext) 123 struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
100 return -ENOMEM; 135 return -ENOMEM;
101 136
102 ret = snprintf(kbuf, count, 137 ret = snprintf(kbuf, count,
103 "Page allocated via order %u, mask 0x%x\n", 138 "Page allocated via order %u, mask %#x(%pGg)\n",
104 page_ext->order, page_ext->gfp_mask); 139 page_ext->order, page_ext->gfp_mask,
140 &page_ext->gfp_mask);
105 141
106 if (ret >= count) 142 if (ret >= count)
107 goto err; 143 goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
110 pageblock_mt = get_pfnblock_migratetype(page, pfn); 146 pageblock_mt = get_pfnblock_migratetype(page, pfn);
111 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); 147 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
112 ret += snprintf(kbuf + ret, count - ret, 148 ret += snprintf(kbuf + ret, count - ret,
113 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", 149 "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
114 pfn, 150 pfn,
151 migratetype_names[page_mt],
115 pfn >> pageblock_order, 152 pfn >> pageblock_order,
116 pageblock_mt, 153 migratetype_names[pageblock_mt],
117 pageblock_mt != page_mt ? "Fallback" : " ", 154 page->flags, &page->flags);
118 PageLocked(page) ? "K" : " ",
119 PageError(page) ? "E" : " ",
120 PageReferenced(page) ? "R" : " ",
121 PageUptodate(page) ? "U" : " ",
122 PageDirty(page) ? "D" : " ",
123 PageLRU(page) ? "L" : " ",
124 PageActive(page) ? "A" : " ",
125 PageSlab(page) ? "S" : " ",
126 PageWriteback(page) ? "W" : " ",
127 PageCompound(page) ? "C" : " ",
128 PageSwapCache(page) ? "B" : " ",
129 PageMappedToDisk(page) ? "M" : " ");
130 155
131 if (ret >= count) 156 if (ret >= count)
132 goto err; 157 goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
135 if (ret >= count) 160 if (ret >= count)
136 goto err; 161 goto err;
137 162
163 if (page_ext->last_migrate_reason != -1) {
164 ret += snprintf(kbuf + ret, count - ret,
165 "Page has been migrated, last migrate reason: %s\n",
166 migrate_reason_names[page_ext->last_migrate_reason]);
167 if (ret >= count)
168 goto err;
169 }
170
138 ret += snprintf(kbuf + ret, count - ret, "\n"); 171 ret += snprintf(kbuf + ret, count - ret, "\n");
139 if (ret >= count) 172 if (ret >= count)
140 goto err; 173 goto err;
@@ -150,6 +183,31 @@ err:
150 return -ENOMEM; 183 return -ENOMEM;
151} 184}
152 185
186void __dump_page_owner(struct page *page)
187{
188 struct page_ext *page_ext = lookup_page_ext(page);
189 struct stack_trace trace = {
190 .nr_entries = page_ext->nr_entries,
191 .entries = &page_ext->trace_entries[0],
192 };
193 gfp_t gfp_mask = page_ext->gfp_mask;
194 int mt = gfpflags_to_migratetype(gfp_mask);
195
196 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
197 pr_alert("page_owner info is not active (free page?)\n");
198 return;
199 }
200
201 pr_alert("page allocated via order %u, migratetype %s, "
202 "gfp_mask %#x(%pGg)\n", page_ext->order,
203 migratetype_names[mt], gfp_mask, &gfp_mask);
204 print_stack_trace(&trace, 0);
205
206 if (page_ext->last_migrate_reason != -1)
207 pr_alert("page has been migrated, last migrate reason: %s\n",
208 migrate_reason_names[page_ext->last_migrate_reason]);
209}
210
153static ssize_t 211static ssize_t
154read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) 212read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
155{ 213{
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
157 struct page *page; 215 struct page *page;
158 struct page_ext *page_ext; 216 struct page_ext *page_ext;
159 217
160 if (!page_owner_inited) 218 if (!static_branch_unlikely(&page_owner_inited))
161 return -EINVAL; 219 return -EINVAL;
162 220
163 page = NULL; 221 page = NULL;
@@ -305,7 +363,7 @@ static int __init pageowner_init(void)
305{ 363{
306 struct dentry *dentry; 364 struct dentry *dentry;
307 365
308 if (!page_owner_inited) { 366 if (!static_branch_unlikely(&page_owner_inited)) {
309 pr_info("page_owner is disabled\n"); 367 pr_info("page_owner is disabled\n");
310 return 0; 368 return 0;
311 } 369 }
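page_owner_inited is converted from a plain bool to a static key, so the checks in the read and dump paths compile to a runtime-patched no-op branch instead of a load-and-test. A minimal sketch of the pattern, using an illustrative key name rather than the one in the patch:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(sketch_inited);

static void sketch_init(void)
{
	/* Patches the branch sites; until then static_branch_unlikely() stays false. */
	static_branch_enable(&sketch_inited);
}

static int sketch_read(void)
{
	if (!static_branch_unlikely(&sketch_inited))
		return -EINVAL;
	return 0;
}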
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce13b..479e7ea2bea6 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
6#include <linux/poison.h> 6#include <linux/poison.h>
7#include <linux/ratelimit.h> 7#include <linux/ratelimit.h>
8 8
9static bool page_poisoning_enabled __read_mostly; 9static bool __page_poisoning_enabled __read_mostly;
10static bool want_page_poisoning __read_mostly;
10 11
11static bool need_page_poisoning(void) 12static int early_page_poison_param(char *buf)
12{ 13{
13 if (!debug_pagealloc_enabled()) 14 if (!buf)
14 return false; 15 return -EINVAL;
16
17 if (strcmp(buf, "on") == 0)
18 want_page_poisoning = true;
19 else if (strcmp(buf, "off") == 0)
20 want_page_poisoning = false;
15 21
16 return true; 22 return 0;
23}
24early_param("page_poison", early_page_poison_param);
25
26bool page_poisoning_enabled(void)
27{
28 return __page_poisoning_enabled;
29}
30
31static bool need_page_poisoning(void)
32{
33 return want_page_poisoning;
17} 34}
18 35
19static void init_page_poisoning(void) 36static void init_page_poisoning(void)
20{ 37{
21 if (!debug_pagealloc_enabled()) 38 /*
22 return; 39 * page poisoning is debug page alloc for some arches. If either
40 * of those options is enabled, enable poisoning
41 */
42 if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
43 if (!want_page_poisoning && !debug_pagealloc_enabled())
44 return;
45 } else {
46 if (!want_page_poisoning)
47 return;
48 }
23 49
24 page_poisoning_enabled = true; 50 __page_poisoning_enabled = true;
25} 51}
26 52
27struct page_ext_operations page_poisoning_ops = { 53struct page_ext_operations page_poisoning_ops = {
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
45 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 71 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
46} 72}
47 73
48static inline bool page_poison(struct page *page) 74bool page_is_poisoned(struct page *page)
49{ 75{
50 struct page_ext *page_ext; 76 struct page_ext *page_ext;
51 77
52 page_ext = lookup_page_ext(page); 78 page_ext = lookup_page_ext(page);
79 if (!page_ext)
80 return false;
81
53 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 82 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
54} 83}
55 84
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
83 unsigned char *start; 112 unsigned char *start;
84 unsigned char *end; 113 unsigned char *end;
85 114
115 if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
116 return;
117
86 start = memchr_inv(mem, PAGE_POISON, bytes); 118 start = memchr_inv(mem, PAGE_POISON, bytes);
87 if (!start) 119 if (!start)
88 return; 120 return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
95 if (!__ratelimit(&ratelimit)) 127 if (!__ratelimit(&ratelimit))
96 return; 128 return;
97 else if (start == end && single_bit_flip(*start, PAGE_POISON)) 129 else if (start == end && single_bit_flip(*start, PAGE_POISON))
98 printk(KERN_ERR "pagealloc: single bit error\n"); 130 pr_err("pagealloc: single bit error\n");
99 else 131 else
100 printk(KERN_ERR "pagealloc: memory corruption\n"); 132 pr_err("pagealloc: memory corruption\n");
101 133
102 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, 134 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
103 end - start + 1, 1); 135 end - start + 1, 1);
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
108{ 140{
109 void *addr; 141 void *addr;
110 142
111 if (!page_poison(page)) 143 if (!page_is_poisoned(page))
112 return; 144 return;
113 145
114 addr = kmap_atomic(page); 146 addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
125 unpoison_page(page + i); 157 unpoison_page(page + i);
126} 158}
127 159
128void __kernel_map_pages(struct page *page, int numpages, int enable) 160void kernel_poison_pages(struct page *page, int numpages, int enable)
129{ 161{
130 if (!page_poisoning_enabled) 162 if (!page_poisoning_enabled())
131 return; 163 return;
132 164
133 if (enable) 165 if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
135 else 167 else
136 poison_pages(page, numpages); 168 poison_pages(page, numpages);
137} 169}
170
171#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
172void __kernel_map_pages(struct page *page, int numpages, int enable)
173{
174 /* This function does nothing, all work is done via poison pages */
175}
176#endif
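With the move from mm/debug-pagealloc.c, poisoning is no longer tied unconditionally to debug_pagealloc. The enablement logic in init_page_poisoning() above reduces to the following summary (comment only, conditions paraphrased from the hunk):

/*
 * __page_poisoning_enabled becomes true when:
 *  - CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=n: page_poison=on was passed, or
 *    debug_pagealloc is enabled (poisoning stands in for it);
 *  - CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y: only when page_poison=on, since
 *    the architecture provides real mapping-based debug_pagealloc.
 */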
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047f38..02f0bfc3c80a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page,
1287 */ 1287 */
1288void page_add_file_rmap(struct page *page) 1288void page_add_file_rmap(struct page *page)
1289{ 1289{
1290 struct mem_cgroup *memcg; 1290 lock_page_memcg(page);
1291
1292 memcg = mem_cgroup_begin_page_stat(page);
1293 if (atomic_inc_and_test(&page->_mapcount)) { 1291 if (atomic_inc_and_test(&page->_mapcount)) {
1294 __inc_zone_page_state(page, NR_FILE_MAPPED); 1292 __inc_zone_page_state(page, NR_FILE_MAPPED);
1295 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1293 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1296 } 1294 }
1297 mem_cgroup_end_page_stat(memcg); 1295 unlock_page_memcg(page);
1298} 1296}
1299 1297
1300static void page_remove_file_rmap(struct page *page) 1298static void page_remove_file_rmap(struct page *page)
1301{ 1299{
1302 struct mem_cgroup *memcg; 1300 lock_page_memcg(page);
1303
1304 memcg = mem_cgroup_begin_page_stat(page);
1305 1301
1306 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1302 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1307 if (unlikely(PageHuge(page))) { 1303 if (unlikely(PageHuge(page))) {
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page)
1320 * pte lock(a spinlock) is held, which implies preemption disabled. 1316 * pte lock(a spinlock) is held, which implies preemption disabled.
1321 */ 1317 */
1322 __dec_zone_page_state(page, NR_FILE_MAPPED); 1318 __dec_zone_page_state(page, NR_FILE_MAPPED);
1323 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1319 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1324 1320
1325 if (unlikely(PageMlocked(page))) 1321 if (unlikely(PageMlocked(page)))
1326 clear_page_mlock(page); 1322 clear_page_mlock(page);
1327out: 1323out:
1328 mem_cgroup_end_page_stat(memcg); 1324 unlock_page_memcg(page);
1329} 1325}
1330 1326
1331static void page_remove_anon_compound_rmap(struct page *page) 1327static void page_remove_anon_compound_rmap(struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7e6c1c..1acfdbc4bd9e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1116 */ 1116 */
1117 oldpage = newpage; 1117 oldpage = newpage;
1118 } else { 1118 } else {
1119 mem_cgroup_replace_page(oldpage, newpage); 1119 mem_cgroup_migrate(oldpage, newpage);
1120 lru_cache_add_anon(newpage); 1120 lru_cache_add_anon(newpage);
1121 *pagep = newpage; 1121 *pagep = newpage;
1122 } 1122 }
diff --git a/mm/slab.c b/mm/slab.c
index 621fbcb35a36..852fc5c79829 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
170 170
171/* 171/*
172 * true if a page was allocated from pfmemalloc reserves for network-based
173 * swap
174 */
175static bool pfmemalloc_active __read_mostly;
176
177/*
178 * struct array_cache 172 * struct array_cache
179 * 173 *
180 * Purpose: 174 * Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
195 * Must have this definition in here for the proper 189 * Must have this definition in here for the proper
196 * alignment of array_cache. Also simplifies accessing 190 * alignment of array_cache. Also simplifies accessing
197 * the entries. 191 * the entries.
198 *
199 * Entries should not be directly dereferenced as
200 * entries belonging to slabs marked pfmemalloc will
201 * have the lower bits set SLAB_OBJ_PFMEMALLOC
202 */ 192 */
203}; 193};
204 194
@@ -207,33 +197,6 @@ struct alien_cache {
207 struct array_cache ac; 197 struct array_cache ac;
208}; 198};
209 199
210#define SLAB_OBJ_PFMEMALLOC 1
211static inline bool is_obj_pfmemalloc(void *objp)
212{
213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
214}
215
216static inline void set_obj_pfmemalloc(void **objp)
217{
218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
219 return;
220}
221
222static inline void clear_obj_pfmemalloc(void **objp)
223{
224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
225}
226
227/*
228 * bootstrap: The caches do not work without cpuarrays anymore, but the
229 * cpuarrays are allocated from the generic caches...
230 */
231#define BOOT_CPUCACHE_ENTRIES 1
232struct arraycache_init {
233 struct array_cache cache;
234 void *entries[BOOT_CPUCACHE_ENTRIES];
235};
236
237/* 200/*
238 * Need this for bootstrapping a per node allocator. 201 * Need this for bootstrapping a per node allocator.
239 */ 202 */
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
280 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 243 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
281 } while (0) 244 } while (0)
282 245
246#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
283#define CFLGS_OFF_SLAB (0x80000000UL) 247#define CFLGS_OFF_SLAB (0x80000000UL)
248#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
284#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 249#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
285#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
286 250
287#define BATCHREFILL_LIMIT 16 251#define BATCHREFILL_LIMIT 16
288/* 252/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
390 354
391#endif 355#endif
392 356
393#define OBJECT_FREE (0)
394#define OBJECT_ACTIVE (1)
395
396#ifdef CONFIG_DEBUG_SLAB_LEAK 357#ifdef CONFIG_DEBUG_SLAB_LEAK
397 358
398static void set_obj_status(struct page *page, int idx, int val) 359static inline bool is_store_user_clean(struct kmem_cache *cachep)
399{ 360{
400 int freelist_size; 361 return atomic_read(&cachep->store_user_clean) == 1;
401 char *status;
402 struct kmem_cache *cachep = page->slab_cache;
403
404 freelist_size = cachep->num * sizeof(freelist_idx_t);
405 status = (char *)page->freelist + freelist_size;
406 status[idx] = val;
407} 362}
408 363
409static inline unsigned int get_obj_status(struct page *page, int idx) 364static inline void set_store_user_clean(struct kmem_cache *cachep)
410{ 365{
411 int freelist_size; 366 atomic_set(&cachep->store_user_clean, 1);
412 char *status; 367}
413 struct kmem_cache *cachep = page->slab_cache;
414
415 freelist_size = cachep->num * sizeof(freelist_idx_t);
416 status = (char *)page->freelist + freelist_size;
417 368
418 return status[idx]; 369static inline void set_store_user_dirty(struct kmem_cache *cachep)
370{
371 if (is_store_user_clean(cachep))
372 atomic_set(&cachep->store_user_clean, 0);
419} 373}
420 374
421#else 375#else
422static inline void set_obj_status(struct page *page, int idx, int val) {} 376static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
423 377
424#endif 378#endif
425 379
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
457 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 411 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
458} 412}
459 413
414#define BOOT_CPUCACHE_ENTRIES 1
460/* internal cache of cache description objs */ 415/* internal cache of cache description objs */
461static struct kmem_cache kmem_cache_boot = { 416static struct kmem_cache kmem_cache_boot = {
462 .batchcount = 1, 417 .batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
475 return this_cpu_ptr(cachep->cpu_cache); 430 return this_cpu_ptr(cachep->cpu_cache);
476} 431}
477 432
478static size_t calculate_freelist_size(int nr_objs, size_t align)
479{
480 size_t freelist_size;
481
482 freelist_size = nr_objs * sizeof(freelist_idx_t);
483 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
484 freelist_size += nr_objs * sizeof(char);
485
486 if (align)
487 freelist_size = ALIGN(freelist_size, align);
488
489 return freelist_size;
490}
491
492static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
493 size_t idx_size, size_t align)
494{
495 int nr_objs;
496 size_t remained_size;
497 size_t freelist_size;
498 int extra_space = 0;
499
500 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
501 extra_space = sizeof(char);
502 /*
503 * Ignore padding for the initial guess. The padding
504 * is at most @align-1 bytes, and @buffer_size is at
505 * least @align. In the worst case, this result will
506 * be one greater than the number of objects that fit
507 * into the memory allocation when taking the padding
508 * into account.
509 */
510 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
511
512 /*
513 * This calculated number will be either the right
514 * amount, or one greater than what we want.
515 */
516 remained_size = slab_size - nr_objs * buffer_size;
517 freelist_size = calculate_freelist_size(nr_objs, align);
518 if (remained_size < freelist_size)
519 nr_objs--;
520
521 return nr_objs;
522}
523
524/* 433/*
525 * Calculate the number of objects and left-over bytes for a given buffer size. 434 * Calculate the number of objects and left-over bytes for a given buffer size.
526 */ 435 */
527static void cache_estimate(unsigned long gfporder, size_t buffer_size, 436static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
528 size_t align, int flags, size_t *left_over, 437 unsigned long flags, size_t *left_over)
529 unsigned int *num)
530{ 438{
531 int nr_objs; 439 unsigned int num;
532 size_t mgmt_size;
533 size_t slab_size = PAGE_SIZE << gfporder; 440 size_t slab_size = PAGE_SIZE << gfporder;
534 441
535 /* 442 /*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
537 * on it. For the latter case, the memory allocated for a 444 * on it. For the latter case, the memory allocated for a
538 * slab is used for: 445 * slab is used for:
539 * 446 *
540 * - One unsigned int for each object
541 * - Padding to respect alignment of @align
542 * - @buffer_size bytes for each object 447 * - @buffer_size bytes for each object
448 * - One freelist_idx_t for each object
449 *
450 * We don't need to consider alignment of freelist because
451 * freelist will be at the end of slab page. The objects will be
452 * at the correct alignment.
543 * 453 *
544 * If the slab management structure is off the slab, then the 454 * If the slab management structure is off the slab, then the
545 * alignment will already be calculated into the size. Because 455 * alignment will already be calculated into the size. Because
546 * the slabs are all pages aligned, the objects will be at the 456 * the slabs are all pages aligned, the objects will be at the
547 * correct alignment when allocated. 457 * correct alignment when allocated.
548 */ 458 */
549 if (flags & CFLGS_OFF_SLAB) { 459 if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
550 mgmt_size = 0; 460 num = slab_size / buffer_size;
551 nr_objs = slab_size / buffer_size; 461 *left_over = slab_size % buffer_size;
552
553 } else { 462 } else {
554 nr_objs = calculate_nr_objs(slab_size, buffer_size, 463 num = slab_size / (buffer_size + sizeof(freelist_idx_t));
555 sizeof(freelist_idx_t), align); 464 *left_over = slab_size %
556 mgmt_size = calculate_freelist_size(nr_objs, align); 465 (buffer_size + sizeof(freelist_idx_t));
557 } 466 }
558 *num = nr_objs; 467
559 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 468 return num;
560} 469}
561 470
562#if DEBUG 471#if DEBUG
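With the per-object status byte and freelist padding gone, the estimate collapses to a single division. A worked example for the on-slab case, assuming an order-0 (4096-byte) slab, 256-byte objects and a 2-byte freelist_idx_t (numbers are illustrative, not taken from the patch):

/*
 *   num       = 4096 / (256 + 2) = 15 objects
 *   left_over = 4096 % (256 + 2) = 226 bytes
 * For the OBJFREELIST_SLAB / OFF_SLAB cases the divisor is just buffer_size,
 * since the freelist does not consume space in the slab page itself.
 */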
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
687 return ac; 596 return ac;
688} 597}
689 598
690static inline bool is_slab_pfmemalloc(struct page *page) 599static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
691{ 600 struct page *page, void *objp)
692 return PageSlabPfmemalloc(page);
693}
694
695/* Clears pfmemalloc_active if no slabs have pfmalloc set */
696static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
697 struct array_cache *ac)
698{
699 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
700 struct page *page;
701 unsigned long flags;
702
703 if (!pfmemalloc_active)
704 return;
705
706 spin_lock_irqsave(&n->list_lock, flags);
707 list_for_each_entry(page, &n->slabs_full, lru)
708 if (is_slab_pfmemalloc(page))
709 goto out;
710
711 list_for_each_entry(page, &n->slabs_partial, lru)
712 if (is_slab_pfmemalloc(page))
713 goto out;
714
715 list_for_each_entry(page, &n->slabs_free, lru)
716 if (is_slab_pfmemalloc(page))
717 goto out;
718
719 pfmemalloc_active = false;
720out:
721 spin_unlock_irqrestore(&n->list_lock, flags);
722}
723
724static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
725 gfp_t flags, bool force_refill)
726{ 601{
727 int i; 602 struct kmem_cache_node *n;
728 void *objp = ac->entry[--ac->avail]; 603 int page_node;
729 604 LIST_HEAD(list);
730 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
731 if (unlikely(is_obj_pfmemalloc(objp))) {
732 struct kmem_cache_node *n;
733
734 if (gfp_pfmemalloc_allowed(flags)) {
735 clear_obj_pfmemalloc(&objp);
736 return objp;
737 }
738
739 /* The caller cannot use PFMEMALLOC objects, find another one */
740 for (i = 0; i < ac->avail; i++) {
741 /* If a !PFMEMALLOC object is found, swap them */
742 if (!is_obj_pfmemalloc(ac->entry[i])) {
743 objp = ac->entry[i];
744 ac->entry[i] = ac->entry[ac->avail];
745 ac->entry[ac->avail] = objp;
746 return objp;
747 }
748 }
749
750 /*
751 * If there are empty slabs on the slabs_free list and we are
752 * being forced to refill the cache, mark this one !pfmemalloc.
753 */
754 n = get_node(cachep, numa_mem_id());
755 if (!list_empty(&n->slabs_free) && force_refill) {
756 struct page *page = virt_to_head_page(objp);
757 ClearPageSlabPfmemalloc(page);
758 clear_obj_pfmemalloc(&objp);
759 recheck_pfmemalloc_active(cachep, ac);
760 return objp;
761 }
762
763 /* No !PFMEMALLOC objects available */
764 ac->avail++;
765 objp = NULL;
766 }
767
768 return objp;
769}
770
771static inline void *ac_get_obj(struct kmem_cache *cachep,
772 struct array_cache *ac, gfp_t flags, bool force_refill)
773{
774 void *objp;
775
776 if (unlikely(sk_memalloc_socks()))
777 objp = __ac_get_obj(cachep, ac, flags, force_refill);
778 else
779 objp = ac->entry[--ac->avail];
780
781 return objp;
782}
783
784static noinline void *__ac_put_obj(struct kmem_cache *cachep,
785 struct array_cache *ac, void *objp)
786{
787 if (unlikely(pfmemalloc_active)) {
788 /* Some pfmemalloc slabs exist, check if this is one */
789 struct page *page = virt_to_head_page(objp);
790 if (PageSlabPfmemalloc(page))
791 set_obj_pfmemalloc(&objp);
792 }
793 605
794 return objp; 606 page_node = page_to_nid(page);
795} 607 n = get_node(cachep, page_node);
796 608
797static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 609 spin_lock(&n->list_lock);
798 void *objp) 610 free_block(cachep, &objp, 1, page_node, &list);
799{ 611 spin_unlock(&n->list_lock);
800 if (unlikely(sk_memalloc_socks()))
801 objp = __ac_put_obj(cachep, ac, objp);
802 612
803 ac->entry[ac->avail++] = objp; 613 slabs_destroy(cachep, &list);
804} 614}
805 615
806/* 616/*
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
1003 STATS_INC_ACOVERFLOW(cachep); 813 STATS_INC_ACOVERFLOW(cachep);
1004 __drain_alien_cache(cachep, ac, page_node, &list); 814 __drain_alien_cache(cachep, ac, page_node, &list);
1005 } 815 }
1006 ac_put_obj(cachep, ac, objp); 816 ac->entry[ac->avail++] = objp;
1007 spin_unlock(&alien->lock); 817 spin_unlock(&alien->lock);
1008 slabs_destroy(cachep, &list); 818 slabs_destroy(cachep, &list);
1009 } else { 819 } else {
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1540 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1350 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1541 return; 1351 return;
1542 1352
1543 printk(KERN_WARNING 1353 pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
1544 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1354 nodeid, gfpflags, &gfpflags);
1545 nodeid, gfpflags); 1355 pr_warn(" cache: %s, object size: %d, order: %d\n",
1546 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1547 cachep->name, cachep->size, cachep->gfporder); 1356 cachep->name, cachep->size, cachep->gfporder);
1548 1357
1549 for_each_kmem_cache_node(cachep, node, n) { 1358 for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1567 1376
1568 num_slabs += active_slabs; 1377 num_slabs += active_slabs;
1569 num_objs = num_slabs * cachep->num; 1378 num_objs = num_slabs * cachep->num;
1570 printk(KERN_WARNING 1379 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1571 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1572 node, active_slabs, num_slabs, active_objs, num_objs, 1380 node, active_slabs, num_slabs, active_objs, num_objs,
1573 free_objects); 1381 free_objects);
1574 } 1382 }
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1604 return NULL; 1412 return NULL;
1605 } 1413 }
1606 1414
1607 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1608 if (page_is_pfmemalloc(page))
1609 pfmemalloc_active = true;
1610
1611 nr_pages = (1 << cachep->gfporder); 1415 nr_pages = (1 << cachep->gfporder);
1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1416 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 add_zone_page_state(page_zone(page), 1417 add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1615 else 1419 else
1616 add_zone_page_state(page_zone(page), 1420 add_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_pages); 1421 NR_SLAB_UNRECLAIMABLE, nr_pages);
1422
1618 __SetPageSlab(page); 1423 __SetPageSlab(page);
1619 if (page_is_pfmemalloc(page)) 1424 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1425 if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1620 SetPageSlabPfmemalloc(page); 1426 SetPageSlabPfmemalloc(page);
1621 1427
1622 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1428 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head)
1670} 1476}
1671 1477
1672#if DEBUG 1478#if DEBUG
1479static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1480{
1481 if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
1482 (cachep->size % PAGE_SIZE) == 0)
1483 return true;
1484
1485 return false;
1486}
1673 1487
1674#ifdef CONFIG_DEBUG_PAGEALLOC 1488#ifdef CONFIG_DEBUG_PAGEALLOC
1675static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1489static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1703 } 1517 }
1704 *addr++ = 0x87654321; 1518 *addr++ = 0x87654321;
1705} 1519}
1520
1521static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1522 int map, unsigned long caller)
1523{
1524 if (!is_debug_pagealloc_cache(cachep))
1525 return;
1526
1527 if (caller)
1528 store_stackinfo(cachep, objp, caller);
1529
1530 kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
1531}
1532
1533#else
1534static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1535 int map, unsigned long caller) {}
1536
1706#endif 1537#endif
1707 1538
1708static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1539static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
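The new is_debug_pagealloc_cache()/slab_kernel_map() pair gathers the scattered CONFIG_DEBUG_PAGEALLOC special cases behind one helper: only poisoned, off-slab caches whose size is a page multiple qualify, and their objects are mapped out on free and mapped back in on allocation so a stale access faults immediately. A minimal user-space analogy of that guard, using mprotect() rather than the kernel's kernel_map_pages() (object size and calls are illustrative only):

/*
 * User-space analogy of slab_kernel_map(): while the object is "free",
 * its page is inaccessible, so a use-after-free faults on the spot.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *obj = mmap(NULL, page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (obj == MAP_FAILED)
                return 1;

        mprotect(obj, page, PROT_NONE);              /* object freed: unmap    */
        /* obj[0] = 1;   would SIGSEGV here */
        mprotect(obj, page, PROT_READ | PROT_WRITE); /* allocated: map back in */
        obj[0] = 1;

        printf("guarded object at %p toggled\n", (void *)obj);
        munmap(obj, page);
        return 0;
}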
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1781 int size, i; 1612 int size, i;
1782 int lines = 0; 1613 int lines = 0;
1783 1614
1615 if (is_debug_pagealloc_cache(cachep))
1616 return;
1617
1784 realobj = (char *)objp + obj_offset(cachep); 1618 realobj = (char *)objp + obj_offset(cachep);
1785 size = cachep->object_size; 1619 size = cachep->object_size;
1786 1620
@@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1842 struct page *page) 1676 struct page *page)
1843{ 1677{
1844 int i; 1678 int i;
1679
1680 if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
1681 poison_obj(cachep, page->freelist - obj_offset(cachep),
1682 POISON_FREE);
1683 }
1684
1845 for (i = 0; i < cachep->num; i++) { 1685 for (i = 0; i < cachep->num; i++) {
1846 void *objp = index_to_obj(cachep, page, i); 1686 void *objp = index_to_obj(cachep, page, i);
1847 1687
1848 if (cachep->flags & SLAB_POISON) { 1688 if (cachep->flags & SLAB_POISON) {
1849#ifdef CONFIG_DEBUG_PAGEALLOC
1850 if (cachep->size % PAGE_SIZE == 0 &&
1851 OFF_SLAB(cachep))
1852 kernel_map_pages(virt_to_page(objp),
1853 cachep->size / PAGE_SIZE, 1);
1854 else
1855 check_poison_obj(cachep, objp);
1856#else
1857 check_poison_obj(cachep, objp); 1689 check_poison_obj(cachep, objp);
1858#endif 1690 slab_kernel_map(cachep, objp, 1, 0);
1859 } 1691 }
1860 if (cachep->flags & SLAB_RED_ZONE) { 1692 if (cachep->flags & SLAB_RED_ZONE) {
1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1693 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1916,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1916 * calculate_slab_order - calculate size (page order) of slabs 1748 * calculate_slab_order - calculate size (page order) of slabs
1917 * @cachep: pointer to the cache that is being created 1749 * @cachep: pointer to the cache that is being created
1918 * @size: size of objects to be created in this cache. 1750 * @size: size of objects to be created in this cache.
1919 * @align: required alignment for the objects.
1920 * @flags: slab allocation flags 1751 * @flags: slab allocation flags
1921 * 1752 *
1922 * Also calculates the number of objects per slab. 1753 * Also calculates the number of objects per slab.
@@ -1926,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1926 * towards high-order requests, this should be changed. 1757 * towards high-order requests, this should be changed.
1927 */ 1758 */
1928static size_t calculate_slab_order(struct kmem_cache *cachep, 1759static size_t calculate_slab_order(struct kmem_cache *cachep,
1929 size_t size, size_t align, unsigned long flags) 1760 size_t size, unsigned long flags)
1930{ 1761{
1931 unsigned long offslab_limit;
1932 size_t left_over = 0; 1762 size_t left_over = 0;
1933 int gfporder; 1763 int gfporder;
1934 1764
@@ -1936,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1936 unsigned int num; 1766 unsigned int num;
1937 size_t remainder; 1767 size_t remainder;
1938 1768
1939 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1769 num = cache_estimate(gfporder, size, flags, &remainder);
1940 if (!num) 1770 if (!num)
1941 continue; 1771 continue;
1942 1772
@@ -1945,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1945 break; 1775 break;
1946 1776
1947 if (flags & CFLGS_OFF_SLAB) { 1777 if (flags & CFLGS_OFF_SLAB) {
1948 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1778 struct kmem_cache *freelist_cache;
1779 size_t freelist_size;
1780
1781 freelist_size = num * sizeof(freelist_idx_t);
1782 freelist_cache = kmalloc_slab(freelist_size, 0u);
1783 if (!freelist_cache)
1784 continue;
1785
1949 /* 1786 /*
1950 * Max number of objs-per-slab for caches which 1787 * Needed to avoid possible looping condition
1951 * use off-slab slabs. Needed to avoid a possible 1788 * in cache_grow()
1952 * looping condition in cache_grow().
1953 */ 1789 */
1954 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 1790 if (OFF_SLAB(freelist_cache))
1955 freelist_size_per_obj += sizeof(char); 1791 continue;
1956 offslab_limit = size;
1957 offslab_limit /= freelist_size_per_obj;
1958 1792
1959 if (num > offslab_limit) 1793 /* check if off slab has enough benefit */
1960 break; 1794 if (freelist_cache->size > cachep->size / 2)
1795 continue;
1961 } 1796 }
1962 1797
1963 /* Found something acceptable - save it away */ 1798 /* Found something acceptable - save it away */
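calculate_slab_order() now takes the object count straight from cache_estimate() and sizes the freelist as one freelist_idx_t per object; an off-slab freelist is accepted only when the kmalloc cache that would hold it is itself on-slab and no bigger than half the object size, otherwise that order is skipped. A rough user-space model of the order loop (object and index sizes are made up for illustration):

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long obj_size = 700;   /* hypothetical object size        */
        unsigned long idx_size = 1;     /* one freelist index per object   */

        for (int order = 0; order <= 3; order++) {
                unsigned long slab_bytes = PAGE_SIZE << order;
                unsigned long num = slab_bytes / obj_size;        /* off-slab case */
                unsigned long freelist_bytes = num * idx_size;
                unsigned long left_over = slab_bytes - num * obj_size;

                printf("order %d: %2lu objs, freelist %2lu B, %4lu B left over\n",
                       order, num, freelist_bytes, left_over);
        }
        return 0;
}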
@@ -2075,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2075 return cachep; 1910 return cachep;
2076} 1911}
2077 1912
1913static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1914 size_t size, unsigned long flags)
1915{
1916 size_t left;
1917
1918 cachep->num = 0;
1919
1920 if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
1921 return false;
1922
1923 left = calculate_slab_order(cachep, size,
1924 flags | CFLGS_OBJFREELIST_SLAB);
1925 if (!cachep->num)
1926 return false;
1927
1928 if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
1929 return false;
1930
1931 cachep->colour = left / cachep->colour_off;
1932
1933 return true;
1934}
1935
1936static bool set_off_slab_cache(struct kmem_cache *cachep,
1937 size_t size, unsigned long flags)
1938{
1939 size_t left;
1940
1941 cachep->num = 0;
1942
1943 /*
1944 * Always use on-slab management when SLAB_NOLEAKTRACE
1945 * to avoid recursive calls into kmemleak.
1946 */
1947 if (flags & SLAB_NOLEAKTRACE)
1948 return false;
1949
1950 /*
1951 * Size is large, assume best to place the slab management obj
1952 * off-slab (should allow better packing of objs).
1953 */
1954 left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
1955 if (!cachep->num)
1956 return false;
1957
1958 /*
1959 * If the slab has been placed off-slab, and we have enough space then
1960 * move it on-slab. This is at the expense of any extra colouring.
1961 */
1962 if (left >= cachep->num * sizeof(freelist_idx_t))
1963 return false;
1964
1965 cachep->colour = left / cachep->colour_off;
1966
1967 return true;
1968}
1969
1970static bool set_on_slab_cache(struct kmem_cache *cachep,
1971 size_t size, unsigned long flags)
1972{
1973 size_t left;
1974
1975 cachep->num = 0;
1976
1977 left = calculate_slab_order(cachep, size, flags);
1978 if (!cachep->num)
1979 return false;
1980
1981 cachep->colour = left / cachep->colour_off;
1982
1983 return true;
1984}
1985
2078/** 1986/**
2079 * __kmem_cache_create - Create a cache. 1987 * __kmem_cache_create - Create a cache.
2080 * @cachep: cache management descriptor 1988 * @cachep: cache management descriptor
@@ -2099,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2099int 2007int
2100__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2008__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2101{ 2009{
2102 size_t left_over, freelist_size;
2103 size_t ralign = BYTES_PER_WORD; 2010 size_t ralign = BYTES_PER_WORD;
2104 gfp_t gfp; 2011 gfp_t gfp;
2105 int err; 2012 int err;
@@ -2119,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2119 if (!(flags & SLAB_DESTROY_BY_RCU)) 2026 if (!(flags & SLAB_DESTROY_BY_RCU))
2120 flags |= SLAB_POISON; 2027 flags |= SLAB_POISON;
2121#endif 2028#endif
2122 if (flags & SLAB_DESTROY_BY_RCU)
2123 BUG_ON(flags & SLAB_POISON);
2124#endif 2029#endif
2125 2030
2126 /* 2031 /*
@@ -2152,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2152 * 4) Store it. 2057 * 4) Store it.
2153 */ 2058 */
2154 cachep->align = ralign; 2059 cachep->align = ralign;
2060 cachep->colour_off = cache_line_size();
2061 /* Offset must be a multiple of the alignment. */
2062 if (cachep->colour_off < cachep->align)
2063 cachep->colour_off = cachep->align;
2155 2064
2156 if (slab_is_available()) 2065 if (slab_is_available())
2157 gfp = GFP_KERNEL; 2066 gfp = GFP_KERNEL;
@@ -2179,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2179 else 2088 else
2180 size += BYTES_PER_WORD; 2089 size += BYTES_PER_WORD;
2181 } 2090 }
2182#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2183 /*
2184 * To activate debug pagealloc, off-slab management is necessary
2185 * requirement. In early phase of initialization, small sized slab
2186 * doesn't get initialized so it would not be possible. So, we need
2187 * to check size >= 256. It guarantees that all necessary small
2188 * sized slab is initialized in current slab initialization sequence.
2189 */
2190 if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
2191 size >= 256 && cachep->object_size > cache_line_size() &&
2192 ALIGN(size, cachep->align) < PAGE_SIZE) {
2193 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2194 size = PAGE_SIZE;
2195 }
2196#endif
2197#endif 2091#endif
2198 2092
2199 /*
2200 * Determine if the slab management is 'on' or 'off' slab.
2201 * (bootstrapping cannot cope with offslab caches so don't do
2202 * it too early on. Always use on-slab management when
2203 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2204 */
2205 if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
2206 !(flags & SLAB_NOLEAKTRACE))
2207 /*
2208 * Size is large, assume best to place the slab management obj
2209 * off-slab (should allow better packing of objs).
2210 */
2211 flags |= CFLGS_OFF_SLAB;
2212
2213 size = ALIGN(size, cachep->align); 2093 size = ALIGN(size, cachep->align);
2214 /* 2094 /*
2215 * We should restrict the number of objects in a slab to implement 2095 * We should restrict the number of objects in a slab to implement
@@ -2218,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2218 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2098 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2219 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2099 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2220 2100
2221 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2101#if DEBUG
2222
2223 if (!cachep->num)
2224 return -E2BIG;
2225
2226 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2227
2228 /* 2102 /*
2229 * If the slab has been placed off-slab, and we have enough space then 2103 * To activate debug pagealloc, off-slab management is necessary
2230 * move it on-slab. This is at the expense of any extra colouring. 2104 * requirement. In early phase of initialization, small sized slab
2105 * doesn't get initialized so it would not be possible. So, we need
2106 * to check size >= 256. It guarantees that all necessary small
2107 * sized slab is initialized in current slab initialization sequence.
2231 */ 2108 */
2232 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2109 if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
2233 flags &= ~CFLGS_OFF_SLAB; 2110 size >= 256 && cachep->object_size > cache_line_size()) {
2234 left_over -= freelist_size; 2111 if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
2112 size_t tmp_size = ALIGN(size, PAGE_SIZE);
2113
2114 if (set_off_slab_cache(cachep, tmp_size, flags)) {
2115 flags |= CFLGS_OFF_SLAB;
2116 cachep->obj_offset += tmp_size - size;
2117 size = tmp_size;
2118 goto done;
2119 }
2120 }
2235 } 2121 }
2122#endif
2236 2123
2237 if (flags & CFLGS_OFF_SLAB) { 2124 if (set_objfreelist_slab_cache(cachep, size, flags)) {
2238 /* really off slab. No need for manual alignment */ 2125 flags |= CFLGS_OBJFREELIST_SLAB;
2239 freelist_size = calculate_freelist_size(cachep->num, 0); 2126 goto done;
2127 }
2240 2128
2241#ifdef CONFIG_PAGE_POISONING 2129 if (set_off_slab_cache(cachep, size, flags)) {
2242 /* If we're going to use the generic kernel_map_pages() 2130 flags |= CFLGS_OFF_SLAB;
2243 * poisoning, then it's going to smash the contents of 2131 goto done;
2244 * the redzone and userword anyhow, so switch them off.
2245 */
2246 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2247 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2248#endif
2249 } 2132 }
2250 2133
2251 cachep->colour_off = cache_line_size(); 2134 if (set_on_slab_cache(cachep, size, flags))
2252 /* Offset must be a multiple of the alignment. */ 2135 goto done;
2253 if (cachep->colour_off < cachep->align) 2136
2254 cachep->colour_off = cachep->align; 2137 return -E2BIG;
2255 cachep->colour = left_over / cachep->colour_off; 2138
2256 cachep->freelist_size = freelist_size; 2139done:
2140 cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
2257 cachep->flags = flags; 2141 cachep->flags = flags;
2258 cachep->allocflags = __GFP_COMP; 2142 cachep->allocflags = __GFP_COMP;
2259 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2143 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
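__kmem_cache_create() now tries the three freelist placements in order of decreasing memory efficiency: keep the index array inside a free object (OBJFREELIST), push it into a separate kmalloc allocation (OFF_SLAB), or carve it out of the slab page itself (on-slab), and only reports -E2BIG when none of them yields a usable layout. A compressed sketch of that cascade; the predicates below merely stand in for the real set_*_slab_cache() checks:

#include <stdbool.h>
#include <stdio.h>

enum placement { OBJFREELIST, OFF_SLAB, ON_SLAB, TOO_BIG };

/* Stand-ins for the real checks: can a free object host the index array,
 * and is an off-slab freelist worth a separate allocation? */
static enum placement pick_placement(bool ctor_or_rcu,
                                     unsigned long freelist_bytes,
                                     unsigned long object_size)
{
        if (!ctor_or_rcu && freelist_bytes <= object_size)
                return OBJFREELIST;
        if (freelist_bytes <= object_size / 2)
                return OFF_SLAB;
        if (object_size)
                return ON_SLAB;
        return TOO_BIG;
}

int main(void)
{
        printf("%d %d %d\n",
               pick_placement(false, 32, 256),  /* OBJFREELIST */
               pick_placement(true, 32, 256),   /* OFF_SLAB    */
               pick_placement(true, 32, 48));   /* ON_SLAB     */
        return 0;
}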
@@ -2261,16 +2145,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2261 cachep->size = size; 2145 cachep->size = size;
2262 cachep->reciprocal_buffer_size = reciprocal_value(size); 2146 cachep->reciprocal_buffer_size = reciprocal_value(size);
2263 2147
2264 if (flags & CFLGS_OFF_SLAB) { 2148#if DEBUG
2265 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2149 /*
2266 /* 2150 * If we're going to use the generic kernel_map_pages()
2267 * This is a possibility for one of the kmalloc_{dma,}_caches. 2151 * poisoning, then it's going to smash the contents of
2268 * But since we go off slab only for object size greater than 2152 * the redzone and userword anyhow, so switch them off.
2269 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created 2153 */
2270 * in ascending order,this should not happen at all. 2154 if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
2271 * But leave a BUG_ON for some lucky dude. 2155 (cachep->flags & SLAB_POISON) &&
2272 */ 2156 is_debug_pagealloc_cache(cachep))
2273 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2157 cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2158#endif
2159
2160 if (OFF_SLAB(cachep)) {
2161 cachep->freelist_cache =
2162 kmalloc_slab(cachep->freelist_size, 0u);
2274 } 2163 }
2275 2164
2276 err = setup_cpu_cache(cachep, gfp); 2165 err = setup_cpu_cache(cachep, gfp);
@@ -2377,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache,
2377 } 2266 }
2378 2267
2379 page = list_entry(p, struct page, lru); 2268 page = list_entry(p, struct page, lru);
2380#if DEBUG
2381 BUG_ON(page->active);
2382#endif
2383 list_del(&page->lru); 2269 list_del(&page->lru);
2384 /* 2270 /*
2385 * Safe to drop the lock. The slab is no longer linked 2271 * Safe to drop the lock. The slab is no longer linked
@@ -2454,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2454 void *freelist; 2340 void *freelist;
2455 void *addr = page_address(page); 2341 void *addr = page_address(page);
2456 2342
2457 if (OFF_SLAB(cachep)) { 2343 page->s_mem = addr + colour_off;
2344 page->active = 0;
2345
2346 if (OBJFREELIST_SLAB(cachep))
2347 freelist = NULL;
2348 else if (OFF_SLAB(cachep)) {
2458 /* Slab management obj is off-slab. */ 2349 /* Slab management obj is off-slab. */
2459 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2350 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2460 local_flags, nodeid); 2351 local_flags, nodeid);
2461 if (!freelist) 2352 if (!freelist)
2462 return NULL; 2353 return NULL;
2463 } else { 2354 } else {
2464 freelist = addr + colour_off; 2355 /* We will use last bytes at the slab for freelist */
2465 colour_off += cachep->freelist_size; 2356 freelist = addr + (PAGE_SIZE << cachep->gfporder) -
2357 cachep->freelist_size;
2466 } 2358 }
2467 page->active = 0; 2359
2468 page->s_mem = addr + colour_off;
2469 return freelist; 2360 return freelist;
2470} 2361}
2471 2362
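With this rework an on-slab freelist no longer sits in front of the objects: page->s_mem starts right after the colour offset and the index array occupies the last freelist_size bytes of the slab, so colouring and object placement stay independent of the freelist bookkeeping. The address arithmetic, reduced to a user-space sketch with invented sizes:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE       4096UL
#define GFPORDER        0
#define COLOUR_OFF      64UL    /* hypothetical cache-colour offset */
#define NUM_OBJS        16UL    /* hypothetical objects per slab    */
#define IDX_SIZE        1UL     /* sizeof(freelist index)           */

int main(void)
{
        uint8_t slab[PAGE_SIZE << GFPORDER];
        uint8_t *addr = slab;

        /* objects start after the colour offset ... */
        uint8_t *s_mem = addr + COLOUR_OFF;
        /* ... and the freelist occupies the tail of the slab */
        uint8_t *freelist = addr + (PAGE_SIZE << GFPORDER) -
                            NUM_OBJS * IDX_SIZE;

        printf("objects at +%td, freelist at +%td (%lu bytes)\n",
               s_mem - addr, freelist - addr, NUM_OBJS * IDX_SIZE);
        return 0;
}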
@@ -2480,17 +2371,14 @@ static inline void set_free_obj(struct page *page,
2480 ((freelist_idx_t *)(page->freelist))[idx] = val; 2371 ((freelist_idx_t *)(page->freelist))[idx] = val;
2481} 2372}
2482 2373
2483static void cache_init_objs(struct kmem_cache *cachep, 2374static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
2484 struct page *page)
2485{ 2375{
2376#if DEBUG
2486 int i; 2377 int i;
2487 2378
2488 for (i = 0; i < cachep->num; i++) { 2379 for (i = 0; i < cachep->num; i++) {
2489 void *objp = index_to_obj(cachep, page, i); 2380 void *objp = index_to_obj(cachep, page, i);
2490#if DEBUG 2381
2491 /* need to poison the objs? */
2492 if (cachep->flags & SLAB_POISON)
2493 poison_obj(cachep, objp, POISON_FREE);
2494 if (cachep->flags & SLAB_STORE_USER) 2382 if (cachep->flags & SLAB_STORE_USER)
2495 *dbg_userword(cachep, objp) = NULL; 2383 *dbg_userword(cachep, objp) = NULL;
2496 2384
@@ -2514,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep,
2514 slab_error(cachep, "constructor overwrote the" 2402 slab_error(cachep, "constructor overwrote the"
2515 " start of an object"); 2403 " start of an object");
2516 } 2404 }
2517 if ((cachep->size % PAGE_SIZE) == 0 && 2405 /* need to poison the objs? */
2518 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2406 if (cachep->flags & SLAB_POISON) {
2519 kernel_map_pages(virt_to_page(objp), 2407 poison_obj(cachep, objp, POISON_FREE);
2520 cachep->size / PAGE_SIZE, 0); 2408 slab_kernel_map(cachep, objp, 0, 0);
2521#else 2409 }
2522 if (cachep->ctor) 2410 }
2523 cachep->ctor(objp);
2524#endif 2411#endif
2525 set_obj_status(page, i, OBJECT_FREE); 2412}
2413
2414static void cache_init_objs(struct kmem_cache *cachep,
2415 struct page *page)
2416{
2417 int i;
2418
2419 cache_init_objs_debug(cachep, page);
2420
2421 if (OBJFREELIST_SLAB(cachep)) {
2422 page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
2423 obj_offset(cachep);
2424 }
2425
2426 for (i = 0; i < cachep->num; i++) {
2427 /* constructor could break poison info */
2428 if (DEBUG == 0 && cachep->ctor)
2429 cachep->ctor(index_to_obj(cachep, page, i));
2430
2526 set_free_obj(page, i, i); 2431 set_free_obj(page, i, i);
2527 } 2432 }
2528} 2433}
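For OBJFREELIST_SLAB caches, cache_init_objs() parks the index array inside the last object (offset by obj_offset()), and the free path re-homes it into the first object freed after the slab fills up (see slab_put_obj() below). Because the hosting object's index always sits in the last freelist slot, it is handed out last, by which point the freelist is no longer needed. A small user-space model of that bookkeeping, with invented object counts and sizes:

#include <stdio.h>

#define NUM             4
#define OBJ_SIZE        32

static unsigned char slab[NUM][OBJ_SIZE];       /* the slab's objects         */
static unsigned char *freelist;                 /* lives inside a free object */
static unsigned int active;                     /* objects currently in use   */

static void init_slab(void)
{
        freelist = slab[NUM - 1];               /* borrow the last object     */
        for (unsigned int i = 0; i < NUM; i++)
                freelist[i] = i;
        active = 0;
}

static void *get_obj(void)
{
        return slab[freelist[active++]];
}

static void put_obj(void *obj)
{
        unsigned int objnr = (unsigned char (*)[OBJ_SIZE])obj - slab;

        if (active == NUM)                      /* slab was full: re-home the */
                freelist = obj;                 /* freelist in this object    */
        freelist[--active] = objnr;
}

int main(void)
{
        init_slab();
        void *a = get_obj(), *b = get_obj();

        put_obj(a);
        printf("in use: %u, next free object: %u\n",
               active, (unsigned int)freelist[active]);
        (void)b;
        return 0;
}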
@@ -2537,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2537 } 2442 }
2538} 2443}
2539 2444
2540static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2445static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
2541 int nodeid)
2542{ 2446{
2543 void *objp; 2447 void *objp;
2544 2448
2545 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2449 objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2546 page->active++; 2450 page->active++;
2451
2547#if DEBUG 2452#if DEBUG
2548 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2453 if (cachep->flags & SLAB_STORE_USER)
2454 set_store_user_dirty(cachep);
2549#endif 2455#endif
2550 2456
2551 return objp; 2457 return objp;
2552} 2458}
2553 2459
2554static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2460static void slab_put_obj(struct kmem_cache *cachep,
2555 void *objp, int nodeid) 2461 struct page *page, void *objp)
2556{ 2462{
2557 unsigned int objnr = obj_to_index(cachep, page, objp); 2463 unsigned int objnr = obj_to_index(cachep, page, objp);
2558#if DEBUG 2464#if DEBUG
2559 unsigned int i; 2465 unsigned int i;
2560 2466
2561 /* Verify that the slab belongs to the intended node */
2562 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2563
2564 /* Verify double free bug */ 2467 /* Verify double free bug */
2565 for (i = page->active; i < cachep->num; i++) { 2468 for (i = page->active; i < cachep->num; i++) {
2566 if (get_free_obj(page, i) == objnr) { 2469 if (get_free_obj(page, i) == objnr) {
@@ -2571,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2571 } 2474 }
2572#endif 2475#endif
2573 page->active--; 2476 page->active--;
2477 if (!page->freelist)
2478 page->freelist = objp + obj_offset(cachep);
2479
2574 set_free_obj(page, page->active, objnr); 2480 set_free_obj(page, page->active, objnr);
2575} 2481}
2576 2482
@@ -2645,7 +2551,7 @@ static int cache_grow(struct kmem_cache *cachep,
2645 /* Get slab management. */ 2551 /* Get slab management. */
2646 freelist = alloc_slabmgmt(cachep, page, offset, 2552 freelist = alloc_slabmgmt(cachep, page, offset,
2647 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2553 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2648 if (!freelist) 2554 if (OFF_SLAB(cachep) && !freelist)
2649 goto opps1; 2555 goto opps1;
2650 2556
2651 slab_map_pages(cachep, page, freelist); 2557 slab_map_pages(cachep, page, freelist);
@@ -2726,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2726 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2632 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2727 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2633 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2728 } 2634 }
2729 if (cachep->flags & SLAB_STORE_USER) 2635 if (cachep->flags & SLAB_STORE_USER) {
2636 set_store_user_dirty(cachep);
2730 *dbg_userword(cachep, objp) = (void *)caller; 2637 *dbg_userword(cachep, objp) = (void *)caller;
2638 }
2731 2639
2732 objnr = obj_to_index(cachep, page, objp); 2640 objnr = obj_to_index(cachep, page, objp);
2733 2641
2734 BUG_ON(objnr >= cachep->num); 2642 BUG_ON(objnr >= cachep->num);
2735 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2643 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2736 2644
2737 set_obj_status(page, objnr, OBJECT_FREE);
2738 if (cachep->flags & SLAB_POISON) { 2645 if (cachep->flags & SLAB_POISON) {
2739#ifdef CONFIG_DEBUG_PAGEALLOC
2740 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2741 store_stackinfo(cachep, objp, caller);
2742 kernel_map_pages(virt_to_page(objp),
2743 cachep->size / PAGE_SIZE, 0);
2744 } else {
2745 poison_obj(cachep, objp, POISON_FREE);
2746 }
2747#else
2748 poison_obj(cachep, objp, POISON_FREE); 2646 poison_obj(cachep, objp, POISON_FREE);
2749#endif 2647 slab_kernel_map(cachep, objp, 0, caller);
2750 } 2648 }
2751 return objp; 2649 return objp;
2752} 2650}
@@ -2756,7 +2654,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2756#define cache_free_debugcheck(x,objp,z) (objp) 2654#define cache_free_debugcheck(x,objp,z) (objp)
2757#endif 2655#endif
2758 2656
2759static struct page *get_first_slab(struct kmem_cache_node *n) 2657static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
2658 void **list)
2659{
2660#if DEBUG
2661 void *next = *list;
2662 void *objp;
2663
2664 while (next) {
2665 objp = next - obj_offset(cachep);
2666 next = *(void **)next;
2667 poison_obj(cachep, objp, POISON_FREE);
2668 }
2669#endif
2670}
2671
2672static inline void fixup_slab_list(struct kmem_cache *cachep,
2673 struct kmem_cache_node *n, struct page *page,
2674 void **list)
2675{
2676 /* move slabp to correct slabp list: */
2677 list_del(&page->lru);
2678 if (page->active == cachep->num) {
2679 list_add(&page->lru, &n->slabs_full);
2680 if (OBJFREELIST_SLAB(cachep)) {
2681#if DEBUG
2682 /* Poisoning will be done without holding the lock */
2683 if (cachep->flags & SLAB_POISON) {
2684 void **objp = page->freelist;
2685
2686 *objp = *list;
2687 *list = objp;
2688 }
2689#endif
2690 page->freelist = NULL;
2691 }
2692 } else
2693 list_add(&page->lru, &n->slabs_partial);
2694}
2695
2696/* Try to find non-pfmemalloc slab if needed */
2697static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2698 struct page *page, bool pfmemalloc)
2699{
2700 if (!page)
2701 return NULL;
2702
2703 if (pfmemalloc)
2704 return page;
2705
2706 if (!PageSlabPfmemalloc(page))
2707 return page;
2708
2709 /* No need to keep pfmemalloc slab if we have enough free objects */
2710 if (n->free_objects > n->free_limit) {
2711 ClearPageSlabPfmemalloc(page);
2712 return page;
2713 }
2714
2715 /* Move pfmemalloc slab to the end of list to speed up next search */
2716 list_del(&page->lru);
2717 if (!page->active)
2718 list_add_tail(&page->lru, &n->slabs_free);
2719 else
2720 list_add_tail(&page->lru, &n->slabs_partial);
2721
2722 list_for_each_entry(page, &n->slabs_partial, lru) {
2723 if (!PageSlabPfmemalloc(page))
2724 return page;
2725 }
2726
2727 list_for_each_entry(page, &n->slabs_free, lru) {
2728 if (!PageSlabPfmemalloc(page))
2729 return page;
2730 }
2731
2732 return NULL;
2733}
2734
2735static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
2760{ 2736{
2761 struct page *page; 2737 struct page *page;
2762 2738
@@ -2768,21 +2744,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
2768 struct page, lru); 2744 struct page, lru);
2769 } 2745 }
2770 2746
2747 if (sk_memalloc_socks())
2748 return get_valid_first_slab(n, page, pfmemalloc);
2749
2771 return page; 2750 return page;
2772} 2751}
2773 2752
2774static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2753static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
2775 bool force_refill) 2754 struct kmem_cache_node *n, gfp_t flags)
2755{
2756 struct page *page;
2757 void *obj;
2758 void *list = NULL;
2759
2760 if (!gfp_pfmemalloc_allowed(flags))
2761 return NULL;
2762
2763 spin_lock(&n->list_lock);
2764 page = get_first_slab(n, true);
2765 if (!page) {
2766 spin_unlock(&n->list_lock);
2767 return NULL;
2768 }
2769
2770 obj = slab_get_obj(cachep, page);
2771 n->free_objects--;
2772
2773 fixup_slab_list(cachep, n, page, &list);
2774
2775 spin_unlock(&n->list_lock);
2776 fixup_objfreelist_debug(cachep, &list);
2777
2778 return obj;
2779}
2780
2781static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2776{ 2782{
2777 int batchcount; 2783 int batchcount;
2778 struct kmem_cache_node *n; 2784 struct kmem_cache_node *n;
2779 struct array_cache *ac; 2785 struct array_cache *ac;
2780 int node; 2786 int node;
2787 void *list = NULL;
2781 2788
2782 check_irq_off(); 2789 check_irq_off();
2783 node = numa_mem_id(); 2790 node = numa_mem_id();
2784 if (unlikely(force_refill)) 2791
2785 goto force_grow;
2786retry: 2792retry:
2787 ac = cpu_cache_get(cachep); 2793 ac = cpu_cache_get(cachep);
2788 batchcount = ac->batchcount; 2794 batchcount = ac->batchcount;
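The pfmemalloc handling moves from the object path into slab selection: get_first_slab() prefers ordinary slabs, parks reserve-backed (pfmemalloc) slabs at the tail of the lists, and cache_alloc_pfmemalloc() dips into them only when the caller's gfp mask is allowed to use memory reserves. The selection policy boiled down to a user-space filter (the flag and list are invented):

#include <stdbool.h>
#include <stdio.h>

struct slab {
        const char *name;
        bool pfmemalloc;        /* backed by emergency reserves? */
};

/* Prefer an ordinary slab; touch reserve-backed slabs only if allowed. */
static const struct slab *pick_slab(const struct slab *slabs, int n,
                                    bool may_use_reserves)
{
        for (int i = 0; i < n; i++)
                if (!slabs[i].pfmemalloc)
                        return &slabs[i];

        return (n && may_use_reserves) ? &slabs[0] : NULL;
}

int main(void)
{
        const struct slab slabs[] = {
                { "reserve-slab", true },
                { "normal-slab",  false },
        };
        const struct slab *s;

        printf("normal request -> %s\n", pick_slab(slabs, 2, false)->name);

        s = pick_slab(slabs, 1, false);          /* only reserve slabs left */
        printf("no reserve access -> %s\n", s ? s->name : "(grow the cache)");
        printf("reserve access ok -> %s\n", pick_slab(slabs, 1, true)->name);
        return 0;
}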
@@ -2808,7 +2814,7 @@ retry:
2808 while (batchcount > 0) { 2814 while (batchcount > 0) {
2809 struct page *page; 2815 struct page *page;
2810 /* Get slab alloc is to come from. */ 2816 /* Get slab alloc is to come from. */
2811 page = get_first_slab(n); 2817 page = get_first_slab(n, false);
2812 if (!page) 2818 if (!page)
2813 goto must_grow; 2819 goto must_grow;
2814 2820
@@ -2826,26 +2832,29 @@ retry:
2826 STATS_INC_ACTIVE(cachep); 2832 STATS_INC_ACTIVE(cachep);
2827 STATS_SET_HIGH(cachep); 2833 STATS_SET_HIGH(cachep);
2828 2834
2829 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2835 ac->entry[ac->avail++] = slab_get_obj(cachep, page);
2830 node));
2831 } 2836 }
2832 2837
2833 /* move slabp to correct slabp list: */ 2838 fixup_slab_list(cachep, n, page, &list);
2834 list_del(&page->lru);
2835 if (page->active == cachep->num)
2836 list_add(&page->lru, &n->slabs_full);
2837 else
2838 list_add(&page->lru, &n->slabs_partial);
2839 } 2839 }
2840 2840
2841must_grow: 2841must_grow:
2842 n->free_objects -= ac->avail; 2842 n->free_objects -= ac->avail;
2843alloc_done: 2843alloc_done:
2844 spin_unlock(&n->list_lock); 2844 spin_unlock(&n->list_lock);
2845 fixup_objfreelist_debug(cachep, &list);
2845 2846
2846 if (unlikely(!ac->avail)) { 2847 if (unlikely(!ac->avail)) {
2847 int x; 2848 int x;
2848force_grow: 2849
2850 /* Check if we can use obj in pfmemalloc slab */
2851 if (sk_memalloc_socks()) {
2852 void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
2853
2854 if (obj)
2855 return obj;
2856 }
2857
2849 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); 2858 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2850 2859
2851 /* cache_grow can reenable interrupts, then ac could change. */ 2860 /* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2862,7 @@ force_grow:
2853 node = numa_mem_id(); 2862 node = numa_mem_id();
2854 2863
2855 /* no objects in sight? abort */ 2864 /* no objects in sight? abort */
2856 if (!x && (ac->avail == 0 || force_refill)) 2865 if (!x && ac->avail == 0)
2857 return NULL; 2866 return NULL;
2858 2867
2859 if (!ac->avail) /* objects refilled by interrupt? */ 2868 if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2870,7 @@ force_grow:
2861 } 2870 }
2862 ac->touched = 1; 2871 ac->touched = 1;
2863 2872
2864 return ac_get_obj(cachep, ac, flags, force_refill); 2873 return ac->entry[--ac->avail];
2865} 2874}
2866 2875
2867static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2876static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2877static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2886static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2878 gfp_t flags, void *objp, unsigned long caller) 2887 gfp_t flags, void *objp, unsigned long caller)
2879{ 2888{
2880 struct page *page;
2881
2882 if (!objp) 2889 if (!objp)
2883 return objp; 2890 return objp;
2884 if (cachep->flags & SLAB_POISON) { 2891 if (cachep->flags & SLAB_POISON) {
2885#ifdef CONFIG_DEBUG_PAGEALLOC
2886 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2887 kernel_map_pages(virt_to_page(objp),
2888 cachep->size / PAGE_SIZE, 1);
2889 else
2890 check_poison_obj(cachep, objp);
2891#else
2892 check_poison_obj(cachep, objp); 2892 check_poison_obj(cachep, objp);
2893#endif 2893 slab_kernel_map(cachep, objp, 1, 0);
2894 poison_obj(cachep, objp, POISON_INUSE); 2894 poison_obj(cachep, objp, POISON_INUSE);
2895 } 2895 }
2896 if (cachep->flags & SLAB_STORE_USER) 2896 if (cachep->flags & SLAB_STORE_USER)
@@ -2910,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2911 } 2911 }
2912 2912
2913 page = virt_to_head_page(objp);
2914 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2915 objp += obj_offset(cachep); 2913 objp += obj_offset(cachep);
2916 if (cachep->ctor && cachep->flags & SLAB_POISON) 2914 if (cachep->ctor && cachep->flags & SLAB_POISON)
2917 cachep->ctor(objp); 2915 cachep->ctor(objp);
@@ -2926,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2926#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2924#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2927#endif 2925#endif
2928 2926
2929static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2930{
2931 if (unlikely(cachep == kmem_cache))
2932 return false;
2933
2934 return should_failslab(cachep->object_size, flags, cachep->flags);
2935}
2936
2937static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2927static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2938{ 2928{
2939 void *objp; 2929 void *objp;
2940 struct array_cache *ac; 2930 struct array_cache *ac;
2941 bool force_refill = false;
2942 2931
2943 check_irq_off(); 2932 check_irq_off();
2944 2933
2945 ac = cpu_cache_get(cachep); 2934 ac = cpu_cache_get(cachep);
2946 if (likely(ac->avail)) { 2935 if (likely(ac->avail)) {
2947 ac->touched = 1; 2936 ac->touched = 1;
2948 objp = ac_get_obj(cachep, ac, flags, false); 2937 objp = ac->entry[--ac->avail];
2949 2938
2950 /* 2939 STATS_INC_ALLOCHIT(cachep);
2951 * Allow for the possibility all avail objects are not allowed 2940 goto out;
2952 * by the current flags
2953 */
2954 if (objp) {
2955 STATS_INC_ALLOCHIT(cachep);
2956 goto out;
2957 }
2958 force_refill = true;
2959 } 2941 }
2960 2942
2961 STATS_INC_ALLOCMISS(cachep); 2943 STATS_INC_ALLOCMISS(cachep);
2962 objp = cache_alloc_refill(cachep, flags, force_refill); 2944 objp = cache_alloc_refill(cachep, flags);
2963 /* 2945 /*
2964 * the 'ac' may be updated by cache_alloc_refill(), 2946 * the 'ac' may be updated by cache_alloc_refill(),
2965 * and kmemleak_erase() requires its correct value. 2947 * and kmemleak_erase() requires its correct value.
@@ -3097,6 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3097 struct page *page; 3079 struct page *page;
3098 struct kmem_cache_node *n; 3080 struct kmem_cache_node *n;
3099 void *obj; 3081 void *obj;
3082 void *list = NULL;
3100 int x; 3083 int x;
3101 3084
3102 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); 3085 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3089,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3106retry: 3089retry:
3107 check_irq_off(); 3090 check_irq_off();
3108 spin_lock(&n->list_lock); 3091 spin_lock(&n->list_lock);
3109 page = get_first_slab(n); 3092 page = get_first_slab(n, false);
3110 if (!page) 3093 if (!page)
3111 goto must_grow; 3094 goto must_grow;
3112 3095
@@ -3118,17 +3101,13 @@ retry:
3118 3101
3119 BUG_ON(page->active == cachep->num); 3102 BUG_ON(page->active == cachep->num);
3120 3103
3121 obj = slab_get_obj(cachep, page, nodeid); 3104 obj = slab_get_obj(cachep, page);
3122 n->free_objects--; 3105 n->free_objects--;
3123 /* move slabp to correct slabp list: */
3124 list_del(&page->lru);
3125 3106
3126 if (page->active == cachep->num) 3107 fixup_slab_list(cachep, n, page, &list);
3127 list_add(&page->lru, &n->slabs_full);
3128 else
3129 list_add(&page->lru, &n->slabs_partial);
3130 3108
3131 spin_unlock(&n->list_lock); 3109 spin_unlock(&n->list_lock);
3110 fixup_objfreelist_debug(cachep, &list);
3132 goto done; 3111 goto done;
3133 3112
3134must_grow: 3113must_grow:
@@ -3152,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3152 int slab_node = numa_mem_id(); 3131 int slab_node = numa_mem_id();
3153 3132
3154 flags &= gfp_allowed_mask; 3133 flags &= gfp_allowed_mask;
3155 3134 cachep = slab_pre_alloc_hook(cachep, flags);
3156 lockdep_trace_alloc(flags); 3135 if (unlikely(!cachep))
3157
3158 if (slab_should_failslab(cachep, flags))
3159 return NULL; 3136 return NULL;
3160 3137
3161 cachep = memcg_kmem_get_cache(cachep, flags);
3162
3163 cache_alloc_debugcheck_before(cachep, flags); 3138 cache_alloc_debugcheck_before(cachep, flags);
3164 local_irq_save(save_flags); 3139 local_irq_save(save_flags);
3165 3140
@@ -3188,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3188 out: 3163 out:
3189 local_irq_restore(save_flags); 3164 local_irq_restore(save_flags);
3190 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3165 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3191 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3192 flags);
3193 3166
3194 if (likely(ptr)) { 3167 if (unlikely(flags & __GFP_ZERO) && ptr)
3195 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3168 memset(ptr, 0, cachep->object_size);
3196 if (unlikely(flags & __GFP_ZERO))
3197 memset(ptr, 0, cachep->object_size);
3198 }
3199 3169
3200 memcg_kmem_put_cache(cachep); 3170 slab_post_alloc_hook(cachep, flags, 1, &ptr);
3201 return ptr; 3171 return ptr;
3202} 3172}
3203 3173
@@ -3240,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3240 void *objp; 3210 void *objp;
3241 3211
3242 flags &= gfp_allowed_mask; 3212 flags &= gfp_allowed_mask;
3243 3213 cachep = slab_pre_alloc_hook(cachep, flags);
3244 lockdep_trace_alloc(flags); 3214 if (unlikely(!cachep))
3245
3246 if (slab_should_failslab(cachep, flags))
3247 return NULL; 3215 return NULL;
3248 3216
3249 cachep = memcg_kmem_get_cache(cachep, flags);
3250
3251 cache_alloc_debugcheck_before(cachep, flags); 3217 cache_alloc_debugcheck_before(cachep, flags);
3252 local_irq_save(save_flags); 3218 local_irq_save(save_flags);
3253 objp = __do_cache_alloc(cachep, flags); 3219 objp = __do_cache_alloc(cachep, flags);
3254 local_irq_restore(save_flags); 3220 local_irq_restore(save_flags);
3255 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3221 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3256 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3257 flags);
3258 prefetchw(objp); 3222 prefetchw(objp);
3259 3223
3260 if (likely(objp)) { 3224 if (unlikely(flags & __GFP_ZERO) && objp)
3261 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3225 memset(objp, 0, cachep->object_size);
3262 if (unlikely(flags & __GFP_ZERO))
3263 memset(objp, 0, cachep->object_size);
3264 }
3265 3226
3266 memcg_kmem_put_cache(cachep); 3227 slab_post_alloc_hook(cachep, flags, 1, &objp);
3267 return objp; 3228 return objp;
3268} 3229}
3269 3230
@@ -3281,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3281 void *objp; 3242 void *objp;
3282 struct page *page; 3243 struct page *page;
3283 3244
3284 clear_obj_pfmemalloc(&objpp[i]);
3285 objp = objpp[i]; 3245 objp = objpp[i];
3286 3246
3287 page = virt_to_head_page(objp); 3247 page = virt_to_head_page(objp);
3288 list_del(&page->lru); 3248 list_del(&page->lru);
3289 check_spinlock_acquired_node(cachep, node); 3249 check_spinlock_acquired_node(cachep, node);
3290 slab_put_obj(cachep, page, objp, node); 3250 slab_put_obj(cachep, page, objp);
3291 STATS_DEC_ACTIVE(cachep); 3251 STATS_DEC_ACTIVE(cachep);
3292 n->free_objects++; 3252 n->free_objects++;
3293 3253
@@ -3317,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3317 LIST_HEAD(list); 3277 LIST_HEAD(list);
3318 3278
3319 batchcount = ac->batchcount; 3279 batchcount = ac->batchcount;
3320#if DEBUG 3280
3321 BUG_ON(!batchcount || batchcount > ac->avail);
3322#endif
3323 check_irq_off(); 3281 check_irq_off();
3324 n = get_node(cachep, node); 3282 n = get_node(cachep, node);
3325 spin_lock(&n->list_lock); 3283 spin_lock(&n->list_lock);
@@ -3389,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3389 cache_flusharray(cachep, ac); 3347 cache_flusharray(cachep, ac);
3390 } 3348 }
3391 3349
3392 ac_put_obj(cachep, ac, objp); 3350 if (sk_memalloc_socks()) {
3351 struct page *page = virt_to_head_page(objp);
3352
3353 if (unlikely(PageSlabPfmemalloc(page))) {
3354 cache_free_pfmemalloc(cachep, page, objp);
3355 return;
3356 }
3357 }
3358
3359 ac->entry[ac->avail++] = objp;
3393} 3360}
3394 3361
3395/** 3362/**
@@ -3411,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3411} 3378}
3412EXPORT_SYMBOL(kmem_cache_alloc); 3379EXPORT_SYMBOL(kmem_cache_alloc);
3413 3380
3414void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3381static __always_inline void
3382cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
3383 size_t size, void **p, unsigned long caller)
3415{ 3384{
3416 __kmem_cache_free_bulk(s, size, p); 3385 size_t i;
3386
3387 for (i = 0; i < size; i++)
3388 p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
3417} 3389}
3418EXPORT_SYMBOL(kmem_cache_free_bulk);
3419 3390
3420int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3391int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3421 void **p) 3392 void **p)
3422{ 3393{
3423 return __kmem_cache_alloc_bulk(s, flags, size, p); 3394 size_t i;
3395
3396 s = slab_pre_alloc_hook(s, flags);
3397 if (!s)
3398 return 0;
3399
3400 cache_alloc_debugcheck_before(s, flags);
3401
3402 local_irq_disable();
3403 for (i = 0; i < size; i++) {
3404 void *objp = __do_cache_alloc(s, flags);
3405
3406 if (unlikely(!objp))
3407 goto error;
3408 p[i] = objp;
3409 }
3410 local_irq_enable();
3411
3412 cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
3413
3414 /* Clear memory outside IRQ disabled section */
3415 if (unlikely(flags & __GFP_ZERO))
3416 for (i = 0; i < size; i++)
3417 memset(p[i], 0, s->object_size);
3418
3419 slab_post_alloc_hook(s, flags, size, p);
3420 /* FIXME: Trace call missing. Christoph would like a bulk variant */
3421 return size;
3422error:
3423 local_irq_enable();
3424 cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
3425 slab_post_alloc_hook(s, flags, i, p);
3426 __kmem_cache_free_bulk(s, i, p);
3427 return 0;
3424} 3428}
3425EXPORT_SYMBOL(kmem_cache_alloc_bulk); 3429EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3426 3430
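kmem_cache_alloc_bulk() gets a SLAB-native implementation: one pre-alloc hook, a single IRQ-disabled window for all objects, zeroing and the post-alloc hook outside that window, and a full rollback through __kmem_cache_free_bulk() if any allocation fails, so the call still returns either size or 0. The all-or-nothing pattern, modeled in user space with malloc() standing in for the per-object fast path:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* All-or-nothing bulk allocation: on any failure, release what was
 * obtained so far and report 0, mirroring kmem_cache_alloc_bulk(). */
static size_t bulk_alloc(size_t obj_size, size_t size, void **p, int zero)
{
        size_t i;

        for (i = 0; i < size; i++) {
                p[i] = malloc(obj_size);
                if (!p[i])
                        goto error;
        }

        if (zero)                       /* done after the hot loop */
                for (i = 0; i < size; i++)
                        memset(p[i], 0, obj_size);

        return size;

error:
        while (i--)
                free(p[i]);
        return 0;
}

int main(void)
{
        void *objs[8];
        size_t got = bulk_alloc(64, 8, objs, 1);

        printf("got %zu objects\n", got);
        for (size_t i = 0; i < got; i++)
                free(objs[i]);
        return 0;
}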
@@ -3567,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3567} 3571}
3568EXPORT_SYMBOL(kmem_cache_free); 3572EXPORT_SYMBOL(kmem_cache_free);
3569 3573
3574void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
3575{
3576 struct kmem_cache *s;
3577 size_t i;
3578
3579 local_irq_disable();
3580 for (i = 0; i < size; i++) {
3581 void *objp = p[i];
3582
3583 if (!orig_s) /* called via kfree_bulk */
3584 s = virt_to_cache(objp);
3585 else
3586 s = cache_from_obj(orig_s, objp);
3587
3588 debug_check_no_locks_freed(objp, s->object_size);
3589 if (!(s->flags & SLAB_DEBUG_OBJECTS))
3590 debug_check_no_obj_freed(objp, s->object_size);
3591
3592 __cache_free(s, objp, _RET_IP_);
3593 }
3594 local_irq_enable();
3595
3596 /* FIXME: add tracing */
3597}
3598EXPORT_SYMBOL(kmem_cache_free_bulk);
3599
3570/** 3600/**
3571 * kfree - free previously allocated memory 3601 * kfree - free previously allocated memory
3572 * @objp: pointer returned by kmalloc. 3602 * @objp: pointer returned by kmalloc.
@@ -4102,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
4102 struct page *page) 4132 struct page *page)
4103{ 4133{
4104 void *p; 4134 void *p;
4105 int i; 4135 int i, j;
4136 unsigned long v;
4106 4137
4107 if (n[0] == n[1]) 4138 if (n[0] == n[1])
4108 return; 4139 return;
4109 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4140 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4110 if (get_obj_status(page, i) != OBJECT_ACTIVE) 4141 bool active = true;
4142
4143 for (j = page->active; j < c->num; j++) {
4144 if (get_free_obj(page, j) == i) {
4145 active = false;
4146 break;
4147 }
4148 }
4149
4150 if (!active)
4151 continue;
4152
4153 /*
4154 * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
4155 * mapping is established when actual object allocation and
4156 * we could mistakenly access the unmapped object in the cpu
4157 * cache.
4158 */
4159 if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
4111 continue; 4160 continue;
4112 4161
4113 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4162 if (!add_caller(n, v))
4114 return; 4163 return;
4115 } 4164 }
4116} 4165}
@@ -4146,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p)
4146 if (!(cachep->flags & SLAB_RED_ZONE)) 4195 if (!(cachep->flags & SLAB_RED_ZONE))
4147 return 0; 4196 return 0;
4148 4197
4149 /* OK, we can do it */ 4198 /*
4199 * Set store_user_clean and start to grab stored user information
4200 * for all objects on this cache. If some alloc/free requests comes
4201 * during the processing, information would be wrong so restart
4202 * whole processing.
4203 */
4204 do {
4205 set_store_user_clean(cachep);
4206 drain_cpu_caches(cachep);
4150 4207
4151 x[1] = 0; 4208 x[1] = 0;
4152 4209
4153 for_each_kmem_cache_node(cachep, node, n) { 4210 for_each_kmem_cache_node(cachep, node, n) {
4154 4211
4155 check_irq_on(); 4212 check_irq_on();
4156 spin_lock_irq(&n->list_lock); 4213 spin_lock_irq(&n->list_lock);
4214
4215 list_for_each_entry(page, &n->slabs_full, lru)
4216 handle_slab(x, cachep, page);
4217 list_for_each_entry(page, &n->slabs_partial, lru)
4218 handle_slab(x, cachep, page);
4219 spin_unlock_irq(&n->list_lock);
4220 }
4221 } while (!is_store_user_clean(cachep));
4157 4222
4158 list_for_each_entry(page, &n->slabs_full, lru)
4159 handle_slab(x, cachep, page);
4160 list_for_each_entry(page, &n->slabs_partial, lru)
4161 handle_slab(x, cachep, page);
4162 spin_unlock_irq(&n->list_lock);
4163 }
4164 name = cachep->name; 4223 name = cachep->name;
4165 if (x[0] == x[1]) { 4224 if (x[0] == x[1]) {
4166 /* Increase the buffer size */ 4225 /* Increase the buffer size */
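With the per-object status bits gone, the DEBUG_SLAB_LEAK scanner derives liveness from the freelist itself: object i counts as active unless its index appears in the unused tail of the freelist (positions page->active onward), the stored caller is read with probe_kernel_read() so an unmapped DEBUG_PAGEALLOC object cannot fault the scan, and leaks_show() repeats the whole pass until no alloc or free dirtied the data. The liveness test alone, as a user-space helper with invented array contents:

#include <stdbool.h>
#include <stdio.h>

#define NUM 4

/* Object i is active unless its index sits in the free part of the
 * freelist, i.e. at a position >= "active". */
static bool obj_is_active(const unsigned char *freelist,
                          unsigned int active, unsigned int i)
{
        for (unsigned int j = active; j < NUM; j++)
                if (freelist[j] == i)
                        return false;
        return true;
}

int main(void)
{
        /* two objects handed out; indices 3 and 0 are still free */
        const unsigned char freelist[NUM] = { 1, 2, 3, 0 };
        unsigned int active = 2;

        for (unsigned int i = 0; i < NUM; i++)
                printf("object %u: %s\n", i,
                       obj_is_active(freelist, active, i) ? "active" : "free");
        return 0;
}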
diff --git a/mm/slab.h b/mm/slab.h
index 2eedacea439d..b7934361f026 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
38#endif 38#endif
39 39
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/fault-inject.h>
42#include <linux/kmemcheck.h>
43#include <linux/kasan.h>
44#include <linux/kmemleak.h>
41 45
42/* 46/*
43 * State of the slab allocator. 47 * State of the slab allocator.
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
121#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 125#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
122#elif defined(CONFIG_SLUB_DEBUG) 126#elif defined(CONFIG_SLUB_DEBUG)
123#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 127#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
124 SLAB_TRACE | SLAB_DEBUG_FREE) 128 SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
125#else 129#else
126#define SLAB_DEBUG_FLAGS (0) 130#define SLAB_DEBUG_FLAGS (0)
127#endif 131#endif
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
168/* 172/*
169 * Generic implementation of bulk operations 173 * Generic implementation of bulk operations
170 * These are useful for situations in which the allocator cannot 174 * These are useful for situations in which the allocator cannot
171 * perform optimizations. In that case segments of the objecct listed 175 * perform optimizations. In that case segments of the object listed
172 * may be allocated or freed using these operations. 176 * may be allocated or freed using these operations.
173 */ 177 */
174void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); 178void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -307,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
307 * to not do even the assignment. In that case, slab_equal_or_root 311 * to not do even the assignment. In that case, slab_equal_or_root
308 * will also be a constant. 312 * will also be a constant.
309 */ 313 */
310 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) 314 if (!memcg_kmem_enabled() &&
315 !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
311 return s; 316 return s;
312 317
313 page = virt_to_head_page(x); 318 page = virt_to_head_page(x);
@@ -321,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
321 return s; 326 return s;
322} 327}
323 328
329static inline size_t slab_ksize(const struct kmem_cache *s)
330{
331#ifndef CONFIG_SLUB
332 return s->object_size;
333
334#else /* CONFIG_SLUB */
335# ifdef CONFIG_SLUB_DEBUG
336 /*
337 * Debugging requires use of the padding between object
338 * and whatever may come after it.
339 */
340 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
341 return s->object_size;
342# endif
343 /*
344 * If we have the need to store the freelist pointer
345 * back there or track user information then we can
346 * only use the space before that information.
347 */
348 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
349 return s->inuse;
350 /*
351 * Else we can use all the padding etc for the allocation
352 */
353 return s->size;
354#endif
355}
356
357static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
358 gfp_t flags)
359{
360 flags &= gfp_allowed_mask;
361 lockdep_trace_alloc(flags);
362 might_sleep_if(gfpflags_allow_blocking(flags));
363
364 if (should_failslab(s, flags))
365 return NULL;
366
367 return memcg_kmem_get_cache(s, flags);
368}
369
370static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
371 size_t size, void **p)
372{
373 size_t i;
374
375 flags &= gfp_allowed_mask;
376 for (i = 0; i < size; i++) {
377 void *object = p[i];
378
379 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
380 kmemleak_alloc_recursive(object, s->object_size, 1,
381 s->flags, flags);
382 kasan_slab_alloc(s, object);
383 }
384 memcg_kmem_put_cache(s);
385}
386
324#ifndef CONFIG_SLOB 387#ifndef CONFIG_SLOB
325/* 388/*
326 * The slab lists for all objects. 389 * The slab lists for all objects.
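slab.h now centralises the hooks both allocators used to open-code: slab_pre_alloc_hook() masks the gfp flags, performs the lockdep/might_sleep and failslab checks and selects the memcg cache, while slab_post_alloc_hook() notifies kmemcheck, kmemleak and KASAN for every object of a (possibly bulk) allocation and drops the memcg reference. Their position around the allocation path, reduced to a user-space shape; the hook bodies below are placeholders, not the kernel calls:

#include <stdio.h>
#include <stdlib.h>

/* Placeholder hooks: the kernel versions do failslab/memcg selection
 * before the allocation and debug-tool notification after it. */
static int pre_alloc_hook(const char *cache)
{
        printf("pre : %s (may veto the allocation)\n", cache);
        return 0;
}

static void post_alloc_hook(const char *cache, void **objs, size_t n)
{
        for (size_t i = 0; i < n; i++)
                printf("post: %s object %zu at %p\n", cache, i, objs[i]);
}

static size_t alloc_many(const char *cache, size_t obj_size,
                         size_t n, void **objs)
{
        if (pre_alloc_hook(cache))
                return 0;

        for (size_t i = 0; i < n; i++)
                objs[i] = malloc(obj_size);     /* stand-in fast path */

        post_alloc_hook(cache, objs, n);        /* once, for the batch */
        return n;
}

int main(void)
{
        void *objs[2];
        size_t got = alloc_many("demo-cache", 32, 2, objs);

        for (size_t i = 0; i < got; i++)
                free(objs[i]);
        return 0;
}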
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 065b7bdabdc3..6afb2263a5c5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
109{ 109{
110 size_t i; 110 size_t i;
111 111
112 for (i = 0; i < nr; i++) 112 for (i = 0; i < nr; i++) {
113 kmem_cache_free(s, p[i]); 113 if (s)
114 kmem_cache_free(s, p[i]);
115 else
116 kfree(p[i]);
117 }
114} 118}
115 119
116int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, 120int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
diff --git a/mm/slub.c b/mm/slub.c
index d8fbd4a6ed59..6c91324f9370 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
124#endif 124#endif
125} 125}
126 126
127static inline void *fixup_red_left(struct kmem_cache *s, void *p)
128{
129 if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
130 p += s->red_left_pad;
131
132 return p;
133}
134
127static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 135static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
128{ 136{
129#ifdef CONFIG_SLUB_CPU_PARTIAL 137#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
160 */ 168 */
161#define MAX_PARTIAL 10 169#define MAX_PARTIAL 10
162 170
163#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 171#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
164 SLAB_POISON | SLAB_STORE_USER) 172 SLAB_POISON | SLAB_STORE_USER)
165 173
166/* 174/*
175 * These debug flags cannot use CMPXCHG because there might be consistency
176 * issues when checking or reading debug information
177 */
178#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
179 SLAB_TRACE)
180
181
182/*
167 * Debugging flags that require metadata to be stored in the slab. These get 183 * Debugging flags that require metadata to be stored in the slab. These get
168 * disabled when slub_debug=O is used and a cache's min order increases with 184 * disabled when slub_debug=O is used and a cache's min order increases with
169 * metadata. 185 * metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
224 * Core slab cache functions 240 * Core slab cache functions
225 *******************************************************************/ 241 *******************************************************************/
226 242
227/* Verify that a pointer has an address that is valid within a slab page */
228static inline int check_valid_pointer(struct kmem_cache *s,
229 struct page *page, const void *object)
230{
231 void *base;
232
233 if (!object)
234 return 1;
235
236 base = page_address(page);
237 if (object < base || object >= base + page->objects * s->size ||
238 (object - base) % s->size) {
239 return 0;
240 }
241
242 return 1;
243}
244
245static inline void *get_freepointer(struct kmem_cache *s, void *object) 243static inline void *get_freepointer(struct kmem_cache *s, void *object)
246{ 244{
247 return *(void **)(object + s->offset); 245 return *(void **)(object + s->offset);
@@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
271 269
272/* Loop over all objects in a slab */ 270/* Loop over all objects in a slab */
273#define for_each_object(__p, __s, __addr, __objects) \ 271#define for_each_object(__p, __s, __addr, __objects) \
274 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 272 for (__p = fixup_red_left(__s, __addr); \
275 __p += (__s)->size) 273 __p < (__addr) + (__objects) * (__s)->size; \
274 __p += (__s)->size)
276 275
277#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ 276#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
278 for (__p = (__addr), __idx = 1; __idx <= __objects;\ 277 for (__p = fixup_red_left(__s, __addr), __idx = 1; \
279 __p += (__s)->size, __idx++) 278 __idx <= __objects; \
279 __p += (__s)->size, __idx++)
280 280
281/* Determine object index from a given position */ 281/* Determine object index from a given position */
282static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 282static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
284 return (p - addr) / s->size; 284 return (p - addr) / s->size;
285} 285}
286 286
287static inline size_t slab_ksize(const struct kmem_cache *s)
288{
289#ifdef CONFIG_SLUB_DEBUG
290 /*
291 * Debugging requires use of the padding between object
292 * and whatever may come after it.
293 */
294 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
295 return s->object_size;
296
297#endif
298 /*
299 * If we have the need to store the freelist pointer
300 * back there or track user information then we can
301 * only use the space before that information.
302 */
303 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
304 return s->inuse;
305 /*
306 * Else we can use all the padding etc for the allocation
307 */
308 return s->size;
309}
310
311static inline int order_objects(int order, unsigned long size, int reserved) 287static inline int order_objects(int order, unsigned long size, int reserved)
312{ 288{
313 return ((PAGE_SIZE << order) - reserved) / size; 289 return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
458 set_bit(slab_index(p, s, addr), map); 434 set_bit(slab_index(p, s, addr), map);
459} 435}
460 436
437static inline int size_from_object(struct kmem_cache *s)
438{
439 if (s->flags & SLAB_RED_ZONE)
440 return s->size - s->red_left_pad;
441
442 return s->size;
443}
444
445static inline void *restore_red_left(struct kmem_cache *s, void *p)
446{
447 if (s->flags & SLAB_RED_ZONE)
448 p -= s->red_left_pad;
449
450 return p;
451}
452
461/* 453/*
462 * Debug settings: 454 * Debug settings:
463 */ 455 */
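SLUB grows a red zone to the left of each object: fixup_red_left() advances the pointers produced by the for_each_object*() iterators past red_left_pad, while restore_red_left() and size_from_object() let the checkers step back and validate both zones, so an underflowing write is reported just like an overflowing one. The pointer bookkeeping in miniature, with invented pad and layout values:

#include <stdio.h>
#include <stdint.h>

#define RED_LEFT_PAD    16      /* hypothetical left red zone */
#define OBJECT_SIZE     32
#define SLOT_SIZE       (RED_LEFT_PAD + OBJECT_SIZE)

/* What the allocator hands out: skip the left red zone. */
static uint8_t *fixup_red_left(uint8_t *slot)
{
        return slot + RED_LEFT_PAD;
}

/* What the checker works on: step back to the slot start. */
static uint8_t *restore_red_left(uint8_t *object)
{
        return object - RED_LEFT_PAD;
}

int main(void)
{
        uint8_t slab[4 * SLOT_SIZE];
        uint8_t *slot = slab + 2 * SLOT_SIZE;   /* third slot in the slab */
        uint8_t *object = fixup_red_left(slot);

        printf("slot at +%td, object at +%td, checker sees +%td\n",
               slot - slab, object - slab, restore_red_left(object) - slab);
        return 0;
}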
@@ -491,6 +483,26 @@ static inline void metadata_access_disable(void)
491/* 483/*
492 * Object debugging 484 * Object debugging
493 */ 485 */
486
487/* Verify that a pointer has an address that is valid within a slab page */
488static inline int check_valid_pointer(struct kmem_cache *s,
489 struct page *page, void *object)
490{
491 void *base;
492
493 if (!object)
494 return 1;
495
496 base = page_address(page);
497 object = restore_red_left(s, object);
498 if (object < base || object >= base + page->objects * s->size ||
499 (object - base) % s->size) {
500 return 0;
501 }
502
503 return 1;
504}
505
494static void print_section(char *text, u8 *addr, unsigned int length) 506static void print_section(char *text, u8 *addr, unsigned int length)
495{ 507{
496 metadata_access_enable(); 508 metadata_access_enable();
@@ -630,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
630 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 642 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
631 p, p - addr, get_freepointer(s, p)); 643 p, p - addr, get_freepointer(s, p));
632 644
633 if (p > addr + 16) 645 if (s->flags & SLAB_RED_ZONE)
646 print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
647 else if (p > addr + 16)
634 print_section("Bytes b4 ", p - 16, 16); 648 print_section("Bytes b4 ", p - 16, 16);
635 649
636 print_section("Object ", p, min_t(unsigned long, s->object_size, 650 print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
647 if (s->flags & SLAB_STORE_USER) 661 if (s->flags & SLAB_STORE_USER)
648 off += 2 * sizeof(struct track); 662 off += 2 * sizeof(struct track);
649 663
650 if (off != s->size) 664 if (off != size_from_object(s))
651 /* Beginning of the filler is the free pointer */ 665 /* Beginning of the filler is the free pointer */
652 print_section("Padding ", p + off, s->size - off); 666 print_section("Padding ", p + off, size_from_object(s) - off);
653 667
654 dump_stack(); 668 dump_stack();
655} 669}
@@ -679,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
679{ 693{
680 u8 *p = object; 694 u8 *p = object;
681 695
696 if (s->flags & SLAB_RED_ZONE)
697 memset(p - s->red_left_pad, val, s->red_left_pad);
698
682 if (s->flags & __OBJECT_POISON) { 699 if (s->flags & __OBJECT_POISON) {
683 memset(p, POISON_FREE, s->object_size - 1); 700 memset(p, POISON_FREE, s->object_size - 1);
684 p[s->object_size - 1] = POISON_END; 701 p[s->object_size - 1] = POISON_END;
@@ -771,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
771 /* We also have user information there */ 788 /* We also have user information there */
772 off += 2 * sizeof(struct track); 789 off += 2 * sizeof(struct track);
773 790
774 if (s->size == off) 791 if (size_from_object(s) == off)
775 return 1; 792 return 1;
776 793
777 return check_bytes_and_report(s, page, p, "Object padding", 794 return check_bytes_and_report(s, page, p, "Object padding",
778 p + off, POISON_INUSE, s->size - off); 795 p + off, POISON_INUSE, size_from_object(s) - off);
779} 796}
780 797
781/* Check the pad bytes at the end of a slab page */ 798/* Check the pad bytes at the end of a slab page */
@@ -820,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
820 837
821 if (s->flags & SLAB_RED_ZONE) { 838 if (s->flags & SLAB_RED_ZONE) {
822 if (!check_bytes_and_report(s, page, object, "Redzone", 839 if (!check_bytes_and_report(s, page, object, "Redzone",
840 object - s->red_left_pad, val, s->red_left_pad))
841 return 0;
842
843 if (!check_bytes_and_report(s, page, object, "Redzone",
823 endobject, val, s->inuse - s->object_size)) 844 endobject, val, s->inuse - s->object_size))
824 return 0; 845 return 0;
825 } else { 846 } else {
@@ -1031,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1031 init_tracking(s, object); 1052 init_tracking(s, object);
1032} 1053}
1033 1054
1034static noinline int alloc_debug_processing(struct kmem_cache *s, 1055static inline int alloc_consistency_checks(struct kmem_cache *s,
1035 struct page *page, 1056 struct page *page,
1036 void *object, unsigned long addr) 1057 void *object, unsigned long addr)
1037{ 1058{
1038 if (!check_slab(s, page)) 1059 if (!check_slab(s, page))
1039 goto bad; 1060 return 0;
1040 1061
1041 if (!check_valid_pointer(s, page, object)) { 1062 if (!check_valid_pointer(s, page, object)) {
1042 object_err(s, page, object, "Freelist Pointer check fails"); 1063 object_err(s, page, object, "Freelist Pointer check fails");
1043 goto bad; 1064 return 0;
1044 } 1065 }
1045 1066
1046 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1067 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1047 goto bad; 1068 return 0;
1069
1070 return 1;
1071}
1072
1073static noinline int alloc_debug_processing(struct kmem_cache *s,
1074 struct page *page,
1075 void *object, unsigned long addr)
1076{
1077 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1078 if (!alloc_consistency_checks(s, page, object, addr))
1079 goto bad;
1080 }
1048 1081
1049 /* Success perform special debug activities for allocs */ 1082 /* Success perform special debug activities for allocs */
1050 if (s->flags & SLAB_STORE_USER) 1083 if (s->flags & SLAB_STORE_USER)
@@ -1067,37 +1100,21 @@ bad:
1067 return 0; 1100 return 0;
1068} 1101}
1069 1102
1070/* Supports checking bulk free of a constructed freelist */ 1103static inline int free_consistency_checks(struct kmem_cache *s,
1071static noinline struct kmem_cache_node *free_debug_processing( 1104 struct page *page, void *object, unsigned long addr)
1072 struct kmem_cache *s, struct page *page,
1073 void *head, void *tail, int bulk_cnt,
1074 unsigned long addr, unsigned long *flags)
1075{ 1105{
1076 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1077 void *object = head;
1078 int cnt = 0;
1079
1080 spin_lock_irqsave(&n->list_lock, *flags);
1081 slab_lock(page);
1082
1083 if (!check_slab(s, page))
1084 goto fail;
1085
1086next_object:
1087 cnt++;
1088
1089 if (!check_valid_pointer(s, page, object)) { 1106 if (!check_valid_pointer(s, page, object)) {
1090 slab_err(s, page, "Invalid object pointer 0x%p", object); 1107 slab_err(s, page, "Invalid object pointer 0x%p", object);
1091 goto fail; 1108 return 0;
1092 } 1109 }
1093 1110
1094 if (on_freelist(s, page, object)) { 1111 if (on_freelist(s, page, object)) {
1095 object_err(s, page, object, "Object already free"); 1112 object_err(s, page, object, "Object already free");
1096 goto fail; 1113 return 0;
1097 } 1114 }
1098 1115
1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1116 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1100 goto out; 1117 return 0;
1101 1118
1102 if (unlikely(s != page->slab_cache)) { 1119 if (unlikely(s != page->slab_cache)) {
1103 if (!PageSlab(page)) { 1120 if (!PageSlab(page)) {
@@ -1110,7 +1127,37 @@ next_object:
1110 } else 1127 } else
1111 object_err(s, page, object, 1128 object_err(s, page, object,
1112 "page slab pointer corrupt."); 1129 "page slab pointer corrupt.");
1113 goto fail; 1130 return 0;
1131 }
1132 return 1;
1133}
1134
1135/* Supports checking bulk free of a constructed freelist */
1136static noinline int free_debug_processing(
1137 struct kmem_cache *s, struct page *page,
1138 void *head, void *tail, int bulk_cnt,
1139 unsigned long addr)
1140{
1141 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1142 void *object = head;
1143 int cnt = 0;
1144 unsigned long uninitialized_var(flags);
1145 int ret = 0;
1146
1147 spin_lock_irqsave(&n->list_lock, flags);
1148 slab_lock(page);
1149
1150 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1151 if (!check_slab(s, page))
1152 goto out;
1153 }
1154
1155next_object:
1156 cnt++;
1157
1158 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1159 if (!free_consistency_checks(s, page, object, addr))
1160 goto out;
1114 } 1161 }
1115 1162
1116 if (s->flags & SLAB_STORE_USER) 1163 if (s->flags & SLAB_STORE_USER)
@@ -1124,23 +1171,18 @@ next_object:
1124 object = get_freepointer(s, object); 1171 object = get_freepointer(s, object);
1125 goto next_object; 1172 goto next_object;
1126 } 1173 }
1174 ret = 1;
1175
1127out: 1176out:
1128 if (cnt != bulk_cnt) 1177 if (cnt != bulk_cnt)
1129 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", 1178 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1130 bulk_cnt, cnt); 1179 bulk_cnt, cnt);
1131 1180
1132 slab_unlock(page); 1181 slab_unlock(page);
1133 /* 1182 spin_unlock_irqrestore(&n->list_lock, flags);
1134 * Keep node_lock to preserve integrity 1183 if (!ret)
1135 * until the object is actually freed 1184 slab_fix(s, "Object at 0x%p not freed", object);
1136 */ 1185 return ret;
1137 return n;
1138
1139fail:
1140 slab_unlock(page);
1141 spin_unlock_irqrestore(&n->list_lock, *flags);
1142 slab_fix(s, "Object at 0x%p not freed", object);
1143 return NULL;
1144} 1186}
1145 1187
1146static int __init setup_slub_debug(char *str) 1188static int __init setup_slub_debug(char *str)
@@ -1172,7 +1214,7 @@ static int __init setup_slub_debug(char *str)
1172 for (; *str && *str != ','; str++) { 1214 for (; *str && *str != ','; str++) {
1173 switch (tolower(*str)) { 1215 switch (tolower(*str)) {
1174 case 'f': 1216 case 'f':
1175 slub_debug |= SLAB_DEBUG_FREE; 1217 slub_debug |= SLAB_CONSISTENCY_CHECKS;
1176 break; 1218 break;
1177 case 'z': 1219 case 'z':
1178 slub_debug |= SLAB_RED_ZONE; 1220 slub_debug |= SLAB_RED_ZONE;
@@ -1231,10 +1273,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
1231static inline int alloc_debug_processing(struct kmem_cache *s, 1273static inline int alloc_debug_processing(struct kmem_cache *s,
1232 struct page *page, void *object, unsigned long addr) { return 0; } 1274 struct page *page, void *object, unsigned long addr) { return 0; }
1233 1275
1234static inline struct kmem_cache_node *free_debug_processing( 1276static inline int free_debug_processing(
1235 struct kmem_cache *s, struct page *page, 1277 struct kmem_cache *s, struct page *page,
1236 void *head, void *tail, int bulk_cnt, 1278 void *head, void *tail, int bulk_cnt,
1237 unsigned long addr, unsigned long *flags) { return NULL; } 1279 unsigned long addr) { return 0; }
1238 1280
1239static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1281static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1240 { return 1; } 1282 { return 1; }
@@ -1281,36 +1323,6 @@ static inline void kfree_hook(const void *x)
1281 kasan_kfree_large(x); 1323 kasan_kfree_large(x);
1282} 1324}
1283 1325
1284static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1285 gfp_t flags)
1286{
1287 flags &= gfp_allowed_mask;
1288 lockdep_trace_alloc(flags);
1289 might_sleep_if(gfpflags_allow_blocking(flags));
1290
1291 if (should_failslab(s->object_size, flags, s->flags))
1292 return NULL;
1293
1294 return memcg_kmem_get_cache(s, flags);
1295}
1296
1297static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1298 size_t size, void **p)
1299{
1300 size_t i;
1301
1302 flags &= gfp_allowed_mask;
1303 for (i = 0; i < size; i++) {
1304 void *object = p[i];
1305
1306 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1307 kmemleak_alloc_recursive(object, s->object_size, 1,
1308 s->flags, flags);
1309 kasan_slab_alloc(s, object);
1310 }
1311 memcg_kmem_put_cache(s);
1312}
1313
1314static inline void slab_free_hook(struct kmem_cache *s, void *x) 1326static inline void slab_free_hook(struct kmem_cache *s, void *x)
1315{ 1327{
1316 kmemleak_free_recursive(x, s->flags); 1328 kmemleak_free_recursive(x, s->flags);
@@ -1470,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1470 set_freepointer(s, p, NULL); 1482 set_freepointer(s, p, NULL);
1471 } 1483 }
1472 1484
1473 page->freelist = start; 1485 page->freelist = fixup_red_left(s, start);
1474 page->inuse = page->objects; 1486 page->inuse = page->objects;
1475 page->frozen = 1; 1487 page->frozen = 1;
1476 1488
@@ -1506,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1506 int order = compound_order(page); 1518 int order = compound_order(page);
1507 int pages = 1 << order; 1519 int pages = 1 << order;
1508 1520
1509 if (kmem_cache_debug(s)) { 1521 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1510 void *p; 1522 void *p;
1511 1523
1512 slab_pad_check(s, page); 1524 slab_pad_check(s, page);
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2224 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 2236 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2225 return; 2237 return;
2226 2238
2227 pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2239 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2228 nid, gfpflags); 2240 nid, gfpflags, &gfpflags);
2229 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", 2241 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2230 s->name, s->object_size, s->size, oo_order(s->oo), 2242 s->name, s->object_size, s->size, oo_order(s->oo),
2231 oo_order(s->min)); 2243 oo_order(s->min));
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2642 stat(s, FREE_SLOWPATH); 2654 stat(s, FREE_SLOWPATH);
2643 2655
2644 if (kmem_cache_debug(s) && 2656 if (kmem_cache_debug(s) &&
2645 !(n = free_debug_processing(s, page, head, tail, cnt, 2657 !free_debug_processing(s, page, head, tail, cnt, addr))
2646 addr, &flags)))
2647 return; 2658 return;
2648 2659
2649 do { 2660 do {
@@ -2815,6 +2826,7 @@ struct detached_freelist {
2815 void *tail; 2826 void *tail;
2816 void *freelist; 2827 void *freelist;
2817 int cnt; 2828 int cnt;
2829 struct kmem_cache *s;
2818}; 2830};
2819 2831
2820/* 2832/*
@@ -2829,26 +2841,45 @@ struct detached_freelist {
2829 * synchronization primitive. Look ahead in the array is limited due 2841 * synchronization primitive. Look ahead in the array is limited due
2830 * to performance reasons. 2842 * to performance reasons.
2831 */ 2843 */
2832static int build_detached_freelist(struct kmem_cache *s, size_t size, 2844static inline
2833 void **p, struct detached_freelist *df) 2845int build_detached_freelist(struct kmem_cache *s, size_t size,
2846 void **p, struct detached_freelist *df)
2834{ 2847{
2835 size_t first_skipped_index = 0; 2848 size_t first_skipped_index = 0;
2836 int lookahead = 3; 2849 int lookahead = 3;
2837 void *object; 2850 void *object;
2851 struct page *page;
2838 2852
2839 /* Always re-init detached_freelist */ 2853 /* Always re-init detached_freelist */
2840 df->page = NULL; 2854 df->page = NULL;
2841 2855
2842 do { 2856 do {
2843 object = p[--size]; 2857 object = p[--size];
2858 /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
2844 } while (!object && size); 2859 } while (!object && size);
2845 2860
2846 if (!object) 2861 if (!object)
2847 return 0; 2862 return 0;
2848 2863
2864 page = virt_to_head_page(object);
2865 if (!s) {
2866 /* Handle kalloc'ed objects */
2867 if (unlikely(!PageSlab(page))) {
2868 BUG_ON(!PageCompound(page));
2869 kfree_hook(object);
2870 __free_kmem_pages(page, compound_order(page));
2871 p[size] = NULL; /* mark object processed */
2872 return size;
2873 }
2874 /* Derive kmem_cache from object */
2875 df->s = page->slab_cache;
2876 } else {
2877 df->s = cache_from_obj(s, object); /* Support for memcg */
2878 }
2879
2849 /* Start new detached freelist */ 2880 /* Start new detached freelist */
2850 set_freepointer(s, object, NULL); 2881 df->page = page;
2851 df->page = virt_to_head_page(object); 2882 set_freepointer(df->s, object, NULL);
2852 df->tail = object; 2883 df->tail = object;
2853 df->freelist = object; 2884 df->freelist = object;
2854 p[size] = NULL; /* mark object processed */ 2885 p[size] = NULL; /* mark object processed */
@@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2862 /* df->page is always set at this point */ 2893 /* df->page is always set at this point */
2863 if (df->page == virt_to_head_page(object)) { 2894 if (df->page == virt_to_head_page(object)) {
2864 /* Opportunity build freelist */ 2895 /* Opportunity build freelist */
2865 set_freepointer(s, object, df->freelist); 2896 set_freepointer(df->s, object, df->freelist);
2866 df->freelist = object; 2897 df->freelist = object;
2867 df->cnt++; 2898 df->cnt++;
2868 p[size] = NULL; /* mark object processed */ 2899 p[size] = NULL; /* mark object processed */
@@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2881 return first_skipped_index; 2912 return first_skipped_index;
2882} 2913}
2883 2914
2884
2885/* Note that interrupts must be enabled when calling this function. */ 2915/* Note that interrupts must be enabled when calling this function. */
2886void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) 2916void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2887{ 2917{
2888 if (WARN_ON(!size)) 2918 if (WARN_ON(!size))
2889 return; 2919 return;
2890 2920
2891 do { 2921 do {
2892 struct detached_freelist df; 2922 struct detached_freelist df;
2893 struct kmem_cache *s;
2894
2895 /* Support for memcg */
2896 s = cache_from_obj(orig_s, p[size - 1]);
2897 2923
2898 size = build_detached_freelist(s, size, p, &df); 2924 size = build_detached_freelist(s, size, p, &df);
2899 if (unlikely(!df.page)) 2925 if (unlikely(!df.page))
2900 continue; 2926 continue;
2901 2927
2902 slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); 2928 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
2903 } while (likely(size)); 2929 } while (likely(size));
2904} 2930}
2905EXPORT_SYMBOL(kmem_cache_free_bulk); 2931EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3285,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3285 */ 3311 */
3286 size += 2 * sizeof(struct track); 3312 size += 2 * sizeof(struct track);
3287 3313
3288 if (flags & SLAB_RED_ZONE) 3314 if (flags & SLAB_RED_ZONE) {
3289 /* 3315 /*
3290 * Add some empty padding so that we can catch 3316 * Add some empty padding so that we can catch
3291 * overwrites from earlier objects rather than let 3317 * overwrites from earlier objects rather than let
@@ -3294,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3294 * of the object. 3320 * of the object.
3295 */ 3321 */
3296 size += sizeof(void *); 3322 size += sizeof(void *);
3323
3324 s->red_left_pad = sizeof(void *);
3325 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
3326 size += s->red_left_pad;
3327 }
3297#endif 3328#endif
3298 3329
3299 /* 3330 /*
@@ -3357,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3357 3388
3358#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3389#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3359 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3390 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3360 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3391 if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3361 /* Enable fast mode */ 3392 /* Enable fast mode */
3362 s->flags |= __CMPXCHG_DOUBLE; 3393 s->flags |= __CMPXCHG_DOUBLE;
3363#endif 3394#endif
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
4812 4843
4813static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4844static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4814{ 4845{
4815 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4846 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
4816} 4847}
4817 4848
4818static ssize_t sanity_checks_store(struct kmem_cache *s, 4849static ssize_t sanity_checks_store(struct kmem_cache *s,
4819 const char *buf, size_t length) 4850 const char *buf, size_t length)
4820{ 4851{
4821 s->flags &= ~SLAB_DEBUG_FREE; 4852 s->flags &= ~SLAB_CONSISTENCY_CHECKS;
4822 if (buf[0] == '1') { 4853 if (buf[0] == '1') {
4823 s->flags &= ~__CMPXCHG_DOUBLE; 4854 s->flags &= ~__CMPXCHG_DOUBLE;
4824 s->flags |= SLAB_DEBUG_FREE; 4855 s->flags |= SLAB_CONSISTENCY_CHECKS;
4825 } 4856 }
4826 return length; 4857 return length;
4827} 4858}
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4865 4896
4866 s->flags &= ~SLAB_RED_ZONE; 4897 s->flags &= ~SLAB_RED_ZONE;
4867 if (buf[0] == '1') { 4898 if (buf[0] == '1') {
4868 s->flags &= ~__CMPXCHG_DOUBLE;
4869 s->flags |= SLAB_RED_ZONE; 4899 s->flags |= SLAB_RED_ZONE;
4870 } 4900 }
4871 calculate_sizes(s, -1); 4901 calculate_sizes(s, -1);
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
4886 4916
4887 s->flags &= ~SLAB_POISON; 4917 s->flags &= ~SLAB_POISON;
4888 if (buf[0] == '1') { 4918 if (buf[0] == '1') {
4889 s->flags &= ~__CMPXCHG_DOUBLE;
4890 s->flags |= SLAB_POISON; 4919 s->flags |= SLAB_POISON;
4891 } 4920 }
4892 calculate_sizes(s, -1); 4921 calculate_sizes(s, -1);
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s)
5356 *p++ = 'd'; 5385 *p++ = 'd';
5357 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5386 if (s->flags & SLAB_RECLAIM_ACCOUNT)
5358 *p++ = 'a'; 5387 *p++ = 'a';
5359 if (s->flags & SLAB_DEBUG_FREE) 5388 if (s->flags & SLAB_CONSISTENCY_CHECKS)
5360 *p++ = 'F'; 5389 *p++ = 'F';
5361 if (!(s->flags & SLAB_NOTRACK)) 5390 if (!(s->flags & SLAB_NOTRACK))
5362 *p++ = 't'; 5391 *p++ = 't';
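
The mm/slub.c hunks above add a red zone to the *left* of each object (red_left_pad), so the pointers SLUB hands out no longer coincide with the start of their slot and every walk over a slab has to shift by red_left_pad. A minimal sketch of the resulting pointer arithmetic, not part of the patch and using stand-in type and flag names:

        #include <stddef.h>

        /* Stand-ins; the real kmem_cache and SLAB_RED_ZONE live in slab code. */
        #define RED_ZONE_FLAG           0x1UL

        struct cache_sketch {
                unsigned long flags;
                size_t size;            /* stride between slots in the slab         */
                size_t red_left_pad;    /* new: red zone placed before the object   */
        };

        /*
         * With red-zoning on, each slot is laid out as
         *   [ left red zone | object | right red zone / free ptr | track data ]
         * and callers see a pointer to "object".
         */
        static void *fixup_red_left_sketch(const struct cache_sketch *s, void *p)
        {
                if (s->flags & RED_ZONE_FLAG)
                        p = (char *)p + s->red_left_pad;  /* skip the left red zone */
                return p;
        }

        static void *restore_red_left_sketch(const struct cache_sketch *s, void *p)
        {
                if (s->flags & RED_ZONE_FLAG)
                        p = (char *)p - s->red_left_pad;  /* back to the slot start */
                return p;
        }

        /* check_valid_pointer() above now validates the slot start, not the object. */
        static int pointer_in_slab_sketch(const struct cache_sketch *s,
                                          void *slab_base, size_t objects, void *object)
        {
                char *base = slab_base;
                char *p = restore_red_left_sketch(s, object);

                return p >= base && p < base + objects * s->size &&
                       (size_t)(p - base) % s->size == 0;
        }
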
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
519static int 519static int
520invalidate_complete_page2(struct address_space *mapping, struct page *page) 520invalidate_complete_page2(struct address_space *mapping, struct page *page)
521{ 521{
522 struct mem_cgroup *memcg;
523 unsigned long flags; 522 unsigned long flags;
524 523
525 if (page->mapping != mapping) 524 if (page->mapping != mapping)
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
528 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) 527 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
529 return 0; 528 return 0;
530 529
531 memcg = mem_cgroup_begin_page_stat(page);
532 spin_lock_irqsave(&mapping->tree_lock, flags); 530 spin_lock_irqsave(&mapping->tree_lock, flags);
533 if (PageDirty(page)) 531 if (PageDirty(page))
534 goto failed; 532 goto failed;
535 533
536 BUG_ON(page_has_private(page)); 534 BUG_ON(page_has_private(page));
537 __delete_from_page_cache(page, NULL, memcg); 535 __delete_from_page_cache(page, NULL);
538 spin_unlock_irqrestore(&mapping->tree_lock, flags); 536 spin_unlock_irqrestore(&mapping->tree_lock, flags);
539 mem_cgroup_end_page_stat(memcg);
540 537
541 if (mapping->a_ops->freepage) 538 if (mapping->a_ops->freepage)
542 mapping->a_ops->freepage(page); 539 mapping->a_ops->freepage(page);
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
545 return 1; 542 return 1;
546failed: 543failed:
547 spin_unlock_irqrestore(&mapping->tree_lock, flags); 544 spin_unlock_irqrestore(&mapping->tree_lock, flags);
548 mem_cgroup_end_page_stat(memcg);
549 return 0; 545 return 0;
550} 546}
551 547
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71b1c29948db..dd984470248f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
195{ 195{
196 unsigned long nr; 196 unsigned long nr;
197 197
198 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 198 nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
199 zone_page_state(zone, NR_INACTIVE_FILE) + 199 zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
200 zone_page_state(zone, NR_ISOLATED_FILE); 200 zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
201 201
202 if (get_nr_swap_pages() > 0) 202 if (get_nr_swap_pages() > 0)
203 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 203 nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
204 zone_page_state(zone, NR_INACTIVE_ANON) + 204 zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
205 zone_page_state(zone, NR_ISOLATED_ANON); 205 zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
206 206
207 return nr; 207 return nr;
208} 208}
209 209
210bool zone_reclaimable(struct zone *zone) 210bool zone_reclaimable(struct zone *zone)
211{ 211{
212 return zone_page_state(zone, NR_PAGES_SCANNED) < 212 return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
213 zone_reclaimable_pages(zone) * 6; 213 zone_reclaimable_pages(zone) * 6;
214} 214}
215 215
216static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 216unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
217{ 217{
218 if (!mem_cgroup_disabled()) 218 if (!mem_cgroup_disabled())
219 return mem_cgroup_get_lru_size(lruvec, lru); 219 return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
228{ 228{
229 size_t size = sizeof(*shrinker->nr_deferred); 229 size_t size = sizeof(*shrinker->nr_deferred);
230 230
231 /*
232 * If we only have one possible node in the system anyway, save
233 * ourselves the trouble and disable NUMA aware behavior. This way we
234 * will save memory and some small loop time later.
235 */
236 if (nr_node_ids == 1)
237 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
238
239 if (shrinker->flags & SHRINKER_NUMA_AWARE) 231 if (shrinker->flags & SHRINKER_NUMA_AWARE)
240 size *= nr_node_ids; 232 size *= nr_node_ids;
241 233
@@ -611,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
611 bool reclaimed) 603 bool reclaimed)
612{ 604{
613 unsigned long flags; 605 unsigned long flags;
614 struct mem_cgroup *memcg;
615 606
616 BUG_ON(!PageLocked(page)); 607 BUG_ON(!PageLocked(page));
617 BUG_ON(mapping != page_mapping(page)); 608 BUG_ON(mapping != page_mapping(page));
618 609
619 memcg = mem_cgroup_begin_page_stat(page);
620 spin_lock_irqsave(&mapping->tree_lock, flags); 610 spin_lock_irqsave(&mapping->tree_lock, flags);
621 /* 611 /*
622 * The non racy check for a busy page. 612 * The non racy check for a busy page.
@@ -656,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
656 mem_cgroup_swapout(page, swap); 646 mem_cgroup_swapout(page, swap);
657 __delete_from_swap_cache(page); 647 __delete_from_swap_cache(page);
658 spin_unlock_irqrestore(&mapping->tree_lock, flags); 648 spin_unlock_irqrestore(&mapping->tree_lock, flags);
659 mem_cgroup_end_page_stat(memcg);
660 swapcache_free(swap); 649 swapcache_free(swap);
661 } else { 650 } else {
662 void (*freepage)(struct page *); 651 void (*freepage)(struct page *);
@@ -682,9 +671,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
682 if (reclaimed && page_is_file_cache(page) && 671 if (reclaimed && page_is_file_cache(page) &&
683 !mapping_exiting(mapping) && !dax_mapping(mapping)) 672 !mapping_exiting(mapping) && !dax_mapping(mapping))
684 shadow = workingset_eviction(mapping, page); 673 shadow = workingset_eviction(mapping, page);
685 __delete_from_page_cache(page, shadow, memcg); 674 __delete_from_page_cache(page, shadow);
686 spin_unlock_irqrestore(&mapping->tree_lock, flags); 675 spin_unlock_irqrestore(&mapping->tree_lock, flags);
687 mem_cgroup_end_page_stat(memcg);
688 676
689 if (freepage != NULL) 677 if (freepage != NULL)
690 freepage(page); 678 freepage(page);
@@ -694,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
694 682
695cannot_free: 683cannot_free:
696 spin_unlock_irqrestore(&mapping->tree_lock, flags); 684 spin_unlock_irqrestore(&mapping->tree_lock, flags);
697 mem_cgroup_end_page_stat(memcg);
698 return 0; 685 return 0;
699} 686}
700 687
@@ -1931,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
1931 unsigned long inactive; 1918 unsigned long inactive;
1932 unsigned long active; 1919 unsigned long active;
1933 1920
1934 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1921 inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
1935 active = get_lru_size(lruvec, LRU_ACTIVE_FILE); 1922 active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
1936 1923
1937 return active > inactive; 1924 return active > inactive;
1938} 1925}
@@ -2071,7 +2058,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2071 * system is under heavy pressure. 2058 * system is under heavy pressure.
2072 */ 2059 */
2073 if (!inactive_file_is_low(lruvec) && 2060 if (!inactive_file_is_low(lruvec) &&
2074 get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { 2061 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
2075 scan_balance = SCAN_FILE; 2062 scan_balance = SCAN_FILE;
2076 goto out; 2063 goto out;
2077 } 2064 }
@@ -2097,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2097 * anon in [0], file in [1] 2084 * anon in [0], file in [1]
2098 */ 2085 */
2099 2086
2100 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + 2087 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
2101 get_lru_size(lruvec, LRU_INACTIVE_ANON); 2088 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
2102 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 2089 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
2103 get_lru_size(lruvec, LRU_INACTIVE_FILE); 2090 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
2104 2091
2105 spin_lock_irq(&zone->lru_lock); 2092 spin_lock_irq(&zone->lru_lock);
2106 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 2093 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2125,7 @@ out:
2138 unsigned long size; 2125 unsigned long size;
2139 unsigned long scan; 2126 unsigned long scan;
2140 2127
2141 size = get_lru_size(lruvec, lru); 2128 size = lruvec_lru_size(lruvec, lru);
2142 scan = size >> sc->priority; 2129 scan = size >> sc->priority;
2143 2130
2144 if (!scan && pass && force_scan) 2131 if (!scan && pass && force_scan)
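
The zone_reclaimable() hunks switch from zone_page_state() to zone_page_state_snapshot(). The snapshot variant also folds in per-CPU deltas that have not yet been flushed into the global counter, which matters when the value is compared against a stall threshold. A rough sketch of that difference, with stand-in names rather than the real struct zone layout:

        #define SKETCH_NR_CPUS  8

        struct zone_stat_sketch {
                long global;                            /* already folded in       */
                int  percpu_diff[SKETCH_NR_CPUS];       /* pending per-CPU deltas  */
        };

        static unsigned long state_fast(const struct zone_stat_sketch *st)
        {
                long x = st->global;                    /* cheap but may lag       */

                return x < 0 ? 0 : (unsigned long)x;
        }

        static unsigned long state_snapshot(const struct zone_stat_sketch *st)
        {
                long x = st->global;
                int cpu;

                for (cpu = 0; cpu < SKETCH_NR_CPUS; cpu++)
                        x += st->percpu_diff[cpu];      /* fold in pending deltas  */

                return x < 0 ? 0 : (unsigned long)x;
        }
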
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b373..69ce64f7b8d7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
924#endif 924#endif
925 925
926#ifdef CONFIG_PROC_FS 926#ifdef CONFIG_PROC_FS
927static char * const migratetype_names[MIGRATE_TYPES] = {
928 "Unmovable",
929 "Movable",
930 "Reclaimable",
931 "HighAtomic",
932#ifdef CONFIG_CMA
933 "CMA",
934#endif
935#ifdef CONFIG_MEMORY_ISOLATION
936 "Isolate",
937#endif
938};
939
940static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 927static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
941 struct zone *zone) 928 struct zone *zone)
942{ 929{
@@ -1133,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1133#ifdef CONFIG_PAGE_OWNER 1120#ifdef CONFIG_PAGE_OWNER
1134 int mtype; 1121 int mtype;
1135 1122
1136 if (!page_owner_inited) 1123 if (!static_branch_unlikely(&page_owner_inited))
1137 return; 1124 return;
1138 1125
1139 drain_all_pages(NULL); 1126 drain_all_pages(NULL);
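
The page_owner check above now goes through a static branch, so the common "page_owner disabled" case compiles down to a patched no-op jump rather than a memory load and test. A minimal sketch of that pattern; the key name matches the hunk, while the enable helper is illustrative only (the real key is defined and flipped elsewhere in mm):

        #include <linux/jump_label.h>
        #include <linux/types.h>

        DEFINE_STATIC_KEY_FALSE(page_owner_inited);

        static void enable_demo(void)
        {
                static_branch_enable(&page_owner_inited);  /* flipped once at init */
        }

        static bool page_owner_active(void)
        {
                return static_branch_unlikely(&page_owner_inited);
        }
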
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..6130ba0b2641 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
152 * refault distance will immediately activate the refaulting page. 152 * refault distance will immediately activate the refaulting page.
153 */ 153 */
154 154
155static void *pack_shadow(unsigned long eviction, struct zone *zone) 155#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
156 ZONES_SHIFT + NODES_SHIFT + \
157 MEM_CGROUP_ID_SHIFT)
158#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
159
160/*
161 * Eviction timestamps need to be able to cover the full range of
162 * actionable refaults. However, bits are tight in the radix tree
163 * entry, and after storing the identifier for the lruvec there might
164 * not be enough left to represent every single actionable refault. In
165 * that case, we have to sacrifice granularity for distance, and group
166 * evictions into coarser buckets by shaving off lower timestamp bits.
167 */
168static unsigned int bucket_order __read_mostly;
169
170static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
156{ 171{
172 eviction >>= bucket_order;
173 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
157 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); 174 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
158 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); 175 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
159 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); 176 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
161 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 178 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
162} 179}
163 180
164static void unpack_shadow(void *shadow, 181static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
165 struct zone **zone, 182 unsigned long *evictionp)
166 unsigned long *distance)
167{ 183{
168 unsigned long entry = (unsigned long)shadow; 184 unsigned long entry = (unsigned long)shadow;
169 unsigned long eviction; 185 int memcgid, nid, zid;
170 unsigned long refault;
171 unsigned long mask;
172 int zid, nid;
173 186
174 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; 187 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
175 zid = entry & ((1UL << ZONES_SHIFT) - 1); 188 zid = entry & ((1UL << ZONES_SHIFT) - 1);
176 entry >>= ZONES_SHIFT; 189 entry >>= ZONES_SHIFT;
177 nid = entry & ((1UL << NODES_SHIFT) - 1); 190 nid = entry & ((1UL << NODES_SHIFT) - 1);
178 entry >>= NODES_SHIFT; 191 entry >>= NODES_SHIFT;
179 eviction = entry; 192 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
193 entry >>= MEM_CGROUP_ID_SHIFT;
180 194
181 *zone = NODE_DATA(nid)->node_zones + zid; 195 *memcgidp = memcgid;
182 196 *zonep = NODE_DATA(nid)->node_zones + zid;
183 refault = atomic_long_read(&(*zone)->inactive_age); 197 *evictionp = entry << bucket_order;
184 mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
185 RADIX_TREE_EXCEPTIONAL_SHIFT);
186 /*
187 * The unsigned subtraction here gives an accurate distance
188 * across inactive_age overflows in most cases.
189 *
190 * There is a special case: usually, shadow entries have a
191 * short lifetime and are either refaulted or reclaimed along
192 * with the inode before they get too old. But it is not
193 * impossible for the inactive_age to lap a shadow entry in
194 * the field, which can then can result in a false small
195 * refault distance, leading to a false activation should this
196 * old entry actually refault again. However, earlier kernels
197 * used to deactivate unconditionally with *every* reclaim
198 * invocation for the longest time, so the occasional
199 * inappropriate activation leading to pressure on the active
200 * list is not a problem.
201 */
202 *distance = (refault - eviction) & mask;
203} 198}
204 199
205/** 200/**
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
212 */ 207 */
213void *workingset_eviction(struct address_space *mapping, struct page *page) 208void *workingset_eviction(struct address_space *mapping, struct page *page)
214{ 209{
210 struct mem_cgroup *memcg = page_memcg(page);
215 struct zone *zone = page_zone(page); 211 struct zone *zone = page_zone(page);
212 int memcgid = mem_cgroup_id(memcg);
216 unsigned long eviction; 213 unsigned long eviction;
214 struct lruvec *lruvec;
217 215
218 eviction = atomic_long_inc_return(&zone->inactive_age); 216 /* Page is fully exclusive and pins page->mem_cgroup */
219 return pack_shadow(eviction, zone); 217 VM_BUG_ON_PAGE(PageLRU(page), page);
218 VM_BUG_ON_PAGE(page_count(page), page);
219 VM_BUG_ON_PAGE(!PageLocked(page), page);
220
221 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
222 eviction = atomic_long_inc_return(&lruvec->inactive_age);
223 return pack_shadow(memcgid, zone, eviction);
220} 224}
221 225
222/** 226/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
231bool workingset_refault(void *shadow) 235bool workingset_refault(void *shadow)
232{ 236{
233 unsigned long refault_distance; 237 unsigned long refault_distance;
238 unsigned long active_file;
239 struct mem_cgroup *memcg;
240 unsigned long eviction;
241 struct lruvec *lruvec;
242 unsigned long refault;
234 struct zone *zone; 243 struct zone *zone;
244 int memcgid;
245
246 unpack_shadow(shadow, &memcgid, &zone, &eviction);
247
248 rcu_read_lock();
249 /*
250 * Look up the memcg associated with the stored ID. It might
251 * have been deleted since the page's eviction.
252 *
253 * Note that in rare events the ID could have been recycled
254 * for a new cgroup that refaults a shared page. This is
255 * impossible to tell from the available data. However, this
256 * should be a rare and limited disturbance, and activations
257 * are always speculative anyway. Ultimately, it's the aging
258 * algorithm's job to shake out the minimum access frequency
259 * for the active cache.
260 *
261 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
262 * would be better if the root_mem_cgroup existed in all
263 * configurations instead.
264 */
265 memcg = mem_cgroup_from_id(memcgid);
266 if (!mem_cgroup_disabled() && !memcg) {
267 rcu_read_unlock();
268 return false;
269 }
270 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
271 refault = atomic_long_read(&lruvec->inactive_age);
272 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
273 rcu_read_unlock();
274
275 /*
276 * The unsigned subtraction here gives an accurate distance
277 * across inactive_age overflows in most cases.
278 *
279 * There is a special case: usually, shadow entries have a
280 * short lifetime and are either refaulted or reclaimed along
281 * with the inode before they get too old. But it is not
282 * impossible for the inactive_age to lap a shadow entry in
283 * the field, which can then can result in a false small
284 * refault distance, leading to a false activation should this
285 * old entry actually refault again. However, earlier kernels
286 * used to deactivate unconditionally with *every* reclaim
287 * invocation for the longest time, so the occasional
288 * inappropriate activation leading to pressure on the active
289 * list is not a problem.
290 */
291 refault_distance = (refault - eviction) & EVICTION_MASK;
235 292
236 unpack_shadow(shadow, &zone, &refault_distance);
237 inc_zone_state(zone, WORKINGSET_REFAULT); 293 inc_zone_state(zone, WORKINGSET_REFAULT);
238 294
239 if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { 295 if (refault_distance <= active_file) {
240 inc_zone_state(zone, WORKINGSET_ACTIVATE); 296 inc_zone_state(zone, WORKINGSET_ACTIVATE);
241 return true; 297 return true;
242 } 298 }
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
249 */ 305 */
250void workingset_activation(struct page *page) 306void workingset_activation(struct page *page)
251{ 307{
252 atomic_long_inc(&page_zone(page)->inactive_age); 308 struct lruvec *lruvec;
309
310 lock_page_memcg(page);
311 /*
312 * Filter non-memcg pages here, e.g. unmap can call
313 * mark_page_accessed() on VDSO pages.
314 *
315 * XXX: See workingset_refault() - this should return
316 * root_mem_cgroup even for !CONFIG_MEMCG.
317 */
318 if (!mem_cgroup_disabled() && !page_memcg(page))
319 goto out;
320 lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
321 atomic_long_inc(&lruvec->inactive_age);
322out:
323 unlock_page_memcg(page);
253} 324}
254 325
255/* 326/*
@@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key;
398 469
399static int __init workingset_init(void) 470static int __init workingset_init(void)
400{ 471{
472 unsigned int timestamp_bits;
473 unsigned int max_order;
401 int ret; 474 int ret;
402 475
476 BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
477 /*
478 * Calculate the eviction bucket size to cover the longest
479 * actionable refault distance, which is currently half of
480 * memory (totalram_pages/2). However, memory hotplug may add
481 * some more pages at runtime, so keep working with up to
482 * double the initial memory by using totalram_pages as-is.
483 */
484 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
485 max_order = fls_long(totalram_pages - 1);
486 if (max_order > timestamp_bits)
487 bucket_order = max_order - timestamp_bits;
488 printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
489 timestamp_bits, max_order, bucket_order);
490
403 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); 491 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
404 if (ret) 492 if (ret)
405 goto err; 493 goto err;
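
The workingset hunks above pack a memcg ID into each shadow entry and, when the remaining bits cannot cover the longest actionable refault distance, shave low-order timestamp bits via bucket_order. A compact sketch of that packing, using stand-in shift values rather than the kernel's real constants:

        /*
         * Entry layout after this patch (high to low bits):
         *   [ eviction >> bucket_order | memcg id | node | zone | radix tag ]
         */
        #define SK_RADIX_SHIFT  2   /* stand-in for RADIX_TREE_EXCEPTIONAL_SHIFT */
        #define SK_ZONES_SHIFT  2   /* stand-in for ZONES_SHIFT                  */
        #define SK_NODES_SHIFT  6   /* stand-in for NODES_SHIFT                  */
        #define SK_MEMCG_SHIFT  16  /* stand-in for MEM_CGROUP_ID_SHIFT          */

        static unsigned int sk_bucket_order;    /* set from the leftover bits */

        static unsigned long sk_pack(int memcgid, int nid, int zid,
                                     unsigned long eviction)
        {
                eviction >>= sk_bucket_order;
                eviction = (eviction << SK_MEMCG_SHIFT) | memcgid;
                eviction = (eviction << SK_NODES_SHIFT) | nid;
                eviction = (eviction << SK_ZONES_SHIFT) | zid;
                return (eviction << SK_RADIX_SHIFT) | 1;   /* exceptional tag bit */
        }

        static void sk_unpack(unsigned long entry, int *memcgid, int *nid,
                              int *zid, unsigned long *eviction)
        {
                entry >>= SK_RADIX_SHIFT;
                *zid = entry & ((1UL << SK_ZONES_SHIFT) - 1);
                entry >>= SK_ZONES_SHIFT;
                *nid = entry & ((1UL << SK_NODES_SHIFT) - 1);
                entry >>= SK_NODES_SHIFT;
                *memcgid = entry & ((1UL << SK_MEMCG_SHIFT) - 1);
                entry >>= SK_MEMCG_SHIFT;
                *eviction = entry << sk_bucket_order;     /* coarse timestamp back */
        }
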
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49020..2d5589b61e9f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -50,8 +50,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
50#define TSBITS 6 50#define TSBITS 6
51#define TSMASK (((__u32)1 << TSBITS) - 1) 51#define TSMASK (((__u32)1 << TSBITS) - 1)
52 52
53static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], 53static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
54 ipv4_cookie_scratch);
55 54
56static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 55static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
57 u32 count, int c) 56 u32 count, int c)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 2906ef20795e..aae3e5ca63ea 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -41,8 +41,7 @@ static __u16 const msstab[] = {
41 9000 - 60, 41 9000 - 60,
42}; 42};
43 43
44static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], 44static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
45 ipv6_cookie_scratch);
46 45
47static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, 46static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
48 __be16 sport, __be16 dport, u32 count, int c) 47 __be16 sport, __be16 dport, u32 count, int c)
diff --git a/net/rds/page.c b/net/rds/page.c
index 5a14e6d6a926..616f21f4e7d7 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -42,8 +42,8 @@ struct rds_page_remainder {
42 unsigned long r_offset; 42 unsigned long r_offset;
43}; 43};
44 44
45static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, 45static
46 rds_page_remainders); 46DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
47 47
48/* 48/*
49 * returns 0 on success or -errno on failure. 49 * returns 0 on success or -errno on failure.
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 874132b26d23..d574d13ba963 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3240,6 +3240,30 @@ sub process {
3240#ignore lines not being added 3240#ignore lines not being added
3241 next if ($line =~ /^[^\+]/); 3241 next if ($line =~ /^[^\+]/);
3242 3242
3243# check for declarations of signed or unsigned without int
3244 while ($line =~ m{($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) {
3245 my $type = $1;
3246 my $var = $2;
3247 $var = "" if (!defined $var);
3248 if ($type =~ /^(?:(?:$Storage|$Inline|$Attribute)\s+)*((?:un)?signed)((?:\s*\*)*)\s*$/) {
3249 my $sign = $1;
3250 my $pointer = $2;
3251
3252 $pointer = "" if (!defined $pointer);
3253
3254 if (WARN("UNSPECIFIED_INT",
3255 "Prefer '" . trim($sign) . " int" . rtrim($pointer) . "' to bare use of '$sign" . rtrim($pointer) . "'\n" . $herecurr) &&
3256 $fix) {
3257 my $decl = trim($sign) . " int ";
3258 my $comp_pointer = $pointer;
3259 $comp_pointer =~ s/\s//g;
3260 $decl .= $comp_pointer;
3261 $decl = rtrim($decl) if ($var eq "");
3262 $fixed[$fixlinenr] =~ s@\b$sign\s*\Q$pointer\E\s*$var\b@$decl$var@;
3263 }
3264 }
3265 }
3266
3243# TEST: allow direct testing of the type matcher. 3267# TEST: allow direct testing of the type matcher.
3244 if ($dbg_type) { 3268 if ($dbg_type) {
3245 if ($line =~ /^.\s*$Declare\s*$/) { 3269 if ($line =~ /^.\s*$Declare\s*$/) {
@@ -4109,7 +4133,7 @@ sub process {
4109## } 4133## }
4110 4134
4111#need space before brace following if, while, etc 4135#need space before brace following if, while, etc
4112 if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) || 4136 if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) ||
4113 $line =~ /do\{/) { 4137 $line =~ /do\{/) {
4114 if (ERROR("SPACING", 4138 if (ERROR("SPACING",
4115 "space required before the open brace '{'\n" . $herecurr) && 4139 "space required before the open brace '{'\n" . $herecurr) &&
@@ -4561,6 +4585,9 @@ sub process {
4561 { 4585 {
4562 } 4586 }
4563 4587
4588 # Make asm volatile uses seem like a generic function
4589 $dstat =~ s/\b_*asm_*\s+_*volatile_*\b/asm_volatile/g;
4590
4564 my $exceptions = qr{ 4591 my $exceptions = qr{
4565 $Declare| 4592 $Declare|
4566 module_param_named| 4593 module_param_named|
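
The new UNSPECIFIED_INT test warns on bare signed/unsigned declarations and can rewrite them under --fix. For C input it should behave roughly like the examples below; these are illustrative, not taken from the patch:

        unsigned count;         /* flagged: prefer "unsigned int count"         */
        signed *levels;         /* flagged: prefer "signed int *levels"         */

        unsigned int count_ok;  /* preferred spelling, no warning               */
        signed int *levels_ok;  /* preferred spelling, no warning               */

        unsigned long timeout;  /* left alone: width already spelled out        */
        unsigned char tag;      /* left alone: char/short/int/long are excluded */
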
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 8fa81e84e295..638b143ee60f 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -22,6 +22,7 @@
22#include <stdlib.h> 22#include <stdlib.h>
23#include <string.h> 23#include <string.h>
24#include <ctype.h> 24#include <ctype.h>
25#include <limits.h>
25 26
26#ifndef ARRAY_SIZE 27#ifndef ARRAY_SIZE
27#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 28#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
@@ -34,6 +35,7 @@ struct sym_entry {
34 unsigned int len; 35 unsigned int len;
35 unsigned int start_pos; 36 unsigned int start_pos;
36 unsigned char *sym; 37 unsigned char *sym;
38 unsigned int percpu_absolute;
37}; 39};
38 40
39struct addr_range { 41struct addr_range {
@@ -42,6 +44,7 @@ struct addr_range {
42}; 44};
43 45
44static unsigned long long _text; 46static unsigned long long _text;
47static unsigned long long relative_base;
45static struct addr_range text_ranges[] = { 48static struct addr_range text_ranges[] = {
46 { "_stext", "_etext" }, 49 { "_stext", "_etext" },
47 { "_sinittext", "_einittext" }, 50 { "_sinittext", "_einittext" },
@@ -61,6 +64,7 @@ static int all_symbols = 0;
61static int absolute_percpu = 0; 64static int absolute_percpu = 0;
62static char symbol_prefix_char = '\0'; 65static char symbol_prefix_char = '\0';
63static unsigned long long kernel_start_addr = 0; 66static unsigned long long kernel_start_addr = 0;
67static int base_relative = 0;
64 68
65int token_profit[0x10000]; 69int token_profit[0x10000];
66 70
@@ -74,7 +78,7 @@ static void usage(void)
74 fprintf(stderr, "Usage: kallsyms [--all-symbols] " 78 fprintf(stderr, "Usage: kallsyms [--all-symbols] "
75 "[--symbol-prefix=<prefix char>] " 79 "[--symbol-prefix=<prefix char>] "
76 "[--page-offset=<CONFIG_PAGE_OFFSET>] " 80 "[--page-offset=<CONFIG_PAGE_OFFSET>] "
77 "< in.map > out.S\n"); 81 "[--base-relative] < in.map > out.S\n");
78 exit(1); 82 exit(1);
79} 83}
80 84
@@ -171,6 +175,8 @@ static int read_symbol(FILE *in, struct sym_entry *s)
171 strcpy((char *)s->sym + 1, str); 175 strcpy((char *)s->sym + 1, str);
172 s->sym[0] = stype; 176 s->sym[0] = stype;
173 177
178 s->percpu_absolute = 0;
179
174 /* Record if we've found __per_cpu_start/end. */ 180 /* Record if we've found __per_cpu_start/end. */
175 check_symbol_range(sym, s->addr, &percpu_range, 1); 181 check_symbol_range(sym, s->addr, &percpu_range, 1);
176 182
@@ -202,6 +208,8 @@ static int symbol_valid(struct sym_entry *s)
202 */ 208 */
203 static char *special_symbols[] = { 209 static char *special_symbols[] = {
204 "kallsyms_addresses", 210 "kallsyms_addresses",
211 "kallsyms_offsets",
212 "kallsyms_relative_base",
205 "kallsyms_num_syms", 213 "kallsyms_num_syms",
206 "kallsyms_names", 214 "kallsyms_names",
207 "kallsyms_markers", 215 "kallsyms_markers",
@@ -325,7 +333,7 @@ static int expand_symbol(unsigned char *data, int len, char *result)
325 333
326static int symbol_absolute(struct sym_entry *s) 334static int symbol_absolute(struct sym_entry *s)
327{ 335{
328 return toupper(s->sym[0]) == 'A'; 336 return s->percpu_absolute;
329} 337}
330 338
331static void write_src(void) 339static void write_src(void)
@@ -346,16 +354,48 @@ static void write_src(void)
346 354
347 printf("\t.section .rodata, \"a\"\n"); 355 printf("\t.section .rodata, \"a\"\n");
348 356
349 /* Provide proper symbols relocatability by their '_text' 357 /* Provide proper symbols relocatability by their relativeness
350 * relativeness. The symbol names cannot be used to construct 358 * to a fixed anchor point in the runtime image, either '_text'
351 * normal symbol references as the list of symbols contains 359 * for absolute address tables, in which case the linker will
352 * symbols that are declared static and are private to their 360 * emit the final addresses at build time. Otherwise, use the
353 * .o files. This prevents .tmp_kallsyms.o or any other 361 * offset relative to the lowest value encountered of all relative
354 * object from referencing them. 362 * symbols, and emit non-relocatable fixed offsets that will be fixed
363 * up at runtime.
364 *
365 * The symbol names cannot be used to construct normal symbol
366 * references as the list of symbols contains symbols that are
367 * declared static and are private to their .o files. This prevents
368 * .tmp_kallsyms.o or any other object from referencing them.
355 */ 369 */
356 output_label("kallsyms_addresses"); 370 if (!base_relative)
371 output_label("kallsyms_addresses");
372 else
373 output_label("kallsyms_offsets");
374
357 for (i = 0; i < table_cnt; i++) { 375 for (i = 0; i < table_cnt; i++) {
358 if (!symbol_absolute(&table[i])) { 376 if (base_relative) {
377 long long offset;
378 int overflow;
379
380 if (!absolute_percpu) {
381 offset = table[i].addr - relative_base;
382 overflow = (offset < 0 || offset > UINT_MAX);
383 } else if (symbol_absolute(&table[i])) {
384 offset = table[i].addr;
385 overflow = (offset < 0 || offset > INT_MAX);
386 } else {
387 offset = relative_base - table[i].addr - 1;
388 overflow = (offset < INT_MIN || offset >= 0);
389 }
390 if (overflow) {
391 fprintf(stderr, "kallsyms failure: "
392 "%s symbol value %#llx out of range in relative mode\n",
393 symbol_absolute(&table[i]) ? "absolute" : "relative",
394 table[i].addr);
395 exit(EXIT_FAILURE);
396 }
397 printf("\t.long\t%#x\n", (int)offset);
398 } else if (!symbol_absolute(&table[i])) {
359 if (_text <= table[i].addr) 399 if (_text <= table[i].addr)
360 printf("\tPTR\t_text + %#llx\n", 400 printf("\tPTR\t_text + %#llx\n",
361 table[i].addr - _text); 401 table[i].addr - _text);
@@ -368,6 +408,12 @@ static void write_src(void)
368 } 408 }
369 printf("\n"); 409 printf("\n");
370 410
411 if (base_relative) {
412 output_label("kallsyms_relative_base");
413 printf("\tPTR\t_text - %#llx\n", _text - relative_base);
414 printf("\n");
415 }
416
371 output_label("kallsyms_num_syms"); 417 output_label("kallsyms_num_syms");
372 printf("\tPTR\t%d\n", table_cnt); 418 printf("\tPTR\t%d\n", table_cnt);
373 printf("\n"); 419 printf("\n");
@@ -681,8 +727,27 @@ static void make_percpus_absolute(void)
681 unsigned int i; 727 unsigned int i;
682 728
683 for (i = 0; i < table_cnt; i++) 729 for (i = 0; i < table_cnt; i++)
684 if (symbol_in_range(&table[i], &percpu_range, 1)) 730 if (symbol_in_range(&table[i], &percpu_range, 1)) {
731 /*
732 * Keep the 'A' override for percpu symbols to
733 * ensure consistent behavior compared to older
734 * versions of this tool.
735 */
685 table[i].sym[0] = 'A'; 736 table[i].sym[0] = 'A';
737 table[i].percpu_absolute = 1;
738 }
739}
740
741/* find the minimum non-absolute symbol address */
742static void record_relative_base(void)
743{
744 unsigned int i;
745
746 relative_base = -1ULL;
747 for (i = 0; i < table_cnt; i++)
748 if (!symbol_absolute(&table[i]) &&
749 table[i].addr < relative_base)
750 relative_base = table[i].addr;
686} 751}
687 752
688int main(int argc, char **argv) 753int main(int argc, char **argv)
@@ -703,7 +768,9 @@ int main(int argc, char **argv)
703 } else if (strncmp(argv[i], "--page-offset=", 14) == 0) { 768 } else if (strncmp(argv[i], "--page-offset=", 14) == 0) {
704 const char *p = &argv[i][14]; 769 const char *p = &argv[i][14];
705 kernel_start_addr = strtoull(p, NULL, 16); 770 kernel_start_addr = strtoull(p, NULL, 16);
706 } else 771 } else if (strcmp(argv[i], "--base-relative") == 0)
772 base_relative = 1;
773 else
707 usage(); 774 usage();
708 } 775 }
709 } else if (argc != 1) 776 } else if (argc != 1)
@@ -712,6 +779,8 @@ int main(int argc, char **argv)
712 read_map(stdin); 779 read_map(stdin);
713 if (absolute_percpu) 780 if (absolute_percpu)
714 make_percpus_absolute(); 781 make_percpus_absolute();
782 if (base_relative)
783 record_relative_base();
715 sort_symbols(); 784 sort_symbols();
716 optimize_token_table(); 785 optimize_token_table();
717 write_src(); 786 write_src();
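
With --base-relative, write_src() emits 32-bit entries in kallsyms_offsets plus a single kallsyms_relative_base anchor instead of pointer-sized absolute addresses. The consumer side is not part of this diff, but it has to invert the three encodings emitted above, roughly as sketched here (parameter names follow the variables used in the hunks):

        static unsigned long long decode_offset(int offset,
                                                unsigned long long relative_base,
                                                int absolute_percpu)
        {
                if (!absolute_percpu)
                        /* stored as addr - relative_base, in [0, UINT_MAX] */
                        return relative_base + (unsigned int)offset;

                if (offset >= 0)
                        /* absolute (per-CPU) symbols keep their raw value */
                        return offset;

                /* stored as relative_base - addr - 1, always negative */
                return relative_base - 1 - offset;
        }
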
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index ba6c34ea5429..453ede9d2f3d 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -86,10 +86,14 @@ kallsyms()
86 kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET" 86 kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET"
87 fi 87 fi
88 88
89 if [ -n "${CONFIG_X86_64}" ]; then 89 if [ -n "${CONFIG_KALLSYMS_ABSOLUTE_PERCPU}" ]; then
90 kallsymopt="${kallsymopt} --absolute-percpu" 90 kallsymopt="${kallsymopt} --absolute-percpu"
91 fi 91 fi
92 92
93 if [ -n "${CONFIG_KALLSYMS_BASE_RELATIVE}" ]; then
94 kallsymopt="${kallsymopt} --base-relative"
95 fi
96
93 local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ 97 local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \
94 ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}" 98 ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}"
95 99
diff --git a/scripts/namespace.pl b/scripts/namespace.pl
index a71be6b7cdec..9f3c9d47a4a5 100755
--- a/scripts/namespace.pl
+++ b/scripts/namespace.pl
@@ -117,6 +117,8 @@ my %nameexception = (
117 'kallsyms_names' => 1, 117 'kallsyms_names' => 1,
118 'kallsyms_num_syms' => 1, 118 'kallsyms_num_syms' => 1,
119 'kallsyms_addresses'=> 1, 119 'kallsyms_addresses'=> 1,
120 'kallsyms_offsets' => 1,
121 'kallsyms_relative_base'=> 1,
120 '__this_module' => 1, 122 '__this_module' => 1,
121 '_etext' => 1, 123 '_etext' => 1,
122 '_edata' => 1, 124 '_edata' => 1,
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 4d3340cce9a0..c9cb3be47cff 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -602,7 +602,7 @@ static int gfpcmp(const void *a, const void *b)
602 return fa->flags - fb->flags; 602 return fa->flags - fb->flags;
603} 603}
604 604
605/* see include/trace/events/gfpflags.h */ 605/* see include/trace/events/mmflags.h */
606static const struct { 606static const struct {
607 const char *original; 607 const char *original;
608 const char *compact; 608 const char *compact;
@@ -612,30 +612,39 @@ static const struct {
612 { "GFP_HIGHUSER", "HU" }, 612 { "GFP_HIGHUSER", "HU" },
613 { "GFP_USER", "U" }, 613 { "GFP_USER", "U" },
614 { "GFP_TEMPORARY", "TMP" }, 614 { "GFP_TEMPORARY", "TMP" },
615 { "GFP_KERNEL_ACCOUNT", "KAC" },
615 { "GFP_KERNEL", "K" }, 616 { "GFP_KERNEL", "K" },
616 { "GFP_NOFS", "NF" }, 617 { "GFP_NOFS", "NF" },
617 { "GFP_ATOMIC", "A" }, 618 { "GFP_ATOMIC", "A" },
618 { "GFP_NOIO", "NI" }, 619 { "GFP_NOIO", "NI" },
619 { "GFP_HIGH", "H" },
620 { "GFP_WAIT", "W" },
621 { "GFP_IO", "I" },
622 { "GFP_COLD", "CO" },
623 { "GFP_NOWARN", "NWR" },
624 { "GFP_REPEAT", "R" },
625 { "GFP_NOFAIL", "NF" },
626 { "GFP_NORETRY", "NR" },
627 { "GFP_COMP", "C" },
628 { "GFP_ZERO", "Z" },
629 { "GFP_NOMEMALLOC", "NMA" },
630 { "GFP_MEMALLOC", "MA" },
631 { "GFP_HARDWALL", "HW" },
632 { "GFP_THISNODE", "TN" },
633 { "GFP_RECLAIMABLE", "RC" },
634 { "GFP_MOVABLE", "M" },
635 { "GFP_NOTRACK", "NT" },
636 { "GFP_NO_KSWAPD", "NK" },
637 { "GFP_OTHER_NODE", "ON" },
638 { "GFP_NOWAIT", "NW" }, 620 { "GFP_NOWAIT", "NW" },
621 { "GFP_DMA", "D" },
622 { "__GFP_HIGHMEM", "HM" },
623 { "GFP_DMA32", "D32" },
624 { "__GFP_HIGH", "H" },
625 { "__GFP_ATOMIC", "_A" },
626 { "__GFP_IO", "I" },
627 { "__GFP_FS", "F" },
628 { "__GFP_COLD", "CO" },
629 { "__GFP_NOWARN", "NWR" },
630 { "__GFP_REPEAT", "R" },
631 { "__GFP_NOFAIL", "NF" },
632 { "__GFP_NORETRY", "NR" },
633 { "__GFP_COMP", "C" },
634 { "__GFP_ZERO", "Z" },
635 { "__GFP_NOMEMALLOC", "NMA" },
636 { "__GFP_MEMALLOC", "MA" },
637 { "__GFP_HARDWALL", "HW" },
638 { "__GFP_THISNODE", "TN" },
639 { "__GFP_RECLAIMABLE", "RC" },
640 { "__GFP_MOVABLE", "M" },
641 { "__GFP_ACCOUNT", "AC" },
642 { "__GFP_NOTRACK", "NT" },
643 { "__GFP_WRITE", "WR" },
644 { "__GFP_RECLAIM", "R" },
645 { "__GFP_DIRECT_RECLAIM", "DR" },
646 { "__GFP_KSWAPD_RECLAIM", "KR" },
647 { "__GFP_OTHER_NODE", "ON" },
639}; 648};
640 649
641static size_t max_gfp_len; 650static size_t max_gfp_len;
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 86e698d07e20..1889163f2f05 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -135,7 +135,7 @@ static void usage(void)
135 "\nValid debug options (FZPUT may be combined)\n" 135 "\nValid debug options (FZPUT may be combined)\n"
136 "a / A Switch on all debug options (=FZUP)\n" 136 "a / A Switch on all debug options (=FZUP)\n"
137 "- Switch off all debug options\n" 137 "- Switch off all debug options\n"
138 "f / F Sanity Checks (SLAB_DEBUG_FREE)\n" 138 "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
139 "z / Z Redzoning\n" 139 "z / Z Redzoning\n"
140 "p / P Poisoning\n" 140 "p / P Poisoning\n"
141 "u / U Tracking\n" 141 "u / U Tracking\n"