-rw-r--r--  Documentation/lockup-watchdogs.txt | 18
-rw-r--r--  Documentation/sysctl/kernel.txt | 21
-rw-r--r--  Documentation/vm/unevictable-lru.txt | 8
-rw-r--r--  arch/alpha/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm/include/asm/hugetlb.h | 4
-rw-r--r--  arch/arm/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm/mm/hugetlbpage.c | 5
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 4
-rw-r--r--  arch/arm64/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 7
-rw-r--r--  arch/avr32/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/blackfin/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/c6x/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/cris/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/frv/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/hexagon/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/ia64/include/asm/hugetlb.h | 4
-rw-r--r--  arch/ia64/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/m32r/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/m68k/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/metag/include/asm/dma-mapping.h | 14
-rw-r--r--  arch/metag/include/asm/hugetlb.h | 4
-rw-r--r--  arch/metag/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/metag/mm/hugetlbpage.c | 5
-rw-r--r--  arch/microblaze/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/mips/include/asm/hugetlb.h | 4
-rw-r--r--  arch/mips/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/mips/include/asm/pgtable.h | 8
-rw-r--r--  arch/mips/mm/hugetlbpage.c | 5
-rw-r--r--  arch/mn10300/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/nios2/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/openrisc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/parisc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/parisc/kernel/pci-dma.c | 27
-rw-r--r--  arch/powerpc/include/asm/hugetlb.h | 5
-rw-r--r--  arch/powerpc/include/asm/mm-arch-hooks.h | 28
-rw-r--r--  arch/powerpc/include/asm/mmu_context.h | 23
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 14
-rw-r--r--  arch/powerpc/kernel/vio.c | 10
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/powerpc/mm/pgtable_64.c | 73
-rw-r--r--  arch/s390/include/asm/hugetlb.h | 1
-rw-r--r--  arch/s390/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/s390/include/asm/pgtable.h | 30
-rw-r--r--  arch/s390/kernel/crash_dump.c | 5
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 5
-rw-r--r--  arch/score/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sh/include/asm/hugetlb.h | 3
-rw-r--r--  arch/sh/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/include/asm/hugetlb.h | 4
-rw-r--r--  arch/sparc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 8
-rw-r--r--  arch/sparc/kernel/ldc.c | 8
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/mm/init_64.c | 6
-rw-r--r--  arch/tile/include/asm/hugetlb.h | 4
-rw-r--r--  arch/tile/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/tile/include/asm/pgtable.h | 8
-rw-r--r--  arch/tile/mm/hugetlbpage.c | 5
-rw-r--r--  arch/um/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/unicore32/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 3
-rw-r--r--  arch/x86/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/x86/include/asm/pgtable.h | 4
-rw-r--r--  arch/x86/kernel/check.c | 3
-rw-r--r--  arch/x86/kernel/e820.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 3
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/platform/efi/efi.c | 21
-rw-r--r--  arch/xtensa/include/asm/dma-mapping.h | 19
-rw-r--r--  arch/xtensa/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  drivers/staging/android/lowmemorykiller.c | 2
-rw-r--r--  drivers/tty/sysrq.c | 2
-rw-r--r--  drivers/xen/tmem.c | 8
-rw-r--r--  fs/configfs/item.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ntfs/malloc.h | 7
-rw-r--r--  fs/ocfs2/alloc.c | 37
-rw-r--r--  fs/ocfs2/aops.c | 23
-rw-r--r--  fs/ocfs2/aops.h | 7
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 34
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 42
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dir.c | 25
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 1
-rw-r--r--  fs/ocfs2/file.c | 31
-rw-r--r--  fs/ocfs2/journal.c | 76
-rw-r--r--  fs/ocfs2/namei.c | 33
-rw-r--r--  fs/ocfs2/namei.h | 4
-rw-r--r--  fs/ocfs2/ocfs2.h | 10
-rw-r--r--  fs/ocfs2/refcounttree.c | 6
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/splice.c | 2
-rw-r--r--  include/asm-generic/pgtable.h | 34
-rw-r--r--  include/linux/bootmem.h | 8
-rw-r--r--  include/linux/configfs.h | 1
-rw-r--r--  include/linux/efi.h | 3
-rw-r--r--  include/linux/frontswap.h | 14
-rw-r--r--  include/linux/fsnotify_backend.h | 2
-rw-r--r--  include/linux/kmemleak.h | 6
-rw-r--r--  include/linux/memblock.h | 49
-rw-r--r--  include/linux/mm-arch-hooks.h | 25
-rw-r--r--  include/linux/mm.h | 37
-rw-r--r--  include/linux/mmu_notifier.h | 12
-rw-r--r--  include/linux/nmi.h | 3
-rw-r--r--  include/linux/oom.h | 12
-rw-r--r--  include/linux/slab.h | 26
-rw-r--r--  include/linux/smpboot.h | 5
-rw-r--r--  include/ras/ras_event.h | 85
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/smpboot.c | 60
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/watchdog.c | 67
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/cma.c | 10
-rw-r--r--  mm/filemap.c | 23
-rw-r--r--  mm/frontswap.c | 215
-rw-r--r--  mm/huge_memory.c | 22
-rw-r--r--  mm/hugetlb.c | 187
-rw-r--r--  mm/hwpoison-inject.c | 4
-rw-r--r--  mm/kmemleak.c | 168
-rw-r--r--  mm/memblock.c | 123
-rw-r--r--  mm/memcontrol.c | 59
-rw-r--r--  mm/memory-failure.c | 351
-rw-r--r--  mm/memory.c | 10
-rw-r--r--  mm/memory_hotplug.c | 1
-rw-r--r--  mm/mempolicy.c | 38
-rw-r--r--  mm/memtest.c | 3
-rw-r--r--  mm/migrate.c | 11
-rw-r--r--  mm/mmap.c | 6
-rw-r--r--  mm/mprotect.c | 11
-rw-r--r--  mm/mremap.c | 17
-rw-r--r--  mm/nobootmem.c | 14
-rw-r--r--  mm/nommu.c | 112
-rw-r--r--  mm/oom_kill.c | 158
-rw-r--r--  mm/page_alloc.c | 177
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 29
-rw-r--r--  mm/rmap.c | 9
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 1
-rw-r--r--  mm/slab.h | 1
-rw-r--r--  mm/slab_common.c | 98
-rw-r--r--  mm/slub.c | 1
-rw-r--r--  mm/swap.c | 1
-rw-r--r--  mm/vmscan.c | 15
151 files changed, 2277 insertions, 1321 deletions
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index ab0baa692c13..22dd6af2e4bd 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
 administrators to configure the period of the hrtimer and the perf
 event. The right value for a particular environment is a trade-off
 between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores. However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument. If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up. However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl. For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c831001c45f1..e5d528e0c46e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -923,6 +923,27 @@ and nmi_watchdog.
 
 ==============================================================
 
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
 watchdog_thresh:
 
 This value can be used to control the frequency of hrtimer and NMI
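For reference, a cpumask sysctl of this kind is typically exported through proc_do_large_bitmap(), which accepts and prints the cpulist format shown above. The sketch below shows the general wiring only; the variable and table names are illustrative assumptions, not a claim about what kernel/watchdog.c and kernel/sysctl.c do in this series.

/* Sketch: exposing a cpumask as a cpulist-formatted sysctl. */
#include <linux/cpumask.h>
#include <linux/sysctl.h>

static struct cpumask example_watchdog_cpumask;
static unsigned long *example_watchdog_cpumask_bits =
		cpumask_bits(&example_watchdog_cpumask);

static struct ctl_table example_watchdog_table[] = {
	{
		.procname	= "watchdog_cpumask",
		.data		= &example_watchdog_cpumask_bits,
		.maxlen		= NR_CPUS,	/* number of bits, not bytes */
		.mode		= 0644,
		.proc_handler	= proc_do_large_bitmap,
	},
	{ }
};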
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 3be0bfc4738d..32ee3a67dba2 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -467,7 +467,13 @@ mmap(MAP_LOCKED) SYSTEM CALL HANDLING
 
 In addition the mlock()/mlockall() system calls, an application can request
 that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
-call. Furthermore, any mmap() call or brk() call that expands the heap by a
+call. There is one important and subtle difference here, though. mmap() + mlock()
+will fail if the range cannot be faulted in (e.g. because mm_populate fails)
+and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped
+area will still have properties of the locked area - aka. pages will not get
+swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
 task that has previously called mlockall() with the MCL_FUTURE flag will result
 in the newly mapped memory being mlocked. Before the unevictable/mlock
 changes, the kernel simply called make_pages_present() to allocate pages and
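From user space, the difference documented above reads like this (a minimal sketch; the size and error handling are illustrative):

/* mmap()+mlock() reports a population failure; mmap(MAP_LOCKED) does not. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* may exceed RLIMIT_MEMLOCK on purpose */
	void *p, *q;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p != MAP_FAILED && mlock(p, len) != 0)
		printf("mlock: %s\n", strerror(errno));	/* e.g. ENOMEM */

	q = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
	if (q == MAP_FAILED)
		printf("mmap(MAP_LOCKED): %s\n", strerror(errno));
	/* q is usually valid even if the pages could not all be populated;
	 * touching them later may still take major page faults. */
	return 0;
}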
diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b07fd862fec3
--- /dev/null
+++ b/arch/alpha/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H
13#define _ASM_ALPHA_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */
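The same empty header is added for every architecture below. The point of the per-arch files is to let include/linux/mm-arch-hooks.h (added by this series but not shown here) fall back to no-op hooks whenever an architecture does not override them; a sketch of that usual fallback pattern, for illustration only:

/* Sketch of the generic-fallback pattern; not quoted from this series. */
#ifndef _LINUX_MM_ARCH_HOOKS_H
#define _LINUX_MM_ARCH_HOOKS_H

#include <asm/mm-arch-hooks.h>

#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
			      unsigned long old_start, unsigned long old_end,
			      unsigned long new_start, unsigned long new_end)
{
}
#define arch_remap arch_remap
#endif

#endif /* _LINUX_MM_ARCH_HOOKS_H */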
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..c37541c5f8ba
--- /dev/null
+++ b/arch/arc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
13#define _ASM_ARC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 1f1b1cd112f3..31bb7dccb971 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline int huge_pte_none(pte_t pte)
 {
 	return pte_none(pte);
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7056660c7cc4
--- /dev/null
+++ b/arch/arm/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARM_MM_ARCH_HOOKS_H
13#define _ASM_ARM_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index c72412415093..fcafb521f14e 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -41,11 +41,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-	return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
 	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
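This stub, and the identical ones removed from the other architectures in this series, can only go away because a single shared definition takes their place (mm/hugetlb.c is changed by this series but its body is not shown here). The usual shape of such a fallback, as an illustration:

/* Sketch only: a generic fallback guarded by the PMD-sharing option. */
#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;	/* nothing is shared, so there is nothing to unshare */
}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */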
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 5b7ca8ace95f..734c17e89e94 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file,
86 return 0; 86 return 0;
87} 87}
88 88
89static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
90{
91}
92
93static inline int huge_pte_none(pte_t pte) 89static inline int huge_pte_none(pte_t pte)
94{ 90{
95 return pte_none(pte); 91 return pte_none(pte);
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..562b655f5ba9
--- /dev/null
+++ b/arch/arm64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H
13#define _ASM_ARM64_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2de9d2e59d96..cccc4af87a03 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -31,13 +31,6 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/pgalloc.h> 32#include <asm/pgalloc.h>
33 33
34#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
35int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
36{
37 return 0;
38}
39#endif
40
41int pmd_huge(pmd_t pmd) 34int pmd_huge(pmd_t pmd)
42{ 35{
43 return !(pmd_val(pmd) & PMD_TABLE_BIT); 36 return !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..145452ffbdad
--- /dev/null
+++ b/arch/avr32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H
13#define _ASM_AVR32_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..1c5211ec338f
--- /dev/null
+++ b/arch/blackfin/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H
13#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..bb3c4a6ce8e9
--- /dev/null
+++ b/arch/c6x/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_C6X_MM_ARCH_HOOKS_H
13#define _ASM_C6X_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..314f774db2b0
--- /dev/null
+++ b/arch/cris/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H
13#define _ASM_CRIS_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..51d13a870404
--- /dev/null
+++ b/arch/frv/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_FRV_MM_ARCH_HOOKS_H
13#define _ASM_FRV_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..05e8b939e416
--- /dev/null
+++ b/arch/hexagon/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H
13#define _ASM_HEXAGON_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index aa910054b8e7..ff1377bc02a6 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
20 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); 20 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
21} 21}
22 22
23static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
24{
25}
26
27static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 23static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
28 pte_t *ptep, pte_t pte) 24 pte_t *ptep, pte_t pte)
29{ 25{
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..ab4b5c698322
--- /dev/null
+++ b/arch/ia64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_IA64_MM_ARCH_HOOKS_H
13#define _ASM_IA64_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 52b7604b5215..f50d4b3f501a 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
65 return pte; 65 return pte;
66} 66}
67 67
68int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
69{
70 return 0;
71}
72
73#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } 68#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
74 69
75/* 70/*
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d60b4750f41
--- /dev/null
+++ b/arch/m32r/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_M32R_MM_ARCH_HOOKS_H
13#define _ASM_M32R_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7e8709bc90ae
--- /dev/null
+++ b/arch/m68k/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_M68K_MM_ARCH_HOOKS_H
13#define _ASM_M68K_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index 14b23efd9b7a..eb5cdec94be0 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -134,20 +134,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
 		enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		enum dma_data_direction direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 }
 
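The conversions in this series from an open-coded "sg++" walk to for_each_sg() matter because a scatterlist may be chained: the next entry is not always the next array element, and sg_next(), which for_each_sg() uses, is what follows the chain links. A minimal sketch of the pattern:

#include <linux/scatterlist.h>

/* Sketch: visit every entry of a possibly chained scatterlist. */
static void example_walk_sg(struct scatterlist *sglist, int nelems)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		void *vaddr = sg_virt(sg);
		unsigned int len = sg->length;

		/* per-entry cache maintenance would go here */
		(void)vaddr;
		(void)len;
	}
}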
diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h
index 471f481e67f3..f730b396d79b 100644
--- a/arch/metag/include/asm/hugetlb.h
+++ b/arch/metag/include/asm/hugetlb.h
@@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
14int prepare_hugepage_range(struct file *file, unsigned long addr, 14int prepare_hugepage_range(struct file *file, unsigned long addr,
15 unsigned long len); 15 unsigned long len);
16 16
17static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
18{
19}
20
21static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 17static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
22 unsigned long addr, unsigned long end, 18 unsigned long addr, unsigned long end,
23 unsigned long floor, 19 unsigned long floor,
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b0072b2eb0de
--- /dev/null
+++ b/arch/metag/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_METAG_MM_ARCH_HOOKS_H
13#define _ASM_METAG_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 7ca80ac42ed5..53f0f6c47027 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
89 return pte; 89 return pte;
90} 90}
91 91
92int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
93{
94 return 0;
95}
96
97int pmd_huge(pmd_t pmd) 92int pmd_huge(pmd_t pmd)
98{ 93{
99 return pmd_page_shift(pmd) > PAGE_SHIFT; 94 return pmd_page_shift(pmd) > PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5c4065911bda
--- /dev/null
+++ b/arch/microblaze/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
13#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fe0d15d32660..4a5bb5453408 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file,
38 return 0; 38 return 0;
39} 39}
40 40
41static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
42{
43}
44
45static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 41static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
46 unsigned long addr, 42 unsigned long addr,
47 unsigned long end, 43 unsigned long end,
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b5609fe8e475
--- /dev/null
+++ b/arch/mips/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H
13#define _ASM_MIPS_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 819af9d057a8..9d8106758142 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 }
 
 /*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a
  * different prototype.
  */
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					unsigned long address, pmd_t *pmdp)
 {
 	pmd_t old = *pmdp;
 
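The __HAVE_ARCH_PMDP_HUGE_* renames across the architectures follow the usual asm-generic convention: the generic header only supplies a definition when the architecture has not claimed the symbol. A sketch of that convention (illustrative, not quoted from the include/asm-generic/pgtable.h changes in this series):

#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pmd_t *pmdp)
{
	pmd_t pmd = *pmdp;

	pmd_clear(pmdp);	/* architecture hooks may add TLB work */
	return pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */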
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 06e0f421b41b..74aa6f62468f 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
51 return (pte_t *) pmd; 51 return (pte_t *) pmd;
52} 52}
53 53
54int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
55{
56 return 0;
57}
58
59/* 54/*
60 * This function checks for proper alignment of input addr and len parameters. 55 * This function checks for proper alignment of input addr and len parameters.
61 */ 56 */
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..e2029a652f4c
--- /dev/null
+++ b/arch/mn10300/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H
13#define _ASM_MN10300_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d7290dc68558
--- /dev/null
+++ b/arch/nios2/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H
13#define _ASM_NIOS2_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d33cb555fe1
--- /dev/null
+++ b/arch/openrisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H
13#define _ASM_OPENRISC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..654ec63b0ee9
--- /dev/null
+++ b/arch/parisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H
13#define _ASM_PARISC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index ff834fd67478..b9402c9b3454 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
-	for (i = 0; i < nents; i++, sglist++ ) {
-		unsigned long vaddr = (unsigned long)sg_virt(sglist);
-		sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr);
-		sg_dma_len(sglist) = sglist->length;
-		flush_kernel_dcache_range(vaddr, sglist->length);
+	for_each_sg(sglist, sg, nents, i) {
+		unsigned long vaddr = (unsigned long)sg_virt(sg);
+
+		sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
+		sg_dma_len(sg) = sg->length;
+		flush_kernel_dcache_range(vaddr, sg->length);
 	}
 	return nents;
 }
@@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n
 static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
@@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 	return;
 }
 
@@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h
 static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 struct hppa_dma_ops pcxl_dma_ops = {
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 1d53a65b4ec1..4bbd3c8c2888 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file,
112 return 0; 112 return 0;
113} 113}
114 114
115static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
116{
117}
118
119
120static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 115static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
121 pte_t *ptep, pte_t pte) 116 pte_t *ptep, pte_t pte)
122{ 117{
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..f2a2da895897
--- /dev/null
+++ b/arch/powerpc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,28 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
13#define _ASM_POWERPC_MM_ARCH_HOOKS_H
14
15static inline void arch_remap(struct mm_struct *mm,
16 unsigned long old_start, unsigned long old_end,
17 unsigned long new_start, unsigned long new_end)
18{
19 /*
20 * mremap() doesn't allow moving multiple vmas so we can limit the
21 * check to old_start == vdso_base.
22 */
23 if (old_start == mm->context.vdso_base)
24 mm->context.vdso_base = new_start;
25}
26#define arch_remap arch_remap
27
28#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
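arch_remap() exists so the core mremap() path can tell the architecture that a mapping has moved; on powerpc the only mapping it cares about is the VDSO. A hedged sketch of the expected call site (the surrounding function is illustrative; mm/mremap.c is part of this series but not reproduced here):

#include <linux/mm-arch-hooks.h>

/* Sketch: after a vma has been moved, let the architecture adjust its state. */
static void example_after_move(struct mm_struct *mm,
			       unsigned long old_addr, unsigned long new_addr,
			       unsigned long len)
{
	arch_remap(mm, old_addr, old_addr + len, new_addr, new_addr + len);
}

Defining arch_remap() and then "#define arch_remap arch_remap" lets generic code test for the hook with #ifdef and fall back to a no-op when an architecture provides nothing.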
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 3e5184210d9b..878c27771717 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -8,7 +8,6 @@
 #include <linux/spinlock.h>
 #include <asm/mmu.h>
 #include <asm/cputable.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputhreads.h>
 
 /*
@@ -127,5 +126,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+				 struct mm_struct *mm)
+{
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			      struct vm_area_struct *vma,
+			      unsigned long start, unsigned long end)
+{
+	if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
+		mm->context.vdso_base = 0;
+}
+
+static inline void arch_bprm_mm_init(struct mm_struct *mm,
+				     struct vm_area_struct *vma)
+{
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index f890f7ce1593..3bb7488bd24b 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -569,13 +569,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 				  unsigned long address, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 				unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-			      pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
@@ -592,6 +588,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp);
 
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				       pgtable_t pgtable);
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b41426c60ef6..5f8dcdaa2820 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct iommu_table *tbl;
 	struct scatterlist *sgl;
-	int ret, count = 0;
+	int ret, count;
 	size_t alloc_size = 0;
 
 	tbl = get_iommu_table_base(dev);
-	for (sgl = sglist; count < nelems; count++, sgl++)
+	for_each_sg(sglist, sgl, nelems, count)
 		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
 	if (vio_cmo_alloc(viodev, alloc_size)) {
@@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 		return ret;
 	}
 
-	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+	for_each_sg(sglist, sgl, ret, count)
 		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 	if (alloc_size)
 		vio_cmo_dealloc(viodev, alloc_size);
@@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
 	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	size_t alloc_size = 0;
-	int count = 0;
+	int count;
 
 	tbl = get_iommu_table_base(dev);
-	for (sgl = sglist; count < nelems; count++, sgl++)
+	for_each_sg(sglist, sgl, nelems, count)
 		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
 	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3385e3d0506e..38bd5d998c81 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
439} 439}
440#endif 440#endif
441 441
442int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
443{
444 return 0;
445}
446
447#ifdef CONFIG_PPC_FSL_BOOK3E 442#ifdef CONFIG_PPC_FSL_BOOK3E
448#define HUGEPD_FREELIST_SIZE \ 443#define HUGEPD_FREELIST_SIZE \
449 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 444 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 6bfadf1aa5cb..876232d64126 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 	return old;
 }
 
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 		       pmd_t *pmdp)
 {
 	pmd_t pmd;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	if (pmd_trans_huge(*pmdp)) {
-		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
-	} else {
-		/*
-		 * khugepaged calls this for normal pmd
-		 */
-		pmd = *pmdp;
-		pmd_clear(pmdp);
-		/*
-		 * Wait for all pending hash_page to finish. This is needed
-		 * in case of subpage collapse. When we collapse normal pages
-		 * to hugepage, we first clear the pmd, then invalidate all
-		 * the PTE entries. The assumption here is that any low level
-		 * page fault will see a none pmd and take the slow path that
-		 * will wait on mmap_sem. But we could very well be in a
-		 * hash_page with local ptep pointer value. Such a hash page
-		 * can result in adding new HPTE entries for normal subpages.
-		 * That means we could be modifying the page content as we
-		 * copy them to a huge page. So wait for parallel hash_page
-		 * to finish before invalidating HPTE entries. We can do this
-		 * by sending an IPI to all the cpus and executing a dummy
-		 * function there.
-		 */
-		kick_all_cpus_sync();
-		/*
-		 * Now invalidate the hpte entries in the range
-		 * covered by pmd. This make sure we take a
-		 * fault and will find the pmd as none, which will
-		 * result in a major fault which takes mmap_sem and
-		 * hence wait for collapse to complete. Without this
-		 * the __collapse_huge_page_copy can result in copying
-		 * the old content.
-		 */
-		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-	}
+	VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+	pmd = *pmdp;
+	pmd_clear(pmdp);
+	/*
+	 * Wait for all pending hash_page to finish. This is needed
+	 * in case of subpage collapse. When we collapse normal pages
+	 * to hugepage, we first clear the pmd, then invalidate all
+	 * the PTE entries. The assumption here is that any low level
+	 * page fault will see a none pmd and take the slow path that
+	 * will wait on mmap_sem. But we could very well be in a
+	 * hash_page with local ptep pointer value. Such a hash page
+	 * can result in adding new HPTE entries for normal subpages.
+	 * That means we could be modifying the page content as we
+	 * copy them to a huge page. So wait for parallel hash_page
+	 * to finish before invalidating HPTE entries. We can do this
+	 * by sending an IPI to all the cpus and executing a dummy
+	 * function there.
+	 */
+	kick_all_cpus_sync();
+	/*
+	 * Now invalidate the hpte entries in the range
+	 * covered by pmd. This make sure we take a
+	 * fault and will find the pmd as none, which will
+	 * result in a major fault which takes mmap_sem and
+	 * hence wait for collapse to complete. Without this
+	 * the __collapse_huge_page_copy can result in copying
+	 * the old content.
+	 */
+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 	return pmd;
 }
 
@@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 			 unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old_pmd;
 	pgtable_t pgtable;
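The old pmdp_clear_flush() mixed two very different cases; after this series the transparent-hugepage case goes through pmdp_huge_get_and_clear()/pmdp_huge_clear_flush(), while the khugepaged "collapse a normal pmd" case gets its own pmdp_collapse_flush(), which is where the hash_page synchronisation above now lives. A sketch of the two call sites after the split (the caller names are illustrative, not taken from this series):

/* khugepaged: pmdp still points at a normal page-table page */
static pmd_t example_collapse(struct vm_area_struct *vma,
			      unsigned long haddr, pmd_t *pmdp)
{
	return pmdp_collapse_flush(vma, haddr, pmdp);
}

/* zap/split path: pmdp maps a transparent huge page */
static pmd_t example_zap_huge(struct vm_area_struct *vma,
			      unsigned long haddr, pmd_t *pmdp)
{
	return pmdp_huge_clear_flush(vma, haddr, pmdp);
}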
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 11eae5f55b70..dfb542ade6b1 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file,
35 return 0; 35 return 0;
36} 36}
37 37
38#define hugetlb_prefault_arch_hook(mm) do { } while (0)
39#define arch_clear_hugepage_flags(page) do { } while (0) 38#define arch_clear_hugepage_flags(page) do { } while (0)
40 39
41int arch_prepare_hugepage(struct page *page); 40int arch_prepare_hugepage(struct page *page);
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..07680b2f3c59
--- /dev/null
+++ b/arch/s390/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_S390_MM_ARCH_HOOKS_H
13#define _ASM_S390_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_S390_MM_ARCH_HOOKS_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0bb2da79adf3..f66d82798a6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	return pmd_young(pmd);
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 						 unsigned long address,
 						 pmd_t *pmdp, int full)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 					  unsigned long address, pmd_t *pmdp)
 {
-	return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
@@ -1548,6 +1548,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 	}
 }
 
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+					unsigned long address,
+					pmd_t *pmdp)
+{
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define pfn_pmd(pfn, pgprot)	mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
 #define mk_pmd(page, pgprot)	pfn_pmd(page_to_pfn(page), (pgprot))
 
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d9f0dcfcae5e..7a75ad4594e3 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
 };
 
 #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid)		\
-	for (i = 0, __next_mem_range(&i, nid, &memblock.physmem,	\
+	for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE,		\
+			       &memblock.physmem,			\
 			       &oldmem_type, p_start,			\
 			       p_end, p_nid);				\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, &memblock.physmem,		\
+	     __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
 			      &oldmem_type,				\
 			      p_start, p_end, p_nid))
 
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index e617e74b7be2..c3f8e3df92ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
193 return (pte_t *) pmdp; 193 return (pte_t *) pmdp;
194} 194}
195 195
196int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
197{
198 return 0;
199}
200
201int pmd_huge(pmd_t pmd) 196int pmd_huge(pmd_t pmd)
202{ 197{
203 if (!MACHINE_HAS_HPAGE) 198 if (!MACHINE_HAS_HPAGE)
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5e38689f189a
--- /dev/null
+++ b/arch/score/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H
13#define _ASM_SCORE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 699255d6d1c6..b788a9bc8918 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
26 return 0; 26 return 0;
27} 27}
28 28
29static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
30}
31
32static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 29static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
33 unsigned long addr, unsigned long end, 30 unsigned long addr, unsigned long end,
34 unsigned long floor, 31 unsigned long floor,
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..18087298b728
--- /dev/null
+++ b/arch/sh/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SH_MM_ARCH_HOOKS_H
13#define _ASM_SH_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SH_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 534bc978af8a..6385f60209b6 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
62 return pte; 62 return pte;
63} 63}
64 64
65int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
66{
67 return 0;
68}
69
70int pmd_huge(pmd_t pmd) 65int pmd_huge(pmd_t pmd)
71{ 66{
72 return 0; 67 return 0;
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index e4cab465b81f..3130d7636312 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
11pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 11pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
12 pte_t *ptep); 12 pte_t *ptep);
13 13
14static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
15{
16}
17
18static inline int is_hugepage_only_range(struct mm_struct *mm, 14static inline int is_hugepage_only_range(struct mm_struct *mm,
19 unsigned long addr, 15 unsigned long addr,
20 unsigned long len) { 16 unsigned long len) {
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b89ba44c16f1
--- /dev/null
+++ b/arch/sparc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H
13#define _ASM_SPARC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2a52c91d2c8a..131d36fcd07a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -865,10 +865,10 @@ static inline unsigned long pud_pfn(pud_t pud)
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
 		   pte_t *ptep, pte_t orig, int fullmm);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr,
 					    pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 	set_pmd_at(mm, addr, pmdp, __pmd(0UL));
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 7d3ca30fcd15..1ae5eb1bb045 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp,
 	struct cookie_state state;
 	struct ldc_iommu *iommu;
 	int err;
+	struct scatterlist *s;
 
 	if (map_perm & ~LDC_MAP_ALL)
 		return -EINVAL;
@@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp,
 	state.pte_idx = (base - iommu->page_table);
 	state.nc = 0;
 
-	for (i = 0; i < num_sg; i++)
-		fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
-			     sg[i].offset, sg[i].length);
+	for_each_sg(sg, s, num_sg, i) {
+		fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
+			     s->offset, s->length);
+	}
 
 	return state.nc;
 }
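
The ldc_map_sg() hunk above replaces open-coded &sg[i] indexing with the for_each_sg() iterator, which advances via sg_next() and therefore also walks chained scatterlists correctly. A minimal sketch of the pattern, using a hypothetical helper that is not part of this patch:

#include <linux/scatterlist.h>

/* Hypothetical helper: total up the bytes described by a scatterlist.
 * for_each_sg() steps with sg_next(), so chained tables are handled;
 * plain &sgl[i] indexing assumes a single flat array. */
static unsigned long count_sg_bytes(struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	unsigned long total = 0;
	int i;

	for_each_sg(sgl, sg, nents, i)
		total += sg->length;

	return total;
}
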
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 4242eab12e10..131eaf4ad7f5 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
172 return pte; 172 return pte;
173} 173}
174 174
175int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
176{
177 return 0;
178}
179
180void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 175void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
181 pte_t *ptep, pte_t entry) 176 pte_t *ptep, pte_t entry)
182{ 177{
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c5d08b89a96c..4ac88b757514 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
 	phys_addr_t pa_start, pa_end;
 	u64 i;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL)
 		available = available + (pa_end - pa_start);
 
 	return available;
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
 	if (limit_ram >= avail_ram)
 		return;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL) {
 		phys_addr_t region_size = pa_end - pa_start;
 		phys_addr_t clip_start = pa_start;
 
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index 3257733003f8..1abd00c55236 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file,
40 return 0; 40 return 0;
41} 41}
42 42
43static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
44{
45}
46
47static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 43static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
48 unsigned long addr, unsigned long end, 44 unsigned long addr, unsigned long end,
49 unsigned long floor, 45 unsigned long floor,
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d1709ea774f7
--- /dev/null
+++ b/arch/tile/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_TILE_MM_ARCH_HOOKS_H
13#define _ASM_TILE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 95a4f19d16c5..2b05ccbebed9 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
 					    pmd_t *pmdp)
 {
 	return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
 }
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 8416240c322c..c034dc3fe2d4 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -160,11 +160,6 @@ int pud_huge(pud_t pud)
160 return !!(pud_val(pud) & _PAGE_HUGE_PAGE); 160 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
161} 161}
162 162
163int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
164{
165 return 0;
166}
167
168#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 163#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
169static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, 164static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
170 unsigned long addr, unsigned long len, 165 unsigned long addr, unsigned long len,
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..a7c8b0dfdd4e
--- /dev/null
+++ b/arch/um/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_UM_MM_ARCH_HOOKS_H
13#define _ASM_UM_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_UM_MM_ARCH_HOOKS_H */
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4d79a850c509
--- /dev/null
+++ b/arch/unicore32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H
13#define _ASM_UNICORE32_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 68c05398bba9..dab7a3a750bf 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
26 return 0; 26 return 0;
27} 27}
28 28
29static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
30}
31
32static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 29static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
33 unsigned long addr, unsigned long end, 30 unsigned long addr, unsigned long end,
34 unsigned long floor, 31 unsigned long floor,
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4e881a342236
--- /dev/null
+++ b/arch/x86/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_X86_MM_ARCH_HOOKS_H
13#define _ASM_X86_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2562e303405b..867da5bbb4a3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_RW;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pmd_t *pmdp)
 {
 	pmd_t pmd = native_pmdp_get_and_clear(pmdp);
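
The pmdp_get_and_clear() -> pmdp_huge_get_and_clear() renames (here and in the sparc, tile and asm-generic hunks) make it explicit that these helpers only ever operate on huge PMDs. A rough sketch of how a THP-style caller uses the renamed helper; the function below is hypothetical and only illustrates the calling convention:

/* Sketch, not from this series: atomically clear a huge PMD under the
 * page table lock, then flush the TLB for the huge-page range. */
static void demo_clear_huge_pmd(struct vm_area_struct *vma,
				unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
	flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
	/* 'old' would normally be handed on to the zap/split path. */
	(void)old;
}
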
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..58118e207a69 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..c8dda42cb6a3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
 		nr_pages += end_pfn - start_pfn;
 	}
 
-	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 		if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 39ca113676fe..d3b95b89e9b2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p)
1105 memblock_set_current_limit(ISA_END_ADDRESS); 1105 memblock_set_current_limit(ISA_END_ADDRESS);
1106 memblock_x86_fill(); 1106 memblock_x86_fill();
1107 1107
1108 if (efi_enabled(EFI_BOOT))
1109 efi_find_mirror();
1110
1108 /* 1111 /*
1109 * The EFI specification says that boot service code won't be called 1112 * The EFI specification says that boot service code won't be called
1110 * after ExitBootServices(). This is, in fact, a lie. 1113 * after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..8340e45c891a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
 		phys_addr_t start, end;
 		u64 i;
 
-		for_each_free_mem_range(i, nid, &start, &end, NULL) {
+		for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
 			unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
 						    start_pfn, end_pfn);
 			unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3b984c3aa1b0..c1c382c58c60 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
117 now->tv_nsec = 0; 117 now->tv_nsec = 0;
118} 118}
119 119
120void __init efi_find_mirror(void)
121{
122 void *p;
123 u64 mirror_size = 0, total_size = 0;
124
125 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
126 efi_memory_desc_t *md = p;
127 unsigned long long start = md->phys_addr;
128 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
129
130 total_size += size;
131 if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
132 memblock_mark_mirror(start, size);
133 mirror_size += size;
134 }
135 }
136 if (mirror_size)
137 pr_info("Memory: %lldM/%lldM mirrored memory\n",
138 mirror_size>>20, total_size>>20);
139}
140
120/* 141/*
121 * Tell the kernel about the EFI memory map. This might include 142 * Tell the kernel about the EFI memory map. This might include
122 * more than the max 128 entries that can fit in the e820 legacy 143 * more than the max 128 entries that can fit in the e820 legacy
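
efi_find_mirror() above only marks EFI_MEMORY_MORE_RELIABLE ranges in memblock; consumers then select ranges through the flags argument that the for_each_free_mem_range() iterators now take. A hedged sketch of how mirrored ranges could be inspected, assuming the MEMBLOCK_MIRROR flag introduced by the memblock side of this series:

#include <linux/memblock.h>

/* Sketch: walk only the free ranges that were marked as mirrored. */
static void __init dump_mirrored_ranges(void)
{
	phys_addr_t start, end;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_MIRROR,
				&start, &end, NULL)
		pr_info("mirrored free range: %pa-%pa\n", &start, &end);
}
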
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index ba78ccf651e7..1f5f6dc09736 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 }
 
 static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	   enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
-	for (i = 0; i < nents; i++, sg++ ) {
+	for_each_sg(sglist, sg, nents, i) {
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
@@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 	consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction);
 }
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
 		    enum dma_data_direction dir)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		       enum dma_data_direction dir)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+		       int nelems, enum dma_data_direction dir)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 static inline int
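
The xtensa dma_map_sg()/dma_sync_sg_*() conversion above follows the same for_each_sg() pattern as the sparc ldc.c hunk. From a driver's point of view the DMA API is unchanged; a generic usage sketch (hypothetical device code, not tied to xtensa):

#include <linux/dma-mapping.h>

/* Sketch: map a scatterlist for a device-bound transfer and unmap it. */
static int demo_map_for_device(struct device *dev, struct scatterlist *sgl,
			       int nents)
{
	int mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (mapped == 0)
		return -EIO;	/* mapping failed */

	/* ... program the hardware with 'mapped' entries, not 'nents' ... */

	dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
	return 0;
}
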
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d2e5cfd3dd02
--- /dev/null
+++ b/arch/xtensa/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H
13#define _ASM_XTENSA_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index feafa172b155..2345ee7342d9 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -165,7 +165,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 		 * infrastructure. There is no real reason why the selected
 		 * task should have access to the memory reserves.
 		 */
-		mark_tsk_oom_victim(selected);
+		mark_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
 		rem += selected_tasksize;
 	}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 9ffdfcf2ec6e..1c4791033b72 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,11 @@ static struct sysrq_key_op sysrq_term_op = {
353 353
354static void moom_callback(struct work_struct *ignored) 354static void moom_callback(struct work_struct *ignored)
355{ 355{
356 mutex_lock(&oom_lock);
356 if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), 357 if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
357 GFP_KERNEL, 0, NULL, true)) 358 GFP_KERNEL, 0, NULL, true))
358 pr_info("OOM request ignored because killer is disabled\n"); 359 pr_info("OOM request ignored because killer is disabled\n");
360 mutex_unlock(&oom_lock);
359} 361}
360 362
361static DECLARE_WORK(moom_work, moom_callback); 363static DECLARE_WORK(moom_work, moom_callback);
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index c4211a31612d..d88f36754bf7 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
 #ifdef CONFIG_FRONTSWAP
 	if (tmem_enabled && frontswap) {
 		char *s = "";
-		struct frontswap_ops *old_ops;
 
 		tmem_frontswap_poolid = -1;
-		old_ops = frontswap_register_ops(&tmem_frontswap_ops);
-		if (IS_ERR(old_ops) || old_ops) {
-			if (IS_ERR(old_ops))
-				return PTR_ERR(old_ops);
-			s = " (WARNING: frontswap_ops overridden)";
-		}
+		frontswap_register_ops(&tmem_frontswap_ops);
 		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
 			s);
 	}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb999..4d6a30e76168 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
  * config_item_init - initialize item.
  * @item: item in question.
  */
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
 {
 	kref_init(&item->ci_kref);
 	INIT_LIST_HEAD(&item->ci_entry);
 }
-EXPORT_SYMBOL(config_item_init);
 
 /**
  * config_item_set_name - Set the name of an item
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87724c1d7be6..0cf74df68617 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
130 goto out; 130 goto out;
131 131
132 ret = 0; 132 ret = 0;
133 hugetlb_prefault_arch_hook(vma->vm_mm);
134 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 133 if (vma->vm_flags & VM_WRITE && inode->i_size < len)
135 inode->i_size = len; 134 inode->i_size = len;
136out: 135out:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e663b4..2cd653670764 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			}
 		}
 		err = add_to_page_cache_lru(*cached_page, mapping,
-				index, GFP_KERNEL);
+				index,
+				GFP_KERNEL & mapping_gfp_mask(mapping));
 		if (unlikely(err)) {
 			if (err == -EEXIST)
 				continue;
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index a44b14cbceeb..ab172e5f51d9 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-	if (!is_vmalloc_addr(addr)) {
-		kfree(addr);
-		/* free_page((unsigned long)addr); */
-		return;
-	}
-	vfree(addr);
+	kvfree(addr);
 }
 
 #endif /* _LINUX_NTFS_MALLOC_H */
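
ntfs_free() above collapses the is_vmalloc_addr() branching into kvfree(), which accepts pointers from either kmalloc() or vmalloc(). A small sketch of the matching allocation side (hypothetical, only to show the pairing):

#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Sketch: size-dependent allocation whose result is freed with kvfree(). */
static void *demo_alloc(unsigned long size)
{
	if (size <= PAGE_SIZE)
		return kmalloc(size, GFP_NOFS);
	return vmalloc(size);		/* kvfree() handles both cases */
}
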
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2d7f76e52c37..5997c00a1515 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2925 struct ocfs2_path *right_path = NULL; 2925 struct ocfs2_path *right_path = NULL;
2926 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 2926 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2927 2927
2928 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2928 if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
2929 return 0;
2929 2930
2930 *empty_extent_path = NULL; 2931 *empty_extent_path = NULL;
2931 2932
@@ -4311,13 +4312,13 @@ out:
4311 return ret; 4312 return ret;
4312} 4313}
4313 4314
4314static enum ocfs2_contig_type 4315static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4315ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4316 struct ocfs2_path *path, 4316 struct ocfs2_path *path,
4317 struct ocfs2_extent_list *el, int index, 4317 struct ocfs2_extent_list *el, int index,
4318 struct ocfs2_extent_rec *split_rec) 4318 struct ocfs2_extent_rec *split_rec,
4319 struct ocfs2_merge_ctxt *ctxt)
4319{ 4320{
4320 int status; 4321 int status = 0;
4321 enum ocfs2_contig_type ret = CONTIG_NONE; 4322 enum ocfs2_contig_type ret = CONTIG_NONE;
4322 u32 left_cpos, right_cpos; 4323 u32 left_cpos, right_cpos;
4323 struct ocfs2_extent_rec *rec = NULL; 4324 struct ocfs2_extent_rec *rec = NULL;
@@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4336 4337
4337 if (left_cpos != 0) { 4338 if (left_cpos != 0) {
4338 left_path = ocfs2_new_path_from_path(path); 4339 left_path = ocfs2_new_path_from_path(path);
4339 if (!left_path) 4340 if (!left_path) {
4341 status = -ENOMEM;
4342 mlog_errno(status);
4340 goto exit; 4343 goto exit;
4344 }
4341 4345
4342 status = ocfs2_find_path(et->et_ci, left_path, 4346 status = ocfs2_find_path(et->et_ci, left_path,
4343 left_cpos); 4347 left_cpos);
@@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4392 goto free_left_path; 4396 goto free_left_path;
4393 4397
4394 right_path = ocfs2_new_path_from_path(path); 4398 right_path = ocfs2_new_path_from_path(path);
4395 if (!right_path) 4399 if (!right_path) {
4400 status = -ENOMEM;
4401 mlog_errno(status);
4396 goto free_left_path; 4402 goto free_left_path;
4403 }
4397 4404
4398 status = ocfs2_find_path(et->et_ci, right_path, right_cpos); 4405 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4399 if (status) 4406 if (status)
@@ -4433,7 +4440,10 @@ free_right_path:
4433free_left_path: 4440free_left_path:
4434 ocfs2_free_path(left_path); 4441 ocfs2_free_path(left_path);
4435exit: 4442exit:
4436 return ret; 4443 if (status == 0)
4444 ctxt->c_contig_type = ret;
4445
4446 return status;
4437} 4447}
4438 4448
4439static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, 4449static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle,
5039 goto out; 5049 goto out;
5040 } 5050 }
5041 5051
5042 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, 5052 ret = ocfs2_figure_merge_contig_type(et, path, el,
5043 split_index, 5053 split_index,
5044 split_rec); 5054 split_rec,
5055 &ctxt);
5056 if (ret) {
5057 mlog_errno(ret);
5058 goto out;
5059 }
5045 5060
5046 /* 5061 /*
5047 * The core merge / split code wants to know how much room is 5062 * The core merge / split code wants to know how much room is
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f906a250da6a..1a35c6139656 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size; 525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0; 526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527 527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); 528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529 529
@@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
560 /* fill hole, allocate blocks can't be larger than the size 560 /* fill hole, allocate blocks can't be larger than the size
561 * of the hole */ 561 * of the hole */
562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); 562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
563 if (clusters_to_alloc > contig_blocks) 563 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
564 clusters_to_alloc = contig_blocks; 564 contig_blocks);
565 if (clusters_to_alloc > contig_clusters)
566 clusters_to_alloc = contig_clusters;
565 567
566 /* allocate extent and insert them into the extent tree */ 568 /* allocate extent and insert them into the extent tree */
567 ret = ocfs2_extend_allocation(inode, cpos, 569 ret = ocfs2_extend_allocation(inode, cpos,
@@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
619 /* this io's submitter should not have unlocked this before we could */ 621 /* this io's submitter should not have unlocked this before we could */
620 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 622 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
621 623
622 if (ocfs2_iocb_is_sem_locked(iocb))
623 ocfs2_iocb_clear_sem_locked(iocb);
624
625 if (ocfs2_iocb_is_unaligned_aio(iocb)) { 624 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
626 ocfs2_iocb_clear_unaligned_aio(iocb); 625 ocfs2_iocb_clear_unaligned_aio(iocb);
627 626
@@ -925,13 +924,23 @@ clean_orphan:
925 int update_isize = written > 0 ? 1 : 0; 924 int update_isize = written > 0 ? 1 : 0;
926 loff_t end = update_isize ? offset + written : 0; 925 loff_t end = update_isize ? offset + written : 0;
927 926
928 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, 927 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
928 if (tmp_ret < 0) {
929 ret = tmp_ret;
930 mlog_errno(ret);
931 goto out;
932 }
933
934 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
929 update_isize, end); 935 update_isize, end);
930 if (tmp_ret < 0) { 936 if (tmp_ret < 0) {
931 ret = tmp_ret; 937 ret = tmp_ret;
938 mlog_errno(ret);
932 goto out; 939 goto out;
933 } 940 }
934 941
942 ocfs2_inode_unlock(inode, 1);
943
935 tmp_ret = jbd2_journal_force_commit(journal); 944 tmp_ret = jbd2_journal_force_commit(journal);
936 if (tmp_ret < 0) { 945 if (tmp_ret < 0) {
937 ret = tmp_ret; 946 ret = tmp_ret;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index dd59599b022d..24e496d6bdcd 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 79enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 80 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 81 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_SEM,
83 OCFS2_IOCB_UNALIGNED_IO, 82 OCFS2_IOCB_UNALIGNED_IO,
84 OCFS2_IOCB_NUM_LOCKS 83 OCFS2_IOCB_NUM_LOCKS
85}; 84};
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
88 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) 87 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
89#define ocfs2_iocb_rw_locked_level(iocb) \ 88#define ocfs2_iocb_rw_locked_level(iocb) \
90 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
91#define ocfs2_iocb_set_sem_locked(iocb) \
92 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_sem_locked(iocb) \
94 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_sem_locked(iocb) \
96 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
97 90
98#define ocfs2_iocb_set_unaligned_aio(iocb) \ 91#define ocfs2_iocb_set_unaligned_aio(iocb) \
99 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index af7598bff1b5..dfe162f5fd4c 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
64 return count; 64 return count;
65} 65}
66 66
67void __mlog_printk(const u64 *mask, const char *func, int line,
68 const char *fmt, ...)
69{
70 struct va_format vaf;
71 va_list args;
72 const char *level;
73 const char *prefix = "";
74
75 if (!__mlog_test_u64(*mask, mlog_and_bits) ||
76 __mlog_test_u64(*mask, mlog_not_bits))
77 return;
78
79 if (*mask & ML_ERROR) {
80 level = KERN_ERR;
81 prefix = "ERROR: ";
82 } else if (*mask & ML_NOTICE) {
83 level = KERN_NOTICE;
84 } else {
85 level = KERN_INFO;
86 }
87
88 va_start(args, fmt);
89
90 vaf.fmt = fmt;
91 vaf.va = &args;
92
93 printk("%s(%s,%u,%u):%s:%d %s%pV",
94 level, current->comm, task_pid_nr(current),
95 raw_smp_processor_id(), func, line, prefix, &vaf);
96
97 va_end(args);
98}
99EXPORT_SYMBOL_GPL(__mlog_printk);
100
67struct mlog_attribute { 101struct mlog_attribute {
68 struct attribute attr; 102 struct attribute attr;
69 u64 mask; 103 u64 mask;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7fdc25a4d8c0..308ea0eb35fd 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
162 162
163#endif 163#endif
164 164
165/* 165__printf(4, 5)
166 * smp_processor_id() "helpfully" screams when called outside preemptible 166void __mlog_printk(const u64 *m, const char *func, int line,
167 * regions in current kernels. sles doesn't have the variants that don't 167 const char *fmt, ...);
168 * scream. just do this instead of trying to guess which we're building
169 * against.. *sigh*.
170 */
171#define __mlog_cpu_guess ({ \
172 unsigned long _cpu = get_cpu(); \
173 put_cpu(); \
174 _cpu; \
175})
176 168
177/* In the following two macros, the whitespace after the ',' just 169/*
178 * before ##args is intentional. Otherwise, gcc 2.95 will eat the 170 * Testing before the __mlog_printk call lets the compiler eliminate the
179 * previous token if args expands to nothing. 171 * call completely when (m & ML_ALLOWED_BITS) is 0.
180 */ 172 */
181#define __mlog_printk(level, fmt, args...) \ 173#define mlog(mask, fmt, ...) \
182 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ 174do { \
183 task_pid_nr(current), __mlog_cpu_guess, \ 175 u64 _m = MLOG_MASK_PREFIX | (mask); \
184 __PRETTY_FUNCTION__, __LINE__ , ##args) 176 if (_m & ML_ALLOWED_BITS) \
185 177 __mlog_printk(&_m, __func__, __LINE__, fmt, \
186#define mlog(mask, fmt, args...) do { \ 178 ##__VA_ARGS__); \
187 u64 __m = MLOG_MASK_PREFIX | (mask); \
188 if ((__m & ML_ALLOWED_BITS) && \
189 __mlog_test_u64(__m, mlog_and_bits) && \
190 !__mlog_test_u64(__m, mlog_not_bits)) { \
191 if (__m & ML_ERROR) \
192 __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
193 else if (__m & ML_NOTICE) \
194 __mlog_printk(KERN_NOTICE, fmt , ##args); \
195 else __mlog_printk(KERN_INFO, fmt , ##args); \
196 } \
197} while (0) 179} while (0)
198 180
199#define mlog_errno(st) ({ \ 181#define mlog_errno(st) ({ \
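
The masklog rework above moves the formatting into __mlog_printk() and relies on the %pV printk extension, so the cheap mask test stays in the macro while the vararg handling lives in one out-of-line function. A hedged sketch of the %pV pattern itself (generic, not ocfs2-specific):

#include <linux/printk.h>

/* Sketch: forward a caller's format string and arguments via %pV. */
__printf(1, 2)
static void demo_log(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_INFO "demo: %pV", &vaf);
	va_end(args);
}
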
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 56c403a563bc..2d0acd6678fe 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2204,7 +2204,7 @@ out:
2204 kfree(o2net_hand); 2204 kfree(o2net_hand);
2205 kfree(o2net_keep_req); 2205 kfree(o2net_keep_req);
2206 kfree(o2net_keep_resp); 2206 kfree(o2net_keep_resp);
2207 2207 o2net_debugfs_exit();
2208 o2quo_exit(); 2208 o2quo_exit();
2209 return -ENOMEM; 2209 return -ENOMEM;
2210} 2210}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ccd4dcfc3645..02878a83f0b4 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle,
1617 struct ocfs2_dir_entry *de, *de1; 1617 struct ocfs2_dir_entry *de, *de1;
1618 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1618 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1619 struct super_block *sb = dir->i_sb; 1619 struct super_block *sb = dir->i_sb;
1620 int retval, status; 1620 int retval;
1621 unsigned int size = sb->s_blocksize; 1621 unsigned int size = sb->s_blocksize;
1622 struct buffer_head *insert_bh = lookup->dl_leaf_bh; 1622 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1623 char *data_start = insert_bh->b_data; 1623 char *data_start = insert_bh->b_data;
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle,
1695 } 1695 }
1696 1696
1697 if (insert_bh == parent_fe_bh) 1697 if (insert_bh == parent_fe_bh)
1698 status = ocfs2_journal_access_di(handle, 1698 retval = ocfs2_journal_access_di(handle,
1699 INODE_CACHE(dir), 1699 INODE_CACHE(dir),
1700 insert_bh, 1700 insert_bh,
1701 OCFS2_JOURNAL_ACCESS_WRITE); 1701 OCFS2_JOURNAL_ACCESS_WRITE);
1702 else { 1702 else {
1703 status = ocfs2_journal_access_db(handle, 1703 retval = ocfs2_journal_access_db(handle,
1704 INODE_CACHE(dir), 1704 INODE_CACHE(dir),
1705 insert_bh, 1705 insert_bh,
1706 OCFS2_JOURNAL_ACCESS_WRITE); 1706 OCFS2_JOURNAL_ACCESS_WRITE);
1707 1707
1708 if (ocfs2_dir_indexed(dir)) { 1708 if (!retval && ocfs2_dir_indexed(dir))
1709 status = ocfs2_dx_dir_insert(dir, 1709 retval = ocfs2_dx_dir_insert(dir,
1710 handle, 1710 handle,
1711 lookup); 1711 lookup);
1712 if (status) { 1712 }
1713 mlog_errno(status); 1713
1714 goto bail; 1714 if (retval) {
1715 } 1715 mlog_errno(retval);
1716 } 1716 goto bail;
1717 } 1717 }
1718 1718
1719 /* By now the buffer is marked for journaling */ 1719 /* By now the buffer is marked for journaling */
@@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size)
3543{ 3543{
3544 struct ocfs2_dx_entry *entry1 = a; 3544 struct ocfs2_dx_entry *entry1 = a;
3545 struct ocfs2_dx_entry *entry2 = b; 3545 struct ocfs2_dx_entry *entry2 = b;
3546 struct ocfs2_dx_entry tmp;
3547 3546
3548 BUG_ON(size != sizeof(*entry1)); 3547 BUG_ON(size != sizeof(*entry1));
3549 3548
3550 tmp = *entry1; 3549 swap(*entry1, *entry2);
3551 *entry1 = *entry2;
3552 *entry2 = tmp;
3553} 3550}
3554 3551
3555static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) 3552static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
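
dx_leaf_sort_swap() above (like the refcounttree.c and namei.c hunks later in this series) replaces the open-coded three-assignment exchange with the swap() helper from linux/kernel.h. A minimal sketch of a sort() swap callback written this way, over a hypothetical array of ints:

#include <linux/kernel.h>	/* swap() */
#include <linux/sort.h>

/* Sketch: swap callback for sort() over an int array. */
static void demo_swap_ints(void *a, void *b, int size)
{
	swap(*(int *)a, *(int *)b);
}
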
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fae17c640df3..e88ccf8c83ff 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1014 1014
1015/* will exit holding res->spinlock, but may drop in function */ 1015/* will exit holding res->spinlock, but may drop in function */
1016void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); 1016void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
1017void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
1018 1017
1019/* will exit holding res->spinlock, but may drop in function */ 1018/* will exit holding res->spinlock, but may drop in function */
1020static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) 1019static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd909..fbfadb289e62 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2250,7 +2250,7 @@ out:
2250static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2250static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2251 struct iov_iter *from) 2251 struct iov_iter *from)
2252{ 2252{
2253 int direct_io, appending, rw_level, have_alloc_sem = 0; 2253 int direct_io, appending, rw_level;
2254 int can_do_direct, has_refcount = 0; 2254 int can_do_direct, has_refcount = 0;
2255 ssize_t written = 0; 2255 ssize_t written = 0;
2256 ssize_t ret; 2256 ssize_t ret;
@@ -2279,16 +2279,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2279 2279
2280 mutex_lock(&inode->i_mutex); 2280 mutex_lock(&inode->i_mutex);
2281 2281
2282 ocfs2_iocb_clear_sem_locked(iocb);
2283
2284relock: 2282relock:
2285 /* to match setattr's i_mutex -> rw_lock ordering */
2286 if (direct_io) {
2287 have_alloc_sem = 1;
2288 /* communicate with ocfs2_dio_end_io */
2289 ocfs2_iocb_set_sem_locked(iocb);
2290 }
2291
2292 /* 2283 /*
2293 * Concurrent O_DIRECT writes are allowed with 2284 * Concurrent O_DIRECT writes are allowed with
2294 * mount_option "coherency=buffered". 2285 * mount_option "coherency=buffered".
@@ -2298,7 +2289,7 @@ relock:
2298 ret = ocfs2_rw_lock(inode, rw_level); 2289 ret = ocfs2_rw_lock(inode, rw_level);
2299 if (ret < 0) { 2290 if (ret < 0) {
2300 mlog_errno(ret); 2291 mlog_errno(ret);
2301 goto out_sems; 2292 goto out_mutex;
2302 } 2293 }
2303 2294
2304 /* 2295 /*
@@ -2347,7 +2338,6 @@ relock:
2347 if (direct_io && !can_do_direct) { 2338 if (direct_io && !can_do_direct) {
2348 ocfs2_rw_unlock(inode, rw_level); 2339 ocfs2_rw_unlock(inode, rw_level);
2349 2340
2350 have_alloc_sem = 0;
2351 rw_level = -1; 2341 rw_level = -1;
2352 2342
2353 direct_io = 0; 2343 direct_io = 0;
@@ -2416,7 +2406,6 @@ no_sync:
2416 */ 2406 */
2417 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2407 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2418 rw_level = -1; 2408 rw_level = -1;
2419 have_alloc_sem = 0;
2420 unaligned_dio = 0; 2409 unaligned_dio = 0;
2421 } 2410 }
2422 2411
@@ -2429,10 +2418,7 @@ out:
2429 if (rw_level != -1) 2418 if (rw_level != -1)
2430 ocfs2_rw_unlock(inode, rw_level); 2419 ocfs2_rw_unlock(inode, rw_level);
2431 2420
2432out_sems: 2421out_mutex:
2433 if (have_alloc_sem)
2434 ocfs2_iocb_clear_sem_locked(iocb);
2435
2436 mutex_unlock(&inode->i_mutex); 2422 mutex_unlock(&inode->i_mutex);
2437 2423
2438 if (written) 2424 if (written)
@@ -2473,7 +2459,7 @@ bail:
2473static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, 2459static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2474 struct iov_iter *to) 2460 struct iov_iter *to)
2475{ 2461{
2476 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2462 int ret = 0, rw_level = -1, lock_level = 0;
2477 struct file *filp = iocb->ki_filp; 2463 struct file *filp = iocb->ki_filp;
2478 struct inode *inode = file_inode(filp); 2464 struct inode *inode = file_inode(filp);
2479 2465
@@ -2490,16 +2476,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2490 goto bail; 2476 goto bail;
2491 } 2477 }
2492 2478
2493 ocfs2_iocb_clear_sem_locked(iocb);
2494
2495 /* 2479 /*
2496 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2480 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2497 * need locks to protect pending reads from racing with truncate. 2481 * need locks to protect pending reads from racing with truncate.
2498 */ 2482 */
2499 if (iocb->ki_flags & IOCB_DIRECT) { 2483 if (iocb->ki_flags & IOCB_DIRECT) {
2500 have_alloc_sem = 1;
2501 ocfs2_iocb_set_sem_locked(iocb);
2502
2503 ret = ocfs2_rw_lock(inode, 0); 2484 ret = ocfs2_rw_lock(inode, 0);
2504 if (ret < 0) { 2485 if (ret < 0) {
2505 mlog_errno(ret); 2486 mlog_errno(ret);
@@ -2535,13 +2516,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2535 /* see ocfs2_file_write_iter */ 2516 /* see ocfs2_file_write_iter */
2536 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2517 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2537 rw_level = -1; 2518 rw_level = -1;
2538 have_alloc_sem = 0;
2539 } 2519 }
2540 2520
2541bail: 2521bail:
2542 if (have_alloc_sem)
2543 ocfs2_iocb_clear_sem_locked(iocb);
2544
2545 if (rw_level != -1) 2522 if (rw_level != -1)
2546 ocfs2_rw_unlock(inode, rw_level); 2523 ocfs2_rw_unlock(inode, rw_level);
2547 2524
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff531928269e..7c099f7032fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -108,7 +108,7 @@ struct ocfs2_replay_map {
108 unsigned char rm_replay_slots[0]; 108 unsigned char rm_replay_slots[0];
109}; 109};
110 110
111void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) 111static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
112{ 112{
113 if (!osb->replay_map) 113 if (!osb->replay_map)
114 return; 114 return;
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
153 return 0; 153 return 0;
154} 154}
155 155
156void ocfs2_queue_replay_slots(struct ocfs2_super *osb, 156static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
157 enum ocfs2_orphan_reco_type orphan_reco_type) 157 enum ocfs2_orphan_reco_type orphan_reco_type)
158{ 158{
159 struct ocfs2_replay_map *replay_map = osb->replay_map; 159 struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
173 replay_map->rm_state = REPLAY_DONE; 173 replay_map->rm_state = REPLAY_DONE;
174} 174}
175 175
176void ocfs2_free_replay_slots(struct ocfs2_super *osb) 176static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
177{ 177{
178 struct ocfs2_replay_map *replay_map = osb->replay_map; 178 struct ocfs2_replay_map *replay_map = osb->replay_map;
179 179
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
571 (unsigned long)bh, 571 (unsigned long)bh,
572 (unsigned long long)bh->b_blocknr); 572 (unsigned long long)bh->b_blocknr);
573 573
574 /* We aren't guaranteed to have the superblock here - but if we 574 ocfs2_error(bh->b_bdev->bd_super,
575 * don't, it'll just crash. */
576 ocfs2_error(bh->b_assoc_map->host->i_sb,
577 "JBD2 has aborted our journal, ocfs2 cannot continue\n"); 575 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
578} 576}
579 577
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
775 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); 773 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
776 774
777 status = jbd2_journal_dirty_metadata(handle, bh); 775 status = jbd2_journal_dirty_metadata(handle, bh);
778 BUG_ON(status); 776 if (status) {
777 mlog_errno(status);
778 if (!is_handle_aborted(handle)) {
779 journal_t *journal = handle->h_transaction->t_journal;
780 struct super_block *sb = bh->b_bdev->bd_super;
781
782 mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
783 "Aborting transaction and journal.\n");
784 handle->h_err = status;
785 jbd2_journal_abort_handle(handle);
786 jbd2_journal_abort(journal, status);
787 ocfs2_abort(sb, "Journal already aborted.\n");
788 }
789 }
779} 790}
780 791
781#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 792#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1884 * hasn't happened. The node queues a scan and increments the 1895 * hasn't happened. The node queues a scan and increments the
1885 * sequence number in the LVB. 1896 * sequence number in the LVB.
1886 */ 1897 */
1887void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) 1898static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888{ 1899{
1889 struct ocfs2_orphan_scan *os; 1900 struct ocfs2_orphan_scan *os;
1890 int status, i; 1901 int status, i;
@@ -1933,7 +1944,7 @@ out:
1933} 1944}
1934 1945
1935/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ 1946/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
1936void ocfs2_orphan_scan_work(struct work_struct *work) 1947static void ocfs2_orphan_scan_work(struct work_struct *work)
1937{ 1948{
1938 struct ocfs2_orphan_scan *os; 1949 struct ocfs2_orphan_scan *os;
1939 struct ocfs2_super *osb; 1950 struct ocfs2_super *osb;
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2137 struct inode *inode = NULL; 2148 struct inode *inode = NULL;
2138 struct inode *iter; 2149 struct inode *iter;
2139 struct ocfs2_inode_info *oi; 2150 struct ocfs2_inode_info *oi;
2151 struct buffer_head *di_bh = NULL;
2152 struct ocfs2_dinode *di = NULL;
2140 2153
2141 trace_ocfs2_recover_orphans(slot); 2154 trace_ocfs2_recover_orphans(slot);
2142 2155
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2157 iter = oi->ip_next_orphan; 2170 iter = oi->ip_next_orphan;
2158 oi->ip_next_orphan = NULL; 2171 oi->ip_next_orphan = NULL;
2159 2172
2173 ret = ocfs2_rw_lock(inode, 1);
2174 if (ret < 0) {
2175 mlog_errno(ret);
2176 goto next;
2177 }
2160 /* 2178 /*
2161 * We need to take and drop the inode lock to 2179 * We need to take and drop the inode lock to
2162 * force read inode from disk. 2180 * force read inode from disk.
2163 */ 2181 */
2164 ret = ocfs2_inode_lock(inode, NULL, 0); 2182 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2165 if (ret) { 2183 if (ret) {
2166 mlog_errno(ret); 2184 mlog_errno(ret);
2167 goto next; 2185 goto unlock_rw;
2168 } 2186 }
2169 ocfs2_inode_unlock(inode, 0); 2187
2188 di = (struct ocfs2_dinode *)di_bh->b_data;
2170 2189
2171 if (inode->i_nlink == 0) { 2190 if (inode->i_nlink == 0) {
2172 spin_lock(&oi->ip_lock); 2191 spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2174 * ocfs2_delete_inode. */ 2193 * ocfs2_delete_inode. */
2175 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2194 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2176 spin_unlock(&oi->ip_lock); 2195 spin_unlock(&oi->ip_lock);
2177 } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { 2196 } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
2178 struct buffer_head *di_bh = NULL; 2197 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2179
2180 ret = ocfs2_rw_lock(inode, 1);
2181 if (ret) {
2182 mlog_errno(ret);
2183 goto next;
2184 }
2185
2186 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2187 if (ret < 0) {
2188 ocfs2_rw_unlock(inode, 1);
2189 mlog_errno(ret);
2190 goto next;
2191 }
2192
2193 ret = ocfs2_truncate_file(inode, di_bh, 2198 ret = ocfs2_truncate_file(inode, di_bh,
2194 i_size_read(inode)); 2199 i_size_read(inode));
2195 ocfs2_inode_unlock(inode, 1);
2196 ocfs2_rw_unlock(inode, 1);
2197 brelse(di_bh);
2198 if (ret < 0) { 2200 if (ret < 0) {
2199 if (ret != -ENOSPC) 2201 if (ret != -ENOSPC)
2200 mlog_errno(ret); 2202 mlog_errno(ret);
2201 goto next; 2203 goto unlock_inode;
2202 } 2204 }
2203 2205
2204 ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); 2206 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2205 if (ret) 2207 if (ret)
2206 mlog_errno(ret); 2208 mlog_errno(ret);
2207 2209
2208 wake_up(&OCFS2_I(inode)->append_dio_wq); 2210 wake_up(&OCFS2_I(inode)->append_dio_wq);
2209 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ 2211 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2210 2212unlock_inode:
2213 ocfs2_inode_unlock(inode, 1);
2214unlock_rw:
2215 ocfs2_rw_unlock(inode, 1);
2211next: 2216next:
2212 iput(inode); 2217 iput(inode);
2213 2218 brelse(di_bh);
2219 di_bh = NULL;
2214 inode = iter; 2220 inode = iter;
2215 } 2221 }
2216 2222
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 176fe6afd94e..6e6abb93fda5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1116 int inode1_is_ancestor, inode2_is_ancestor; 1116 int inode1_is_ancestor, inode2_is_ancestor;
1117 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); 1117 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
1118 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); 1118 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
1119 struct buffer_head **tmpbh;
1120 struct inode *tmpinode;
1121 1119
1122 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, 1120 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
1123 (unsigned long long)oi2->ip_blkno); 1121 (unsigned long long)oi2->ip_blkno);
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1148 (oi1->ip_blkno < oi2->ip_blkno && 1146 (oi1->ip_blkno < oi2->ip_blkno &&
1149 inode2_is_ancestor == 0)) { 1147 inode2_is_ancestor == 0)) {
1150 /* switch id1 and id2 around */ 1148 /* switch id1 and id2 around */
1151 tmpbh = bh2; 1149 swap(bh2, bh1);
1152 bh2 = bh1; 1150 swap(inode2, inode1);
1153 bh1 = tmpbh;
1154
1155 tmpinode = inode2;
1156 inode2 = inode1;
1157 inode1 = tmpinode;
1158 } 1151 }
1159 /* lock id2 */ 1152 /* lock id2 */
1160 status = ocfs2_inode_lock_nested(inode2, bh2, 1, 1153 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
@@ -2670,30 +2663,22 @@ bail:
2670} 2663}
2671 2664
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, 2665int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize, 2666 struct inode *inode, struct buffer_head *di_bh,
2674 loff_t end) 2667 int update_isize, loff_t end)
2675{ 2668{
2676 struct inode *orphan_dir_inode = NULL; 2669 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL; 2670 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL; 2671 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL; 2672 handle_t *handle = NULL;
2681 int status = 0; 2673 int status = 0;
2682 2674
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 2675 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE, 2676 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot)); 2677 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) { 2678 if (!orphan_dir_inode) {
2694 status = -ENOENT; 2679 status = -ENOENT;
2695 mlog_errno(status); 2680 mlog_errno(status);
2696 goto bail_unlock_inode; 2681 goto bail;
2697 } 2682 }
2698 2683
2699 mutex_lock(&orphan_dir_inode->i_mutex); 2684 mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2702 mutex_unlock(&orphan_dir_inode->i_mutex); 2687 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode); 2688 iput(orphan_dir_inode);
2704 mlog_errno(status); 2689 mlog_errno(status);
2705 goto bail_unlock_inode; 2690 goto bail;
2706 } 2691 }
2707 2692
2708 handle = ocfs2_start_trans(osb, 2693 handle = ocfs2_start_trans(osb,
@@ -2749,10 +2734,6 @@ bail_unlock_orphan:
2749 brelse(orphan_dir_bh); 2734 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode); 2735 iput(orphan_dir_inode);
2751 2736
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail: 2737bail:
2757 return status; 2738 return status;
2758} 2739}
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 5ddecce172fa..e173329eb830 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, 42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode); 43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, 44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize, 45 struct inode *inode, struct buffer_head *di_bh,
46 loff_t end); 46 int update_isize, loff_t end);
47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
48 struct inode *new_inode, 48 struct inode *new_inode,
49 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e683..690ddc60189b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
717 return (u64)clusters << c_to_b_bits; 717 return (u64)clusters << c_to_b_bits;
718} 718}
719 719
720static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
721 u64 blocks)
722{
723 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
724 sb->s_blocksize_bits;
725
726 blocks += (1 << b_to_c_bits) - 1;
727 return (u32)(blocks >> b_to_c_bits);
728}
729
720static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, 730static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
721 u64 blocks) 731 u64 blocks)
722{ 732{
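
ocfs2_clusters_for_blocks() added above rounds up, unlike ocfs2_blocks_to_clusters() below it, which truncates. A worked example with illustrative numbers: for 4 KiB blocks and a 32 KiB cluster size, b_to_c_bits is 3, so 9 blocks become (9 + 7) >> 3 = 2 clusters, while the truncating conversion would give 1. A standalone sketch of the same arithmetic:

/* Sketch of the round-up conversion with illustrative parameters. */
static inline u32 demo_blocks_to_clusters_round_up(u64 blocks, int b_to_c_bits)
{
	return (u32)((blocks + (1ULL << b_to_c_bits) - 1) >> b_to_c_bits);
}
/* demo_blocks_to_clusters_round_up(9, 3) == 2; plain (9 >> 3) would be 1. */
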
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d8c6af101f3f..b69dd14c0b9b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1406 1406
1407static void swap_refcount_rec(void *a, void *b, int size) 1407static void swap_refcount_rec(void *a, void *b, int size)
1408{ 1408{
1409 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1409 struct ocfs2_refcount_rec *l = a, *r = b;
1410 1410
1411 tmp = *l; 1411 swap(*l, *r);
1412 *l = *r;
1413 *r = tmp;
1414} 1412}
1415 1413
1416/* 1414/*
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03bfbf3d27d..889f3796a0d7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7271 name, value, size, flags); 7271 name, value, size, flags);
7272} 7272}
7273 7273
7274int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, 7274static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
7275 void *fs_info) 7275 void *fs_info)
7276{ 7276{
7277 const struct xattr *xattr; 7277 const struct xattr *xattr;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd02a9ebfc30..3f57dac31ba6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
126{ 126{
127 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; 127 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
128 128
129 /*
130 * Parked tasks do not run; they sit in __kthread_parkme().
131 * Without this check, we would report them as running, which is
132 * clearly wrong, so we report them as sleeping instead.
133 */
134 if (tsk->state == TASK_PARKED)
135 state = TASK_INTERRUPTIBLE;
136
129 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); 137 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
130 138
131 return task_state_array[fls(state)]; 139 return task_state_array[fls(state)];
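
The get_task_state() hunk above matters because the reported state indexes task_state_array via fls(): a parked task's state bit is outside TASK_REPORT, so the masked value is 0 and slot 0 ("R (running)") would be chosen. A standalone sketch of that indexing, with made-up bit values rather than the kernel's definitions:

#include <stdio.h>

static const char *task_state_array[] = { "R (running)", "S (sleeping)", "D (disk sleep)" };

static int fls(unsigned int x)			/* last set bit, 1-based; 0 when x == 0 */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int report_mask = 0x3;		/* assumed: only the first two sleep states are reportable */
	unsigned int parked	 = 0x200;	/* assumed stand-in for TASK_PARKED */

	printf("%s\n", task_state_array[fls(parked & report_mask)]);	/* "R (running)" - the old report */
	printf("%s\n", task_state_array[fls(0x1)]);			/* "S (sleeping)" - the new report */
	return 0;
}
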
diff --git a/fs/splice.c b/fs/splice.c
index 4f355a1c1a9e..5fc1e50a7f30 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
360 break; 360 break;
361 361
362 error = add_to_page_cache_lru(page, mapping, index, 362 error = add_to_page_cache_lru(page, mapping, index,
363 GFP_KERNEL); 363 GFP_KERNEL & mapping_gfp_mask(mapping));
364 if (unlikely(error)) { 364 if (unlikely(error)) {
365 page_cache_release(page); 365 page_cache_release(page);
366 if (error == -EEXIST) 366 if (error == -EEXIST)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bd910ceaccfa..29c57b2cb344 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
96} 96}
97#endif 97#endif
98 98
99#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR 99#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
100#ifdef CONFIG_TRANSPARENT_HUGEPAGE 100#ifdef CONFIG_TRANSPARENT_HUGEPAGE
101static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, 101static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
102 unsigned long address, 102 unsigned long address,
103 pmd_t *pmdp) 103 pmd_t *pmdp)
104{ 104{
105 pmd_t pmd = *pmdp; 105 pmd_t pmd = *pmdp;
106 pmd_clear(pmdp); 106 pmd_clear(pmdp);
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
109#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 109#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
110#endif 110#endif
111 111
112#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL 112#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE 113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, 114static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
115 unsigned long address, pmd_t *pmdp, 115 unsigned long address, pmd_t *pmdp,
116 int full) 116 int full)
117{ 117{
118 return pmdp_get_and_clear(mm, address, pmdp); 118 return pmdp_huge_get_and_clear(mm, address, pmdp);
119} 119}
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
121#endif 121#endif
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
152 pte_t *ptep); 152 pte_t *ptep);
153#endif 153#endif
154 154
155#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH 155#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
156extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, 156extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
157 unsigned long address, 157 unsigned long address,
158 pmd_t *pmdp); 158 pmd_t *pmdp);
159#endif 159#endif
@@ -189,6 +189,22 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
189 unsigned long address, pmd_t *pmdp); 189 unsigned long address, pmd_t *pmdp);
190#endif 190#endif
191 191
192#ifndef pmdp_collapse_flush
193#ifdef CONFIG_TRANSPARENT_HUGEPAGE
194extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
195 unsigned long address, pmd_t *pmdp);
196#else
197static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
198 unsigned long address,
199 pmd_t *pmdp)
200{
201 BUILD_BUG();
202 return *pmdp;
203}
204#define pmdp_collapse_flush pmdp_collapse_flush
205#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
206#endif
207
192#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 208#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
193extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 209extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
194 pgtable_t pgtable); 210 pgtable_t pgtable);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0995c2de8162..f589222bfa87 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
357/* Only NUMA needs hash distribution. 64bit NUMA architectures have 357/* Only NUMA needs hash distribution. 64bit NUMA architectures have
358 * sufficient vmalloc space. 358 * sufficient vmalloc space.
359 */ 359 */
360#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) 360#ifdef CONFIG_NUMA
361#define HASHDIST_DEFAULT 1 361#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
362extern int hashdist; /* Distribute hashes across NUMA nodes? */
362#else 363#else
363#define HASHDIST_DEFAULT 0 364#define hashdist (0)
364#endif 365#endif
365extern int hashdist; /* Distribute hashes across NUMA nodes? */
366 366
367 367
368#endif /* _LINUX_BOOTMEM_H */ 368#endif /* _LINUX_BOOTMEM_H */
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 34025df61829..c9e5c57e4edf 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
71 return item->ci_name; 71 return item->ci_name;
72} 72}
73 73
74extern void config_item_init(struct config_item *);
75extern void config_item_init_type_name(struct config_item *item, 74extern void config_item_init_type_name(struct config_item *item,
76 const char *name, 75 const char *name,
77 struct config_item_type *type); 76 struct config_item_type *type);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2092965afca3..5f19efe4eb3f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef struct {
96#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ 96#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */
97#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ 97#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */
98#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ 98#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */
99#define EFI_MEMORY_MORE_RELIABLE \
100 ((u64)0x0000000000010000ULL) /* higher reliability */
99#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ 101#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */
100#define EFI_MEMORY_DESCRIPTOR_VERSION 1 102#define EFI_MEMORY_DESCRIPTOR_VERSION 1
101 103
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos
868extern void efi_late_init(void); 870extern void efi_late_init(void);
869extern void efi_free_boot_services(void); 871extern void efi_free_boot_services(void);
870extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); 872extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
873extern void efi_find_mirror(void);
871#else 874#else
872static inline void efi_late_init(void) {} 875static inline void efi_late_init(void) {}
873static inline void efi_free_boot_services(void) {} 876static inline void efi_free_boot_services(void) {}
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262401de..e65ef959546c 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
6#include <linux/bitops.h> 6#include <linux/bitops.h>
7 7
8struct frontswap_ops { 8struct frontswap_ops {
9 void (*init)(unsigned); 9 void (*init)(unsigned); /* this swap type was just swapon'ed */
10 int (*store)(unsigned, pgoff_t, struct page *); 10 int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
11 int (*load)(unsigned, pgoff_t, struct page *); 11 int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
12 void (*invalidate_page)(unsigned, pgoff_t); 12 void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
13 void (*invalidate_area)(unsigned); 13 void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
14 struct frontswap_ops *next; /* private pointer to next ops */
14}; 15};
15 16
16extern bool frontswap_enabled; 17extern bool frontswap_enabled;
17extern struct frontswap_ops * 18extern void frontswap_register_ops(struct frontswap_ops *ops);
18 frontswap_register_ops(struct frontswap_ops *ops);
19extern void frontswap_shrink(unsigned long); 19extern void frontswap_shrink(unsigned long);
20extern unsigned long frontswap_curr_pages(void); 20extern unsigned long frontswap_curr_pages(void);
21extern void frontswap_writethrough(bool); 21extern void frontswap_writethrough(bool);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0f313f93c586..65a517dd32f7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,8 +84,6 @@ struct fsnotify_fname;
84 * Each group much define these ops. The fsnotify infrastructure will call 84 * Each group much define these ops. The fsnotify infrastructure will call
85 * these operations for each relevant group. 85 * these operations for each relevant group.
86 * 86 *
87 * should_send_event - given a group, inode, and mask this function determines
88 * if the group is interested in this event.
89 * handle_event - main call for a group to handle an fs event 87 * handle_event - main call for a group to handle an fs event
90 * free_group_priv - called when a group refcnt hits 0 to clean up the private union 88 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
91 * freeing_mark - called when a mark is being destroyed for some reason. The group 89 * freeing_mark - called when a mark is being destroyed for some reason. The group
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index e705467ddb47..d0a1f99e24e3 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -28,7 +28,8 @@
28extern void kmemleak_init(void) __ref; 28extern void kmemleak_init(void) __ref;
29extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, 29extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
30 gfp_t gfp) __ref; 30 gfp_t gfp) __ref;
31extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; 31extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
32 gfp_t gfp) __ref;
32extern void kmemleak_free(const void *ptr) __ref; 33extern void kmemleak_free(const void *ptr) __ref;
33extern void kmemleak_free_part(const void *ptr, size_t size) __ref; 34extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
34extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; 35extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
71 gfp_t gfp) 72 gfp_t gfp)
72{ 73{
73} 74}
74static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) 75static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
76 gfp_t gfp)
75{ 77{
76} 78}
77static inline void kmemleak_free(const void *ptr) 79static inline void kmemleak_free(const void *ptr)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea..0215ffd63069 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,11 @@
21#define INIT_PHYSMEM_REGIONS 4 21#define INIT_PHYSMEM_REGIONS 4
22 22
23/* Definition of memblock flags. */ 23/* Definition of memblock flags. */
24#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ 24enum {
25 MEMBLOCK_NONE = 0x0, /* No special request */
26 MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
27 MEMBLOCK_MIRROR = 0x2, /* mirrored region */
28};
25 29
26struct memblock_region { 30struct memblock_region {
27 phys_addr_t base; 31 phys_addr_t base;
@@ -61,7 +65,7 @@ extern bool movable_node_enabled;
61 65
62phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, 66phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
63 phys_addr_t start, phys_addr_t end, 67 phys_addr_t start, phys_addr_t end,
64 int nid); 68 int nid, ulong flags);
65phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 69phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
66 phys_addr_t size, phys_addr_t align); 70 phys_addr_t size, phys_addr_t align);
67phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); 71phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -75,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
75void memblock_trim_memory(phys_addr_t align); 79void memblock_trim_memory(phys_addr_t align);
76int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); 80int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
77int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); 81int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
82int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
83ulong choose_memblock_flags(void);
78 84
79/* Low level functions */ 85/* Low level functions */
80int memblock_add_range(struct memblock_type *type, 86int memblock_add_range(struct memblock_type *type,
@@ -85,11 +91,13 @@ int memblock_remove_range(struct memblock_type *type,
85 phys_addr_t base, 91 phys_addr_t base,
86 phys_addr_t size); 92 phys_addr_t size);
87 93
88void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, 94void __next_mem_range(u64 *idx, int nid, ulong flags,
95 struct memblock_type *type_a,
89 struct memblock_type *type_b, phys_addr_t *out_start, 96 struct memblock_type *type_b, phys_addr_t *out_start,
90 phys_addr_t *out_end, int *out_nid); 97 phys_addr_t *out_end, int *out_nid);
91 98
92void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, 99void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
100 struct memblock_type *type_a,
93 struct memblock_type *type_b, phys_addr_t *out_start, 101 struct memblock_type *type_b, phys_addr_t *out_start,
94 phys_addr_t *out_end, int *out_nid); 102 phys_addr_t *out_end, int *out_nid);
95 103
@@ -100,16 +108,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
100 * @type_a: ptr to memblock_type to iterate 108 * @type_a: ptr to memblock_type to iterate
101 * @type_b: ptr to memblock_type which excludes from the iteration 109 * @type_b: ptr to memblock_type which excludes from the iteration
102 * @nid: node selector, %NUMA_NO_NODE for all nodes 110 * @nid: node selector, %NUMA_NO_NODE for all nodes
111 * @flags: pick from blocks based on memory attributes
103 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 112 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
104 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 113 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
105 * @p_nid: ptr to int for nid of the range, can be %NULL 114 * @p_nid: ptr to int for nid of the range, can be %NULL
106 */ 115 */
107#define for_each_mem_range(i, type_a, type_b, nid, \ 116#define for_each_mem_range(i, type_a, type_b, nid, flags, \
108 p_start, p_end, p_nid) \ 117 p_start, p_end, p_nid) \
109 for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ 118 for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
110 p_start, p_end, p_nid); \ 119 p_start, p_end, p_nid); \
111 i != (u64)ULLONG_MAX; \ 120 i != (u64)ULLONG_MAX; \
112 __next_mem_range(&i, nid, type_a, type_b, \ 121 __next_mem_range(&i, nid, flags, type_a, type_b, \
113 p_start, p_end, p_nid)) 122 p_start, p_end, p_nid))
114 123
115/** 124/**
@@ -119,17 +128,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
119 * @type_a: ptr to memblock_type to iterate 128 * @type_a: ptr to memblock_type to iterate
120 * @type_b: ptr to memblock_type which excludes from the iteration 129 * @type_b: ptr to memblock_type which excludes from the iteration
121 * @nid: node selector, %NUMA_NO_NODE for all nodes 130 * @nid: node selector, %NUMA_NO_NODE for all nodes
131 * @flags: pick from blocks based on memory attributes
122 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 132 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
123 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 133 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
124 * @p_nid: ptr to int for nid of the range, can be %NULL 134 * @p_nid: ptr to int for nid of the range, can be %NULL
125 */ 135 */
126#define for_each_mem_range_rev(i, type_a, type_b, nid, \ 136#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
127 p_start, p_end, p_nid) \ 137 p_start, p_end, p_nid) \
128 for (i = (u64)ULLONG_MAX, \ 138 for (i = (u64)ULLONG_MAX, \
129 __next_mem_range_rev(&i, nid, type_a, type_b, \ 139 __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
130 p_start, p_end, p_nid); \ 140 p_start, p_end, p_nid); \
131 i != (u64)ULLONG_MAX; \ 141 i != (u64)ULLONG_MAX; \
132 __next_mem_range_rev(&i, nid, type_a, type_b, \ 142 __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
133 p_start, p_end, p_nid)) 143 p_start, p_end, p_nid))
134 144
135#ifdef CONFIG_MOVABLE_NODE 145#ifdef CONFIG_MOVABLE_NODE
@@ -153,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
153} 163}
154#endif 164#endif
155 165
166static inline bool memblock_is_mirror(struct memblock_region *m)
167{
168 return m->flags & MEMBLOCK_MIRROR;
169}
170
156#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 171#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
157int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, 172int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
158 unsigned long *end_pfn); 173 unsigned long *end_pfn);
@@ -181,13 +196,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
181 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 196 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
182 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 197 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
183 * @p_nid: ptr to int for nid of the range, can be %NULL 198 * @p_nid: ptr to int for nid of the range, can be %NULL
199 * @flags: pick from blocks based on memory attributes
184 * 200 *
185 * Walks over free (memory && !reserved) areas of memblock. Available as 201 * Walks over free (memory && !reserved) areas of memblock. Available as
186 * soon as memblock is initialized. 202 * soon as memblock is initialized.
187 */ 203 */
188#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ 204#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
189 for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ 205 for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
190 nid, p_start, p_end, p_nid) 206 nid, flags, p_start, p_end, p_nid)
191 207
192/** 208/**
193 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 209 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +212,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
196 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 212 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
197 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 213 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
198 * @p_nid: ptr to int for nid of the range, can be %NULL 214 * @p_nid: ptr to int for nid of the range, can be %NULL
215 * @flags: pick from blocks based on memory attributes
199 * 216 *
200 * Walks over free (memory && !reserved) areas of memblock in reverse 217 * Walks over free (memory && !reserved) areas of memblock in reverse
201 * order. Available as soon as memblock is initialized. 218 * order. Available as soon as memblock is initialized.
202 */ 219 */
203#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ 220#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
221 p_nid) \
204 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ 222 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
205 nid, p_start, p_end, p_nid) 223 nid, flags, p_start, p_end, p_nid)
206 224
207static inline void memblock_set_region_flags(struct memblock_region *r, 225static inline void memblock_set_region_flags(struct memblock_region *r,
208 unsigned long flags) 226 unsigned long flags)
@@ -273,7 +291,8 @@ static inline bool memblock_bottom_up(void) { return false; }
273#define MEMBLOCK_ALLOC_ACCESSIBLE 0 291#define MEMBLOCK_ALLOC_ACCESSIBLE 0
274 292
275phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, 293phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
276 phys_addr_t start, phys_addr_t end); 294 phys_addr_t start, phys_addr_t end,
295 ulong flags);
277phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, 296phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
278 phys_addr_t max_addr); 297 phys_addr_t max_addr);
279phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, 298phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
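
A standalone sketch of the flag handling these memblock changes add: each region carries a flags word, and memblock_is_mirror() is a plain bit test against MEMBLOCK_MIRROR. struct region below is a reduced stand-in for struct memblock_region.

#include <stdbool.h>
#include <stdio.h>

enum {
	MEMBLOCK_NONE	 = 0x0,	/* no special request */
	MEMBLOCK_HOTPLUG = 0x1,	/* hotpluggable region */
	MEMBLOCK_MIRROR	 = 0x2,	/* mirrored region */
};

struct region {
	unsigned long base, size, flags;
};

static bool region_is_mirror(const struct region *r)
{
	return r->flags & MEMBLOCK_MIRROR;
}

int main(void)
{
	struct region r = { .base = 0x100000, .size = 0x200000, .flags = MEMBLOCK_MIRROR };

	printf("mirrored: %d\n", region_is_mirror(&r));	/* 1 */
	return 0;
}
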
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644
index 000000000000..4efc3f56e6df
--- /dev/null
+++ b/include/linux/mm-arch-hooks.h
@@ -0,0 +1,25 @@
1/*
2 * Generic mm no-op hooks.
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef _LINUX_MM_ARCH_HOOKS_H
12#define _LINUX_MM_ARCH_HOOKS_H
13
14#include <asm/mm-arch-hooks.h>
15
16#ifndef arch_remap
17static inline void arch_remap(struct mm_struct *mm,
18 unsigned long old_start, unsigned long old_end,
19 unsigned long new_start, unsigned long new_end)
20{
21}
22#define arch_remap arch_remap
23#endif
24
25#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..24ad583596d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page)
499 499
500static inline bool __compound_tail_refcounted(struct page *page) 500static inline bool __compound_tail_refcounted(struct page *page)
501{ 501{
502 return !PageSlab(page) && !PageHeadHuge(page); 502 return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
503} 503}
504 504
505/* 505/*
@@ -2146,12 +2146,47 @@ enum mf_flags {
2146extern int memory_failure(unsigned long pfn, int trapno, int flags); 2146extern int memory_failure(unsigned long pfn, int trapno, int flags);
2147extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 2147extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
2148extern int unpoison_memory(unsigned long pfn); 2148extern int unpoison_memory(unsigned long pfn);
2149extern int get_hwpoison_page(struct page *page);
2149extern int sysctl_memory_failure_early_kill; 2150extern int sysctl_memory_failure_early_kill;
2150extern int sysctl_memory_failure_recovery; 2151extern int sysctl_memory_failure_recovery;
2151extern void shake_page(struct page *p, int access); 2152extern void shake_page(struct page *p, int access);
2152extern atomic_long_t num_poisoned_pages; 2153extern atomic_long_t num_poisoned_pages;
2153extern int soft_offline_page(struct page *page, int flags); 2154extern int soft_offline_page(struct page *page, int flags);
2154 2155
2156
2157/*
2158 * Error handlers for various types of pages.
2159 */
2160enum mf_result {
2161 MF_IGNORED, /* Error: cannot be handled */
2162 MF_FAILED, /* Error: handling failed */
2163 MF_DELAYED, /* Will be handled later */
2164 MF_RECOVERED, /* Successfully recovered */
2165};
2166
2167enum mf_action_page_type {
2168 MF_MSG_KERNEL,
2169 MF_MSG_KERNEL_HIGH_ORDER,
2170 MF_MSG_SLAB,
2171 MF_MSG_DIFFERENT_COMPOUND,
2172 MF_MSG_POISONED_HUGE,
2173 MF_MSG_HUGE,
2174 MF_MSG_FREE_HUGE,
2175 MF_MSG_UNMAP_FAILED,
2176 MF_MSG_DIRTY_SWAPCACHE,
2177 MF_MSG_CLEAN_SWAPCACHE,
2178 MF_MSG_DIRTY_MLOCKED_LRU,
2179 MF_MSG_CLEAN_MLOCKED_LRU,
2180 MF_MSG_DIRTY_UNEVICTABLE_LRU,
2181 MF_MSG_CLEAN_UNEVICTABLE_LRU,
2182 MF_MSG_DIRTY_LRU,
2183 MF_MSG_CLEAN_LRU,
2184 MF_MSG_TRUNCATED_LRU,
2185 MF_MSG_BUDDY,
2186 MF_MSG_BUDDY_2ND,
2187 MF_MSG_UNKNOWN,
2188};
2189
2155#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 2190#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
2156extern void clear_huge_page(struct page *page, 2191extern void clear_huge_page(struct page *page,
2157 unsigned long addr, 2192 unsigned long addr,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d28a0ee..61cd67f4d788 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
324 ___pte; \ 324 ___pte; \
325}) 325})
326 326
327#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ 327#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
328({ \ 328({ \
329 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ 329 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
330 struct mm_struct *___mm = (__vma)->vm_mm; \ 330 struct mm_struct *___mm = (__vma)->vm_mm; \
331 pmd_t ___pmd; \ 331 pmd_t ___pmd; \
332 \ 332 \
333 ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ 333 ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
334 mmu_notifier_invalidate_range(___mm, ___haddr, \ 334 mmu_notifier_invalidate_range(___mm, ___haddr, \
335 ___haddr + HPAGE_PMD_SIZE); \ 335 ___haddr + HPAGE_PMD_SIZE); \
336 \ 336 \
337 ___pmd; \ 337 ___pmd; \
338}) 338})
339 339
340#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ 340#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
341({ \ 341({ \
342 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ 342 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
343 pmd_t ___pmd; \ 343 pmd_t ___pmd; \
344 \ 344 \
345 ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ 345 ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \
346 mmu_notifier_invalidate_range(__mm, ___haddr, \ 346 mmu_notifier_invalidate_range(__mm, ___haddr, \
347 ___haddr + HPAGE_PMD_SIZE); \ 347 ___haddr + HPAGE_PMD_SIZE); \
348 \ 348 \
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
428#define ptep_clear_flush_young_notify ptep_clear_flush_young 428#define ptep_clear_flush_young_notify ptep_clear_flush_young
429#define pmdp_clear_flush_young_notify pmdp_clear_flush_young 429#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
430#define ptep_clear_flush_notify ptep_clear_flush 430#define ptep_clear_flush_notify ptep_clear_flush
431#define pmdp_clear_flush_notify pmdp_clear_flush 431#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
432#define pmdp_get_and_clear_notify pmdp_get_and_clear 432#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
433#define set_pte_at_notify set_pte_at 433#define set_pte_at_notify set_pte_at
434 434
435#endif /* CONFIG_MMU_NOTIFIER */ 435#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 3d46fb4708e0..f94da0e65dea 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
67extern int soft_watchdog_enabled; 67extern int soft_watchdog_enabled;
68extern int watchdog_user_enabled; 68extern int watchdog_user_enabled;
69extern int watchdog_thresh; 69extern int watchdog_thresh;
70extern unsigned long *watchdog_cpumask_bits;
70extern int sysctl_softlockup_all_cpu_backtrace; 71extern int sysctl_softlockup_all_cpu_backtrace;
71struct ctl_table; 72struct ctl_table;
72extern int proc_watchdog(struct ctl_table *, int , 73extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
77 void __user *, size_t *, loff_t *); 78 void __user *, size_t *, loff_t *);
78extern int proc_watchdog_thresh(struct ctl_table *, int , 79extern int proc_watchdog_thresh(struct ctl_table *, int ,
79 void __user *, size_t *, loff_t *); 80 void __user *, size_t *, loff_t *);
81extern int proc_watchdog_cpumask(struct ctl_table *, int,
82 void __user *, size_t *, loff_t *);
80#endif 83#endif
81 84
82#ifdef CONFIG_HAVE_ACPI_APEI_NMI 85#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 44b2f6f7bbd8..7deecb7bca5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
32/* Thread is the potential origin of an oom condition; kill first on oom */ 32/* Thread is the potential origin of an oom condition; kill first on oom */
33#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) 33#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
34 34
35extern struct mutex oom_lock;
36
35static inline void set_current_oom_origin(void) 37static inline void set_current_oom_origin(void)
36{ 38{
37 current->signal->oom_flags |= OOM_FLAG_ORIGIN; 39 current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -47,9 +49,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
47 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); 49 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
48} 50}
49 51
50extern void mark_tsk_oom_victim(struct task_struct *tsk); 52extern void mark_oom_victim(struct task_struct *tsk);
51
52extern void unmark_oom_victim(void);
53 53
54extern unsigned long oom_badness(struct task_struct *p, 54extern unsigned long oom_badness(struct task_struct *p,
55 struct mem_cgroup *memcg, const nodemask_t *nodemask, 55 struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -62,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
62 struct mem_cgroup *memcg, nodemask_t *nodemask, 62 struct mem_cgroup *memcg, nodemask_t *nodemask,
63 const char *message); 63 const char *message);
64 64
65extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
66extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
67
68extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 65extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
69 int order, const nodemask_t *nodemask, 66 int order, const nodemask_t *nodemask,
70 struct mem_cgroup *memcg); 67 struct mem_cgroup *memcg);
@@ -75,6 +72,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
75 72
76extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 73extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
77 int order, nodemask_t *mask, bool force_kill); 74 int order, nodemask_t *mask, bool force_kill);
75
76extern void exit_oom_victim(void);
77
78extern int register_oom_notifier(struct notifier_block *nb); 78extern int register_oom_notifier(struct notifier_block *nb);
79extern int unregister_oom_notifier(struct notifier_block *nb); 79extern int unregister_oom_notifier(struct notifier_block *nb);
80 80
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c830151..9de2fdc8b5e4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,8 +153,30 @@ size_t ksize(const void *);
153#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN 153#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
154#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN 154#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
155#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) 155#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
156/*
157 * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
158 * to create the kmalloc_caches object in create_kmalloc_caches(). The first
159 * and the second are 96 and 192. You can see that in the kmalloc_index(), if
160 * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
161 * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
162 * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
163 */
164#if KMALLOC_MIN_SIZE <= 32
165#define KMALLOC_LOOP_LOW 1
166#elif KMALLOC_MIN_SIZE <= 64
167#define KMALLOC_LOOP_LOW 2
168#else
169#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
170#endif
171
156#else 172#else
157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 173#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
174/*
175 * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
176 * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
177 * initialized.
178 */
179#define KMALLOC_LOOP_LOW 1
158#endif 180#endif
159 181
160/* 182/*
@@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
240 * belongs to. 262 * belongs to.
241 * 0 = zero alloc 263 * 0 = zero alloc
242 * 1 = 65 .. 96 bytes 264 * 1 = 65 .. 96 bytes
243 * 2 = 120 .. 192 bytes 265 * 2 = 129 .. 192 bytes
244 * n = 2^(n-1) .. 2^n -1 266 * n = 2^(n-1)+1 .. 2^n
245 */ 267 */
246static __always_inline int kmalloc_index(size_t size) 268static __always_inline int kmalloc_index(size_t size)
247{ 269{
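
The corrected kmalloc_index() comment above describes the size-class boundaries: index 1 covers 65..96 bytes, index 2 covers 129..192 bytes, and index n covers 2^(n-1)+1 .. 2^n bytes. A standalone sketch that mirrors that mapping (it is not the kernel function and ignores the KMALLOC_MIN_SIZE special cases):

#include <stdio.h>

static int kmalloc_index_sketch(size_t size)
{
	int n = 3;

	if (size == 0)
		return 0;
	if (size > 64 && size <= 96)
		return 1;
	if (size > 128 && size <= 192)
		return 2;
	while ((1UL << n) < size)	/* smallest n with 2^n >= size */
		n++;
	return n;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       kmalloc_index_sketch(96),	/* 1 */
	       kmalloc_index_sketch(129),	/* 2 */
	       kmalloc_index_sketch(192),	/* 2 */
	       kmalloc_index_sketch(193));	/* 8, since 2^8 = 256 */
	return 0;
}
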
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index d600afb21926..da3c593f9845 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
27 * @pre_unpark: Optional unpark function, called before the thread is 27 * @pre_unpark: Optional unpark function, called before the thread is
28 * unparked (cpu online). This is not guaranteed to be 28 * unparked (cpu online). This is not guaranteed to be
29 * called on the target cpu of the thread. Careful! 29 * called on the target cpu of the thread. Careful!
30 * @cpumask: Internal state. To update which threads are unparked,
31 * call smpboot_update_cpumask_percpu_thread().
30 * @selfparking: Thread is not parked by the park function. 32 * @selfparking: Thread is not parked by the park function.
31 * @thread_comm: The base name of the thread 33 * @thread_comm: The base name of the thread
32 */ 34 */
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
41 void (*park)(unsigned int cpu); 43 void (*park)(unsigned int cpu);
42 void (*unpark)(unsigned int cpu); 44 void (*unpark)(unsigned int cpu);
43 void (*pre_unpark)(unsigned int cpu); 45 void (*pre_unpark)(unsigned int cpu);
46 cpumask_var_t cpumask;
44 bool selfparking; 47 bool selfparking;
45 const char *thread_comm; 48 const char *thread_comm;
46}; 49};
47 50
48int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); 51int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
49void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 52void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
53int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
54 const struct cpumask *);
50 55
51#endif 56#endif
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 79abb9c71772..1443d79e4fe6 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -11,6 +11,7 @@
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/aer.h> 12#include <linux/aer.h>
13#include <linux/cper.h> 13#include <linux/cper.h>
14#include <linux/mm.h>
14 15
15/* 16/*
16 * MCE Extended Error Log trace event 17 * MCE Extended Error Log trace event
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event,
232 __print_flags(__entry->status, "|", aer_uncorrectable_errors)) 233 __print_flags(__entry->status, "|", aer_uncorrectable_errors))
233); 234);
234 235
236/*
237 * memory-failure recovery action result event
238 *
239 * unsigned long pfn - Page Frame Number of the corrupted page
240 * int type - Page types of the corrupted page
241 * int result - Result of recovery action
242 */
243
244#ifdef CONFIG_MEMORY_FAILURE
245#define MF_ACTION_RESULT \
246 EM ( MF_IGNORED, "Ignored" ) \
247 EM ( MF_FAILED, "Failed" ) \
248 EM ( MF_DELAYED, "Delayed" ) \
249 EMe ( MF_RECOVERED, "Recovered" )
250
251#define MF_PAGE_TYPE \
252 EM ( MF_MSG_KERNEL, "reserved kernel page" ) \
253 EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \
254 EM ( MF_MSG_SLAB, "kernel slab page" ) \
255 EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
256 EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
257 EM ( MF_MSG_HUGE, "huge page" ) \
258 EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
259 EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
260 EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
261 EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
262 EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \
263 EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \
264 EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \
265 EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \
266 EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \
267 EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \
268 EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
269 EM ( MF_MSG_BUDDY, "free buddy page" ) \
270 EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
271 EMe ( MF_MSG_UNKNOWN, "unknown page" )
272
273/*
274 * First define the enums in MM_ACTION_RESULT to be exported to userspace
275 * via TRACE_DEFINE_ENUM().
276 */
277#undef EM
278#undef EMe
279#define EM(a, b) TRACE_DEFINE_ENUM(a);
280#define EMe(a, b) TRACE_DEFINE_ENUM(a);
281
282MF_ACTION_RESULT
283MF_PAGE_TYPE
284
285/*
286 * Now redefine the EM() and EMe() macros to map the enums to the strings
287 * that will be printed in the output.
288 */
289#undef EM
290#undef EMe
291#define EM(a, b) { a, b },
292#define EMe(a, b) { a, b }
293
294TRACE_EVENT(memory_failure_event,
295 TP_PROTO(unsigned long pfn,
296 int type,
297 int result),
298
299 TP_ARGS(pfn, type, result),
300
301 TP_STRUCT__entry(
302 __field(unsigned long, pfn)
303 __field(int, type)
304 __field(int, result)
305 ),
306
307 TP_fast_assign(
308 __entry->pfn = pfn;
309 __entry->type = type;
310 __entry->result = result;
311 ),
312
313 TP_printk("pfn %#lx: recovery action for %s: %s",
314 __entry->pfn,
315 __print_symbolic(__entry->type, MF_PAGE_TYPE),
316 __print_symbolic(__entry->result, MF_ACTION_RESULT)
317 )
318);
319#endif /* CONFIG_MEMORY_FAILURE */
235#endif /* _TRACE_HW_EVENT_MC_H */ 320#endif /* _TRACE_HW_EVENT_MC_H */
236 321
237/* This part must be outside protection */ 322/* This part must be outside protection */
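
The EM()/EMe() dance above is the usual X-macro pattern: the same list is expanded once so TRACE_DEFINE_ENUM() can export the values and once to build the symbolic-print table. A standalone sketch of that double expansion in plain C, with no tracing infrastructure involved:

#include <stdio.h>

#define MF_RESULTS \
	EM(MF_IGNORED,   "Ignored")  \
	EM(MF_FAILED,    "Failed")   \
	EM(MF_DELAYED,   "Delayed")  \
	EMe(MF_RECOVERED, "Recovered")

/* First expansion: declare the enum constants. */
#define EM(a, b)  a,
#define EMe(a, b) a
enum mf_result_sketch { MF_RESULTS };
#undef EM
#undef EMe

/* Second expansion: build a value-to-string table from the same list. */
#define EM(a, b)  { a, b },
#define EMe(a, b) { a, b }
static const struct { int val; const char *name; } mf_names[] = { MF_RESULTS };

int main(void)
{
	printf("%s\n", mf_names[MF_DELAYED].name);	/* "Delayed" */
	return 0;
}
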
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec40..185752a729f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 if (test_thread_flag(TIF_MEMDIE)) 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim(); 439 exit_oom_victim();
440} 440}
441 441
442static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c697f73d82d6..7c434c39f02a 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
232 232
233 mutex_lock(&smpboot_threads_lock); 233 mutex_lock(&smpboot_threads_lock);
234 list_for_each_entry(cur, &hotplug_threads, list) 234 list_for_each_entry(cur, &hotplug_threads, list)
235 smpboot_unpark_thread(cur, cpu); 235 if (cpumask_test_cpu(cpu, cur->cpumask))
236 smpboot_unpark_thread(cur, cpu);
236 mutex_unlock(&smpboot_threads_lock); 237 mutex_unlock(&smpboot_threads_lock);
237} 238}
238 239
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
258{ 259{
259 unsigned int cpu; 260 unsigned int cpu;
260 261
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
261 /* We need to destroy also the parked threads of offline cpus */ 271 /* We need to destroy also the parked threads of offline cpus */
262 for_each_possible_cpu(cpu) { 272 for_each_possible_cpu(cpu) {
263 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
281 unsigned int cpu; 291 unsigned int cpu;
282 int ret = 0; 292 int ret = 0;
283 293
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
297
284 get_online_cpus(); 298 get_online_cpus();
285 mutex_lock(&smpboot_threads_lock); 299 mutex_lock(&smpboot_threads_lock);
286 for_each_online_cpu(cpu) { 300 for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
313 smpboot_destroy_threads(plug_thread); 327 smpboot_destroy_threads(plug_thread);
314 mutex_unlock(&smpboot_threads_lock); 328 mutex_unlock(&smpboot_threads_lock);
315 put_online_cpus(); 329 put_online_cpus();
330 free_cpumask_var(plug_thread->cpumask);
316} 331}
317EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 332EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
318 333
334/**
335 * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
336 * @plug_thread: Hotplug thread descriptor
337 * @new: Revised mask to use
338 *
339 * The cpumask field in the smp_hotplug_thread must not be updated directly
340 * by the client, but only by calling this function.
341 * This function can only be called on a registered smp_hotplug_thread.
342 */
343int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
344 const struct cpumask *new)
345{
346 struct cpumask *old = plug_thread->cpumask;
347 cpumask_var_t tmp;
348 unsigned int cpu;
349
350 if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
351 return -ENOMEM;
352
353 get_online_cpus();
354 mutex_lock(&smpboot_threads_lock);
355
356 /* Park threads that were exclusively enabled on the old mask. */
357 cpumask_andnot(tmp, old, new);
358 for_each_cpu_and(cpu, tmp, cpu_online_mask)
359 smpboot_park_thread(plug_thread, cpu);
360
361 /* Unpark threads that are exclusively enabled on the new mask. */
362 cpumask_andnot(tmp, new, old);
363 for_each_cpu_and(cpu, tmp, cpu_online_mask)
364 smpboot_unpark_thread(plug_thread, cpu);
365
366 cpumask_copy(old, new);
367
368 mutex_unlock(&smpboot_threads_lock);
369 put_online_cpus();
370
371 free_cpumask_var(tmp);
372
373 return 0;
374}
375EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
376
319static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 377static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
320 378
321/* 379/*
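
A standalone sketch of the mask arithmetic inside smpboot_update_cpumask_percpu_thread() above: CPUs present only in the old mask are parked, CPUs present only in the new mask are unparked. Plain unsigned longs stand in for cpumask_var_t here.

#include <stdio.h>

int main(void)
{
	unsigned long old = 0x0fUL;	/* threads currently enabled on CPUs 0-3 */
	unsigned long new = 0x3cUL;	/* requested mask: CPUs 2-5 */

	unsigned long to_park   = old & ~new;	/* CPUs 0-1 */
	unsigned long to_unpark = new & ~old;	/* CPUs 4-5 */

	printf("park 0x%lx, unpark 0x%lx\n", to_park, to_unpark);	/* park 0x3, unpark 0x30 */
	return 0;
}
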
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b13e9d2de302..812fcc3fd390 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -872,6 +872,13 @@ static struct ctl_table kern_table[] = {
872 .extra2 = &one, 872 .extra2 = &one,
873 }, 873 },
874 { 874 {
875 .procname = "watchdog_cpumask",
876 .data = &watchdog_cpumask_bits,
877 .maxlen = NR_CPUS,
878 .mode = 0644,
879 .proc_handler = proc_watchdog_cpumask,
880 },
881 {
875 .procname = "softlockup_panic", 882 .procname = "softlockup_panic",
876 .data = &softlockup_panic, 883 .data = &softlockup_panic,
877 .maxlen = sizeof(int), 884 .maxlen = sizeof(int),
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04c64..a6ffa43f2993 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/smpboot.h> 20#include <linux/smpboot.h>
21#include <linux/sched/rt.h> 21#include <linux/sched/rt.h>
22#include <linux/tick.h>
22 23
23#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
24#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
58#else 59#else
59#define sysctl_softlockup_all_cpu_backtrace 0 60#define sysctl_softlockup_all_cpu_backtrace 0
60#endif 61#endif
62static struct cpumask watchdog_cpumask __read_mostly;
63unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
64
65/* Helper for online, unparked cpus. */
66#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
61 68
62static int __read_mostly watchdog_running; 69static int __read_mostly watchdog_running;
63static u64 __read_mostly sample_period; 70static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
207 * do we care if a 0 races with a timestamp? 214 * do we care if a 0 races with a timestamp?
208 * all it means is the softlock check starts one cycle later 215 * all it means is the softlock check starts one cycle later
209 */ 216 */
210 for_each_online_cpu(cpu) 217 for_each_watchdog_cpu(cpu)
211 per_cpu(watchdog_touch_ts, cpu) = 0; 218 per_cpu(watchdog_touch_ts, cpu) = 0;
212} 219}
213 220
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
616 goto unlock; 623 goto unlock;
617 624
618 get_online_cpus(); 625 get_online_cpus();
619 for_each_online_cpu(cpu) 626 for_each_watchdog_cpu(cpu)
620 watchdog_nmi_enable(cpu); 627 watchdog_nmi_enable(cpu);
621 put_online_cpus(); 628 put_online_cpus();
622 629
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
634 goto unlock; 641 goto unlock;
635 642
636 get_online_cpus(); 643 get_online_cpus();
637 for_each_online_cpu(cpu) 644 for_each_watchdog_cpu(cpu)
638 watchdog_nmi_disable(cpu); 645 watchdog_nmi_disable(cpu);
639 put_online_cpus(); 646 put_online_cpus();
640 647
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
696 int cpu; 703 int cpu;
697 704
698 get_online_cpus(); 705 get_online_cpus();
699 for_each_online_cpu(cpu) 706 for_each_watchdog_cpu(cpu)
700 update_watchdog(cpu); 707 update_watchdog(cpu);
701 put_online_cpus(); 708 put_online_cpus();
702} 709}
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
709 err = smpboot_register_percpu_thread(&watchdog_threads); 716 err = smpboot_register_percpu_thread(&watchdog_threads);
710 if (err) 717 if (err)
711 pr_err("Failed to create watchdog threads, disabled\n"); 718 pr_err("Failed to create watchdog threads, disabled\n");
712 else 719 else {
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
713 watchdog_running = 1; 723 watchdog_running = 1;
724 }
714 } else { 725 } else {
715 /* 726 /*
716 * Enable/disable the lockup detectors or 727 * Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
879 mutex_unlock(&watchdog_proc_mutex); 890 mutex_unlock(&watchdog_proc_mutex);
880 return err; 891 return err;
881} 892}
893
894/*
895 * The cpumask is the mask of possible cpus that the watchdog can run
896 * on, not the mask of cpus it is actually running on. This allows the
897 * user to specify a mask that will include cpus that have not yet
898 * been brought online, if desired.
899 */
900int proc_watchdog_cpumask(struct ctl_table *table, int write,
901 void __user *buffer, size_t *lenp, loff_t *ppos)
902{
903 int err;
904
905 mutex_lock(&watchdog_proc_mutex);
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */
909 cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
910 cpu_possible_mask);
911
912 if (watchdog_running) {
913 /*
914 * Failure would be due to being unable to allocate
915 * a temporary cpumask, so we are likely not in a
916 * position to do much else to make things better.
917 */
918 if (smpboot_update_cpumask_percpu_thread(
919 &watchdog_threads, &watchdog_cpumask) != 0)
920 pr_err("cpumask update failed\n");
921 }
922 }
923 mutex_unlock(&watchdog_proc_mutex);
924 return err;
925}
926
882#endif /* CONFIG_SYSCTL */ 927#endif /* CONFIG_SYSCTL */
883 928
884void __init lockup_detector_init(void) 929void __init lockup_detector_init(void)
885{ 930{
886 set_sample_period(); 931 set_sample_period();
887 932
933#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask))
936 pr_info("Disabling watchdog on nohz_full cores by default\n");
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else
942 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
943#endif
944
888 if (watchdog_enabled) 945 if (watchdog_enabled)
889 watchdog_enable_all_cpus(); 946 watchdog_enable_all_cpus();
890} 947}
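
Given the kern_table entry and the proc_do_large_bitmap() handler above, the new knob should appear as /proc/sys/kernel/watchdog_cpumask and accept a cpulist string; a small illustrative userspace writer (path and format are inferred from the patch, run as root):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/watchdog_cpumask", "w");

	if (!f) {
		perror("watchdog_cpumask");
		return 1;
	}
	fputs("0-3\n", f);	/* restrict the watchdog to CPUs 0-3 */
	return fclose(f) ? 1 : 0;
}
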
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da4546..c180af880ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
368 depends on ARCH_SUPPORTS_MEMORY_FAILURE 368 depends on ARCH_SUPPORTS_MEMORY_FAILURE
369 bool "Enable recovery from hardware memory errors" 369 bool "Enable recovery from hardware memory errors"
370 select MEMORY_ISOLATION 370 select MEMORY_ISOLATION
371 select RAS
371 help 372 help
372 Enables code to recover from some memory failures on systems 373 Enables code to recover from some memory failures on systems
373 with MCA recovery. This allows a system to continue running 374 with MCA recovery. This allows a system to continue running
diff --git a/mm/cma.c b/mm/cma.c
index 3a7a67b93394..e7d1db533025 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
182 if (!size || !memblock_is_region_reserved(base, size)) 182 if (!size || !memblock_is_region_reserved(base, size))
183 return -EINVAL; 183 return -EINVAL;
184 184
185 /* ensure minimal alignment requied by mm core */ 185 /* ensure minimal alignment required by mm core */
186 alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); 186 alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
187 187
188 /* alignment should be aligned with order_per_bit */ 188 /* alignment should be aligned with order_per_bit */
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
238 /* 238 /*
239 * high_memory isn't direct mapped memory so retrieving its physical 239 * high_memory isn't direct mapped memory so retrieving its physical
240 * address isn't appropriate. But it would be useful to check the 240 * address isn't appropriate. But it would be useful to check the
241 * physical address of the highmem boundary so it's justfiable to get 241 * physical address of the highmem boundary so it's justifiable to get
242 * the physical address from it. On x86 there is a validation check for 242 * the physical address from it. On x86 there is a validation check for
243 * this case, so the following workaround is needed to avoid it. 243 * this case, so the following workaround is needed to avoid it.
244 */ 244 */
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
316 */ 316 */
317 if (base < highmem_start && limit > highmem_start) { 317 if (base < highmem_start && limit > highmem_start) {
318 addr = memblock_alloc_range(size, alignment, 318 addr = memblock_alloc_range(size, alignment,
319 highmem_start, limit); 319 highmem_start, limit,
320 MEMBLOCK_NONE);
320 limit = highmem_start; 321 limit = highmem_start;
321 } 322 }
322 323
323 if (!addr) { 324 if (!addr) {
324 addr = memblock_alloc_range(size, alignment, base, 325 addr = memblock_alloc_range(size, alignment, base,
325 limit); 326 limit,
327 MEMBLOCK_NONE);
326 if (!addr) { 328 if (!addr) {
327 ret = -ENOMEM; 329 ret = -ENOMEM;
328 goto err; 330 goto err;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..8d17ceea8dbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
196 page->mapping = NULL; 196 page->mapping = NULL;
197 /* Leave page->index set: truncation lookup relies upon it */ 197 /* Leave page->index set: truncation lookup relies upon it */
198 198
199 __dec_zone_page_state(page, NR_FILE_PAGES); 199 /* hugetlb pages do not participate in page cache accounting. */
200 if (!PageHuge(page))
201 __dec_zone_page_state(page, NR_FILE_PAGES);
200 if (PageSwapBacked(page)) 202 if (PageSwapBacked(page))
201 __dec_zone_page_state(page, NR_SHMEM); 203 __dec_zone_page_state(page, NR_SHMEM);
202 BUG_ON(page_mapped(page)); 204 BUG_ON(page_mapped(page));
@@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
483 error = radix_tree_insert(&mapping->page_tree, offset, new); 485 error = radix_tree_insert(&mapping->page_tree, offset, new);
484 BUG_ON(error); 486 BUG_ON(error);
485 mapping->nrpages++; 487 mapping->nrpages++;
486 __inc_zone_page_state(new, NR_FILE_PAGES); 488
489 /*
490 * hugetlb pages do not participate in page cache accounting.
491 */
492 if (!PageHuge(new))
493 __inc_zone_page_state(new, NR_FILE_PAGES);
487 if (PageSwapBacked(new)) 494 if (PageSwapBacked(new))
488 __inc_zone_page_state(new, NR_SHMEM); 495 __inc_zone_page_state(new, NR_SHMEM);
489 spin_unlock_irq(&mapping->tree_lock); 496 spin_unlock_irq(&mapping->tree_lock);
@@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page,
575 radix_tree_preload_end(); 582 radix_tree_preload_end();
576 if (unlikely(error)) 583 if (unlikely(error))
577 goto err_insert; 584 goto err_insert;
578 __inc_zone_page_state(page, NR_FILE_PAGES); 585
586 /* hugetlb pages do not participate in page cache accounting. */
587 if (!huge)
588 __inc_zone_page_state(page, NR_FILE_PAGES);
579 spin_unlock_irq(&mapping->tree_lock); 589 spin_unlock_irq(&mapping->tree_lock);
580 if (!huge) 590 if (!huge)
581 mem_cgroup_commit_charge(page, memcg, false); 591 mem_cgroup_commit_charge(page, memcg, false);
@@ -1654,8 +1664,8 @@ no_cached_page:
1654 error = -ENOMEM; 1664 error = -ENOMEM;
1655 goto out; 1665 goto out;
1656 } 1666 }
1657 error = add_to_page_cache_lru(page, mapping, 1667 error = add_to_page_cache_lru(page, mapping, index,
1658 index, GFP_KERNEL); 1668 GFP_KERNEL & mapping_gfp_mask(mapping));
1659 if (error) { 1669 if (error) {
1660 page_cache_release(page); 1670 page_cache_release(page);
1661 if (error == -EEXIST) { 1671 if (error == -EEXIST) {
@@ -1756,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1756 if (!page) 1766 if (!page)
1757 return -ENOMEM; 1767 return -ENOMEM;
1758 1768
1759 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); 1769 ret = add_to_page_cache_lru(page, mapping, offset,
1770 GFP_KERNEL & mapping_gfp_mask(mapping));
1760 if (ret == 0) 1771 if (ret == 0)
1761 ret = mapping->a_ops->readpage(file, page); 1772 ret = mapping->a_ops->readpage(file, page);
1762 else if (ret == -EEXIST) 1773 else if (ret == -EEXIST)
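
The add_to_page_cache_lru() call sites above now intersect GFP_KERNEL with the mapping's own gfp mask, so the allocation never uses flags the mapping forbids. A standalone sketch of that intersection, with made-up flag values rather than the kernel's:

#include <stdio.h>

#define GFP_WAIT   0x1u
#define GFP_IO     0x2u
#define GFP_FS     0x4u
#define GFP_KERNEL (GFP_WAIT | GFP_IO | GFP_FS)

int main(void)
{
	unsigned int mapping_gfp = GFP_WAIT | GFP_IO;		/* e.g. a mapping that disallows FS recursion */
	unsigned int effective	 = GFP_KERNEL & mapping_gfp;

	printf("0x%x\n", effective);	/* 0x3: GFP_FS has been dropped */
	return 0;
}
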
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb085..27a9924caf61 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
21#include <linux/swapfile.h> 21#include <linux/swapfile.h>
22 22
23/* 23/*
24 * frontswap_ops is set by frontswap_register_ops to contain the pointers 24 * frontswap_ops are added by frontswap_register_ops, and provide the
25 * to the frontswap "backend" implementation functions. 25 * frontswap "backend" implementation functions. Multiple implementations
26 * may be registered, but implementations can never deregister. This
27 * is a simple singly-linked list of all registered implementations.
26 */ 28 */
27static struct frontswap_ops *frontswap_ops __read_mostly; 29static struct frontswap_ops *frontswap_ops __read_mostly;
28 30
31#define for_each_frontswap_ops(ops) \
32 for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
33
29/* 34/*
30 * If enabled, frontswap_store will return failure even on success. As 35 * If enabled, frontswap_store will return failure even on success. As
31 * a result, the swap subsystem will always write the page to swap, in 36 * a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
79 * on all frontswap functions to not call the backend until the backend 84 * on all frontswap functions to not call the backend until the backend
80 * has registered. 85 * has registered.
81 * 86 *
82 * Specifically when no backend is registered (nobody called
83 * frontswap_register_ops) all calls to frontswap_init (which is done via
84 * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
85 * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
86 * backend registers with frontswap at some later point the previous
87 * calls to frontswap_init are executed (by iterating over the need_init
88 * bitmap) to create tmem_pools and set the respective poolids. All of that is
89 * guarded by us using atomic bit operations on the 'need_init' bitmap.
90 *
 91 * This would not guard us against the user deciding to call swapoff right as 87 * This would not guard us against the user deciding to call swapoff right as
92 * we are calling the backend to initialize (so swapon is in action). 88 * we are calling the backend to initialize (so swapon is in action).
 93 * Fortunately for us, the swapon_mutex has been taken by the callee so we are 89 * Fortunately for us, the swapon_mutex has been taken by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
106 * 102 *
107 * Obviously the opposite (unloading the backend) must be done after all 103 * Obviously the opposite (unloading the backend) must be done after all
108 * the frontswap_[store|load|invalidate_area|invalidate_page] start 104 * the frontswap_[store|load|invalidate_area|invalidate_page] start
109 * ignorning or failing the requests - at which point frontswap_ops 105 * ignoring or failing the requests. However, there is currently no way
110 * would have to be made in some fashion atomic. 106 * to unload a backend once it is registered.
111 */ 107 */
112static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
113 108
114/* 109/*
115 * Register operations for frontswap, returning previous thus allowing 110 * Register operations for frontswap
116 * detection of multiple backends and possible nesting.
117 */ 111 */
118struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) 112void frontswap_register_ops(struct frontswap_ops *ops)
119{ 113{
120 struct frontswap_ops *old = frontswap_ops; 114 DECLARE_BITMAP(a, MAX_SWAPFILES);
121 int i; 115 DECLARE_BITMAP(b, MAX_SWAPFILES);
122 116 struct swap_info_struct *si;
123 for (i = 0; i < MAX_SWAPFILES; i++) { 117 unsigned int i;
124 if (test_and_clear_bit(i, need_init)) { 118
125 struct swap_info_struct *sis = swap_info[i]; 119 bitmap_zero(a, MAX_SWAPFILES);
126 /* __frontswap_init _should_ have set it! */ 120 bitmap_zero(b, MAX_SWAPFILES);
127 if (!sis->frontswap_map) 121
128 return ERR_PTR(-EINVAL); 122 spin_lock(&swap_lock);
129 ops->init(i); 123 plist_for_each_entry(si, &swap_active_head, list) {
130 } 124 if (!WARN_ON(!si->frontswap_map))
125 set_bit(si->type, a);
131 } 126 }
127 spin_unlock(&swap_lock);
128
129 /* the new ops needs to know the currently active swap devices */
130 for_each_set_bit(i, a, MAX_SWAPFILES)
131 ops->init(i);
132
132 /* 133 /*
133 * We MUST have frontswap_ops set _after_ the frontswap_init's 134 * Setting frontswap_ops must happen after the ops->init() calls
134 * have been called. Otherwise __frontswap_store might fail. Hence 135 * above; cmpxchg implies smp_mb() which will ensure the init is
135 * the barrier to make sure compiler does not re-order us. 136 * complete at this point.
136 */ 137 */
137 barrier(); 138 do {
138 frontswap_ops = ops; 139 ops->next = frontswap_ops;
139 return old; 140 } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
141
142 spin_lock(&swap_lock);
143 plist_for_each_entry(si, &swap_active_head, list) {
144 if (si->frontswap_map)
145 set_bit(si->type, b);
146 }
147 spin_unlock(&swap_lock);
148
149 /*
150 * On the very unlikely chance that a swap device was added or
151 * removed between setting the "a" list bits and the ops init
152 * calls, we re-check and do init or invalidate for any changed
153 * bits.
154 */
155 if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
156 for (i = 0; i < MAX_SWAPFILES; i++) {
157 if (!test_bit(i, a) && test_bit(i, b))
158 ops->init(i);
159 else if (test_bit(i, a) && !test_bit(i, b))
160 ops->invalidate_area(i);
161 }
162 }
140} 163}
141EXPORT_SYMBOL(frontswap_register_ops); 164EXPORT_SYMBOL(frontswap_register_ops);
142 165
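frontswap_register_ops() now pushes the new backend onto a singly-linked list with a cmpxchg loop; the implied full barrier guarantees the ops->init() calls above are visible before the list head is published. A compilable userspace sketch of that lock-free push, using the GCC/Clang __atomic builtins in place of the kernel's cmpxchg (struct and function names are illustrative):

    #include <stdio.h>

    struct ops {
            const char *name;
            struct ops *next;
    };

    static struct ops *ops_list;    /* head of the singly-linked list */

    #define for_each_ops(o) for ((o) = ops_list; (o); (o) = (o)->next)

    static void register_ops(struct ops *new)
    {
            struct ops *head;

            /* lock-free push: retry until the head we linked against is still the head */
            do {
                    head = __atomic_load_n(&ops_list, __ATOMIC_RELAXED);
                    new->next = head;
            } while (!__atomic_compare_exchange_n(&ops_list, &head, new, 0,
                                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
    }

    int main(void)
    {
            static struct ops a = { "backend-a", NULL }, b = { "backend-b", NULL };
            struct ops *o;

            register_ops(&a);
            register_ops(&b);       /* the most recently registered backend is walked first */
            for_each_ops(o)
                    printf("%s\n", o->name);
            return 0;
    }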
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
164void __frontswap_init(unsigned type, unsigned long *map) 187void __frontswap_init(unsigned type, unsigned long *map)
165{ 188{
166 struct swap_info_struct *sis = swap_info[type]; 189 struct swap_info_struct *sis = swap_info[type];
190 struct frontswap_ops *ops;
167 191
168 BUG_ON(sis == NULL); 192 BUG_ON(sis == NULL);
169 193
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
179 * p->frontswap set to something valid to work properly. 203 * p->frontswap set to something valid to work properly.
180 */ 204 */
181 frontswap_map_set(sis, map); 205 frontswap_map_set(sis, map);
182 if (frontswap_ops) 206
183 frontswap_ops->init(type); 207 for_each_frontswap_ops(ops)
184 else { 208 ops->init(type);
185 BUG_ON(type >= MAX_SWAPFILES);
186 set_bit(type, need_init);
187 }
188} 209}
189EXPORT_SYMBOL(__frontswap_init); 210EXPORT_SYMBOL(__frontswap_init);
190 211
191bool __frontswap_test(struct swap_info_struct *sis, 212bool __frontswap_test(struct swap_info_struct *sis,
192 pgoff_t offset) 213 pgoff_t offset)
193{ 214{
194 bool ret = false; 215 if (sis->frontswap_map)
195 216 return test_bit(offset, sis->frontswap_map);
196 if (frontswap_ops && sis->frontswap_map) 217 return false;
197 ret = test_bit(offset, sis->frontswap_map);
198 return ret;
199} 218}
200EXPORT_SYMBOL(__frontswap_test); 219EXPORT_SYMBOL(__frontswap_test);
201 220
221static inline void __frontswap_set(struct swap_info_struct *sis,
222 pgoff_t offset)
223{
224 set_bit(offset, sis->frontswap_map);
225 atomic_inc(&sis->frontswap_pages);
226}
227
202static inline void __frontswap_clear(struct swap_info_struct *sis, 228static inline void __frontswap_clear(struct swap_info_struct *sis,
203 pgoff_t offset) 229 pgoff_t offset)
204{ 230{
205 clear_bit(offset, sis->frontswap_map); 231 clear_bit(offset, sis->frontswap_map);
206 atomic_dec(&sis->frontswap_pages); 232 atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
215 */ 241 */
216int __frontswap_store(struct page *page) 242int __frontswap_store(struct page *page)
217{ 243{
218 int ret = -1, dup = 0; 244 int ret = -1;
219 swp_entry_t entry = { .val = page_private(page), }; 245 swp_entry_t entry = { .val = page_private(page), };
220 int type = swp_type(entry); 246 int type = swp_type(entry);
221 struct swap_info_struct *sis = swap_info[type]; 247 struct swap_info_struct *sis = swap_info[type];
222 pgoff_t offset = swp_offset(entry); 248 pgoff_t offset = swp_offset(entry);
249 struct frontswap_ops *ops;
223 250
224 /* 251 /*
 225 * Return if no backend registered. 252 * Return if no backend registered.
226 * Don't need to inc frontswap_failed_stores here. 253 * Don't need to inc frontswap_failed_stores here.
227 */ 254 */
228 if (!frontswap_ops) 255 if (!frontswap_ops)
229 return ret; 256 return -1;
230 257
231 BUG_ON(!PageLocked(page)); 258 BUG_ON(!PageLocked(page));
232 BUG_ON(sis == NULL); 259 BUG_ON(sis == NULL);
233 if (__frontswap_test(sis, offset)) 260
234 dup = 1; 261 /*
235 ret = frontswap_ops->store(type, offset, page); 262 * If a dup, we must remove the old page first; we can't leave the
263 * old page no matter if the store of the new page succeeds or fails,
264 * and we can't rely on the new page replacing the old page as we may
265 * not store to the same implementation that contains the old page.
266 */
267 if (__frontswap_test(sis, offset)) {
268 __frontswap_clear(sis, offset);
269 for_each_frontswap_ops(ops)
270 ops->invalidate_page(type, offset);
271 }
272
273 /* Try to store in each implementation, until one succeeds. */
274 for_each_frontswap_ops(ops) {
275 ret = ops->store(type, offset, page);
276 if (!ret) /* successful store */
277 break;
278 }
236 if (ret == 0) { 279 if (ret == 0) {
237 set_bit(offset, sis->frontswap_map); 280 __frontswap_set(sis, offset);
238 inc_frontswap_succ_stores(); 281 inc_frontswap_succ_stores();
239 if (!dup)
240 atomic_inc(&sis->frontswap_pages);
241 } else { 282 } else {
242 /*
243 failed dup always results in automatic invalidate of
244 the (older) page from frontswap
245 */
246 inc_frontswap_failed_stores(); 283 inc_frontswap_failed_stores();
247 if (dup) {
248 __frontswap_clear(sis, offset);
249 frontswap_ops->invalidate_page(type, offset);
250 }
251 } 284 }
252 if (frontswap_writethrough_enabled) 285 if (frontswap_writethrough_enabled)
253 /* report failure so swap also writes to swap device */ 286 /* report failure so swap also writes to swap device */
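__frontswap_store() now drops any duplicate copy up front, since the old page may live in a different backend than the one the new store lands in, and then walks the backend list until one store succeeds. A hedged userspace sketch of that "invalidate everywhere, then first-success-wins" loop; the backend struct and the stub implementations are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    struct backend {
            const char *name;
            int (*store)(unsigned long offset);
            void (*invalidate)(unsigned long offset);
            struct backend *next;
    };

    static int fail_store(unsigned long off) { (void)off; return -1; }
    static int ok_store(unsigned long off) { printf("stored %lu\n", off); return 0; }
    static void inval(unsigned long off) { printf("invalidate %lu\n", off); }

    /* Try each registered backend until one accepts the page. */
    static int store_page(struct backend *list, bool dup, unsigned long offset)
    {
            struct backend *b;
            int ret = -1;

            if (dup)        /* the old copy may live in any backend: drop it everywhere */
                    for (b = list; b; b = b->next)
                            b->invalidate(offset);

            for (b = list; b; b = b->next) {
                    ret = b->store(offset);
                    if (!ret)       /* first successful store wins */
                            break;
            }
            return ret;
    }

    int main(void)
    {
            struct backend b2 = { "second", ok_store, inval, NULL };
            struct backend b1 = { "first", fail_store, inval, &b2 };

            return store_page(&b1, true, 42);
    }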
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
268 int type = swp_type(entry); 301 int type = swp_type(entry);
269 struct swap_info_struct *sis = swap_info[type]; 302 struct swap_info_struct *sis = swap_info[type];
270 pgoff_t offset = swp_offset(entry); 303 pgoff_t offset = swp_offset(entry);
304 struct frontswap_ops *ops;
305
306 if (!frontswap_ops)
307 return -1;
271 308
272 BUG_ON(!PageLocked(page)); 309 BUG_ON(!PageLocked(page));
273 BUG_ON(sis == NULL); 310 BUG_ON(sis == NULL);
274 /* 311 if (!__frontswap_test(sis, offset))
275 * __frontswap_test() will check whether there is backend registered 312 return -1;
276 */ 313
277 if (__frontswap_test(sis, offset)) 314 /* Try loading from each implementation, until one succeeds. */
278 ret = frontswap_ops->load(type, offset, page); 315 for_each_frontswap_ops(ops) {
316 ret = ops->load(type, offset, page);
317 if (!ret) /* successful load */
318 break;
319 }
279 if (ret == 0) { 320 if (ret == 0) {
280 inc_frontswap_loads(); 321 inc_frontswap_loads();
281 if (frontswap_tmem_exclusive_gets_enabled) { 322 if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
294void __frontswap_invalidate_page(unsigned type, pgoff_t offset) 335void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
295{ 336{
296 struct swap_info_struct *sis = swap_info[type]; 337 struct swap_info_struct *sis = swap_info[type];
338 struct frontswap_ops *ops;
339
340 if (!frontswap_ops)
341 return;
297 342
298 BUG_ON(sis == NULL); 343 BUG_ON(sis == NULL);
299 /* 344 if (!__frontswap_test(sis, offset))
300 * __frontswap_test() will check whether there is backend registered 345 return;
301 */ 346
302 if (__frontswap_test(sis, offset)) { 347 for_each_frontswap_ops(ops)
303 frontswap_ops->invalidate_page(type, offset); 348 ops->invalidate_page(type, offset);
304 __frontswap_clear(sis, offset); 349 __frontswap_clear(sis, offset);
305 inc_frontswap_invalidates(); 350 inc_frontswap_invalidates();
306 }
307} 351}
308EXPORT_SYMBOL(__frontswap_invalidate_page); 352EXPORT_SYMBOL(__frontswap_invalidate_page);
309 353
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
314void __frontswap_invalidate_area(unsigned type) 358void __frontswap_invalidate_area(unsigned type)
315{ 359{
316 struct swap_info_struct *sis = swap_info[type]; 360 struct swap_info_struct *sis = swap_info[type];
361 struct frontswap_ops *ops;
317 362
318 if (frontswap_ops) { 363 if (!frontswap_ops)
319 BUG_ON(sis == NULL); 364 return;
320 if (sis->frontswap_map == NULL) 365
321 return; 366 BUG_ON(sis == NULL);
322 frontswap_ops->invalidate_area(type); 367 if (sis->frontswap_map == NULL)
323 atomic_set(&sis->frontswap_pages, 0); 368 return;
324 bitmap_zero(sis->frontswap_map, sis->max); 369
325 } 370 for_each_frontswap_ops(ops)
326 clear_bit(type, need_init); 371 ops->invalidate_area(type);
372 atomic_set(&sis->frontswap_pages, 0);
373 bitmap_zero(sis->frontswap_map, sis->max);
327} 374}
328EXPORT_SYMBOL(__frontswap_invalidate_area); 375EXPORT_SYMBOL(__frontswap_invalidate_area);
329 376
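The new __frontswap_set() helper mirrors __frontswap_clear(): the per-offset bitmap and the frontswap_pages counter are always updated together, so they can no longer drift apart as they could when the bit and the counter were handled separately in __frontswap_store(). A plain-C model of that paired update, with an ordinary bitmap and a long standing in for the kernel primitives:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define MAP_BITS 4096
    #define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long map[MAP_BITS / WORD_BITS];
    static long pages;      /* models atomic_t frontswap_pages */

    static void fs_set(unsigned long off)
    {
            map[off / WORD_BITS] |= 1UL << (off % WORD_BITS);
            pages++;        /* always adjusted together with the bitmap */
    }

    static void fs_clear(unsigned long off)
    {
            map[off / WORD_BITS] &= ~(1UL << (off % WORD_BITS));
            pages--;
    }

    static bool fs_test(unsigned long off)
    {
            return map[off / WORD_BITS] & (1UL << (off % WORD_BITS));
    }

    int main(void)
    {
            fs_set(10);
            printf("offset 10 present: %d, pages: %ld\n", fs_test(10), pages);
            fs_clear(10);
            printf("offset 10 present: %d, pages: %ld\n", fs_test(10), pages);
            return 0;
    }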
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..c107094f79ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1031 goto out_free_pages; 1031 goto out_free_pages;
1032 VM_BUG_ON_PAGE(!PageHead(page), page); 1032 VM_BUG_ON_PAGE(!PageHead(page), page);
1033 1033
1034 pmdp_clear_flush_notify(vma, haddr, pmd); 1034 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1035 /* leave pmd empty until pte is filled */ 1035 /* leave pmd empty until pte is filled */
1036 1036
1037 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1037 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1174,7 @@ alloc:
1174 pmd_t entry; 1174 pmd_t entry;
1175 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1175 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1176 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1176 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1177 pmdp_clear_flush_notify(vma, haddr, pmd); 1177 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1178 page_add_new_anon_rmap(new_page, vma, haddr); 1178 page_add_new_anon_rmap(new_page, vma, haddr);
1179 mem_cgroup_commit_charge(new_page, memcg, false); 1179 mem_cgroup_commit_charge(new_page, memcg, false);
1180 lru_cache_add_active_or_unevictable(new_page, vma); 1180 lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1396 pmd_t orig_pmd; 1396 pmd_t orig_pmd;
1397 /* 1397 /*
1398 * For architectures like ppc64 we look at deposited pgtable 1398 * For architectures like ppc64 we look at deposited pgtable
1399 * when calling pmdp_get_and_clear. So do the 1399 * when calling pmdp_huge_get_and_clear. So do the
1400 * pgtable_trans_huge_withdraw after finishing pmdp related 1400 * pgtable_trans_huge_withdraw after finishing pmdp related
1401 * operations. 1401 * operations.
1402 */ 1402 */
1403 orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, 1403 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1404 tlb->fullmm); 1404 tlb->fullmm);
1405 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1405 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1406 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1406 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1407 if (is_huge_zero_pmd(orig_pmd)) { 1407 if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1459 new_ptl = pmd_lockptr(mm, new_pmd); 1459 new_ptl = pmd_lockptr(mm, new_pmd);
1460 if (new_ptl != old_ptl) 1460 if (new_ptl != old_ptl)
1461 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1461 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1462 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1462 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1463 VM_BUG_ON(!pmd_none(*new_pmd)); 1463 VM_BUG_ON(!pmd_none(*new_pmd));
1464 1464
1465 if (pmd_move_must_withdraw(new_ptl, old_ptl)) { 1465 if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1505 } 1505 }
1506 1506
1507 if (!prot_numa || !pmd_protnone(*pmd)) { 1507 if (!prot_numa || !pmd_protnone(*pmd)) {
1508 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1508 entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
1509 entry = pmd_modify(entry, newprot); 1509 entry = pmd_modify(entry, newprot);
1510 if (preserve_write) 1510 if (preserve_write)
1511 entry = pmd_mkwrite(entry); 1511 entry = pmd_mkwrite(entry);
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2499 * huge and small TLB entries for the same virtual address 2499 * huge and small TLB entries for the same virtual address
2500 * to avoid the risk of CPU bugs in that area. 2500 * to avoid the risk of CPU bugs in that area.
2501 */ 2501 */
2502 _pmd = pmdp_clear_flush(vma, address, pmd); 2502 _pmd = pmdp_collapse_flush(vma, address, pmd);
2503 spin_unlock(pmd_ptl); 2503 spin_unlock(pmd_ptl);
2504 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2504 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2505 2505
@@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void)
2799 2799
2800 cond_resched(); 2800 cond_resched();
2801 2801
2802 if (unlikely(kthread_should_stop() || freezing(current))) 2802 if (unlikely(kthread_should_stop() || try_to_freeze()))
2803 break; 2803 break;
2804 2804
2805 spin_lock(&khugepaged_mm_lock); 2805 spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void)
2820 2820
2821static void khugepaged_wait_work(void) 2821static void khugepaged_wait_work(void)
2822{ 2822{
2823 try_to_freeze();
2824
2825 if (khugepaged_has_work()) { 2823 if (khugepaged_has_work()) {
2826 if (!khugepaged_scan_sleep_millisecs) 2824 if (!khugepaged_scan_sleep_millisecs)
2827 return; 2825 return;
@@ -2865,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2865 pmd_t _pmd; 2863 pmd_t _pmd;
2866 int i; 2864 int i;
2867 2865
2868 pmdp_clear_flush_notify(vma, haddr, pmd); 2866 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2869 /* leave pmd empty until pte is filled */ 2867 /* leave pmd empty until pte is filled */
2870 2868
2871 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2869 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
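Besides the pmdp_* renames, the huge_memory.c hunks move the freezer handling: khugepaged now calls try_to_freeze() at the same break point where kthread_should_stop() is checked, instead of testing freezing() there and freezing later in the wait path. A rough userspace sketch of that batch-loop shape; only the inner batch loop is modelled and both predicates are plain stubs:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins for kthread_should_stop() and try_to_freeze(). */
    static bool should_stop(void) { static int calls; return ++calls >= 4; }
    static bool try_freeze(void) { return false; }  /* would sleep while the system freezes */

    int main(void)
    {
            unsigned int batches = 0;

            for (;;) {
                    batches++;      /* ... scan one batch of mm's ... */

                    /* stop and freeze are both handled at the same break point;
                     * the outer thread loop would re-evaluate after a freeze */
                    if (should_stop() || try_freeze())
                            break;
            }
            printf("ran %u scan batches\n", batches);
            return 0;
    }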
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 271e4432734c..75c0eef52c5d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
40int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
41unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
42struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
43/*
44 * Minimum page order among possible hugepage sizes, set to a proper value
45 * at boot time.
46 */
47static unsigned int minimum_order __read_mostly = UINT_MAX;
43 48
44__initdata LIST_HEAD(huge_boot_pages); 49__initdata LIST_HEAD(huge_boot_pages);
45 50
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
212 * Region tracking -- allows tracking of reservations and instantiated pages 217 * Region tracking -- allows tracking of reservations and instantiated pages
213 * across the pages in a mapping. 218 * across the pages in a mapping.
214 * 219 *
215 * The region data structures are embedded into a resv_map and 220 * The region data structures are embedded into a resv_map and protected
216 * protected by a resv_map's lock 221 * by a resv_map's lock. The set of regions within the resv_map represent
222 * reservations for huge pages, or huge pages that have already been
223 * instantiated within the map. The from and to elements are huge page
 224 * indices into the associated mapping. from indicates the starting index
225 * of the region. to represents the first index past the end of the region.
226 *
227 * For example, a file region structure with from == 0 and to == 4 represents
228 * four huge pages in a mapping. It is important to note that the to element
229 * represents the first element past the end of the region. This is used in
230 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
231 *
232 * Interval notation of the form [from, to) will be used to indicate that
233 * the endpoint from is inclusive and to is exclusive.
217 */ 234 */
218struct file_region { 235struct file_region {
219 struct list_head link; 236 struct list_head link;
@@ -221,10 +238,22 @@ struct file_region {
221 long to; 238 long to;
222}; 239};
223 240
241/*
242 * Add the huge page range represented by [f, t) to the reserve
243 * map. Existing regions will be expanded to accommodate the
244 * specified range. We know only existing regions need to be
245 * expanded, because region_add is only called after region_chg
246 * with the same range. If a new file_region structure must
247 * be allocated, it is done in region_chg.
248 *
249 * Return the number of new huge pages added to the map. This
250 * number is greater than or equal to zero.
251 */
224static long region_add(struct resv_map *resv, long f, long t) 252static long region_add(struct resv_map *resv, long f, long t)
225{ 253{
226 struct list_head *head = &resv->regions; 254 struct list_head *head = &resv->regions;
227 struct file_region *rg, *nrg, *trg; 255 struct file_region *rg, *nrg, *trg;
256 long add = 0;
228 257
229 spin_lock(&resv->lock); 258 spin_lock(&resv->lock);
230 /* Locate the region we are either in or before. */ 259 /* Locate the region we are either in or before. */
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
250 if (rg->to > t) 279 if (rg->to > t)
251 t = rg->to; 280 t = rg->to;
252 if (rg != nrg) { 281 if (rg != nrg) {
282 /* Decrement return value by the deleted range.
283 * Another range will span this area so that by
284 * end of routine add will be >= zero
285 */
286 add -= (rg->to - rg->from);
253 list_del(&rg->link); 287 list_del(&rg->link);
254 kfree(rg); 288 kfree(rg);
255 } 289 }
256 } 290 }
291
292 add += (nrg->from - f); /* Added to beginning of region */
257 nrg->from = f; 293 nrg->from = f;
294 add += t - nrg->to; /* Added to end of region */
258 nrg->to = t; 295 nrg->to = t;
296
259 spin_unlock(&resv->lock); 297 spin_unlock(&resv->lock);
260 return 0; 298 VM_BUG_ON(add < 0);
299 return add;
261} 300}
262 301
302/*
303 * Examine the existing reserve map and determine how many
304 * huge pages in the specified range [f, t) are NOT currently
305 * represented. This routine is called before a subsequent
306 * call to region_add that will actually modify the reserve
307 * map to add the specified range [f, t). region_chg does
308 * not change the number of huge pages represented by the
309 * map. However, if the existing regions in the map can not
310 * be expanded to represent the new range, a new file_region
311 * structure is added to the map as a placeholder. This is
312 * so that the subsequent region_add call will have all the
313 * regions it needs and will not fail.
314 *
315 * Returns the number of huge pages that need to be added
316 * to the existing reservation map for the range [f, t).
317 * This number is greater or equal to zero. -ENOMEM is
318 * returned if a new file_region structure is needed and can
319 * not be allocated.
320 */
263static long region_chg(struct resv_map *resv, long f, long t) 321static long region_chg(struct resv_map *resv, long f, long t)
264{ 322{
265 struct list_head *head = &resv->regions; 323 struct list_head *head = &resv->regions;
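The new reserve-map comments describe half-open [from, to) regions and the region_chg()/region_add() pairing: region_chg() reports how many pages of [f, t) are not yet covered (allocating a placeholder region if needed), and the later region_add() returns how many it actually added, normally the same number. A compact userspace model of the interval arithmetic only, using a sorted array of non-overlapping regions in place of the kernel's locked list; region_add() itself is not modelled:

    #include <stdio.h>

    /* Half-open reserved intervals, kept sorted and non-overlapping. */
    struct region { long from, to; };

    /* How many pages of [f, t) are NOT yet covered by the existing regions? */
    static long region_chg(const struct region *r, int n, long f, long t)
    {
            long missing = t - f;
            int i;

            for (i = 0; i < n; i++) {
                    long lo = r[i].from > f ? r[i].from : f;
                    long hi = r[i].to < t ? r[i].to : t;

                    if (hi > lo)
                            missing -= hi - lo;     /* already-reserved overlap */
            }
            return missing;
    }

    int main(void)
    {
            struct region map[] = { { 0, 4 }, { 8, 10 } };  /* pages 0-3 and 8-9 */

            /* [2, 9) spans 7 pages; 3 of them (2, 3 and 8) are covered, so 4 are missing */
            printf("missing in [2, 9): %ld\n", region_chg(map, 2, 2, 9));
            return 0;
    }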
@@ -326,6 +384,11 @@ out_nrg:
326 return chg; 384 return chg;
327} 385}
328 386
387/*
388 * Truncate the reserve map at index 'end'. Modify/truncate any
389 * region which contains end. Delete any regions past end.
390 * Return the number of huge pages removed from the map.
391 */
329static long region_truncate(struct resv_map *resv, long end) 392static long region_truncate(struct resv_map *resv, long end)
330{ 393{
331 struct list_head *head = &resv->regions; 394 struct list_head *head = &resv->regions;
@@ -361,6 +424,10 @@ out:
361 return chg; 424 return chg;
362} 425}
363 426
427/*
428 * Count and return the number of huge pages in the reserve map
429 * that intersect with the range [f, t).
430 */
364static long region_count(struct resv_map *resv, long f, long t) 431static long region_count(struct resv_map *resv, long f, long t)
365{ 432{
366 struct list_head *head = &resv->regions; 433 struct list_head *head = &resv->regions;
@@ -1188,19 +1255,13 @@ static void dissolve_free_huge_page(struct page *page)
1188 */ 1255 */
1189void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1256void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1190{ 1257{
1191 unsigned int order = 8 * sizeof(void *);
1192 unsigned long pfn; 1258 unsigned long pfn;
1193 struct hstate *h;
1194 1259
1195 if (!hugepages_supported()) 1260 if (!hugepages_supported())
1196 return; 1261 return;
1197 1262
1198 /* Set scan step to minimum hugepage size */ 1263 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1199 for_each_hstate(h) 1264 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1200 if (order > huge_page_order(h))
1201 order = huge_page_order(h);
1202 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
1203 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
1204 dissolve_free_huge_page(pfn_to_page(pfn)); 1265 dissolve_free_huge_page(pfn_to_page(pfn));
1205} 1266}
1206 1267
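dissolve_free_huge_pages() now strides through the PFN range by 1 << minimum_order, a value computed once in hugetlb_init_hstates() as the smallest order of any registered hstate, rather than rescanning the hstates on every call. A small sketch of that stride computation; the example orders are the usual x86-64 2MB and 1GB hugepage orders and are only illustrative:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int orders[] = { 9, 18 };      /* e.g. 2MB (order 9) and 1GB (order 18) */
            unsigned int minimum_order = UINT_MAX;
            unsigned long start_pfn = 0, end_pfn = 1UL << 12, pfn;
            unsigned long steps = 0;
            unsigned int i;

            /* smallest hugepage order among all registered sizes */
            for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++)
                    if (orders[i] < minimum_order)
                            minimum_order = orders[i];

            /* visit one candidate per smallest hugepage, not per base page */
            for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << minimum_order)
                    steps++;

            printf("minimum_order=%u, %lu candidates in %lu pfns\n",
                   minimum_order, steps, end_pfn - start_pfn);
            return 0;
    }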
@@ -1423,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h,
1423} 1484}
1424 1485
1425/* 1486/*
1426 * Determine if the huge page at addr within the vma has an associated 1487 * vma_needs_reservation and vma_commit_reservation are used by the huge
1427 * reservation. Where it does not we will need to logically increase 1488 * page allocation routines to manage reservations.
1428 * reservation and actually increase subpool usage before an allocation 1489 *
1429 * can occur. Where any new reservation would be required the 1490 * vma_needs_reservation is called to determine if the huge page at addr
1430 * reservation change is prepared, but not committed. Once the page 1491 * within the vma has an associated reservation. If a reservation is
1431 * has been allocated from the subpool and instantiated the change should 1492 * needed, the value 1 is returned. The caller is then responsible for
1432 * be committed via vma_commit_reservation. No action is required on 1493 * managing the global reservation and subpool usage counts. After
1433 * failure. 1494 * the huge page has been allocated, vma_commit_reservation is called
1495 * to add the page to the reservation map.
1496 *
1497 * In the normal case, vma_commit_reservation returns the same value
1498 * as the preceding vma_needs_reservation call. The only time this
1499 * is not the case is if a reserve map was changed between calls. It
1500 * is the responsibility of the caller to notice the difference and
1501 * take appropriate action.
1434 */ 1502 */
1435static long vma_needs_reservation(struct hstate *h, 1503static long __vma_reservation_common(struct hstate *h,
1436 struct vm_area_struct *vma, unsigned long addr) 1504 struct vm_area_struct *vma, unsigned long addr,
1505 bool commit)
1437{ 1506{
1438 struct resv_map *resv; 1507 struct resv_map *resv;
1439 pgoff_t idx; 1508 pgoff_t idx;
1440 long chg; 1509 long ret;
1441 1510
1442 resv = vma_resv_map(vma); 1511 resv = vma_resv_map(vma);
1443 if (!resv) 1512 if (!resv)
1444 return 1; 1513 return 1;
1445 1514
1446 idx = vma_hugecache_offset(h, vma, addr); 1515 idx = vma_hugecache_offset(h, vma, addr);
1447 chg = region_chg(resv, idx, idx + 1); 1516 if (commit)
1517 ret = region_add(resv, idx, idx + 1);
1518 else
1519 ret = region_chg(resv, idx, idx + 1);
1448 1520
1449 if (vma->vm_flags & VM_MAYSHARE) 1521 if (vma->vm_flags & VM_MAYSHARE)
1450 return chg; 1522 return ret;
1451 else 1523 else
1452 return chg < 0 ? chg : 0; 1524 return ret < 0 ? ret : 0;
1453} 1525}
1454static void vma_commit_reservation(struct hstate *h, 1526
1527static long vma_needs_reservation(struct hstate *h,
1455 struct vm_area_struct *vma, unsigned long addr) 1528 struct vm_area_struct *vma, unsigned long addr)
1456{ 1529{
1457 struct resv_map *resv; 1530 return __vma_reservation_common(h, vma, addr, false);
1458 pgoff_t idx; 1531}
1459
1460 resv = vma_resv_map(vma);
1461 if (!resv)
1462 return;
1463 1532
1464 idx = vma_hugecache_offset(h, vma, addr); 1533static long vma_commit_reservation(struct hstate *h,
1465 region_add(resv, idx, idx + 1); 1534 struct vm_area_struct *vma, unsigned long addr)
1535{
1536 return __vma_reservation_common(h, vma, addr, true);
1466} 1537}
1467 1538
1468static struct page *alloc_huge_page(struct vm_area_struct *vma, 1539static struct page *alloc_huge_page(struct vm_area_struct *vma,
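vma_needs_reservation() and vma_commit_reservation() collapse into __vma_reservation_common(), which differs only in whether it calls region_chg() or region_add(); both wrappers now return a count so alloc_huge_page() can compare the two and spot a racing reserve-map update. A hedged sketch of that wrapper shape with the region calls reduced to stubs:

    #include <stdio.h>

    /* Stubs standing in for region_chg()/region_add() on a reserve map. */
    static long region_chg_stub(long idx) { (void)idx; return 1; }
    static long region_add_stub(long idx) { (void)idx; return 1; }

    static long reservation_common(long idx, int commit, int may_share)
    {
            long ret = commit ? region_add_stub(idx) : region_chg_stub(idx);

            /* shared mappings report the raw count; private ones only errors */
            if (may_share)
                    return ret;
            return ret < 0 ? ret : 0;
    }

    static long needs_reservation(long idx, int may_share)
    {
            return reservation_common(idx, 0, may_share);
    }

    static long commit_reservation(long idx, int may_share)
    {
            return reservation_common(idx, 1, may_share);
    }

    int main(void)
    {
            long chg = needs_reservation(3, 1);
            long commit = commit_reservation(3, 1);

            if (chg > commit)
                    printf("race: reserve map changed between the two calls\n");
            else
                    printf("chg=%ld commit=%ld, no adjustment needed\n", chg, commit);
            return 0;
    }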
@@ -1471,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1471 struct hugepage_subpool *spool = subpool_vma(vma); 1542 struct hugepage_subpool *spool = subpool_vma(vma);
1472 struct hstate *h = hstate_vma(vma); 1543 struct hstate *h = hstate_vma(vma);
1473 struct page *page; 1544 struct page *page;
1474 long chg; 1545 long chg, commit;
1475 int ret, idx; 1546 int ret, idx;
1476 struct hugetlb_cgroup *h_cg; 1547 struct hugetlb_cgroup *h_cg;
1477 1548
@@ -1512,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1512 1583
1513 set_page_private(page, (unsigned long)spool); 1584 set_page_private(page, (unsigned long)spool);
1514 1585
1515 vma_commit_reservation(h, vma, addr); 1586 commit = vma_commit_reservation(h, vma, addr);
1587 if (unlikely(chg > commit)) {
1588 /*
1589 * The page was added to the reservation map between
1590 * vma_needs_reservation and vma_commit_reservation.
1591 * This indicates a race with hugetlb_reserve_pages.
1592 * Adjust for the subpool count incremented above AND
1593 * in hugetlb_reserve_pages for the same page. Also,
1594 * the reservation count added in hugetlb_reserve_pages
1595 * no longer applies.
1596 */
1597 long rsv_adjust;
1598
1599 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
1600 hugetlb_acct_memory(h, -rsv_adjust);
1601 }
1516 return page; 1602 return page;
1517 1603
1518out_uncharge_cgroup: 1604out_uncharge_cgroup:
@@ -1627,10 +1713,14 @@ static void __init hugetlb_init_hstates(void)
1627 struct hstate *h; 1713 struct hstate *h;
1628 1714
1629 for_each_hstate(h) { 1715 for_each_hstate(h) {
1716 if (minimum_order > huge_page_order(h))
1717 minimum_order = huge_page_order(h);
1718
1630 /* oversize hugepages were init'ed in early boot */ 1719 /* oversize hugepages were init'ed in early boot */
1631 if (!hstate_is_gigantic(h)) 1720 if (!hstate_is_gigantic(h))
1632 hugetlb_hstate_alloc_pages(h); 1721 hugetlb_hstate_alloc_pages(h);
1633 } 1722 }
1723 VM_BUG_ON(minimum_order == UINT_MAX);
1634} 1724}
1635 1725
1636static char * __init memfmt(char *buf, unsigned long n) 1726static char * __init memfmt(char *buf, unsigned long n)
@@ -3626,8 +3716,24 @@ int hugetlb_reserve_pages(struct inode *inode,
3626 * consumed reservations are stored in the map. Hence, nothing 3716 * consumed reservations are stored in the map. Hence, nothing
3627 * else has to be done for private mappings here 3717 * else has to be done for private mappings here
3628 */ 3718 */
3629 if (!vma || vma->vm_flags & VM_MAYSHARE) 3719 if (!vma || vma->vm_flags & VM_MAYSHARE) {
3630 region_add(resv_map, from, to); 3720 long add = region_add(resv_map, from, to);
3721
3722 if (unlikely(chg > add)) {
3723 /*
3724 * pages in this range were added to the reserve
3725 * map between region_chg and region_add. This
3726 * indicates a race with alloc_huge_page. Adjust
3727 * the subpool and reserve counts modified above
3728 * based on the difference.
3729 */
3730 long rsv_adjust;
3731
3732 rsv_adjust = hugepage_subpool_put_pages(spool,
3733 chg - add);
3734 hugetlb_acct_memory(h, -rsv_adjust);
3735 }
3736 }
3631 return 0; 3737 return 0;
3632out_err: 3738out_err:
3633 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3739 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
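When region_add() ends up covering fewer pages than the earlier region_chg() predicted, hugetlb_reserve_pages() hands the difference back to the subpool and shrinks the accounted reservation by the same amount. The adjustment is simply the gap between the two counts, as in this toy calculation (the numbers are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
            long chg = 16;  /* pages region_chg() said would be needed */
            long add = 14;  /* pages region_add() actually added; a racing alloc took 2 */
            long subpool_pages = 100, accounted = 100;

            if (chg > add) {
                    long rsv_adjust = chg - add;

                    subpool_pages += rsv_adjust;    /* models hugepage_subpool_put_pages() */
                    accounted -= rsv_adjust;        /* models hugetlb_acct_memory(h, -rsv_adjust) */
            }
            printf("subpool=%ld accounted=%ld\n", subpool_pages, accounted);
            return 0;
    }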
@@ -3789,6 +3895,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3789{ 3895{
3790 return NULL; 3896 return NULL;
3791} 3897}
3898
3899int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3900{
3901 return 0;
3902}
3792#define want_pmd_share() (0) 3903#define want_pmd_share() (0)
3793#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3904#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3794 3905
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4ca5fe0042e1..bf73ac17dad4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val)
28 /* 28 /*
29 * This implies unable to support free buddy pages. 29 * This implies unable to support free buddy pages.
30 */ 30 */
31 if (!get_page_unless_zero(hpage)) 31 if (!get_hwpoison_page(p))
32 return 0; 32 return 0;
33 33
34 if (!hwpoison_filter_enable) 34 if (!hwpoison_filter_enable)
@@ -58,7 +58,7 @@ inject:
58 pr_info("Injecting memory failure at pfn %#lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60put_out: 60put_out:
61 put_page(hpage); 61 put_page(p);
62 return 0; 62 return 0;
63} 63}
64 64
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f0fe4f2c1fa7..cf79f110157c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -53,6 +53,13 @@
53 * modifications to the memory scanning parameters including the scan_thread 53 * modifications to the memory scanning parameters including the scan_thread
54 * pointer 54 * pointer
55 * 55 *
56 * Locks and mutexes are acquired/nested in the following order:
57 *
58 * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
59 *
60 * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
61 * regions.
62 *
56 * The kmemleak_object structures have a use_count incremented or decremented 63 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes 64 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the 65 * 0, this count can no longer be incremented and put_object() schedules the
@@ -195,6 +202,8 @@ static struct kmem_cache *scan_area_cache;
195 202
196/* set if tracing memory operations is enabled */ 203/* set if tracing memory operations is enabled */
197static int kmemleak_enabled; 204static int kmemleak_enabled;
205/* same as above but only for the kmemleak_free() callback */
206static int kmemleak_free_enabled;
198/* set in the late_initcall if there were no errors */ 207/* set in the late_initcall if there were no errors */
199static int kmemleak_initialized; 208static int kmemleak_initialized;
200/* enables or disables early logging of the memory operations */ 209/* enables or disables early logging of the memory operations */
@@ -483,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
483 492
484 rcu_read_lock(); 493 rcu_read_lock();
485 read_lock_irqsave(&kmemleak_lock, flags); 494 read_lock_irqsave(&kmemleak_lock, flags);
486 if (ptr >= min_addr && ptr < max_addr) 495 object = lookup_object(ptr, alias);
487 object = lookup_object(ptr, alias);
488 read_unlock_irqrestore(&kmemleak_lock, flags); 496 read_unlock_irqrestore(&kmemleak_lock, flags);
489 497
490 /* check whether the object is still available */ 498 /* check whether the object is still available */
@@ -496,6 +504,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
496} 504}
497 505
498/* 506/*
507 * Look up an object in the object search tree and remove it from both
508 * object_tree_root and object_list. The returned object's use_count should be
509 * at least 1, as initially set by create_object().
510 */
511static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
512{
513 unsigned long flags;
514 struct kmemleak_object *object;
515
516 write_lock_irqsave(&kmemleak_lock, flags);
517 object = lookup_object(ptr, alias);
518 if (object) {
519 rb_erase(&object->rb_node, &object_tree_root);
520 list_del_rcu(&object->object_list);
521 }
522 write_unlock_irqrestore(&kmemleak_lock, flags);
523
524 return object;
525}
526
527/*
499 * Save stack trace to the given array of MAX_TRACE size. 528 * Save stack trace to the given array of MAX_TRACE size.
500 */ 529 */
501static int __save_stack_trace(unsigned long *trace) 530static int __save_stack_trace(unsigned long *trace)
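kmemleak object teardown is rebuilt around find_and_remove_object(), which looks the object up and unlinks it from both the tree and the list inside a single write-locked section, leaving only the refcount/RCU release for later. A userspace sketch of that "look up and unlink in one critical section" idea, with a mutex-protected singly-linked list standing in for the kernel's rwlock, rbtree and RCU:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct object {
            unsigned long ptr;
            struct object *next;
    };

    static struct object *objects;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Find the object for ptr and unlink it in the same critical section. */
    static struct object *find_and_remove(unsigned long ptr)
    {
            struct object **pp, *obj = NULL;

            pthread_mutex_lock(&lock);
            for (pp = &objects; *pp; pp = &(*pp)->next) {
                    if ((*pp)->ptr == ptr) {
                            obj = *pp;
                            *pp = obj->next;        /* unlink before anyone else can find it */
                            break;
                    }
            }
            pthread_mutex_unlock(&lock);
            return obj;     /* caller releases it once no lookup can return it */
    }

    int main(void)
    {
            struct object *o = malloc(sizeof(*o));

            o->ptr = 0x1000;
            o->next = objects;
            objects = o;

            free(find_and_remove(0x1000));
            printf("second removal: %p\n", (void *)find_and_remove(0x1000));
            return 0;
    }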
@@ -580,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
580 kmemleak_stop("Cannot insert 0x%lx into the object " 609 kmemleak_stop("Cannot insert 0x%lx into the object "
581 "search tree (overlaps existing)\n", 610 "search tree (overlaps existing)\n",
582 ptr); 611 ptr);
612 /*
613 * No need for parent->lock here since "parent" cannot
614 * be freed while the kmemleak_lock is held.
615 */
616 dump_object_info(parent);
583 kmem_cache_free(object_cache, object); 617 kmem_cache_free(object_cache, object);
584 object = parent; 618 object = NULL;
585 spin_lock(&object->lock);
586 dump_object_info(object);
587 spin_unlock(&object->lock);
588 goto out; 619 goto out;
589 } 620 }
590 } 621 }
@@ -598,20 +629,14 @@ out:
598} 629}
599 630
600/* 631/*
601 * Remove the metadata (struct kmemleak_object) for a memory block from the 632 * Mark the object as not allocated and schedule RCU freeing via put_object().
602 * object_list and object_tree_root and decrement its use_count.
603 */ 633 */
604static void __delete_object(struct kmemleak_object *object) 634static void __delete_object(struct kmemleak_object *object)
605{ 635{
606 unsigned long flags; 636 unsigned long flags;
607 637
608 write_lock_irqsave(&kmemleak_lock, flags);
609 rb_erase(&object->rb_node, &object_tree_root);
610 list_del_rcu(&object->object_list);
611 write_unlock_irqrestore(&kmemleak_lock, flags);
612
613 WARN_ON(!(object->flags & OBJECT_ALLOCATED)); 638 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
614 WARN_ON(atomic_read(&object->use_count) < 2); 639 WARN_ON(atomic_read(&object->use_count) < 1);
615 640
616 /* 641 /*
617 * Locking here also ensures that the corresponding memory block 642 * Locking here also ensures that the corresponding memory block
@@ -631,7 +656,7 @@ static void delete_object_full(unsigned long ptr)
631{ 656{
632 struct kmemleak_object *object; 657 struct kmemleak_object *object;
633 658
634 object = find_and_get_object(ptr, 0); 659 object = find_and_remove_object(ptr, 0);
635 if (!object) { 660 if (!object) {
636#ifdef DEBUG 661#ifdef DEBUG
637 kmemleak_warn("Freeing unknown object at 0x%08lx\n", 662 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -640,7 +665,6 @@ static void delete_object_full(unsigned long ptr)
640 return; 665 return;
641 } 666 }
642 __delete_object(object); 667 __delete_object(object);
643 put_object(object);
644} 668}
645 669
646/* 670/*
@@ -653,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
653 struct kmemleak_object *object; 677 struct kmemleak_object *object;
654 unsigned long start, end; 678 unsigned long start, end;
655 679
656 object = find_and_get_object(ptr, 1); 680 object = find_and_remove_object(ptr, 1);
657 if (!object) { 681 if (!object) {
658#ifdef DEBUG 682#ifdef DEBUG
659 kmemleak_warn("Partially freeing unknown object at 0x%08lx " 683 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
@@ -661,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size)
661#endif 685#endif
662 return; 686 return;
663 } 687 }
664 __delete_object(object);
665 688
666 /* 689 /*
667 * Create one or two objects that may result from the memory block 690 * Create one or two objects that may result from the memory block
@@ -679,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
679 create_object(ptr + size, end - ptr - size, object->min_count, 702 create_object(ptr + size, end - ptr - size, object->min_count,
680 GFP_KERNEL); 703 GFP_KERNEL);
681 704
682 put_object(object); 705 __delete_object(object);
683} 706}
684 707
685static void __paint_it(struct kmemleak_object *object, int color) 708static void __paint_it(struct kmemleak_object *object, int color)
@@ -907,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
907 * kmemleak_alloc_percpu - register a newly allocated __percpu object 930 * kmemleak_alloc_percpu - register a newly allocated __percpu object
908 * @ptr: __percpu pointer to beginning of the object 931 * @ptr: __percpu pointer to beginning of the object
909 * @size: size of the object 932 * @size: size of the object
933 * @gfp: flags used for kmemleak internal memory allocations
910 * 934 *
911 * This function is called from the kernel percpu allocator when a new object 935 * This function is called from the kernel percpu allocator when a new object
912 * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL 936 * (memory block) is allocated (alloc_percpu).
913 * allocation.
914 */ 937 */
915void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) 938void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
939 gfp_t gfp)
916{ 940{
917 unsigned int cpu; 941 unsigned int cpu;
918 942
@@ -925,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
925 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 949 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
926 for_each_possible_cpu(cpu) 950 for_each_possible_cpu(cpu)
927 create_object((unsigned long)per_cpu_ptr(ptr, cpu), 951 create_object((unsigned long)per_cpu_ptr(ptr, cpu),
928 size, 0, GFP_KERNEL); 952 size, 0, gfp);
929 else if (kmemleak_early_log) 953 else if (kmemleak_early_log)
930 log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); 954 log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
931} 955}
@@ -942,7 +966,7 @@ void __ref kmemleak_free(const void *ptr)
942{ 966{
943 pr_debug("%s(0x%p)\n", __func__, ptr); 967 pr_debug("%s(0x%p)\n", __func__, ptr);
944 968
945 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 969 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
946 delete_object_full((unsigned long)ptr); 970 delete_object_full((unsigned long)ptr);
947 else if (kmemleak_early_log) 971 else if (kmemleak_early_log)
948 log_early(KMEMLEAK_FREE, ptr, 0, 0); 972 log_early(KMEMLEAK_FREE, ptr, 0, 0);
@@ -982,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
982 1006
983 pr_debug("%s(0x%p)\n", __func__, ptr); 1007 pr_debug("%s(0x%p)\n", __func__, ptr);
984 1008
985 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1009 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
986 for_each_possible_cpu(cpu) 1010 for_each_possible_cpu(cpu)
987 delete_object_full((unsigned long)per_cpu_ptr(ptr, 1011 delete_object_full((unsigned long)per_cpu_ptr(ptr,
988 cpu)); 1012 cpu));
@@ -1148,19 +1172,18 @@ static int scan_should_stop(void)
1148 * found to the gray list. 1172 * found to the gray list.
1149 */ 1173 */
1150static void scan_block(void *_start, void *_end, 1174static void scan_block(void *_start, void *_end,
1151 struct kmemleak_object *scanned, int allow_resched) 1175 struct kmemleak_object *scanned)
1152{ 1176{
1153 unsigned long *ptr; 1177 unsigned long *ptr;
1154 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); 1178 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
1155 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 1179 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
1180 unsigned long flags;
1156 1181
1182 read_lock_irqsave(&kmemleak_lock, flags);
1157 for (ptr = start; ptr < end; ptr++) { 1183 for (ptr = start; ptr < end; ptr++) {
1158 struct kmemleak_object *object; 1184 struct kmemleak_object *object;
1159 unsigned long flags;
1160 unsigned long pointer; 1185 unsigned long pointer;
1161 1186
1162 if (allow_resched)
1163 cond_resched();
1164 if (scan_should_stop()) 1187 if (scan_should_stop())
1165 break; 1188 break;
1166 1189
@@ -1173,26 +1196,31 @@ static void scan_block(void *_start, void *_end,
1173 pointer = *ptr; 1196 pointer = *ptr;
1174 kasan_enable_current(); 1197 kasan_enable_current();
1175 1198
1176 object = find_and_get_object(pointer, 1); 1199 if (pointer < min_addr || pointer >= max_addr)
1200 continue;
1201
1202 /*
1203 * No need for get_object() here since we hold kmemleak_lock.
1204 * object->use_count cannot be dropped to 0 while the object
1205 * is still present in object_tree_root and object_list
1206 * (with updates protected by kmemleak_lock).
1207 */
1208 object = lookup_object(pointer, 1);
1177 if (!object) 1209 if (!object)
1178 continue; 1210 continue;
1179 if (object == scanned) { 1211 if (object == scanned)
1180 /* self referenced, ignore */ 1212 /* self referenced, ignore */
1181 put_object(object);
1182 continue; 1213 continue;
1183 }
1184 1214
1185 /* 1215 /*
1186 * Avoid the lockdep recursive warning on object->lock being 1216 * Avoid the lockdep recursive warning on object->lock being
1187 * previously acquired in scan_object(). These locks are 1217 * previously acquired in scan_object(). These locks are
1188 * enclosed by scan_mutex. 1218 * enclosed by scan_mutex.
1189 */ 1219 */
1190 spin_lock_irqsave_nested(&object->lock, flags, 1220 spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
1191 SINGLE_DEPTH_NESTING);
1192 if (!color_white(object)) { 1221 if (!color_white(object)) {
1193 /* non-orphan, ignored or new */ 1222 /* non-orphan, ignored or new */
1194 spin_unlock_irqrestore(&object->lock, flags); 1223 spin_unlock(&object->lock);
1195 put_object(object);
1196 continue; 1224 continue;
1197 } 1225 }
1198 1226
@@ -1204,13 +1232,27 @@ static void scan_block(void *_start, void *_end,
1204 */ 1232 */
1205 object->count++; 1233 object->count++;
1206 if (color_gray(object)) { 1234 if (color_gray(object)) {
1235 /* put_object() called when removing from gray_list */
1236 WARN_ON(!get_object(object));
1207 list_add_tail(&object->gray_list, &gray_list); 1237 list_add_tail(&object->gray_list, &gray_list);
1208 spin_unlock_irqrestore(&object->lock, flags);
1209 continue;
1210 } 1238 }
1239 spin_unlock(&object->lock);
1240 }
1241 read_unlock_irqrestore(&kmemleak_lock, flags);
1242}
1211 1243
1212 spin_unlock_irqrestore(&object->lock, flags); 1244/*
1213 put_object(object); 1245 * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
1246 */
1247static void scan_large_block(void *start, void *end)
1248{
1249 void *next;
1250
1251 while (start < end) {
1252 next = min(start + MAX_SCAN_SIZE, end);
1253 scan_block(start, next, NULL);
1254 start = next;
1255 cond_resched();
1214 } 1256 }
1215} 1257}
1216 1258
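Because scan_block() now runs with kmemleak_lock read-held and interrupts disabled, large regions are fed through scan_large_block(), which walks them in MAX_SCAN_SIZE chunks and reschedules between chunks so scan latency stays bounded. The chunking itself is simple pointer arithmetic, sketched here with cond_resched() reduced to a comment:

    #include <stdio.h>

    #define MAX_SCAN_SIZE 4096UL

    static void scan_chunk(char *start, char *end)
    {
            /* stands in for scan_block(start, end, NULL) */
            printf("scan %lu bytes\n", (unsigned long)(end - start));
    }

    static void scan_large_block(char *start, char *end)
    {
            char *next;

            while (start < end) {
                    next = start + MAX_SCAN_SIZE < end ? start + MAX_SCAN_SIZE : end;
                    scan_chunk(start, next);
                    start = next;
                    /* cond_resched() would go here, between chunks */
            }
    }

    int main(void)
    {
            static char data[10000];

            scan_large_block(data, data + sizeof(data));
            return 0;
    }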
@@ -1236,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object)
1236 if (hlist_empty(&object->area_list)) { 1278 if (hlist_empty(&object->area_list)) {
1237 void *start = (void *)object->pointer; 1279 void *start = (void *)object->pointer;
1238 void *end = (void *)(object->pointer + object->size); 1280 void *end = (void *)(object->pointer + object->size);
1281 void *next;
1239 1282
1240 while (start < end && (object->flags & OBJECT_ALLOCATED) && 1283 do {
1241 !(object->flags & OBJECT_NO_SCAN)) { 1284 next = min(start + MAX_SCAN_SIZE, end);
1242 scan_block(start, min(start + MAX_SCAN_SIZE, end), 1285 scan_block(start, next, object);
1243 object, 0); 1286
1244 start += MAX_SCAN_SIZE; 1287 start = next;
1288 if (start >= end)
1289 break;
1245 1290
1246 spin_unlock_irqrestore(&object->lock, flags); 1291 spin_unlock_irqrestore(&object->lock, flags);
1247 cond_resched(); 1292 cond_resched();
1248 spin_lock_irqsave(&object->lock, flags); 1293 spin_lock_irqsave(&object->lock, flags);
1249 } 1294 } while (object->flags & OBJECT_ALLOCATED);
1250 } else 1295 } else
1251 hlist_for_each_entry(area, &object->area_list, node) 1296 hlist_for_each_entry(area, &object->area_list, node)
1252 scan_block((void *)area->start, 1297 scan_block((void *)area->start,
1253 (void *)(area->start + area->size), 1298 (void *)(area->start + area->size),
1254 object, 0); 1299 object);
1255out: 1300out:
1256 spin_unlock_irqrestore(&object->lock, flags); 1301 spin_unlock_irqrestore(&object->lock, flags);
1257} 1302}
@@ -1328,14 +1373,14 @@ static void kmemleak_scan(void)
1328 rcu_read_unlock(); 1373 rcu_read_unlock();
1329 1374
1330 /* data/bss scanning */ 1375 /* data/bss scanning */
1331 scan_block(_sdata, _edata, NULL, 1); 1376 scan_large_block(_sdata, _edata);
1332 scan_block(__bss_start, __bss_stop, NULL, 1); 1377 scan_large_block(__bss_start, __bss_stop);
1333 1378
1334#ifdef CONFIG_SMP 1379#ifdef CONFIG_SMP
1335 /* per-cpu sections scanning */ 1380 /* per-cpu sections scanning */
1336 for_each_possible_cpu(i) 1381 for_each_possible_cpu(i)
1337 scan_block(__per_cpu_start + per_cpu_offset(i), 1382 scan_large_block(__per_cpu_start + per_cpu_offset(i),
1338 __per_cpu_end + per_cpu_offset(i), NULL, 1); 1383 __per_cpu_end + per_cpu_offset(i));
1339#endif 1384#endif
1340 1385
1341 /* 1386 /*
@@ -1356,7 +1401,7 @@ static void kmemleak_scan(void)
1356 /* only scan if page is in use */ 1401 /* only scan if page is in use */
1357 if (page_count(page) == 0) 1402 if (page_count(page) == 0)
1358 continue; 1403 continue;
1359 scan_block(page, page + 1, NULL, 1); 1404 scan_block(page, page + 1, NULL);
1360 } 1405 }
1361 } 1406 }
1362 put_online_mems(); 1407 put_online_mems();
@@ -1370,7 +1415,7 @@ static void kmemleak_scan(void)
1370 read_lock(&tasklist_lock); 1415 read_lock(&tasklist_lock);
1371 do_each_thread(g, p) { 1416 do_each_thread(g, p) {
1372 scan_block(task_stack_page(p), task_stack_page(p) + 1417 scan_block(task_stack_page(p), task_stack_page(p) +
1373 THREAD_SIZE, NULL, 0); 1418 THREAD_SIZE, NULL);
1374 } while_each_thread(g, p); 1419 } while_each_thread(g, p);
1375 read_unlock(&tasklist_lock); 1420 read_unlock(&tasklist_lock);
1376 } 1421 }
@@ -1747,15 +1792,20 @@ static void __kmemleak_do_cleanup(void)
1747 */ 1792 */
1748static void kmemleak_do_cleanup(struct work_struct *work) 1793static void kmemleak_do_cleanup(struct work_struct *work)
1749{ 1794{
1750 mutex_lock(&scan_mutex);
1751 stop_scan_thread(); 1795 stop_scan_thread();
1752 1796
1797 /*
1798 * Once the scan thread has stopped, it is safe to no longer track
1799 * object freeing. Ordering of the scan thread stopping and the memory
1800 * accesses below is guaranteed by the kthread_stop() function.
1801 */
1802 kmemleak_free_enabled = 0;
1803
1753 if (!kmemleak_found_leaks) 1804 if (!kmemleak_found_leaks)
1754 __kmemleak_do_cleanup(); 1805 __kmemleak_do_cleanup();
1755 else 1806 else
1756 pr_info("Kmemleak disabled without freeing internal data. " 1807 pr_info("Kmemleak disabled without freeing internal data. "
1757 "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); 1808 "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
1758 mutex_unlock(&scan_mutex);
1759} 1809}
1760 1810
1761static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); 1811static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1776,6 +1826,8 @@ static void kmemleak_disable(void)
1776 /* check whether it is too early for a kernel thread */ 1826 /* check whether it is too early for a kernel thread */
1777 if (kmemleak_initialized) 1827 if (kmemleak_initialized)
1778 schedule_work(&cleanup_work); 1828 schedule_work(&cleanup_work);
1829 else
1830 kmemleak_free_enabled = 0;
1779 1831
1780 pr_info("Kernel memory leak detector disabled\n"); 1832 pr_info("Kernel memory leak detector disabled\n");
1781} 1833}
@@ -1840,8 +1892,10 @@ void __init kmemleak_init(void)
1840 if (kmemleak_error) { 1892 if (kmemleak_error) {
1841 local_irq_restore(flags); 1893 local_irq_restore(flags);
1842 return; 1894 return;
1843 } else 1895 } else {
1844 kmemleak_enabled = 1; 1896 kmemleak_enabled = 1;
1897 kmemleak_free_enabled = 1;
1898 }
1845 local_irq_restore(flags); 1899 local_irq_restore(flags);
1846 1900
1847 /* 1901 /*
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed79..1b444c730846 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
54#ifdef CONFIG_MOVABLE_NODE 54#ifdef CONFIG_MOVABLE_NODE
55bool movable_node_enabled __initdata_memblock = false; 55bool movable_node_enabled __initdata_memblock = false;
56#endif 56#endif
57static bool system_has_some_mirror __initdata_memblock = false;
57static int memblock_can_resize __initdata_memblock; 58static int memblock_can_resize __initdata_memblock;
58static int memblock_memory_in_slab __initdata_memblock = 0; 59static int memblock_memory_in_slab __initdata_memblock = 0;
59static int memblock_reserved_in_slab __initdata_memblock = 0; 60static int memblock_reserved_in_slab __initdata_memblock = 0;
60 61
62ulong __init_memblock choose_memblock_flags(void)
63{
64 return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
65}
66
61/* inline so we don't get a warning when pr_debug is compiled out */ 67/* inline so we don't get a warning when pr_debug is compiled out */
62static __init_memblock const char * 68static __init_memblock const char *
63memblock_type_name(struct memblock_type *type) 69memblock_type_name(struct memblock_type *type)
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
107 * @size: size of free area to find 113 * @size: size of free area to find
108 * @align: alignment of free area to find 114 * @align: alignment of free area to find
109 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 115 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
116 * @flags: pick from blocks based on memory attributes
110 * 117 *
111 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 118 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
112 * 119 *
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
115 */ 122 */
116static phys_addr_t __init_memblock 123static phys_addr_t __init_memblock
117__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, 124__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
118 phys_addr_t size, phys_addr_t align, int nid) 125 phys_addr_t size, phys_addr_t align, int nid,
126 ulong flags)
119{ 127{
120 phys_addr_t this_start, this_end, cand; 128 phys_addr_t this_start, this_end, cand;
121 u64 i; 129 u64 i;
122 130
123 for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { 131 for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
124 this_start = clamp(this_start, start, end); 132 this_start = clamp(this_start, start, end);
125 this_end = clamp(this_end, start, end); 133 this_end = clamp(this_end, start, end);
126 134
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
139 * @size: size of free area to find 147 * @size: size of free area to find
140 * @align: alignment of free area to find 148 * @align: alignment of free area to find
141 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 149 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
150 * @flags: pick from blocks based on memory attributes
142 * 151 *
143 * Utility called from memblock_find_in_range_node(), find free area top-down. 152 * Utility called from memblock_find_in_range_node(), find free area top-down.
144 * 153 *
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
147 */ 156 */
148static phys_addr_t __init_memblock 157static phys_addr_t __init_memblock
149__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, 158__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
150 phys_addr_t size, phys_addr_t align, int nid) 159 phys_addr_t size, phys_addr_t align, int nid,
160 ulong flags)
151{ 161{
152 phys_addr_t this_start, this_end, cand; 162 phys_addr_t this_start, this_end, cand;
153 u64 i; 163 u64 i;
154 164
155 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { 165 for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
166 NULL) {
156 this_start = clamp(this_start, start, end); 167 this_start = clamp(this_start, start, end);
157 this_end = clamp(this_end, start, end); 168 this_end = clamp(this_end, start, end);
158 169
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
174 * @start: start of candidate range 185 * @start: start of candidate range
175 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 186 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
176 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 187 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
188 * @flags: pick from blocks based on memory attributes
177 * 189 *
178 * Find @size free area aligned to @align in the specified range and node. 190 * Find @size free area aligned to @align in the specified range and node.
179 * 191 *
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
190 */ 202 */
191phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, 203phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
192 phys_addr_t align, phys_addr_t start, 204 phys_addr_t align, phys_addr_t start,
193 phys_addr_t end, int nid) 205 phys_addr_t end, int nid, ulong flags)
194{ 206{
195 phys_addr_t kernel_end, ret; 207 phys_addr_t kernel_end, ret;
196 208
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
215 227
216 /* ok, try bottom-up allocation first */ 228 /* ok, try bottom-up allocation first */
217 ret = __memblock_find_range_bottom_up(bottom_up_start, end, 229 ret = __memblock_find_range_bottom_up(bottom_up_start, end,
218 size, align, nid); 230 size, align, nid, flags);
219 if (ret) 231 if (ret)
220 return ret; 232 return ret;
221 233
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
233 "memory hotunplug may be affected\n"); 245 "memory hotunplug may be affected\n");
234 } 246 }
235 247
236 return __memblock_find_range_top_down(start, end, size, align, nid); 248 return __memblock_find_range_top_down(start, end, size, align, nid,
249 flags);
237} 250}
238 251
239/** 252/**
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
252 phys_addr_t end, phys_addr_t size, 265 phys_addr_t end, phys_addr_t size,
253 phys_addr_t align) 266 phys_addr_t align)
254{ 267{
255 return memblock_find_in_range_node(size, align, start, end, 268 phys_addr_t ret;
256 NUMA_NO_NODE); 269 ulong flags = choose_memblock_flags();
270
271again:
272 ret = memblock_find_in_range_node(size, align, start, end,
273 NUMA_NO_NODE, flags);
274
275 if (!ret && (flags & MEMBLOCK_MIRROR)) {
276 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
277 &size);
278 flags &= ~MEMBLOCK_MIRROR;
279 goto again;
280 }
281
282 return ret;
257} 283}
258 284
259static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 285static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
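
The allocation paths in these hunks all follow one retry pattern: start with the flags returned by choose_memblock_flags(), and if no mirrored range satisfies the request, clear MEMBLOCK_MIRROR and search ordinary memory instead. A minimal sketch of that pattern follows; choose_memblock_flags() itself is not part of this hunk, so the body shown is an assumption based on the system_has_some_mirror flag set by memblock_mark_mirror() later in the patch.

/*
 * Sketch only: assumed shape of choose_memblock_flags() plus the
 * mirror-first retry used by memblock_find_in_range() above.
 */
static ulong choose_memblock_flags(void)
{
	/* Prefer mirrored memory once any mirrored range has been marked. */
	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}

static phys_addr_t example_find_free(phys_addr_t size, phys_addr_t align,
				     phys_addr_t start, phys_addr_t end)
{
	ulong flags = choose_memblock_flags();
	phys_addr_t ret;

again:
	ret = memblock_find_in_range_node(size, align, start, end,
					  NUMA_NO_NODE, flags);
	if (!ret && (flags & MEMBLOCK_MIRROR)) {
		/* Nothing mirrored fits: fall back to any free memory. */
		flags &= ~MEMBLOCK_MIRROR;
		goto again;
	}
	return ret;
}
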
@@ -779,9 +805,25 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
779} 805}
780 806
781/** 807/**
808 * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
809 * @base: the base phys addr of the region
810 * @size: the size of the region
811 *
 812 * Return 0 on success, -errno on failure.
813 */
814int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
815{
816 system_has_some_mirror = true;
817
818 return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
819}
820
821
822/**
782 * __next__mem_range - next function for for_each_free_mem_range() etc. 823 * __next__mem_range - next function for for_each_free_mem_range() etc.
783 * @idx: pointer to u64 loop variable 824 * @idx: pointer to u64 loop variable
784 * @nid: node selector, %NUMA_NO_NODE for all nodes 825 * @nid: node selector, %NUMA_NO_NODE for all nodes
826 * @flags: pick from blocks based on memory attributes
785 * @type_a: pointer to memblock_type from where the range is taken 827 * @type_a: pointer to memblock_type from where the range is taken
786 * @type_b: pointer to memblock_type which excludes memory from being taken 828 * @type_b: pointer to memblock_type which excludes memory from being taken
787 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 829 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +845,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
803 * As both region arrays are sorted, the function advances the two indices 845 * As both region arrays are sorted, the function advances the two indices
804 * in lockstep and returns each intersection. 846 * in lockstep and returns each intersection.
805 */ 847 */
806void __init_memblock __next_mem_range(u64 *idx, int nid, 848void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
807 struct memblock_type *type_a, 849 struct memblock_type *type_a,
808 struct memblock_type *type_b, 850 struct memblock_type *type_b,
809 phys_addr_t *out_start, 851 phys_addr_t *out_start,
@@ -831,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
831 if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 873 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
832 continue; 874 continue;
833 875
 876 /* if we want mirror memory, skip non-mirror memory regions */
877 if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
878 continue;
879
834 if (!type_b) { 880 if (!type_b) {
835 if (out_start) 881 if (out_start)
836 *out_start = m_start; 882 *out_start = m_start;
@@ -895,6 +941,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
895 * 941 *
896 * @idx: pointer to u64 loop variable 942 * @idx: pointer to u64 loop variable
897 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 943 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
944 * @flags: pick from blocks based on memory attributes
898 * @type_a: pointer to memblock_type from where the range is taken 945 * @type_a: pointer to memblock_type from where the range is taken
899 * @type_b: pointer to memblock_type which excludes memory from being taken 946 * @type_b: pointer to memblock_type which excludes memory from being taken
900 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 947 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +950,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
903 * 950 *
904 * Reverse of __next_mem_range(). 951 * Reverse of __next_mem_range().
905 */ 952 */
906void __init_memblock __next_mem_range_rev(u64 *idx, int nid, 953void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
907 struct memblock_type *type_a, 954 struct memblock_type *type_a,
908 struct memblock_type *type_b, 955 struct memblock_type *type_b,
909 phys_addr_t *out_start, 956 phys_addr_t *out_start,
@@ -935,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
935 if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 982 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
936 continue; 983 continue;
937 984
 985 /* if we want mirror memory, skip non-mirror memory regions */
986 if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
987 continue;
988
938 if (!type_b) { 989 if (!type_b) {
939 if (out_start) 990 if (out_start)
940 *out_start = m_start; 991 *out_start = m_start;
@@ -1050,14 +1101,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
1050 1101
1051static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, 1102static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
1052 phys_addr_t align, phys_addr_t start, 1103 phys_addr_t align, phys_addr_t start,
1053 phys_addr_t end, int nid) 1104 phys_addr_t end, int nid, ulong flags)
1054{ 1105{
1055 phys_addr_t found; 1106 phys_addr_t found;
1056 1107
1057 if (!align) 1108 if (!align)
1058 align = SMP_CACHE_BYTES; 1109 align = SMP_CACHE_BYTES;
1059 1110
1060 found = memblock_find_in_range_node(size, align, start, end, nid); 1111 found = memblock_find_in_range_node(size, align, start, end, nid,
1112 flags);
1061 if (found && !memblock_reserve(found, size)) { 1113 if (found && !memblock_reserve(found, size)) {
1062 /* 1114 /*
1063 * The min_count is set to 0 so that memblock allocations are 1115 * The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1122,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
1070} 1122}
1071 1123
1072phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, 1124phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
1073 phys_addr_t start, phys_addr_t end) 1125 phys_addr_t start, phys_addr_t end,
1126 ulong flags)
1074{ 1127{
1075 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); 1128 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
1129 flags);
1076} 1130}
1077 1131
1078static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 1132static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
1079 phys_addr_t align, phys_addr_t max_addr, 1133 phys_addr_t align, phys_addr_t max_addr,
1080 int nid) 1134 int nid, ulong flags)
1081{ 1135{
1082 return memblock_alloc_range_nid(size, align, 0, max_addr, nid); 1136 return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
1083} 1137}
1084 1138
1085phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 1139phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
1086{ 1140{
1087 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 1141 ulong flags = choose_memblock_flags();
1142 phys_addr_t ret;
1143
1144again:
1145 ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
1146 nid, flags);
1147
1148 if (!ret && (flags & MEMBLOCK_MIRROR)) {
1149 flags &= ~MEMBLOCK_MIRROR;
1150 goto again;
1151 }
1152 return ret;
1088} 1153}
1089 1154
1090phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1155phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
1091{ 1156{
1092 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); 1157 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
1158 MEMBLOCK_NONE);
1093} 1159}
1094 1160
1095phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1161phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1153,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal(
1153{ 1219{
1154 phys_addr_t alloc; 1220 phys_addr_t alloc;
1155 void *ptr; 1221 void *ptr;
1222 ulong flags = choose_memblock_flags();
1156 1223
1157 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) 1224 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1158 nid = NUMA_NO_NODE; 1225 nid = NUMA_NO_NODE;
@@ -1173,13 +1240,14 @@ static void * __init memblock_virt_alloc_internal(
1173 1240
1174again: 1241again:
1175 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, 1242 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1176 nid); 1243 nid, flags);
1177 if (alloc) 1244 if (alloc)
1178 goto done; 1245 goto done;
1179 1246
1180 if (nid != NUMA_NO_NODE) { 1247 if (nid != NUMA_NO_NODE) {
1181 alloc = memblock_find_in_range_node(size, align, min_addr, 1248 alloc = memblock_find_in_range_node(size, align, min_addr,
1182 max_addr, NUMA_NO_NODE); 1249 max_addr, NUMA_NO_NODE,
1250 flags);
1183 if (alloc) 1251 if (alloc)
1184 goto done; 1252 goto done;
1185 } 1253 }
@@ -1187,10 +1255,16 @@ again:
1187 if (min_addr) { 1255 if (min_addr) {
1188 min_addr = 0; 1256 min_addr = 0;
1189 goto again; 1257 goto again;
1190 } else {
1191 goto error;
1192 } 1258 }
1193 1259
1260 if (flags & MEMBLOCK_MIRROR) {
1261 flags &= ~MEMBLOCK_MIRROR;
1262 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
1263 &size);
1264 goto again;
1265 }
1266
1267 return NULL;
1194done: 1268done:
1195 memblock_reserve(alloc, size); 1269 memblock_reserve(alloc, size);
1196 ptr = phys_to_virt(alloc); 1270 ptr = phys_to_virt(alloc);
@@ -1205,9 +1279,6 @@ done:
1205 kmemleak_alloc(ptr, size, 0, 0); 1279 kmemleak_alloc(ptr, size, 0, 0);
1206 1280
1207 return ptr; 1281 return ptr;
1208
1209error:
1210 return NULL;
1211} 1282}
1212 1283
1213/** 1284/**
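
memblock_virt_alloc_internal() now works through a longer fallback chain before failing. Condensed into one place, and returning the found physical address rather than reserving and mapping it, the order is roughly the following sketch (same helpers as the hunks above):

/* Condensed, illustrative view of the fallback order above. */
static phys_addr_t example_virt_alloc_order(phys_addr_t size, phys_addr_t align,
					    phys_addr_t min_addr,
					    phys_addr_t max_addr, int nid)
{
	ulong flags = choose_memblock_flags();
	phys_addr_t alloc;

again:
	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
					    nid, flags);
	if (alloc)
		return alloc;

	if (nid != NUMA_NO_NODE) {
		/* 1) the requested node is full: try any node */
		alloc = memblock_find_in_range_node(size, align, min_addr,
						    max_addr, NUMA_NO_NODE,
						    flags);
		if (alloc)
			return alloc;
	}

	if (min_addr) {
		/* 2) relax the lower bound and start over */
		min_addr = 0;
		goto again;
	}

	if (flags & MEMBLOCK_MIRROR) {
		/* 3) finally give up on mirrored memory */
		flags &= ~MEMBLOCK_MIRROR;
		goto again;
	}

	return 0;
}
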
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a04225d372ba..e65f7b0131d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -285,9 +285,9 @@ struct mem_cgroup {
285 */ 285 */
286 bool use_hierarchy; 286 bool use_hierarchy;
287 287
288 /* protected by memcg_oom_lock */
288 bool oom_lock; 289 bool oom_lock;
289 atomic_t under_oom; 290 int under_oom;
290 atomic_t oom_wakeups;
291 291
292 int swappiness; 292 int swappiness;
293 /* OOM-Killer disable */ 293 /* OOM-Killer disable */
@@ -1530,14 +1530,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1530 unsigned int points = 0; 1530 unsigned int points = 0;
1531 struct task_struct *chosen = NULL; 1531 struct task_struct *chosen = NULL;
1532 1532
1533 mutex_lock(&oom_lock);
1534
1533 /* 1535 /*
1534 * If current has a pending SIGKILL or is exiting, then automatically 1536 * If current has a pending SIGKILL or is exiting, then automatically
1535 * select it. The goal is to allow it to allocate so that it may 1537 * select it. The goal is to allow it to allocate so that it may
1536 * quickly exit and free its memory. 1538 * quickly exit and free its memory.
1537 */ 1539 */
1538 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1540 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1539 mark_tsk_oom_victim(current); 1541 mark_oom_victim(current);
1540 return; 1542 goto unlock;
1541 } 1543 }
1542 1544
1543 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); 1545 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1564 mem_cgroup_iter_break(memcg, iter); 1566 mem_cgroup_iter_break(memcg, iter);
1565 if (chosen) 1567 if (chosen)
1566 put_task_struct(chosen); 1568 put_task_struct(chosen);
1567 return; 1569 goto unlock;
1568 case OOM_SCAN_OK: 1570 case OOM_SCAN_OK:
1569 break; 1571 break;
1570 }; 1572 };
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1585 css_task_iter_end(&it); 1587 css_task_iter_end(&it);
1586 } 1588 }
1587 1589
1588 if (!chosen) 1590 if (chosen) {
1589 return; 1591 points = chosen_points * 1000 / totalpages;
1590 points = chosen_points * 1000 / totalpages; 1592 oom_kill_process(chosen, gfp_mask, order, points, totalpages,
1591 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1593 memcg, NULL, "Memory cgroup out of memory");
1592 NULL, "Memory cgroup out of memory"); 1594 }
1595unlock:
1596 mutex_unlock(&oom_lock);
1593} 1597}
1594 1598
1595#if MAX_NUMNODES > 1 1599#if MAX_NUMNODES > 1
@@ -1806,8 +1810,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1806{ 1810{
1807 struct mem_cgroup *iter; 1811 struct mem_cgroup *iter;
1808 1812
1813 spin_lock(&memcg_oom_lock);
1809 for_each_mem_cgroup_tree(iter, memcg) 1814 for_each_mem_cgroup_tree(iter, memcg)
1810 atomic_inc(&iter->under_oom); 1815 iter->under_oom++;
1816 spin_unlock(&memcg_oom_lock);
1811} 1817}
1812 1818
1813static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1819static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1816,11 +1822,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1816 1822
1817 /* 1823 /*
1818 * When a new child is created while the hierarchy is under oom, 1824 * When a new child is created while the hierarchy is under oom,
1819 * mem_cgroup_oom_lock() may not be called. We have to use 1825 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1820 * atomic_add_unless() here.
1821 */ 1826 */
1827 spin_lock(&memcg_oom_lock);
1822 for_each_mem_cgroup_tree(iter, memcg) 1828 for_each_mem_cgroup_tree(iter, memcg)
1823 atomic_add_unless(&iter->under_oom, -1, 0); 1829 if (iter->under_oom > 0)
1830 iter->under_oom--;
1831 spin_unlock(&memcg_oom_lock);
1824} 1832}
1825 1833
1826static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1834static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1846,17 +1854,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
1846 return autoremove_wake_function(wait, mode, sync, arg); 1854 return autoremove_wake_function(wait, mode, sync, arg);
1847} 1855}
1848 1856
1849static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1850{
1851 atomic_inc(&memcg->oom_wakeups);
1852 /* for filtering, pass "memcg" as argument. */
1853 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1854}
1855
1856static void memcg_oom_recover(struct mem_cgroup *memcg) 1857static void memcg_oom_recover(struct mem_cgroup *memcg)
1857{ 1858{
1858 if (memcg && atomic_read(&memcg->under_oom)) 1859 /*
1859 memcg_wakeup_oom(memcg); 1860 * For the following lockless ->under_oom test, the only required
1861 * guarantee is that it must see the state asserted by an OOM when
1862 * this function is called as a result of userland actions
1863 * triggered by the notification of the OOM. This is trivially
1864 * achieved by invoking mem_cgroup_mark_under_oom() before
1865 * triggering notification.
1866 */
1867 if (memcg && memcg->under_oom)
1868 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1860} 1869}
1861 1870
1862static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1871static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -3864,7 +3873,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3864 list_add(&event->list, &memcg->oom_notify); 3873 list_add(&event->list, &memcg->oom_notify);
3865 3874
3866 /* already in OOM ? */ 3875 /* already in OOM ? */
3867 if (atomic_read(&memcg->under_oom)) 3876 if (memcg->under_oom)
3868 eventfd_signal(eventfd, 1); 3877 eventfd_signal(eventfd, 1);
3869 spin_unlock(&memcg_oom_lock); 3878 spin_unlock(&memcg_oom_lock);
3870 3879
@@ -3893,7 +3902,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3893 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3902 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3894 3903
3895 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3904 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3896 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 3905 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3897 return 0; 3906 return 0;
3898} 3907}
3899 3908
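
The memcontrol.c changes above serialize OOM handling on the global oom_lock mutex and turn under_oom from an atomic counter into a plain int that is only touched under memcg_oom_lock, with an explicit underflow check replacing the old atomic_add_unless(). The same idiom in isolation, with hypothetical names that are not part of the patch:

#include <linux/spinlock.h>

/* Hypothetical stand-alone version of the under_oom bookkeeping above. */
static DEFINE_SPINLOCK(example_oom_lock);
static int example_under_oom;	/* only modified under example_oom_lock */

static void example_mark_under_oom(void)
{
	spin_lock(&example_oom_lock);
	example_under_oom++;
	spin_unlock(&example_oom_lock);
}

static void example_unmark_under_oom(void)
{
	spin_lock(&example_oom_lock);
	/* a child created while under OOM may unmark without a prior mark */
	if (example_under_oom > 0)
		example_under_oom--;
	spin_unlock(&example_oom_lock);
}
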
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 501820c815b3..c53543d89282 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -20,6 +20,14 @@
20 * this code has to be extremely careful. Generally it tries to use 20 * this code has to be extremely careful. Generally it tries to use
21 * normal locking rules, as in get the standard locks, even if that means 21 * normal locking rules, as in get the standard locks, even if that means
22 * the error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
23 *
24 * It can be very tempting to add handling for obscure cases here.
25 * In general any code for handling new cases should only be added iff:
26 * - You know how to test it.
27 * - You have a test that can be added to mce-test
28 * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
29 * - The case actually shows up as a frequent (top 10) page state in
30 * tools/vm/page-types when running a real workload.
23 * 31 *
24 * There are several operations here with exponential complexity because 32 * There are several operations here with exponential complexity because
25 * of unsuitable VM data structures. For example the operation to map back 33 * of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
28 * are rare we hope to get away with this. This avoids impacting the core 36 * are rare we hope to get away with this. This avoids impacting the core
29 * VM. 37 * VM.
30 */ 38 */
31
32/*
33 * Notebook:
34 * - hugetlb needs more code
35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
36 * - pass bad pages to kdump next kernel
37 */
38#include <linux/kernel.h> 39#include <linux/kernel.h>
39#include <linux/mm.h> 40#include <linux/mm.h>
40#include <linux/page-flags.h> 41#include <linux/page-flags.h>
@@ -56,6 +57,7 @@
56#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
57#include <linux/kfifo.h> 58#include <linux/kfifo.h>
58#include "internal.h" 59#include "internal.h"
60#include "ras/ras_event.h"
59 61
60int sysctl_memory_failure_early_kill __read_mostly = 0; 62int sysctl_memory_failure_early_kill __read_mostly = 0;
61 63
@@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
503 kfree(tk); 505 kfree(tk);
504} 506}
505 507
506/*
507 * Error handlers for various types of pages.
508 */
509
510enum outcome {
511 IGNORED, /* Error: cannot be handled */
512 FAILED, /* Error: handling failed */
513 DELAYED, /* Will be handled later */
514 RECOVERED, /* Successfully recovered */
515};
516
517static const char *action_name[] = { 508static const char *action_name[] = {
518 [IGNORED] = "Ignored", 509 [MF_IGNORED] = "Ignored",
519 [FAILED] = "Failed", 510 [MF_FAILED] = "Failed",
520 [DELAYED] = "Delayed", 511 [MF_DELAYED] = "Delayed",
521 [RECOVERED] = "Recovered", 512 [MF_RECOVERED] = "Recovered",
522};
523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545}; 513};
546 514
547static const char * const action_page_types[] = { 515static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page", 516 [MF_MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", 517 [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page", 518 [MF_MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", 519 [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned", 520 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page", 521 [MF_MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page", 522 [MF_MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page", 523 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", 524 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", 525 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", 526 [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", 527 [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", 528 [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", 529 [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page", 530 [MF_MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page", 531 [MF_MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page", 532 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page", 533 [MF_MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)", 534 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page", 535 [MF_MSG_UNKNOWN] = "unknown page",
568}; 536};
569 537
570/* 538/*
@@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
598 */ 566 */
599static int me_kernel(struct page *p, unsigned long pfn) 567static int me_kernel(struct page *p, unsigned long pfn)
600{ 568{
601 return IGNORED; 569 return MF_IGNORED;
602} 570}
603 571
604/* 572/*
@@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
607static int me_unknown(struct page *p, unsigned long pfn) 575static int me_unknown(struct page *p, unsigned long pfn)
608{ 576{
609 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); 577 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
610 return FAILED; 578 return MF_FAILED;
611} 579}
612 580
613/* 581/*
@@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
616static int me_pagecache_clean(struct page *p, unsigned long pfn) 584static int me_pagecache_clean(struct page *p, unsigned long pfn)
617{ 585{
618 int err; 586 int err;
619 int ret = FAILED; 587 int ret = MF_FAILED;
620 struct address_space *mapping; 588 struct address_space *mapping;
621 589
622 delete_from_lru_cache(p); 590 delete_from_lru_cache(p);
@@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
626 * should be the one m_f() holds. 594 * should be the one m_f() holds.
627 */ 595 */
628 if (PageAnon(p)) 596 if (PageAnon(p))
629 return RECOVERED; 597 return MF_RECOVERED;
630 598
631 /* 599 /*
632 * Now truncate the page in the page cache. This is really 600 * Now truncate the page in the page cache. This is really
@@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
640 /* 608 /*
641 * Page has been teared down in the meanwhile 609 * Page has been teared down in the meanwhile
642 */ 610 */
643 return FAILED; 611 return MF_FAILED;
644 } 612 }
645 613
646 /* 614 /*
@@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
657 !try_to_release_page(p, GFP_NOIO)) { 625 !try_to_release_page(p, GFP_NOIO)) {
658 pr_info("MCE %#lx: failed to release buffers\n", pfn); 626 pr_info("MCE %#lx: failed to release buffers\n", pfn);
659 } else { 627 } else {
660 ret = RECOVERED; 628 ret = MF_RECOVERED;
661 } 629 }
662 } else { 630 } else {
663 /* 631 /*
@@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
665 * This fails on dirty or anything with private pages 633 * This fails on dirty or anything with private pages
666 */ 634 */
667 if (invalidate_inode_page(p)) 635 if (invalidate_inode_page(p))
668 ret = RECOVERED; 636 ret = MF_RECOVERED;
669 else 637 else
670 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", 638 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
671 pfn); 639 pfn);
@@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
751 ClearPageUptodate(p); 719 ClearPageUptodate(p);
752 720
753 if (!delete_from_lru_cache(p)) 721 if (!delete_from_lru_cache(p))
754 return DELAYED; 722 return MF_DELAYED;
755 else 723 else
756 return FAILED; 724 return MF_FAILED;
757} 725}
758 726
759static int me_swapcache_clean(struct page *p, unsigned long pfn) 727static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
761 delete_from_swap_cache(p); 729 delete_from_swap_cache(p);
762 730
763 if (!delete_from_lru_cache(p)) 731 if (!delete_from_lru_cache(p))
764 return RECOVERED; 732 return MF_RECOVERED;
765 else 733 else
766 return FAILED; 734 return MF_FAILED;
767} 735}
768 736
769/* 737/*
@@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
776{ 744{
777 int res = 0; 745 int res = 0;
778 struct page *hpage = compound_head(p); 746 struct page *hpage = compound_head(p);
747
748 if (!PageHuge(hpage))
749 return MF_DELAYED;
750
779 /* 751 /*
780 * We can safely recover from error on free or reserved (i.e. 752 * We can safely recover from error on free or reserved (i.e.
781 * not in-use) hugepage by dequeuing it from freelist. 753 * not in-use) hugepage by dequeuing it from freelist.
@@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
789 if (!(page_mapping(hpage) || PageAnon(hpage))) { 761 if (!(page_mapping(hpage) || PageAnon(hpage))) {
790 res = dequeue_hwpoisoned_huge_page(hpage); 762 res = dequeue_hwpoisoned_huge_page(hpage);
791 if (!res) 763 if (!res)
792 return RECOVERED; 764 return MF_RECOVERED;
793 } 765 }
794 return DELAYED; 766 return MF_DELAYED;
795} 767}
796 768
797/* 769/*
@@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
823static struct page_state { 795static struct page_state {
824 unsigned long mask; 796 unsigned long mask;
825 unsigned long res; 797 unsigned long res;
826 enum action_page_type type; 798 enum mf_action_page_type type;
827 int (*action)(struct page *p, unsigned long pfn); 799 int (*action)(struct page *p, unsigned long pfn);
828} error_states[] = { 800} error_states[] = {
829 { reserved, reserved, MSG_KERNEL, me_kernel }, 801 { reserved, reserved, MF_MSG_KERNEL, me_kernel },
830 /* 802 /*
831 * free pages are specially detected outside this table: 803 * free pages are specially detected outside this table:
832 * PG_buddy pages only make a small fraction of all free pages. 804 * PG_buddy pages only make a small fraction of all free pages.
@@ -837,31 +809,31 @@ static struct page_state {
837 * currently unused objects without touching them. But just 809 * currently unused objects without touching them. But just
838 * treat it as standard kernel for now. 810 * treat it as standard kernel for now.
839 */ 811 */
840 { slab, slab, MSG_SLAB, me_kernel }, 812 { slab, slab, MF_MSG_SLAB, me_kernel },
841 813
842#ifdef CONFIG_PAGEFLAGS_EXTENDED 814#ifdef CONFIG_PAGEFLAGS_EXTENDED
843 { head, head, MSG_HUGE, me_huge_page }, 815 { head, head, MF_MSG_HUGE, me_huge_page },
844 { tail, tail, MSG_HUGE, me_huge_page }, 816 { tail, tail, MF_MSG_HUGE, me_huge_page },
845#else 817#else
846 { compound, compound, MSG_HUGE, me_huge_page }, 818 { compound, compound, MF_MSG_HUGE, me_huge_page },
847#endif 819#endif
848 820
849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, 821 { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, 822 { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
851 823
852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, 824 { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, 825 { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
854 826
855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, 827 { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, 828 { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
857 829
858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, 830 { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, 831 { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
860 832
861 /* 833 /*
862 * Catchall entry: must be at end. 834 * Catchall entry: must be at end.
863 */ 835 */
864 { 0, 0, MSG_UNKNOWN, me_unknown }, 836 { 0, 0, MF_MSG_UNKNOWN, me_unknown },
865}; 837};
866 838
867#undef dirty 839#undef dirty
@@ -881,8 +853,11 @@ static struct page_state {
881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 853 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
882 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 854 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
883 */ 855 */
884static void action_result(unsigned long pfn, enum action_page_type type, int result) 856static void action_result(unsigned long pfn, enum mf_action_page_type type,
857 enum mf_result result)
885{ 858{
859 trace_memory_failure_event(pfn, type, result);
860
886 pr_err("MCE %#lx: recovery action for %s: %s\n", 861 pr_err("MCE %#lx: recovery action for %s: %s\n",
887 pfn, action_page_types[type], action_name[result]); 862 pfn, action_page_types[type], action_name[result]);
888} 863}
@@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p,
896 result = ps->action(p, pfn); 871 result = ps->action(p, pfn);
897 872
898 count = page_count(p) - 1; 873 count = page_count(p) - 1;
899 if (ps->action == me_swapcache_dirty && result == DELAYED) 874 if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
900 count--; 875 count--;
901 if (count != 0) { 876 if (count != 0) {
902 printk(KERN_ERR 877 printk(KERN_ERR
903 "MCE %#lx: %s still referenced by %d users\n", 878 "MCE %#lx: %s still referenced by %d users\n",
904 pfn, action_page_types[ps->type], count); 879 pfn, action_page_types[ps->type], count);
905 result = FAILED; 880 result = MF_FAILED;
906 } 881 }
907 action_result(pfn, ps->type, result); 882 action_result(pfn, ps->type, result);
908 883
@@ -911,9 +886,42 @@ static int page_action(struct page_state *ps, struct page *p,
911 * Could adjust zone counters here to correct for the missing page. 886 * Could adjust zone counters here to correct for the missing page.
912 */ 887 */
913 888
914 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 889 return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
915} 890}
916 891
892/**
893 * get_hwpoison_page() - Get refcount for memory error handling:
894 * @page: raw error page (hit by memory error)
895 *
 896 * Return: 0 if the refcount could not be grabbed, otherwise true (some
 897 * non-zero value).
898 */
899int get_hwpoison_page(struct page *page)
900{
901 struct page *head = compound_head(page);
902
903 if (PageHuge(head))
904 return get_page_unless_zero(head);
905
906 /*
 907 * THP tail pages have a special refcounting rule (their refcount is
 908 * stored in ->_mapcount), so we can't call get_page_unless_zero()
 909 * directly on a tail page.
910 */
911 if (PageTransHuge(head)) {
912 if (get_page_unless_zero(head)) {
913 if (PageTail(page))
914 get_page(page);
915 return 1;
916 } else {
917 return 0;
918 }
919 }
920
921 return get_page_unless_zero(page);
922}
923EXPORT_SYMBOL_GPL(get_hwpoison_page);
924
917/* 925/*
918 * Do all that is necessary to remove user space mappings. Unmap 926 * Do all that is necessary to remove user space mappings. Unmap
919 * the pages and send SIGBUS to the processes if the data was dirty. 927 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -927,7 +935,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
927 int ret; 935 int ret;
928 int kill = 1, forcekill; 936 int kill = 1, forcekill;
929 struct page *hpage = *hpagep; 937 struct page *hpage = *hpagep;
930 struct page *ppage;
931 938
932 /* 939 /*
933 * Here we are interested only in user-mapped pages, so skip any 940 * Here we are interested only in user-mapped pages, so skip any
@@ -977,59 +984,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
977 } 984 }
978 985
979 /* 986 /*
980 * ppage: poisoned page
981 * if p is regular page(4k page)
982 * ppage == real poisoned page;
983 * else p is hugetlb or THP, ppage == head page.
984 */
985 ppage = hpage;
986
987 if (PageTransHuge(hpage)) {
988 /*
989 * Verify that this isn't a hugetlbfs head page, the check for
990 * PageAnon is just for avoid tripping a split_huge_page
991 * internal debug check, as split_huge_page refuses to deal with
992 * anything that isn't an anon page. PageAnon can't go away fro
993 * under us because we hold a refcount on the hpage, without a
994 * refcount on the hpage. split_huge_page can't be safely called
995 * in the first place, having a refcount on the tail isn't
996 * enough * to be safe.
997 */
998 if (!PageHuge(hpage) && PageAnon(hpage)) {
999 if (unlikely(split_huge_page(hpage))) {
1000 /*
1001 * FIXME: if splitting THP is failed, it is
1002 * better to stop the following operation rather
1003 * than causing panic by unmapping. System might
1004 * survive if the page is freed later.
1005 */
1006 printk(KERN_INFO
1007 "MCE %#lx: failed to split THP\n", pfn);
1008
1009 BUG_ON(!PageHWPoison(p));
1010 return SWAP_FAIL;
1011 }
1012 /*
1013 * We pinned the head page for hwpoison handling,
1014 * now we split the thp and we are interested in
1015 * the hwpoisoned raw page, so move the refcount
1016 * to it. Similarly, page lock is shifted.
1017 */
1018 if (hpage != p) {
1019 if (!(flags & MF_COUNT_INCREASED)) {
1020 put_page(hpage);
1021 get_page(p);
1022 }
1023 lock_page(p);
1024 unlock_page(hpage);
1025 *hpagep = p;
1026 }
1027 /* THP is split, so ppage should be the real poisoned page. */
1028 ppage = p;
1029 }
1030 }
1031
1032 /*
1033 * First collect all the processes that have the page 987 * First collect all the processes that have the page
1034 * mapped in dirty form. This has to be done before try_to_unmap, 988 * mapped in dirty form. This has to be done before try_to_unmap,
1035 * because ttu takes the rmap data structures down. 989 * because ttu takes the rmap data structures down.
@@ -1038,12 +992,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
1038 * there's nothing that can be done. 992 * there's nothing that can be done.
1039 */ 993 */
1040 if (kill) 994 if (kill)
1041 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); 995 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1042 996
1043 ret = try_to_unmap(ppage, ttu); 997 ret = try_to_unmap(hpage, ttu);
1044 if (ret != SWAP_SUCCESS) 998 if (ret != SWAP_SUCCESS)
1045 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 999 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
1046 pfn, page_mapcount(ppage)); 1000 pfn, page_mapcount(hpage));
1047 1001
1048 /* 1002 /*
1049 * Now that the dirty bit has been propagated to the 1003 * Now that the dirty bit has been propagated to the
@@ -1055,7 +1009,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
1055 * use a more force-full uncatchable kill to prevent 1009 * use a more force-full uncatchable kill to prevent
1056 * any accesses to the poisoned memory. 1010 * any accesses to the poisoned memory.
1057 */ 1011 */
1058 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); 1012 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1059 kill_procs(&tokill, forcekill, trapno, 1013 kill_procs(&tokill, forcekill, trapno,
1060 ret != SWAP_SUCCESS, p, pfn, flags); 1014 ret != SWAP_SUCCESS, p, pfn, flags);
1061 1015
@@ -1101,6 +1055,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1101 struct page_state *ps; 1055 struct page_state *ps;
1102 struct page *p; 1056 struct page *p;
1103 struct page *hpage; 1057 struct page *hpage;
1058 struct page *orig_head;
1104 int res; 1059 int res;
1105 unsigned int nr_pages; 1060 unsigned int nr_pages;
1106 unsigned long page_flags; 1061 unsigned long page_flags;
@@ -1116,7 +1071,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1116 } 1071 }
1117 1072
1118 p = pfn_to_page(pfn); 1073 p = pfn_to_page(pfn);
1119 hpage = compound_head(p); 1074 orig_head = hpage = compound_head(p);
1120 if (TestSetPageHWPoison(p)) { 1075 if (TestSetPageHWPoison(p)) {
1121 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 1076 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1122 return 0; 1077 return 0;
@@ -1149,10 +1104,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1149 * In fact it's dangerous to directly bump up page count from 0, 1104 * In fact it's dangerous to directly bump up page count from 0,
1150 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 1105 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1151 */ 1106 */
1152 if (!(flags & MF_COUNT_INCREASED) && 1107 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1153 !get_page_unless_zero(hpage)) {
1154 if (is_free_buddy_page(p)) { 1108 if (is_free_buddy_page(p)) {
1155 action_result(pfn, MSG_BUDDY, DELAYED); 1109 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1156 return 0; 1110 return 0;
1157 } else if (PageHuge(hpage)) { 1111 } else if (PageHuge(hpage)) {
1158 /* 1112 /*
@@ -1169,16 +1123,39 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1169 } 1123 }
1170 set_page_hwpoison_huge_page(hpage); 1124 set_page_hwpoison_huge_page(hpage);
1171 res = dequeue_hwpoisoned_huge_page(hpage); 1125 res = dequeue_hwpoisoned_huge_page(hpage);
1172 action_result(pfn, MSG_FREE_HUGE, 1126 action_result(pfn, MF_MSG_FREE_HUGE,
1173 res ? IGNORED : DELAYED); 1127 res ? MF_IGNORED : MF_DELAYED);
1174 unlock_page(hpage); 1128 unlock_page(hpage);
1175 return res; 1129 return res;
1176 } else { 1130 } else {
1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); 1131 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1178 return -EBUSY; 1132 return -EBUSY;
1179 } 1133 }
1180 } 1134 }
1181 1135
1136 if (!PageHuge(p) && PageTransHuge(hpage)) {
1137 if (!PageAnon(hpage)) {
1138 pr_err("MCE: %#lx: non anonymous thp\n", pfn);
1139 if (TestClearPageHWPoison(p))
1140 atomic_long_sub(nr_pages, &num_poisoned_pages);
1141 put_page(p);
1142 if (p != hpage)
1143 put_page(hpage);
1144 return -EBUSY;
1145 }
1146 if (unlikely(split_huge_page(hpage))) {
1147 pr_err("MCE: %#lx: thp split failed\n", pfn);
1148 if (TestClearPageHWPoison(p))
1149 atomic_long_sub(nr_pages, &num_poisoned_pages);
1150 put_page(p);
1151 if (p != hpage)
1152 put_page(hpage);
1153 return -EBUSY;
1154 }
1155 VM_BUG_ON_PAGE(!page_count(p), p);
1156 hpage = compound_head(p);
1157 }
1158
1182 /* 1159 /*
1183 * We ignore non-LRU pages for good reasons. 1160 * We ignore non-LRU pages for good reasons.
1184 * - PG_locked is only well defined for LRU pages and a few others 1161 * - PG_locked is only well defined for LRU pages and a few others
@@ -1188,18 +1165,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1188 * walked by the page reclaim code, however that's not a big loss. 1165 * walked by the page reclaim code, however that's not a big loss.
1189 */ 1166 */
1190 if (!PageHuge(p)) { 1167 if (!PageHuge(p)) {
1191 if (!PageLRU(hpage)) 1168 if (!PageLRU(p))
1192 shake_page(hpage, 0); 1169 shake_page(p, 0);
1193 if (!PageLRU(hpage)) { 1170 if (!PageLRU(p)) {
1194 /* 1171 /*
1195 * shake_page could have turned it free. 1172 * shake_page could have turned it free.
1196 */ 1173 */
1197 if (is_free_buddy_page(p)) { 1174 if (is_free_buddy_page(p)) {
1198 if (flags & MF_COUNT_INCREASED) 1175 if (flags & MF_COUNT_INCREASED)
1199 action_result(pfn, MSG_BUDDY, DELAYED); 1176 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1200 else 1177 else
1201 action_result(pfn, MSG_BUDDY_2ND, 1178 action_result(pfn, MF_MSG_BUDDY_2ND,
1202 DELAYED); 1179 MF_DELAYED);
1203 return 0; 1180 return 0;
1204 } 1181 }
1205 } 1182 }
@@ -1211,8 +1188,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1211 * The page could have changed compound pages during the locking. 1188 * The page could have changed compound pages during the locking.
1212 * If this happens just bail out. 1189 * If this happens just bail out.
1213 */ 1190 */
1214 if (compound_head(p) != hpage) { 1191 if (PageCompound(p) && compound_head(p) != orig_head) {
1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); 1192 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1216 res = -EBUSY; 1193 res = -EBUSY;
1217 goto out; 1194 goto out;
1218 } 1195 }
@@ -1252,7 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1252 * on the head page to show that the hugepage is hwpoisoned 1229 * on the head page to show that the hugepage is hwpoisoned
1253 */ 1230 */
1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1231 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED); 1232 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
1256 unlock_page(hpage); 1233 unlock_page(hpage);
1257 put_page(hpage); 1234 put_page(hpage);
1258 return 0; 1235 return 0;
@@ -1281,7 +1258,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1281 */ 1258 */
1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1259 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1283 != SWAP_SUCCESS) { 1260 != SWAP_SUCCESS) {
1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED); 1261 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1285 res = -EBUSY; 1262 res = -EBUSY;
1286 goto out; 1263 goto out;
1287 } 1264 }
@@ -1290,7 +1267,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1290 * Torn down by someone else? 1267 * Torn down by someone else?
1291 */ 1268 */
1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1269 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); 1270 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1294 res = -EBUSY; 1271 res = -EBUSY;
1295 goto out; 1272 goto out;
1296 } 1273 }
@@ -1450,12 +1427,12 @@ int unpoison_memory(unsigned long pfn)
1450 */ 1427 */
1451 if (!PageHuge(page) && PageTransHuge(page)) { 1428 if (!PageHuge(page) && PageTransHuge(page)) {
1452 pr_info("MCE: Memory failure is now running on %#lx\n", pfn); 1429 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1453 return 0; 1430 return 0;
1454 } 1431 }
1455 1432
1456 nr_pages = 1 << compound_order(page); 1433 nr_pages = 1 << compound_order(page);
1457 1434
1458 if (!get_page_unless_zero(page)) { 1435 if (!get_hwpoison_page(p)) {
1459 /* 1436 /*
1460 * Since HWPoisoned hugepage should have non-zero refcount, 1437 * Since HWPoisoned hugepage should have non-zero refcount,
1461 * race between memory failure and unpoison seems to happen. 1438 * race between memory failure and unpoison seems to happen.
@@ -1523,7 +1500,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1523 * When the target page is a free hugepage, just remove it 1500 * When the target page is a free hugepage, just remove it
1524 * from free hugepage list. 1501 * from free hugepage list.
1525 */ 1502 */
1526 if (!get_page_unless_zero(compound_head(p))) { 1503 if (!get_hwpoison_page(p)) {
1527 if (PageHuge(p)) { 1504 if (PageHuge(p)) {
1528 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1505 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1529 ret = 0; 1506 ret = 0;
@@ -1694,20 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags)
1694 if (ret > 0) 1671 if (ret > 0)
1695 ret = -EIO; 1672 ret = -EIO;
1696 } else { 1673 } else {
1697 /*
1698 * After page migration succeeds, the source page can
1699 * be trapped in pagevec and actual freeing is delayed.
1700 * Freeing code works differently based on PG_hwpoison,
1701 * so there's a race. We need to make sure that the
1702 * source page should be freed back to buddy before
1703 * setting PG_hwpoison.
1704 */
1705 if (!is_free_buddy_page(page))
1706 drain_all_pages(page_zone(page));
1707 SetPageHWPoison(page); 1674 SetPageHWPoison(page);
1708 if (!is_free_buddy_page(page))
1709 pr_info("soft offline: %#lx: page leaked\n",
1710 pfn);
1711 atomic_long_inc(&num_poisoned_pages); 1675 atomic_long_inc(&num_poisoned_pages);
1712 } 1676 }
1713 } else { 1677 } else {
@@ -1759,14 +1723,6 @@ int soft_offline_page(struct page *page, int flags)
1759 1723
1760 get_online_mems(); 1724 get_online_mems();
1761 1725
1762 /*
1763 * Isolate the page, so that it doesn't get reallocated if it
1764 * was free. This flag should be kept set until the source page
1765 * is freed and PG_hwpoison on it is set.
1766 */
1767 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1768 set_migratetype_isolate(page, true);
1769
1770 ret = get_any_page(page, pfn, flags); 1726 ret = get_any_page(page, pfn, flags);
1771 put_online_mems(); 1727 put_online_mems();
1772 if (ret > 0) { /* for in-use pages */ 1728 if (ret > 0) { /* for in-use pages */
@@ -1785,6 +1741,5 @@ int soft_offline_page(struct page *page, int flags)
1785 atomic_long_inc(&num_poisoned_pages); 1741 atomic_long_inc(&num_poisoned_pages);
1786 } 1742 }
1787 } 1743 }
1788 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1789 return ret; 1744 return ret;
1790} 1745}
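
Across memory-failure.c the outcome and page-type enums gain MF_ prefixes, apparently so the trace_memory_failure_event() call added in action_result() can share them with the new ras/ras_event.h header. How a poisoned page is classified is unchanged: the first error_states[] entry whose mask/res pair matches the page flags wins, with the zero-mask catchall at the end. A sketch of that lookup, using the table and struct shown above:

/* Sketch of the error_states[] lookup performed for a poisoned page. */
static struct page_state *example_page_state(unsigned long page_flags)
{
	struct page_state *ps;

	for (ps = error_states; ps->mask; ps++)
		if ((page_flags & ps->mask) == ps->res)
			return ps;

	return ps;	/* zero-mask catchall: MF_MSG_UNKNOWN / me_unknown */
}
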
diff --git a/mm/memory.c b/mm/memory.c
index 17734c3c1183..11b9ca176740 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2081,11 +2081,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2081 goto oom; 2081 goto oom;
2082 cow_user_page(new_page, old_page, address, vma); 2082 cow_user_page(new_page, old_page, address, vma);
2083 } 2083 }
2084 __SetPageUptodate(new_page);
2085 2084
2086 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2085 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2087 goto oom_free_new; 2086 goto oom_free_new;
2088 2087
2088 __SetPageUptodate(new_page);
2089
2089 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2090 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2090 2091
2091 /* 2092 /*
@@ -2689,6 +2690,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2689 page = alloc_zeroed_user_highpage_movable(vma, address); 2690 page = alloc_zeroed_user_highpage_movable(vma, address);
2690 if (!page) 2691 if (!page)
2691 goto oom; 2692 goto oom;
2693
2694 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2695 goto oom_free_page;
2696
2692 /* 2697 /*
2693 * The memory barrier inside __SetPageUptodate makes sure that 2698 * The memory barrier inside __SetPageUptodate makes sure that
2694 * preceeding stores to the page contents become visible before 2699 * preceeding stores to the page contents become visible before
@@ -2696,9 +2701,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2696 */ 2701 */
2697 __SetPageUptodate(page); 2702 __SetPageUptodate(page);
2698 2703
2699 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2700 goto oom_free_page;
2701
2702 entry = mk_pte(page, vma->vm_page_prot); 2704 entry = mk_pte(page, vma->vm_page_prot);
2703 if (vma->vm_flags & VM_WRITE) 2705 if (vma->vm_flags & VM_WRITE)
2704 entry = pte_mkwrite(pte_mkdirty(entry)); 2706 entry = pte_mkwrite(pte_mkdirty(entry));
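
Both memory.c hunks make the same reordering: the memcg charge now happens before __SetPageUptodate(). Read together, the do_anonymous_page() path ends up ordered as in this reduced sketch (error labels replaced by direct returns, mapping steps elided):

/* Reduced sketch of the resulting order; not the full fault path. */
static int example_anon_fault_order(struct mm_struct *mm,
				    struct vm_area_struct *vma,
				    unsigned long address)
{
	struct mem_cgroup *memcg;
	struct page *page;

	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		return VM_FAULT_OOM;

	/* charge first ... */
	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
		page_cache_release(page);
		return VM_FAULT_OOM;
	}

	/* ... then publish the zeroed contents */
	__SetPageUptodate(page);

	/* mk_pte(), set_pte_at() and friends follow here (elided) */
	return 0;
}
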
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9e88f749aa51..26fbba7d888f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -513,6 +513,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
513 break; 513 break;
514 err = 0; 514 err = 0;
515 } 515 }
516 vmemmap_populate_print_last();
516 517
517 return err; 518 return err;
518} 519}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 747743237d9f..99d4c1d0b858 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1972,35 +1972,41 @@ retry_cpuset:
1972 pol = get_vma_policy(vma, addr); 1972 pol = get_vma_policy(vma, addr);
1973 cpuset_mems_cookie = read_mems_allowed_begin(); 1973 cpuset_mems_cookie = read_mems_allowed_begin();
1974 1974
1975 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && 1975 if (pol->mode == MPOL_INTERLEAVE) {
1976 pol->mode != MPOL_INTERLEAVE)) { 1976 unsigned nid;
1977
1978 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1979 mpol_cond_put(pol);
1980 page = alloc_page_interleave(gfp, order, nid);
1981 goto out;
1982 }
1983
1984 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1985 int hpage_node = node;
1986
1977 /* 1987 /*
1978 * For hugepage allocation and non-interleave policy which 1988 * For hugepage allocation and non-interleave policy which
1979 * allows the current node, we only try to allocate from the 1989 * allows the current node (or other explicitly preferred
1980 * current node and don't fall back to other nodes, as the 1990 * node) we only try to allocate from the current/preferred
1981 * cost of remote accesses would likely offset THP benefits. 1991 * node and don't fall back to other nodes, as the cost of
1992 * remote accesses would likely offset THP benefits.
1982 * 1993 *
1983 * If the policy is interleave, or does not allow the current 1994 * If the policy is interleave, or does not allow the current
1984 * node in its nodemask, we allocate the standard way. 1995 * node in its nodemask, we allocate the standard way.
1985 */ 1996 */
1997 if (pol->mode == MPOL_PREFERRED &&
1998 !(pol->flags & MPOL_F_LOCAL))
1999 hpage_node = pol->v.preferred_node;
2000
1986 nmask = policy_nodemask(gfp, pol); 2001 nmask = policy_nodemask(gfp, pol);
1987 if (!nmask || node_isset(node, *nmask)) { 2002 if (!nmask || node_isset(hpage_node, *nmask)) {
1988 mpol_cond_put(pol); 2003 mpol_cond_put(pol);
1989 page = alloc_pages_exact_node(node, 2004 page = alloc_pages_exact_node(hpage_node,
1990 gfp | __GFP_THISNODE, order); 2005 gfp | __GFP_THISNODE, order);
1991 goto out; 2006 goto out;
1992 } 2007 }
1993 } 2008 }
1994 2009
1995 if (pol->mode == MPOL_INTERLEAVE) {
1996 unsigned nid;
1997
1998 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1999 mpol_cond_put(pol);
2000 page = alloc_page_interleave(gfp, order, nid);
2001 goto out;
2002 }
2003
2004 nmask = policy_nodemask(gfp, pol); 2010 nmask = policy_nodemask(gfp, pol);
2005 zl = policy_zonelist(gfp, pol, node); 2011 zl = policy_zonelist(gfp, pol, node);
2006 mpol_cond_put(pol); 2012 mpol_cond_put(pol);
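
After the reorder, alloc_pages_vma() handles MPOL_INTERLEAVE first and then, for transparent hugepage faults, allocates strictly on a single node: the local node by default, or the node named by an MPOL_PREFERRED policy without MPOL_F_LOCAL, always with __GFP_THISNODE so a THP is never pulled from a remote node. A condensed sketch of just that node choice (policy refcounting elided):

/* Condensed sketch of the THP node choice above. */
static struct page *example_thp_alloc(struct mempolicy *pol, gfp_t gfp,
				      int order, int node)
{
	int hpage_node = node;		/* default: current node */
	nodemask_t *nmask;

	if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
		hpage_node = pol->v.preferred_node;	/* explicit preference */

	nmask = policy_nodemask(gfp, pol);
	if (!nmask || node_isset(hpage_node, *nmask))
		return alloc_pages_exact_node(hpage_node,
					      gfp | __GFP_THISNODE, order);

	return NULL;	/* policy forbids that node: caller falls back */
}
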
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d934b13b..0a1cc133f6d7 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
78 &this_end, NULL) {
78 this_start = clamp(this_start, start, end); 79 this_start = clamp(this_start, start, end);
79 this_end = clamp(this_end, start, end); 80 this_end = clamp(this_end, start, end);
80 if (this_start < this_end) { 81 if (this_start < this_end) {
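
memtest keeps its previous behaviour by passing MEMBLOCK_NONE to the now flag-aware iterator. A caller that wanted to restrict itself to mirrored ranges would pass MEMBLOCK_MIRROR instead, for example:

/* Illustrative only: walk just the mirrored free ranges. */
static void __init example_dump_mirrored_ranges(void)
{
	phys_addr_t this_start, this_end;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_MIRROR,
				&this_start, &this_end, NULL)
		pr_info("mirrored free range: %pa - %pa\n",
			&this_start, &this_end);
}
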
diff --git a/mm/migrate.c b/mm/migrate.c
index f53838fe3dfe..ee401e4e5ef1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -918,7 +918,8 @@ out:
918static ICE_noinline int unmap_and_move(new_page_t get_new_page, 918static ICE_noinline int unmap_and_move(new_page_t get_new_page,
919 free_page_t put_new_page, 919 free_page_t put_new_page,
920 unsigned long private, struct page *page, 920 unsigned long private, struct page *page,
921 int force, enum migrate_mode mode) 921 int force, enum migrate_mode mode,
922 enum migrate_reason reason)
922{ 923{
923 int rc = 0; 924 int rc = 0;
924 int *result = NULL; 925 int *result = NULL;
@@ -949,7 +950,8 @@ out:
949 list_del(&page->lru); 950 list_del(&page->lru);
950 dec_zone_page_state(page, NR_ISOLATED_ANON + 951 dec_zone_page_state(page, NR_ISOLATED_ANON +
951 page_is_file_cache(page)); 952 page_is_file_cache(page));
952 putback_lru_page(page); 953 if (reason != MR_MEMORY_FAILURE)
954 putback_lru_page(page);
953 } 955 }
954 956
955 /* 957 /*
@@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1122 pass > 2, mode); 1124 pass > 2, mode);
1123 else 1125 else
1124 rc = unmap_and_move(get_new_page, put_new_page, 1126 rc = unmap_and_move(get_new_page, put_new_page,
1125 private, page, pass > 2, mode); 1127 private, page, pass > 2, mode,
1128 reason);
1126 1129
1127 switch(rc) { 1130 switch(rc) {
1128 case -ENOMEM: 1131 case -ENOMEM:
@@ -1796,7 +1799,7 @@ fail_putback:
1796 */ 1799 */
1797 flush_cache_range(vma, mmun_start, mmun_end); 1800 flush_cache_range(vma, mmun_start, mmun_end);
1798 page_add_anon_rmap(new_page, vma, mmun_start); 1801 page_add_anon_rmap(new_page, vma, mmun_start);
1799 pmdp_clear_flush_notify(vma, mmun_start, pmd); 1802 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
1800 set_pmd_at(mm, mmun_start, pmd, entry); 1803 set_pmd_at(mm, mmun_start, pmd, entry);
1801 flush_tlb_range(vma, mmun_start, mmun_end); 1804 flush_tlb_range(vma, mmun_start, mmun_end);
1802 update_mmu_cache_pmd(vma, address, &entry); 1805 update_mmu_cache_pmd(vma, address, &entry);
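
The new reason argument lets unmap_and_move() skip putback_lru_page() when a migration was triggered by memory failure, so a hwpoisoned source page is not handed back to the LRU behind the hwpoison code's back. The caller that benefits is soft offline; its migrate_pages() call looks roughly like this sketch (reconstructed from the mainline soft-offline path, not shown in this diff; new_page is the allocation callback defined in memory-failure.c):

/* Sketch of the soft-offline call site that passes MR_MEMORY_FAILURE. */
static int example_soft_offline_migrate(struct page *page)
{
	LIST_HEAD(pagelist);
	int ret;

	list_add(&page->lru, &pagelist);
	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
			    MIGRATE_SYNC, MR_MEMORY_FAILURE);
	/* on failure the real caller puts the page back itself (elided) */
	return ret;
}
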
diff --git a/mm/mmap.c b/mm/mmap.c
index bb50cacc3ea5..aa632ade2be7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1258 1258
1259 *populate = 0; 1259 *populate = 0;
1260 1260
1261 if (!len)
1262 return -EINVAL;
1263
1261 /* 1264 /*
1262 * Does the application expect PROT_READ to imply PROT_EXEC? 1265 * Does the application expect PROT_READ to imply PROT_EXEC?
1263 * 1266 *
@@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1268 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1271 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1269 prot |= PROT_EXEC; 1272 prot |= PROT_EXEC;
1270 1273
1271 if (!len)
1272 return -EINVAL;
1273
1274 if (!(flags & MAP_FIXED)) 1274 if (!(flags & MAP_FIXED))
1275 addr = round_hint_to_min(addr); 1275 addr = round_hint_to_min(addr);
1276 1276
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88584838e704..e7d6f1171ecb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,8 @@
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32#include "internal.h"
33
32/* 34/*
33 * For a prot_numa update we only hold mmap_sem for read so there is a 35 * For a prot_numa update we only hold mmap_sem for read so there is a
34 * potential race with faulting where a pmd was temporarily none. This 36 * potential race with faulting where a pmd was temporarily none. This
@@ -322,6 +324,15 @@ success:
322 change_protection(vma, start, end, vma->vm_page_prot, 324 change_protection(vma, start, end, vma->vm_page_prot,
323 dirty_accountable, 0); 325 dirty_accountable, 0);
324 326
327 /*
328 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
329 * fault on access.
330 */
331 if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
332 (newflags & VM_WRITE)) {
333 populate_vma_page_range(vma, start, end, NULL);
334 }
335
325 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 336 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
326 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 337 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
327 perf_event_mmap(vma); 338 perf_event_mmap(vma);
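
Note on the mm/mprotect.c hunks above: the block added to mprotect's success path pre-faults a private, mlocked VMA as soon as write permission is granted, so the COW copies are made up front instead of through major faults later (which is also why mm/internal.h is now included, for populate_vma_page_range()). The flag test is the interesting part; the sketch below only exercises that predicate -- the VM_* constants and the stub are illustrative, not taken from kernel headers.

#include <stdbool.h>
#include <stdio.h>

#define VM_WRITE   0x2UL
#define VM_SHARED  0x8UL
#define VM_LOCKED  0x2000UL   /* example values, chosen for this sketch */

/*
 * Mirror of the added condition: the old flags must be locked, private
 * (no VM_SHARED) and not yet writable, and the new flags must grant
 * write -- only then is the range populated eagerly.
 */
static bool should_populate(unsigned long oldflags, unsigned long newflags)
{
        return (oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
               (newflags & VM_WRITE);
}

int main(void)
{
        printf("%d\n", should_populate(VM_LOCKED, VM_LOCKED | VM_WRITE));              /* 1 */
        printf("%d\n", should_populate(VM_LOCKED | VM_SHARED, VM_LOCKED | VM_WRITE));  /* 0: shared mapping */
        printf("%d\n", should_populate(VM_LOCKED | VM_WRITE, VM_LOCKED | VM_WRITE));   /* 0: was already writable */
        return 0;
}
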
diff --git a/mm/mremap.c b/mm/mremap.c
index 034e2d360652..a7c93eceb1c8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
22#include <linux/mmu_notifier.h> 22#include <linux/mmu_notifier.h>
23#include <linux/sched/sysctl.h> 23#include <linux/sched/sysctl.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/mm-arch-hooks.h>
25 26
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 28#include <asm/tlbflush.h>
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 old_len = new_len; 287 old_len = new_len;
287 old_addr = new_addr; 288 old_addr = new_addr;
288 new_addr = -ENOMEM; 289 new_addr = -ENOMEM;
289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) { 290 } else {
290 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); 291 if (vma->vm_file && vma->vm_file->f_op->mremap) {
291 if (err < 0) { 292 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
292 move_page_tables(new_vma, new_addr, vma, old_addr, 293 if (err < 0) {
293 moved_len, true); 294 move_page_tables(new_vma, new_addr, vma,
294 return err; 295 old_addr, moved_len, true);
296 return err;
297 }
295 } 298 }
299 arch_remap(mm, old_addr, old_addr + old_len,
300 new_addr, new_addr + new_len);
296 } 301 }
297 302
298 /* Conceal VM_ACCOUNT so old reservation is not undone */ 303 /* Conceal VM_ACCOUNT so old reservation is not undone */
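
Note on the mm/mremap.c hunks above: move_vma() now calls an arch_remap() hook (pulled in via the new linux/mm-arch-hooks.h include) once the page tables have been moved successfully and any f_op->mremap callback has succeeded. The generic hook is presumably an empty stub; the rough userspace model below only illustrates the kind of per-mm fix-up an architecture-specific version might perform, with cached_user_addr standing in for whatever state an arch keys by user addresses.

#include <stdio.h>

struct mm { unsigned long cached_user_addr; };

/*
 * Model of an architecture-specific arch_remap(): if this mm caches a
 * user-space address inside the moved range, relocate it to the new
 * range.  Purely illustrative -- the real hook takes an mm_struct.
 */
static void arch_remap(struct mm *mm,
                       unsigned long old_start, unsigned long old_end,
                       unsigned long new_start, unsigned long new_end)
{
        (void)new_end;
        if (mm->cached_user_addr >= old_start && mm->cached_user_addr < old_end)
                mm->cached_user_addr = new_start +
                                       (mm->cached_user_addr - old_start);
}

int main(void)
{
        struct mm mm = { .cached_user_addr = 0x1008 };

        /* Pretend move_vma() just relocated [0x1000, 0x2000) to 0x5000. */
        arch_remap(&mm, 0x1000, 0x2000, 0x5000, 0x6000);
        printf("cached address is now %#lx\n", mm.cached_user_addr);  /* 0x5008 */
        return 0;
}
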
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e..5258386fa1be 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
37{ 37{
38 void *ptr; 38 void *ptr;
39 u64 addr; 39 u64 addr;
40 ulong flags = choose_memblock_flags();
40 41
41 if (limit > memblock.current_limit) 42 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 43 limit = memblock.current_limit;
43 44
44 addr = memblock_find_in_range_node(size, align, goal, limit, nid); 45again:
46 addr = memblock_find_in_range_node(size, align, goal, limit, nid,
47 flags);
48 if (!addr && (flags & MEMBLOCK_MIRROR)) {
49 flags &= ~MEMBLOCK_MIRROR;
50 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
51 &size);
52 goto again;
53 }
45 if (!addr) 54 if (!addr)
46 return NULL; 55 return NULL;
47 56
@@ -121,7 +130,8 @@ static unsigned long __init free_low_memory_core_early(void)
121 130
122 memblock_clear_hotplug(0, -1); 131 memblock_clear_hotplug(0, -1);
123 132
124 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) 133 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
134 NULL)
125 count += __free_memory_core(start, end); 135 count += __free_memory_core(start, end);
126 136
127#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 137#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
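
Note on the mm/nobootmem.c hunks above (and the matching flags argument added in mm/memtest.c): early allocations now ask memblock for ranges matching choose_memblock_flags() -- for example mirrored memory -- and, if no suitable range exists, drop the MEMBLOCK_MIRROR bit, warn, and retry without the restriction. The "try the preferred attribute, then fall back" loop is easy to model in isolation; everything below is a stand-in for the memblock API, not the real interface.

#include <stdio.h>

#define MEMBLOCK_NONE    0x0UL
#define MEMBLOCK_MIRROR  0x2UL   /* stand-in flag values */

/* Pretend allocator: mirrored memory is exhausted, plain memory is not. */
static unsigned long find_in_range(unsigned long size, unsigned long flags)
{
        if (flags & MEMBLOCK_MIRROR)
                return 0;                       /* no mirrored range large enough */
        return size ? 0x100000UL : 0;           /* fake physical address */
}

static unsigned long alloc_core_early(unsigned long size, unsigned long flags)
{
        unsigned long addr;

again:
        addr = find_in_range(size, flags);
        if (!addr && (flags & MEMBLOCK_MIRROR)) {
                flags &= ~MEMBLOCK_MIRROR;
                fprintf(stderr,
                        "Could not allocate %lu bytes of mirrored memory\n", size);
                goto again;                     /* retry without the mirror requirement */
        }
        return addr;
}

int main(void)
{
        printf("got 0x%lx\n", alloc_core_early(4096, MEMBLOCK_MIRROR));
        return 0;
}
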
diff --git a/mm/nommu.c b/mm/nommu.c
index e544508e2a4b..05e7447d960b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,22 +42,6 @@
42#include <asm/mmu_context.h> 42#include <asm/mmu_context.h>
43#include "internal.h" 43#include "internal.h"
44 44
45#if 0
46#define kenter(FMT, ...) \
47 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
48#define kleave(FMT, ...) \
49 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
50#define kdebug(FMT, ...) \
51 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
52#else
53#define kenter(FMT, ...) \
54 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
55#define kleave(FMT, ...) \
56 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
57#define kdebug(FMT, ...) \
58 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
59#endif
60
61void *high_memory; 45void *high_memory;
62EXPORT_SYMBOL(high_memory); 46EXPORT_SYMBOL(high_memory);
63struct page *mem_map; 47struct page *mem_map;
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to)
665 for (; from < to; from += PAGE_SIZE) { 649 for (; from < to; from += PAGE_SIZE) {
666 struct page *page = virt_to_page(from); 650 struct page *page = virt_to_page(from);
667 651
668 kdebug("- free %lx", from);
669 atomic_long_dec(&mmap_pages_allocated); 652 atomic_long_dec(&mmap_pages_allocated);
670 if (page_count(page) != 1)
671 kdebug("free page %p: refcount not one: %d",
672 page, page_count(page));
673 put_page(page); 653 put_page(page);
674 } 654 }
675} 655}
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to)
683static void __put_nommu_region(struct vm_region *region) 663static void __put_nommu_region(struct vm_region *region)
684 __releases(nommu_region_sem) 664 __releases(nommu_region_sem)
685{ 665{
686 kenter("%p{%d}", region, region->vm_usage);
687
688 BUG_ON(!nommu_region_tree.rb_node); 666 BUG_ON(!nommu_region_tree.rb_node);
689 667
690 if (--region->vm_usage == 0) { 668 if (--region->vm_usage == 0) {
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region)
697 675
698 /* IO memory and memory shared directly out of the pagecache 676 /* IO memory and memory shared directly out of the pagecache
699 * from ramfs/tmpfs mustn't be released here */ 677 * from ramfs/tmpfs mustn't be released here */
700 if (region->vm_flags & VM_MAPPED_COPY) { 678 if (region->vm_flags & VM_MAPPED_COPY)
701 kdebug("free series");
702 free_page_series(region->vm_start, region->vm_top); 679 free_page_series(region->vm_start, region->vm_top);
703 }
704 kmem_cache_free(vm_region_jar, region); 680 kmem_cache_free(vm_region_jar, region);
705 } else { 681 } else {
706 up_write(&nommu_region_sem); 682 up_write(&nommu_region_sem);
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
744 struct address_space *mapping; 720 struct address_space *mapping;
745 struct rb_node **p, *parent, *rb_prev; 721 struct rb_node **p, *parent, *rb_prev;
746 722
747 kenter(",%p", vma);
748
749 BUG_ON(!vma->vm_region); 723 BUG_ON(!vma->vm_region);
750 724
751 mm->map_count++; 725 mm->map_count++;
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
813 struct mm_struct *mm = vma->vm_mm; 787 struct mm_struct *mm = vma->vm_mm;
814 struct task_struct *curr = current; 788 struct task_struct *curr = current;
815 789
816 kenter("%p", vma);
817
818 protect_vma(vma, 0); 790 protect_vma(vma, 0);
819 791
820 mm->map_count--; 792 mm->map_count--;
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
854 */ 826 */
855static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) 827static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
856{ 828{
857 kenter("%p", vma);
858 if (vma->vm_ops && vma->vm_ops->close) 829 if (vma->vm_ops && vma->vm_ops->close)
859 vma->vm_ops->close(vma); 830 vma->vm_ops->close(vma);
860 if (vma->vm_file) 831 if (vma->vm_file)
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file,
957 int ret; 928 int ret;
958 929
959 /* do the simple checks first */ 930 /* do the simple checks first */
960 if (flags & MAP_FIXED) { 931 if (flags & MAP_FIXED)
961 printk(KERN_DEBUG
962 "%d: Can't do fixed-address/overlay mmap of RAM\n",
963 current->pid);
964 return -EINVAL; 932 return -EINVAL;
965 }
966 933
967 if ((flags & MAP_TYPE) != MAP_PRIVATE && 934 if ((flags & MAP_TYPE) != MAP_PRIVATE &&
968 (flags & MAP_TYPE) != MAP_SHARED) 935 (flags & MAP_TYPE) != MAP_SHARED)
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file,
1060 ) { 1027 ) {
1061 capabilities &= ~NOMMU_MAP_DIRECT; 1028 capabilities &= ~NOMMU_MAP_DIRECT;
1062 if (flags & MAP_SHARED) { 1029 if (flags & MAP_SHARED) {
1063 printk(KERN_WARNING 1030 pr_warn("MAP_SHARED not completely supported on !MMU\n");
1064 "MAP_SHARED not completely supported on !MMU\n");
1065 return -EINVAL; 1031 return -EINVAL;
1066 } 1032 }
1067 } 1033 }
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1205 * we're allocating is smaller than a page 1171 * we're allocating is smaller than a page
1206 */ 1172 */
1207 order = get_order(len); 1173 order = get_order(len);
1208 kdebug("alloc order %d for %lx", order, len);
1209
1210 total = 1 << order; 1174 total = 1 << order;
1211 point = len >> PAGE_SHIFT; 1175 point = len >> PAGE_SHIFT;
1212 1176
1213 /* we don't want to allocate a power-of-2 sized page set */ 1177 /* we don't want to allocate a power-of-2 sized page set */
1214 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1178 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
1215 total = point; 1179 total = point;
1216 kdebug("try to alloc exact %lu pages", total);
1217 }
1218 1180
1219 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); 1181 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
1220 if (!base) 1182 if (!base)
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1285 unsigned long capabilities, vm_flags, result; 1247 unsigned long capabilities, vm_flags, result;
1286 int ret; 1248 int ret;
1287 1249
1288 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1289
1290 *populate = 0; 1250 *populate = 0;
1291 1251
1292 /* decide whether we should attempt the mapping, and if so what sort of 1252 /* decide whether we should attempt the mapping, and if so what sort of
1293 * mapping */ 1253 * mapping */
1294 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1254 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1295 &capabilities); 1255 &capabilities);
1296 if (ret < 0) { 1256 if (ret < 0)
1297 kleave(" = %d [val]", ret);
1298 return ret; 1257 return ret;
1299 }
1300 1258
1301 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1302 addr = 0; 1260 addr = 0;
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1383 vma->vm_start = start; 1341 vma->vm_start = start;
1384 vma->vm_end = start + len; 1342 vma->vm_end = start + len;
1385 1343
1386 if (pregion->vm_flags & VM_MAPPED_COPY) { 1344 if (pregion->vm_flags & VM_MAPPED_COPY)
1387 kdebug("share copy");
1388 vma->vm_flags |= VM_MAPPED_COPY; 1345 vma->vm_flags |= VM_MAPPED_COPY;
1389 } else { 1346 else {
1390 kdebug("share mmap");
1391 ret = do_mmap_shared_file(vma); 1347 ret = do_mmap_shared_file(vma);
1392 if (ret < 0) { 1348 if (ret < 0) {
1393 vma->vm_region = NULL; 1349 vma->vm_region = NULL;
@@ -1467,7 +1423,6 @@ share:
1467 1423
1468 up_write(&nommu_region_sem); 1424 up_write(&nommu_region_sem);
1469 1425
1470 kleave(" = %lx", result);
1471 return result; 1426 return result;
1472 1427
1473error_just_free: 1428error_just_free:
@@ -1479,27 +1434,24 @@ error:
1479 if (vma->vm_file) 1434 if (vma->vm_file)
1480 fput(vma->vm_file); 1435 fput(vma->vm_file);
1481 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1482 kleave(" = %d", ret);
1483 return ret; 1437 return ret;
1484 1438
1485sharing_violation: 1439sharing_violation:
1486 up_write(&nommu_region_sem); 1440 up_write(&nommu_region_sem);
1487 printk(KERN_WARNING "Attempt to share mismatched mappings\n"); 1441 pr_warn("Attempt to share mismatched mappings\n");
1488 ret = -EINVAL; 1442 ret = -EINVAL;
1489 goto error; 1443 goto error;
1490 1444
1491error_getting_vma: 1445error_getting_vma:
1492 kmem_cache_free(vm_region_jar, region); 1446 kmem_cache_free(vm_region_jar, region);
1493 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1447 pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
1494 " from process %d failed\n", 1448 len, current->pid);
1495 len, current->pid);
1496 show_free_areas(0); 1449 show_free_areas(0);
1497 return -ENOMEM; 1450 return -ENOMEM;
1498 1451
1499error_getting_region: 1452error_getting_region:
1500 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1453 pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
1501 " from process %d failed\n", 1454 len, current->pid);
1502 len, current->pid);
1503 show_free_areas(0); 1455 show_free_areas(0);
1504 return -ENOMEM; 1456 return -ENOMEM;
1505} 1457}
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1563 struct vm_region *region; 1515 struct vm_region *region;
1564 unsigned long npages; 1516 unsigned long npages;
1565 1517
1566 kenter("");
1567
1568 /* we're only permitted to split anonymous regions (these should have 1518 /* we're only permitted to split anonymous regions (these should have
1569 * only a single usage on the region) */ 1519 * only a single usage on the region) */
1570 if (vma->vm_file) 1520 if (vma->vm_file)
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm,
1628{ 1578{
1629 struct vm_region *region; 1579 struct vm_region *region;
1630 1580
1631 kenter("");
1632
1633 /* adjust the VMA's pointers, which may reposition it in the MM's tree 1581 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1634 * and list */ 1582 * and list */
1635 delete_vma_from_mm(vma); 1583 delete_vma_from_mm(vma);
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1669 unsigned long end; 1617 unsigned long end;
1670 int ret; 1618 int ret;
1671 1619
1672 kenter(",%lx,%zx", start, len);
1673
1674 len = PAGE_ALIGN(len); 1620 len = PAGE_ALIGN(len);
1675 if (len == 0) 1621 if (len == 0)
1676 return -EINVAL; 1622 return -EINVAL;
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1682 if (!vma) { 1628 if (!vma) {
1683 static int limit; 1629 static int limit;
1684 if (limit < 5) { 1630 if (limit < 5) {
1685 printk(KERN_WARNING 1631 pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
1686 "munmap of memory not mmapped by process %d" 1632 current->pid, current->comm,
1687 " (%s): 0x%lx-0x%lx\n", 1633 start, start + len - 1);
1688 current->pid, current->comm,
1689 start, start + len - 1);
1690 limit++; 1634 limit++;
1691 } 1635 }
1692 return -EINVAL; 1636 return -EINVAL;
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1695 /* we're allowed to split an anonymous VMA but not a file-backed one */ 1639 /* we're allowed to split an anonymous VMA but not a file-backed one */
1696 if (vma->vm_file) { 1640 if (vma->vm_file) {
1697 do { 1641 do {
1698 if (start > vma->vm_start) { 1642 if (start > vma->vm_start)
1699 kleave(" = -EINVAL [miss]");
1700 return -EINVAL; 1643 return -EINVAL;
1701 }
1702 if (end == vma->vm_end) 1644 if (end == vma->vm_end)
1703 goto erase_whole_vma; 1645 goto erase_whole_vma;
1704 vma = vma->vm_next; 1646 vma = vma->vm_next;
1705 } while (vma); 1647 } while (vma);
1706 kleave(" = -EINVAL [split file]");
1707 return -EINVAL; 1648 return -EINVAL;
1708 } else { 1649 } else {
1709 /* the chunk must be a subset of the VMA found */ 1650 /* the chunk must be a subset of the VMA found */
1710 if (start == vma->vm_start && end == vma->vm_end) 1651 if (start == vma->vm_start && end == vma->vm_end)
1711 goto erase_whole_vma; 1652 goto erase_whole_vma;
1712 if (start < vma->vm_start || end > vma->vm_end) { 1653 if (start < vma->vm_start || end > vma->vm_end)
1713 kleave(" = -EINVAL [superset]");
1714 return -EINVAL; 1654 return -EINVAL;
1715 } 1655 if (start & ~PAGE_MASK)
1716 if (start & ~PAGE_MASK) {
1717 kleave(" = -EINVAL [unaligned start]");
1718 return -EINVAL; 1656 return -EINVAL;
1719 } 1657 if (end != vma->vm_end && end & ~PAGE_MASK)
1720 if (end != vma->vm_end && end & ~PAGE_MASK) {
1721 kleave(" = -EINVAL [unaligned split]");
1722 return -EINVAL; 1658 return -EINVAL;
1723 }
1724 if (start != vma->vm_start && end != vma->vm_end) { 1659 if (start != vma->vm_start && end != vma->vm_end) {
1725 ret = split_vma(mm, vma, start, 1); 1660 ret = split_vma(mm, vma, start, 1);
1726 if (ret < 0) { 1661 if (ret < 0)
1727 kleave(" = %d [split]", ret);
1728 return ret; 1662 return ret;
1729 }
1730 } 1663 }
1731 return shrink_vma(mm, vma, start, end); 1664 return shrink_vma(mm, vma, start, end);
1732 } 1665 }
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1734erase_whole_vma: 1667erase_whole_vma:
1735 delete_vma_from_mm(vma); 1668 delete_vma_from_mm(vma);
1736 delete_vma(mm, vma); 1669 delete_vma(mm, vma);
1737 kleave(" = 0");
1738 return 0; 1670 return 0;
1739} 1671}
1740EXPORT_SYMBOL(do_munmap); 1672EXPORT_SYMBOL(do_munmap);
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm)
1766 if (!mm) 1698 if (!mm)
1767 return; 1699 return;
1768 1700
1769 kenter("");
1770
1771 mm->total_vm = 0; 1701 mm->total_vm = 0;
1772 1702
1773 while ((vma = mm->mmap)) { 1703 while ((vma = mm->mmap)) {
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm)
1776 delete_vma(mm, vma); 1706 delete_vma(mm, vma);
1777 cond_resched(); 1707 cond_resched();
1778 } 1708 }
1779
1780 kleave("");
1781} 1709}
1782 1710
1783unsigned long vm_brk(unsigned long addr, unsigned long len) 1711unsigned long vm_brk(unsigned long addr, unsigned long len)
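
Note on the mm/nommu.c hunks above: this is mostly noise reduction -- the private kenter()/kleave()/kdebug() tracing macros (compiled out via no_printk() in the old code) are deleted, and the remaining printk(KERN_WARNING ...) calls become pr_warn() with their message strings joined onto single lines. The pattern being removed, a debug macro that keeps printf-style format checking but generates no code when disabled, looks roughly like the userspace sketch below; the macro name is the kernel's, the plumbing is not.

#include <stdio.h>

#define DEBUG_TRACING 0

#if DEBUG_TRACING
#define kenter(FMT, ...) \
        fprintf(stderr, "==> %s(" FMT ")\n", __func__, ##__VA_ARGS__)
#else
/*
 * Disabled: the if (0) branch is never taken and is optimized away, but
 * the compiler still type-checks the format string against its arguments.
 */
#define kenter(FMT, ...) \
        do { if (0) fprintf(stderr, "==> %s(" FMT ")\n", __func__, ##__VA_ARGS__); } while (0)
#endif

static int do_work(int n)
{
        kenter("%d", n);        /* compiles to nothing when DEBUG_TRACING is 0 */
        return n * 2;
}

int main(void)
{
        printf("%d\n", do_work(21));
        return 0;
}
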
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da1b3c9..dff991e0681e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
42int sysctl_panic_on_oom; 42int sysctl_panic_on_oom;
43int sysctl_oom_kill_allocating_task; 43int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45
46DEFINE_MUTEX(oom_lock);
46 47
47#ifdef CONFIG_NUMA 48#ifdef CONFIG_NUMA
48/** 49/**
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
405static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); 406static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
406 407
407bool oom_killer_disabled __read_mostly; 408bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given task as OOM victim. 411 * mark_oom_victim - mark the given task as OOM victim
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_lock held and never after
415 * oom has been disabled already. 415 * oom has been disabled already.
416 */ 416 */
417void mark_tsk_oom_victim(struct task_struct *tsk) 417void mark_oom_victim(struct task_struct *tsk)
418{ 418{
419 WARN_ON(oom_killer_disabled); 419 WARN_ON(oom_killer_disabled);
420 /* OOM killer might race with memcg OOM */ 420 /* OOM killer might race with memcg OOM */
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
431} 431}
432 432
433/** 433/**
434 * unmark_oom_victim - unmarks the current task as OOM victim. 434 * exit_oom_victim - note the exit of an OOM victim
435 *
436 * Wakes up all waiters in oom_killer_disable()
437 */ 435 */
438void unmark_oom_victim(void) 436void exit_oom_victim(void)
439{ 437{
440 if (!test_and_clear_thread_flag(TIF_MEMDIE)) 438 clear_thread_flag(TIF_MEMDIE);
441 return;
442 439
443 down_read(&oom_sem); 440 if (!atomic_dec_return(&oom_victims))
444 /*
445 * There is no need to signal the lasst oom_victim if there
446 * is nobody who cares.
447 */
448 if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
449 wake_up_all(&oom_victims_wait); 441 wake_up_all(&oom_victims_wait);
450 up_read(&oom_sem);
451} 442}
452 443
453/** 444/**
@@ -469,14 +460,14 @@ bool oom_killer_disable(void)
469 * Make sure to not race with an ongoing OOM killer 460 * Make sure to not race with an ongoing OOM killer
470 * and that the current is not the victim. 461 * and that the current is not the victim.
471 */ 462 */
472 down_write(&oom_sem); 463 mutex_lock(&oom_lock);
473 if (test_thread_flag(TIF_MEMDIE)) { 464 if (test_thread_flag(TIF_MEMDIE)) {
474 up_write(&oom_sem); 465 mutex_unlock(&oom_lock);
475 return false; 466 return false;
476 } 467 }
477 468
478 oom_killer_disabled = true; 469 oom_killer_disabled = true;
479 up_write(&oom_sem); 470 mutex_unlock(&oom_lock);
480 471
481 wait_event(oom_victims_wait, !atomic_read(&oom_victims)); 472 wait_event(oom_victims_wait, !atomic_read(&oom_victims));
482 473
@@ -488,9 +479,7 @@ bool oom_killer_disable(void)
488 */ 479 */
489void oom_killer_enable(void) 480void oom_killer_enable(void)
490{ 481{
491 down_write(&oom_sem);
492 oom_killer_disabled = false; 482 oom_killer_disabled = false;
493 up_write(&oom_sem);
494} 483}
495 484
496#define K(x) ((x) << (PAGE_SHIFT-10)) 485#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
517 */ 506 */
518 task_lock(p); 507 task_lock(p);
519 if (p->mm && task_will_free_mem(p)) { 508 if (p->mm && task_will_free_mem(p)) {
520 mark_tsk_oom_victim(p); 509 mark_oom_victim(p);
521 task_unlock(p); 510 task_unlock(p);
522 put_task_struct(p); 511 put_task_struct(p);
523 return; 512 return;
@@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
528 dump_header(p, gfp_mask, order, memcg, nodemask); 517 dump_header(p, gfp_mask, order, memcg, nodemask);
529 518
530 task_lock(p); 519 task_lock(p);
531 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 520 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
532 message, task_pid_nr(p), p->comm, points); 521 message, task_pid_nr(p), p->comm, points);
533 task_unlock(p); 522 task_unlock(p);
534 523
@@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
572 561
573 /* mm cannot safely be dereferenced after task_unlock(victim) */ 562 /* mm cannot safely be dereferenced after task_unlock(victim) */
574 mm = victim->mm; 563 mm = victim->mm;
575 mark_tsk_oom_victim(victim); 564 mark_oom_victim(victim);
576 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 565 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
577 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 566 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
578 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 567 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -645,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
645} 634}
646EXPORT_SYMBOL_GPL(unregister_oom_notifier); 635EXPORT_SYMBOL_GPL(unregister_oom_notifier);
647 636
648/*
649 * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
650 * if a parallel OOM killing is already taking place that includes a zone in
651 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
652 */
653bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
654{
655 struct zoneref *z;
656 struct zone *zone;
657 bool ret = true;
658
659 spin_lock(&zone_scan_lock);
660 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
661 if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
662 ret = false;
663 goto out;
664 }
665
666 /*
667 * Lock each zone in the zonelist under zone_scan_lock so a parallel
668 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
669 */
670 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
671 set_bit(ZONE_OOM_LOCKED, &zone->flags);
672
673out:
674 spin_unlock(&zone_scan_lock);
675 return ret;
676}
677
678/*
679 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
680 * allocation attempts with zonelists containing them may now recall the OOM
681 * killer, if necessary.
682 */
683void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
684{
685 struct zoneref *z;
686 struct zone *zone;
687
688 spin_lock(&zone_scan_lock);
689 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
690 clear_bit(ZONE_OOM_LOCKED, &zone->flags);
691 spin_unlock(&zone_scan_lock);
692}
693
694/** 637/**
695 * __out_of_memory - kill the "best" process when we run out of memory 638 * __out_of_memory - kill the "best" process when we run out of memory
696 * @zonelist: zonelist pointer 639 * @zonelist: zonelist pointer
@@ -704,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
704 * OR try to be smart about which process to kill. Note that we 647 * OR try to be smart about which process to kill. Note that we
705 * don't have to be perfect here, we just have to be good. 648 * don't have to be perfect here, we just have to be good.
706 */ 649 */
707static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 650bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
708 int order, nodemask_t *nodemask, bool force_kill) 651 int order, nodemask_t *nodemask, bool force_kill)
709{ 652{
710 const nodemask_t *mpol_mask; 653 const nodemask_t *mpol_mask;
711 struct task_struct *p; 654 struct task_struct *p;
@@ -715,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
715 enum oom_constraint constraint = CONSTRAINT_NONE; 658 enum oom_constraint constraint = CONSTRAINT_NONE;
716 int killed = 0; 659 int killed = 0;
717 660
661 if (oom_killer_disabled)
662 return false;
663
718 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 664 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
719 if (freed > 0) 665 if (freed > 0)
720 /* Got some memory back in the last second. */ 666 /* Got some memory back in the last second. */
721 return; 667 goto out;
722 668
723 /* 669 /*
724 * If current has a pending SIGKILL or is exiting, then automatically 670 * If current has a pending SIGKILL or is exiting, then automatically
@@ -730,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
730 */ 676 */
731 if (current->mm && 677 if (current->mm &&
732 (fatal_signal_pending(current) || task_will_free_mem(current))) { 678 (fatal_signal_pending(current) || task_will_free_mem(current))) {
733 mark_tsk_oom_victim(current); 679 mark_oom_victim(current);
734 return; 680 goto out;
735 } 681 }
736 682
737 /* 683 /*
@@ -771,32 +717,8 @@ out:
771 */ 717 */
772 if (killed) 718 if (killed)
773 schedule_timeout_killable(1); 719 schedule_timeout_killable(1);
774}
775
776/**
777 * out_of_memory - tries to invoke OOM killer.
778 * @zonelist: zonelist pointer
779 * @gfp_mask: memory allocation flags
780 * @order: amount of memory being requested as a power of 2
781 * @nodemask: nodemask passed to page allocator
782 * @force_kill: true if a task must be killed, even if others are exiting
783 *
784 * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
785 * when it returns false. Otherwise returns true.
786 */
787bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
788 int order, nodemask_t *nodemask, bool force_kill)
789{
790 bool ret = false;
791
792 down_read(&oom_sem);
793 if (!oom_killer_disabled) {
794 __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
795 ret = true;
796 }
797 up_read(&oom_sem);
798 720
799 return ret; 721 return true;
800} 722}
801 723
802/* 724/*
@@ -806,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
806 */ 728 */
807void pagefault_out_of_memory(void) 729void pagefault_out_of_memory(void)
808{ 730{
809 struct zonelist *zonelist;
810
811 down_read(&oom_sem);
812 if (mem_cgroup_oom_synchronize(true)) 731 if (mem_cgroup_oom_synchronize(true))
813 goto unlock; 732 return;
814 733
815 zonelist = node_zonelist(first_memory_node, GFP_KERNEL); 734 if (!mutex_trylock(&oom_lock))
816 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { 735 return;
817 if (!oom_killer_disabled)
818 __out_of_memory(NULL, 0, 0, NULL, false);
819 else
820 /*
821 * There shouldn't be any user tasks runable while the
822 * OOM killer is disabled so the current task has to
823 * be a racing OOM victim for which oom_killer_disable()
824 * is waiting for.
825 */
826 WARN_ON(test_thread_flag(TIF_MEMDIE));
827 736
828 oom_zonelist_unlock(zonelist, GFP_KERNEL); 737 if (!out_of_memory(NULL, 0, 0, NULL, false)) {
738 /*
739 * There shouldn't be any user tasks runnable while the
740 * OOM killer is disabled, so the current task has to
741 * be a racing OOM victim for which oom_killer_disable()
742 * is waiting for.
743 */
744 WARN_ON(test_thread_flag(TIF_MEMDIE));
829 } 745 }
830unlock: 746
831 up_read(&oom_sem); 747 mutex_unlock(&oom_lock);
832} 748}
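
Note on the mm/oom_kill.c hunks above: two serialization mechanisms -- the per-zonelist ZONE_OOM_LOCKED bits and the oom_sem rwsem -- collapse into a single global oom_lock mutex, and out_of_memory() itself now bails out when the killer is disabled. Callers that only want to avoid piling up concurrent OOM kills use mutex_trylock() and treat "somebody else holds the lock" as "somebody else is making progress". A small pthread model of that trylock-and-back-off pattern follows; the names echo the kernel's, but this is a sketch, not the kernel locking code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the expensive, serialized work (picking and killing a task). */
static void kill_something(void)
{
        puts("invoking the OOM killer");
}

static void *allocation_path(void *arg)
{
        (void)arg;
        /*
         * If another thread already holds oom_lock it is presumed to be
         * freeing memory for everyone, so back off and let the caller
         * retry its allocation instead of queuing a second kill.
         */
        if (pthread_mutex_trylock(&oom_lock) != 0) {
                puts("oom_lock busy - assuming someone else is making progress");
                return NULL;
        }
        kill_something();
        pthread_mutex_unlock(&oom_lock);
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, allocation_path, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}
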
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2fd31aebef30..5e6fa06f2784 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
380 } 380 }
381} 381}
382 382
383static inline void prep_zero_page(struct page *page, unsigned int order,
384 gfp_t gfp_flags)
385{
386 int i;
387
388 /*
389 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
390 * and __GFP_HIGHMEM from hard or soft interrupt context.
391 */
392 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
393 for (i = 0; i < (1 << order); i++)
394 clear_highpage(page + i);
395}
396
397#ifdef CONFIG_DEBUG_PAGEALLOC 383#ifdef CONFIG_DEBUG_PAGEALLOC
398unsigned int _debug_guardpage_minorder; 384unsigned int _debug_guardpage_minorder;
399bool _debug_pagealloc_enabled __read_mostly; 385bool _debug_pagealloc_enabled __read_mostly;
@@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
975 kasan_alloc_pages(page, order); 961 kasan_alloc_pages(page, order);
976 962
977 if (gfp_flags & __GFP_ZERO) 963 if (gfp_flags & __GFP_ZERO)
978 prep_zero_page(page, order, gfp_flags); 964 for (i = 0; i < (1 << order); i++)
965 clear_highpage(page + i);
979 966
980 if (order && (gfp_flags & __GFP_COMP)) 967 if (order && (gfp_flags & __GFP_COMP))
981 prep_compound_page(page, order); 968 prep_compound_page(page, order);
@@ -2322,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2322 show_mem(filter); 2309 show_mem(filter);
2323} 2310}
2324 2311
2325static inline int
2326should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2327 unsigned long did_some_progress,
2328 unsigned long pages_reclaimed)
2329{
2330 /* Do not loop if specifically requested */
2331 if (gfp_mask & __GFP_NORETRY)
2332 return 0;
2333
2334 /* Always retry if specifically requested */
2335 if (gfp_mask & __GFP_NOFAIL)
2336 return 1;
2337
2338 /*
2339 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2340 * making forward progress without invoking OOM. Suspend also disables
2341 * storage devices so kswapd will not help. Bail if we are suspending.
2342 */
2343 if (!did_some_progress && pm_suspended_storage())
2344 return 0;
2345
2346 /*
2347 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2348 * means __GFP_NOFAIL, but that may not be true in other
2349 * implementations.
2350 */
2351 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2352 return 1;
2353
2354 /*
2355 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2356 * specified, then we retry until we no longer reclaim any pages
2357 * (above), or we've reclaimed an order of pages at least as
2358 * large as the allocation's order. In both cases, if the
2359 * allocation still fails, we stop retrying.
2360 */
2361 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2362 return 1;
2363
2364 return 0;
2365}
2366
2367static inline struct page * 2312static inline struct page *
2368__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2313__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2369 const struct alloc_context *ac, unsigned long *did_some_progress) 2314 const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2373,10 +2318,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2373 *did_some_progress = 0; 2318 *did_some_progress = 0;
2374 2319
2375 /* 2320 /*
2376 * Acquire the per-zone oom lock for each zone. If that 2321 * Acquire the oom lock. If that fails, somebody else is
2377 * fails, somebody else is making progress for us. 2322 * making progress for us.
2378 */ 2323 */
2379 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { 2324 if (!mutex_trylock(&oom_lock)) {
2380 *did_some_progress = 1; 2325 *did_some_progress = 1;
2381 schedule_timeout_uninterruptible(1); 2326 schedule_timeout_uninterruptible(1);
2382 return NULL; 2327 return NULL;
@@ -2402,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2402 /* The OOM killer does not needlessly kill tasks for lowmem */ 2347 /* The OOM killer does not needlessly kill tasks for lowmem */
2403 if (ac->high_zoneidx < ZONE_NORMAL) 2348 if (ac->high_zoneidx < ZONE_NORMAL)
2404 goto out; 2349 goto out;
2405 /* The OOM killer does not compensate for light reclaim */ 2350 /* The OOM killer does not compensate for IO-less reclaim */
2406 if (!(gfp_mask & __GFP_FS)) { 2351 if (!(gfp_mask & __GFP_FS)) {
2407 /* 2352 /*
2408 * XXX: Page reclaim didn't yield anything, 2353 * XXX: Page reclaim didn't yield anything,
2409 * and the OOM killer can't be invoked, but 2354 * and the OOM killer can't be invoked, but
2410 * keep looping as per should_alloc_retry(). 2355 * keep looping as per tradition.
2411 */ 2356 */
2412 *did_some_progress = 1; 2357 *did_some_progress = 1;
2413 goto out; 2358 goto out;
2414 } 2359 }
2360 if (pm_suspended_storage())
2361 goto out;
2415 /* The OOM killer may not free memory on a specific node */ 2362 /* The OOM killer may not free memory on a specific node */
2416 if (gfp_mask & __GFP_THISNODE) 2363 if (gfp_mask & __GFP_THISNODE)
2417 goto out; 2364 goto out;
@@ -2421,7 +2368,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2421 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) 2368 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2422 *did_some_progress = 1; 2369 *did_some_progress = 1;
2423out: 2370out:
2424 oom_zonelist_unlock(ac->zonelist, gfp_mask); 2371 mutex_unlock(&oom_lock);
2425 return page; 2372 return page;
2426} 2373}
2427 2374
@@ -2794,40 +2741,40 @@ retry:
2794 if (page) 2741 if (page)
2795 goto got_pg; 2742 goto got_pg;
2796 2743
2797 /* Check if we should retry the allocation */ 2744 /* Do not loop if specifically requested */
2745 if (gfp_mask & __GFP_NORETRY)
2746 goto noretry;
2747
2748 /* Keep reclaiming pages as long as there is reasonable progress */
2798 pages_reclaimed += did_some_progress; 2749 pages_reclaimed += did_some_progress;
2799 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2750 if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
2800 pages_reclaimed)) { 2751 ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
2801 /*
2802 * If we fail to make progress by freeing individual
2803 * pages, but the allocation wants us to keep going,
2804 * start OOM killing tasks.
2805 */
2806 if (!did_some_progress) {
2807 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2808 &did_some_progress);
2809 if (page)
2810 goto got_pg;
2811 if (!did_some_progress)
2812 goto nopage;
2813 }
2814 /* Wait for some write requests to complete then retry */ 2752 /* Wait for some write requests to complete then retry */
2815 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); 2753 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2816 goto retry; 2754 goto retry;
2817 } else {
2818 /*
2819 * High-order allocations do not necessarily loop after
2820 * direct reclaim and reclaim/compaction depends on compaction
2821 * being called after reclaim so call directly if necessary
2822 */
2823 page = __alloc_pages_direct_compact(gfp_mask, order,
2824 alloc_flags, ac, migration_mode,
2825 &contended_compaction,
2826 &deferred_compaction);
2827 if (page)
2828 goto got_pg;
2829 } 2755 }
2830 2756
2757 /* Reclaim has failed us, start killing things */
2758 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
2759 if (page)
2760 goto got_pg;
2761
2762 /* Retry as long as the OOM killer is making progress */
2763 if (did_some_progress)
2764 goto retry;
2765
2766noretry:
2767 /*
2768 * High-order allocations do not necessarily loop after
2769 * direct reclaim and reclaim/compaction depends on compaction
2770 * being called after reclaim so call directly if necessary
2771 */
2772 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
2773 ac, migration_mode,
2774 &contended_compaction,
2775 &deferred_compaction);
2776 if (page)
2777 goto got_pg;
2831nopage: 2778nopage:
2832 warn_alloc_failed(gfp_mask, order, NULL); 2779 warn_alloc_failed(gfp_mask, order, NULL);
2833got_pg: 2780got_pg:
@@ -4867,22 +4814,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4867 unsigned long *zones_size, 4814 unsigned long *zones_size,
4868 unsigned long *zholes_size) 4815 unsigned long *zholes_size)
4869{ 4816{
4870 unsigned long realtotalpages, totalpages = 0; 4817 unsigned long realtotalpages = 0, totalpages = 0;
4871 enum zone_type i; 4818 enum zone_type i;
4872 4819
4873 for (i = 0; i < MAX_NR_ZONES; i++) 4820 for (i = 0; i < MAX_NR_ZONES; i++) {
4874 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4821 struct zone *zone = pgdat->node_zones + i;
4875 node_start_pfn, 4822 unsigned long size, real_size;
4876 node_end_pfn, 4823
4877 zones_size); 4824 size = zone_spanned_pages_in_node(pgdat->node_id, i,
4878 pgdat->node_spanned_pages = totalpages; 4825 node_start_pfn,
4879 4826 node_end_pfn,
4880 realtotalpages = totalpages; 4827 zones_size);
4881 for (i = 0; i < MAX_NR_ZONES; i++) 4828 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
4882 realtotalpages -=
4883 zone_absent_pages_in_node(pgdat->node_id, i,
4884 node_start_pfn, node_end_pfn, 4829 node_start_pfn, node_end_pfn,
4885 zholes_size); 4830 zholes_size);
4831 zone->spanned_pages = size;
4832 zone->present_pages = real_size;
4833
4834 totalpages += size;
4835 realtotalpages += real_size;
4836 }
4837
4838 pgdat->node_spanned_pages = totalpages;
4886 pgdat->node_present_pages = realtotalpages; 4839 pgdat->node_present_pages = realtotalpages;
4887 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4840 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4888 realtotalpages); 4841 realtotalpages);
@@ -4992,8 +4945,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4992 * NOTE: pgdat should get zeroed by caller. 4945 * NOTE: pgdat should get zeroed by caller.
4993 */ 4946 */
4994static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4947static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4995 unsigned long node_start_pfn, unsigned long node_end_pfn, 4948 unsigned long node_start_pfn, unsigned long node_end_pfn)
4996 unsigned long *zones_size, unsigned long *zholes_size)
4997{ 4949{
4998 enum zone_type j; 4950 enum zone_type j;
4999 int nid = pgdat->node_id; 4951 int nid = pgdat->node_id;
@@ -5014,12 +4966,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
5014 struct zone *zone = pgdat->node_zones + j; 4966 struct zone *zone = pgdat->node_zones + j;
5015 unsigned long size, realsize, freesize, memmap_pages; 4967 unsigned long size, realsize, freesize, memmap_pages;
5016 4968
5017 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4969 size = zone->spanned_pages;
5018 node_end_pfn, zones_size); 4970 realsize = freesize = zone->present_pages;
5019 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
5020 node_start_pfn,
5021 node_end_pfn,
5022 zholes_size);
5023 4971
5024 /* 4972 /*
5025 * Adjust freesize so that it accounts for how much memory 4973 * Adjust freesize so that it accounts for how much memory
@@ -5054,8 +5002,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
5054 nr_kernel_pages -= memmap_pages; 5002 nr_kernel_pages -= memmap_pages;
5055 nr_all_pages += freesize; 5003 nr_all_pages += freesize;
5056 5004
5057 zone->spanned_pages = size;
5058 zone->present_pages = realsize;
5059 /* 5005 /*
5060 * Set an approximate value for lowmem here, it will be adjusted 5006 * Set an approximate value for lowmem here, it will be adjusted
5061 * when the bootmem allocator frees pages into the buddy system. 5007 * when the bootmem allocator frees pages into the buddy system.
@@ -5161,8 +5107,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5161 (unsigned long)pgdat->node_mem_map); 5107 (unsigned long)pgdat->node_mem_map);
5162#endif 5108#endif
5163 5109
5164 free_area_init_core(pgdat, start_pfn, end_pfn, 5110 free_area_init_core(pgdat, start_pfn, end_pfn);
5165 zones_size, zholes_size);
5166} 5111}
5167 5112
5168#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5113#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6111,9 +6056,9 @@ out:
6111 return ret; 6056 return ret;
6112} 6057}
6113 6058
6059#ifdef CONFIG_NUMA
6114int hashdist = HASHDIST_DEFAULT; 6060int hashdist = HASHDIST_DEFAULT;
6115 6061
6116#ifdef CONFIG_NUMA
6117static int __init set_hashdist(char *str) 6062static int __init set_hashdist(char *str)
6118{ 6063{
6119 if (!str) 6064 if (!str)
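
Note on the mm/page_alloc.c hunks above: prep_zero_page() is folded into its only caller as a clear_highpage() loop, calculate_node_totalpages() now stores each zone's spanned/present counts in the zone itself so free_area_init_core() no longer recomputes them, and -- the main change -- should_alloc_retry() is replaced by explicit control flow in the slow path: __GFP_NORETRY jumps straight to the compaction-then-give-up tail, reclaim is retried while it makes progress (for small orders, or with __GFP_REPEAT until enough pages were reclaimed), only then is the OOM killer tried, and that in turn is retried while it reports progress. The retry skeleton is the part worth seeing in one place; the sketch below keeps only that control flow, with every helper stubbed out and the GFP bits reduced to two booleans.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Stubs: pretend reclaim and the OOM killer find nothing and free nothing. */
static bool direct_reclaim(unsigned long *progress) { *progress = 0; return false; }
static bool oom_kill(unsigned long *progress)       { *progress = 0; return false; }
static bool try_compaction(void)                    { return false; }

/* Returns true if a "page" was obtained; mirrors only the new retry ordering. */
static bool slowpath(unsigned int order, bool noretry, bool repeat)
{
        unsigned long pages_reclaimed = 0, did_some_progress;

retry:
        if (direct_reclaim(&did_some_progress))
                return true;

        if (noretry)
                goto noretry_out;                 /* __GFP_NORETRY: do not loop */

        /* Keep reclaiming as long as there is reasonable progress. */
        pages_reclaimed += did_some_progress;
        if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
            (repeat && pages_reclaimed < (1UL << order)))
                goto retry;

        /* Reclaim has failed us, start killing things. */
        if (oom_kill(&did_some_progress))
                return true;
        if (did_some_progress)
                goto retry;                       /* OOM killer freed something */

noretry_out:
        return try_compaction();                  /* last-ditch high-order attempt */
}

int main(void)
{
        printf("order-2 allocation succeeded: %d\n", slowpath(2, false, false));
        return 0;
}
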
diff --git a/mm/percpu.c b/mm/percpu.c
index dfd02484e8de..2dd74487a0af 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1030,7 +1030,7 @@ area_found:
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1031 1031
1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1033 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size, gfp);
1034 return ptr; 1034 return ptr;
1035 1035
1036fail_unlock: 1036fail_unlock:
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c25f94b33811..6b674e00153c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
119} 119}
120#endif 120#endif
121 121
122#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH 122#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
123#ifdef CONFIG_TRANSPARENT_HUGEPAGE 123#ifdef CONFIG_TRANSPARENT_HUGEPAGE
124pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, 124pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
125 pmd_t *pmdp) 125 pmd_t *pmdp)
126{ 126{
127 pmd_t pmd; 127 pmd_t pmd;
128 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 128 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
129 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); 129 VM_BUG_ON(!pmd_trans_huge(*pmdp));
130 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
130 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 131 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
131 return pmd; 132 return pmd;
132} 133}
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
198} 199}
199#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 200#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
200#endif 201#endif
202
203#ifndef pmdp_collapse_flush
204#ifdef CONFIG_TRANSPARENT_HUGEPAGE
205pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
206 pmd_t *pmdp)
207{
208 /*
209 * pmd and hugepage pte format are same. So we could
210 * use the same function.
211 */
212 pmd_t pmd;
213
214 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
215 VM_BUG_ON(pmd_trans_huge(*pmdp));
216 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
217 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
218 return pmd;
219}
220#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
221#endif
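
Note on the mm/pgtable-generic.c hunks above (and the matching caller rename in mm/migrate.c): pmdp_clear_flush() becomes pmdp_huge_clear_flush() and now asserts it really is handed a huge PMD, while the new generic pmdp_collapse_flush() handles the khugepaged case of clearing a PMD that still points at a page-table page. Both follow the same fetch-and-clear-then-flush discipline: atomically take the entry out, flush the TLB range, and hand the old value back to the caller. That ordering is the transferable idea, modeled below with a C11 atomic standing in for the page-table entry and a printf standing in for the flush.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef _Atomic uint64_t pmd_t;

static void flush_tlb_range(uint64_t start, uint64_t end)
{
        printf("flush TLB for [%#lx, %#lx)\n",
               (unsigned long)start, (unsigned long)end);
}

/*
 * Clear the entry first so later walks see it empty, then flush so no CPU
 * keeps using a stale cached translation, then return the old value to
 * the caller that still needs its contents.
 */
static uint64_t pmdp_huge_clear_flush(pmd_t *pmdp, uint64_t addr, uint64_t size)
{
        uint64_t old = atomic_exchange(pmdp, 0);

        flush_tlb_range(addr, addr + size);
        return old;
}

int main(void)
{
        pmd_t entry = 0xdeadbeef000;
        uint64_t old = pmdp_huge_clear_flush(&entry, 0x200000, 0x200000);

        printf("old entry %#lx, new entry %#lx\n",
               (unsigned long)old, (unsigned long)atomic_load(&entry));
        return 0;
}
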
diff --git a/mm/rmap.c b/mm/rmap.c
index 24dd3f9fee27..7af1ecb21ccb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
625 625
626 pmd = pmd_offset(pud, address); 626 pmd = pmd_offset(pud, address);
627 /* 627 /*
628 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() 628 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
629 * without holding anon_vma lock for write. So when looking for a 629 * without holding anon_vma lock for write. So when looking for a
630 * genuine pmde (in which to find pte), test present and !THP together. 630 * genuine pmde (in which to find pte), test present and !THP together.
631 */ 631 */
@@ -950,7 +950,12 @@ void page_move_anon_rmap(struct page *page,
950 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 950 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
951 951
952 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 952 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
953 page->mapping = (struct address_space *) anon_vma; 953 /*
954 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
955 * simultaneously, so a concurrent reader (eg page_referenced()'s
956 * PageAnon()) will not see one without the other.
957 */
958 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
954} 959}
955 960
956/** 961/**
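
Note on the mm/rmap.c hunks above: page_move_anon_rmap() now publishes the new anon_vma pointer with WRITE_ONCE(), so the pointer and its PAGE_MAPPING_ANON tag bit reach memory as one store that the compiler cannot split, keeping lockless readers such as page_referenced()'s PageAnon() check from seeing one without the other. A stripped-down model of that tagged-pointer publish follows; the WRITE_ONCE/READ_ONCE definitions are simplified userspace equivalents, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define PAGE_MAPPING_ANON 0x1UL

/* Simplified: force a single access through a volatile lvalue. */
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

struct anon_vma { int dummy; };
struct page { uintptr_t mapping; };

static void page_move_anon_rmap(struct page *page, struct anon_vma *av)
{
        /* Pointer and tag bit are combined first, then stored once. */
        WRITE_ONCE(page->mapping, (uintptr_t)av | PAGE_MAPPING_ANON);
}

static int PageAnon(struct page *page)
{
        return (READ_ONCE(page->mapping) & PAGE_MAPPING_ANON) != 0;
}

int main(void)
{
        static struct anon_vma av;
        struct page page = { .mapping = 0 };

        page_move_anon_rmap(&page, &av);
        printf("PageAnon = %d, anon_vma = %p\n", PageAnon(&page),
               (void *)(READ_ONCE(page.mapping) & ~PAGE_MAPPING_ANON));
        return 0;
}
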
diff --git a/mm/shmem.c b/mm/shmem.c
index 3759099d8ce4..4caf8ed24d65 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
569 i_size_write(inode, newsize); 569 i_size_write(inode, newsize);
570 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
571 } 571 }
572 if (newsize < oldsize) { 572 if (newsize <= oldsize) {
573 loff_t holebegin = round_up(newsize, PAGE_SIZE); 573 loff_t holebegin = round_up(newsize, PAGE_SIZE);
574 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 574 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
575 shmem_truncate_range(inode, newsize, (loff_t)-1); 575 shmem_truncate_range(inode, newsize, (loff_t)-1);
diff --git a/mm/slab.c b/mm/slab.c
index 7eb38dd1cefa..200e22412a16 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void)
1454 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", 1454 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
1455 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1455 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1456 slab_state = PARTIAL_NODE; 1456 slab_state = PARTIAL_NODE;
1457 setup_kmalloc_cache_index_table();
1457 1458
1458 slab_early_init = 0; 1459 slab_early_init = 0;
1459 1460
diff --git a/mm/slab.h b/mm/slab.h
index 4c3ac12dd644..8da63e4e470f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags,
71 71
72#ifndef CONFIG_SLOB 72#ifndef CONFIG_SLOB
73/* Kmalloc array related functions */ 73/* Kmalloc array related functions */
74void setup_kmalloc_cache_index_table(void);
74void create_kmalloc_caches(unsigned long); 75void create_kmalloc_caches(unsigned long);
75 76
76/* Find the kmalloc slab corresponding for a certain size */ 77/* Find the kmalloc slab corresponding for a certain size */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 999bb3424d44..9f8d71f78404 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -784,25 +784,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
784} 784}
785 785
786/* 786/*
787 * Create the kmalloc array. Some of the regular kmalloc arrays 787 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
788 * may already have been created because they were needed to 788 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
789 * enable allocations for slab creation. 789 * kmalloc-67108864.
790 */ 790 */
791void __init create_kmalloc_caches(unsigned long flags) 791static struct {
792 const char *name;
793 unsigned long size;
794} const kmalloc_info[] __initconst = {
795 {NULL, 0}, {"kmalloc-96", 96},
796 {"kmalloc-192", 192}, {"kmalloc-8", 8},
797 {"kmalloc-16", 16}, {"kmalloc-32", 32},
798 {"kmalloc-64", 64}, {"kmalloc-128", 128},
799 {"kmalloc-256", 256}, {"kmalloc-512", 512},
800 {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
801 {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
802 {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
803 {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
804 {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
805 {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
806 {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
807 {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
808 {"kmalloc-67108864", 67108864}
809};
810
811/*
812 * Patch up the size_index table if we have strange large alignment
813 * requirements for the kmalloc array. This is only the case for
814 * MIPS it seems. The standard arches will not generate any code here.
815 *
816 * Largest permitted alignment is 256 bytes due to the way we
817 * handle the index determination for the smaller caches.
818 *
819 * Make sure that nothing crazy happens if someone starts tinkering
820 * around with ARCH_KMALLOC_MINALIGN
821 */
822void __init setup_kmalloc_cache_index_table(void)
792{ 823{
793 int i; 824 int i;
794 825
795 /*
796 * Patch up the size_index table if we have strange large alignment
797 * requirements for the kmalloc array. This is only the case for
798 * MIPS it seems. The standard arches will not generate any code here.
799 *
800 * Largest permitted alignment is 256 bytes due to the way we
801 * handle the index determination for the smaller caches.
802 *
803 * Make sure that nothing crazy happens if someone starts tinkering
804 * around with ARCH_KMALLOC_MINALIGN
805 */
806 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 826 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
807 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 827 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
808 828
@@ -833,39 +853,41 @@ void __init create_kmalloc_caches(unsigned long flags)
833 for (i = 128 + 8; i <= 192; i += 8) 853 for (i = 128 + 8; i <= 192; i += 8)
834 size_index[size_index_elem(i)] = 8; 854 size_index[size_index_elem(i)] = 8;
835 } 855 }
836 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 856}
857
858/*
859 * Create the kmalloc array. Some of the regular kmalloc arrays
860 * may already have been created because they were needed to
861 * enable allocations for slab creation.
862 */
863void __init create_kmalloc_caches(unsigned long flags)
864{
865 int i;
866
867 for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
837 if (!kmalloc_caches[i]) { 868 if (!kmalloc_caches[i]) {
838 kmalloc_caches[i] = create_kmalloc_cache(NULL, 869 kmalloc_caches[i] = create_kmalloc_cache(
839 1 << i, flags); 870 kmalloc_info[i].name,
871 kmalloc_info[i].size,
872 flags);
840 } 873 }
841 874
842 /* 875 /*
843 * Caches that are not of the two-to-the-power-of size. 876 * "i == 2" is the "kmalloc-192" case which is the last special
844 * These have to be created immediately after the 877 * case for initialization and it's the point to jump to
845 * earlier power of two caches 878 * allocate the minimize size of the object. In slab allocator,
879 * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
880 * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
881 * defined, it may be larger than 2^5 and here is also the
882 * trick to skip the empty gap.
846 */ 883 */
847 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) 884 if (i == 2)
848 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); 885 i = (KMALLOC_SHIFT_LOW - 1);
849
850 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
851 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
852 } 886 }
853 887
854 /* Kmalloc array is now usable */ 888 /* Kmalloc array is now usable */
855 slab_state = UP; 889 slab_state = UP;
856 890
857 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
858 struct kmem_cache *s = kmalloc_caches[i];
859 char *n;
860
861 if (s) {
862 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
863
864 BUG_ON(!n);
865 s->name = n;
866 }
867 }
868
869#ifdef CONFIG_ZONE_DMA 891#ifdef CONFIG_ZONE_DMA
870 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 892 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
871 struct kmem_cache *s = kmalloc_caches[i]; 893 struct kmem_cache *s = kmalloc_caches[i];
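
Note on the mm/slab_common.c hunks above (with the matching calls added in mm/slab.c, mm/slab.h and mm/slub.c): the size_index fix-ups move out of create_kmalloc_caches() into the new setup_kmalloc_cache_index_table(), now called explicitly by both SLAB and SLUB during kmem_cache_init(), and the kmalloc caches are created from the static kmalloc_info[] name/size table instead of being created nameless and renamed with kasprintf() afterwards -- which, per the patch's own comment, lets boot options like slub_debug=,kmalloc-xx match from the start. The table-driven idea reduces to the sketch below; cache creation is stubbed out and the table is abbreviated.

#include <stdio.h>

struct kmalloc_info_struct {
        const char *name;
        unsigned long size;
};

/* Abbreviated stand-in for the kernel's kmalloc_info[] boot table. */
static const struct kmalloc_info_struct kmalloc_info[] = {
        {NULL, 0},          {"kmalloc-96", 96},   {"kmalloc-192", 192},
        {"kmalloc-8", 8},   {"kmalloc-16", 16},   {"kmalloc-32", 32},
        {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256},
};

static void create_kmalloc_cache(const char *name, unsigned long size)
{
        printf("creating %s (%lu bytes)\n", name, size);
}

int main(void)
{
        /*
         * Index i in the real array is the kmalloc_index() slot, so each
         * cache gets its final, human-readable name at creation time
         * instead of being renamed later.
         */
        for (unsigned int i = 1; i < sizeof(kmalloc_info) / sizeof(kmalloc_info[0]); i++)
                create_kmalloc_cache(kmalloc_info[i].name, kmalloc_info[i].size);
        return 0;
}
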
diff --git a/mm/slub.c b/mm/slub.c
index 54c0876b43d5..816df0016555 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3700,6 +3700,7 @@ void __init kmem_cache_init(void)
3700 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3700 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3701 3701
3702 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3702 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3703 setup_kmalloc_cache_index_table();
3703 create_kmalloc_caches(0); 3704 create_kmalloc_caches(0);
3704 3705
3705#ifdef CONFIG_SMP 3706#ifdef CONFIG_SMP
diff --git a/mm/swap.c b/mm/swap.c
index a7251a8ed532..a3a0a2f1f7c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -131,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
131 * here, see the comment above this function. 131 * here, see the comment above this function.
132 */ 132 */
133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head); 133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
135 if (put_page_testzero(page_head)) { 134 if (put_page_testzero(page_head)) {
136 /* 135 /*
137 * If this is the tail of a slab THP page, 136 * If this is the tail of a slab THP page,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..19ef01e90ac4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2646,7 +2646,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2646 2646
2647 for (i = 0; i <= ZONE_NORMAL; i++) { 2647 for (i = 0; i <= ZONE_NORMAL; i++) {
2648 zone = &pgdat->node_zones[i]; 2648 zone = &pgdat->node_zones[i];
2649 if (!populated_zone(zone)) 2649 if (!populated_zone(zone) ||
2650 zone_reclaimable_pages(zone) == 0)
2650 continue; 2651 continue;
2651 2652
2652 pfmemalloc_reserve += min_wmark_pages(zone); 2653 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3596,7 +3597,7 @@ int zone_reclaim_mode __read_mostly;
3596#define RECLAIM_OFF 0 3597#define RECLAIM_OFF 0
3597#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3598#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3598#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3599#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3599#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3600#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
3600 3601
3601/* 3602/*
3602 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3603 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3638,12 +3639,12 @@ static long zone_pagecache_reclaimable(struct zone *zone)
3638 long delta = 0; 3639 long delta = 0;
3639 3640
3640 /* 3641 /*
3641 * If RECLAIM_SWAP is set, then all file pages are considered 3642 * If RECLAIM_UNMAP is set, then all file pages are considered
3642 * potentially reclaimable. Otherwise, we have to worry about 3643 * potentially reclaimable. Otherwise, we have to worry about
3643 * pages like swapcache and zone_unmapped_file_pages() provides 3644 * pages like swapcache and zone_unmapped_file_pages() provides
3644 * a better estimate 3645 * a better estimate
3645 */ 3646 */
3646 if (zone_reclaim_mode & RECLAIM_SWAP) 3647 if (zone_reclaim_mode & RECLAIM_UNMAP)
3647 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3648 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3648 else 3649 else
3649 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3650 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3674,15 +3675,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3674 .order = order, 3675 .order = order,
3675 .priority = ZONE_RECLAIM_PRIORITY, 3676 .priority = ZONE_RECLAIM_PRIORITY,
3676 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3677 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3677 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3678 .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
3678 .may_swap = 1, 3679 .may_swap = 1,
3679 }; 3680 };
3680 3681
3681 cond_resched(); 3682 cond_resched();
3682 /* 3683 /*
3683 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3684 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
3684 * and we also need to be able to write out pages for RECLAIM_WRITE 3685 * and we also need to be able to write out pages for RECLAIM_WRITE
3685 * and RECLAIM_SWAP. 3686 * and RECLAIM_UNMAP.
3686 */ 3687 */
3687 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3688 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3688 lockdep_set_current_reclaim_state(gfp_mask); 3689 lockdep_set_current_reclaim_state(gfp_mask);
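
Note on the mm/vmscan.c hunks above: the zone-reclaim bit RECLAIM_SWAP is renamed RECLAIM_UNMAP, which better describes what it actually enables (unmapping pages during zone reclaim via the scan control's may_unmap, not swapping as such), and pfmemalloc_watermark_ok() now skips zones with no reclaimable pages so the reserve check is not skewed by zones that cannot be helped. As a reminder of how such a mode bitmask is consumed, a small sketch follows; the flag values mirror the ones above and the scan-control structure is reduced to two booleans.

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_ZONE  (1 << 0)   /* run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1 << 1)   /* write out pages during reclaim */
#define RECLAIM_UNMAP (1 << 2)   /* unmap pages during reclaim */

struct scan_control {
        bool may_writepage;
        bool may_unmap;
};

static struct scan_control make_scan_control(int zone_reclaim_mode)
{
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap     = !!(zone_reclaim_mode & RECLAIM_UNMAP),
        };
        return sc;
}

int main(void)
{
        struct scan_control sc = make_scan_control(RECLAIM_ZONE | RECLAIM_UNMAP);

        printf("may_writepage=%d may_unmap=%d\n", sc.may_writepage, sc.may_unmap);
        return 0;
}
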