151 files changed, 2277 insertions, 1321 deletions
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index ab0baa692c13..22dd6af2e4bd 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
 administrators to configure the period of the hrtimer and the perf
 event. The right value for a particular environment is a trade-off
 between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores. However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument. If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up. However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl. For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c831001c45f1..e5d528e0c46e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -923,6 +923,27 @@ and nmi_watchdog.
 
 ==============================================================
 
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
 watchdog_thresh:
 
 This value can be used to control the frequency of hrtimer and NMI
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 3be0bfc4738d..32ee3a67dba2 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -467,7 +467,13 @@ mmap(MAP_LOCKED) SYSTEM CALL HANDLING
 
 In addition the mlock()/mlockall() system calls, an application can request
 that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
-call. Furthermore, any mmap() call or brk() call that expands the heap by a
+call. There is one important and subtle difference here, though. mmap() + mlock()
+will fail if the range cannot be faulted in (e.g. because mm_populate fails)
+and return with ENOMEM, while mmap(MAP_LOCKED) will not fail. The mmapped
+area will still have properties of the locked area - i.e. pages will not get
+swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
 task that has previously called mlockall() with the MCL_FUTURE flag will result
 in the newly mapped memory being mlocked. Before the unevictable/mlock
 changes, the kernel simply called make_pages_present() to allocate pages and
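The difference documented above is easy to demonstrate from user space. The small C sketch below is illustrative only and not part of the patch; it uses nothing beyond the standard mmap()/mlock() interfaces and simply shows where the error surfaces in each variant.

    /* Illustrative only: contrasts mmap(MAP_LOCKED) with mmap() + mlock(). */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 16UL << 20;   /* 16 MiB */

        /* Variant 1: the mapping itself succeeds even if the kernel could
         * not fault every page in; the range is locked best-effort. */
        void *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (a == MAP_FAILED)
            perror("mmap(MAP_LOCKED)");

        /* Variant 2: an explicit mlock() reports the failure, typically
         * with ENOMEM, when the range cannot be faulted in. */
        void *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (b != MAP_FAILED && mlock(b, len) != 0)
            fprintf(stderr, "mlock: %s\n", strerror(errno));

        return 0;
    }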
diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b07fd862fec3 --- /dev/null +++ b/arch/alpha/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_ALPHA_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..c37541c5f8ba --- /dev/null +++ b/arch/arc/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_ARC_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_ARC_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_ARC_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 1f1b1cd112f3..31bb7dccb971 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h | |||
@@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
53 | return 0; | 53 | return 0; |
54 | } | 54 | } |
55 | 55 | ||
56 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
57 | { | ||
58 | } | ||
59 | |||
60 | static inline int huge_pte_none(pte_t pte) | 56 | static inline int huge_pte_none(pte_t pte) |
61 | { | 57 | { |
62 | return pte_none(pte); | 58 | return pte_none(pte); |
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7056660c7cc4 --- /dev/null +++ b/arch/arm/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_ARM_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_ARM_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_ARM_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index c72412415093..fcafb521f14e 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c | |||
@@ -41,11 +41,6 @@ int pud_huge(pud_t pud) | |||
41 | return 0; | 41 | return 0; |
42 | } | 42 | } |
43 | 43 | ||
44 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
45 | { | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | int pmd_huge(pmd_t pmd) | 44 | int pmd_huge(pmd_t pmd) |
50 | { | 45 | { |
51 | return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); | 46 | return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); |
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 5b7ca8ace95f..734c17e89e94 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h | |||
@@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
88 | 88 | ||
89 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
90 | { | ||
91 | } | ||
92 | |||
93 | static inline int huge_pte_none(pte_t pte) | 89 | static inline int huge_pte_none(pte_t pte) |
94 | { | 90 | { |
95 | return pte_none(pte); | 91 | return pte_none(pte); |
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..562b655f5ba9 --- /dev/null +++ b/arch/arm64/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_ARM64_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_ARM64_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2de9d2e59d96..cccc4af87a03 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -31,13 +31,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 
-#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-	return 0;
-}
-#endif
-
 int pmd_huge(pmd_t pmd)
 {
 	return !(pmd_val(pmd) & PMD_TABLE_BIT);
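Several architectures in this diff (arm, arm64, ia64, metag, mips, powerpc, s390 and sh below) carried the same do-nothing huge_pmd_unshare() stub for the case where huge-PMD sharing is not implemented. Deleting them all only works if a single shared copy exists elsewhere; a minimal sketch of such a fallback, assuming it lands in generic hugetlb code (the generic side is not shown in this excerpt), is:

    #include <linux/mm.h>
    #include <linux/hugetlb.h>

    /*
     * Assumed generic fallback: architectures without huge-PMD sharing all
     * returned 0 here, so the per-arch stubs removed above are redundant.
     */
    #ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
    int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
    {
        return 0;
    }
    #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */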
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..145452ffbdad --- /dev/null +++ b/arch/avr32/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_AVR32_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_AVR32_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..1c5211ec338f --- /dev/null +++ b/arch/blackfin/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_BLACKFIN_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..bb3c4a6ce8e9 --- /dev/null +++ b/arch/c6x/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_C6X_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_C6X_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_C6X_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..314f774db2b0 --- /dev/null +++ b/arch/cris/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_CRIS_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_CRIS_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..51d13a870404 --- /dev/null +++ b/arch/frv/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_FRV_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_FRV_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_FRV_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..05e8b939e416 --- /dev/null +++ b/arch/hexagon/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_HEXAGON_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index aa910054b8e7..ff1377bc02a6 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h | |||
@@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
20 | REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); | 20 | REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); |
21 | } | 21 | } |
22 | 22 | ||
23 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
24 | { | ||
25 | } | ||
26 | |||
27 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 23 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
28 | pte_t *ptep, pte_t pte) | 24 | pte_t *ptep, pte_t pte) |
29 | { | 25 | { |
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..ab4b5c698322 --- /dev/null +++ b/arch/ia64/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_IA64_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_IA64_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_IA64_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 52b7604b5215..f50d4b3f501a 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c | |||
@@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) | |||
65 | return pte; | 65 | return pte; |
66 | } | 66 | } |
67 | 67 | ||
68 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
69 | { | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } | 68 | #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } |
74 | 69 | ||
75 | /* | 70 | /* |
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d60b4750f41 --- /dev/null +++ b/arch/m32r/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_M32R_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_M32R_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_M32R_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7e8709bc90ae --- /dev/null +++ b/arch/m68k/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_M68K_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_M68K_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_M68K_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index 14b23efd9b7a..eb5cdec94be0 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -134,20 +134,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
 		    enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		    enum dma_data_direction direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+		       int nelems, enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 }
 
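The change above is the same mechanical conversion applied to several drivers in this series (parisc/kernel/pci-dma.c and powerpc/kernel/vio.c below): an open-coded `sg++` walk assumes the scatterlist is one flat array, whereas for_each_sg() also follows chain links between scatterlist segments. A condensed before/after sketch of the idiom; the function and callback names are placeholders, not taken from the diff:

    #include <linux/scatterlist.h>

    /* Placeholder for whatever per-entry work the driver does. */
    static void touch_entry(void *vaddr, unsigned int len) { }

    /* Before: breaks if the scatterlist is chained rather than contiguous. */
    static void walk_sg_old(struct scatterlist *sg, int nelems)
    {
        int i;

        for (i = 0; i < nelems; i++, sg++)
            touch_entry(sg_virt(sg), sg->length);
    }

    /* After: for_each_sg() hops across sg_chain() links transparently. */
    static void walk_sg_new(struct scatterlist *sglist, int nelems)
    {
        struct scatterlist *sg;
        int i;

        for_each_sg(sglist, sg, nelems, i)
            touch_entry(sg_virt(sg), sg->length);
    }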
diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h index 471f481e67f3..f730b396d79b 100644 --- a/arch/metag/include/asm/hugetlb.h +++ b/arch/metag/include/asm/hugetlb.h | |||
@@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
14 | int prepare_hugepage_range(struct file *file, unsigned long addr, | 14 | int prepare_hugepage_range(struct file *file, unsigned long addr, |
15 | unsigned long len); | 15 | unsigned long len); |
16 | 16 | ||
17 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
18 | { | ||
19 | } | ||
20 | |||
21 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 17 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
22 | unsigned long addr, unsigned long end, | 18 | unsigned long addr, unsigned long end, |
23 | unsigned long floor, | 19 | unsigned long floor, |
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b0072b2eb0de --- /dev/null +++ b/arch/metag/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_METAG_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_METAG_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_METAG_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 7ca80ac42ed5..53f0f6c47027 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c | |||
@@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
89 | return pte; | 89 | return pte; |
90 | } | 90 | } |
91 | 91 | ||
92 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
93 | { | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | int pmd_huge(pmd_t pmd) | 92 | int pmd_huge(pmd_t pmd) |
98 | { | 93 | { |
99 | return pmd_page_shift(pmd) > PAGE_SHIFT; | 94 | return pmd_page_shift(pmd) > PAGE_SHIFT; |
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5c4065911bda --- /dev/null +++ b/arch/microblaze/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fe0d15d32660..4a5bb5453408 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h | |||
@@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
38 | return 0; | 38 | return 0; |
39 | } | 39 | } |
40 | 40 | ||
41 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 41 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
46 | unsigned long addr, | 42 | unsigned long addr, |
47 | unsigned long end, | 43 | unsigned long end, |
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b5609fe8e475 --- /dev/null +++ b/arch/mips/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_MIPS_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_MIPS_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 819af9d057a8..9d8106758142 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h | |||
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) | |||
568 | } | 568 | } |
569 | 569 | ||
570 | /* | 570 | /* |
571 | * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a | 571 | * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a |
572 | * different prototype. | 572 | * different prototype. |
573 | */ | 573 | */ |
574 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 574 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
575 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 575 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
576 | unsigned long address, pmd_t *pmdp) | 576 | unsigned long address, pmd_t *pmdp) |
577 | { | 577 | { |
578 | pmd_t old = *pmdp; | 578 | pmd_t old = *pmdp; |
579 | 579 | ||
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 06e0f421b41b..74aa6f62468f 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c | |||
@@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
51 | return (pte_t *) pmd; | 51 | return (pte_t *) pmd; |
52 | } | 52 | } |
53 | 53 | ||
54 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
55 | { | ||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | /* | 54 | /* |
60 | * This function checks for proper alignment of input addr and len parameters. | 55 | * This function checks for proper alignment of input addr and len parameters. |
61 | */ | 56 | */ |
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..e2029a652f4c --- /dev/null +++ b/arch/mn10300/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_MN10300_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_MN10300_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d7290dc68558 --- /dev/null +++ b/arch/nios2/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_NIOS2_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d33cb555fe1 --- /dev/null +++ b/arch/openrisc/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_OPENRISC_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..654ec63b0ee9 --- /dev/null +++ b/arch/parisc/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_PARISC_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_PARISC_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ff834fd67478..b9402c9b3454 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c | |||
@@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz | |||
478 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) | 478 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) |
479 | { | 479 | { |
480 | int i; | 480 | int i; |
481 | struct scatterlist *sg; | ||
481 | 482 | ||
482 | BUG_ON(direction == DMA_NONE); | 483 | BUG_ON(direction == DMA_NONE); |
483 | 484 | ||
484 | for (i = 0; i < nents; i++, sglist++ ) { | 485 | for_each_sg(sglist, sg, nents, i) { |
485 | unsigned long vaddr = (unsigned long)sg_virt(sglist); | 486 | unsigned long vaddr = (unsigned long)sg_virt(sg); |
486 | sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr); | 487 | |
487 | sg_dma_len(sglist) = sglist->length; | 488 | sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); |
488 | flush_kernel_dcache_range(vaddr, sglist->length); | 489 | sg_dma_len(sg) = sg->length; |
490 | flush_kernel_dcache_range(vaddr, sg->length); | ||
489 | } | 491 | } |
490 | return nents; | 492 | return nents; |
491 | } | 493 | } |
@@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n | |||
493 | static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) | 495 | static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) |
494 | { | 496 | { |
495 | int i; | 497 | int i; |
498 | struct scatterlist *sg; | ||
496 | 499 | ||
497 | BUG_ON(direction == DMA_NONE); | 500 | BUG_ON(direction == DMA_NONE); |
498 | 501 | ||
@@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in | |||
501 | 504 | ||
502 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ | 505 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ |
503 | 506 | ||
504 | for (i = 0; i < nents; i++, sglist++ ) | 507 | for_each_sg(sglist, sg, nents, i) |
505 | flush_kernel_vmap_range(sg_virt(sglist), sglist->length); | 508 | flush_kernel_vmap_range(sg_virt(sg), sg->length); |
506 | return; | 509 | return; |
507 | } | 510 | } |
508 | 511 | ||
@@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h | |||
523 | static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) | 526 | static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) |
524 | { | 527 | { |
525 | int i; | 528 | int i; |
529 | struct scatterlist *sg; | ||
526 | 530 | ||
527 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ | 531 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ |
528 | 532 | ||
529 | for (i = 0; i < nents; i++, sglist++ ) | 533 | for_each_sg(sglist, sg, nents, i) |
530 | flush_kernel_vmap_range(sg_virt(sglist), sglist->length); | 534 | flush_kernel_vmap_range(sg_virt(sg), sg->length); |
531 | } | 535 | } |
532 | 536 | ||
533 | static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) | 537 | static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) |
534 | { | 538 | { |
535 | int i; | 539 | int i; |
540 | struct scatterlist *sg; | ||
536 | 541 | ||
537 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ | 542 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ |
538 | 543 | ||
539 | for (i = 0; i < nents; i++, sglist++ ) | 544 | for_each_sg(sglist, sg, nents, i) |
540 | flush_kernel_vmap_range(sg_virt(sglist), sglist->length); | 545 | flush_kernel_vmap_range(sg_virt(sg), sg->length); |
541 | } | 546 | } |
542 | 547 | ||
543 | struct hppa_dma_ops pcxl_dma_ops = { | 548 | struct hppa_dma_ops pcxl_dma_ops = { |
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 1d53a65b4ec1..4bbd3c8c2888 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h | |||
@@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
112 | return 0; | 112 | return 0; |
113 | } | 113 | } |
114 | 114 | ||
115 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
116 | { | ||
117 | } | ||
118 | |||
119 | |||
120 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 115 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
121 | pte_t *ptep, pte_t pte) | 116 | pte_t *ptep, pte_t pte) |
122 | { | 117 | { |
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..f2a2da895897
--- /dev/null
+++ b/arch/powerpc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,28 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
+#define _ASM_POWERPC_MM_ARCH_HOOKS_H
+
+static inline void arch_remap(struct mm_struct *mm,
+			      unsigned long old_start, unsigned long old_end,
+			      unsigned long new_start, unsigned long new_end)
+{
+	/*
+	 * mremap() doesn't allow moving multiple vmas so we can limit the
+	 * check to old_start == vdso_base.
+	 */
+	if (old_start == mm->context.vdso_base)
+		mm->context.vdso_base = new_start;
+}
+#define arch_remap arch_remap
+
+#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
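The powerpc header above is the only non-empty asm/mm-arch-hooks.h in the series: it moves mm->context.vdso_base when the vDSO VMA is mremap()ed, complementing the arch_unmap() change in mmu_context.h below. The empty per-arch headers exist so that common code can include the file unconditionally. The usual wiring for such opt-in hooks, sketched here as an assumption since the generic header is not part of this excerpt, is a default no-op that an architecture can shadow:

    /* Assumed generic wrapper (e.g. a linux/mm-arch-hooks.h): include the
     * arch header, then fall back to a no-op for any hook it did not define. */
    #include <linux/mm_types.h>
    #include <asm/mm-arch-hooks.h>

    #ifndef arch_remap
    static inline void arch_remap(struct mm_struct *mm,
                                  unsigned long old_start, unsigned long old_end,
                                  unsigned long new_start, unsigned long new_end)
    {
    }
    #define arch_remap arch_remap
    #endif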
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 3e5184210d9b..878c27771717 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
9 | #include <asm/mmu.h> | 9 | #include <asm/mmu.h> |
10 | #include <asm/cputable.h> | 10 | #include <asm/cputable.h> |
11 | #include <asm-generic/mm_hooks.h> | ||
12 | #include <asm/cputhreads.h> | 11 | #include <asm/cputhreads.h> |
13 | 12 | ||
14 | /* | 13 | /* |
@@ -127,5 +126,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, | |||
127 | #endif | 126 | #endif |
128 | } | 127 | } |
129 | 128 | ||
129 | static inline void arch_dup_mmap(struct mm_struct *oldmm, | ||
130 | struct mm_struct *mm) | ||
131 | { | ||
132 | } | ||
133 | |||
134 | static inline void arch_exit_mmap(struct mm_struct *mm) | ||
135 | { | ||
136 | } | ||
137 | |||
138 | static inline void arch_unmap(struct mm_struct *mm, | ||
139 | struct vm_area_struct *vma, | ||
140 | unsigned long start, unsigned long end) | ||
141 | { | ||
142 | if (start <= mm->context.vdso_base && mm->context.vdso_base < end) | ||
143 | mm->context.vdso_base = 0; | ||
144 | } | ||
145 | |||
146 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
147 | struct vm_area_struct *vma) | ||
148 | { | ||
149 | } | ||
150 | |||
130 | #endif /* __KERNEL__ */ | 151 | #endif /* __KERNEL__ */ |
131 | #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ | 152 | #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ |
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index f890f7ce1593..3bb7488bd24b 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h | |||
@@ -569,13 +569,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, | |||
569 | extern int pmdp_clear_flush_young(struct vm_area_struct *vma, | 569 | extern int pmdp_clear_flush_young(struct vm_area_struct *vma, |
570 | unsigned long address, pmd_t *pmdp); | 570 | unsigned long address, pmd_t *pmdp); |
571 | 571 | ||
572 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 572 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
573 | extern pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 573 | extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
574 | unsigned long addr, pmd_t *pmdp); | 574 | unsigned long addr, pmd_t *pmdp); |
575 | |||
576 | #define __HAVE_ARCH_PMDP_CLEAR_FLUSH | ||
577 | extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
578 | pmd_t *pmdp); | ||
579 | 575 | ||
580 | #define __HAVE_ARCH_PMDP_SET_WRPROTECT | 576 | #define __HAVE_ARCH_PMDP_SET_WRPROTECT |
581 | static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, | 577 | static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, |
@@ -592,6 +588,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, | |||
592 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | 588 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, |
593 | unsigned long address, pmd_t *pmdp); | 589 | unsigned long address, pmd_t *pmdp); |
594 | 590 | ||
591 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | ||
592 | unsigned long address, pmd_t *pmdp); | ||
593 | #define pmdp_collapse_flush pmdp_collapse_flush | ||
594 | |||
595 | #define __HAVE_ARCH_PGTABLE_DEPOSIT | 595 | #define __HAVE_ARCH_PGTABLE_DEPOSIT |
596 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | 596 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
597 | pgtable_t pgtable); | 597 | pgtable_t pgtable); |
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index b41426c60ef6..5f8dcdaa2820 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c | |||
@@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, | |||
557 | struct vio_dev *viodev = to_vio_dev(dev); | 557 | struct vio_dev *viodev = to_vio_dev(dev); |
558 | struct iommu_table *tbl; | 558 | struct iommu_table *tbl; |
559 | struct scatterlist *sgl; | 559 | struct scatterlist *sgl; |
560 | int ret, count = 0; | 560 | int ret, count; |
561 | size_t alloc_size = 0; | 561 | size_t alloc_size = 0; |
562 | 562 | ||
563 | tbl = get_iommu_table_base(dev); | 563 | tbl = get_iommu_table_base(dev); |
564 | for (sgl = sglist; count < nelems; count++, sgl++) | 564 | for_each_sg(sglist, sgl, nelems, count) |
565 | alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); | 565 | alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); |
566 | 566 | ||
567 | if (vio_cmo_alloc(viodev, alloc_size)) { | 567 | if (vio_cmo_alloc(viodev, alloc_size)) { |
@@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, | |||
577 | return ret; | 577 | return ret; |
578 | } | 578 | } |
579 | 579 | ||
580 | for (sgl = sglist, count = 0; count < ret; count++, sgl++) | 580 | for_each_sg(sglist, sgl, ret, count) |
581 | alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); | 581 | alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); |
582 | if (alloc_size) | 582 | if (alloc_size) |
583 | vio_cmo_dealloc(viodev, alloc_size); | 583 | vio_cmo_dealloc(viodev, alloc_size); |
@@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, | |||
594 | struct iommu_table *tbl; | 594 | struct iommu_table *tbl; |
595 | struct scatterlist *sgl; | 595 | struct scatterlist *sgl; |
596 | size_t alloc_size = 0; | 596 | size_t alloc_size = 0; |
597 | int count = 0; | 597 | int count; |
598 | 598 | ||
599 | tbl = get_iommu_table_base(dev); | 599 | tbl = get_iommu_table_base(dev); |
600 | for (sgl = sglist; count < nelems; count++, sgl++) | 600 | for_each_sg(sglist, sgl, nelems, count) |
601 | alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); | 601 | alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); |
602 | 602 | ||
603 | dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); | 603 | dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); |
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 3385e3d0506e..38bd5d998c81 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate) | |||
439 | } | 439 | } |
440 | #endif | 440 | #endif |
441 | 441 | ||
442 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
443 | { | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | #ifdef CONFIG_PPC_FSL_BOOK3E | 442 | #ifdef CONFIG_PPC_FSL_BOOK3E |
448 | #define HUGEPD_FREELIST_SIZE \ | 443 | #define HUGEPD_FREELIST_SIZE \ |
449 | ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) | 444 | ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) |
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 6bfadf1aa5cb..876232d64126 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c | |||
@@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, | |||
554 | return old; | 554 | return old; |
555 | } | 555 | } |
556 | 556 | ||
557 | pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | 557 | pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, |
558 | pmd_t *pmdp) | 558 | pmd_t *pmdp) |
559 | { | 559 | { |
560 | pmd_t pmd; | 560 | pmd_t pmd; |
561 | 561 | ||
562 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 562 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
563 | if (pmd_trans_huge(*pmdp)) { | 563 | VM_BUG_ON(pmd_trans_huge(*pmdp)); |
564 | pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); | 564 | |
565 | } else { | 565 | pmd = *pmdp; |
566 | /* | 566 | pmd_clear(pmdp); |
567 | * khugepaged calls this for normal pmd | 567 | /* |
568 | */ | 568 | * Wait for all pending hash_page to finish. This is needed |
569 | pmd = *pmdp; | 569 | * in case of subpage collapse. When we collapse normal pages |
570 | pmd_clear(pmdp); | 570 | * to hugepage, we first clear the pmd, then invalidate all |
571 | /* | 571 | * the PTE entries. The assumption here is that any low level |
572 | * Wait for all pending hash_page to finish. This is needed | 572 | * page fault will see a none pmd and take the slow path that |
573 | * in case of subpage collapse. When we collapse normal pages | 573 | * will wait on mmap_sem. But we could very well be in a |
574 | * to hugepage, we first clear the pmd, then invalidate all | 574 | * hash_page with local ptep pointer value. Such a hash page |
575 | * the PTE entries. The assumption here is that any low level | 575 | * can result in adding new HPTE entries for normal subpages. |
576 | * page fault will see a none pmd and take the slow path that | 576 | * That means we could be modifying the page content as we |
577 | * will wait on mmap_sem. But we could very well be in a | 577 | * copy them to a huge page. So wait for parallel hash_page |
578 | * hash_page with local ptep pointer value. Such a hash page | 578 | * to finish before invalidating HPTE entries. We can do this |
579 | * can result in adding new HPTE entries for normal subpages. | 579 | * by sending an IPI to all the cpus and executing a dummy |
580 | * That means we could be modifying the page content as we | 580 | * function there. |
581 | * copy them to a huge page. So wait for parallel hash_page | 581 | */ |
582 | * to finish before invalidating HPTE entries. We can do this | 582 | kick_all_cpus_sync(); |
583 | * by sending an IPI to all the cpus and executing a dummy | 583 | /* |
584 | * function there. | 584 | * Now invalidate the hpte entries in the range |
585 | */ | 585 | * covered by pmd. This make sure we take a |
586 | kick_all_cpus_sync(); | 586 | * fault and will find the pmd as none, which will |
587 | /* | 587 | * result in a major fault which takes mmap_sem and |
588 | * Now invalidate the hpte entries in the range | 588 | * hence wait for collapse to complete. Without this |
589 | * covered by pmd. This make sure we take a | 589 | * the __collapse_huge_page_copy can result in copying |
590 | * fault and will find the pmd as none, which will | 590 | * the old content. |
591 | * result in a major fault which takes mmap_sem and | 591 | */ |
592 | * hence wait for collapse to complete. Without this | 592 | flush_tlb_pmd_range(vma->vm_mm, &pmd, address); |
593 | * the __collapse_huge_page_copy can result in copying | ||
594 | * the old content. | ||
595 | */ | ||
596 | flush_tlb_pmd_range(vma->vm_mm, &pmd, address); | ||
597 | } | ||
598 | return pmd; | 593 | return pmd; |
599 | } | 594 | } |
600 | 595 | ||
@@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
817 | return; | 812 | return; |
818 | } | 813 | } |
819 | 814 | ||
820 | pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 815 | pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
821 | unsigned long addr, pmd_t *pmdp) | 816 | unsigned long addr, pmd_t *pmdp) |
822 | { | 817 | { |
823 | pmd_t old_pmd; | 818 | pmd_t old_pmd; |
824 | pgtable_t pgtable; | 819 | pgtable_t pgtable; |
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 11eae5f55b70..dfb542ade6b1 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h | |||
@@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
35 | return 0; | 35 | return 0; |
36 | } | 36 | } |
37 | 37 | ||
38 | #define hugetlb_prefault_arch_hook(mm) do { } while (0) | ||
39 | #define arch_clear_hugepage_flags(page) do { } while (0) | 38 | #define arch_clear_hugepage_flags(page) do { } while (0) |
40 | 39 | ||
41 | int arch_prepare_hugepage(struct page *page); | 40 | int arch_prepare_hugepage(struct page *page); |
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..07680b2f3c59 --- /dev/null +++ b/arch/s390/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_S390_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_S390_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_S390_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0bb2da79adf3..f66d82798a6a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, | |||
1498 | return pmd_young(pmd); | 1498 | return pmd_young(pmd); |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 1501 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
1502 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 1502 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
1503 | unsigned long address, pmd_t *pmdp) | 1503 | unsigned long address, pmd_t *pmdp) |
1504 | { | 1504 | { |
1505 | pmd_t pmd = *pmdp; | 1505 | pmd_t pmd = *pmdp; |
1506 | 1506 | ||
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | |||
1509 | return pmd; | 1509 | return pmd; |
1510 | } | 1510 | } |
1511 | 1511 | ||
1512 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL | 1512 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL |
1513 | static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, | 1513 | static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, |
1514 | unsigned long address, | 1514 | unsigned long address, |
1515 | pmd_t *pmdp, int full) | 1515 | pmd_t *pmdp, int full) |
1516 | { | 1516 | { |
1517 | pmd_t pmd = *pmdp; | 1517 | pmd_t pmd = *pmdp; |
1518 | 1518 | ||
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, | |||
1522 | return pmd; | 1522 | return pmd; |
1523 | } | 1523 | } |
1524 | 1524 | ||
1525 | #define __HAVE_ARCH_PMDP_CLEAR_FLUSH | 1525 | #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH |
1526 | static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma, | 1526 | static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, |
1527 | unsigned long address, pmd_t *pmdp) | 1527 | unsigned long address, pmd_t *pmdp) |
1528 | { | 1528 | { |
1529 | return pmdp_get_and_clear(vma->vm_mm, address, pmdp); | 1529 | return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); |
1530 | } | 1530 | } |
1531 | 1531 | ||
1532 | #define __HAVE_ARCH_PMDP_INVALIDATE | 1532 | #define __HAVE_ARCH_PMDP_INVALIDATE |
@@ -1548,6 +1548,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, | |||
1548 | } | 1548 | } |
1549 | } | 1549 | } |
1550 | 1550 | ||
1551 | static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | ||
1552 | unsigned long address, | ||
1553 | pmd_t *pmdp) | ||
1554 | { | ||
1555 | return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); | ||
1556 | } | ||
1557 | #define pmdp_collapse_flush pmdp_collapse_flush | ||
1558 | |||
1551 | #define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) | 1559 | #define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) |
1552 | #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) | 1560 | #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) |
1553 | 1561 | ||
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d9f0dcfcae5e..7a75ad4594e3 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c | |||
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = { | |||
33 | }; | 33 | }; |
34 | 34 | ||
35 | #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \ | 35 | #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \ |
36 | for (i = 0, __next_mem_range(&i, nid, &memblock.physmem, \ | 36 | for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE, \ |
37 | &memblock.physmem, \ | ||
37 | &oldmem_type, p_start, \ | 38 | &oldmem_type, p_start, \ |
38 | p_end, p_nid); \ | 39 | p_end, p_nid); \ |
39 | i != (u64)ULLONG_MAX; \ | 40 | i != (u64)ULLONG_MAX; \ |
40 | __next_mem_range(&i, nid, &memblock.physmem, \ | 41 | __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\ |
41 | &oldmem_type, \ | 42 | &oldmem_type, \ |
42 | p_start, p_end, p_nid)) | 43 | p_start, p_end, p_nid)) |
43 | 44 | ||
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e617e74b7be2..c3f8e3df92ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c | |||
@@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
193 | return (pte_t *) pmdp; | 193 | return (pte_t *) pmdp; |
194 | } | 194 | } |
195 | 195 | ||
196 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
197 | { | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | int pmd_huge(pmd_t pmd) | 196 | int pmd_huge(pmd_t pmd) |
202 | { | 197 | { |
203 | if (!MACHINE_HAS_HPAGE) | 198 | if (!MACHINE_HAS_HPAGE) |
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5e38689f189a --- /dev/null +++ b/arch/score/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_SCORE_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_SCORE_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 699255d6d1c6..b788a9bc8918 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h | |||
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
26 | return 0; | 26 | return 0; |
27 | } | 27 | } |
28 | 28 | ||
29 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { | ||
30 | } | ||
31 | |||
32 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 29 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
33 | unsigned long addr, unsigned long end, | 30 | unsigned long addr, unsigned long end, |
34 | unsigned long floor, | 31 | unsigned long floor, |
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..18087298b728 --- /dev/null +++ b/arch/sh/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_SH_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_SH_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_SH_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 534bc978af8a..6385f60209b6 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c | |||
@@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
62 | return pte; | 62 | return pte; |
63 | } | 63 | } |
64 | 64 | ||
65 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
66 | { | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | int pmd_huge(pmd_t pmd) | 65 | int pmd_huge(pmd_t pmd) |
71 | { | 66 | { |
72 | return 0; | 67 | return 0; |
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index e4cab465b81f..3130d7636312 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h | |||
@@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | |||
11 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | 11 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
12 | pte_t *ptep); | 12 | pte_t *ptep); |
13 | 13 | ||
14 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
15 | { | ||
16 | } | ||
17 | |||
18 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 14 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
19 | unsigned long addr, | 15 | unsigned long addr, |
20 | unsigned long len) { | 16 | unsigned long len) { |
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b89ba44c16f1 --- /dev/null +++ b/arch/sparc/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_SPARC_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_SPARC_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 2a52c91d2c8a..131d36fcd07a 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h | |||
@@ -865,10 +865,10 @@ static inline unsigned long pud_pfn(pud_t pud) | |||
865 | void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, | 865 | void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, |
866 | pte_t *ptep, pte_t orig, int fullmm); | 866 | pte_t *ptep, pte_t orig, int fullmm); |
867 | 867 | ||
868 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 868 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
869 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 869 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
870 | unsigned long addr, | 870 | unsigned long addr, |
871 | pmd_t *pmdp) | 871 | pmd_t *pmdp) |
872 | { | 872 | { |
873 | pmd_t pmd = *pmdp; | 873 | pmd_t pmd = *pmdp; |
874 | set_pmd_at(mm, addr, pmdp, __pmd(0UL)); | 874 | set_pmd_at(mm, addr, pmdp, __pmd(0UL)); |
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c index 7d3ca30fcd15..1ae5eb1bb045 100644 --- a/arch/sparc/kernel/ldc.c +++ b/arch/sparc/kernel/ldc.c | |||
@@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp, | |||
2086 | struct cookie_state state; | 2086 | struct cookie_state state; |
2087 | struct ldc_iommu *iommu; | 2087 | struct ldc_iommu *iommu; |
2088 | int err; | 2088 | int err; |
2089 | struct scatterlist *s; | ||
2089 | 2090 | ||
2090 | if (map_perm & ~LDC_MAP_ALL) | 2091 | if (map_perm & ~LDC_MAP_ALL) |
2091 | return -EINVAL; | 2092 | return -EINVAL; |
@@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp, | |||
2112 | state.pte_idx = (base - iommu->page_table); | 2113 | state.pte_idx = (base - iommu->page_table); |
2113 | state.nc = 0; | 2114 | state.nc = 0; |
2114 | 2115 | ||
2115 | for (i = 0; i < num_sg; i++) | 2116 | for_each_sg(sg, s, num_sg, i) { |
2116 | fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT, | 2117 | fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT, |
2117 | sg[i].offset, sg[i].length); | 2118 | s->offset, s->length); |
2119 | } | ||
2118 | 2120 | ||
2119 | return state.nc; | 2121 | return state.nc; |
2120 | } | 2122 | } |
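
The ldc_map_sg() hunk above (and the xtensa dma-mapping.h hunk later in this section) replaces open-coded sg[i] indexing with for_each_sg(), which advances through sg_next() and therefore stays correct when the scatterlist is chained rather than a flat array. A minimal sketch of the same idiom, with a hypothetical helper name that is not part of this patch:

#include <linux/scatterlist.h>

/* Illustrative only: total the byte length of every scatterlist entry.
 * for_each_sg() walks via sg_next(), so a chained table is traversed
 * correctly, where plain sg[i] indexing would run off the first chunk. */
static size_t example_sg_total_len(struct scatterlist *sglist, int nents)
{
	struct scatterlist *sg;
	size_t total = 0;
	int i;

	for_each_sg(sglist, sg, nents, i)
		total += sg->length;

	return total;
}
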
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 4242eab12e10..131eaf4ad7f5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c | |||
@@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
172 | return pte; | 172 | return pte; |
173 | } | 173 | } |
174 | 174 | ||
175 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
176 | { | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 175 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
181 | pte_t *ptep, pte_t entry) | 176 | pte_t *ptep, pte_t entry) |
182 | { | 177 | { |
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index c5d08b89a96c..4ac88b757514 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void) | |||
1966 | phys_addr_t pa_start, pa_end; | 1966 | phys_addr_t pa_start, pa_end; |
1967 | u64 i; | 1967 | u64 i; |
1968 | 1968 | ||
1969 | for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) | 1969 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, |
1970 | &pa_end, NULL) | ||
1970 | available = available + (pa_end - pa_start); | 1971 | available = available + (pa_end - pa_start); |
1971 | 1972 | ||
1972 | return available; | 1973 | return available; |
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram) | |||
1992 | if (limit_ram >= avail_ram) | 1993 | if (limit_ram >= avail_ram) |
1993 | return; | 1994 | return; |
1994 | 1995 | ||
1995 | for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) { | 1996 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, |
1997 | &pa_end, NULL) { | ||
1996 | phys_addr_t region_size = pa_end - pa_start; | 1998 | phys_addr_t region_size = pa_end - pa_start; |
1997 | phys_addr_t clip_start = pa_start; | 1999 | phys_addr_t clip_start = pa_start; |
1998 | 2000 | ||
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index 3257733003f8..1abd00c55236 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h | |||
@@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
40 | return 0; | 40 | return 0; |
41 | } | 41 | } |
42 | 42 | ||
43 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
44 | { | ||
45 | } | ||
46 | |||
47 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 43 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
48 | unsigned long addr, unsigned long end, | 44 | unsigned long addr, unsigned long end, |
49 | unsigned long floor, | 45 | unsigned long floor, |
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d1709ea774f7 --- /dev/null +++ b/arch/tile/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_TILE_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_TILE_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_TILE_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 95a4f19d16c5..2b05ccbebed9 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h | |||
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, | |||
414 | } | 414 | } |
415 | 415 | ||
416 | 416 | ||
417 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 417 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
418 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 418 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
419 | unsigned long address, | 419 | unsigned long address, |
420 | pmd_t *pmdp) | 420 | pmd_t *pmdp) |
421 | { | 421 | { |
422 | return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp))); | 422 | return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp))); |
423 | } | 423 | } |
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 8416240c322c..c034dc3fe2d4 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c | |||
@@ -160,11 +160,6 @@ int pud_huge(pud_t pud) | |||
160 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); | 160 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); |
161 | } | 161 | } |
162 | 162 | ||
163 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
164 | { | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | 163 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA |
169 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | 164 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, |
170 | unsigned long addr, unsigned long len, | 165 | unsigned long addr, unsigned long len, |
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..a7c8b0dfdd4e --- /dev/null +++ b/arch/um/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_UM_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_UM_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_UM_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4d79a850c509 --- /dev/null +++ b/arch/unicore32/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_UNICORE32_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 68c05398bba9..dab7a3a750bf 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h | |||
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, | |||
26 | return 0; | 26 | return 0; |
27 | } | 27 | } |
28 | 28 | ||
29 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { | ||
30 | } | ||
31 | |||
32 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 29 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
33 | unsigned long addr, unsigned long end, | 30 | unsigned long addr, unsigned long end, |
34 | unsigned long floor, | 31 | unsigned long floor, |
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4e881a342236 --- /dev/null +++ b/arch/x86/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_X86_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_X86_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_X86_MM_ARCH_HOOKS_H */ | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2562e303405b..867da5bbb4a3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd) | |||
805 | return pmd_flags(pmd) & _PAGE_RW; | 805 | return pmd_flags(pmd) & _PAGE_RW; |
806 | } | 806 | } |
807 | 807 | ||
808 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | 808 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
809 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, | 809 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, |
810 | pmd_t *pmdp) | 810 | pmd_t *pmdp) |
811 | { | 811 | { |
812 | pmd_t pmd = native_pmdp_get_and_clear(pmdp); | 812 | pmd_t pmd = native_pmdp_get_and_clear(pmdp); |
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 83a7995625a6..58118e207a69 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c | |||
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void) | |||
91 | 91 | ||
92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); | 92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); |
93 | 93 | ||
94 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { | 94 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, |
95 | NULL) { | ||
95 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), | 96 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), |
96 | PAGE_SIZE, corruption_check_size); | 97 | PAGE_SIZE, corruption_check_size); |
97 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), | 98 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e2ce85db2283..c8dda42cb6a3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void) | |||
1123 | nr_pages += end_pfn - start_pfn; | 1123 | nr_pages += end_pfn - start_pfn; |
1124 | } | 1124 | } |
1125 | 1125 | ||
1126 | for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { | 1126 | for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, |
1127 | NULL) { | ||
1127 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); | 1128 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); |
1128 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); | 1129 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); |
1129 | if (start_pfn < end_pfn) | 1130 | if (start_pfn < end_pfn) |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 39ca113676fe..d3b95b89e9b2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p) | |||
1105 | memblock_set_current_limit(ISA_END_ADDRESS); | 1105 | memblock_set_current_limit(ISA_END_ADDRESS); |
1106 | memblock_x86_fill(); | 1106 | memblock_x86_fill(); |
1107 | 1107 | ||
1108 | if (efi_enabled(EFI_BOOT)) | ||
1109 | efi_find_mirror(); | ||
1110 | |||
1108 | /* | 1111 | /* |
1109 | * The EFI specification says that boot service code won't be called | 1112 | * The EFI specification says that boot service code won't be called |
1110 | * after ExitBootServices(). This is, in fact, a lie. | 1113 | * after ExitBootServices(). This is, in fact, a lie. |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c8140e12816a..8340e45c891a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid, | |||
433 | phys_addr_t start, end; | 433 | phys_addr_t start, end; |
434 | u64 i; | 434 | u64 i; |
435 | 435 | ||
436 | for_each_free_mem_range(i, nid, &start, &end, NULL) { | 436 | for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { |
437 | unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), | 437 | unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), |
438 | start_pfn, end_pfn); | 438 | start_pfn, end_pfn); |
439 | unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), | 439 | unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), |
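
Several callers in this section (s390 crash_dump, sparc init_64, x86 check/e820/init_32) pick up a flags argument on the memblock range iterators; the flag lets callers filter ranges by attribute (the efi.c hunk below marks mirrored ranges with memblock_mark_mirror()), and these callers opt out by passing MEMBLOCK_NONE. A minimal sketch of the updated call shape, assuming the iterator signature shown in these hunks (the helper name is illustrative):

#include <linux/memblock.h>
#include <linux/numa.h>

/* Illustrative only: sum all free memblock ranges, ignoring range
 * attributes by passing MEMBLOCK_NONE as the new flags argument. */
static phys_addr_t __init example_free_memory(void)
{
	phys_addr_t start, end, total = 0;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
				&start, &end, NULL)
		total += end - start;

	return total;
}
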
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3b984c3aa1b0..c1c382c58c60 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c | |||
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now) | |||
117 | now->tv_nsec = 0; | 117 | now->tv_nsec = 0; |
118 | } | 118 | } |
119 | 119 | ||
120 | void __init efi_find_mirror(void) | ||
121 | { | ||
122 | void *p; | ||
123 | u64 mirror_size = 0, total_size = 0; | ||
124 | |||
125 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
126 | efi_memory_desc_t *md = p; | ||
127 | unsigned long long start = md->phys_addr; | ||
128 | unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; | ||
129 | |||
130 | total_size += size; | ||
131 | if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { | ||
132 | memblock_mark_mirror(start, size); | ||
133 | mirror_size += size; | ||
134 | } | ||
135 | } | ||
136 | if (mirror_size) | ||
137 | pr_info("Memory: %lldM/%lldM mirrored memory\n", | ||
138 | mirror_size>>20, total_size>>20); | ||
139 | } | ||
140 | |||
120 | /* | 141 | /* |
121 | * Tell the kernel about the EFI memory map. This might include | 142 | * Tell the kernel about the EFI memory map. This might include |
122 | * more than the max 128 entries that can fit in the e820 legacy | 143 | * more than the max 128 entries that can fit in the e820 legacy |
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index ba78ccf651e7..1f5f6dc09736 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h | |||
@@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | |||
52 | } | 52 | } |
53 | 53 | ||
54 | static inline int | 54 | static inline int |
55 | dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, | 55 | dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, |
56 | enum dma_data_direction direction) | 56 | enum dma_data_direction direction) |
57 | { | 57 | { |
58 | int i; | 58 | int i; |
59 | struct scatterlist *sg; | ||
59 | 60 | ||
60 | BUG_ON(direction == DMA_NONE); | 61 | BUG_ON(direction == DMA_NONE); |
61 | 62 | ||
62 | for (i = 0; i < nents; i++, sg++ ) { | 63 | for_each_sg(sglist, sg, nents, i) { |
63 | BUG_ON(!sg_page(sg)); | 64 | BUG_ON(!sg_page(sg)); |
64 | 65 | ||
65 | sg->dma_address = sg_phys(sg); | 66 | sg->dma_address = sg_phys(sg); |
@@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, | |||
124 | consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction); | 125 | consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction); |
125 | } | 126 | } |
126 | static inline void | 127 | static inline void |
127 | dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, | 128 | dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, |
128 | enum dma_data_direction dir) | 129 | enum dma_data_direction dir) |
129 | { | 130 | { |
130 | int i; | 131 | int i; |
131 | for (i = 0; i < nelems; i++, sg++) | 132 | struct scatterlist *sg; |
133 | |||
134 | for_each_sg(sglist, sg, nelems, i) | ||
132 | consistent_sync(sg_virt(sg), sg->length, dir); | 135 | consistent_sync(sg_virt(sg), sg->length, dir); |
133 | } | 136 | } |
134 | 137 | ||
135 | static inline void | 138 | static inline void |
136 | dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, | 139 | dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, |
137 | enum dma_data_direction dir) | 140 | int nelems, enum dma_data_direction dir) |
138 | { | 141 | { |
139 | int i; | 142 | int i; |
140 | for (i = 0; i < nelems; i++, sg++) | 143 | struct scatterlist *sg; |
144 | |||
145 | for_each_sg(sglist, sg, nelems, i) | ||
141 | consistent_sync(sg_virt(sg), sg->length, dir); | 146 | consistent_sync(sg_virt(sg), sg->length, dir); |
142 | } | 147 | } |
143 | static inline int | 148 | static inline int |
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d2e5cfd3dd02 --- /dev/null +++ b/arch/xtensa/include/asm/mm-arch-hooks.h | |||
@@ -0,0 +1,15 @@ | |||
1 | /* | ||
2 | * Architecture specific mm hooks | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H | ||
13 | #define _ASM_XTENSA_MM_ARCH_HOOKS_H | ||
14 | |||
15 | #endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */ | ||
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index feafa172b155..2345ee7342d9 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c | |||
@@ -165,7 +165,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) | |||
165 | * infrastructure. There is no real reason why the selected | 165 | * infrastructure. There is no real reason why the selected |
166 | * task should have access to the memory reserves. | 166 | * task should have access to the memory reserves. |
167 | */ | 167 | */ |
168 | mark_tsk_oom_victim(selected); | 168 | mark_oom_victim(selected); |
169 | send_sig(SIGKILL, selected, 0); | 169 | send_sig(SIGKILL, selected, 0); |
170 | rem += selected_tasksize; | 170 | rem += selected_tasksize; |
171 | } | 171 | } |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 9ffdfcf2ec6e..1c4791033b72 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
@@ -353,9 +353,11 @@ static struct sysrq_key_op sysrq_term_op = { | |||
353 | 353 | ||
354 | static void moom_callback(struct work_struct *ignored) | 354 | static void moom_callback(struct work_struct *ignored) |
355 | { | 355 | { |
356 | mutex_lock(&oom_lock); | ||
356 | if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), | 357 | if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), |
357 | GFP_KERNEL, 0, NULL, true)) | 358 | GFP_KERNEL, 0, NULL, true)) |
358 | pr_info("OOM request ignored because killer is disabled\n"); | 359 | pr_info("OOM request ignored because killer is disabled\n"); |
360 | mutex_unlock(&oom_lock); | ||
359 | } | 361 | } |
360 | 362 | ||
361 | static DECLARE_WORK(moom_work, moom_callback); | 363 | static DECLARE_WORK(moom_work, moom_callback); |
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index c4211a31612d..d88f36754bf7 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c | |||
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void) | |||
381 | #ifdef CONFIG_FRONTSWAP | 381 | #ifdef CONFIG_FRONTSWAP |
382 | if (tmem_enabled && frontswap) { | 382 | if (tmem_enabled && frontswap) { |
383 | char *s = ""; | 383 | char *s = ""; |
384 | struct frontswap_ops *old_ops; | ||
385 | 384 | ||
386 | tmem_frontswap_poolid = -1; | 385 | tmem_frontswap_poolid = -1; |
387 | old_ops = frontswap_register_ops(&tmem_frontswap_ops); | 386 | frontswap_register_ops(&tmem_frontswap_ops); |
388 | if (IS_ERR(old_ops) || old_ops) { | ||
389 | if (IS_ERR(old_ops)) | ||
390 | return PTR_ERR(old_ops); | ||
391 | s = " (WARNING: frontswap_ops overridden)"; | ||
392 | } | ||
393 | pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", | 387 | pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", |
394 | s); | 388 | s); |
395 | } | 389 | } |
diff --git a/fs/configfs/item.c b/fs/configfs/item.c index e65f9ffbb999..4d6a30e76168 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c | |||
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref); | |||
47 | * config_item_init - initialize item. | 47 | * config_item_init - initialize item. |
48 | * @item: item in question. | 48 | * @item: item in question. |
49 | */ | 49 | */ |
50 | void config_item_init(struct config_item *item) | 50 | static void config_item_init(struct config_item *item) |
51 | { | 51 | { |
52 | kref_init(&item->ci_kref); | 52 | kref_init(&item->ci_kref); |
53 | INIT_LIST_HEAD(&item->ci_entry); | 53 | INIT_LIST_HEAD(&item->ci_entry); |
54 | } | 54 | } |
55 | EXPORT_SYMBOL(config_item_init); | ||
56 | 55 | ||
57 | /** | 56 | /** |
58 | * config_item_set_name - Set the name of an item | 57 | * config_item_set_name - Set the name of an item |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87724c1d7be6..0cf74df68617 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
130 | goto out; | 130 | goto out; |
131 | 131 | ||
132 | ret = 0; | 132 | ret = 0; |
133 | hugetlb_prefault_arch_hook(vma->vm_mm); | ||
134 | if (vma->vm_flags & VM_WRITE && inode->i_size < len) | 133 | if (vma->vm_flags & VM_WRITE && inode->i_size < len) |
135 | inode->i_size = len; | 134 | inode->i_size = len; |
136 | out: | 135 | out: |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7bb487e663b4..2cd653670764 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, | |||
525 | } | 525 | } |
526 | } | 526 | } |
527 | err = add_to_page_cache_lru(*cached_page, mapping, | 527 | err = add_to_page_cache_lru(*cached_page, mapping, |
528 | index, GFP_KERNEL); | 528 | index, |
529 | GFP_KERNEL & mapping_gfp_mask(mapping)); | ||
529 | if (unlikely(err)) { | 530 | if (unlikely(err)) { |
530 | if (err == -EEXIST) | 531 | if (err == -EEXIST) |
531 | continue; | 532 | continue; |
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index a44b14cbceeb..ab172e5f51d9 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h | |||
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) | |||
85 | 85 | ||
86 | static inline void ntfs_free(void *addr) | 86 | static inline void ntfs_free(void *addr) |
87 | { | 87 | { |
88 | if (!is_vmalloc_addr(addr)) { | 88 | kvfree(addr); |
89 | kfree(addr); | ||
90 | /* free_page((unsigned long)addr); */ | ||
91 | return; | ||
92 | } | ||
93 | vfree(addr); | ||
94 | } | 89 | } |
95 | 90 | ||
96 | #endif /* _LINUX_NTFS_MALLOC_H */ | 91 | #endif /* _LINUX_NTFS_MALLOC_H */ |
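
The ntfs_free() change above relies on kvfree(), which frees memory obtained from either kmalloc() or vmalloc() and so removes the caller's own is_vmalloc_addr() branch. A small sketch of the pattern, with hypothetical helper names not taken from ntfs:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Illustrative only: try kmalloc() first and fall back to vmalloc()
 * for large buffers; a single kvfree() then releases either kind. */
static void *example_alloc(size_t size)
{
	void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

	return p ? p : vmalloc(size);
}

static void example_free(void *p)
{
	kvfree(p);
}
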
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2d7f76e52c37..5997c00a1515 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, | |||
2925 | struct ocfs2_path *right_path = NULL; | 2925 | struct ocfs2_path *right_path = NULL; |
2926 | struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); | 2926 | struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); |
2927 | 2927 | ||
2928 | BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); | 2928 | if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) |
2929 | return 0; | ||
2929 | 2930 | ||
2930 | *empty_extent_path = NULL; | 2931 | *empty_extent_path = NULL; |
2931 | 2932 | ||
@@ -4311,13 +4312,13 @@ out: | |||
4311 | return ret; | 4312 | return ret; |
4312 | } | 4313 | } |
4313 | 4314 | ||
4314 | static enum ocfs2_contig_type | 4315 | static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, |
4315 | ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | ||
4316 | struct ocfs2_path *path, | 4316 | struct ocfs2_path *path, |
4317 | struct ocfs2_extent_list *el, int index, | 4317 | struct ocfs2_extent_list *el, int index, |
4318 | struct ocfs2_extent_rec *split_rec) | 4318 | struct ocfs2_extent_rec *split_rec, |
4319 | struct ocfs2_merge_ctxt *ctxt) | ||
4319 | { | 4320 | { |
4320 | int status; | 4321 | int status = 0; |
4321 | enum ocfs2_contig_type ret = CONTIG_NONE; | 4322 | enum ocfs2_contig_type ret = CONTIG_NONE; |
4322 | u32 left_cpos, right_cpos; | 4323 | u32 left_cpos, right_cpos; |
4323 | struct ocfs2_extent_rec *rec = NULL; | 4324 | struct ocfs2_extent_rec *rec = NULL; |
@@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
4336 | 4337 | ||
4337 | if (left_cpos != 0) { | 4338 | if (left_cpos != 0) { |
4338 | left_path = ocfs2_new_path_from_path(path); | 4339 | left_path = ocfs2_new_path_from_path(path); |
4339 | if (!left_path) | 4340 | if (!left_path) { |
4341 | status = -ENOMEM; | ||
4342 | mlog_errno(status); | ||
4340 | goto exit; | 4343 | goto exit; |
4344 | } | ||
4341 | 4345 | ||
4342 | status = ocfs2_find_path(et->et_ci, left_path, | 4346 | status = ocfs2_find_path(et->et_ci, left_path, |
4343 | left_cpos); | 4347 | left_cpos); |
@@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
4392 | goto free_left_path; | 4396 | goto free_left_path; |
4393 | 4397 | ||
4394 | right_path = ocfs2_new_path_from_path(path); | 4398 | right_path = ocfs2_new_path_from_path(path); |
4395 | if (!right_path) | 4399 | if (!right_path) { |
4400 | status = -ENOMEM; | ||
4401 | mlog_errno(status); | ||
4396 | goto free_left_path; | 4402 | goto free_left_path; |
4403 | } | ||
4397 | 4404 | ||
4398 | status = ocfs2_find_path(et->et_ci, right_path, right_cpos); | 4405 | status = ocfs2_find_path(et->et_ci, right_path, right_cpos); |
4399 | if (status) | 4406 | if (status) |
@@ -4433,7 +4440,10 @@ free_right_path: | |||
4433 | free_left_path: | 4440 | free_left_path: |
4434 | ocfs2_free_path(left_path); | 4441 | ocfs2_free_path(left_path); |
4435 | exit: | 4442 | exit: |
4436 | return ret; | 4443 | if (status == 0) |
4444 | ctxt->c_contig_type = ret; | ||
4445 | |||
4446 | return status; | ||
4437 | } | 4447 | } |
4438 | 4448 | ||
4439 | static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, | 4449 | static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, |
@@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle, | |||
5039 | goto out; | 5049 | goto out; |
5040 | } | 5050 | } |
5041 | 5051 | ||
5042 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, | 5052 | ret = ocfs2_figure_merge_contig_type(et, path, el, |
5043 | split_index, | 5053 | split_index, |
5044 | split_rec); | 5054 | split_rec, |
5055 | &ctxt); | ||
5056 | if (ret) { | ||
5057 | mlog_errno(ret); | ||
5058 | goto out; | ||
5059 | } | ||
5045 | 5060 | ||
5046 | /* | 5061 | /* |
5047 | * The core merge / split code wants to know how much room is | 5062 | * The core merge / split code wants to know how much room is |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f906a250da6a..1a35c6139656 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
523 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | 523 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; |
524 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 524 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
525 | unsigned long len = bh_result->b_size; | 525 | unsigned long len = bh_result->b_size; |
526 | unsigned int clusters_to_alloc = 0; | 526 | unsigned int clusters_to_alloc = 0, contig_clusters = 0; |
527 | 527 | ||
528 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); | 528 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); |
529 | 529 | ||
@@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
560 | /* fill hole, allocate blocks can't be larger than the size | 560 | /* fill hole, allocate blocks can't be larger than the size |
561 | * of the hole */ | 561 | * of the hole */ |
562 | clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); | 562 | clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); |
563 | if (clusters_to_alloc > contig_blocks) | 563 | contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, |
564 | clusters_to_alloc = contig_blocks; | 564 | contig_blocks); |
565 | if (clusters_to_alloc > contig_clusters) | ||
566 | clusters_to_alloc = contig_clusters; | ||
565 | 567 | ||
566 | /* allocate extent and insert them into the extent tree */ | 568 | /* allocate extent and insert them into the extent tree */ |
567 | ret = ocfs2_extend_allocation(inode, cpos, | 569 | ret = ocfs2_extend_allocation(inode, cpos, |
@@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
619 | /* this io's submitter should not have unlocked this before we could */ | 621 | /* this io's submitter should not have unlocked this before we could */ |
620 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 622 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
621 | 623 | ||
622 | if (ocfs2_iocb_is_sem_locked(iocb)) | ||
623 | ocfs2_iocb_clear_sem_locked(iocb); | ||
624 | |||
625 | if (ocfs2_iocb_is_unaligned_aio(iocb)) { | 624 | if (ocfs2_iocb_is_unaligned_aio(iocb)) { |
626 | ocfs2_iocb_clear_unaligned_aio(iocb); | 625 | ocfs2_iocb_clear_unaligned_aio(iocb); |
627 | 626 | ||
@@ -925,13 +924,23 @@ clean_orphan: | |||
925 | int update_isize = written > 0 ? 1 : 0; | 924 | int update_isize = written > 0 ? 1 : 0; |
926 | loff_t end = update_isize ? offset + written : 0; | 925 | loff_t end = update_isize ? offset + written : 0; |
927 | 926 | ||
928 | tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, | 927 | tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); |
928 | if (tmp_ret < 0) { | ||
929 | ret = tmp_ret; | ||
930 | mlog_errno(ret); | ||
931 | goto out; | ||
932 | } | ||
933 | |||
934 | tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, | ||
929 | update_isize, end); | 935 | update_isize, end); |
930 | if (tmp_ret < 0) { | 936 | if (tmp_ret < 0) { |
931 | ret = tmp_ret; | 937 | ret = tmp_ret; |
938 | mlog_errno(ret); | ||
932 | goto out; | 939 | goto out; |
933 | } | 940 | } |
934 | 941 | ||
942 | ocfs2_inode_unlock(inode, 1); | ||
943 | |||
935 | tmp_ret = jbd2_journal_force_commit(journal); | 944 | tmp_ret = jbd2_journal_force_commit(journal); |
936 | if (tmp_ret < 0) { | 945 | if (tmp_ret < 0) { |
937 | ret = tmp_ret; | 946 | ret = tmp_ret; |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index dd59599b022d..24e496d6bdcd 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) | |||
79 | enum ocfs2_iocb_lock_bits { | 79 | enum ocfs2_iocb_lock_bits { |
80 | OCFS2_IOCB_RW_LOCK = 0, | 80 | OCFS2_IOCB_RW_LOCK = 0, |
81 | OCFS2_IOCB_RW_LOCK_LEVEL, | 81 | OCFS2_IOCB_RW_LOCK_LEVEL, |
82 | OCFS2_IOCB_SEM, | ||
83 | OCFS2_IOCB_UNALIGNED_IO, | 82 | OCFS2_IOCB_UNALIGNED_IO, |
84 | OCFS2_IOCB_NUM_LOCKS | 83 | OCFS2_IOCB_NUM_LOCKS |
85 | }; | 84 | }; |
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits { | |||
88 | clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) | 87 | clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) |
89 | #define ocfs2_iocb_rw_locked_level(iocb) \ | 88 | #define ocfs2_iocb_rw_locked_level(iocb) \ |
90 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) | 89 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) |
91 | #define ocfs2_iocb_set_sem_locked(iocb) \ | ||
92 | set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | ||
93 | #define ocfs2_iocb_clear_sem_locked(iocb) \ | ||
94 | clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | ||
95 | #define ocfs2_iocb_is_sem_locked(iocb) \ | ||
96 | test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | ||
97 | 90 | ||
98 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ | 91 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ |
99 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | 92 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) |
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index af7598bff1b5..dfe162f5fd4c 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c | |||
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) | |||
64 | return count; | 64 | return count; |
65 | } | 65 | } |
66 | 66 | ||
67 | void __mlog_printk(const u64 *mask, const char *func, int line, | ||
68 | const char *fmt, ...) | ||
69 | { | ||
70 | struct va_format vaf; | ||
71 | va_list args; | ||
72 | const char *level; | ||
73 | const char *prefix = ""; | ||
74 | |||
75 | if (!__mlog_test_u64(*mask, mlog_and_bits) || | ||
76 | __mlog_test_u64(*mask, mlog_not_bits)) | ||
77 | return; | ||
78 | |||
79 | if (*mask & ML_ERROR) { | ||
80 | level = KERN_ERR; | ||
81 | prefix = "ERROR: "; | ||
82 | } else if (*mask & ML_NOTICE) { | ||
83 | level = KERN_NOTICE; | ||
84 | } else { | ||
85 | level = KERN_INFO; | ||
86 | } | ||
87 | |||
88 | va_start(args, fmt); | ||
89 | |||
90 | vaf.fmt = fmt; | ||
91 | vaf.va = &args; | ||
92 | |||
93 | printk("%s(%s,%u,%u):%s:%d %s%pV", | ||
94 | level, current->comm, task_pid_nr(current), | ||
95 | raw_smp_processor_id(), func, line, prefix, &vaf); | ||
96 | |||
97 | va_end(args); | ||
98 | } | ||
99 | EXPORT_SYMBOL_GPL(__mlog_printk); | ||
100 | |||
67 | struct mlog_attribute { | 101 | struct mlog_attribute { |
68 | struct attribute attr; | 102 | struct attribute attr; |
69 | u64 mask; | 103 | u64 mask; |
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7fdc25a4d8c0..308ea0eb35fd 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h | |||
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; | |||
162 | 162 | ||
163 | #endif | 163 | #endif |
164 | 164 | ||
165 | /* | 165 | __printf(4, 5) |
166 | * smp_processor_id() "helpfully" screams when called outside preemptible | 166 | void __mlog_printk(const u64 *m, const char *func, int line, |
167 | * regions in current kernels. sles doesn't have the variants that don't | 167 | const char *fmt, ...); |
168 | * scream. just do this instead of trying to guess which we're building | ||
169 | * against.. *sigh*. | ||
170 | */ | ||
171 | #define __mlog_cpu_guess ({ \ | ||
172 | unsigned long _cpu = get_cpu(); \ | ||
173 | put_cpu(); \ | ||
174 | _cpu; \ | ||
175 | }) | ||
176 | 168 | ||
177 | /* In the following two macros, the whitespace after the ',' just | 169 | /* |
178 | * before ##args is intentional. Otherwise, gcc 2.95 will eat the | 170 | * Testing before the __mlog_printk call lets the compiler eliminate the |
179 | * previous token if args expands to nothing. | 171 | * call completely when (m & ML_ALLOWED_BITS) is 0. |
180 | */ | 172 | */ |
181 | #define __mlog_printk(level, fmt, args...) \ | 173 | #define mlog(mask, fmt, ...) \ |
182 | printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ | 174 | do { \ |
183 | task_pid_nr(current), __mlog_cpu_guess, \ | 175 | u64 _m = MLOG_MASK_PREFIX | (mask); \ |
184 | __PRETTY_FUNCTION__, __LINE__ , ##args) | 176 | if (_m & ML_ALLOWED_BITS) \ |
185 | 177 | __mlog_printk(&_m, __func__, __LINE__, fmt, \ | |
186 | #define mlog(mask, fmt, args...) do { \ | 178 | ##__VA_ARGS__); \ |
187 | u64 __m = MLOG_MASK_PREFIX | (mask); \ | ||
188 | if ((__m & ML_ALLOWED_BITS) && \ | ||
189 | __mlog_test_u64(__m, mlog_and_bits) && \ | ||
190 | !__mlog_test_u64(__m, mlog_not_bits)) { \ | ||
191 | if (__m & ML_ERROR) \ | ||
192 | __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ | ||
193 | else if (__m & ML_NOTICE) \ | ||
194 | __mlog_printk(KERN_NOTICE, fmt , ##args); \ | ||
195 | else __mlog_printk(KERN_INFO, fmt , ##args); \ | ||
196 | } \ | ||
197 | } while (0) | 179 | } while (0) |
198 | 180 | ||
199 | #define mlog_errno(st) ({ \ | 181 | #define mlog_errno(st) ({ \ |
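
After the masklog rework above, mlog() call sites keep the same form; only the level selection and formatting move into the out-of-line __mlog_printk(), which emits the arguments through %pV. A hedged usage sketch (it assumes the usual per-file MLOG_MASK_PREFIX definition and include of cluster/masklog.h; the function name is illustrative):

/* Illustrative only: an ordinary ocfs2 call site after the refactor.
 * ML_ERROR selects KERN_ERR plus the "ERROR: " prefix inside
 * __mlog_printk(); ML_NOTICE and other masks use lower levels. */
static void example_report(int status)
{
	if (status) {
		mlog(ML_ERROR, "example operation failed with status %d\n",
		     status);
		mlog_errno(status);
	}
}
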
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 56c403a563bc..2d0acd6678fe 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -2204,7 +2204,7 @@ out: | |||
2204 | kfree(o2net_hand); | 2204 | kfree(o2net_hand); |
2205 | kfree(o2net_keep_req); | 2205 | kfree(o2net_keep_req); |
2206 | kfree(o2net_keep_resp); | 2206 | kfree(o2net_keep_resp); |
2207 | 2207 | o2net_debugfs_exit(); | |
2208 | o2quo_exit(); | 2208 | o2quo_exit(); |
2209 | return -ENOMEM; | 2209 | return -ENOMEM; |
2210 | } | 2210 | } |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ccd4dcfc3645..02878a83f0b4 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle, | |||
1617 | struct ocfs2_dir_entry *de, *de1; | 1617 | struct ocfs2_dir_entry *de, *de1; |
1618 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; | 1618 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; |
1619 | struct super_block *sb = dir->i_sb; | 1619 | struct super_block *sb = dir->i_sb; |
1620 | int retval, status; | 1620 | int retval; |
1621 | unsigned int size = sb->s_blocksize; | 1621 | unsigned int size = sb->s_blocksize; |
1622 | struct buffer_head *insert_bh = lookup->dl_leaf_bh; | 1622 | struct buffer_head *insert_bh = lookup->dl_leaf_bh; |
1623 | char *data_start = insert_bh->b_data; | 1623 | char *data_start = insert_bh->b_data; |
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle, | |||
1695 | } | 1695 | } |
1696 | 1696 | ||
1697 | if (insert_bh == parent_fe_bh) | 1697 | if (insert_bh == parent_fe_bh) |
1698 | status = ocfs2_journal_access_di(handle, | 1698 | retval = ocfs2_journal_access_di(handle, |
1699 | INODE_CACHE(dir), | 1699 | INODE_CACHE(dir), |
1700 | insert_bh, | 1700 | insert_bh, |
1701 | OCFS2_JOURNAL_ACCESS_WRITE); | 1701 | OCFS2_JOURNAL_ACCESS_WRITE); |
1702 | else { | 1702 | else { |
1703 | status = ocfs2_journal_access_db(handle, | 1703 | retval = ocfs2_journal_access_db(handle, |
1704 | INODE_CACHE(dir), | 1704 | INODE_CACHE(dir), |
1705 | insert_bh, | 1705 | insert_bh, |
1706 | OCFS2_JOURNAL_ACCESS_WRITE); | 1706 | OCFS2_JOURNAL_ACCESS_WRITE); |
1707 | 1707 | ||
1708 | if (ocfs2_dir_indexed(dir)) { | 1708 | if (!retval && ocfs2_dir_indexed(dir)) |
1709 | status = ocfs2_dx_dir_insert(dir, | 1709 | retval = ocfs2_dx_dir_insert(dir, |
1710 | handle, | 1710 | handle, |
1711 | lookup); | 1711 | lookup); |
1712 | if (status) { | 1712 | } |
1713 | mlog_errno(status); | 1713 | |
1714 | goto bail; | 1714 | if (retval) { |
1715 | } | 1715 | mlog_errno(retval); |
1716 | } | 1716 | goto bail; |
1717 | } | 1717 | } |
1718 | 1718 | ||
1719 | /* By now the buffer is marked for journaling */ | 1719 | /* By now the buffer is marked for journaling */ |
@@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size) | |||
3543 | { | 3543 | { |
3544 | struct ocfs2_dx_entry *entry1 = a; | 3544 | struct ocfs2_dx_entry *entry1 = a; |
3545 | struct ocfs2_dx_entry *entry2 = b; | 3545 | struct ocfs2_dx_entry *entry2 = b; |
3546 | struct ocfs2_dx_entry tmp; | ||
3547 | 3546 | ||
3548 | BUG_ON(size != sizeof(*entry1)); | 3547 | BUG_ON(size != sizeof(*entry1)); |
3549 | 3548 | ||
3550 | tmp = *entry1; | 3549 | swap(*entry1, *entry2); |
3551 | *entry1 = *entry2; | ||
3552 | *entry2 = tmp; | ||
3553 | } | 3550 | } |
3554 | 3551 | ||
3555 | static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) | 3552 | static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) |
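
dx_leaf_sort_swap() above now uses the generic swap() macro from <linux/kernel.h>, which performs the typeof()-based three-step exchange itself, so the local temporary goes away. A trivial sketch of the same macro on plain values (helper name illustrative):

#include <linux/kernel.h>

/* Illustrative only: swap() exchanges two lvalues of the same type,
 * which is exactly what the removed open-coded tmp dance did. */
static void example_swap_ints(int *a, int *b)
{
	swap(*a, *b);
}
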
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fae17c640df3..e88ccf8c83ff 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
1014 | 1014 | ||
1015 | /* will exit holding res->spinlock, but may drop in function */ | 1015 | /* will exit holding res->spinlock, but may drop in function */ |
1016 | void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); | 1016 | void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); |
1017 | void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); | ||
1018 | 1017 | ||
1019 | /* will exit holding res->spinlock, but may drop in function */ | 1018 | /* will exit holding res->spinlock, but may drop in function */ |
1020 | static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) | 1019 | static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d8b670cbd909..fbfadb289e62 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2250,7 +2250,7 @@ out: | |||
2250 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | 2250 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, |
2251 | struct iov_iter *from) | 2251 | struct iov_iter *from) |
2252 | { | 2252 | { |
2253 | int direct_io, appending, rw_level, have_alloc_sem = 0; | 2253 | int direct_io, appending, rw_level; |
2254 | int can_do_direct, has_refcount = 0; | 2254 | int can_do_direct, has_refcount = 0; |
2255 | ssize_t written = 0; | 2255 | ssize_t written = 0; |
2256 | ssize_t ret; | 2256 | ssize_t ret; |
@@ -2279,16 +2279,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | |||
2279 | 2279 | ||
2280 | mutex_lock(&inode->i_mutex); | 2280 | mutex_lock(&inode->i_mutex); |
2281 | 2281 | ||
2282 | ocfs2_iocb_clear_sem_locked(iocb); | ||
2283 | |||
2284 | relock: | 2282 | relock: |
2285 | /* to match setattr's i_mutex -> rw_lock ordering */ | ||
2286 | if (direct_io) { | ||
2287 | have_alloc_sem = 1; | ||
2288 | /* communicate with ocfs2_dio_end_io */ | ||
2289 | ocfs2_iocb_set_sem_locked(iocb); | ||
2290 | } | ||
2291 | |||
2292 | /* | 2283 | /* |
2293 | * Concurrent O_DIRECT writes are allowed with | 2284 | * Concurrent O_DIRECT writes are allowed with |
2294 | * mount_option "coherency=buffered". | 2285 | * mount_option "coherency=buffered". |
@@ -2298,7 +2289,7 @@ relock: | |||
2298 | ret = ocfs2_rw_lock(inode, rw_level); | 2289 | ret = ocfs2_rw_lock(inode, rw_level); |
2299 | if (ret < 0) { | 2290 | if (ret < 0) { |
2300 | mlog_errno(ret); | 2291 | mlog_errno(ret); |
2301 | goto out_sems; | 2292 | goto out_mutex; |
2302 | } | 2293 | } |
2303 | 2294 | ||
2304 | /* | 2295 | /* |
@@ -2347,7 +2338,6 @@ relock: | |||
2347 | if (direct_io && !can_do_direct) { | 2338 | if (direct_io && !can_do_direct) { |
2348 | ocfs2_rw_unlock(inode, rw_level); | 2339 | ocfs2_rw_unlock(inode, rw_level); |
2349 | 2340 | ||
2350 | have_alloc_sem = 0; | ||
2351 | rw_level = -1; | 2341 | rw_level = -1; |
2352 | 2342 | ||
2353 | direct_io = 0; | 2343 | direct_io = 0; |
@@ -2416,7 +2406,6 @@ no_sync: | |||
2416 | */ | 2406 | */ |
2417 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | 2407 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { |
2418 | rw_level = -1; | 2408 | rw_level = -1; |
2419 | have_alloc_sem = 0; | ||
2420 | unaligned_dio = 0; | 2409 | unaligned_dio = 0; |
2421 | } | 2410 | } |
2422 | 2411 | ||
@@ -2429,10 +2418,7 @@ out: | |||
2429 | if (rw_level != -1) | 2418 | if (rw_level != -1) |
2430 | ocfs2_rw_unlock(inode, rw_level); | 2419 | ocfs2_rw_unlock(inode, rw_level); |
2431 | 2420 | ||
2432 | out_sems: | 2421 | out_mutex: |
2433 | if (have_alloc_sem) | ||
2434 | ocfs2_iocb_clear_sem_locked(iocb); | ||
2435 | |||
2436 | mutex_unlock(&inode->i_mutex); | 2422 | mutex_unlock(&inode->i_mutex); |
2437 | 2423 | ||
2438 | if (written) | 2424 | if (written) |
@@ -2473,7 +2459,7 @@ bail: | |||
2473 | static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, | 2459 | static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, |
2474 | struct iov_iter *to) | 2460 | struct iov_iter *to) |
2475 | { | 2461 | { |
2476 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; | 2462 | int ret = 0, rw_level = -1, lock_level = 0; |
2477 | struct file *filp = iocb->ki_filp; | 2463 | struct file *filp = iocb->ki_filp; |
2478 | struct inode *inode = file_inode(filp); | 2464 | struct inode *inode = file_inode(filp); |
2479 | 2465 | ||
@@ -2490,16 +2476,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, | |||
2490 | goto bail; | 2476 | goto bail; |
2491 | } | 2477 | } |
2492 | 2478 | ||
2493 | ocfs2_iocb_clear_sem_locked(iocb); | ||
2494 | |||
2495 | /* | 2479 | /* |
2496 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | 2480 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads |
2497 | * need locks to protect pending reads from racing with truncate. | 2481 | * need locks to protect pending reads from racing with truncate. |
2498 | */ | 2482 | */ |
2499 | if (iocb->ki_flags & IOCB_DIRECT) { | 2483 | if (iocb->ki_flags & IOCB_DIRECT) { |
2500 | have_alloc_sem = 1; | ||
2501 | ocfs2_iocb_set_sem_locked(iocb); | ||
2502 | |||
2503 | ret = ocfs2_rw_lock(inode, 0); | 2484 | ret = ocfs2_rw_lock(inode, 0); |
2504 | if (ret < 0) { | 2485 | if (ret < 0) { |
2505 | mlog_errno(ret); | 2486 | mlog_errno(ret); |
@@ -2535,13 +2516,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, | |||
2535 | /* see ocfs2_file_write_iter */ | 2516 | /* see ocfs2_file_write_iter */ |
2536 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 2517 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
2537 | rw_level = -1; | 2518 | rw_level = -1; |
2538 | have_alloc_sem = 0; | ||
2539 | } | 2519 | } |
2540 | 2520 | ||
2541 | bail: | 2521 | bail: |
2542 | if (have_alloc_sem) | ||
2543 | ocfs2_iocb_clear_sem_locked(iocb); | ||
2544 | |||
2545 | if (rw_level != -1) | 2522 | if (rw_level != -1) |
2546 | ocfs2_rw_unlock(inode, rw_level); | 2523 | ocfs2_rw_unlock(inode, rw_level); |
2547 | 2524 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff531928269e..7c099f7032fd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -108,7 +108,7 @@ struct ocfs2_replay_map { | |||
108 | unsigned char rm_replay_slots[0]; | 108 | unsigned char rm_replay_slots[0]; |
109 | }; | 109 | }; |
110 | 110 | ||
111 | void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) | 111 | static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) |
112 | { | 112 | { |
113 | if (!osb->replay_map) | 113 | if (!osb->replay_map) |
114 | return; | 114 | return; |
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) | |||
153 | return 0; | 153 | return 0; |
154 | } | 154 | } |
155 | 155 | ||
156 | void ocfs2_queue_replay_slots(struct ocfs2_super *osb, | 156 | static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, |
157 | enum ocfs2_orphan_reco_type orphan_reco_type) | 157 | enum ocfs2_orphan_reco_type orphan_reco_type) |
158 | { | 158 | { |
159 | struct ocfs2_replay_map *replay_map = osb->replay_map; | 159 | struct ocfs2_replay_map *replay_map = osb->replay_map; |
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb, | |||
173 | replay_map->rm_state = REPLAY_DONE; | 173 | replay_map->rm_state = REPLAY_DONE; |
174 | } | 174 | } |
175 | 175 | ||
176 | void ocfs2_free_replay_slots(struct ocfs2_super *osb) | 176 | static void ocfs2_free_replay_slots(struct ocfs2_super *osb) |
177 | { | 177 | { |
178 | struct ocfs2_replay_map *replay_map = osb->replay_map; | 178 | struct ocfs2_replay_map *replay_map = osb->replay_map; |
179 | 179 | ||
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, | |||
571 | (unsigned long)bh, | 571 | (unsigned long)bh, |
572 | (unsigned long long)bh->b_blocknr); | 572 | (unsigned long long)bh->b_blocknr); |
573 | 573 | ||
574 | /* We aren't guaranteed to have the superblock here - but if we | 574 | ocfs2_error(bh->b_bdev->bd_super, |
575 | * don't, it'll just crash. */ | ||
576 | ocfs2_error(bh->b_assoc_map->host->i_sb, | ||
577 | "JBD2 has aborted our journal, ocfs2 cannot continue\n"); | 575 | "JBD2 has aborted our journal, ocfs2 cannot continue\n"); |
578 | } | 576 | } |
579 | 577 | ||
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) | |||
775 | trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); | 773 | trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); |
776 | 774 | ||
777 | status = jbd2_journal_dirty_metadata(handle, bh); | 775 | status = jbd2_journal_dirty_metadata(handle, bh); |
778 | BUG_ON(status); | 776 | if (status) { |
777 | mlog_errno(status); | ||
778 | if (!is_handle_aborted(handle)) { | ||
779 | journal_t *journal = handle->h_transaction->t_journal; | ||
780 | struct super_block *sb = bh->b_bdev->bd_super; | ||
781 | |||
782 | mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " | ||
783 | "Aborting transaction and journal.\n"); | ||
784 | handle->h_err = status; | ||
785 | jbd2_journal_abort_handle(handle); | ||
786 | jbd2_journal_abort(journal, status); | ||
787 | ocfs2_abort(sb, "Journal already aborted.\n"); | ||
788 | } | ||
789 | } | ||
779 | } | 790 | } |
780 | 791 | ||
781 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) | 792 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) |
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) | |||
1884 | * hasn't happened. The node queues a scan and increments the | 1895 | * hasn't happened. The node queues a scan and increments the |
1885 | * sequence number in the LVB. | 1896 | * sequence number in the LVB. |
1886 | */ | 1897 | */ |
1887 | void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) | 1898 | static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) |
1888 | { | 1899 | { |
1889 | struct ocfs2_orphan_scan *os; | 1900 | struct ocfs2_orphan_scan *os; |
1890 | int status, i; | 1901 | int status, i; |
@@ -1933,7 +1944,7 @@ out: | |||
1933 | } | 1944 | } |
1934 | 1945 | ||
1935 | /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ | 1946 | /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ |
1936 | void ocfs2_orphan_scan_work(struct work_struct *work) | 1947 | static void ocfs2_orphan_scan_work(struct work_struct *work) |
1937 | { | 1948 | { |
1938 | struct ocfs2_orphan_scan *os; | 1949 | struct ocfs2_orphan_scan *os; |
1939 | struct ocfs2_super *osb; | 1950 | struct ocfs2_super *osb; |
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2137 | struct inode *inode = NULL; | 2148 | struct inode *inode = NULL; |
2138 | struct inode *iter; | 2149 | struct inode *iter; |
2139 | struct ocfs2_inode_info *oi; | 2150 | struct ocfs2_inode_info *oi; |
2151 | struct buffer_head *di_bh = NULL; | ||
2152 | struct ocfs2_dinode *di = NULL; | ||
2140 | 2153 | ||
2141 | trace_ocfs2_recover_orphans(slot); | 2154 | trace_ocfs2_recover_orphans(slot); |
2142 | 2155 | ||
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2157 | iter = oi->ip_next_orphan; | 2170 | iter = oi->ip_next_orphan; |
2158 | oi->ip_next_orphan = NULL; | 2171 | oi->ip_next_orphan = NULL; |
2159 | 2172 | ||
2173 | ret = ocfs2_rw_lock(inode, 1); | ||
2174 | if (ret < 0) { | ||
2175 | mlog_errno(ret); | ||
2176 | goto next; | ||
2177 | } | ||
2160 | /* | 2178 | /* |
2161 | * We need to take and drop the inode lock to | 2179 | * We need to take and drop the inode lock to |
2162 | * force read inode from disk. | 2180 | * force read inode from disk. |
2163 | */ | 2181 | */ |
2164 | ret = ocfs2_inode_lock(inode, NULL, 0); | 2182 | ret = ocfs2_inode_lock(inode, &di_bh, 1); |
2165 | if (ret) { | 2183 | if (ret) { |
2166 | mlog_errno(ret); | 2184 | mlog_errno(ret); |
2167 | goto next; | 2185 | goto unlock_rw; |
2168 | } | 2186 | } |
2169 | ocfs2_inode_unlock(inode, 0); | 2187 | |
2188 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
2170 | 2189 | ||
2171 | if (inode->i_nlink == 0) { | 2190 | if (inode->i_nlink == 0) { |
2172 | spin_lock(&oi->ip_lock); | 2191 | spin_lock(&oi->ip_lock); |
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2174 | * ocfs2_delete_inode. */ | 2193 | * ocfs2_delete_inode. */ |
2175 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | 2194 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; |
2176 | spin_unlock(&oi->ip_lock); | 2195 | spin_unlock(&oi->ip_lock); |
2177 | } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { | 2196 | } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && |
2178 | struct buffer_head *di_bh = NULL; | 2197 | (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { |
2179 | |||
2180 | ret = ocfs2_rw_lock(inode, 1); | ||
2181 | if (ret) { | ||
2182 | mlog_errno(ret); | ||
2183 | goto next; | ||
2184 | } | ||
2185 | |||
2186 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
2187 | if (ret < 0) { | ||
2188 | ocfs2_rw_unlock(inode, 1); | ||
2189 | mlog_errno(ret); | ||
2190 | goto next; | ||
2191 | } | ||
2192 | |||
2193 | ret = ocfs2_truncate_file(inode, di_bh, | 2198 | ret = ocfs2_truncate_file(inode, di_bh, |
2194 | i_size_read(inode)); | 2199 | i_size_read(inode)); |
2195 | ocfs2_inode_unlock(inode, 1); | ||
2196 | ocfs2_rw_unlock(inode, 1); | ||
2197 | brelse(di_bh); | ||
2198 | if (ret < 0) { | 2200 | if (ret < 0) { |
2199 | if (ret != -ENOSPC) | 2201 | if (ret != -ENOSPC) |
2200 | mlog_errno(ret); | 2202 | mlog_errno(ret); |
2201 | goto next; | 2203 | goto unlock_inode; |
2202 | } | 2204 | } |
2203 | 2205 | ||
2204 | ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); | 2206 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); |
2205 | if (ret) | 2207 | if (ret) |
2206 | mlog_errno(ret); | 2208 | mlog_errno(ret); |
2207 | 2209 | ||
2208 | wake_up(&OCFS2_I(inode)->append_dio_wq); | 2210 | wake_up(&OCFS2_I(inode)->append_dio_wq); |
2209 | } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ | 2211 | } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ |
2210 | 2212 | unlock_inode: | |
2213 | ocfs2_inode_unlock(inode, 1); | ||
2214 | unlock_rw: | ||
2215 | ocfs2_rw_unlock(inode, 1); | ||
2211 | next: | 2216 | next: |
2212 | iput(inode); | 2217 | iput(inode); |
2213 | 2218 | brelse(di_bh); | |
2219 | di_bh = NULL; | ||
2214 | inode = iter; | 2220 | inode = iter; |
2215 | } | 2221 | } |
2216 | 2222 | ||
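For readability, the per-inode locking order that the reworked ocfs2_recover_orphans() loop establishes can be condensed as follows; this is a summary drawn from the hunk above with error paths omitted, not additional code from the patch:

	ocfs2_rw_lock(inode, 1);                    /* taken once, up front */
	ocfs2_inode_lock(inode, &di_bh, 1);         /* exclusive, di_bh kept */
	if (orphan_reco_type == ORPHAN_NEED_TRUNCATE &&
	    (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
		ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
		ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
	}
	ocfs2_inode_unlock(inode, 1);
	ocfs2_rw_unlock(inode, 1);
	brelse(di_bh);                              /* released once per inode */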
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 176fe6afd94e..6e6abb93fda5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, | |||
1116 | int inode1_is_ancestor, inode2_is_ancestor; | 1116 | int inode1_is_ancestor, inode2_is_ancestor; |
1117 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); | 1117 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); |
1118 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); | 1118 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); |
1119 | struct buffer_head **tmpbh; | ||
1120 | struct inode *tmpinode; | ||
1121 | 1119 | ||
1122 | trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, | 1120 | trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, |
1123 | (unsigned long long)oi2->ip_blkno); | 1121 | (unsigned long long)oi2->ip_blkno); |
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, | |||
1148 | (oi1->ip_blkno < oi2->ip_blkno && | 1146 | (oi1->ip_blkno < oi2->ip_blkno && |
1149 | inode2_is_ancestor == 0)) { | 1147 | inode2_is_ancestor == 0)) { |
1150 | /* switch id1 and id2 around */ | 1148 | /* switch id1 and id2 around */ |
1151 | tmpbh = bh2; | 1149 | swap(bh2, bh1); |
1152 | bh2 = bh1; | 1150 | swap(inode2, inode1); |
1153 | bh1 = tmpbh; | ||
1154 | |||
1155 | tmpinode = inode2; | ||
1156 | inode2 = inode1; | ||
1157 | inode1 = tmpinode; | ||
1158 | } | 1151 | } |
1159 | /* lock id2 */ | 1152 | /* lock id2 */ |
1160 | status = ocfs2_inode_lock_nested(inode2, bh2, 1, | 1153 | status = ocfs2_inode_lock_nested(inode2, bh2, 1, |
@@ -2670,30 +2663,22 @@ bail: | |||
2670 | } | 2663 | } |
2671 | 2664 | ||
2672 | int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, | 2665 | int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, |
2673 | struct inode *inode, int update_isize, | 2666 | struct inode *inode, struct buffer_head *di_bh, |
2674 | loff_t end) | 2667 | int update_isize, loff_t end) |
2675 | { | 2668 | { |
2676 | struct inode *orphan_dir_inode = NULL; | 2669 | struct inode *orphan_dir_inode = NULL; |
2677 | struct buffer_head *orphan_dir_bh = NULL; | 2670 | struct buffer_head *orphan_dir_bh = NULL; |
2678 | struct buffer_head *di_bh = NULL; | 2671 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
2679 | struct ocfs2_dinode *di = NULL; | ||
2680 | handle_t *handle = NULL; | 2672 | handle_t *handle = NULL; |
2681 | int status = 0; | 2673 | int status = 0; |
2682 | 2674 | ||
2683 | status = ocfs2_inode_lock(inode, &di_bh, 1); | ||
2684 | if (status < 0) { | ||
2685 | mlog_errno(status); | ||
2686 | goto bail; | ||
2687 | } | ||
2688 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
2689 | |||
2690 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | 2675 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, |
2691 | ORPHAN_DIR_SYSTEM_INODE, | 2676 | ORPHAN_DIR_SYSTEM_INODE, |
2692 | le16_to_cpu(di->i_dio_orphaned_slot)); | 2677 | le16_to_cpu(di->i_dio_orphaned_slot)); |
2693 | if (!orphan_dir_inode) { | 2678 | if (!orphan_dir_inode) { |
2694 | status = -ENOENT; | 2679 | status = -ENOENT; |
2695 | mlog_errno(status); | 2680 | mlog_errno(status); |
2696 | goto bail_unlock_inode; | 2681 | goto bail; |
2697 | } | 2682 | } |
2698 | 2683 | ||
2699 | mutex_lock(&orphan_dir_inode->i_mutex); | 2684 | mutex_lock(&orphan_dir_inode->i_mutex); |
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, | |||
2702 | mutex_unlock(&orphan_dir_inode->i_mutex); | 2687 | mutex_unlock(&orphan_dir_inode->i_mutex); |
2703 | iput(orphan_dir_inode); | 2688 | iput(orphan_dir_inode); |
2704 | mlog_errno(status); | 2689 | mlog_errno(status); |
2705 | goto bail_unlock_inode; | 2690 | goto bail; |
2706 | } | 2691 | } |
2707 | 2692 | ||
2708 | handle = ocfs2_start_trans(osb, | 2693 | handle = ocfs2_start_trans(osb, |
@@ -2749,10 +2734,6 @@ bail_unlock_orphan: | |||
2749 | brelse(orphan_dir_bh); | 2734 | brelse(orphan_dir_bh); |
2750 | iput(orphan_dir_inode); | 2735 | iput(orphan_dir_inode); |
2751 | 2736 | ||
2752 | bail_unlock_inode: | ||
2753 | ocfs2_inode_unlock(inode, 1); | ||
2754 | brelse(di_bh); | ||
2755 | |||
2756 | bail: | 2737 | bail: |
2757 | return status; | 2738 | return status; |
2758 | } | 2739 | } |
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 5ddecce172fa..e173329eb830 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h | |||
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, | |||
42 | int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, | 42 | int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, |
43 | struct inode *inode); | 43 | struct inode *inode); |
44 | int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, | 44 | int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, |
45 | struct inode *inode, int update_isize, | 45 | struct inode *inode, struct buffer_head *di_bh, |
46 | loff_t end); | 46 | int update_isize, loff_t end); |
47 | int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, | 47 | int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, |
48 | struct inode *new_inode, | 48 | struct inode *new_inode, |
49 | struct dentry *new_dentry); | 49 | struct dentry *new_dentry); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 460c6c37e683..690ddc60189b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, | |||
717 | return (u64)clusters << c_to_b_bits; | 717 | return (u64)clusters << c_to_b_bits; |
718 | } | 718 | } |
719 | 719 | ||
720 | static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, | ||
721 | u64 blocks) | ||
722 | { | ||
723 | int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - | ||
724 | sb->s_blocksize_bits; | ||
725 | |||
726 | blocks += (1 << b_to_c_bits) - 1; | ||
727 | return (u32)(blocks >> b_to_c_bits); | ||
728 | } | ||
729 | |||
720 | static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, | 730 | static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, |
721 | u64 blocks) | 731 | u64 blocks) |
722 | { | 732 | { |
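As a worked example of the new round-up helper (the geometry is assumed for illustration, not taken from the patch): with 4 KiB filesystem blocks and 32 KiB clusters, b_to_c_bits is 3, so

	/* illustrative only: 4 KiB blocks, 32 KiB clusters => b_to_c_bits == 3 */
	u32 c = ocfs2_clusters_for_blocks(sb, 9);   /* (9 + 7) >> 3 == 2, rounded up   */
	u32 t = ocfs2_blocks_to_clusters(sb, 9);    /*  9      >> 3 == 1, rounded down */

i.e. the new function returns enough clusters to cover a partial trailing cluster, where the existing ocfs2_blocks_to_clusters() truncates.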
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d8c6af101f3f..b69dd14c0b9b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b) | |||
1406 | 1406 | ||
1407 | static void swap_refcount_rec(void *a, void *b, int size) | 1407 | static void swap_refcount_rec(void *a, void *b, int size) |
1408 | { | 1408 | { |
1409 | struct ocfs2_refcount_rec *l = a, *r = b, tmp; | 1409 | struct ocfs2_refcount_rec *l = a, *r = b; |
1410 | 1410 | ||
1411 | tmp = *l; | 1411 | swap(*l, *r); |
1412 | *l = *r; | ||
1413 | *r = tmp; | ||
1414 | } | 1412 | } |
1415 | 1413 | ||
1416 | /* | 1414 | /* |
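Both this hunk and the ocfs2_double_lock() change above rely on the generic swap() helper from <linux/kernel.h>; for reference, it is a type-checked three-assignment exchange along these lines (quoted from memory, so treat the exact definition as indicative):

	#define swap(a, b) \
		do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)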
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03bfbf3d27d..889f3796a0d7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, | |||
7271 | name, value, size, flags); | 7271 | name, value, size, flags); |
7272 | } | 7272 | } |
7273 | 7273 | ||
7274 | int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, | 7274 | static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
7275 | void *fs_info) | 7275 | void *fs_info) |
7276 | { | 7276 | { |
7277 | const struct xattr *xattr; | 7277 | const struct xattr *xattr; |
diff --git a/fs/proc/array.c b/fs/proc/array.c index fd02a9ebfc30..3f57dac31ba6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk) | |||
126 | { | 126 | { |
127 | unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; | 127 | unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; |
128 | 128 | ||
129 | /* | ||
130 | * Parked tasks do not run; they sit in __kthread_parkme(). | ||
131 | * Without this check, we would report them as running, which is | ||
132 | * clearly wrong, so we report them as sleeping instead. | ||
133 | */ | ||
134 | if (tsk->state == TASK_PARKED) | ||
135 | state = TASK_INTERRUPTIBLE; | ||
136 | |||
129 | BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); | 137 | BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); |
130 | 138 | ||
131 | return task_state_array[fls(state)]; | 139 | return task_state_array[fls(state)]; |
diff --git a/fs/splice.c b/fs/splice.c index 4f355a1c1a9e..5fc1e50a7f30 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
360 | break; | 360 | break; |
361 | 361 | ||
362 | error = add_to_page_cache_lru(page, mapping, index, | 362 | error = add_to_page_cache_lru(page, mapping, index, |
363 | GFP_KERNEL); | 363 | GFP_KERNEL & mapping_gfp_mask(mapping)); |
364 | if (unlikely(error)) { | 364 | if (unlikely(error)) { |
365 | page_cache_release(page); | 365 | page_cache_release(page); |
366 | if (error == -EEXIST) | 366 | if (error == -EEXIST) |
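The intersection with mapping_gfp_mask() matters when a filesystem restricts allocations for its page cache; a minimal sketch of the effect (a mapping with __GFP_FS cleared is an assumed example, not something this patch introduces):

	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
	/* if the owning filesystem cleared __GFP_FS in the mapping's mask,
	 * gfp now lacks __GFP_FS too, so the page-cache allocation above
	 * cannot recurse back into that filesystem under memory pressure */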
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index bd910ceaccfa..29c57b2cb344 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, | |||
96 | } | 96 | } |
97 | #endif | 97 | #endif |
98 | 98 | ||
99 | #ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR | 99 | #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR |
100 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 100 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
101 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | 101 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
102 | unsigned long address, | 102 | unsigned long address, |
103 | pmd_t *pmdp) | 103 | pmd_t *pmdp) |
104 | { | 104 | { |
105 | pmd_t pmd = *pmdp; | 105 | pmd_t pmd = *pmdp; |
106 | pmd_clear(pmdp); | 106 | pmd_clear(pmdp); |
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | |||
109 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 109 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
110 | #endif | 110 | #endif |
111 | 111 | ||
112 | #ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL | 112 | #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL |
113 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 113 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
114 | static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, | 114 | static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, |
115 | unsigned long address, pmd_t *pmdp, | 115 | unsigned long address, pmd_t *pmdp, |
116 | int full) | 116 | int full) |
117 | { | 117 | { |
118 | return pmdp_get_and_clear(mm, address, pmdp); | 118 | return pmdp_huge_get_and_clear(mm, address, pmdp); |
119 | } | 119 | } |
120 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 120 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
121 | #endif | 121 | #endif |
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma, | |||
152 | pte_t *ptep); | 152 | pte_t *ptep); |
153 | #endif | 153 | #endif |
154 | 154 | ||
155 | #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH | 155 | #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH |
156 | extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, | 156 | extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, |
157 | unsigned long address, | 157 | unsigned long address, |
158 | pmd_t *pmdp); | 158 | pmd_t *pmdp); |
159 | #endif | 159 | #endif |
@@ -189,6 +189,22 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, | |||
189 | unsigned long address, pmd_t *pmdp); | 189 | unsigned long address, pmd_t *pmdp); |
190 | #endif | 190 | #endif |
191 | 191 | ||
192 | #ifndef pmdp_collapse_flush | ||
193 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
194 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | ||
195 | unsigned long address, pmd_t *pmdp); | ||
196 | #else | ||
197 | static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | ||
198 | unsigned long address, | ||
199 | pmd_t *pmdp) | ||
200 | { | ||
201 | BUILD_BUG(); | ||
202 | return *pmdp; | ||
203 | } | ||
204 | #define pmdp_collapse_flush pmdp_collapse_flush | ||
205 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
206 | #endif | ||
207 | |||
192 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | 208 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT |
193 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | 209 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
194 | pgtable_t pgtable); | 210 | pgtable_t pgtable); |
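The "#ifndef pmdp_collapse_flush" guard follows the usual override pattern: an architecture that needs its own flush declares its function and defines the macro to the same name before this header is seen, which suppresses the generic declaration. A hedged sketch (the arch and file are purely illustrative):

	/* arch/foo/include/asm/pgtable.h */
	extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
					 unsigned long address, pmd_t *pmdp);
	#define pmdp_collapse_flush pmdp_collapse_flush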
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, | |||
357 | /* Only NUMA needs hash distribution. 64bit NUMA architectures have | 357 | /* Only NUMA needs hash distribution. 64bit NUMA architectures have |
358 | * sufficient vmalloc space. | 358 | * sufficient vmalloc space. |
359 | */ | 359 | */ |
360 | #if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) | 360 | #ifdef CONFIG_NUMA |
361 | #define HASHDIST_DEFAULT 1 | 361 | #define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) |
362 | extern int hashdist; /* Distribute hashes across NUMA nodes? */ | ||
362 | #else | 363 | #else |
363 | #define HASHDIST_DEFAULT 0 | 364 | #define hashdist (0) |
364 | #endif | 365 | #endif |
365 | extern int hashdist; /* Distribute hashes across NUMA nodes? */ | ||
366 | 366 | ||
367 | 367 | ||
368 | #endif /* _LINUX_BOOTMEM_H */ | 368 | #endif /* _LINUX_BOOTMEM_H */ |
diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h | |||
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) | |||
71 | return item->ci_name; | 71 | return item->ci_name; |
72 | } | 72 | } |
73 | 73 | ||
74 | extern void config_item_init(struct config_item *); | ||
75 | extern void config_item_init_type_name(struct config_item *item, | 74 | extern void config_item_init_type_name(struct config_item *item, |
76 | const char *name, | 75 | const char *name, |
77 | struct config_item_type *type); | 76 | struct config_item_type *type); |
diff --git a/include/linux/efi.h b/include/linux/efi.h index 2092965afca3..5f19efe4eb3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h | |||
@@ -96,6 +96,8 @@ typedef struct { | |||
96 | #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ | 96 | #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ |
97 | #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ | 97 | #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ |
98 | #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ | 98 | #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ |
99 | #define EFI_MEMORY_MORE_RELIABLE \ | ||
100 | ((u64)0x0000000000010000ULL) /* higher reliability */ | ||
99 | #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ | 101 | #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ |
100 | #define EFI_MEMORY_DESCRIPTOR_VERSION 1 | 102 | #define EFI_MEMORY_DESCRIPTOR_VERSION 1 |
101 | 103 | ||
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos | |||
868 | extern void efi_late_init(void); | 870 | extern void efi_late_init(void); |
869 | extern void efi_free_boot_services(void); | 871 | extern void efi_free_boot_services(void); |
870 | extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); | 872 | extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); |
873 | extern void efi_find_mirror(void); | ||
871 | #else | 874 | #else |
872 | static inline void efi_late_init(void) {} | 875 | static inline void efi_late_init(void) {} |
873 | static inline void efi_free_boot_services(void) {} | 876 | static inline void efi_free_boot_services(void) {} |
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 8293262401de..e65ef959546c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h | |||
@@ -6,16 +6,16 @@ | |||
6 | #include <linux/bitops.h> | 6 | #include <linux/bitops.h> |
7 | 7 | ||
8 | struct frontswap_ops { | 8 | struct frontswap_ops { |
9 | void (*init)(unsigned); | 9 | void (*init)(unsigned); /* this swap type was just swapon'ed */ |
10 | int (*store)(unsigned, pgoff_t, struct page *); | 10 | int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ |
11 | int (*load)(unsigned, pgoff_t, struct page *); | 11 | int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ |
12 | void (*invalidate_page)(unsigned, pgoff_t); | 12 | void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ |
13 | void (*invalidate_area)(unsigned); | 13 | void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ |
14 | struct frontswap_ops *next; /* private pointer to next ops */ | ||
14 | }; | 15 | }; |
15 | 16 | ||
16 | extern bool frontswap_enabled; | 17 | extern bool frontswap_enabled; |
17 | extern struct frontswap_ops * | 18 | extern void frontswap_register_ops(struct frontswap_ops *ops); |
18 | frontswap_register_ops(struct frontswap_ops *ops); | ||
19 | extern void frontswap_shrink(unsigned long); | 19 | extern void frontswap_shrink(unsigned long); |
20 | extern unsigned long frontswap_curr_pages(void); | 20 | extern unsigned long frontswap_curr_pages(void); |
21 | extern void frontswap_writethrough(bool); | 21 | extern void frontswap_writethrough(bool); |
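With frontswap_register_ops() now returning void and the ops carrying an internal next pointer, a backend registers roughly as below (the backend name and callbacks are hypothetical):

	static struct frontswap_ops example_ops = {
		.init            = example_init,
		.store           = example_store,
		.load            = example_load,
		.invalidate_page = example_invalidate_page,
		.invalidate_area = example_invalidate_area,
		/* .next is chained internally by frontswap_register_ops() */
	};

	frontswap_register_ops(&example_ops);	/* no return value to check */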
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0f313f93c586..65a517dd32f7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h | |||
@@ -84,8 +84,6 @@ struct fsnotify_fname; | |||
84 | * Each group must define these ops. The fsnotify infrastructure will call | 84 | * Each group must define these ops. The fsnotify infrastructure will call |
85 | * these operations for each relevant group. | 85 | * these operations for each relevant group. |
86 | * | 86 | * |
87 | * should_send_event - given a group, inode, and mask this function determines | ||
88 | * if the group is interested in this event. | ||
89 | * handle_event - main call for a group to handle an fs event | 87 | * handle_event - main call for a group to handle an fs event |
90 | * free_group_priv - called when a group refcnt hits 0 to clean up the private union | 88 | * free_group_priv - called when a group refcnt hits 0 to clean up the private union |
91 | * freeing_mark - called when a mark is being destroyed for some reason. The group | 89 | * freeing_mark - called when a mark is being destroyed for some reason. The group |
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index e705467ddb47..d0a1f99e24e3 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h | |||
@@ -28,7 +28,8 @@ | |||
28 | extern void kmemleak_init(void) __ref; | 28 | extern void kmemleak_init(void) __ref; |
29 | extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, | 29 | extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, |
30 | gfp_t gfp) __ref; | 30 | gfp_t gfp) __ref; |
31 | extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; | 31 | extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, |
32 | gfp_t gfp) __ref; | ||
32 | extern void kmemleak_free(const void *ptr) __ref; | 33 | extern void kmemleak_free(const void *ptr) __ref; |
33 | extern void kmemleak_free_part(const void *ptr, size_t size) __ref; | 34 | extern void kmemleak_free_part(const void *ptr, size_t size) __ref; |
34 | extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; | 35 | extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; |
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, | |||
71 | gfp_t gfp) | 72 | gfp_t gfp) |
72 | { | 73 | { |
73 | } | 74 | } |
74 | static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) | 75 | static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, |
76 | gfp_t gfp) | ||
75 | { | 77 | { |
76 | } | 78 | } |
77 | static inline void kmemleak_free(const void *ptr) | 79 | static inline void kmemleak_free(const void *ptr) |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..0215ffd63069 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -21,7 +21,11 @@ | |||
21 | #define INIT_PHYSMEM_REGIONS 4 | 21 | #define INIT_PHYSMEM_REGIONS 4 |
22 | 22 | ||
23 | /* Definition of memblock flags. */ | 23 | /* Definition of memblock flags. */ |
24 | #define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ | 24 | enum { |
25 | MEMBLOCK_NONE = 0x0, /* No special request */ | ||
26 | MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ | ||
27 | MEMBLOCK_MIRROR = 0x2, /* mirrored region */ | ||
28 | }; | ||
25 | 29 | ||
26 | struct memblock_region { | 30 | struct memblock_region { |
27 | phys_addr_t base; | 31 | phys_addr_t base; |
@@ -61,7 +65,7 @@ extern bool movable_node_enabled; | |||
61 | 65 | ||
62 | phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, | 66 | phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, |
63 | phys_addr_t start, phys_addr_t end, | 67 | phys_addr_t start, phys_addr_t end, |
64 | int nid); | 68 | int nid, ulong flags); |
65 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, | 69 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, |
66 | phys_addr_t size, phys_addr_t align); | 70 | phys_addr_t size, phys_addr_t align); |
67 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); | 71 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); |
@@ -75,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); | |||
75 | void memblock_trim_memory(phys_addr_t align); | 79 | void memblock_trim_memory(phys_addr_t align); |
76 | int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); | 80 | int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); |
77 | int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); | 81 | int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); |
82 | int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); | ||
83 | ulong choose_memblock_flags(void); | ||
78 | 84 | ||
79 | /* Low level functions */ | 85 | /* Low level functions */ |
80 | int memblock_add_range(struct memblock_type *type, | 86 | int memblock_add_range(struct memblock_type *type, |
@@ -85,11 +91,13 @@ int memblock_remove_range(struct memblock_type *type, | |||
85 | phys_addr_t base, | 91 | phys_addr_t base, |
86 | phys_addr_t size); | 92 | phys_addr_t size); |
87 | 93 | ||
88 | void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, | 94 | void __next_mem_range(u64 *idx, int nid, ulong flags, |
95 | struct memblock_type *type_a, | ||
89 | struct memblock_type *type_b, phys_addr_t *out_start, | 96 | struct memblock_type *type_b, phys_addr_t *out_start, |
90 | phys_addr_t *out_end, int *out_nid); | 97 | phys_addr_t *out_end, int *out_nid); |
91 | 98 | ||
92 | void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, | 99 | void __next_mem_range_rev(u64 *idx, int nid, ulong flags, |
100 | struct memblock_type *type_a, | ||
93 | struct memblock_type *type_b, phys_addr_t *out_start, | 101 | struct memblock_type *type_b, phys_addr_t *out_start, |
94 | phys_addr_t *out_end, int *out_nid); | 102 | phys_addr_t *out_end, int *out_nid); |
95 | 103 | ||
@@ -100,16 +108,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, | |||
100 | * @type_a: ptr to memblock_type to iterate | 108 | * @type_a: ptr to memblock_type to iterate |
101 | * @type_b: ptr to memblock_type which excludes from the iteration | 109 | * @type_b: ptr to memblock_type which excludes from the iteration |
102 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 110 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
111 | * @flags: pick from blocks based on memory attributes | ||
103 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 112 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
104 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 113 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
105 | * @p_nid: ptr to int for nid of the range, can be %NULL | 114 | * @p_nid: ptr to int for nid of the range, can be %NULL |
106 | */ | 115 | */ |
107 | #define for_each_mem_range(i, type_a, type_b, nid, \ | 116 | #define for_each_mem_range(i, type_a, type_b, nid, flags, \ |
108 | p_start, p_end, p_nid) \ | 117 | p_start, p_end, p_nid) \ |
109 | for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ | 118 | for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ |
110 | p_start, p_end, p_nid); \ | 119 | p_start, p_end, p_nid); \ |
111 | i != (u64)ULLONG_MAX; \ | 120 | i != (u64)ULLONG_MAX; \ |
112 | __next_mem_range(&i, nid, type_a, type_b, \ | 121 | __next_mem_range(&i, nid, flags, type_a, type_b, \ |
113 | p_start, p_end, p_nid)) | 122 | p_start, p_end, p_nid)) |
114 | 123 | ||
115 | /** | 124 | /** |
@@ -119,17 +128,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, | |||
119 | * @type_a: ptr to memblock_type to iterate | 128 | * @type_a: ptr to memblock_type to iterate |
120 | * @type_b: ptr to memblock_type which excludes from the iteration | 129 | * @type_b: ptr to memblock_type which excludes from the iteration |
121 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 130 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
131 | * @flags: pick from blocks based on memory attributes | ||
122 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 132 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
123 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 133 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
124 | * @p_nid: ptr to int for nid of the range, can be %NULL | 134 | * @p_nid: ptr to int for nid of the range, can be %NULL |
125 | */ | 135 | */ |
126 | #define for_each_mem_range_rev(i, type_a, type_b, nid, \ | 136 | #define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ |
127 | p_start, p_end, p_nid) \ | 137 | p_start, p_end, p_nid) \ |
128 | for (i = (u64)ULLONG_MAX, \ | 138 | for (i = (u64)ULLONG_MAX, \ |
129 | __next_mem_range_rev(&i, nid, type_a, type_b, \ | 139 | __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ |
130 | p_start, p_end, p_nid); \ | 140 | p_start, p_end, p_nid); \ |
131 | i != (u64)ULLONG_MAX; \ | 141 | i != (u64)ULLONG_MAX; \ |
132 | __next_mem_range_rev(&i, nid, type_a, type_b, \ | 142 | __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ |
133 | p_start, p_end, p_nid)) | 143 | p_start, p_end, p_nid)) |
134 | 144 | ||
135 | #ifdef CONFIG_MOVABLE_NODE | 145 | #ifdef CONFIG_MOVABLE_NODE |
@@ -153,6 +163,11 @@ static inline bool movable_node_is_enabled(void) | |||
153 | } | 163 | } |
154 | #endif | 164 | #endif |
155 | 165 | ||
166 | static inline bool memblock_is_mirror(struct memblock_region *m) | ||
167 | { | ||
168 | return m->flags & MEMBLOCK_MIRROR; | ||
169 | } | ||
170 | |||
156 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 171 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
157 | int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, | 172 | int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, |
158 | unsigned long *end_pfn); | 173 | unsigned long *end_pfn); |
@@ -181,13 +196,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | |||
181 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 196 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
182 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 197 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
183 | * @p_nid: ptr to int for nid of the range, can be %NULL | 198 | * @p_nid: ptr to int for nid of the range, can be %NULL |
199 | * @flags: pick from blocks based on memory attributes | ||
184 | * | 200 | * |
185 | * Walks over free (memory && !reserved) areas of memblock. Available as | 201 | * Walks over free (memory && !reserved) areas of memblock. Available as |
186 | * soon as memblock is initialized. | 202 | * soon as memblock is initialized. |
187 | */ | 203 | */ |
188 | #define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ | 204 | #define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ |
189 | for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ | 205 | for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ |
190 | nid, p_start, p_end, p_nid) | 206 | nid, flags, p_start, p_end, p_nid) |
191 | 207 | ||
192 | /** | 208 | /** |
193 | * for_each_free_mem_range_reverse - rev-iterate through free memblock areas | 209 | * for_each_free_mem_range_reverse - rev-iterate through free memblock areas |
@@ -196,13 +212,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | |||
196 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 212 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
197 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 213 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
198 | * @p_nid: ptr to int for nid of the range, can be %NULL | 214 | * @p_nid: ptr to int for nid of the range, can be %NULL |
215 | * @flags: pick from blocks based on memory attributes | ||
199 | * | 216 | * |
200 | * Walks over free (memory && !reserved) areas of memblock in reverse | 217 | * Walks over free (memory && !reserved) areas of memblock in reverse |
201 | * order. Available as soon as memblock is initialized. | 218 | * order. Available as soon as memblock is initialized. |
202 | */ | 219 | */ |
203 | #define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ | 220 | #define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ |
221 | p_nid) \ | ||
204 | for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ | 222 | for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ |
205 | nid, p_start, p_end, p_nid) | 223 | nid, flags, p_start, p_end, p_nid) |
206 | 224 | ||
207 | static inline void memblock_set_region_flags(struct memblock_region *r, | 225 | static inline void memblock_set_region_flags(struct memblock_region *r, |
208 | unsigned long flags) | 226 | unsigned long flags) |
@@ -273,7 +291,8 @@ static inline bool memblock_bottom_up(void) { return false; } | |||
273 | #define MEMBLOCK_ALLOC_ACCESSIBLE 0 | 291 | #define MEMBLOCK_ALLOC_ACCESSIBLE 0 |
274 | 292 | ||
275 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | 293 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, |
276 | phys_addr_t start, phys_addr_t end); | 294 | phys_addr_t start, phys_addr_t end, |
295 | ulong flags); | ||
277 | phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, | 296 | phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, |
278 | phys_addr_t max_addr); | 297 | phys_addr_t max_addr); |
279 | phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, | 298 | phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, |
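A hedged usage sketch of the extended iterator, preferring mirrored ranges when the platform reports them; the loop body is a placeholder:

	phys_addr_t start, end;
	ulong flags = choose_memblock_flags();	/* may include MEMBLOCK_MIRROR */
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, flags, &start, &end, NULL) {
		/* consider the free range [start, end) for the allocation */
	}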
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..4efc3f56e6df --- /dev/null +++ b/include/linux/mm-arch-hooks.h | |||
@@ -0,0 +1,25 @@ | |||
1 | /* | ||
2 | * Generic mm no-op hooks. | ||
3 | * | ||
4 | * Copyright (C) 2015, IBM Corporation | ||
5 | * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #ifndef _LINUX_MM_ARCH_HOOKS_H | ||
12 | #define _LINUX_MM_ARCH_HOOKS_H | ||
13 | |||
14 | #include <asm/mm-arch-hooks.h> | ||
15 | |||
16 | #ifndef arch_remap | ||
17 | static inline void arch_remap(struct mm_struct *mm, | ||
18 | unsigned long old_start, unsigned long old_end, | ||
19 | unsigned long new_start, unsigned long new_end) | ||
20 | { | ||
21 | } | ||
22 | #define arch_remap arch_remap | ||
23 | #endif | ||
24 | |||
25 | #endif /* _LINUX_MM_ARCH_HOOKS_H */ | ||
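An architecture that wants to hook remaps supplies its own asm/mm-arch-hooks.h defining both the inline and the guard macro, so the no-op fallback above is never reached; a sketch with a hypothetical arch:

	/* arch/foo/include/asm/mm-arch-hooks.h */
	#define arch_remap arch_remap
	static inline void arch_remap(struct mm_struct *mm,
				      unsigned long old_start, unsigned long old_end,
				      unsigned long new_start, unsigned long new_end)
	{
		/* update any per-mm state that tracked the old range */
	}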
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..24ad583596d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page) | |||
499 | 499 | ||
500 | static inline bool __compound_tail_refcounted(struct page *page) | 500 | static inline bool __compound_tail_refcounted(struct page *page) |
501 | { | 501 | { |
502 | return !PageSlab(page) && !PageHeadHuge(page); | 502 | return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); |
503 | } | 503 | } |
504 | 504 | ||
505 | /* | 505 | /* |
@@ -2146,12 +2146,47 @@ enum mf_flags { | |||
2146 | extern int memory_failure(unsigned long pfn, int trapno, int flags); | 2146 | extern int memory_failure(unsigned long pfn, int trapno, int flags); |
2147 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); | 2147 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); |
2148 | extern int unpoison_memory(unsigned long pfn); | 2148 | extern int unpoison_memory(unsigned long pfn); |
2149 | extern int get_hwpoison_page(struct page *page); | ||
2149 | extern int sysctl_memory_failure_early_kill; | 2150 | extern int sysctl_memory_failure_early_kill; |
2150 | extern int sysctl_memory_failure_recovery; | 2151 | extern int sysctl_memory_failure_recovery; |
2151 | extern void shake_page(struct page *p, int access); | 2152 | extern void shake_page(struct page *p, int access); |
2152 | extern atomic_long_t num_poisoned_pages; | 2153 | extern atomic_long_t num_poisoned_pages; |
2153 | extern int soft_offline_page(struct page *page, int flags); | 2154 | extern int soft_offline_page(struct page *page, int flags); |
2154 | 2155 | ||
2156 | |||
2157 | /* | ||
2158 | * Error handlers for various types of pages. | ||
2159 | */ | ||
2160 | enum mf_result { | ||
2161 | MF_IGNORED, /* Error: cannot be handled */ | ||
2162 | MF_FAILED, /* Error: handling failed */ | ||
2163 | MF_DELAYED, /* Will be handled later */ | ||
2164 | MF_RECOVERED, /* Successfully recovered */ | ||
2165 | }; | ||
2166 | |||
2167 | enum mf_action_page_type { | ||
2168 | MF_MSG_KERNEL, | ||
2169 | MF_MSG_KERNEL_HIGH_ORDER, | ||
2170 | MF_MSG_SLAB, | ||
2171 | MF_MSG_DIFFERENT_COMPOUND, | ||
2172 | MF_MSG_POISONED_HUGE, | ||
2173 | MF_MSG_HUGE, | ||
2174 | MF_MSG_FREE_HUGE, | ||
2175 | MF_MSG_UNMAP_FAILED, | ||
2176 | MF_MSG_DIRTY_SWAPCACHE, | ||
2177 | MF_MSG_CLEAN_SWAPCACHE, | ||
2178 | MF_MSG_DIRTY_MLOCKED_LRU, | ||
2179 | MF_MSG_CLEAN_MLOCKED_LRU, | ||
2180 | MF_MSG_DIRTY_UNEVICTABLE_LRU, | ||
2181 | MF_MSG_CLEAN_UNEVICTABLE_LRU, | ||
2182 | MF_MSG_DIRTY_LRU, | ||
2183 | MF_MSG_CLEAN_LRU, | ||
2184 | MF_MSG_TRUNCATED_LRU, | ||
2185 | MF_MSG_BUDDY, | ||
2186 | MF_MSG_BUDDY_2ND, | ||
2187 | MF_MSG_UNKNOWN, | ||
2188 | }; | ||
2189 | |||
2155 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | 2190 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) |
2156 | extern void clear_huge_page(struct page *page, | 2191 | extern void clear_huge_page(struct page *page, |
2157 | unsigned long addr, | 2192 | unsigned long addr, |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 95243d28a0ee..61cd67f4d788 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
324 | ___pte; \ | 324 | ___pte; \ |
325 | }) | 325 | }) |
326 | 326 | ||
327 | #define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ | 327 | #define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ |
328 | ({ \ | 328 | ({ \ |
329 | unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ | 329 | unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ |
330 | struct mm_struct *___mm = (__vma)->vm_mm; \ | 330 | struct mm_struct *___mm = (__vma)->vm_mm; \ |
331 | pmd_t ___pmd; \ | 331 | pmd_t ___pmd; \ |
332 | \ | 332 | \ |
333 | ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ | 333 | ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ |
334 | mmu_notifier_invalidate_range(___mm, ___haddr, \ | 334 | mmu_notifier_invalidate_range(___mm, ___haddr, \ |
335 | ___haddr + HPAGE_PMD_SIZE); \ | 335 | ___haddr + HPAGE_PMD_SIZE); \ |
336 | \ | 336 | \ |
337 | ___pmd; \ | 337 | ___pmd; \ |
338 | }) | 338 | }) |
339 | 339 | ||
340 | #define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ | 340 | #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ |
341 | ({ \ | 341 | ({ \ |
342 | unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ | 342 | unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ |
343 | pmd_t ___pmd; \ | 343 | pmd_t ___pmd; \ |
344 | \ | 344 | \ |
345 | ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ | 345 | ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ |
346 | mmu_notifier_invalidate_range(__mm, ___haddr, \ | 346 | mmu_notifier_invalidate_range(__mm, ___haddr, \ |
347 | ___haddr + HPAGE_PMD_SIZE); \ | 347 | ___haddr + HPAGE_PMD_SIZE); \ |
348 | \ | 348 | \ |
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
428 | #define ptep_clear_flush_young_notify ptep_clear_flush_young | 428 | #define ptep_clear_flush_young_notify ptep_clear_flush_young |
429 | #define pmdp_clear_flush_young_notify pmdp_clear_flush_young | 429 | #define pmdp_clear_flush_young_notify pmdp_clear_flush_young |
430 | #define ptep_clear_flush_notify ptep_clear_flush | 430 | #define ptep_clear_flush_notify ptep_clear_flush |
431 | #define pmdp_clear_flush_notify pmdp_clear_flush | 431 | #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush |
432 | #define pmdp_get_and_clear_notify pmdp_get_and_clear | 432 | #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear |
433 | #define set_pte_at_notify set_pte_at | 433 | #define set_pte_at_notify set_pte_at |
434 | 434 | ||
435 | #endif /* CONFIG_MMU_NOTIFIER */ | 435 | #endif /* CONFIG_MMU_NOTIFIER */ |
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; | |||
67 | extern int soft_watchdog_enabled; | 67 | extern int soft_watchdog_enabled; |
68 | extern int watchdog_user_enabled; | 68 | extern int watchdog_user_enabled; |
69 | extern int watchdog_thresh; | 69 | extern int watchdog_thresh; |
70 | extern unsigned long *watchdog_cpumask_bits; | ||
70 | extern int sysctl_softlockup_all_cpu_backtrace; | 71 | extern int sysctl_softlockup_all_cpu_backtrace; |
71 | struct ctl_table; | 72 | struct ctl_table; |
72 | extern int proc_watchdog(struct ctl_table *, int , | 73 | extern int proc_watchdog(struct ctl_table *, int , |
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , | |||
77 | void __user *, size_t *, loff_t *); | 78 | void __user *, size_t *, loff_t *); |
78 | extern int proc_watchdog_thresh(struct ctl_table *, int , | 79 | extern int proc_watchdog_thresh(struct ctl_table *, int , |
79 | void __user *, size_t *, loff_t *); | 80 | void __user *, size_t *, loff_t *); |
81 | extern int proc_watchdog_cpumask(struct ctl_table *, int, | ||
82 | void __user *, size_t *, loff_t *); | ||
80 | #endif | 83 | #endif |
81 | 84 | ||
82 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI | 85 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 44b2f6f7bbd8..7deecb7bca5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -32,6 +32,8 @@ enum oom_scan_t { | |||
32 | /* Thread is the potential origin of an oom condition; kill first on oom */ | 32 | /* Thread is the potential origin of an oom condition; kill first on oom */ |
33 | #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) | 33 | #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) |
34 | 34 | ||
35 | extern struct mutex oom_lock; | ||
36 | |||
35 | static inline void set_current_oom_origin(void) | 37 | static inline void set_current_oom_origin(void) |
36 | { | 38 | { |
37 | current->signal->oom_flags |= OOM_FLAG_ORIGIN; | 39 | current->signal->oom_flags |= OOM_FLAG_ORIGIN; |
@@ -47,9 +49,7 @@ static inline bool oom_task_origin(const struct task_struct *p) | |||
47 | return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); | 49 | return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); |
48 | } | 50 | } |
49 | 51 | ||
50 | extern void mark_tsk_oom_victim(struct task_struct *tsk); | 52 | extern void mark_oom_victim(struct task_struct *tsk); |
51 | |||
52 | extern void unmark_oom_victim(void); | ||
53 | 53 | ||
54 | extern unsigned long oom_badness(struct task_struct *p, | 54 | extern unsigned long oom_badness(struct task_struct *p, |
55 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 55 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
@@ -62,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
62 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 62 | struct mem_cgroup *memcg, nodemask_t *nodemask, |
63 | const char *message); | 63 | const char *message); |
64 | 64 | ||
65 | extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); | ||
66 | extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); | ||
67 | |||
68 | extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 65 | extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
69 | int order, const nodemask_t *nodemask, | 66 | int order, const nodemask_t *nodemask, |
70 | struct mem_cgroup *memcg); | 67 | struct mem_cgroup *memcg); |
@@ -75,6 +72,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
75 | 72 | ||
76 | extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 73 | extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
77 | int order, nodemask_t *mask, bool force_kill); | 74 | int order, nodemask_t *mask, bool force_kill); |
75 | |||
76 | extern void exit_oom_victim(void); | ||
77 | |||
78 | extern int register_oom_notifier(struct notifier_block *nb); | 78 | extern int register_oom_notifier(struct notifier_block *nb); |
79 | extern int unregister_oom_notifier(struct notifier_block *nb); | 79 | extern int unregister_oom_notifier(struct notifier_block *nb); |
80 | 80 | ||
diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..9de2fdc8b5e4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -153,8 +153,30 @@ size_t ksize(const void *); | |||
153 | #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN | 153 | #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN |
154 | #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN | 154 | #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN |
155 | #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) | 155 | #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) |
156 | /* | ||
157 | * KMALLOC_LOOP_LOW is the starting index for the loop in | ||
158 | * create_kmalloc_caches() that creates the kmalloc_caches objects. The | ||
159 | * special caches are kmalloc-96 (index 1) and kmalloc-192 (index 2): as | ||
160 | * kmalloc_index() shows, the loop must start at 1 when KMALLOC_MIN_SIZE | ||
161 | * <= 32 and at 2 when KMALLOC_MIN_SIZE <= 64. If KMALLOC_MIN_SIZE is | ||
162 | * larger than 64, neither cache is needed; start at KMALLOC_SHIFT_LOW. | ||
163 | */ | ||
164 | #if KMALLOC_MIN_SIZE <= 32 | ||
165 | #define KMALLOC_LOOP_LOW 1 | ||
166 | #elif KMALLOC_MIN_SIZE <= 64 | ||
167 | #define KMALLOC_LOOP_LOW 2 | ||
168 | #else | ||
169 | #define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW | ||
170 | #endif | ||
171 | |||
156 | #else | 172 | #else |
157 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 173 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
174 | /* | ||
175 | * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3, so even with slab | ||
176 | * it is <= 32 and the kmalloc-96 and kmalloc-192 caches must also be | ||
177 | * initialized. | ||
178 | */ | ||
179 | #define KMALLOC_LOOP_LOW 1 | ||
158 | #endif | 180 | #endif |
159 | 181 | ||
160 | /* | 182 | /* |
@@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; | |||
240 | * belongs to. | 262 | * belongs to. |
241 | * 0 = zero alloc | 263 | * 0 = zero alloc |
242 | * 1 = 65 .. 96 bytes | 264 | * 1 = 65 .. 96 bytes |
243 | * 2 = 120 .. 192 bytes | 265 | * 2 = 129 .. 192 bytes |
244 | * n = 2^(n-1) .. 2^n -1 | 266 | * n = 2^(n-1)+1 .. 2^n |
245 | */ | 267 | */ |
246 | static __always_inline int kmalloc_index(size_t size) | 268 | static __always_inline int kmalloc_index(size_t size) |
247 | { | 269 | { |
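A quick check against the corrected size ranges (cache names as in a standard build, shown for illustration):

	kmalloc(129, GFP_KERNEL);   /* 129..192 -> index 2, served from kmalloc-192 */
	kmalloc(256, GFP_KERNEL);   /* 193..256 -> index 8, served from kmalloc-256 */

Under the old wording "2^(n-1) .. 2^n - 1" a 256-byte request would appear to belong one cache higher, which is why the comment needed fixing.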
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h | |||
@@ -27,6 +27,8 @@ struct smpboot_thread_data; | |||
27 | * @pre_unpark: Optional unpark function, called before the thread is | 27 | * @pre_unpark: Optional unpark function, called before the thread is |
28 | * unparked (cpu online). This is not guaranteed to be | 28 | * unparked (cpu online). This is not guaranteed to be |
29 | * called on the target cpu of the thread. Careful! | 29 | * called on the target cpu of the thread. Careful! |
30 | * @cpumask: Internal state. To update which threads are unparked, | ||
31 | * call smpboot_update_cpumask_percpu_thread(). | ||
30 | * @selfparking: Thread is not parked by the park function. | 32 | * @selfparking: Thread is not parked by the park function. |
31 | * @thread_comm: The base name of the thread | 33 | * @thread_comm: The base name of the thread |
32 | */ | 34 | */ |
@@ -41,11 +43,14 @@ struct smp_hotplug_thread { | |||
41 | void (*park)(unsigned int cpu); | 43 | void (*park)(unsigned int cpu); |
42 | void (*unpark)(unsigned int cpu); | 44 | void (*unpark)(unsigned int cpu); |
43 | void (*pre_unpark)(unsigned int cpu); | 45 | void (*pre_unpark)(unsigned int cpu); |
46 | cpumask_var_t cpumask; | ||
44 | bool selfparking; | 47 | bool selfparking; |
45 | const char *thread_comm; | 48 | const char *thread_comm; |
46 | }; | 49 | }; |
47 | 50 | ||
48 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); | 51 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); |
49 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); | 52 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); |
53 | int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | ||
54 | const struct cpumask *); | ||
50 | 55 | ||
51 | #endif | 56 | #endif |
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 79abb9c71772..1443d79e4fe6 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/pci.h> | 11 | #include <linux/pci.h> |
12 | #include <linux/aer.h> | 12 | #include <linux/aer.h> |
13 | #include <linux/cper.h> | 13 | #include <linux/cper.h> |
14 | #include <linux/mm.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * MCE Extended Error Log trace event | 17 | * MCE Extended Error Log trace event |
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event, | |||
232 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) | 233 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) |
233 | ); | 234 | ); |
234 | 235 | ||
236 | /* | ||
237 | * memory-failure recovery action result event | ||
238 | * | ||
239 | * unsigned long pfn - Page Frame Number of the corrupted page | ||
240 | * int type - Page types of the corrupted page | ||
241 | * int result - Result of recovery action | ||
242 | */ | ||
243 | |||
244 | #ifdef CONFIG_MEMORY_FAILURE | ||
245 | #define MF_ACTION_RESULT \ | ||
246 | EM ( MF_IGNORED, "Ignored" ) \ | ||
247 | EM ( MF_FAILED, "Failed" ) \ | ||
248 | EM ( MF_DELAYED, "Delayed" ) \ | ||
249 | EMe ( MF_RECOVERED, "Recovered" ) | ||
250 | |||
251 | #define MF_PAGE_TYPE \ | ||
252 | EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ | ||
253 | EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ | ||
254 | EM ( MF_MSG_SLAB, "kernel slab page" ) \ | ||
255 | EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ | ||
256 | EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ | ||
257 | EM ( MF_MSG_HUGE, "huge page" ) \ | ||
258 | EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ | ||
259 | EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ | ||
260 | EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ | ||
261 | EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ | ||
262 | EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ | ||
263 | EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ | ||
264 | EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ | ||
265 | EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ | ||
266 | EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ | ||
267 | EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ | ||
268 | EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ | ||
269 | EM ( MF_MSG_BUDDY, "free buddy page" ) \ | ||
270 | EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ | ||
271 | EMe ( MF_MSG_UNKNOWN, "unknown page" ) | ||
272 | |||
273 | /* | ||
274 | * First define the enums in MF_ACTION_RESULT to be exported to userspace | ||
275 | * via TRACE_DEFINE_ENUM(). | ||
276 | */ | ||
277 | #undef EM | ||
278 | #undef EMe | ||
279 | #define EM(a, b) TRACE_DEFINE_ENUM(a); | ||
280 | #define EMe(a, b) TRACE_DEFINE_ENUM(a); | ||
281 | |||
282 | MF_ACTION_RESULT | ||
283 | MF_PAGE_TYPE | ||
284 | |||
285 | /* | ||
286 | * Now redefine the EM() and EMe() macros to map the enums to the strings | ||
287 | * that will be printed in the output. | ||
288 | */ | ||
289 | #undef EM | ||
290 | #undef EMe | ||
291 | #define EM(a, b) { a, b }, | ||
292 | #define EMe(a, b) { a, b } | ||
293 | |||
294 | TRACE_EVENT(memory_failure_event, | ||
295 | TP_PROTO(unsigned long pfn, | ||
296 | int type, | ||
297 | int result), | ||
298 | |||
299 | TP_ARGS(pfn, type, result), | ||
300 | |||
301 | TP_STRUCT__entry( | ||
302 | __field(unsigned long, pfn) | ||
303 | __field(int, type) | ||
304 | __field(int, result) | ||
305 | ), | ||
306 | |||
307 | TP_fast_assign( | ||
308 | __entry->pfn = pfn; | ||
309 | __entry->type = type; | ||
310 | __entry->result = result; | ||
311 | ), | ||
312 | |||
313 | TP_printk("pfn %#lx: recovery action for %s: %s", | ||
314 | __entry->pfn, | ||
315 | __print_symbolic(__entry->type, MF_PAGE_TYPE), | ||
316 | __print_symbolic(__entry->result, MF_ACTION_RESULT) | ||
317 | ) | ||
318 | ); | ||
319 | #endif /* CONFIG_MEMORY_FAILURE */ | ||
235 | #endif /* _TRACE_HW_EVENT_MC_H */ | 320 | #endif /* _TRACE_HW_EVENT_MC_H */ |
236 | 321 | ||
237 | /* This part must be outside protection */ | 322 | /* This part must be outside protection */ |
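Once the new event fires, the resulting line in the trace buffer would look roughly like the following (the pfn and page classification are made up for illustration):

	memory_failure_event: pfn 0x3f5a2: recovery action for dirty LRU page: Recovered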
diff --git a/kernel/exit.c b/kernel/exit.c index 22fcc05dec40..185752a729f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk) | |||
436 | mm_update_next_owner(mm); | 436 | mm_update_next_owner(mm); |
437 | mmput(mm); | 437 | mmput(mm); |
438 | if (test_thread_flag(TIF_MEMDIE)) | 438 | if (test_thread_flag(TIF_MEMDIE)) |
439 | unmark_oom_victim(); | 439 | exit_oom_victim(); |
440 | } | 440 | } |
441 | 441 | ||
442 | static struct task_struct *find_alive_thread(struct task_struct *p) | 442 | static struct task_struct *find_alive_thread(struct task_struct *p) |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c697f73d82d6..7c434c39f02a 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu) | |||
232 | 232 | ||
233 | mutex_lock(&smpboot_threads_lock); | 233 | mutex_lock(&smpboot_threads_lock); |
234 | list_for_each_entry(cur, &hotplug_threads, list) | 234 | list_for_each_entry(cur, &hotplug_threads, list) |
235 | smpboot_unpark_thread(cur, cpu); | 235 | if (cpumask_test_cpu(cpu, cur->cpumask)) |
236 | smpboot_unpark_thread(cur, cpu); | ||
236 | mutex_unlock(&smpboot_threads_lock); | 237 | mutex_unlock(&smpboot_threads_lock); |
237 | } | 238 | } |
238 | 239 | ||
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
258 | { | 259 | { |
259 | unsigned int cpu; | 260 | unsigned int cpu; |
260 | 261 | ||
262 | /* Unpark any threads that were voluntarily parked. */ | ||
263 | for_each_cpu_not(cpu, ht->cpumask) { | ||
264 | if (cpu_online(cpu)) { | ||
265 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
266 | if (tsk) | ||
267 | kthread_unpark(tsk); | ||
268 | } | ||
269 | } | ||
270 | |||
261 | /* We need to destroy also the parked threads of offline cpus */ | 271 | /* We need to destroy also the parked threads of offline cpus */ |
262 | for_each_possible_cpu(cpu) { | 272 | for_each_possible_cpu(cpu) { |
263 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 273 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
281 | unsigned int cpu; | 291 | unsigned int cpu; |
282 | int ret = 0; | 292 | int ret = 0; |
283 | 293 | ||
294 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) | ||
295 | return -ENOMEM; | ||
296 | cpumask_copy(plug_thread->cpumask, cpu_possible_mask); | ||
297 | |||
284 | get_online_cpus(); | 298 | get_online_cpus(); |
285 | mutex_lock(&smpboot_threads_lock); | 299 | mutex_lock(&smpboot_threads_lock); |
286 | for_each_online_cpu(cpu) { | 300 | for_each_online_cpu(cpu) { |
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
313 | smpboot_destroy_threads(plug_thread); | 327 | smpboot_destroy_threads(plug_thread); |
314 | mutex_unlock(&smpboot_threads_lock); | 328 | mutex_unlock(&smpboot_threads_lock); |
315 | put_online_cpus(); | 329 | put_online_cpus(); |
330 | free_cpumask_var(plug_thread->cpumask); | ||
316 | } | 331 | } |
317 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 332 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); |
318 | 333 | ||
334 | /** | ||
335 | * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked | ||
336 | * @plug_thread: Hotplug thread descriptor | ||
337 | * @new: Revised mask to use | ||
338 | * | ||
339 | * The cpumask field in the smp_hotplug_thread must not be updated directly | ||
340 | * by the client, but only by calling this function. | ||
341 | * This function can only be called on a registered smp_hotplug_thread. | ||
342 | */ | ||
343 | int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | ||
344 | const struct cpumask *new) | ||
345 | { | ||
346 | struct cpumask *old = plug_thread->cpumask; | ||
347 | cpumask_var_t tmp; | ||
348 | unsigned int cpu; | ||
349 | |||
350 | if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) | ||
351 | return -ENOMEM; | ||
352 | |||
353 | get_online_cpus(); | ||
354 | mutex_lock(&smpboot_threads_lock); | ||
355 | |||
356 | /* Park threads that were exclusively enabled on the old mask. */ | ||
357 | cpumask_andnot(tmp, old, new); | ||
358 | for_each_cpu_and(cpu, tmp, cpu_online_mask) | ||
359 | smpboot_park_thread(plug_thread, cpu); | ||
360 | |||
361 | /* Unpark threads that are exclusively enabled on the new mask. */ | ||
362 | cpumask_andnot(tmp, new, old); | ||
363 | for_each_cpu_and(cpu, tmp, cpu_online_mask) | ||
364 | smpboot_unpark_thread(plug_thread, cpu); | ||
365 | |||
366 | cpumask_copy(old, new); | ||
367 | |||
368 | mutex_unlock(&smpboot_threads_lock); | ||
369 | put_online_cpus(); | ||
370 | |||
371 | free_cpumask_var(tmp); | ||
372 | |||
373 | return 0; | ||
374 | } | ||
375 | EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); | ||
376 | |||
319 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | 377 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); |
320 | 378 | ||
321 | /* | 379 | /* |
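
smpboot_update_cpumask_percpu_thread() above reduces to two set differences: threads on CPUs that dropped out of the mask are parked, threads on CPUs that joined it are unparked, and only online CPUs are touched either way. A rough userspace model of that delta computation, with a single 64-bit word standing in for a cpumask and park()/unpark() reduced to printouts (the names here are illustrative, not kernel APIs):

    #include <stdint.h>
    #include <stdio.h>

    static void park(int cpu)   { printf("park cpu %d\n", cpu); }
    static void unpark(int cpu) { printf("unpark cpu %d\n", cpu); }

    static void update_mask(uint64_t *cur, uint64_t next, uint64_t online)
    {
            uint64_t to_park   = *cur & ~next;  /* enabled before, not any more */
            uint64_t to_unpark = next & ~*cur;  /* newly enabled */

            for (int cpu = 0; cpu < 64; cpu++) {
                    uint64_t bit = 1ULL << cpu;

                    /* Only online CPUs have a live thread to park or unpark. */
                    if ((to_park & bit) && (online & bit))
                            park(cpu);
                    if ((to_unpark & bit) && (online & bit))
                            unpark(cpu);
            }
            *cur = next;
    }

    int main(void)
    {
            uint64_t cur = 0xFULL;                 /* CPUs 0-3 currently enabled */
            update_mask(&cur, 0xAULL, 0xFULL);     /* keep only CPUs 1 and 3 */
            return 0;
    }
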
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b13e9d2de302..812fcc3fd390 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -872,6 +872,13 @@ static struct ctl_table kern_table[] = { | |||
872 | .extra2 = &one, | 872 | .extra2 = &one, |
873 | }, | 873 | }, |
874 | { | 874 | { |
875 | .procname = "watchdog_cpumask", | ||
876 | .data = &watchdog_cpumask_bits, | ||
877 | .maxlen = NR_CPUS, | ||
878 | .mode = 0644, | ||
879 | .proc_handler = proc_watchdog_cpumask, | ||
880 | }, | ||
881 | { | ||
875 | .procname = "softlockup_panic", | 882 | .procname = "softlockup_panic", |
876 | .data = &softlockup_panic, | 883 | .data = &softlockup_panic, |
877 | .maxlen = sizeof(int), | 884 | .maxlen = sizeof(int), |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 581a68a04c64..a6ffa43f2993 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/sysctl.h> | 19 | #include <linux/sysctl.h> |
20 | #include <linux/smpboot.h> | 20 | #include <linux/smpboot.h> |
21 | #include <linux/sched/rt.h> | 21 | #include <linux/sched/rt.h> |
22 | #include <linux/tick.h> | ||
22 | 23 | ||
23 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
24 | #include <linux/kvm_para.h> | 25 | #include <linux/kvm_para.h> |
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace; | |||
58 | #else | 59 | #else |
59 | #define sysctl_softlockup_all_cpu_backtrace 0 | 60 | #define sysctl_softlockup_all_cpu_backtrace 0 |
60 | #endif | 61 | #endif |
62 | static struct cpumask watchdog_cpumask __read_mostly; | ||
63 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | ||
64 | |||
65 | /* Helper for online, unparked cpus. */ | ||
66 | #define for_each_watchdog_cpu(cpu) \ | ||
67 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | ||
61 | 68 | ||
62 | static int __read_mostly watchdog_running; | 69 | static int __read_mostly watchdog_running; |
63 | static u64 __read_mostly sample_period; | 70 | static u64 __read_mostly sample_period; |
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void) | |||
207 | * do we care if a 0 races with a timestamp? | 214 | * do we care if a 0 races with a timestamp? |
208 | * all it means is the softlock check starts one cycle later | 215 | * all it means is the softlock check starts one cycle later |
209 | */ | 216 | */ |
210 | for_each_online_cpu(cpu) | 217 | for_each_watchdog_cpu(cpu) |
211 | per_cpu(watchdog_touch_ts, cpu) = 0; | 218 | per_cpu(watchdog_touch_ts, cpu) = 0; |
212 | } | 219 | } |
213 | 220 | ||
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void) | |||
616 | goto unlock; | 623 | goto unlock; |
617 | 624 | ||
618 | get_online_cpus(); | 625 | get_online_cpus(); |
619 | for_each_online_cpu(cpu) | 626 | for_each_watchdog_cpu(cpu) |
620 | watchdog_nmi_enable(cpu); | 627 | watchdog_nmi_enable(cpu); |
621 | put_online_cpus(); | 628 | put_online_cpus(); |
622 | 629 | ||
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void) | |||
634 | goto unlock; | 641 | goto unlock; |
635 | 642 | ||
636 | get_online_cpus(); | 643 | get_online_cpus(); |
637 | for_each_online_cpu(cpu) | 644 | for_each_watchdog_cpu(cpu) |
638 | watchdog_nmi_disable(cpu); | 645 | watchdog_nmi_disable(cpu); |
639 | put_online_cpus(); | 646 | put_online_cpus(); |
640 | 647 | ||
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void) | |||
696 | int cpu; | 703 | int cpu; |
697 | 704 | ||
698 | get_online_cpus(); | 705 | get_online_cpus(); |
699 | for_each_online_cpu(cpu) | 706 | for_each_watchdog_cpu(cpu) |
700 | update_watchdog(cpu); | 707 | update_watchdog(cpu); |
701 | put_online_cpus(); | 708 | put_online_cpus(); |
702 | } | 709 | } |
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void) | |||
709 | err = smpboot_register_percpu_thread(&watchdog_threads); | 716 | err = smpboot_register_percpu_thread(&watchdog_threads); |
710 | if (err) | 717 | if (err) |
711 | pr_err("Failed to create watchdog threads, disabled\n"); | 718 | pr_err("Failed to create watchdog threads, disabled\n"); |
712 | else | 719 | else { |
720 | if (smpboot_update_cpumask_percpu_thread( | ||
721 | &watchdog_threads, &watchdog_cpumask)) | ||
722 | pr_err("Failed to set cpumask for watchdog threads\n"); | ||
713 | watchdog_running = 1; | 723 | watchdog_running = 1; |
724 | } | ||
714 | } else { | 725 | } else { |
715 | /* | 726 | /* |
716 | * Enable/disable the lockup detectors or | 727 | * Enable/disable the lockup detectors or |
@@ -879,12 +890,58 @@ out: | |||
879 | mutex_unlock(&watchdog_proc_mutex); | 890 | mutex_unlock(&watchdog_proc_mutex); |
880 | return err; | 891 | return err; |
881 | } | 892 | } |
893 | |||
894 | /* | ||
895 | * The cpumask is the mask of possible cpus that the watchdog can run | ||
896 | * on, not the mask of cpus it is actually running on. This allows the | ||
897 | * user to specify a mask that will include cpus that have not yet | ||
898 | * been brought online, if desired. | ||
899 | */ | ||
900 | int proc_watchdog_cpumask(struct ctl_table *table, int write, | ||
901 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
902 | { | ||
903 | int err; | ||
904 | |||
905 | mutex_lock(&watchdog_proc_mutex); | ||
906 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); | ||
907 | if (!err && write) { | ||
908 | /* Remove impossible cpus to keep sysctl output cleaner. */ | ||
909 | cpumask_and(&watchdog_cpumask, &watchdog_cpumask, | ||
910 | cpu_possible_mask); | ||
911 | |||
912 | if (watchdog_running) { | ||
913 | /* | ||
914 | * Failure would be due to being unable to allocate | ||
915 | * a temporary cpumask, so we are likely not in a | ||
916 | * position to do much else to make things better. | ||
917 | */ | ||
918 | if (smpboot_update_cpumask_percpu_thread( | ||
919 | &watchdog_threads, &watchdog_cpumask) != 0) | ||
920 | pr_err("cpumask update failed\n"); | ||
921 | } | ||
922 | } | ||
923 | mutex_unlock(&watchdog_proc_mutex); | ||
924 | return err; | ||
925 | } | ||
926 | |||
882 | #endif /* CONFIG_SYSCTL */ | 927 | #endif /* CONFIG_SYSCTL */ |
883 | 928 | ||
884 | void __init lockup_detector_init(void) | 929 | void __init lockup_detector_init(void) |
885 | { | 930 | { |
886 | set_sample_period(); | 931 | set_sample_period(); |
887 | 932 | ||
933 | #ifdef CONFIG_NO_HZ_FULL | ||
934 | if (tick_nohz_full_enabled()) { | ||
935 | if (!cpumask_empty(tick_nohz_full_mask)) | ||
936 | pr_info("Disabling watchdog on nohz_full cores by default\n"); | ||
937 | cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, | ||
938 | tick_nohz_full_mask); | ||
939 | } else | ||
940 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | ||
941 | #else | ||
942 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | ||
943 | #endif | ||
944 | |||
888 | if (watchdog_enabled) | 945 | if (watchdog_enabled) |
889 | watchdog_enable_all_cpus(); | 946 | watchdog_enable_all_cpus(); |
890 | } | 947 | } |
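
Taken together, the watchdog changes come down to two mask operations: at boot the default mask is "all possible CPUs minus the nohz_full set", and every per-CPU walk becomes an intersection of the online mask with watchdog_cpumask (the for_each_watchdog_cpu() helper). A small standalone sketch of that arithmetic, using toy 8-CPU bitmasks instead of real cpumask handling:

    #include <stdint.h>
    #include <stdio.h>

    #define NR_CPUS 8

    int main(void)
    {
            uint32_t possible  = 0xFF;  /* CPUs 0-7 exist        */
            uint32_t online    = 0x3F;  /* CPUs 0-5 are online   */
            uint32_t nohz_full = 0x30;  /* CPUs 4-5 run tickless */

            /* Default mask: everything except the nohz_full cores. */
            uint32_t watchdog_mask = possible & ~nohz_full;

            /* Equivalent of for_each_watchdog_cpu(): online AND allowed. */
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    if ((online & watchdog_mask) & (1u << cpu))
                            printf("watchdog runs on cpu %d\n", cpu);

            return 0;
    }
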
diff --git a/mm/Kconfig b/mm/Kconfig index 390214da4546..c180af880ed5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -368,6 +368,7 @@ config MEMORY_FAILURE | |||
368 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 368 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
369 | bool "Enable recovery from hardware memory errors" | 369 | bool "Enable recovery from hardware memory errors" |
370 | select MEMORY_ISOLATION | 370 | select MEMORY_ISOLATION |
371 | select RAS | ||
371 | help | 372 | help |
372 | Enables code to recover from some memory failures on systems | 373 | Enables code to recover from some memory failures on systems |
373 | with MCA recovery. This allows a system to continue running | 374 | with MCA recovery. This allows a system to continue running |
diff --git a/mm/cma.c b/mm/cma.c --- a/mm/cma.c +++ b/mm/cma.c | |||
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
182 | if (!size || !memblock_is_region_reserved(base, size)) | 182 | if (!size || !memblock_is_region_reserved(base, size)) |
183 | return -EINVAL; | 183 | return -EINVAL; |
184 | 184 | ||
185 | /* ensure minimal alignment requied by mm core */ | 185 | /* ensure minimal alignment required by mm core */ |
186 | alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); | 186 | alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); |
187 | 187 | ||
188 | /* alignment should be aligned with order_per_bit */ | 188 | /* alignment should be aligned with order_per_bit */ |
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
238 | /* | 238 | /* |
239 | * high_memory isn't direct mapped memory so retrieving its physical | 239 | * high_memory isn't direct mapped memory so retrieving its physical |
240 | * address isn't appropriate. But it would be useful to check the | 240 | * address isn't appropriate. But it would be useful to check the |
241 | * physical address of the highmem boundary so it's justfiable to get | 241 | * physical address of the highmem boundary so it's justifiable to get |
242 | * the physical address from it. On x86 there is a validation check for | 242 | * the physical address from it. On x86 there is a validation check for |
243 | * this case, so the following workaround is needed to avoid it. | 243 | * this case, so the following workaround is needed to avoid it. |
244 | */ | 244 | */ |
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
316 | */ | 316 | */ |
317 | if (base < highmem_start && limit > highmem_start) { | 317 | if (base < highmem_start && limit > highmem_start) { |
318 | addr = memblock_alloc_range(size, alignment, | 318 | addr = memblock_alloc_range(size, alignment, |
319 | highmem_start, limit); | 319 | highmem_start, limit, |
320 | MEMBLOCK_NONE); | ||
320 | limit = highmem_start; | 321 | limit = highmem_start; |
321 | } | 322 | } |
322 | 323 | ||
323 | if (!addr) { | 324 | if (!addr) { |
324 | addr = memblock_alloc_range(size, alignment, base, | 325 | addr = memblock_alloc_range(size, alignment, base, |
325 | limit); | 326 | limit, |
327 | MEMBLOCK_NONE); | ||
326 | if (!addr) { | 328 | if (!addr) { |
327 | ret = -ENOMEM; | 329 | ret = -ENOMEM; |
328 | goto err; | 330 | goto err; |
diff --git a/mm/filemap.c b/mm/filemap.c index 6bf5e42d560a..8d17ceea8dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
196 | page->mapping = NULL; | 196 | page->mapping = NULL; |
197 | /* Leave page->index set: truncation lookup relies upon it */ | 197 | /* Leave page->index set: truncation lookup relies upon it */ |
198 | 198 | ||
199 | __dec_zone_page_state(page, NR_FILE_PAGES); | 199 | /* hugetlb pages do not participate in page cache accounting. */ |
200 | if (!PageHuge(page)) | ||
201 | __dec_zone_page_state(page, NR_FILE_PAGES); | ||
200 | if (PageSwapBacked(page)) | 202 | if (PageSwapBacked(page)) |
201 | __dec_zone_page_state(page, NR_SHMEM); | 203 | __dec_zone_page_state(page, NR_SHMEM); |
202 | BUG_ON(page_mapped(page)); | 204 | BUG_ON(page_mapped(page)); |
@@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
483 | error = radix_tree_insert(&mapping->page_tree, offset, new); | 485 | error = radix_tree_insert(&mapping->page_tree, offset, new); |
484 | BUG_ON(error); | 486 | BUG_ON(error); |
485 | mapping->nrpages++; | 487 | mapping->nrpages++; |
486 | __inc_zone_page_state(new, NR_FILE_PAGES); | 488 | |
489 | /* | ||
490 | * hugetlb pages do not participate in page cache accounting. | ||
491 | */ | ||
492 | if (!PageHuge(new)) | ||
493 | __inc_zone_page_state(new, NR_FILE_PAGES); | ||
487 | if (PageSwapBacked(new)) | 494 | if (PageSwapBacked(new)) |
488 | __inc_zone_page_state(new, NR_SHMEM); | 495 | __inc_zone_page_state(new, NR_SHMEM); |
489 | spin_unlock_irq(&mapping->tree_lock); | 496 | spin_unlock_irq(&mapping->tree_lock); |
@@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page, | |||
575 | radix_tree_preload_end(); | 582 | radix_tree_preload_end(); |
576 | if (unlikely(error)) | 583 | if (unlikely(error)) |
577 | goto err_insert; | 584 | goto err_insert; |
578 | __inc_zone_page_state(page, NR_FILE_PAGES); | 585 | |
586 | /* hugetlb pages do not participate in page cache accounting. */ | ||
587 | if (!huge) | ||
588 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
579 | spin_unlock_irq(&mapping->tree_lock); | 589 | spin_unlock_irq(&mapping->tree_lock); |
580 | if (!huge) | 590 | if (!huge) |
581 | mem_cgroup_commit_charge(page, memcg, false); | 591 | mem_cgroup_commit_charge(page, memcg, false); |
@@ -1654,8 +1664,8 @@ no_cached_page: | |||
1654 | error = -ENOMEM; | 1664 | error = -ENOMEM; |
1655 | goto out; | 1665 | goto out; |
1656 | } | 1666 | } |
1657 | error = add_to_page_cache_lru(page, mapping, | 1667 | error = add_to_page_cache_lru(page, mapping, index, |
1658 | index, GFP_KERNEL); | 1668 | GFP_KERNEL & mapping_gfp_mask(mapping)); |
1659 | if (error) { | 1669 | if (error) { |
1660 | page_cache_release(page); | 1670 | page_cache_release(page); |
1661 | if (error == -EEXIST) { | 1671 | if (error == -EEXIST) { |
@@ -1756,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
1756 | if (!page) | 1766 | if (!page) |
1757 | return -ENOMEM; | 1767 | return -ENOMEM; |
1758 | 1768 | ||
1759 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | 1769 | ret = add_to_page_cache_lru(page, mapping, offset, |
1770 | GFP_KERNEL & mapping_gfp_mask(mapping)); | ||
1760 | if (ret == 0) | 1771 | if (ret == 0) |
1761 | ret = mapping->a_ops->readpage(file, page); | 1772 | ret = mapping->a_ops->readpage(file, page); |
1762 | else if (ret == -EEXIST) | 1773 | else if (ret == -EEXIST) |
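
Two separate ideas appear in the filemap.c hunks: hugetlb pages are skipped when adjusting NR_FILE_PAGES, and page-cache allocations now AND GFP_KERNEL with the mapping's own gfp mask so that a mapping can veto flags it cannot tolerate. The masking step in isolation, as a tiny sketch with invented flag values (deliberately not the kernel's real GFP bits):

    #include <stdio.h>

    /* Toy gfp-style flag bits, for illustration only. */
    #define GFP_IO     0x1u
    #define GFP_FS     0x2u
    #define GFP_WAIT   0x4u
    #define GFP_KERNEL (GFP_IO | GFP_FS | GFP_WAIT)

    int main(void)
    {
            /* A mapping that forbids filesystem recursion during allocation. */
            unsigned int mapping_gfp = GFP_IO | GFP_WAIT;

            /* Start from GFP_KERNEL, then drop anything the mapping forbids. */
            unsigned int effective = GFP_KERNEL & mapping_gfp;

            printf("effective flags: %#x (FS allowed: %s)\n",
                   effective, (effective & GFP_FS) ? "yes" : "no");
            return 0;
    }
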
diff --git a/mm/frontswap.c b/mm/frontswap.c index 8d82809eb085..27a9924caf61 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -21,11 +21,16 @@ | |||
21 | #include <linux/swapfile.h> | 21 | #include <linux/swapfile.h> |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | 24 | * frontswap_ops are added by frontswap_register_ops, and provide the |
25 | * to the frontswap "backend" implementation functions. | 25 | * frontswap "backend" implementation functions. Multiple implementations |
26 | * may be registered, but implementations can never deregister. This | ||
27 | * is a simple singly-linked list of all registered implementations. | ||
26 | */ | 28 | */ |
27 | static struct frontswap_ops *frontswap_ops __read_mostly; | 29 | static struct frontswap_ops *frontswap_ops __read_mostly; |
28 | 30 | ||
31 | #define for_each_frontswap_ops(ops) \ | ||
32 | for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) | ||
33 | |||
29 | /* | 34 | /* |
30 | * If enabled, frontswap_store will return failure even on success. As | 35 | * If enabled, frontswap_store will return failure even on success. As |
31 | * a result, the swap subsystem will always write the page to swap, in | 36 | * a result, the swap subsystem will always write the page to swap, in |
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { } | |||
79 | * on all frontswap functions to not call the backend until the backend | 84 | * on all frontswap functions to not call the backend until the backend |
80 | * has registered. | 85 | * has registered. |
81 | * | 86 | * |
82 | * Specifically when no backend is registered (nobody called | ||
83 | * frontswap_register_ops) all calls to frontswap_init (which is done via | ||
84 | * swapon -> enable_swap_info -> frontswap_init) are registered and remembered | ||
85 | * (via the setting of need_init bitmap) but fail to create tmem_pools. When a | ||
86 | * backend registers with frontswap at some later point the previous | ||
87 | * calls to frontswap_init are executed (by iterating over the need_init | ||
88 | * bitmap) to create tmem_pools and set the respective poolids. All of that is | ||
89 | * guarded by us using atomic bit operations on the 'need_init' bitmap. | ||
90 | * | ||
91 | * This would not guard us against the user deciding to call swapoff right as | 87 | * This would not guard us against the user deciding to call swapoff right as |
92 | * we are calling the backend to initialize (so swapon is in action). | 88 | * we are calling the backend to initialize (so swapon is in action). |
93 | * Fortunately for us, the swapon_mutex has been taken by the callee so we are | 89 | * Fortunately for us, the swapon_mutex has been taken by the callee so we are |
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { } | |||
106 | * | 102 | * |
107 | * Obviously the opposite (unloading the backend) must be done after all | 103 | * Obviously the opposite (unloading the backend) must be done after all |
108 | * the frontswap_[store|load|invalidate_area|invalidate_page] start | 104 | * the frontswap_[store|load|invalidate_area|invalidate_page] start |
109 | * ignorning or failing the requests - at which point frontswap_ops | 105 | * ignoring or failing the requests. However, there is currently no way |
110 | * would have to be made in some fashion atomic. | 106 | * to unload a backend once it is registered. |
111 | */ | 107 | */ |
112 | static DECLARE_BITMAP(need_init, MAX_SWAPFILES); | ||
113 | 108 | ||
114 | /* | 109 | /* |
115 | * Register operations for frontswap, returning previous thus allowing | 110 | * Register operations for frontswap |
116 | * detection of multiple backends and possible nesting. | ||
117 | */ | 111 | */ |
118 | struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) | 112 | void frontswap_register_ops(struct frontswap_ops *ops) |
119 | { | 113 | { |
120 | struct frontswap_ops *old = frontswap_ops; | 114 | DECLARE_BITMAP(a, MAX_SWAPFILES); |
121 | int i; | 115 | DECLARE_BITMAP(b, MAX_SWAPFILES); |
122 | 116 | struct swap_info_struct *si; | |
123 | for (i = 0; i < MAX_SWAPFILES; i++) { | 117 | unsigned int i; |
124 | if (test_and_clear_bit(i, need_init)) { | 118 | |
125 | struct swap_info_struct *sis = swap_info[i]; | 119 | bitmap_zero(a, MAX_SWAPFILES); |
126 | /* __frontswap_init _should_ have set it! */ | 120 | bitmap_zero(b, MAX_SWAPFILES); |
127 | if (!sis->frontswap_map) | 121 | |
128 | return ERR_PTR(-EINVAL); | 122 | spin_lock(&swap_lock); |
129 | ops->init(i); | 123 | plist_for_each_entry(si, &swap_active_head, list) { |
130 | } | 124 | if (!WARN_ON(!si->frontswap_map)) |
125 | set_bit(si->type, a); | ||
131 | } | 126 | } |
127 | spin_unlock(&swap_lock); | ||
128 | |||
129 | /* the new ops needs to know the currently active swap devices */ | ||
130 | for_each_set_bit(i, a, MAX_SWAPFILES) | ||
131 | ops->init(i); | ||
132 | |||
132 | /* | 133 | /* |
133 | * We MUST have frontswap_ops set _after_ the frontswap_init's | 134 | * Setting frontswap_ops must happen after the ops->init() calls |
134 | * have been called. Otherwise __frontswap_store might fail. Hence | 135 | * above; cmpxchg implies smp_mb() which will ensure the init is |
135 | * the barrier to make sure compiler does not re-order us. | 136 | * complete at this point. |
136 | */ | 137 | */ |
137 | barrier(); | 138 | do { |
138 | frontswap_ops = ops; | 139 | ops->next = frontswap_ops; |
139 | return old; | 140 | } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); |
141 | |||
142 | spin_lock(&swap_lock); | ||
143 | plist_for_each_entry(si, &swap_active_head, list) { | ||
144 | if (si->frontswap_map) | ||
145 | set_bit(si->type, b); | ||
146 | } | ||
147 | spin_unlock(&swap_lock); | ||
148 | |||
149 | /* | ||
150 | * On the very unlikely chance that a swap device was added or | ||
151 | * removed between setting the "a" list bits and the ops init | ||
152 | * calls, we re-check and do init or invalidate for any changed | ||
153 | * bits. | ||
154 | */ | ||
155 | if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { | ||
156 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
157 | if (!test_bit(i, a) && test_bit(i, b)) | ||
158 | ops->init(i); | ||
159 | else if (test_bit(i, a) && !test_bit(i, b)) | ||
160 | ops->invalidate_area(i); | ||
161 | } | ||
162 | } | ||
140 | } | 163 | } |
141 | EXPORT_SYMBOL(frontswap_register_ops); | 164 | EXPORT_SYMBOL(frontswap_register_ops); |
142 | 165 | ||
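
frontswap_register_ops() now pushes the new backend onto a singly linked list with a cmpxchg loop, so concurrent registrations cannot lose an entry, and the implied full barrier orders the ops->init() calls before the ops pointer becomes reachable. A compact userspace sketch of that lock-free push using C11 atomics (struct layout and names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    struct ops {
            const char *name;
            struct ops *next;
    };

    /* Head of the registered-backends list; entries are never removed. */
    static _Atomic(struct ops *) ops_head = NULL;

    static void register_ops(struct ops *ops)
    {
            struct ops *old;

            /* Publish the node; retry if another registration raced with us. */
            do {
                    old = atomic_load(&ops_head);
                    ops->next = old;
            } while (!atomic_compare_exchange_weak(&ops_head, &old, ops));
    }

    int main(void)
    {
            static struct ops a = { .name = "backend A" };
            static struct ops b = { .name = "backend B" };

            register_ops(&a);
            register_ops(&b);

            for (struct ops *p = atomic_load(&ops_head); p; p = p->next)
                    printf("registered: %s\n", p->name);
            return 0;
    }
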
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); | |||
164 | void __frontswap_init(unsigned type, unsigned long *map) | 187 | void __frontswap_init(unsigned type, unsigned long *map) |
165 | { | 188 | { |
166 | struct swap_info_struct *sis = swap_info[type]; | 189 | struct swap_info_struct *sis = swap_info[type]; |
190 | struct frontswap_ops *ops; | ||
167 | 191 | ||
168 | BUG_ON(sis == NULL); | 192 | BUG_ON(sis == NULL); |
169 | 193 | ||
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map) | |||
179 | * p->frontswap set to something valid to work properly. | 203 | * p->frontswap set to something valid to work properly. |
180 | */ | 204 | */ |
181 | frontswap_map_set(sis, map); | 205 | frontswap_map_set(sis, map); |
182 | if (frontswap_ops) | 206 | |
183 | frontswap_ops->init(type); | 207 | for_each_frontswap_ops(ops) |
184 | else { | 208 | ops->init(type); |
185 | BUG_ON(type >= MAX_SWAPFILES); | ||
186 | set_bit(type, need_init); | ||
187 | } | ||
188 | } | 209 | } |
189 | EXPORT_SYMBOL(__frontswap_init); | 210 | EXPORT_SYMBOL(__frontswap_init); |
190 | 211 | ||
191 | bool __frontswap_test(struct swap_info_struct *sis, | 212 | bool __frontswap_test(struct swap_info_struct *sis, |
192 | pgoff_t offset) | 213 | pgoff_t offset) |
193 | { | 214 | { |
194 | bool ret = false; | 215 | if (sis->frontswap_map) |
195 | 216 | return test_bit(offset, sis->frontswap_map); | |
196 | if (frontswap_ops && sis->frontswap_map) | 217 | return false; |
197 | ret = test_bit(offset, sis->frontswap_map); | ||
198 | return ret; | ||
199 | } | 218 | } |
200 | EXPORT_SYMBOL(__frontswap_test); | 219 | EXPORT_SYMBOL(__frontswap_test); |
201 | 220 | ||
221 | static inline void __frontswap_set(struct swap_info_struct *sis, | ||
222 | pgoff_t offset) | ||
223 | { | ||
224 | set_bit(offset, sis->frontswap_map); | ||
225 | atomic_inc(&sis->frontswap_pages); | ||
226 | } | ||
227 | |||
202 | static inline void __frontswap_clear(struct swap_info_struct *sis, | 228 | static inline void __frontswap_clear(struct swap_info_struct *sis, |
203 | pgoff_t offset) | 229 | pgoff_t offset) |
204 | { | 230 | { |
205 | clear_bit(offset, sis->frontswap_map); | 231 | clear_bit(offset, sis->frontswap_map); |
206 | atomic_dec(&sis->frontswap_pages); | 232 | atomic_dec(&sis->frontswap_pages); |
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, | |||
215 | */ | 241 | */ |
216 | int __frontswap_store(struct page *page) | 242 | int __frontswap_store(struct page *page) |
217 | { | 243 | { |
218 | int ret = -1, dup = 0; | 244 | int ret = -1; |
219 | swp_entry_t entry = { .val = page_private(page), }; | 245 | swp_entry_t entry = { .val = page_private(page), }; |
220 | int type = swp_type(entry); | 246 | int type = swp_type(entry); |
221 | struct swap_info_struct *sis = swap_info[type]; | 247 | struct swap_info_struct *sis = swap_info[type]; |
222 | pgoff_t offset = swp_offset(entry); | 248 | pgoff_t offset = swp_offset(entry); |
249 | struct frontswap_ops *ops; | ||
223 | 250 | ||
224 | /* | 251 | /* |
225 | * Return if no backend registered. | 252 | * Return if no backend registered. |
226 | * Don't need to inc frontswap_failed_stores here. | 253 | * Don't need to inc frontswap_failed_stores here. |
227 | */ | 254 | */ |
228 | if (!frontswap_ops) | 255 | if (!frontswap_ops) |
229 | return ret; | 256 | return -1; |
230 | 257 | ||
231 | BUG_ON(!PageLocked(page)); | 258 | BUG_ON(!PageLocked(page)); |
232 | BUG_ON(sis == NULL); | 259 | BUG_ON(sis == NULL); |
233 | if (__frontswap_test(sis, offset)) | 260 | |
234 | dup = 1; | 261 | /* |
235 | ret = frontswap_ops->store(type, offset, page); | 262 | * If a dup, we must remove the old page first; we can't leave the |
263 | * old page no matter if the store of the new page succeeds or fails, | ||
264 | * and we can't rely on the new page replacing the old page as we may | ||
265 | * not store to the same implementation that contains the old page. | ||
266 | */ | ||
267 | if (__frontswap_test(sis, offset)) { | ||
268 | __frontswap_clear(sis, offset); | ||
269 | for_each_frontswap_ops(ops) | ||
270 | ops->invalidate_page(type, offset); | ||
271 | } | ||
272 | |||
273 | /* Try to store in each implementation, until one succeeds. */ | ||
274 | for_each_frontswap_ops(ops) { | ||
275 | ret = ops->store(type, offset, page); | ||
276 | if (!ret) /* successful store */ | ||
277 | break; | ||
278 | } | ||
236 | if (ret == 0) { | 279 | if (ret == 0) { |
237 | set_bit(offset, sis->frontswap_map); | 280 | __frontswap_set(sis, offset); |
238 | inc_frontswap_succ_stores(); | 281 | inc_frontswap_succ_stores(); |
239 | if (!dup) | ||
240 | atomic_inc(&sis->frontswap_pages); | ||
241 | } else { | 282 | } else { |
242 | /* | ||
243 | failed dup always results in automatic invalidate of | ||
244 | the (older) page from frontswap | ||
245 | */ | ||
246 | inc_frontswap_failed_stores(); | 283 | inc_frontswap_failed_stores(); |
247 | if (dup) { | ||
248 | __frontswap_clear(sis, offset); | ||
249 | frontswap_ops->invalidate_page(type, offset); | ||
250 | } | ||
251 | } | 284 | } |
252 | if (frontswap_writethrough_enabled) | 285 | if (frontswap_writethrough_enabled) |
253 | /* report failure so swap also writes to swap device */ | 286 | /* report failure so swap also writes to swap device */ |
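
The store path above first invalidates any older copy of the page everywhere, because the fresh copy may be accepted by a different backend than the one holding the stale one, and then offers the page to each backend in turn until one accepts it. A toy model of that "invalidate, then first success wins" flow, with two fake backends, one of which always refuses stores (everything here is invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    #define NBACKENDS 2
    #define NSLOTS    4

    static int  backend_data[NBACKENDS][NSLOTS];
    static bool backend_full[NBACKENDS] = { true, false }; /* backend 0 refuses */
    static bool slot_present[NSLOTS];

    static void invalidate(int slot)
    {
            for (int b = 0; b < NBACKENDS; b++)
                    backend_data[b][slot] = 0;
            slot_present[slot] = false;
    }

    static int store(int slot, int val)
    {
            /* Drop any stale copy first: the new one may land elsewhere. */
            if (slot_present[slot])
                    invalidate(slot);

            for (int b = 0; b < NBACKENDS; b++) {
                    if (backend_full[b])
                            continue;           /* this backend refused the store */
                    backend_data[b][slot] = val;
                    slot_present[slot] = true;
                    return 0;                   /* first success wins */
            }
            return -1;                          /* every backend refused */
    }

    int main(void)
    {
            printf("store     -> %d\n", store(1, 42));
            printf("store dup -> %d\n", store(1, 43));
            return 0;
    }
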
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page) | |||
268 | int type = swp_type(entry); | 301 | int type = swp_type(entry); |
269 | struct swap_info_struct *sis = swap_info[type]; | 302 | struct swap_info_struct *sis = swap_info[type]; |
270 | pgoff_t offset = swp_offset(entry); | 303 | pgoff_t offset = swp_offset(entry); |
304 | struct frontswap_ops *ops; | ||
305 | |||
306 | if (!frontswap_ops) | ||
307 | return -1; | ||
271 | 308 | ||
272 | BUG_ON(!PageLocked(page)); | 309 | BUG_ON(!PageLocked(page)); |
273 | BUG_ON(sis == NULL); | 310 | BUG_ON(sis == NULL); |
274 | /* | 311 | if (!__frontswap_test(sis, offset)) |
275 | * __frontswap_test() will check whether there is backend registered | 312 | return -1; |
276 | */ | 313 | |
277 | if (__frontswap_test(sis, offset)) | 314 | /* Try loading from each implementation, until one succeeds. */ |
278 | ret = frontswap_ops->load(type, offset, page); | 315 | for_each_frontswap_ops(ops) { |
316 | ret = ops->load(type, offset, page); | ||
317 | if (!ret) /* successful load */ | ||
318 | break; | ||
319 | } | ||
279 | if (ret == 0) { | 320 | if (ret == 0) { |
280 | inc_frontswap_loads(); | 321 | inc_frontswap_loads(); |
281 | if (frontswap_tmem_exclusive_gets_enabled) { | 322 | if (frontswap_tmem_exclusive_gets_enabled) { |
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load); | |||
294 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | 335 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) |
295 | { | 336 | { |
296 | struct swap_info_struct *sis = swap_info[type]; | 337 | struct swap_info_struct *sis = swap_info[type]; |
338 | struct frontswap_ops *ops; | ||
339 | |||
340 | if (!frontswap_ops) | ||
341 | return; | ||
297 | 342 | ||
298 | BUG_ON(sis == NULL); | 343 | BUG_ON(sis == NULL); |
299 | /* | 344 | if (!__frontswap_test(sis, offset)) |
300 | * __frontswap_test() will check whether there is backend registered | 345 | return; |
301 | */ | 346 | |
302 | if (__frontswap_test(sis, offset)) { | 347 | for_each_frontswap_ops(ops) |
303 | frontswap_ops->invalidate_page(type, offset); | 348 | ops->invalidate_page(type, offset); |
304 | __frontswap_clear(sis, offset); | 349 | __frontswap_clear(sis, offset); |
305 | inc_frontswap_invalidates(); | 350 | inc_frontswap_invalidates(); |
306 | } | ||
307 | } | 351 | } |
308 | EXPORT_SYMBOL(__frontswap_invalidate_page); | 352 | EXPORT_SYMBOL(__frontswap_invalidate_page); |
309 | 353 | ||
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page); | |||
314 | void __frontswap_invalidate_area(unsigned type) | 358 | void __frontswap_invalidate_area(unsigned type) |
315 | { | 359 | { |
316 | struct swap_info_struct *sis = swap_info[type]; | 360 | struct swap_info_struct *sis = swap_info[type]; |
361 | struct frontswap_ops *ops; | ||
317 | 362 | ||
318 | if (frontswap_ops) { | 363 | if (!frontswap_ops) |
319 | BUG_ON(sis == NULL); | 364 | return; |
320 | if (sis->frontswap_map == NULL) | 365 | |
321 | return; | 366 | BUG_ON(sis == NULL); |
322 | frontswap_ops->invalidate_area(type); | 367 | if (sis->frontswap_map == NULL) |
323 | atomic_set(&sis->frontswap_pages, 0); | 368 | return; |
324 | bitmap_zero(sis->frontswap_map, sis->max); | 369 | |
325 | } | 370 | for_each_frontswap_ops(ops) |
326 | clear_bit(type, need_init); | 371 | ops->invalidate_area(type); |
372 | atomic_set(&sis->frontswap_pages, 0); | ||
373 | bitmap_zero(sis->frontswap_map, sis->max); | ||
327 | } | 374 | } |
328 | EXPORT_SYMBOL(__frontswap_invalidate_area); | 375 | EXPORT_SYMBOL(__frontswap_invalidate_area); |
329 | 376 | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 078832cf3636..c107094f79ba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1031 | goto out_free_pages; | 1031 | goto out_free_pages; |
1032 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1032 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1033 | 1033 | ||
1034 | pmdp_clear_flush_notify(vma, haddr, pmd); | 1034 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
1035 | /* leave pmd empty until pte is filled */ | 1035 | /* leave pmd empty until pte is filled */ |
1036 | 1036 | ||
1037 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1037 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
@@ -1174,7 +1174,7 @@ alloc: | |||
1174 | pmd_t entry; | 1174 | pmd_t entry; |
1175 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1175 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1176 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1176 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1177 | pmdp_clear_flush_notify(vma, haddr, pmd); | 1177 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
1178 | page_add_new_anon_rmap(new_page, vma, haddr); | 1178 | page_add_new_anon_rmap(new_page, vma, haddr); |
1179 | mem_cgroup_commit_charge(new_page, memcg, false); | 1179 | mem_cgroup_commit_charge(new_page, memcg, false); |
1180 | lru_cache_add_active_or_unevictable(new_page, vma); | 1180 | lru_cache_add_active_or_unevictable(new_page, vma); |
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1396 | pmd_t orig_pmd; | 1396 | pmd_t orig_pmd; |
1397 | /* | 1397 | /* |
1398 | * For architectures like ppc64 we look at deposited pgtable | 1398 | * For architectures like ppc64 we look at deposited pgtable |
1399 | * when calling pmdp_get_and_clear. So do the | 1399 | * when calling pmdp_huge_get_and_clear. So do the |
1400 | * pgtable_trans_huge_withdraw after finishing pmdp related | 1400 | * pgtable_trans_huge_withdraw after finishing pmdp related |
1401 | * operations. | 1401 | * operations. |
1402 | */ | 1402 | */ |
1403 | orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, | 1403 | orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, |
1404 | tlb->fullmm); | 1404 | tlb->fullmm); |
1405 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1405 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1406 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | 1406 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); |
1407 | if (is_huge_zero_pmd(orig_pmd)) { | 1407 | if (is_huge_zero_pmd(orig_pmd)) { |
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1459 | new_ptl = pmd_lockptr(mm, new_pmd); | 1459 | new_ptl = pmd_lockptr(mm, new_pmd); |
1460 | if (new_ptl != old_ptl) | 1460 | if (new_ptl != old_ptl) |
1461 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | 1461 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); |
1462 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1462 | pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); |
1463 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1463 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1464 | 1464 | ||
1465 | if (pmd_move_must_withdraw(new_ptl, old_ptl)) { | 1465 | if (pmd_move_must_withdraw(new_ptl, old_ptl)) { |
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1505 | } | 1505 | } |
1506 | 1506 | ||
1507 | if (!prot_numa || !pmd_protnone(*pmd)) { | 1507 | if (!prot_numa || !pmd_protnone(*pmd)) { |
1508 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); | 1508 | entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); |
1509 | entry = pmd_modify(entry, newprot); | 1509 | entry = pmd_modify(entry, newprot); |
1510 | if (preserve_write) | 1510 | if (preserve_write) |
1511 | entry = pmd_mkwrite(entry); | 1511 | entry = pmd_mkwrite(entry); |
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2499 | * huge and small TLB entries for the same virtual address | 2499 | * huge and small TLB entries for the same virtual address |
2500 | * to avoid the risk of CPU bugs in that area. | 2500 | * to avoid the risk of CPU bugs in that area. |
2501 | */ | 2501 | */ |
2502 | _pmd = pmdp_clear_flush(vma, address, pmd); | 2502 | _pmd = pmdp_collapse_flush(vma, address, pmd); |
2503 | spin_unlock(pmd_ptl); | 2503 | spin_unlock(pmd_ptl); |
2504 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2504 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2505 | 2505 | ||
@@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void) | |||
2799 | 2799 | ||
2800 | cond_resched(); | 2800 | cond_resched(); |
2801 | 2801 | ||
2802 | if (unlikely(kthread_should_stop() || freezing(current))) | 2802 | if (unlikely(kthread_should_stop() || try_to_freeze())) |
2803 | break; | 2803 | break; |
2804 | 2804 | ||
2805 | spin_lock(&khugepaged_mm_lock); | 2805 | spin_lock(&khugepaged_mm_lock); |
@@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void) | |||
2820 | 2820 | ||
2821 | static void khugepaged_wait_work(void) | 2821 | static void khugepaged_wait_work(void) |
2822 | { | 2822 | { |
2823 | try_to_freeze(); | ||
2824 | |||
2825 | if (khugepaged_has_work()) { | 2823 | if (khugepaged_has_work()) { |
2826 | if (!khugepaged_scan_sleep_millisecs) | 2824 | if (!khugepaged_scan_sleep_millisecs) |
2827 | return; | 2825 | return; |
@@ -2865,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | |||
2865 | pmd_t _pmd; | 2863 | pmd_t _pmd; |
2866 | int i; | 2864 | int i; |
2867 | 2865 | ||
2868 | pmdp_clear_flush_notify(vma, haddr, pmd); | 2866 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
2869 | /* leave pmd empty until pte is filled */ | 2867 | /* leave pmd empty until pte is filled */ |
2870 | 2868 | ||
2871 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 2869 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 271e4432734c..75c0eef52c5d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable; | |||
40 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
41 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
42 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
43 | /* | ||
44 | * Minimum page order among possible hugepage sizes, set to a proper value | ||
45 | * at boot time. | ||
46 | */ | ||
47 | static unsigned int minimum_order __read_mostly = UINT_MAX; | ||
43 | 48 | ||
44 | __initdata LIST_HEAD(huge_boot_pages); | 49 | __initdata LIST_HEAD(huge_boot_pages); |
45 | 50 | ||
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | |||
212 | * Region tracking -- allows tracking of reservations and instantiated pages | 217 | * Region tracking -- allows tracking of reservations and instantiated pages |
213 | * across the pages in a mapping. | 218 | * across the pages in a mapping. |
214 | * | 219 | * |
215 | * The region data structures are embedded into a resv_map and | 220 | * The region data structures are embedded into a resv_map and protected |
216 | * protected by a resv_map's lock | 221 | * by a resv_map's lock. The set of regions within the resv_map represent |
222 | * reservations for huge pages, or huge pages that have already been | ||
223 | * instantiated within the map. The from and to elements are huge page | ||
224 | * indices into the associated mapping. from indicates the starting index | ||
225 | * of the region. to represents the first index past the end of the region. | ||
226 | * | ||
227 | * For example, a file region structure with from == 0 and to == 4 represents | ||
228 | * four huge pages in a mapping. It is important to note that the to element | ||
229 | * represents the first element past the end of the region. This is used in | ||
230 | * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. | ||
231 | * | ||
232 | * Interval notation of the form [from, to) will be used to indicate that | ||
233 | * the endpoint from is inclusive and to is exclusive. | ||
217 | */ | 234 | */ |
218 | struct file_region { | 235 | struct file_region { |
219 | struct list_head link; | 236 | struct list_head link; |
@@ -221,10 +238,22 @@ struct file_region { | |||
221 | long to; | 238 | long to; |
222 | }; | 239 | }; |
223 | 240 | ||
241 | /* | ||
242 | * Add the huge page range represented by [f, t) to the reserve | ||
243 | * map. Existing regions will be expanded to accommodate the | ||
244 | * specified range. We know only existing regions need to be | ||
245 | * expanded, because region_add is only called after region_chg | ||
246 | * with the same range. If a new file_region structure must | ||
247 | * be allocated, it is done in region_chg. | ||
248 | * | ||
249 | * Return the number of new huge pages added to the map. This | ||
250 | * number is greater than or equal to zero. | ||
251 | */ | ||
224 | static long region_add(struct resv_map *resv, long f, long t) | 252 | static long region_add(struct resv_map *resv, long f, long t) |
225 | { | 253 | { |
226 | struct list_head *head = &resv->regions; | 254 | struct list_head *head = &resv->regions; |
227 | struct file_region *rg, *nrg, *trg; | 255 | struct file_region *rg, *nrg, *trg; |
256 | long add = 0; | ||
228 | 257 | ||
229 | spin_lock(&resv->lock); | 258 | spin_lock(&resv->lock); |
230 | /* Locate the region we are either in or before. */ | 259 | /* Locate the region we are either in or before. */ |
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t) | |||
250 | if (rg->to > t) | 279 | if (rg->to > t) |
251 | t = rg->to; | 280 | t = rg->to; |
252 | if (rg != nrg) { | 281 | if (rg != nrg) { |
282 | /* Decrement return value by the deleted range. | ||
283 | * Another range will span this area so that by | ||
284 | * end of routine add will be >= zero | ||
285 | */ | ||
286 | add -= (rg->to - rg->from); | ||
253 | list_del(&rg->link); | 287 | list_del(&rg->link); |
254 | kfree(rg); | 288 | kfree(rg); |
255 | } | 289 | } |
256 | } | 290 | } |
291 | |||
292 | add += (nrg->from - f); /* Added to beginning of region */ | ||
257 | nrg->from = f; | 293 | nrg->from = f; |
294 | add += t - nrg->to; /* Added to end of region */ | ||
258 | nrg->to = t; | 295 | nrg->to = t; |
296 | |||
259 | spin_unlock(&resv->lock); | 297 | spin_unlock(&resv->lock); |
260 | return 0; | 298 | VM_BUG_ON(add < 0); |
299 | return add; | ||
261 | } | 300 | } |
262 | 301 | ||
302 | /* | ||
303 | * Examine the existing reserve map and determine how many | ||
304 | * huge pages in the specified range [f, t) are NOT currently | ||
305 | * represented. This routine is called before a subsequent | ||
306 | * call to region_add that will actually modify the reserve | ||
307 | * map to add the specified range [f, t). region_chg does | ||
308 | * not change the number of huge pages represented by the | ||
309 | * map. However, if the existing regions in the map can not | ||
310 | * be expanded to represent the new range, a new file_region | ||
311 | * structure is added to the map as a placeholder. This is | ||
312 | * so that the subsequent region_add call will have all the | ||
313 | * regions it needs and will not fail. | ||
314 | * | ||
315 | * Returns the number of huge pages that need to be added | ||
316 | * to the existing reservation map for the range [f, t). | ||
317 | * This number is greater or equal to zero. -ENOMEM is | ||
318 | * returned if a new file_region structure is needed and can | ||
319 | * not be allocated. | ||
320 | */ | ||
263 | static long region_chg(struct resv_map *resv, long f, long t) | 321 | static long region_chg(struct resv_map *resv, long f, long t) |
264 | { | 322 | { |
265 | struct list_head *head = &resv->regions; | 323 | struct list_head *head = &resv->regions; |
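
The new comments describe the reserve map as a set of [from, to) regions: region_chg() reports how many pages of a range are not yet covered (allocating a placeholder node if one will be needed), and the later region_add() covers the range and returns how many pages it actually added. A deliberately simplified model that uses a coverage bitmap instead of a region list, just to show the chg/add relationship:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAP_SIZE 16

    static bool covered[MAP_SIZE];   /* covered[i]: page i is inside some region */

    /* How many pages in [f, t) are NOT yet covered (what region_chg reports). */
    static long region_chg(long f, long t)
    {
            long chg = 0;

            for (long i = f; i < t; i++)
                    if (!covered[i])
                            chg++;
            return chg;
    }

    /* Cover [f, t); return how many pages were newly added (like region_add). */
    static long region_add(long f, long t)
    {
            long add = 0;

            for (long i = f; i < t; i++) {
                    if (!covered[i]) {
                            covered[i] = true;
                            add++;
                    }
            }
            return add;
    }

    int main(void)
    {
            region_add(0, 4);                      /* existing region [0, 4) */
            long chg = region_chg(2, 8);           /* pages 4..7 are missing */
            long add = region_add(2, 8);
            printf("chg=%ld add=%ld\n", chg, add); /* equal unless a race intervened */
            return 0;
    }
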
@@ -326,6 +384,11 @@ out_nrg: | |||
326 | return chg; | 384 | return chg; |
327 | } | 385 | } |
328 | 386 | ||
387 | /* | ||
388 | * Truncate the reserve map at index 'end'. Modify/truncate any | ||
389 | * region which contains end. Delete any regions past end. | ||
390 | * Return the number of huge pages removed from the map. | ||
391 | */ | ||
329 | static long region_truncate(struct resv_map *resv, long end) | 392 | static long region_truncate(struct resv_map *resv, long end) |
330 | { | 393 | { |
331 | struct list_head *head = &resv->regions; | 394 | struct list_head *head = &resv->regions; |
@@ -361,6 +424,10 @@ out: | |||
361 | return chg; | 424 | return chg; |
362 | } | 425 | } |
363 | 426 | ||
427 | /* | ||
428 | * Count and return the number of huge pages in the reserve map | ||
429 | * that intersect with the range [f, t). | ||
430 | */ | ||
364 | static long region_count(struct resv_map *resv, long f, long t) | 431 | static long region_count(struct resv_map *resv, long f, long t) |
365 | { | 432 | { |
366 | struct list_head *head = &resv->regions; | 433 | struct list_head *head = &resv->regions; |
@@ -1188,19 +1255,13 @@ static void dissolve_free_huge_page(struct page *page) | |||
1188 | */ | 1255 | */ |
1189 | void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) | 1256 | void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) |
1190 | { | 1257 | { |
1191 | unsigned int order = 8 * sizeof(void *); | ||
1192 | unsigned long pfn; | 1258 | unsigned long pfn; |
1193 | struct hstate *h; | ||
1194 | 1259 | ||
1195 | if (!hugepages_supported()) | 1260 | if (!hugepages_supported()) |
1196 | return; | 1261 | return; |
1197 | 1262 | ||
1198 | /* Set scan step to minimum hugepage size */ | 1263 | VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); |
1199 | for_each_hstate(h) | 1264 | for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) |
1200 | if (order > huge_page_order(h)) | ||
1201 | order = huge_page_order(h); | ||
1202 | VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); | ||
1203 | for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) | ||
1204 | dissolve_free_huge_page(pfn_to_page(pfn)); | 1265 | dissolve_free_huge_page(pfn_to_page(pfn)); |
1205 | } | 1266 | } |
1206 | 1267 | ||
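
With minimum_order computed once in hugetlb_init_hstates() (see the hunk further down), dissolve_free_huge_pages() can stride through the pfn range in steps of the smallest supported huge page instead of recomputing the order on every call. The stride arithmetic on its own, assuming an order of 9 (2 MB huge pages on a 4 KB base page):

    #include <stdio.h>

    int main(void)
    {
            unsigned int  minimum_order = 9;          /* assumed smallest order */
            unsigned long start_pfn = 0, end_pfn = 4096;
            unsigned long step = 1UL << minimum_order;
            unsigned long visited = 0;

            /* Visit one candidate head pfn per smallest-possible huge page. */
            for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn += step)
                    visited++;

            printf("checked %lu candidate pages\n", visited);  /* prints 8 */
            return 0;
    }
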
@@ -1423,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1423 | } | 1484 | } |
1424 | 1485 | ||
1425 | /* | 1486 | /* |
1426 | * Determine if the huge page at addr within the vma has an associated | 1487 | * vma_needs_reservation and vma_commit_reservation are used by the huge |
1427 | * reservation. Where it does not we will need to logically increase | 1488 | * page allocation routines to manage reservations. |
1428 | * reservation and actually increase subpool usage before an allocation | 1489 | * |
1429 | * can occur. Where any new reservation would be required the | 1490 | * vma_needs_reservation is called to determine if the huge page at addr |
1430 | * reservation change is prepared, but not committed. Once the page | 1491 | * within the vma has an associated reservation. If a reservation is |
1431 | * has been allocated from the subpool and instantiated the change should | 1492 | * needed, the value 1 is returned. The caller is then responsible for |
1432 | * be committed via vma_commit_reservation. No action is required on | 1493 | * managing the global reservation and subpool usage counts. After |
1433 | * failure. | 1494 | * the huge page has been allocated, vma_commit_reservation is called |
1495 | * to add the page to the reservation map. | ||
1496 | * | ||
1497 | * In the normal case, vma_commit_reservation returns the same value | ||
1498 | * as the preceding vma_needs_reservation call. The only time this | ||
1499 | * is not the case is if a reserve map was changed between calls. It | ||
1500 | * is the responsibility of the caller to notice the difference and | ||
1501 | * take appropriate action. | ||
1434 | */ | 1502 | */ |
1435 | static long vma_needs_reservation(struct hstate *h, | 1503 | static long __vma_reservation_common(struct hstate *h, |
1436 | struct vm_area_struct *vma, unsigned long addr) | 1504 | struct vm_area_struct *vma, unsigned long addr, |
1505 | bool commit) | ||
1437 | { | 1506 | { |
1438 | struct resv_map *resv; | 1507 | struct resv_map *resv; |
1439 | pgoff_t idx; | 1508 | pgoff_t idx; |
1440 | long chg; | 1509 | long ret; |
1441 | 1510 | ||
1442 | resv = vma_resv_map(vma); | 1511 | resv = vma_resv_map(vma); |
1443 | if (!resv) | 1512 | if (!resv) |
1444 | return 1; | 1513 | return 1; |
1445 | 1514 | ||
1446 | idx = vma_hugecache_offset(h, vma, addr); | 1515 | idx = vma_hugecache_offset(h, vma, addr); |
1447 | chg = region_chg(resv, idx, idx + 1); | 1516 | if (commit) |
1517 | ret = region_add(resv, idx, idx + 1); | ||
1518 | else | ||
1519 | ret = region_chg(resv, idx, idx + 1); | ||
1448 | 1520 | ||
1449 | if (vma->vm_flags & VM_MAYSHARE) | 1521 | if (vma->vm_flags & VM_MAYSHARE) |
1450 | return chg; | 1522 | return ret; |
1451 | else | 1523 | else |
1452 | return chg < 0 ? chg : 0; | 1524 | return ret < 0 ? ret : 0; |
1453 | } | 1525 | } |
1454 | static void vma_commit_reservation(struct hstate *h, | 1526 | |
1527 | static long vma_needs_reservation(struct hstate *h, | ||
1455 | struct vm_area_struct *vma, unsigned long addr) | 1528 | struct vm_area_struct *vma, unsigned long addr) |
1456 | { | 1529 | { |
1457 | struct resv_map *resv; | 1530 | return __vma_reservation_common(h, vma, addr, false); |
1458 | pgoff_t idx; | 1531 | } |
1459 | |||
1460 | resv = vma_resv_map(vma); | ||
1461 | if (!resv) | ||
1462 | return; | ||
1463 | 1532 | ||
1464 | idx = vma_hugecache_offset(h, vma, addr); | 1533 | static long vma_commit_reservation(struct hstate *h, |
1465 | region_add(resv, idx, idx + 1); | 1534 | struct vm_area_struct *vma, unsigned long addr) |
1535 | { | ||
1536 | return __vma_reservation_common(h, vma, addr, true); | ||
1466 | } | 1537 | } |
1467 | 1538 | ||
1468 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1539 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
@@ -1471,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1471 | struct hugepage_subpool *spool = subpool_vma(vma); | 1542 | struct hugepage_subpool *spool = subpool_vma(vma); |
1472 | struct hstate *h = hstate_vma(vma); | 1543 | struct hstate *h = hstate_vma(vma); |
1473 | struct page *page; | 1544 | struct page *page; |
1474 | long chg; | 1545 | long chg, commit; |
1475 | int ret, idx; | 1546 | int ret, idx; |
1476 | struct hugetlb_cgroup *h_cg; | 1547 | struct hugetlb_cgroup *h_cg; |
1477 | 1548 | ||
@@ -1512,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1512 | 1583 | ||
1513 | set_page_private(page, (unsigned long)spool); | 1584 | set_page_private(page, (unsigned long)spool); |
1514 | 1585 | ||
1515 | vma_commit_reservation(h, vma, addr); | 1586 | commit = vma_commit_reservation(h, vma, addr); |
1587 | if (unlikely(chg > commit)) { | ||
1588 | /* | ||
1589 | * The page was added to the reservation map between | ||
1590 | * vma_needs_reservation and vma_commit_reservation. | ||
1591 | * This indicates a race with hugetlb_reserve_pages. | ||
1592 | * Adjust for the subpool count incremented above AND | ||
1593 | * in hugetlb_reserve_pages for the same page. Also, | ||
1594 | * the reservation count added in hugetlb_reserve_pages | ||
1595 | * no longer applies. | ||
1596 | */ | ||
1597 | long rsv_adjust; | ||
1598 | |||
1599 | rsv_adjust = hugepage_subpool_put_pages(spool, 1); | ||
1600 | hugetlb_acct_memory(h, -rsv_adjust); | ||
1601 | } | ||
1516 | return page; | 1602 | return page; |
1517 | 1603 | ||
1518 | out_uncharge_cgroup: | 1604 | out_uncharge_cgroup: |
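
The rsv_adjust block handles a narrow race: region_chg() said a reservation would be needed (chg), but by the time vma_commit_reservation() ran, a concurrent hugetlb_reserve_pages() had already covered the page, so the commit reports fewer new pages than expected and the excess subpool and reservation charges are handed back. A back-of-the-envelope illustration with made-up counters (the numbers are purely illustrative; the mirror-image fix-up appears in the hugetlb_reserve_pages() hunk below):

    #include <stdio.h>

    int main(void)
    {
            long chg = 1;               /* what region_chg predicted        */
            long commit = 0;            /* what the commit actually added   */
            long subpool_charges = 2;   /* charged once by each racing path */
            long reserved = 1;          /* global reservation count         */

            if (chg > commit) {
                    long rsv_adjust = chg - commit;

                    subpool_charges -= rsv_adjust;  /* hugepage_subpool_put_pages()        */
                    reserved        -= rsv_adjust;  /* hugetlb_acct_memory(h, -rsv_adjust) */
            }
            printf("subpool charges: %ld, reserved: %ld\n", subpool_charges, reserved);
            return 0;
    }
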
@@ -1627,10 +1713,14 @@ static void __init hugetlb_init_hstates(void) | |||
1627 | struct hstate *h; | 1713 | struct hstate *h; |
1628 | 1714 | ||
1629 | for_each_hstate(h) { | 1715 | for_each_hstate(h) { |
1716 | if (minimum_order > huge_page_order(h)) | ||
1717 | minimum_order = huge_page_order(h); | ||
1718 | |||
1630 | /* oversize hugepages were init'ed in early boot */ | 1719 | /* oversize hugepages were init'ed in early boot */ |
1631 | if (!hstate_is_gigantic(h)) | 1720 | if (!hstate_is_gigantic(h)) |
1632 | hugetlb_hstate_alloc_pages(h); | 1721 | hugetlb_hstate_alloc_pages(h); |
1633 | } | 1722 | } |
1723 | VM_BUG_ON(minimum_order == UINT_MAX); | ||
1634 | } | 1724 | } |
1635 | 1725 | ||
1636 | static char * __init memfmt(char *buf, unsigned long n) | 1726 | static char * __init memfmt(char *buf, unsigned long n) |
@@ -3626,8 +3716,24 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3626 | * consumed reservations are stored in the map. Hence, nothing | 3716 | * consumed reservations are stored in the map. Hence, nothing |
3627 | * else has to be done for private mappings here | 3717 | * else has to be done for private mappings here |
3628 | */ | 3718 | */ |
3629 | if (!vma || vma->vm_flags & VM_MAYSHARE) | 3719 | if (!vma || vma->vm_flags & VM_MAYSHARE) { |
3630 | region_add(resv_map, from, to); | 3720 | long add = region_add(resv_map, from, to); |
3721 | |||
3722 | if (unlikely(chg > add)) { | ||
3723 | /* | ||
3724 | * pages in this range were added to the reserve | ||
3725 | * map between region_chg and region_add. This | ||
3726 | * indicates a race with alloc_huge_page. Adjust | ||
3727 | * the subpool and reserve counts modified above | ||
3728 | * based on the difference. | ||
3729 | */ | ||
3730 | long rsv_adjust; | ||
3731 | |||
3732 | rsv_adjust = hugepage_subpool_put_pages(spool, | ||
3733 | chg - add); | ||
3734 | hugetlb_acct_memory(h, -rsv_adjust); | ||
3735 | } | ||
3736 | } | ||
3631 | return 0; | 3737 | return 0; |
3632 | out_err: | 3738 | out_err: |
3633 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 3739 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
@@ -3789,6 +3895,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3789 | { | 3895 | { |
3790 | return NULL; | 3896 | return NULL; |
3791 | } | 3897 | } |
3898 | |||
3899 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
3900 | { | ||
3901 | return 0; | ||
3902 | } | ||
3792 | #define want_pmd_share() (0) | 3903 | #define want_pmd_share() (0) |
3793 | #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ | 3904 | #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ |
3794 | 3905 | ||
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4ca5fe0042e1..bf73ac17dad4 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
28 | /* | 28 | /* |
29 | * This implies unable to support free buddy pages. | 29 | * This implies unable to support free buddy pages. |
30 | */ | 30 | */ |
31 | if (!get_page_unless_zero(hpage)) | 31 | if (!get_hwpoison_page(p)) |
32 | return 0; | 32 | return 0; |
33 | 33 | ||
34 | if (!hwpoison_filter_enable) | 34 | if (!hwpoison_filter_enable) |
@@ -58,7 +58,7 @@ inject: | |||
58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); | 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); |
59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); | 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
60 | put_out: | 60 | put_out: |
61 | put_page(hpage); | 61 | put_page(p); |
62 | return 0; | 62 | return 0; |
63 | } | 63 | } |
64 | 64 | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f0fe4f2c1fa7..cf79f110157c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -53,6 +53,13 @@ | |||
53 | * modifications to the memory scanning parameters including the scan_thread | 53 | * modifications to the memory scanning parameters including the scan_thread |
54 | * pointer | 54 | * pointer |
55 | * | 55 | * |
56 | * Locks and mutexes are acquired/nested in the following order: | ||
57 | * | ||
58 | * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING) | ||
59 | * | ||
60 | * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex | ||
61 | * regions. | ||
62 | * | ||
56 | * The kmemleak_object structures have a use_count incremented or decremented | 63 | * The kmemleak_object structures have a use_count incremented or decremented |
57 | * using the get_object()/put_object() functions. When the use_count becomes | 64 | * using the get_object()/put_object() functions. When the use_count becomes |
58 | * 0, this count can no longer be incremented and put_object() schedules the | 65 | * 0, this count can no longer be incremented and put_object() schedules the |
@@ -195,6 +202,8 @@ static struct kmem_cache *scan_area_cache; | |||
195 | 202 | ||
196 | /* set if tracing memory operations is enabled */ | 203 | /* set if tracing memory operations is enabled */ |
197 | static int kmemleak_enabled; | 204 | static int kmemleak_enabled; |
205 | /* same as above but only for the kmemleak_free() callback */ | ||
206 | static int kmemleak_free_enabled; | ||
198 | /* set in the late_initcall if there were no errors */ | 207 | /* set in the late_initcall if there were no errors */ |
199 | static int kmemleak_initialized; | 208 | static int kmemleak_initialized; |
200 | /* enables or disables early logging of the memory operations */ | 209 | /* enables or disables early logging of the memory operations */ |
@@ -483,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | |||
483 | 492 | ||
484 | rcu_read_lock(); | 493 | rcu_read_lock(); |
485 | read_lock_irqsave(&kmemleak_lock, flags); | 494 | read_lock_irqsave(&kmemleak_lock, flags); |
486 | if (ptr >= min_addr && ptr < max_addr) | 495 | object = lookup_object(ptr, alias); |
487 | object = lookup_object(ptr, alias); | ||
488 | read_unlock_irqrestore(&kmemleak_lock, flags); | 496 | read_unlock_irqrestore(&kmemleak_lock, flags); |
489 | 497 | ||
490 | /* check whether the object is still available */ | 498 | /* check whether the object is still available */ |
@@ -496,6 +504,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | |||
496 | } | 504 | } |
497 | 505 | ||
498 | /* | 506 | /* |
507 | * Look up an object in the object search tree and remove it from both | ||
508 | * object_tree_root and object_list. The returned object's use_count should be | ||
509 | * at least 1, as initially set by create_object(). | ||
510 | */ | ||
511 | static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias) | ||
512 | { | ||
513 | unsigned long flags; | ||
514 | struct kmemleak_object *object; | ||
515 | |||
516 | write_lock_irqsave(&kmemleak_lock, flags); | ||
517 | object = lookup_object(ptr, alias); | ||
518 | if (object) { | ||
519 | rb_erase(&object->rb_node, &object_tree_root); | ||
520 | list_del_rcu(&object->object_list); | ||
521 | } | ||
522 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
523 | |||
524 | return object; | ||
525 | } | ||
526 | |||
527 | /* | ||
499 | * Save stack trace to the given array of MAX_TRACE size. | 528 | * Save stack trace to the given array of MAX_TRACE size. |
500 | */ | 529 | */ |
501 | static int __save_stack_trace(unsigned long *trace) | 530 | static int __save_stack_trace(unsigned long *trace) |
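Editor's note on find_and_remove_object() above: it looks the object up and unlinks it from the tree and list while still holding the write lock, so by the time the lock is dropped no other lookup can hand the same object to a racing caller. A self-contained user-space sketch of that "look up and unlink under one write lock" shape, using a plain linked list and a pthread rwlock instead of the rbtree/RCU machinery; names are hypothetical.

/*
 * Toy "find and remove under one write lock" helper: the node is
 * detached while the lock is held, so the caller owns it exclusively
 * once the function returns.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        unsigned long ptr;
        struct object *next;
};

static struct object *object_list;
static pthread_rwlock_t object_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct object *find_and_remove(unsigned long ptr)
{
        struct object **link, *obj = NULL;

        pthread_rwlock_wrlock(&object_lock);
        for (link = &object_list; *link; link = &(*link)->next) {
                if ((*link)->ptr == ptr) {
                        obj = *link;
                        *link = obj->next;      /* unlink while still holding the lock */
                        break;
                }
        }
        pthread_rwlock_unlock(&object_lock);
        return obj;                             /* caller now owns the detached object */
}

int main(void)
{
        struct object *obj = malloc(sizeof(*obj));

        obj->ptr = 0x1000;
        obj->next = NULL;
        object_list = obj;

        struct object *found = find_and_remove(0x1000);
        printf("removed object at 0x%lx\n", found ? found->ptr : 0UL);
        free(found);
        return 0;
}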
@@ -580,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
580 | kmemleak_stop("Cannot insert 0x%lx into the object " | 609 | kmemleak_stop("Cannot insert 0x%lx into the object " |
581 | "search tree (overlaps existing)\n", | 610 | "search tree (overlaps existing)\n", |
582 | ptr); | 611 | ptr); |
612 | /* | ||
613 | * No need for parent->lock here since "parent" cannot | ||
614 | * be freed while the kmemleak_lock is held. | ||
615 | */ | ||
616 | dump_object_info(parent); | ||
583 | kmem_cache_free(object_cache, object); | 617 | kmem_cache_free(object_cache, object); |
584 | object = parent; | 618 | object = NULL; |
585 | spin_lock(&object->lock); | ||
586 | dump_object_info(object); | ||
587 | spin_unlock(&object->lock); | ||
588 | goto out; | 619 | goto out; |
589 | } | 620 | } |
590 | } | 621 | } |
@@ -598,20 +629,14 @@ out: | |||
598 | } | 629 | } |
599 | 630 | ||
600 | /* | 631 | /* |
601 | * Remove the metadata (struct kmemleak_object) for a memory block from the | 632 | * Mark the object as not allocated and schedule RCU freeing via put_object(). |
602 | * object_list and object_tree_root and decrement its use_count. | ||
603 | */ | 633 | */ |
604 | static void __delete_object(struct kmemleak_object *object) | 634 | static void __delete_object(struct kmemleak_object *object) |
605 | { | 635 | { |
606 | unsigned long flags; | 636 | unsigned long flags; |
607 | 637 | ||
608 | write_lock_irqsave(&kmemleak_lock, flags); | ||
609 | rb_erase(&object->rb_node, &object_tree_root); | ||
610 | list_del_rcu(&object->object_list); | ||
611 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
612 | |||
613 | WARN_ON(!(object->flags & OBJECT_ALLOCATED)); | 638 | WARN_ON(!(object->flags & OBJECT_ALLOCATED)); |
614 | WARN_ON(atomic_read(&object->use_count) < 2); | 639 | WARN_ON(atomic_read(&object->use_count) < 1); |
615 | 640 | ||
616 | /* | 641 | /* |
617 | * Locking here also ensures that the corresponding memory block | 642 | * Locking here also ensures that the corresponding memory block |
@@ -631,7 +656,7 @@ static void delete_object_full(unsigned long ptr) | |||
631 | { | 656 | { |
632 | struct kmemleak_object *object; | 657 | struct kmemleak_object *object; |
633 | 658 | ||
634 | object = find_and_get_object(ptr, 0); | 659 | object = find_and_remove_object(ptr, 0); |
635 | if (!object) { | 660 | if (!object) { |
636 | #ifdef DEBUG | 661 | #ifdef DEBUG |
637 | kmemleak_warn("Freeing unknown object at 0x%08lx\n", | 662 | kmemleak_warn("Freeing unknown object at 0x%08lx\n", |
@@ -640,7 +665,6 @@ static void delete_object_full(unsigned long ptr) | |||
640 | return; | 665 | return; |
641 | } | 666 | } |
642 | __delete_object(object); | 667 | __delete_object(object); |
643 | put_object(object); | ||
644 | } | 668 | } |
645 | 669 | ||
646 | /* | 670 | /* |
@@ -653,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size) | |||
653 | struct kmemleak_object *object; | 677 | struct kmemleak_object *object; |
654 | unsigned long start, end; | 678 | unsigned long start, end; |
655 | 679 | ||
656 | object = find_and_get_object(ptr, 1); | 680 | object = find_and_remove_object(ptr, 1); |
657 | if (!object) { | 681 | if (!object) { |
658 | #ifdef DEBUG | 682 | #ifdef DEBUG |
659 | kmemleak_warn("Partially freeing unknown object at 0x%08lx " | 683 | kmemleak_warn("Partially freeing unknown object at 0x%08lx " |
@@ -661,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size) | |||
661 | #endif | 685 | #endif |
662 | return; | 686 | return; |
663 | } | 687 | } |
664 | __delete_object(object); | ||
665 | 688 | ||
666 | /* | 689 | /* |
667 | * Create one or two objects that may result from the memory block | 690 | * Create one or two objects that may result from the memory block |
@@ -679,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size) | |||
679 | create_object(ptr + size, end - ptr - size, object->min_count, | 702 | create_object(ptr + size, end - ptr - size, object->min_count, |
680 | GFP_KERNEL); | 703 | GFP_KERNEL); |
681 | 704 | ||
682 | put_object(object); | 705 | __delete_object(object); |
683 | } | 706 | } |
684 | 707 | ||
685 | static void __paint_it(struct kmemleak_object *object, int color) | 708 | static void __paint_it(struct kmemleak_object *object, int color) |
@@ -907,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); | |||
907 | * kmemleak_alloc_percpu - register a newly allocated __percpu object | 930 | * kmemleak_alloc_percpu - register a newly allocated __percpu object |
908 | * @ptr: __percpu pointer to beginning of the object | 931 | * @ptr: __percpu pointer to beginning of the object |
909 | * @size: size of the object | 932 | * @size: size of the object |
933 | * @gfp: flags used for kmemleak internal memory allocations | ||
910 | * | 934 | * |
911 | * This function is called from the kernel percpu allocator when a new object | 935 | * This function is called from the kernel percpu allocator when a new object |
912 | * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL | 936 | * (memory block) is allocated (alloc_percpu). |
913 | * allocation. | ||
914 | */ | 937 | */ |
915 | void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) | 938 | void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, |
939 | gfp_t gfp) | ||
916 | { | 940 | { |
917 | unsigned int cpu; | 941 | unsigned int cpu; |
918 | 942 | ||
@@ -925,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) | |||
925 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 949 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
926 | for_each_possible_cpu(cpu) | 950 | for_each_possible_cpu(cpu) |
927 | create_object((unsigned long)per_cpu_ptr(ptr, cpu), | 951 | create_object((unsigned long)per_cpu_ptr(ptr, cpu), |
928 | size, 0, GFP_KERNEL); | 952 | size, 0, gfp); |
929 | else if (kmemleak_early_log) | 953 | else if (kmemleak_early_log) |
930 | log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); | 954 | log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); |
931 | } | 955 | } |
@@ -942,7 +966,7 @@ void __ref kmemleak_free(const void *ptr) | |||
942 | { | 966 | { |
943 | pr_debug("%s(0x%p)\n", __func__, ptr); | 967 | pr_debug("%s(0x%p)\n", __func__, ptr); |
944 | 968 | ||
945 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 969 | if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) |
946 | delete_object_full((unsigned long)ptr); | 970 | delete_object_full((unsigned long)ptr); |
947 | else if (kmemleak_early_log) | 971 | else if (kmemleak_early_log) |
948 | log_early(KMEMLEAK_FREE, ptr, 0, 0); | 972 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
@@ -982,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) | |||
982 | 1006 | ||
983 | pr_debug("%s(0x%p)\n", __func__, ptr); | 1007 | pr_debug("%s(0x%p)\n", __func__, ptr); |
984 | 1008 | ||
985 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 1009 | if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) |
986 | for_each_possible_cpu(cpu) | 1010 | for_each_possible_cpu(cpu) |
987 | delete_object_full((unsigned long)per_cpu_ptr(ptr, | 1011 | delete_object_full((unsigned long)per_cpu_ptr(ptr, |
988 | cpu)); | 1012 | cpu)); |
@@ -1148,19 +1172,18 @@ static int scan_should_stop(void) | |||
1148 | * found to the gray list. | 1172 | * found to the gray list. |
1149 | */ | 1173 | */ |
1150 | static void scan_block(void *_start, void *_end, | 1174 | static void scan_block(void *_start, void *_end, |
1151 | struct kmemleak_object *scanned, int allow_resched) | 1175 | struct kmemleak_object *scanned) |
1152 | { | 1176 | { |
1153 | unsigned long *ptr; | 1177 | unsigned long *ptr; |
1154 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); | 1178 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); |
1155 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); | 1179 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); |
1180 | unsigned long flags; | ||
1156 | 1181 | ||
1182 | read_lock_irqsave(&kmemleak_lock, flags); | ||
1157 | for (ptr = start; ptr < end; ptr++) { | 1183 | for (ptr = start; ptr < end; ptr++) { |
1158 | struct kmemleak_object *object; | 1184 | struct kmemleak_object *object; |
1159 | unsigned long flags; | ||
1160 | unsigned long pointer; | 1185 | unsigned long pointer; |
1161 | 1186 | ||
1162 | if (allow_resched) | ||
1163 | cond_resched(); | ||
1164 | if (scan_should_stop()) | 1187 | if (scan_should_stop()) |
1165 | break; | 1188 | break; |
1166 | 1189 | ||
@@ -1173,26 +1196,31 @@ static void scan_block(void *_start, void *_end, | |||
1173 | pointer = *ptr; | 1196 | pointer = *ptr; |
1174 | kasan_enable_current(); | 1197 | kasan_enable_current(); |
1175 | 1198 | ||
1176 | object = find_and_get_object(pointer, 1); | 1199 | if (pointer < min_addr || pointer >= max_addr) |
1200 | continue; | ||
1201 | |||
1202 | /* | ||
1203 | * No need for get_object() here since we hold kmemleak_lock. | ||
1204 | * object->use_count cannot be dropped to 0 while the object | ||
1205 | * is still present in object_tree_root and object_list | ||
1206 | * (with updates protected by kmemleak_lock). | ||
1207 | */ | ||
1208 | object = lookup_object(pointer, 1); | ||
1177 | if (!object) | 1209 | if (!object) |
1178 | continue; | 1210 | continue; |
1179 | if (object == scanned) { | 1211 | if (object == scanned) |
1180 | /* self referenced, ignore */ | 1212 | /* self referenced, ignore */ |
1181 | put_object(object); | ||
1182 | continue; | 1213 | continue; |
1183 | } | ||
1184 | 1214 | ||
1185 | /* | 1215 | /* |
1186 | * Avoid the lockdep recursive warning on object->lock being | 1216 | * Avoid the lockdep recursive warning on object->lock being |
1187 | * previously acquired in scan_object(). These locks are | 1217 | * previously acquired in scan_object(). These locks are |
1188 | * enclosed by scan_mutex. | 1218 | * enclosed by scan_mutex. |
1189 | */ | 1219 | */ |
1190 | spin_lock_irqsave_nested(&object->lock, flags, | 1220 | spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); |
1191 | SINGLE_DEPTH_NESTING); | ||
1192 | if (!color_white(object)) { | 1221 | if (!color_white(object)) { |
1193 | /* non-orphan, ignored or new */ | 1222 | /* non-orphan, ignored or new */ |
1194 | spin_unlock_irqrestore(&object->lock, flags); | 1223 | spin_unlock(&object->lock); |
1195 | put_object(object); | ||
1196 | continue; | 1224 | continue; |
1197 | } | 1225 | } |
1198 | 1226 | ||
@@ -1204,13 +1232,27 @@ static void scan_block(void *_start, void *_end, | |||
1204 | */ | 1232 | */ |
1205 | object->count++; | 1233 | object->count++; |
1206 | if (color_gray(object)) { | 1234 | if (color_gray(object)) { |
1235 | /* put_object() called when removing from gray_list */ | ||
1236 | WARN_ON(!get_object(object)); | ||
1207 | list_add_tail(&object->gray_list, &gray_list); | 1237 | list_add_tail(&object->gray_list, &gray_list); |
1208 | spin_unlock_irqrestore(&object->lock, flags); | ||
1209 | continue; | ||
1210 | } | 1238 | } |
1239 | spin_unlock(&object->lock); | ||
1240 | } | ||
1241 | read_unlock_irqrestore(&kmemleak_lock, flags); | ||
1242 | } | ||
1211 | 1243 | ||
1212 | spin_unlock_irqrestore(&object->lock, flags); | 1244 | /* |
1213 | put_object(object); | 1245 | * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. |
1246 | */ | ||
1247 | static void scan_large_block(void *start, void *end) | ||
1248 | { | ||
1249 | void *next; | ||
1250 | |||
1251 | while (start < end) { | ||
1252 | next = min(start + MAX_SCAN_SIZE, end); | ||
1253 | scan_block(start, next, NULL); | ||
1254 | start = next; | ||
1255 | cond_resched(); | ||
1214 | } | 1256 | } |
1215 | } | 1257 | } |
1216 | 1258 | ||
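Editor's note on scan_large_block() above: it walks a big range in MAX_SCAN_SIZE chunks and calls cond_resched() between chunks, so neither the CPU nor kmemleak_lock is held across the whole region at once. The user-space sketch below shows only that chunk-and-yield control flow; sched_yield() stands in for cond_resched() and the sizes are arbitrary.

/*
 * Chunked scan of a large buffer with a yield between chunks, bounding
 * the latency of any single pass. Toy workload, hypothetical sizes.
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>

#define MAX_SCAN_SIZE 4096

static unsigned long scan_block(const unsigned char *start, const unsigned char *end)
{
        unsigned long nonzero = 0;

        for (const unsigned char *p = start; p < end; p++)
                nonzero += (*p != 0);   /* stand-in for pointer lookups */
        return nonzero;
}

static unsigned long scan_large_block(const unsigned char *start, const unsigned char *end)
{
        unsigned long total = 0;

        while (start < end) {
                const unsigned char *next = start + MAX_SCAN_SIZE;

                if (next > end)
                        next = end;
                total += scan_block(start, next);
                start = next;
                sched_yield();          /* plays cond_resched() */
        }
        return total;
}

int main(void)
{
        static unsigned char buf[64 * 1024];

        memset(buf + 100, 0xab, 50);
        printf("non-zero bytes found: %lu\n",
               scan_large_block(buf, buf + sizeof(buf)));
        return 0;
}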
@@ -1236,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object) | |||
1236 | if (hlist_empty(&object->area_list)) { | 1278 | if (hlist_empty(&object->area_list)) { |
1237 | void *start = (void *)object->pointer; | 1279 | void *start = (void *)object->pointer; |
1238 | void *end = (void *)(object->pointer + object->size); | 1280 | void *end = (void *)(object->pointer + object->size); |
1281 | void *next; | ||
1239 | 1282 | ||
1240 | while (start < end && (object->flags & OBJECT_ALLOCATED) && | 1283 | do { |
1241 | !(object->flags & OBJECT_NO_SCAN)) { | 1284 | next = min(start + MAX_SCAN_SIZE, end); |
1242 | scan_block(start, min(start + MAX_SCAN_SIZE, end), | 1285 | scan_block(start, next, object); |
1243 | object, 0); | 1286 | |
1244 | start += MAX_SCAN_SIZE; | 1287 | start = next; |
1288 | if (start >= end) | ||
1289 | break; | ||
1245 | 1290 | ||
1246 | spin_unlock_irqrestore(&object->lock, flags); | 1291 | spin_unlock_irqrestore(&object->lock, flags); |
1247 | cond_resched(); | 1292 | cond_resched(); |
1248 | spin_lock_irqsave(&object->lock, flags); | 1293 | spin_lock_irqsave(&object->lock, flags); |
1249 | } | 1294 | } while (object->flags & OBJECT_ALLOCATED); |
1250 | } else | 1295 | } else |
1251 | hlist_for_each_entry(area, &object->area_list, node) | 1296 | hlist_for_each_entry(area, &object->area_list, node) |
1252 | scan_block((void *)area->start, | 1297 | scan_block((void *)area->start, |
1253 | (void *)(area->start + area->size), | 1298 | (void *)(area->start + area->size), |
1254 | object, 0); | 1299 | object); |
1255 | out: | 1300 | out: |
1256 | spin_unlock_irqrestore(&object->lock, flags); | 1301 | spin_unlock_irqrestore(&object->lock, flags); |
1257 | } | 1302 | } |
@@ -1328,14 +1373,14 @@ static void kmemleak_scan(void) | |||
1328 | rcu_read_unlock(); | 1373 | rcu_read_unlock(); |
1329 | 1374 | ||
1330 | /* data/bss scanning */ | 1375 | /* data/bss scanning */ |
1331 | scan_block(_sdata, _edata, NULL, 1); | 1376 | scan_large_block(_sdata, _edata); |
1332 | scan_block(__bss_start, __bss_stop, NULL, 1); | 1377 | scan_large_block(__bss_start, __bss_stop); |
1333 | 1378 | ||
1334 | #ifdef CONFIG_SMP | 1379 | #ifdef CONFIG_SMP |
1335 | /* per-cpu sections scanning */ | 1380 | /* per-cpu sections scanning */ |
1336 | for_each_possible_cpu(i) | 1381 | for_each_possible_cpu(i) |
1337 | scan_block(__per_cpu_start + per_cpu_offset(i), | 1382 | scan_large_block(__per_cpu_start + per_cpu_offset(i), |
1338 | __per_cpu_end + per_cpu_offset(i), NULL, 1); | 1383 | __per_cpu_end + per_cpu_offset(i)); |
1339 | #endif | 1384 | #endif |
1340 | 1385 | ||
1341 | /* | 1386 | /* |
@@ -1356,7 +1401,7 @@ static void kmemleak_scan(void) | |||
1356 | /* only scan if page is in use */ | 1401 | /* only scan if page is in use */ |
1357 | if (page_count(page) == 0) | 1402 | if (page_count(page) == 0) |
1358 | continue; | 1403 | continue; |
1359 | scan_block(page, page + 1, NULL, 1); | 1404 | scan_block(page, page + 1, NULL); |
1360 | } | 1405 | } |
1361 | } | 1406 | } |
1362 | put_online_mems(); | 1407 | put_online_mems(); |
@@ -1370,7 +1415,7 @@ static void kmemleak_scan(void) | |||
1370 | read_lock(&tasklist_lock); | 1415 | read_lock(&tasklist_lock); |
1371 | do_each_thread(g, p) { | 1416 | do_each_thread(g, p) { |
1372 | scan_block(task_stack_page(p), task_stack_page(p) + | 1417 | scan_block(task_stack_page(p), task_stack_page(p) + |
1373 | THREAD_SIZE, NULL, 0); | 1418 | THREAD_SIZE, NULL); |
1374 | } while_each_thread(g, p); | 1419 | } while_each_thread(g, p); |
1375 | read_unlock(&tasklist_lock); | 1420 | read_unlock(&tasklist_lock); |
1376 | } | 1421 | } |
@@ -1747,15 +1792,20 @@ static void __kmemleak_do_cleanup(void) | |||
1747 | */ | 1792 | */ |
1748 | static void kmemleak_do_cleanup(struct work_struct *work) | 1793 | static void kmemleak_do_cleanup(struct work_struct *work) |
1749 | { | 1794 | { |
1750 | mutex_lock(&scan_mutex); | ||
1751 | stop_scan_thread(); | 1795 | stop_scan_thread(); |
1752 | 1796 | ||
1797 | /* | ||
1798 | * Once the scan thread has stopped, it is safe to no longer track | ||
1799 | * object freeing. Ordering of the scan thread stopping and the memory | ||
1800 | * accesses below is guaranteed by the kthread_stop() function. | ||
1801 | */ | ||
1802 | kmemleak_free_enabled = 0; | ||
1803 | |||
1753 | if (!kmemleak_found_leaks) | 1804 | if (!kmemleak_found_leaks) |
1754 | __kmemleak_do_cleanup(); | 1805 | __kmemleak_do_cleanup(); |
1755 | else | 1806 | else |
1756 | pr_info("Kmemleak disabled without freeing internal data. " | 1807 | pr_info("Kmemleak disabled without freeing internal data. " |
1757 | "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); | 1808 | "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); |
1758 | mutex_unlock(&scan_mutex); | ||
1759 | } | 1809 | } |
1760 | 1810 | ||
1761 | static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); | 1811 | static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); |
@@ -1776,6 +1826,8 @@ static void kmemleak_disable(void) | |||
1776 | /* check whether it is too early for a kernel thread */ | 1826 | /* check whether it is too early for a kernel thread */ |
1777 | if (kmemleak_initialized) | 1827 | if (kmemleak_initialized) |
1778 | schedule_work(&cleanup_work); | 1828 | schedule_work(&cleanup_work); |
1829 | else | ||
1830 | kmemleak_free_enabled = 0; | ||
1779 | 1831 | ||
1780 | pr_info("Kernel memory leak detector disabled\n"); | 1832 | pr_info("Kernel memory leak detector disabled\n"); |
1781 | } | 1833 | } |
@@ -1840,8 +1892,10 @@ void __init kmemleak_init(void) | |||
1840 | if (kmemleak_error) { | 1892 | if (kmemleak_error) { |
1841 | local_irq_restore(flags); | 1893 | local_irq_restore(flags); |
1842 | return; | 1894 | return; |
1843 | } else | 1895 | } else { |
1844 | kmemleak_enabled = 1; | 1896 | kmemleak_enabled = 1; |
1897 | kmemleak_free_enabled = 1; | ||
1898 | } | ||
1845 | local_irq_restore(flags); | 1899 | local_irq_restore(flags); |
1846 | 1900 | ||
1847 | /* | 1901 | /* |
diff --git a/mm/memblock.c b/mm/memblock.c index 9318b567ed79..1b444c730846 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock; | |||
54 | #ifdef CONFIG_MOVABLE_NODE | 54 | #ifdef CONFIG_MOVABLE_NODE |
55 | bool movable_node_enabled __initdata_memblock = false; | 55 | bool movable_node_enabled __initdata_memblock = false; |
56 | #endif | 56 | #endif |
57 | static bool system_has_some_mirror __initdata_memblock = false; | ||
57 | static int memblock_can_resize __initdata_memblock; | 58 | static int memblock_can_resize __initdata_memblock; |
58 | static int memblock_memory_in_slab __initdata_memblock = 0; | 59 | static int memblock_memory_in_slab __initdata_memblock = 0; |
59 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 60 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
60 | 61 | ||
62 | ulong __init_memblock choose_memblock_flags(void) | ||
63 | { | ||
64 | return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; | ||
65 | } | ||
66 | |||
61 | /* inline so we don't get a warning when pr_debug is compiled out */ | 67 | /* inline so we don't get a warning when pr_debug is compiled out */ |
62 | static __init_memblock const char * | 68 | static __init_memblock const char * |
63 | memblock_type_name(struct memblock_type *type) | 69 | memblock_type_name(struct memblock_type *type) |
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
107 | * @size: size of free area to find | 113 | * @size: size of free area to find |
108 | * @align: alignment of free area to find | 114 | * @align: alignment of free area to find |
109 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | 115 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
116 | * @flags: pick from blocks based on memory attributes | ||
110 | * | 117 | * |
111 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | 118 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. |
112 | * | 119 | * |
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
115 | */ | 122 | */ |
116 | static phys_addr_t __init_memblock | 123 | static phys_addr_t __init_memblock |
117 | __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | 124 | __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, |
118 | phys_addr_t size, phys_addr_t align, int nid) | 125 | phys_addr_t size, phys_addr_t align, int nid, |
126 | ulong flags) | ||
119 | { | 127 | { |
120 | phys_addr_t this_start, this_end, cand; | 128 | phys_addr_t this_start, this_end, cand; |
121 | u64 i; | 129 | u64 i; |
122 | 130 | ||
123 | for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { | 131 | for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { |
124 | this_start = clamp(this_start, start, end); | 132 | this_start = clamp(this_start, start, end); |
125 | this_end = clamp(this_end, start, end); | 133 | this_end = clamp(this_end, start, end); |
126 | 134 | ||
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
139 | * @size: size of free area to find | 147 | * @size: size of free area to find |
140 | * @align: alignment of free area to find | 148 | * @align: alignment of free area to find |
141 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | 149 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
150 | * @flags: pick from blocks based on memory attributes | ||
142 | * | 151 | * |
143 | * Utility called from memblock_find_in_range_node(), find free area top-down. | 152 | * Utility called from memblock_find_in_range_node(), find free area top-down. |
144 | * | 153 | * |
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
147 | */ | 156 | */ |
148 | static phys_addr_t __init_memblock | 157 | static phys_addr_t __init_memblock |
149 | __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | 158 | __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, |
150 | phys_addr_t size, phys_addr_t align, int nid) | 159 | phys_addr_t size, phys_addr_t align, int nid, |
160 | ulong flags) | ||
151 | { | 161 | { |
152 | phys_addr_t this_start, this_end, cand; | 162 | phys_addr_t this_start, this_end, cand; |
153 | u64 i; | 163 | u64 i; |
154 | 164 | ||
155 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | 165 | for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, |
166 | NULL) { | ||
156 | this_start = clamp(this_start, start, end); | 167 | this_start = clamp(this_start, start, end); |
157 | this_end = clamp(this_end, start, end); | 168 | this_end = clamp(this_end, start, end); |
158 | 169 | ||
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
174 | * @start: start of candidate range | 185 | * @start: start of candidate range |
175 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 186 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
176 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | 187 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
188 | * @flags: pick from blocks based on memory attributes | ||
177 | * | 189 | * |
178 | * Find @size free area aligned to @align in the specified range and node. | 190 | * Find @size free area aligned to @align in the specified range and node. |
179 | * | 191 | * |
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
190 | */ | 202 | */ |
191 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, | 203 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, |
192 | phys_addr_t align, phys_addr_t start, | 204 | phys_addr_t align, phys_addr_t start, |
193 | phys_addr_t end, int nid) | 205 | phys_addr_t end, int nid, ulong flags) |
194 | { | 206 | { |
195 | phys_addr_t kernel_end, ret; | 207 | phys_addr_t kernel_end, ret; |
196 | 208 | ||
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, | |||
215 | 227 | ||
216 | /* ok, try bottom-up allocation first */ | 228 | /* ok, try bottom-up allocation first */ |
217 | ret = __memblock_find_range_bottom_up(bottom_up_start, end, | 229 | ret = __memblock_find_range_bottom_up(bottom_up_start, end, |
218 | size, align, nid); | 230 | size, align, nid, flags); |
219 | if (ret) | 231 | if (ret) |
220 | return ret; | 232 | return ret; |
221 | 233 | ||
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, | |||
233 | "memory hotunplug may be affected\n"); | 245 | "memory hotunplug may be affected\n"); |
234 | } | 246 | } |
235 | 247 | ||
236 | return __memblock_find_range_top_down(start, end, size, align, nid); | 248 | return __memblock_find_range_top_down(start, end, size, align, nid, |
249 | flags); | ||
237 | } | 250 | } |
238 | 251 | ||
239 | /** | 252 | /** |
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
252 | phys_addr_t end, phys_addr_t size, | 265 | phys_addr_t end, phys_addr_t size, |
253 | phys_addr_t align) | 266 | phys_addr_t align) |
254 | { | 267 | { |
255 | return memblock_find_in_range_node(size, align, start, end, | 268 | phys_addr_t ret; |
256 | NUMA_NO_NODE); | 269 | ulong flags = choose_memblock_flags(); |
270 | |||
271 | again: | ||
272 | ret = memblock_find_in_range_node(size, align, start, end, | ||
273 | NUMA_NO_NODE, flags); | ||
274 | |||
275 | if (!ret && (flags & MEMBLOCK_MIRROR)) { | ||
276 | pr_warn("Could not allocate %pap bytes of mirrored memory\n", | ||
277 | &size); | ||
278 | flags &= ~MEMBLOCK_MIRROR; | ||
279 | goto again; | ||
280 | } | ||
281 | |||
282 | return ret; | ||
257 | } | 283 | } |
258 | 284 | ||
259 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 285 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
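Editor's note on the memblock_find_in_range() hunk above: it asks choose_memblock_flags() for the preferred constraint, tries mirrored regions first, and on failure warns and retries with MEMBLOCK_MIRROR cleared. The toy allocator below reproduces only that retry control flow under hypothetical names; it is not the memblock implementation.

/*
 * "Try mirrored memory first, then fall back to any memory" retry
 * pattern. The allocator is a stub; only the control flow matters.
 */
#include <stdio.h>

#define FLAG_NONE   0x0UL
#define FLAG_MIRROR 0x1UL

static unsigned long mirrored_left = 0; /* pretend mirrored memory is exhausted */

static unsigned long toy_alloc(unsigned long size, unsigned long flags)
{
        if (flags & FLAG_MIRROR)
                return size <= mirrored_left ? 0x1000 : 0;
        return 0x2000;                  /* ordinary memory always succeeds here */
}

static unsigned long alloc_with_fallback(unsigned long size)
{
        unsigned long flags = FLAG_MIRROR;      /* what choose_memblock_flags() would pick */
        unsigned long ret;

again:
        ret = toy_alloc(size, flags);
        if (!ret && (flags & FLAG_MIRROR)) {
                fprintf(stderr, "could not allocate %lu bytes of mirrored memory\n", size);
                flags &= ~FLAG_MIRROR;
                goto again;
        }
        return ret;
}

int main(void)
{
        printf("allocated at 0x%lx\n", alloc_with_fallback(4096));
        return 0;
}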
@@ -779,9 +805,25 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
779 | } | 805 | } |
780 | 806 | ||
781 | /** | 807 | /** |
808 | * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. | ||
809 | * @base: the base phys addr of the region | ||
810 | * @size: the size of the region | ||
811 | * | ||
812 | * Return 0 on success, -errno on failure. | ||
813 | */ | ||
814 | int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) | ||
815 | { | ||
816 | system_has_some_mirror = true; | ||
817 | |||
818 | return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); | ||
819 | } | ||
820 | |||
821 | |||
822 | /** | ||
782 | * __next__mem_range - next function for for_each_free_mem_range() etc. | 823 | * __next__mem_range - next function for for_each_free_mem_range() etc. |
783 | * @idx: pointer to u64 loop variable | 824 | * @idx: pointer to u64 loop variable |
784 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 825 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
826 | * @flags: pick from blocks based on memory attributes | ||
785 | * @type_a: pointer to memblock_type from where the range is taken | 827 | * @type_a: pointer to memblock_type from where the range is taken |
786 | * @type_b: pointer to memblock_type which excludes memory from being taken | 828 | * @type_b: pointer to memblock_type which excludes memory from being taken |
787 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 829 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
@@ -803,7 +845,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
803 | * As both region arrays are sorted, the function advances the two indices | 845 | * As both region arrays are sorted, the function advances the two indices |
804 | * in lockstep and returns each intersection. | 846 | * in lockstep and returns each intersection. |
805 | */ | 847 | */ |
806 | void __init_memblock __next_mem_range(u64 *idx, int nid, | 848 | void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, |
807 | struct memblock_type *type_a, | 849 | struct memblock_type *type_a, |
808 | struct memblock_type *type_b, | 850 | struct memblock_type *type_b, |
809 | phys_addr_t *out_start, | 851 | phys_addr_t *out_start, |
@@ -831,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, | |||
831 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | 873 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) |
832 | continue; | 874 | continue; |
833 | 875 | ||
876 | /* if we want mirror memory skip non-mirror memory regions */ | ||
877 | if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) | ||
878 | continue; | ||
879 | |||
834 | if (!type_b) { | 880 | if (!type_b) { |
835 | if (out_start) | 881 | if (out_start) |
836 | *out_start = m_start; | 882 | *out_start = m_start; |
@@ -895,6 +941,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, | |||
895 | * | 941 | * |
896 | * @idx: pointer to u64 loop variable | 942 | * @idx: pointer to u64 loop variable |
897 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 943 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
944 | * @flags: pick from blocks based on memory attributes | ||
898 | * @type_a: pointer to memblock_type from where the range is taken | 945 | * @type_a: pointer to memblock_type from where the range is taken |
899 | * @type_b: pointer to memblock_type which excludes memory from being taken | 946 | * @type_b: pointer to memblock_type which excludes memory from being taken |
900 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 947 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
@@ -903,7 +950,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, | |||
903 | * | 950 | * |
904 | * Reverse of __next_mem_range(). | 951 | * Reverse of __next_mem_range(). |
905 | */ | 952 | */ |
906 | void __init_memblock __next_mem_range_rev(u64 *idx, int nid, | 953 | void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, |
907 | struct memblock_type *type_a, | 954 | struct memblock_type *type_a, |
908 | struct memblock_type *type_b, | 955 | struct memblock_type *type_b, |
909 | phys_addr_t *out_start, | 956 | phys_addr_t *out_start, |
@@ -935,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, | |||
935 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | 982 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) |
936 | continue; | 983 | continue; |
937 | 984 | ||
985 | /* if we want mirror memory skip non-mirror memory regions */ | ||
986 | if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) | ||
987 | continue; | ||
988 | |||
938 | if (!type_b) { | 989 | if (!type_b) { |
939 | if (out_start) | 990 | if (out_start) |
940 | *out_start = m_start; | 991 | *out_start = m_start; |
@@ -1050,14 +1101,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
1050 | 1101 | ||
1051 | static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, | 1102 | static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, |
1052 | phys_addr_t align, phys_addr_t start, | 1103 | phys_addr_t align, phys_addr_t start, |
1053 | phys_addr_t end, int nid) | 1104 | phys_addr_t end, int nid, ulong flags) |
1054 | { | 1105 | { |
1055 | phys_addr_t found; | 1106 | phys_addr_t found; |
1056 | 1107 | ||
1057 | if (!align) | 1108 | if (!align) |
1058 | align = SMP_CACHE_BYTES; | 1109 | align = SMP_CACHE_BYTES; |
1059 | 1110 | ||
1060 | found = memblock_find_in_range_node(size, align, start, end, nid); | 1111 | found = memblock_find_in_range_node(size, align, start, end, nid, |
1112 | flags); | ||
1061 | if (found && !memblock_reserve(found, size)) { | 1113 | if (found && !memblock_reserve(found, size)) { |
1062 | /* | 1114 | /* |
1063 | * The min_count is set to 0 so that memblock allocations are | 1115 | * The min_count is set to 0 so that memblock allocations are |
@@ -1070,26 +1122,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, | |||
1070 | } | 1122 | } |
1071 | 1123 | ||
1072 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | 1124 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, |
1073 | phys_addr_t start, phys_addr_t end) | 1125 | phys_addr_t start, phys_addr_t end, |
1126 | ulong flags) | ||
1074 | { | 1127 | { |
1075 | return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); | 1128 | return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, |
1129 | flags); | ||
1076 | } | 1130 | } |
1077 | 1131 | ||
1078 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | 1132 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, |
1079 | phys_addr_t align, phys_addr_t max_addr, | 1133 | phys_addr_t align, phys_addr_t max_addr, |
1080 | int nid) | 1134 | int nid, ulong flags) |
1081 | { | 1135 | { |
1082 | return memblock_alloc_range_nid(size, align, 0, max_addr, nid); | 1136 | return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); |
1083 | } | 1137 | } |
1084 | 1138 | ||
1085 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) | 1139 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) |
1086 | { | 1140 | { |
1087 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); | 1141 | ulong flags = choose_memblock_flags(); |
1142 | phys_addr_t ret; | ||
1143 | |||
1144 | again: | ||
1145 | ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, | ||
1146 | nid, flags); | ||
1147 | |||
1148 | if (!ret && (flags & MEMBLOCK_MIRROR)) { | ||
1149 | flags &= ~MEMBLOCK_MIRROR; | ||
1150 | goto again; | ||
1151 | } | ||
1152 | return ret; | ||
1088 | } | 1153 | } |
1089 | 1154 | ||
1090 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1155 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
1091 | { | 1156 | { |
1092 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); | 1157 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, |
1158 | MEMBLOCK_NONE); | ||
1093 | } | 1159 | } |
1094 | 1160 | ||
1095 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1161 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
@@ -1153,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal( | |||
1153 | { | 1219 | { |
1154 | phys_addr_t alloc; | 1220 | phys_addr_t alloc; |
1155 | void *ptr; | 1221 | void *ptr; |
1222 | ulong flags = choose_memblock_flags(); | ||
1156 | 1223 | ||
1157 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | 1224 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) |
1158 | nid = NUMA_NO_NODE; | 1225 | nid = NUMA_NO_NODE; |
@@ -1173,13 +1240,14 @@ static void * __init memblock_virt_alloc_internal( | |||
1173 | 1240 | ||
1174 | again: | 1241 | again: |
1175 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | 1242 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, |
1176 | nid); | 1243 | nid, flags); |
1177 | if (alloc) | 1244 | if (alloc) |
1178 | goto done; | 1245 | goto done; |
1179 | 1246 | ||
1180 | if (nid != NUMA_NO_NODE) { | 1247 | if (nid != NUMA_NO_NODE) { |
1181 | alloc = memblock_find_in_range_node(size, align, min_addr, | 1248 | alloc = memblock_find_in_range_node(size, align, min_addr, |
1182 | max_addr, NUMA_NO_NODE); | 1249 | max_addr, NUMA_NO_NODE, |
1250 | flags); | ||
1183 | if (alloc) | 1251 | if (alloc) |
1184 | goto done; | 1252 | goto done; |
1185 | } | 1253 | } |
@@ -1187,10 +1255,16 @@ again: | |||
1187 | if (min_addr) { | 1255 | if (min_addr) { |
1188 | min_addr = 0; | 1256 | min_addr = 0; |
1189 | goto again; | 1257 | goto again; |
1190 | } else { | ||
1191 | goto error; | ||
1192 | } | 1258 | } |
1193 | 1259 | ||
1260 | if (flags & MEMBLOCK_MIRROR) { | ||
1261 | flags &= ~MEMBLOCK_MIRROR; | ||
1262 | pr_warn("Could not allocate %pap bytes of mirrored memory\n", | ||
1263 | &size); | ||
1264 | goto again; | ||
1265 | } | ||
1266 | |||
1267 | return NULL; | ||
1194 | done: | 1268 | done: |
1195 | memblock_reserve(alloc, size); | 1269 | memblock_reserve(alloc, size); |
1196 | ptr = phys_to_virt(alloc); | 1270 | ptr = phys_to_virt(alloc); |
@@ -1205,9 +1279,6 @@ done: | |||
1205 | kmemleak_alloc(ptr, size, 0, 0); | 1279 | kmemleak_alloc(ptr, size, 0, 0); |
1206 | 1280 | ||
1207 | return ptr; | 1281 | return ptr; |
1208 | |||
1209 | error: | ||
1210 | return NULL; | ||
1211 | } | 1282 | } |
1212 | 1283 | ||
1213 | /** | 1284 | /** |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a04225d372ba..e65f7b0131d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -285,9 +285,9 @@ struct mem_cgroup { | |||
285 | */ | 285 | */ |
286 | bool use_hierarchy; | 286 | bool use_hierarchy; |
287 | 287 | ||
288 | /* protected by memcg_oom_lock */ | ||
288 | bool oom_lock; | 289 | bool oom_lock; |
289 | atomic_t under_oom; | 290 | int under_oom; |
290 | atomic_t oom_wakeups; | ||
291 | 291 | ||
292 | int swappiness; | 292 | int swappiness; |
293 | /* OOM-Killer disable */ | 293 | /* OOM-Killer disable */ |
@@ -1530,14 +1530,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1530 | unsigned int points = 0; | 1530 | unsigned int points = 0; |
1531 | struct task_struct *chosen = NULL; | 1531 | struct task_struct *chosen = NULL; |
1532 | 1532 | ||
1533 | mutex_lock(&oom_lock); | ||
1534 | |||
1533 | /* | 1535 | /* |
1534 | * If current has a pending SIGKILL or is exiting, then automatically | 1536 | * If current has a pending SIGKILL or is exiting, then automatically |
1535 | * select it. The goal is to allow it to allocate so that it may | 1537 | * select it. The goal is to allow it to allocate so that it may |
1536 | * quickly exit and free its memory. | 1538 | * quickly exit and free its memory. |
1537 | */ | 1539 | */ |
1538 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1540 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
1539 | mark_tsk_oom_victim(current); | 1541 | mark_oom_victim(current); |
1540 | return; | 1542 | goto unlock; |
1541 | } | 1543 | } |
1542 | 1544 | ||
1543 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); | 1545 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); |
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1564 | mem_cgroup_iter_break(memcg, iter); | 1566 | mem_cgroup_iter_break(memcg, iter); |
1565 | if (chosen) | 1567 | if (chosen) |
1566 | put_task_struct(chosen); | 1568 | put_task_struct(chosen); |
1567 | return; | 1569 | goto unlock; |
1568 | case OOM_SCAN_OK: | 1570 | case OOM_SCAN_OK: |
1569 | break; | 1571 | break; |
1570 | }; | 1572 | }; |
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1585 | css_task_iter_end(&it); | 1587 | css_task_iter_end(&it); |
1586 | } | 1588 | } |
1587 | 1589 | ||
1588 | if (!chosen) | 1590 | if (chosen) { |
1589 | return; | 1591 | points = chosen_points * 1000 / totalpages; |
1590 | points = chosen_points * 1000 / totalpages; | 1592 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, |
1591 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | 1593 | memcg, NULL, "Memory cgroup out of memory"); |
1592 | NULL, "Memory cgroup out of memory"); | 1594 | } |
1595 | unlock: | ||
1596 | mutex_unlock(&oom_lock); | ||
1593 | } | 1597 | } |
1594 | 1598 | ||
1595 | #if MAX_NUMNODES > 1 | 1599 | #if MAX_NUMNODES > 1 |
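Editor's note on the mem_cgroup_out_of_memory() hunks above: the function now holds the global oom_lock across the whole victim selection and turns each early return into goto unlock, so every exit path drops the mutex. A minimal sketch of that shape, with hypothetical helpers standing in for the real checks:

/*
 * Take the lock for the whole operation and funnel all early exits
 * through a single unlock label. Stand-in helpers only.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

static int current_is_dying(void) { return 0; } /* plays the fatal-signal check */
static int pick_victim(void)      { return 42; }

static void out_of_memory(void)
{
        pthread_mutex_lock(&oom_lock);

        if (current_is_dying())
                goto unlock;            /* early exit still drops the lock */

        printf("killing victim %d\n", pick_victim());

unlock:
        pthread_mutex_unlock(&oom_lock);
}

int main(void)
{
        out_of_memory();
        return 0;
}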
@@ -1806,8 +1810,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | |||
1806 | { | 1810 | { |
1807 | struct mem_cgroup *iter; | 1811 | struct mem_cgroup *iter; |
1808 | 1812 | ||
1813 | spin_lock(&memcg_oom_lock); | ||
1809 | for_each_mem_cgroup_tree(iter, memcg) | 1814 | for_each_mem_cgroup_tree(iter, memcg) |
1810 | atomic_inc(&iter->under_oom); | 1815 | iter->under_oom++; |
1816 | spin_unlock(&memcg_oom_lock); | ||
1811 | } | 1817 | } |
1812 | 1818 | ||
1813 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | 1819 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) |
@@ -1816,11 +1822,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | |||
1816 | 1822 | ||
1817 | /* | 1823 | /* |
1818 | * When a new child is created while the hierarchy is under oom, | 1824 | * When a new child is created while the hierarchy is under oom, |
1819 | * mem_cgroup_oom_lock() may not be called. We have to use | 1825 | * mem_cgroup_oom_lock() may not be called. Watch for underflow. |
1820 | * atomic_add_unless() here. | ||
1821 | */ | 1826 | */ |
1827 | spin_lock(&memcg_oom_lock); | ||
1822 | for_each_mem_cgroup_tree(iter, memcg) | 1828 | for_each_mem_cgroup_tree(iter, memcg) |
1823 | atomic_add_unless(&iter->under_oom, -1, 0); | 1829 | if (iter->under_oom > 0) |
1830 | iter->under_oom--; | ||
1831 | spin_unlock(&memcg_oom_lock); | ||
1824 | } | 1832 | } |
1825 | 1833 | ||
1826 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1834 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
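Editor's note on the under_oom hunks above: with under_oom now a plain int, all updates are serialized on memcg_oom_lock and the decrement refuses to go below zero, since a child created while the hierarchy is under OOM can be unmarked without ever having been marked. A small user-space sketch of that counter discipline, with toy types rather than the memcg tree walk:

/*
 * Lock-protected counter with an underflow guard on the decrement.
 * The mutex plays memcg_oom_lock; the struct is a stand-in.
 */
#include <pthread.h>
#include <stdio.h>

struct group {
        int under_oom;
};

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_under_oom(struct group *g)
{
        pthread_mutex_lock(&oom_lock);
        g->under_oom++;
        pthread_mutex_unlock(&oom_lock);
}

static void unmark_under_oom(struct group *g)
{
        pthread_mutex_lock(&oom_lock);
        if (g->under_oom > 0)           /* watch for underflow */
                g->under_oom--;
        pthread_mutex_unlock(&oom_lock);
}

int main(void)
{
        struct group g = { 0 };

        unmark_under_oom(&g);           /* never marked: stays at zero */
        mark_under_oom(&g);
        unmark_under_oom(&g);
        printf("under_oom = %d\n", g.under_oom);
        return 0;
}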
@@ -1846,17 +1854,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
1846 | return autoremove_wake_function(wait, mode, sync, arg); | 1854 | return autoremove_wake_function(wait, mode, sync, arg); |
1847 | } | 1855 | } |
1848 | 1856 | ||
1849 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) | ||
1850 | { | ||
1851 | atomic_inc(&memcg->oom_wakeups); | ||
1852 | /* for filtering, pass "memcg" as argument. */ | ||
1853 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | ||
1854 | } | ||
1855 | |||
1856 | static void memcg_oom_recover(struct mem_cgroup *memcg) | 1857 | static void memcg_oom_recover(struct mem_cgroup *memcg) |
1857 | { | 1858 | { |
1858 | if (memcg && atomic_read(&memcg->under_oom)) | 1859 | /* |
1859 | memcg_wakeup_oom(memcg); | 1860 | * For the following lockless ->under_oom test, the only required |
1861 | * guarantee is that it must see the state asserted by an OOM when | ||
1862 | * this function is called as a result of userland actions | ||
1863 | * triggered by the notification of the OOM. This is trivially | ||
1864 | * achieved by invoking mem_cgroup_mark_under_oom() before | ||
1865 | * triggering notification. | ||
1866 | */ | ||
1867 | if (memcg && memcg->under_oom) | ||
1868 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | ||
1860 | } | 1869 | } |
1861 | 1870 | ||
1862 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 1871 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
@@ -3864,7 +3873,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | |||
3864 | list_add(&event->list, &memcg->oom_notify); | 3873 | list_add(&event->list, &memcg->oom_notify); |
3865 | 3874 | ||
3866 | /* already in OOM ? */ | 3875 | /* already in OOM ? */ |
3867 | if (atomic_read(&memcg->under_oom)) | 3876 | if (memcg->under_oom) |
3868 | eventfd_signal(eventfd, 1); | 3877 | eventfd_signal(eventfd, 1); |
3869 | spin_unlock(&memcg_oom_lock); | 3878 | spin_unlock(&memcg_oom_lock); |
3870 | 3879 | ||
@@ -3893,7 +3902,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) | |||
3893 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); | 3902 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); |
3894 | 3903 | ||
3895 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); | 3904 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); |
3896 | seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); | 3905 | seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); |
3897 | return 0; | 3906 | return 0; |
3898 | } | 3907 | } |
3899 | 3908 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 501820c815b3..c53543d89282 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -20,6 +20,14 @@ | |||
20 | * this code has to be extremely careful. Generally it tries to use | 20 | * this code has to be extremely careful. Generally it tries to use |
21 | * normal locking rules, as in get the standard locks, even if that means | 21 | * normal locking rules, as in get the standard locks, even if that means |
22 | * the error handling takes potentially a long time. | 22 | * the error handling takes potentially a long time. |
23 | * | ||
24 | * It can be very tempting to add handling for obscure cases here. | ||
25 | * In general any code for handling new cases should only be added iff: | ||
26 | * - You know how to test it. | ||
27 | * - You have a test that can be added to mce-test | ||
28 | * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ | ||
29 | * - The case actually shows up as a frequent (top 10) page state in | ||
30 | * tools/vm/page-types when running a real workload. | ||
23 | * | 31 | * |
24 | * There are several operations here with exponential complexity because | 32 | * There are several operations here with exponential complexity because |
25 | * of unsuitable VM data structures. For example the operation to map back | 33 | * of unsuitable VM data structures. For example the operation to map back |
@@ -28,13 +36,6 @@ | |||
28 | * are rare we hope to get away with this. This avoids impacting the core | 36 | * are rare we hope to get away with this. This avoids impacting the core |
29 | * VM. | 37 | * VM. |
30 | */ | 38 | */ |
31 | |||
32 | /* | ||
33 | * Notebook: | ||
34 | * - hugetlb needs more code | ||
35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | ||
36 | * - pass bad pages to kdump next kernel | ||
37 | */ | ||
38 | #include <linux/kernel.h> | 39 | #include <linux/kernel.h> |
39 | #include <linux/mm.h> | 40 | #include <linux/mm.h> |
40 | #include <linux/page-flags.h> | 41 | #include <linux/page-flags.h> |
@@ -56,6 +57,7 @@ | |||
56 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
57 | #include <linux/kfifo.h> | 58 | #include <linux/kfifo.h> |
58 | #include "internal.h" | 59 | #include "internal.h" |
60 | #include "ras/ras_event.h" | ||
59 | 61 | ||
60 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 62 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
61 | 63 | ||
@@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill, | |||
503 | kfree(tk); | 505 | kfree(tk); |
504 | } | 506 | } |
505 | 507 | ||
506 | /* | ||
507 | * Error handlers for various types of pages. | ||
508 | */ | ||
509 | |||
510 | enum outcome { | ||
511 | IGNORED, /* Error: cannot be handled */ | ||
512 | FAILED, /* Error: handling failed */ | ||
513 | DELAYED, /* Will be handled later */ | ||
514 | RECOVERED, /* Successfully recovered */ | ||
515 | }; | ||
516 | |||
517 | static const char *action_name[] = { | 508 | static const char *action_name[] = { |
518 | [IGNORED] = "Ignored", | 509 | [MF_IGNORED] = "Ignored", |
519 | [FAILED] = "Failed", | 510 | [MF_FAILED] = "Failed", |
520 | [DELAYED] = "Delayed", | 511 | [MF_DELAYED] = "Delayed", |
521 | [RECOVERED] = "Recovered", | 512 | [MF_RECOVERED] = "Recovered", |
522 | }; | ||
523 | |||
524 | enum action_page_type { | ||
525 | MSG_KERNEL, | ||
526 | MSG_KERNEL_HIGH_ORDER, | ||
527 | MSG_SLAB, | ||
528 | MSG_DIFFERENT_COMPOUND, | ||
529 | MSG_POISONED_HUGE, | ||
530 | MSG_HUGE, | ||
531 | MSG_FREE_HUGE, | ||
532 | MSG_UNMAP_FAILED, | ||
533 | MSG_DIRTY_SWAPCACHE, | ||
534 | MSG_CLEAN_SWAPCACHE, | ||
535 | MSG_DIRTY_MLOCKED_LRU, | ||
536 | MSG_CLEAN_MLOCKED_LRU, | ||
537 | MSG_DIRTY_UNEVICTABLE_LRU, | ||
538 | MSG_CLEAN_UNEVICTABLE_LRU, | ||
539 | MSG_DIRTY_LRU, | ||
540 | MSG_CLEAN_LRU, | ||
541 | MSG_TRUNCATED_LRU, | ||
542 | MSG_BUDDY, | ||
543 | MSG_BUDDY_2ND, | ||
544 | MSG_UNKNOWN, | ||
545 | }; | 513 | }; |
546 | 514 | ||
547 | static const char * const action_page_types[] = { | 515 | static const char * const action_page_types[] = { |
548 | [MSG_KERNEL] = "reserved kernel page", | 516 | [MF_MSG_KERNEL] = "reserved kernel page", |
549 | [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", | 517 | [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", |
550 | [MSG_SLAB] = "kernel slab page", | 518 | [MF_MSG_SLAB] = "kernel slab page", |
551 | [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", | 519 | [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", |
552 | [MSG_POISONED_HUGE] = "huge page already hardware poisoned", | 520 | [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", |
553 | [MSG_HUGE] = "huge page", | 521 | [MF_MSG_HUGE] = "huge page", |
554 | [MSG_FREE_HUGE] = "free huge page", | 522 | [MF_MSG_FREE_HUGE] = "free huge page", |
555 | [MSG_UNMAP_FAILED] = "unmapping failed page", | 523 | [MF_MSG_UNMAP_FAILED] = "unmapping failed page", |
556 | [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", | 524 | [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", |
557 | [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", | 525 | [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", |
558 | [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", | 526 | [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", |
559 | [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", | 527 | [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", |
560 | [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", | 528 | [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", |
561 | [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", | 529 | [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", |
562 | [MSG_DIRTY_LRU] = "dirty LRU page", | 530 | [MF_MSG_DIRTY_LRU] = "dirty LRU page", |
563 | [MSG_CLEAN_LRU] = "clean LRU page", | 531 | [MF_MSG_CLEAN_LRU] = "clean LRU page", |
564 | [MSG_TRUNCATED_LRU] = "already truncated LRU page", | 532 | [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", |
565 | [MSG_BUDDY] = "free buddy page", | 533 | [MF_MSG_BUDDY] = "free buddy page", |
566 | [MSG_BUDDY_2ND] = "free buddy page (2nd try)", | 534 | [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", |
567 | [MSG_UNKNOWN] = "unknown page", | 535 | [MF_MSG_UNKNOWN] = "unknown page", |
568 | }; | 536 | }; |
569 | 537 | ||
570 | /* | 538 | /* |
@@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p) | |||
598 | */ | 566 | */ |
599 | static int me_kernel(struct page *p, unsigned long pfn) | 567 | static int me_kernel(struct page *p, unsigned long pfn) |
600 | { | 568 | { |
601 | return IGNORED; | 569 | return MF_IGNORED; |
602 | } | 570 | } |
603 | 571 | ||
604 | /* | 572 | /* |
@@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn) | |||
607 | static int me_unknown(struct page *p, unsigned long pfn) | 575 | static int me_unknown(struct page *p, unsigned long pfn) |
608 | { | 576 | { |
609 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); | 577 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); |
610 | return FAILED; | 578 | return MF_FAILED; |
611 | } | 579 | } |
612 | 580 | ||
613 | /* | 581 | /* |
@@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
616 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 584 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
617 | { | 585 | { |
618 | int err; | 586 | int err; |
619 | int ret = FAILED; | 587 | int ret = MF_FAILED; |
620 | struct address_space *mapping; | 588 | struct address_space *mapping; |
621 | 589 | ||
622 | delete_from_lru_cache(p); | 590 | delete_from_lru_cache(p); |
@@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
626 | * should be the one m_f() holds. | 594 | * should be the one m_f() holds. |
627 | */ | 595 | */ |
628 | if (PageAnon(p)) | 596 | if (PageAnon(p)) |
629 | return RECOVERED; | 597 | return MF_RECOVERED; |
630 | 598 | ||
631 | /* | 599 | /* |
632 | * Now truncate the page in the page cache. This is really | 600 | * Now truncate the page in the page cache. This is really |
@@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
640 | /* | 608 | /* |
641 | * Page has been torn down in the meantime | 609 | * Page has been torn down in the meantime |
642 | */ | 610 | */ |
643 | return FAILED; | 611 | return MF_FAILED; |
644 | } | 612 | } |
645 | 613 | ||
646 | /* | 614 | /* |
@@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
657 | !try_to_release_page(p, GFP_NOIO)) { | 625 | !try_to_release_page(p, GFP_NOIO)) { |
658 | pr_info("MCE %#lx: failed to release buffers\n", pfn); | 626 | pr_info("MCE %#lx: failed to release buffers\n", pfn); |
659 | } else { | 627 | } else { |
660 | ret = RECOVERED; | 628 | ret = MF_RECOVERED; |
661 | } | 629 | } |
662 | } else { | 630 | } else { |
663 | /* | 631 | /* |
@@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
665 | * This fails on dirty or anything with private pages | 633 | * This fails on dirty or anything with private pages |
666 | */ | 634 | */ |
667 | if (invalidate_inode_page(p)) | 635 | if (invalidate_inode_page(p)) |
668 | ret = RECOVERED; | 636 | ret = MF_RECOVERED; |
669 | else | 637 | else |
670 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", | 638 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", |
671 | pfn); | 639 | pfn); |
@@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
751 | ClearPageUptodate(p); | 719 | ClearPageUptodate(p); |
752 | 720 | ||
753 | if (!delete_from_lru_cache(p)) | 721 | if (!delete_from_lru_cache(p)) |
754 | return DELAYED; | 722 | return MF_DELAYED; |
755 | else | 723 | else |
756 | return FAILED; | 724 | return MF_FAILED; |
757 | } | 725 | } |
758 | 726 | ||
759 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 727 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
@@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
761 | delete_from_swap_cache(p); | 729 | delete_from_swap_cache(p); |
762 | 730 | ||
763 | if (!delete_from_lru_cache(p)) | 731 | if (!delete_from_lru_cache(p)) |
764 | return RECOVERED; | 732 | return MF_RECOVERED; |
765 | else | 733 | else |
766 | return FAILED; | 734 | return MF_FAILED; |
767 | } | 735 | } |
768 | 736 | ||
769 | /* | 737 | /* |
@@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
776 | { | 744 | { |
777 | int res = 0; | 745 | int res = 0; |
778 | struct page *hpage = compound_head(p); | 746 | struct page *hpage = compound_head(p); |
747 | |||
748 | if (!PageHuge(hpage)) | ||
749 | return MF_DELAYED; | ||
750 | |||
779 | /* | 751 | /* |
780 | * We can safely recover from error on free or reserved (i.e. | 752 | * We can safely recover from error on free or reserved (i.e. |
781 | * not in-use) hugepage by dequeuing it from freelist. | 753 | * not in-use) hugepage by dequeuing it from freelist. |
@@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
789 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 761 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
790 | res = dequeue_hwpoisoned_huge_page(hpage); | 762 | res = dequeue_hwpoisoned_huge_page(hpage); |
791 | if (!res) | 763 | if (!res) |
792 | return RECOVERED; | 764 | return MF_RECOVERED; |
793 | } | 765 | } |
794 | return DELAYED; | 766 | return MF_DELAYED; |
795 | } | 767 | } |
796 | 768 | ||
797 | /* | 769 | /* |
@@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
823 | static struct page_state { | 795 | static struct page_state { |
824 | unsigned long mask; | 796 | unsigned long mask; |
825 | unsigned long res; | 797 | unsigned long res; |
826 | enum action_page_type type; | 798 | enum mf_action_page_type type; |
827 | int (*action)(struct page *p, unsigned long pfn); | 799 | int (*action)(struct page *p, unsigned long pfn); |
828 | } error_states[] = { | 800 | } error_states[] = { |
829 | { reserved, reserved, MSG_KERNEL, me_kernel }, | 801 | { reserved, reserved, MF_MSG_KERNEL, me_kernel }, |
830 | /* | 802 | /* |
831 | * free pages are specially detected outside this table: | 803 | * free pages are specially detected outside this table: |
832 | * PG_buddy pages only make a small fraction of all free pages. | 804 | * PG_buddy pages only make a small fraction of all free pages. |
@@ -837,31 +809,31 @@ static struct page_state { | |||
837 | * currently unused objects without touching them. But just | 809 | * currently unused objects without touching them. But just |
838 | * treat it as standard kernel for now. | 810 | * treat it as standard kernel for now. |
839 | */ | 811 | */ |
840 | { slab, slab, MSG_SLAB, me_kernel }, | 812 | { slab, slab, MF_MSG_SLAB, me_kernel }, |
841 | 813 | ||
842 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 814 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
843 | { head, head, MSG_HUGE, me_huge_page }, | 815 | { head, head, MF_MSG_HUGE, me_huge_page }, |
844 | { tail, tail, MSG_HUGE, me_huge_page }, | 816 | { tail, tail, MF_MSG_HUGE, me_huge_page }, |
845 | #else | 817 | #else |
846 | { compound, compound, MSG_HUGE, me_huge_page }, | 818 | { compound, compound, MF_MSG_HUGE, me_huge_page }, |
847 | #endif | 819 | #endif |
848 | 820 | ||
849 | { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, | 821 | { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, |
850 | { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, | 822 | { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, |
851 | 823 | ||
852 | { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, | 824 | { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, |
853 | { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, | 825 | { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, |
854 | 826 | ||
855 | { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, | 827 | { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, |
856 | { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, | 828 | { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, |
857 | 829 | ||
858 | { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, | 830 | { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, |
859 | { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, | 831 | { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, |
860 | 832 | ||
861 | /* | 833 | /* |
862 | * Catchall entry: must be at end. | 834 | * Catchall entry: must be at end. |
863 | */ | 835 | */ |
864 | { 0, 0, MSG_UNKNOWN, me_unknown }, | 836 | { 0, 0, MF_MSG_UNKNOWN, me_unknown }, |
865 | }; | 837 | }; |
866 | 838 | ||
867 | #undef dirty | 839 | #undef dirty |
@@ -881,8 +853,11 @@ static struct page_state { | |||
881 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | 853 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
882 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | 854 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
883 | */ | 855 | */ |
884 | static void action_result(unsigned long pfn, enum action_page_type type, int result) | 856 | static void action_result(unsigned long pfn, enum mf_action_page_type type, |
857 | enum mf_result result) | ||
885 | { | 858 | { |
859 | trace_memory_failure_event(pfn, type, result); | ||
860 | |||
886 | pr_err("MCE %#lx: recovery action for %s: %s\n", | 861 | pr_err("MCE %#lx: recovery action for %s: %s\n", |
887 | pfn, action_page_types[type], action_name[result]); | 862 | pfn, action_page_types[type], action_name[result]); |
888 | } | 863 | } |
@@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p, | |||
896 | result = ps->action(p, pfn); | 871 | result = ps->action(p, pfn); |
897 | 872 | ||
898 | count = page_count(p) - 1; | 873 | count = page_count(p) - 1; |
899 | if (ps->action == me_swapcache_dirty && result == DELAYED) | 874 | if (ps->action == me_swapcache_dirty && result == MF_DELAYED) |
900 | count--; | 875 | count--; |
901 | if (count != 0) { | 876 | if (count != 0) { |
902 | printk(KERN_ERR | 877 | printk(KERN_ERR |
903 | "MCE %#lx: %s still referenced by %d users\n", | 878 | "MCE %#lx: %s still referenced by %d users\n", |
904 | pfn, action_page_types[ps->type], count); | 879 | pfn, action_page_types[ps->type], count); |
905 | result = FAILED; | 880 | result = MF_FAILED; |
906 | } | 881 | } |
907 | action_result(pfn, ps->type, result); | 882 | action_result(pfn, ps->type, result); |
908 | 883 | ||
@@ -911,9 +886,42 @@ static int page_action(struct page_state *ps, struct page *p, | |||
911 | * Could adjust zone counters here to correct for the missing page. | 886 | * Could adjust zone counters here to correct for the missing page. |
912 | */ | 887 | */ |
913 | 888 | ||
914 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; | 889 | return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; |
915 | } | 890 | } |
916 | 891 | ||
892 | /** | ||
893 | * get_hwpoison_page() - Get refcount for memory error handling: | ||
894 | * @page: raw error page (hit by memory error) | ||
895 | * | ||
896 | * Return: return 0 if failed to grab the refcount, otherwise true (some | ||
897 | * non-zero value.) | ||
898 | */ | ||
899 | int get_hwpoison_page(struct page *page) | ||
900 | { | ||
901 | struct page *head = compound_head(page); | ||
902 | |||
903 | if (PageHuge(head)) | ||
904 | return get_page_unless_zero(head); | ||
905 | |||
906 | /* | ||
907 | * Thp tail page has special refcounting rule (refcount of tail pages | ||
908 | * is stored in ->_mapcount,) so we can't call get_page_unless_zero() | ||
909 | * directly for tail pages. | ||
910 | */ | ||
911 | if (PageTransHuge(head)) { | ||
912 | if (get_page_unless_zero(head)) { | ||
913 | if (PageTail(page)) | ||
914 | get_page(page); | ||
915 | return 1; | ||
916 | } else { | ||
917 | return 0; | ||
918 | } | ||
919 | } | ||
920 | |||
921 | return get_page_unless_zero(page); | ||
922 | } | ||
923 | EXPORT_SYMBOL_GPL(get_hwpoison_page); | ||
924 | |||
917 | /* | 925 | /* |
918 | * Do all that is necessary to remove user space mappings. Unmap | 926 | * Do all that is necessary to remove user space mappings. Unmap |
919 | * the pages and send SIGBUS to the processes if the data was dirty. | 927 | * the pages and send SIGBUS to the processes if the data was dirty. |
@@ -927,7 +935,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
927 | int ret; | 935 | int ret; |
928 | int kill = 1, forcekill; | 936 | int kill = 1, forcekill; |
929 | struct page *hpage = *hpagep; | 937 | struct page *hpage = *hpagep; |
930 | struct page *ppage; | ||
931 | 938 | ||
932 | /* | 939 | /* |
933 | * Here we are interested only in user-mapped pages, so skip any | 940 | * Here we are interested only in user-mapped pages, so skip any |
@@ -977,59 +984,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
977 | } | 984 | } |
978 | 985 | ||
979 | /* | 986 | /* |
980 | * ppage: poisoned page | ||
981 | * if p is regular page(4k page) | ||
982 | * ppage == real poisoned page; | ||
983 | * else p is hugetlb or THP, ppage == head page. | ||
984 | */ | ||
985 | ppage = hpage; | ||
986 | |||
987 | if (PageTransHuge(hpage)) { | ||
988 | /* | ||
989 | * Verify that this isn't a hugetlbfs head page, the check for | ||
990 | * PageAnon is just for avoid tripping a split_huge_page | ||
991 | * internal debug check, as split_huge_page refuses to deal with | ||
992 | * anything that isn't an anon page. PageAnon can't go away fro | ||
993 | * under us because we hold a refcount on the hpage, without a | ||
994 | * refcount on the hpage. split_huge_page can't be safely called | ||
995 | * in the first place, having a refcount on the tail isn't | ||
996 | * enough * to be safe. | ||
997 | */ | ||
998 | if (!PageHuge(hpage) && PageAnon(hpage)) { | ||
999 | if (unlikely(split_huge_page(hpage))) { | ||
1000 | /* | ||
1001 | * FIXME: if splitting THP is failed, it is | ||
1002 | * better to stop the following operation rather | ||
1003 | * than causing panic by unmapping. System might | ||
1004 | * survive if the page is freed later. | ||
1005 | */ | ||
1006 | printk(KERN_INFO | ||
1007 | "MCE %#lx: failed to split THP\n", pfn); | ||
1008 | |||
1009 | BUG_ON(!PageHWPoison(p)); | ||
1010 | return SWAP_FAIL; | ||
1011 | } | ||
1012 | /* | ||
1013 | * We pinned the head page for hwpoison handling, | ||
1014 | * now we split the thp and we are interested in | ||
1015 | * the hwpoisoned raw page, so move the refcount | ||
1016 | * to it. Similarly, page lock is shifted. | ||
1017 | */ | ||
1018 | if (hpage != p) { | ||
1019 | if (!(flags & MF_COUNT_INCREASED)) { | ||
1020 | put_page(hpage); | ||
1021 | get_page(p); | ||
1022 | } | ||
1023 | lock_page(p); | ||
1024 | unlock_page(hpage); | ||
1025 | *hpagep = p; | ||
1026 | } | ||
1027 | /* THP is split, so ppage should be the real poisoned page. */ | ||
1028 | ppage = p; | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | /* | ||
1033 | * First collect all the processes that have the page | 987 | * First collect all the processes that have the page |
1034 | * mapped in dirty form. This has to be done before try_to_unmap, | 988 | * mapped in dirty form. This has to be done before try_to_unmap, |
1035 | * because ttu takes the rmap data structures down. | 989 | * because ttu takes the rmap data structures down. |
@@ -1038,12 +992,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
1038 | * there's nothing that can be done. | 992 | * there's nothing that can be done. |
1039 | */ | 993 | */ |
1040 | if (kill) | 994 | if (kill) |
1041 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); | 995 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); |
1042 | 996 | ||
1043 | ret = try_to_unmap(ppage, ttu); | 997 | ret = try_to_unmap(hpage, ttu); |
1044 | if (ret != SWAP_SUCCESS) | 998 | if (ret != SWAP_SUCCESS) |
1045 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 999 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
1046 | pfn, page_mapcount(ppage)); | 1000 | pfn, page_mapcount(hpage)); |
1047 | 1001 | ||
1048 | /* | 1002 | /* |
1049 | * Now that the dirty bit has been propagated to the | 1003 | * Now that the dirty bit has been propagated to the |
@@ -1055,7 +1009,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
1055 | * use a more force-full uncatchable kill to prevent | 1009 | * use a more force-full uncatchable kill to prevent |
1056 | * any accesses to the poisoned memory. | 1010 | * any accesses to the poisoned memory. |
1057 | */ | 1011 | */ |
1058 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); | 1012 | forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); |
1059 | kill_procs(&tokill, forcekill, trapno, | 1013 | kill_procs(&tokill, forcekill, trapno, |
1060 | ret != SWAP_SUCCESS, p, pfn, flags); | 1014 | ret != SWAP_SUCCESS, p, pfn, flags); |
1061 | 1015 | ||
@@ -1101,6 +1055,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1101 | struct page_state *ps; | 1055 | struct page_state *ps; |
1102 | struct page *p; | 1056 | struct page *p; |
1103 | struct page *hpage; | 1057 | struct page *hpage; |
1058 | struct page *orig_head; | ||
1104 | int res; | 1059 | int res; |
1105 | unsigned int nr_pages; | 1060 | unsigned int nr_pages; |
1106 | unsigned long page_flags; | 1061 | unsigned long page_flags; |
@@ -1116,7 +1071,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1116 | } | 1071 | } |
1117 | 1072 | ||
1118 | p = pfn_to_page(pfn); | 1073 | p = pfn_to_page(pfn); |
1119 | hpage = compound_head(p); | 1074 | orig_head = hpage = compound_head(p); |
1120 | if (TestSetPageHWPoison(p)) { | 1075 | if (TestSetPageHWPoison(p)) { |
1121 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); | 1076 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
1122 | return 0; | 1077 | return 0; |
@@ -1149,10 +1104,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1149 | * In fact it's dangerous to directly bump up page count from 0, | 1104 | * In fact it's dangerous to directly bump up page count from 0, |
1150 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 1105 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
1151 | */ | 1106 | */ |
1152 | if (!(flags & MF_COUNT_INCREASED) && | 1107 | if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { |
1153 | !get_page_unless_zero(hpage)) { | ||
1154 | if (is_free_buddy_page(p)) { | 1108 | if (is_free_buddy_page(p)) { |
1155 | action_result(pfn, MSG_BUDDY, DELAYED); | 1109 | action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); |
1156 | return 0; | 1110 | return 0; |
1157 | } else if (PageHuge(hpage)) { | 1111 | } else if (PageHuge(hpage)) { |
1158 | /* | 1112 | /* |
@@ -1169,16 +1123,39 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1169 | } | 1123 | } |
1170 | set_page_hwpoison_huge_page(hpage); | 1124 | set_page_hwpoison_huge_page(hpage); |
1171 | res = dequeue_hwpoisoned_huge_page(hpage); | 1125 | res = dequeue_hwpoisoned_huge_page(hpage); |
1172 | action_result(pfn, MSG_FREE_HUGE, | 1126 | action_result(pfn, MF_MSG_FREE_HUGE, |
1173 | res ? IGNORED : DELAYED); | 1127 | res ? MF_IGNORED : MF_DELAYED); |
1174 | unlock_page(hpage); | 1128 | unlock_page(hpage); |
1175 | return res; | 1129 | return res; |
1176 | } else { | 1130 | } else { |
1177 | action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); | 1131 | action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); |
1178 | return -EBUSY; | 1132 | return -EBUSY; |
1179 | } | 1133 | } |
1180 | } | 1134 | } |
1181 | 1135 | ||
1136 | if (!PageHuge(p) && PageTransHuge(hpage)) { | ||
1137 | if (!PageAnon(hpage)) { | ||
1138 | pr_err("MCE: %#lx: non anonymous thp\n", pfn); | ||
1139 | if (TestClearPageHWPoison(p)) | ||
1140 | atomic_long_sub(nr_pages, &num_poisoned_pages); | ||
1141 | put_page(p); | ||
1142 | if (p != hpage) | ||
1143 | put_page(hpage); | ||
1144 | return -EBUSY; | ||
1145 | } | ||
1146 | if (unlikely(split_huge_page(hpage))) { | ||
1147 | pr_err("MCE: %#lx: thp split failed\n", pfn); | ||
1148 | if (TestClearPageHWPoison(p)) | ||
1149 | atomic_long_sub(nr_pages, &num_poisoned_pages); | ||
1150 | put_page(p); | ||
1151 | if (p != hpage) | ||
1152 | put_page(hpage); | ||
1153 | return -EBUSY; | ||
1154 | } | ||
1155 | VM_BUG_ON_PAGE(!page_count(p), p); | ||
1156 | hpage = compound_head(p); | ||
1157 | } | ||
1158 | |||
1182 | /* | 1159 | /* |
1183 | * We ignore non-LRU pages for good reasons. | 1160 | * We ignore non-LRU pages for good reasons. |
1184 | * - PG_locked is only well defined for LRU pages and a few others | 1161 | * - PG_locked is only well defined for LRU pages and a few others |
@@ -1188,18 +1165,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1188 | * walked by the page reclaim code, however that's not a big loss. | 1165 | * walked by the page reclaim code, however that's not a big loss. |
1189 | */ | 1166 | */ |
1190 | if (!PageHuge(p)) { | 1167 | if (!PageHuge(p)) { |
1191 | if (!PageLRU(hpage)) | 1168 | if (!PageLRU(p)) |
1192 | shake_page(hpage, 0); | 1169 | shake_page(p, 0); |
1193 | if (!PageLRU(hpage)) { | 1170 | if (!PageLRU(p)) { |
1194 | /* | 1171 | /* |
1195 | * shake_page could have turned it free. | 1172 | * shake_page could have turned it free. |
1196 | */ | 1173 | */ |
1197 | if (is_free_buddy_page(p)) { | 1174 | if (is_free_buddy_page(p)) { |
1198 | if (flags & MF_COUNT_INCREASED) | 1175 | if (flags & MF_COUNT_INCREASED) |
1199 | action_result(pfn, MSG_BUDDY, DELAYED); | 1176 | action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); |
1200 | else | 1177 | else |
1201 | action_result(pfn, MSG_BUDDY_2ND, | 1178 | action_result(pfn, MF_MSG_BUDDY_2ND, |
1202 | DELAYED); | 1179 | MF_DELAYED); |
1203 | return 0; | 1180 | return 0; |
1204 | } | 1181 | } |
1205 | } | 1182 | } |
@@ -1211,8 +1188,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1211 | * The page could have changed compound pages during the locking. | 1188 | * The page could have changed compound pages during the locking. |
1212 | * If this happens just bail out. | 1189 | * If this happens just bail out. |
1213 | */ | 1190 | */ |
1214 | if (compound_head(p) != hpage) { | 1191 | if (PageCompound(p) && compound_head(p) != orig_head) { |
1215 | action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); | 1192 | action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); |
1216 | res = -EBUSY; | 1193 | res = -EBUSY; |
1217 | goto out; | 1194 | goto out; |
1218 | } | 1195 | } |
@@ -1252,7 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1252 | * on the head page to show that the hugepage is hwpoisoned | 1229 | * on the head page to show that the hugepage is hwpoisoned |
1253 | */ | 1230 | */ |
1254 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1231 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
1255 | action_result(pfn, MSG_POISONED_HUGE, IGNORED); | 1232 | action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); |
1256 | unlock_page(hpage); | 1233 | unlock_page(hpage); |
1257 | put_page(hpage); | 1234 | put_page(hpage); |
1258 | return 0; | 1235 | return 0; |
@@ -1281,7 +1258,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1281 | */ | 1258 | */ |
1282 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1259 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
1283 | != SWAP_SUCCESS) { | 1260 | != SWAP_SUCCESS) { |
1284 | action_result(pfn, MSG_UNMAP_FAILED, IGNORED); | 1261 | action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); |
1285 | res = -EBUSY; | 1262 | res = -EBUSY; |
1286 | goto out; | 1263 | goto out; |
1287 | } | 1264 | } |
@@ -1290,7 +1267,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1290 | * Torn down by someone else? | 1267 | * Torn down by someone else? |
1291 | */ | 1268 | */ |
1292 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1269 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
1293 | action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); | 1270 | action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); |
1294 | res = -EBUSY; | 1271 | res = -EBUSY; |
1295 | goto out; | 1272 | goto out; |
1296 | } | 1273 | } |
@@ -1450,12 +1427,12 @@ int unpoison_memory(unsigned long pfn) | |||
1450 | */ | 1427 | */ |
1451 | if (!PageHuge(page) && PageTransHuge(page)) { | 1428 | if (!PageHuge(page) && PageTransHuge(page)) { |
1452 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | 1429 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); |
1453 | return 0; | 1430 | return 0; |
1454 | } | 1431 | } |
1455 | 1432 | ||
1456 | nr_pages = 1 << compound_order(page); | 1433 | nr_pages = 1 << compound_order(page); |
1457 | 1434 | ||
1458 | if (!get_page_unless_zero(page)) { | 1435 | if (!get_hwpoison_page(p)) { |
1459 | /* | 1436 | /* |
1460 | * Since HWPoisoned hugepage should have non-zero refcount, | 1437 | * Since HWPoisoned hugepage should have non-zero refcount, |
1461 | * race between memory failure and unpoison seems to happen. | 1438 | * race between memory failure and unpoison seems to happen. |
@@ -1523,7 +1500,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1523 | * When the target page is a free hugepage, just remove it | 1500 | * When the target page is a free hugepage, just remove it |
1524 | * from free hugepage list. | 1501 | * from free hugepage list. |
1525 | */ | 1502 | */ |
1526 | if (!get_page_unless_zero(compound_head(p))) { | 1503 | if (!get_hwpoison_page(p)) { |
1527 | if (PageHuge(p)) { | 1504 | if (PageHuge(p)) { |
1528 | pr_info("%s: %#lx free huge page\n", __func__, pfn); | 1505 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1529 | ret = 0; | 1506 | ret = 0; |
@@ -1694,20 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1694 | if (ret > 0) | 1671 | if (ret > 0) |
1695 | ret = -EIO; | 1672 | ret = -EIO; |
1696 | } else { | 1673 | } else { |
1697 | /* | ||
1698 | * After page migration succeeds, the source page can | ||
1699 | * be trapped in pagevec and actual freeing is delayed. | ||
1700 | * Freeing code works differently based on PG_hwpoison, | ||
1701 | * so there's a race. We need to make sure that the | ||
1702 | * source page should be freed back to buddy before | ||
1703 | * setting PG_hwpoison. | ||
1704 | */ | ||
1705 | if (!is_free_buddy_page(page)) | ||
1706 | drain_all_pages(page_zone(page)); | ||
1707 | SetPageHWPoison(page); | 1674 | SetPageHWPoison(page); |
1708 | if (!is_free_buddy_page(page)) | ||
1709 | pr_info("soft offline: %#lx: page leaked\n", | ||
1710 | pfn); | ||
1711 | atomic_long_inc(&num_poisoned_pages); | 1675 | atomic_long_inc(&num_poisoned_pages); |
1712 | } | 1676 | } |
1713 | } else { | 1677 | } else { |
@@ -1759,14 +1723,6 @@ int soft_offline_page(struct page *page, int flags) | |||
1759 | 1723 | ||
1760 | get_online_mems(); | 1724 | get_online_mems(); |
1761 | 1725 | ||
1762 | /* | ||
1763 | * Isolate the page, so that it doesn't get reallocated if it | ||
1764 | * was free. This flag should be kept set until the source page | ||
1765 | * is freed and PG_hwpoison on it is set. | ||
1766 | */ | ||
1767 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
1768 | set_migratetype_isolate(page, true); | ||
1769 | |||
1770 | ret = get_any_page(page, pfn, flags); | 1726 | ret = get_any_page(page, pfn, flags); |
1771 | put_online_mems(); | 1727 | put_online_mems(); |
1772 | if (ret > 0) { /* for in-use pages */ | 1728 | if (ret > 0) { /* for in-use pages */ |
@@ -1785,6 +1741,5 @@ int soft_offline_page(struct page *page, int flags) | |||
1785 | atomic_long_inc(&num_poisoned_pages); | 1741 | atomic_long_inc(&num_poisoned_pages); |
1786 | } | 1742 | } |
1787 | } | 1743 | } |
1788 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); | ||
1789 | return ret; | 1744 | return ret; |
1790 | } | 1745 | } |
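
The mm/memory-failure.c changes above replace the open-coded get_page_unless_zero() calls with the new get_hwpoison_page() helper, which hides the hugetlb and THP-tail refcounting differences from its callers. Below is a minimal sketch of pinning an error page through it; pin_poisoned_page() is an invented name, and the sketch assumes a simple non-compound page so a single put_page() balances the reference.

#include <linux/errno.h>
#include <linux/mm.h>	/* pfn_to_page(), put_page(); get_hwpoison_page() is
			 * assumed to be declared here by the patch */

/* Sketch only: how a handler might pin the raw error page before acting. */
static int pin_poisoned_page(unsigned long pfn)
{
	struct page *p = pfn_to_page(pfn);

	if (!get_hwpoison_page(p))	/* 0: no reference could be taken */
		return -EBUSY;

	/* ... inspect page state, unmap users, and so on ... */

	put_page(p);			/* drop the reference taken above */
	return 0;
}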
diff --git a/mm/memory.c b/mm/memory.c index 17734c3c1183..11b9ca176740 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2081,11 +2081,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2081 | goto oom; | 2081 | goto oom; |
2082 | cow_user_page(new_page, old_page, address, vma); | 2082 | cow_user_page(new_page, old_page, address, vma); |
2083 | } | 2083 | } |
2084 | __SetPageUptodate(new_page); | ||
2085 | 2084 | ||
2086 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) | 2085 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) |
2087 | goto oom_free_new; | 2086 | goto oom_free_new; |
2088 | 2087 | ||
2088 | __SetPageUptodate(new_page); | ||
2089 | |||
2089 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2090 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2090 | 2091 | ||
2091 | /* | 2092 | /* |
@@ -2689,6 +2690,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2689 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2690 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2690 | if (!page) | 2691 | if (!page) |
2691 | goto oom; | 2692 | goto oom; |
2693 | |||
2694 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) | ||
2695 | goto oom_free_page; | ||
2696 | |||
2692 | /* | 2697 | /* |
2693 | * The memory barrier inside __SetPageUptodate makes sure that | 2698 | * The memory barrier inside __SetPageUptodate makes sure that |
2694 | * preceeding stores to the page contents become visible before | 2699 | * preceeding stores to the page contents become visible before |
@@ -2696,9 +2701,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2696 | */ | 2701 | */ |
2697 | __SetPageUptodate(page); | 2702 | __SetPageUptodate(page); |
2698 | 2703 | ||
2699 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) | ||
2700 | goto oom_free_page; | ||
2701 | |||
2702 | entry = mk_pte(page, vma->vm_page_prot); | 2704 | entry = mk_pte(page, vma->vm_page_prot); |
2703 | if (vma->vm_flags & VM_WRITE) | 2705 | if (vma->vm_flags & VM_WRITE) |
2704 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2706 | entry = pte_mkwrite(pte_mkdirty(entry)); |
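
Both mm/memory.c hunks move the memcg charge in front of __SetPageUptodate(), so the uptodate bit (and the write barrier it implies) is only set once the allocation can no longer be unwound. A condensed sketch of the resulting order follows; the helper name and error handling are illustrative, not part of the patch.

#include <linux/highmem.h>	/* alloc_zeroed_user_highpage_movable() */
#include <linux/memcontrol.h>	/* mem_cgroup_try_charge() */
#include <linux/mm.h>

/* Sketch: charge first, mark uptodate only after the charge succeeded. */
static int alloc_charged_page(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long address, struct mem_cgroup **memcg,
			      struct page **pagep)
{
	struct page *page = alloc_zeroed_user_highpage_movable(vma, address);

	if (!page)
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, memcg)) {
		put_page(page);		/* not charged yet, just free it */
		return -ENOMEM;
	}

	__SetPageUptodate(page);	/* contents visible before first use */
	*pagep = page;
	return 0;
}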
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9e88f749aa51..26fbba7d888f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -513,6 +513,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
513 | break; | 513 | break; |
514 | err = 0; | 514 | err = 0; |
515 | } | 515 | } |
516 | vmemmap_populate_print_last(); | ||
516 | 517 | ||
517 | return err; | 518 | return err; |
518 | } | 519 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 747743237d9f..99d4c1d0b858 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1972,35 +1972,41 @@ retry_cpuset: | |||
1972 | pol = get_vma_policy(vma, addr); | 1972 | pol = get_vma_policy(vma, addr); |
1973 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1973 | cpuset_mems_cookie = read_mems_allowed_begin(); |
1974 | 1974 | ||
1975 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && | 1975 | if (pol->mode == MPOL_INTERLEAVE) { |
1976 | pol->mode != MPOL_INTERLEAVE)) { | 1976 | unsigned nid; |
1977 | |||
1978 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | ||
1979 | mpol_cond_put(pol); | ||
1980 | page = alloc_page_interleave(gfp, order, nid); | ||
1981 | goto out; | ||
1982 | } | ||
1983 | |||
1984 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { | ||
1985 | int hpage_node = node; | ||
1986 | |||
1977 | /* | 1987 | /* |
1978 | * For hugepage allocation and non-interleave policy which | 1988 | * For hugepage allocation and non-interleave policy which |
1979 | * allows the current node, we only try to allocate from the | 1989 | * allows the current node (or other explicitly preferred |
1980 | * current node and don't fall back to other nodes, as the | 1990 | * node) we only try to allocate from the current/preferred |
1981 | * cost of remote accesses would likely offset THP benefits. | 1991 | * node and don't fall back to other nodes, as the cost of |
1992 | * remote accesses would likely offset THP benefits. | ||
1982 | * | 1993 | * |
1983 | * If the policy is interleave, or does not allow the current | 1994 | * If the policy is interleave, or does not allow the current |
1984 | * node in its nodemask, we allocate the standard way. | 1995 | * node in its nodemask, we allocate the standard way. |
1985 | */ | 1996 | */ |
1997 | if (pol->mode == MPOL_PREFERRED && | ||
1998 | !(pol->flags & MPOL_F_LOCAL)) | ||
1999 | hpage_node = pol->v.preferred_node; | ||
2000 | |||
1986 | nmask = policy_nodemask(gfp, pol); | 2001 | nmask = policy_nodemask(gfp, pol); |
1987 | if (!nmask || node_isset(node, *nmask)) { | 2002 | if (!nmask || node_isset(hpage_node, *nmask)) { |
1988 | mpol_cond_put(pol); | 2003 | mpol_cond_put(pol); |
1989 | page = alloc_pages_exact_node(node, | 2004 | page = alloc_pages_exact_node(hpage_node, |
1990 | gfp | __GFP_THISNODE, order); | 2005 | gfp | __GFP_THISNODE, order); |
1991 | goto out; | 2006 | goto out; |
1992 | } | 2007 | } |
1993 | } | 2008 | } |
1994 | 2009 | ||
1995 | if (pol->mode == MPOL_INTERLEAVE) { | ||
1996 | unsigned nid; | ||
1997 | |||
1998 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | ||
1999 | mpol_cond_put(pol); | ||
2000 | page = alloc_page_interleave(gfp, order, nid); | ||
2001 | goto out; | ||
2002 | } | ||
2003 | |||
2004 | nmask = policy_nodemask(gfp, pol); | 2010 | nmask = policy_nodemask(gfp, pol); |
2005 | zl = policy_zonelist(gfp, pol, node); | 2011 | zl = policy_zonelist(gfp, pol, node); |
2006 | mpol_cond_put(pol); | 2012 | mpol_cond_put(pol); |
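
The rewritten block in alloc_pages_vma() now handles MPOL_INTERLEAVE first, and then lets a transparent hugepage allocation target an explicitly preferred node rather than only the local one. The node-selection step reduces to the small helper sketched below; pick_thp_node() is an invented name, the field accesses follow the hunk above.

#include <linux/mempolicy.h>

/* Sketch: which node a THP allocation should be pinned to with __GFP_THISNODE. */
static int pick_thp_node(struct mempolicy *pol, int local_node)
{
	int hpage_node = local_node;

	/* An explicitly preferred node overrides the local one. */
	if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
		hpage_node = pol->v.preferred_node;

	return hpage_node;
}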
diff --git a/mm/memtest.c b/mm/memtest.c index 1997d934b13b..0a1cc133f6d7 100644 --- a/mm/memtest.c +++ b/mm/memtest.c | |||
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) | |||
74 | u64 i; | 74 | u64 i; |
75 | phys_addr_t this_start, this_end; | 75 | phys_addr_t this_start, this_end; |
76 | 76 | ||
77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { | 77 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start, |
78 | &this_end, NULL) { | ||
78 | this_start = clamp(this_start, start, end); | 79 | this_start = clamp(this_start, start, end); |
79 | this_end = clamp(this_end, start, end); | 80 | this_end = clamp(this_end, start, end); |
80 | if (this_start < this_end) { | 81 | if (this_start < this_end) { |
diff --git a/mm/migrate.c b/mm/migrate.c index f53838fe3dfe..ee401e4e5ef1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -918,7 +918,8 @@ out: | |||
918 | static ICE_noinline int unmap_and_move(new_page_t get_new_page, | 918 | static ICE_noinline int unmap_and_move(new_page_t get_new_page, |
919 | free_page_t put_new_page, | 919 | free_page_t put_new_page, |
920 | unsigned long private, struct page *page, | 920 | unsigned long private, struct page *page, |
921 | int force, enum migrate_mode mode) | 921 | int force, enum migrate_mode mode, |
922 | enum migrate_reason reason) | ||
922 | { | 923 | { |
923 | int rc = 0; | 924 | int rc = 0; |
924 | int *result = NULL; | 925 | int *result = NULL; |
@@ -949,7 +950,8 @@ out: | |||
949 | list_del(&page->lru); | 950 | list_del(&page->lru); |
950 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 951 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
951 | page_is_file_cache(page)); | 952 | page_is_file_cache(page)); |
952 | putback_lru_page(page); | 953 | if (reason != MR_MEMORY_FAILURE) |
954 | putback_lru_page(page); | ||
953 | } | 955 | } |
954 | 956 | ||
955 | /* | 957 | /* |
@@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1122 | pass > 2, mode); | 1124 | pass > 2, mode); |
1123 | else | 1125 | else |
1124 | rc = unmap_and_move(get_new_page, put_new_page, | 1126 | rc = unmap_and_move(get_new_page, put_new_page, |
1125 | private, page, pass > 2, mode); | 1127 | private, page, pass > 2, mode, |
1128 | reason); | ||
1126 | 1129 | ||
1127 | switch(rc) { | 1130 | switch(rc) { |
1128 | case -ENOMEM: | 1131 | case -ENOMEM: |
@@ -1796,7 +1799,7 @@ fail_putback: | |||
1796 | */ | 1799 | */ |
1797 | flush_cache_range(vma, mmun_start, mmun_end); | 1800 | flush_cache_range(vma, mmun_start, mmun_end); |
1798 | page_add_anon_rmap(new_page, vma, mmun_start); | 1801 | page_add_anon_rmap(new_page, vma, mmun_start); |
1799 | pmdp_clear_flush_notify(vma, mmun_start, pmd); | 1802 | pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); |
1800 | set_pmd_at(mm, mmun_start, pmd, entry); | 1803 | set_pmd_at(mm, mmun_start, pmd, entry); |
1801 | flush_tlb_range(vma, mmun_start, mmun_end); | 1804 | flush_tlb_range(vma, mmun_start, mmun_end); |
1802 | update_mmu_cache_pmd(vma, address, &entry); | 1805 | update_mmu_cache_pmd(vma, address, &entry); |
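
unmap_and_move() now receives the migrate_reason so that, after a successful migration done on behalf of the memory-failure code, the source page is not put back on the LRU and can be poisoned by the caller instead. The cleanup step amounts to the sketch below; migration_cleanup() is an invented wrapper, the individual calls are the ones visible in the hunk.

#include <linux/migrate.h>	/* enum migrate_reason, MR_MEMORY_FAILURE */
#include <linux/mm_inline.h>	/* page_is_file_cache() */
#include <linux/swap.h>		/* putback_lru_page() */
#include <linux/vmstat.h>	/* dec_zone_page_state() */

/* Sketch: post-migration handling of the source page. */
static void migration_cleanup(struct page *page, enum migrate_reason reason)
{
	list_del(&page->lru);
	dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));

	if (reason != MR_MEMORY_FAILURE)
		putback_lru_page(page);
	/* else: the page is left to the memory-failure code */
}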
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1258 | 1258 | ||
1259 | *populate = 0; | 1259 | *populate = 0; |
1260 | 1260 | ||
1261 | if (!len) | ||
1262 | return -EINVAL; | ||
1263 | |||
1261 | /* | 1264 | /* |
1262 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1265 | * Does the application expect PROT_READ to imply PROT_EXEC? |
1263 | * | 1266 | * |
@@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1268 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) | 1271 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
1269 | prot |= PROT_EXEC; | 1272 | prot |= PROT_EXEC; |
1270 | 1273 | ||
1271 | if (!len) | ||
1272 | return -EINVAL; | ||
1273 | |||
1274 | if (!(flags & MAP_FIXED)) | 1274 | if (!(flags & MAP_FIXED)) |
1275 | addr = round_hint_to_min(addr); | 1275 | addr = round_hint_to_min(addr); |
1276 | 1276 | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 88584838e704..e7d6f1171ecb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -29,6 +29,8 @@ | |||
29 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
30 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
31 | 31 | ||
32 | #include "internal.h" | ||
33 | |||
32 | /* | 34 | /* |
33 | * For a prot_numa update we only hold mmap_sem for read so there is a | 35 | * For a prot_numa update we only hold mmap_sem for read so there is a |
34 | * potential race with faulting where a pmd was temporarily none. This | 36 | * potential race with faulting where a pmd was temporarily none. This |
@@ -322,6 +324,15 @@ success: | |||
322 | change_protection(vma, start, end, vma->vm_page_prot, | 324 | change_protection(vma, start, end, vma->vm_page_prot, |
323 | dirty_accountable, 0); | 325 | dirty_accountable, 0); |
324 | 326 | ||
327 | /* | ||
328 | * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major | ||
329 | * fault on access. | ||
330 | */ | ||
331 | if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && | ||
332 | (newflags & VM_WRITE)) { | ||
333 | populate_vma_page_range(vma, start, end, NULL); | ||
334 | } | ||
335 | |||
325 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 336 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
326 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 337 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
327 | perf_event_mmap(vma); | 338 | perf_event_mmap(vma); |
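
The new block in mprotect_fixup() populates a private, mlocked VMA as soon as it becomes writable, so the copy-on-write happens during mprotect() rather than at the first store. A small user-space illustration of the affected sequence (an assumed scenario, not taken from the patch):

#define _DEFAULT_SOURCE		/* for MAP_ANONYMOUS */
#include <stddef.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (mlock(p, len))			/* VMA becomes VM_LOCKED */
		return 1;
	if (mprotect(p, len, PROT_READ | PROT_WRITE))
		return 1;

	p[0] = 1;	/* pages were already faulted in writable by mprotect() */
	return munmap(p, len) != 0;
}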
diff --git a/mm/mremap.c b/mm/mremap.c index 034e2d360652..a7c93eceb1c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/mmu_notifier.h> | 22 | #include <linux/mmu_notifier.h> |
23 | #include <linux/sched/sysctl.h> | 23 | #include <linux/sched/sysctl.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/mm-arch-hooks.h> | ||
25 | 26 | ||
26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 28 | #include <asm/tlbflush.h> |
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
286 | old_len = new_len; | 287 | old_len = new_len; |
287 | old_addr = new_addr; | 288 | old_addr = new_addr; |
288 | new_addr = -ENOMEM; | 289 | new_addr = -ENOMEM; |
289 | } else if (vma->vm_file && vma->vm_file->f_op->mremap) { | 290 | } else { |
290 | err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); | 291 | if (vma->vm_file && vma->vm_file->f_op->mremap) { |
291 | if (err < 0) { | 292 | err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); |
292 | move_page_tables(new_vma, new_addr, vma, old_addr, | 293 | if (err < 0) { |
293 | moved_len, true); | 294 | move_page_tables(new_vma, new_addr, vma, |
294 | return err; | 295 | old_addr, moved_len, true); |
296 | return err; | ||
297 | } | ||
295 | } | 298 | } |
299 | arch_remap(mm, old_addr, old_addr + old_len, | ||
300 | new_addr, new_addr + new_len); | ||
296 | } | 301 | } |
297 | 302 | ||
298 | /* Conceal VM_ACCOUNT so old reservation is not undone */ | 303 | /* Conceal VM_ACCOUNT so old reservation is not undone */ |
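
move_vma() now also calls the new arch_remap() hook from <linux/mm-arch-hooks.h> once the file's mremap handler has run. The generic fallback is presumably an empty inline with the signature implied by the call above; a sketch of such a no-op default (not the actual header contents) would look like:

#include <linux/mm_types.h>	/* struct mm_struct */

/* Sketch of a no-op arch_remap() default; architectures that need to fix
 * up a special mapping on mremap() would provide their own version.
 */
#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
			      unsigned long old_start, unsigned long old_end,
			      unsigned long new_start, unsigned long new_end)
{
}
#define arch_remap arch_remap
#endif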
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 90b50468333e..5258386fa1be 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
37 | { | 37 | { |
38 | void *ptr; | 38 | void *ptr; |
39 | u64 addr; | 39 | u64 addr; |
40 | ulong flags = choose_memblock_flags(); | ||
40 | 41 | ||
41 | if (limit > memblock.current_limit) | 42 | if (limit > memblock.current_limit) |
42 | limit = memblock.current_limit; | 43 | limit = memblock.current_limit; |
43 | 44 | ||
44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); | 45 | again: |
46 | addr = memblock_find_in_range_node(size, align, goal, limit, nid, | ||
47 | flags); | ||
48 | if (!addr && (flags & MEMBLOCK_MIRROR)) { | ||
49 | flags &= ~MEMBLOCK_MIRROR; | ||
50 | pr_warn("Could not allocate %pap bytes of mirrored memory\n", | ||
51 | &size); | ||
52 | goto again; | ||
53 | } | ||
45 | if (!addr) | 54 | if (!addr) |
46 | return NULL; | 55 | return NULL; |
47 | 56 | ||
@@ -121,7 +130,8 @@ static unsigned long __init free_low_memory_core_early(void) | |||
121 | 130 | ||
122 | memblock_clear_hotplug(0, -1); | 131 | memblock_clear_hotplug(0, -1); |
123 | 132 | ||
124 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) | 133 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, |
134 | NULL) | ||
125 | count += __free_memory_core(start, end); | 135 | count += __free_memory_core(start, end); |
126 | 136 | ||
127 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK | 137 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK |
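
__alloc_memory_core_early() now asks memblock for mirrored memory first and quietly retries without MEMBLOCK_MIRROR when none is available. The retry pattern boils down to the helper sketched below; the function name is invented, the calls mirror the hunk above.

#include <linux/memblock.h>	/* memblock_find_in_range_node(),
				 * choose_memblock_flags(), MEMBLOCK_MIRROR */

/* Sketch: prefer mirrored memory, fall back to any memory if none is left. */
static phys_addr_t __init find_range_prefer_mirror(phys_addr_t size,
						   phys_addr_t align,
						   phys_addr_t goal,
						   phys_addr_t limit, int nid)
{
	ulong flags = choose_memblock_flags();
	phys_addr_t addr;

again:
	addr = memblock_find_in_range_node(size, align, goal, limit, nid,
					   flags);
	if (!addr && (flags & MEMBLOCK_MIRROR)) {
		flags &= ~MEMBLOCK_MIRROR;	/* retry without the restriction */
		goto again;
	}
	return addr;
}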
diff --git a/mm/nommu.c b/mm/nommu.c index e544508e2a4b..05e7447d960b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -42,22 +42,6 @@ | |||
42 | #include <asm/mmu_context.h> | 42 | #include <asm/mmu_context.h> |
43 | #include "internal.h" | 43 | #include "internal.h" |
44 | 44 | ||
45 | #if 0 | ||
46 | #define kenter(FMT, ...) \ | ||
47 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
48 | #define kleave(FMT, ...) \ | ||
49 | printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
50 | #define kdebug(FMT, ...) \ | ||
51 | printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) | ||
52 | #else | ||
53 | #define kenter(FMT, ...) \ | ||
54 | no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
55 | #define kleave(FMT, ...) \ | ||
56 | no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
57 | #define kdebug(FMT, ...) \ | ||
58 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) | ||
59 | #endif | ||
60 | |||
61 | void *high_memory; | 45 | void *high_memory; |
62 | EXPORT_SYMBOL(high_memory); | 46 | EXPORT_SYMBOL(high_memory); |
63 | struct page *mem_map; | 47 | struct page *mem_map; |
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
665 | for (; from < to; from += PAGE_SIZE) { | 649 | for (; from < to; from += PAGE_SIZE) { |
666 | struct page *page = virt_to_page(from); | 650 | struct page *page = virt_to_page(from); |
667 | 651 | ||
668 | kdebug("- free %lx", from); | ||
669 | atomic_long_dec(&mmap_pages_allocated); | 652 | atomic_long_dec(&mmap_pages_allocated); |
670 | if (page_count(page) != 1) | ||
671 | kdebug("free page %p: refcount not one: %d", | ||
672 | page, page_count(page)); | ||
673 | put_page(page); | 653 | put_page(page); |
674 | } | 654 | } |
675 | } | 655 | } |
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
683 | static void __put_nommu_region(struct vm_region *region) | 663 | static void __put_nommu_region(struct vm_region *region) |
684 | __releases(nommu_region_sem) | 664 | __releases(nommu_region_sem) |
685 | { | 665 | { |
686 | kenter("%p{%d}", region, region->vm_usage); | ||
687 | |||
688 | BUG_ON(!nommu_region_tree.rb_node); | 666 | BUG_ON(!nommu_region_tree.rb_node); |
689 | 667 | ||
690 | if (--region->vm_usage == 0) { | 668 | if (--region->vm_usage == 0) { |
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region) | |||
697 | 675 | ||
698 | /* IO memory and memory shared directly out of the pagecache | 676 | /* IO memory and memory shared directly out of the pagecache |
699 | * from ramfs/tmpfs mustn't be released here */ | 677 | * from ramfs/tmpfs mustn't be released here */ |
700 | if (region->vm_flags & VM_MAPPED_COPY) { | 678 | if (region->vm_flags & VM_MAPPED_COPY) |
701 | kdebug("free series"); | ||
702 | free_page_series(region->vm_start, region->vm_top); | 679 | free_page_series(region->vm_start, region->vm_top); |
703 | } | ||
704 | kmem_cache_free(vm_region_jar, region); | 680 | kmem_cache_free(vm_region_jar, region); |
705 | } else { | 681 | } else { |
706 | up_write(&nommu_region_sem); | 682 | up_write(&nommu_region_sem); |
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
744 | struct address_space *mapping; | 720 | struct address_space *mapping; |
745 | struct rb_node **p, *parent, *rb_prev; | 721 | struct rb_node **p, *parent, *rb_prev; |
746 | 722 | ||
747 | kenter(",%p", vma); | ||
748 | |||
749 | BUG_ON(!vma->vm_region); | 723 | BUG_ON(!vma->vm_region); |
750 | 724 | ||
751 | mm->map_count++; | 725 | mm->map_count++; |
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
813 | struct mm_struct *mm = vma->vm_mm; | 787 | struct mm_struct *mm = vma->vm_mm; |
814 | struct task_struct *curr = current; | 788 | struct task_struct *curr = current; |
815 | 789 | ||
816 | kenter("%p", vma); | ||
817 | |||
818 | protect_vma(vma, 0); | 790 | protect_vma(vma, 0); |
819 | 791 | ||
820 | mm->map_count--; | 792 | mm->map_count--; |
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
854 | */ | 826 | */ |
855 | static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | 827 | static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) |
856 | { | 828 | { |
857 | kenter("%p", vma); | ||
858 | if (vma->vm_ops && vma->vm_ops->close) | 829 | if (vma->vm_ops && vma->vm_ops->close) |
859 | vma->vm_ops->close(vma); | 830 | vma->vm_ops->close(vma); |
860 | if (vma->vm_file) | 831 | if (vma->vm_file) |
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file, | |||
957 | int ret; | 928 | int ret; |
958 | 929 | ||
959 | /* do the simple checks first */ | 930 | /* do the simple checks first */ |
960 | if (flags & MAP_FIXED) { | 931 | if (flags & MAP_FIXED) |
961 | printk(KERN_DEBUG | ||
962 | "%d: Can't do fixed-address/overlay mmap of RAM\n", | ||
963 | current->pid); | ||
964 | return -EINVAL; | 932 | return -EINVAL; |
965 | } | ||
966 | 933 | ||
967 | if ((flags & MAP_TYPE) != MAP_PRIVATE && | 934 | if ((flags & MAP_TYPE) != MAP_PRIVATE && |
968 | (flags & MAP_TYPE) != MAP_SHARED) | 935 | (flags & MAP_TYPE) != MAP_SHARED) |
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file, | |||
1060 | ) { | 1027 | ) { |
1061 | capabilities &= ~NOMMU_MAP_DIRECT; | 1028 | capabilities &= ~NOMMU_MAP_DIRECT; |
1062 | if (flags & MAP_SHARED) { | 1029 | if (flags & MAP_SHARED) { |
1063 | printk(KERN_WARNING | 1030 | pr_warn("MAP_SHARED not completely supported on !MMU\n"); |
1064 | "MAP_SHARED not completely supported on !MMU\n"); | ||
1065 | return -EINVAL; | 1031 | return -EINVAL; |
1066 | } | 1032 | } |
1067 | } | 1033 | } |
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1205 | * we're allocating is smaller than a page | 1171 | * we're allocating is smaller than a page |
1206 | */ | 1172 | */ |
1207 | order = get_order(len); | 1173 | order = get_order(len); |
1208 | kdebug("alloc order %d for %lx", order, len); | ||
1209 | |||
1210 | total = 1 << order; | 1174 | total = 1 << order; |
1211 | point = len >> PAGE_SHIFT; | 1175 | point = len >> PAGE_SHIFT; |
1212 | 1176 | ||
1213 | /* we don't want to allocate a power-of-2 sized page set */ | 1177 | /* we don't want to allocate a power-of-2 sized page set */ |
1214 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1178 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) |
1215 | total = point; | 1179 | total = point; |
1216 | kdebug("try to alloc exact %lu pages", total); | ||
1217 | } | ||
1218 | 1180 | ||
1219 | base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); | 1181 | base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); |
1220 | if (!base) | 1182 | if (!base) |
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1285 | unsigned long capabilities, vm_flags, result; | 1247 | unsigned long capabilities, vm_flags, result; |
1286 | int ret; | 1248 | int ret; |
1287 | 1249 | ||
1288 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | ||
1289 | |||
1290 | *populate = 0; | 1250 | *populate = 0; |
1291 | 1251 | ||
1292 | /* decide whether we should attempt the mapping, and if so what sort of | 1252 | /* decide whether we should attempt the mapping, and if so what sort of |
1293 | * mapping */ | 1253 | * mapping */ |
1294 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1254 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
1295 | &capabilities); | 1255 | &capabilities); |
1296 | if (ret < 0) { | 1256 | if (ret < 0) |
1297 | kleave(" = %d [val]", ret); | ||
1298 | return ret; | 1257 | return ret; |
1299 | } | ||
1300 | 1258 | ||
1301 | /* we ignore the address hint */ | 1259 | /* we ignore the address hint */ |
1302 | addr = 0; | 1260 | addr = 0; |
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1383 | vma->vm_start = start; | 1341 | vma->vm_start = start; |
1384 | vma->vm_end = start + len; | 1342 | vma->vm_end = start + len; |
1385 | 1343 | ||
1386 | if (pregion->vm_flags & VM_MAPPED_COPY) { | 1344 | if (pregion->vm_flags & VM_MAPPED_COPY) |
1387 | kdebug("share copy"); | ||
1388 | vma->vm_flags |= VM_MAPPED_COPY; | 1345 | vma->vm_flags |= VM_MAPPED_COPY; |
1389 | } else { | 1346 | else { |
1390 | kdebug("share mmap"); | ||
1391 | ret = do_mmap_shared_file(vma); | 1347 | ret = do_mmap_shared_file(vma); |
1392 | if (ret < 0) { | 1348 | if (ret < 0) { |
1393 | vma->vm_region = NULL; | 1349 | vma->vm_region = NULL; |
@@ -1467,7 +1423,6 @@ share: | |||
1467 | 1423 | ||
1468 | up_write(&nommu_region_sem); | 1424 | up_write(&nommu_region_sem); |
1469 | 1425 | ||
1470 | kleave(" = %lx", result); | ||
1471 | return result; | 1426 | return result; |
1472 | 1427 | ||
1473 | error_just_free: | 1428 | error_just_free: |
@@ -1479,27 +1434,24 @@ error: | |||
1479 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1480 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1481 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1482 | kleave(" = %d", ret); | ||
1483 | return ret; | 1437 | return ret; |
1484 | 1438 | ||
1485 | sharing_violation: | 1439 | sharing_violation: |
1486 | up_write(&nommu_region_sem); | 1440 | up_write(&nommu_region_sem); |
1487 | printk(KERN_WARNING "Attempt to share mismatched mappings\n"); | 1441 | pr_warn("Attempt to share mismatched mappings\n"); |
1488 | ret = -EINVAL; | 1442 | ret = -EINVAL; |
1489 | goto error; | 1443 | goto error; |
1490 | 1444 | ||
1491 | error_getting_vma: | 1445 | error_getting_vma: |
1492 | kmem_cache_free(vm_region_jar, region); | 1446 | kmem_cache_free(vm_region_jar, region); |
1493 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" | 1447 | pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", |
1494 | " from process %d failed\n", | 1448 | len, current->pid); |
1495 | len, current->pid); | ||
1496 | show_free_areas(0); | 1449 | show_free_areas(0); |
1497 | return -ENOMEM; | 1450 | return -ENOMEM; |
1498 | 1451 | ||
1499 | error_getting_region: | 1452 | error_getting_region: |
1500 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" | 1453 | pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", |
1501 | " from process %d failed\n", | 1454 | len, current->pid); |
1502 | len, current->pid); | ||
1503 | show_free_areas(0); | 1455 | show_free_areas(0); |
1504 | return -ENOMEM; | 1456 | return -ENOMEM; |
1505 | } | 1457 | } |
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1563 | struct vm_region *region; | 1515 | struct vm_region *region; |
1564 | unsigned long npages; | 1516 | unsigned long npages; |
1565 | 1517 | ||
1566 | kenter(""); | ||
1567 | |||
1568 | /* we're only permitted to split anonymous regions (these should have | 1518 | /* we're only permitted to split anonymous regions (these should have |
1569 | * only a single usage on the region) */ | 1519 | * only a single usage on the region) */ |
1570 | if (vma->vm_file) | 1520 | if (vma->vm_file) |
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm, | |||
1628 | { | 1578 | { |
1629 | struct vm_region *region; | 1579 | struct vm_region *region; |
1630 | 1580 | ||
1631 | kenter(""); | ||
1632 | |||
1633 | /* adjust the VMA's pointers, which may reposition it in the MM's tree | 1581 | /* adjust the VMA's pointers, which may reposition it in the MM's tree |
1634 | * and list */ | 1582 | * and list */ |
1635 | delete_vma_from_mm(vma); | 1583 | delete_vma_from_mm(vma); |
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1669 | unsigned long end; | 1617 | unsigned long end; |
1670 | int ret; | 1618 | int ret; |
1671 | 1619 | ||
1672 | kenter(",%lx,%zx", start, len); | ||
1673 | |||
1674 | len = PAGE_ALIGN(len); | 1620 | len = PAGE_ALIGN(len); |
1675 | if (len == 0) | 1621 | if (len == 0) |
1676 | return -EINVAL; | 1622 | return -EINVAL; |
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1682 | if (!vma) { | 1628 | if (!vma) { |
1683 | static int limit; | 1629 | static int limit; |
1684 | if (limit < 5) { | 1630 | if (limit < 5) { |
1685 | printk(KERN_WARNING | 1631 | pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", |
1686 | "munmap of memory not mmapped by process %d" | 1632 | current->pid, current->comm, |
1687 | " (%s): 0x%lx-0x%lx\n", | 1633 | start, start + len - 1); |
1688 | current->pid, current->comm, | ||
1689 | start, start + len - 1); | ||
1690 | limit++; | 1634 | limit++; |
1691 | } | 1635 | } |
1692 | return -EINVAL; | 1636 | return -EINVAL; |
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1695 | /* we're allowed to split an anonymous VMA but not a file-backed one */ | 1639 | /* we're allowed to split an anonymous VMA but not a file-backed one */ |
1696 | if (vma->vm_file) { | 1640 | if (vma->vm_file) { |
1697 | do { | 1641 | do { |
1698 | if (start > vma->vm_start) { | 1642 | if (start > vma->vm_start) |
1699 | kleave(" = -EINVAL [miss]"); | ||
1700 | return -EINVAL; | 1643 | return -EINVAL; |
1701 | } | ||
1702 | if (end == vma->vm_end) | 1644 | if (end == vma->vm_end) |
1703 | goto erase_whole_vma; | 1645 | goto erase_whole_vma; |
1704 | vma = vma->vm_next; | 1646 | vma = vma->vm_next; |
1705 | } while (vma); | 1647 | } while (vma); |
1706 | kleave(" = -EINVAL [split file]"); | ||
1707 | return -EINVAL; | 1648 | return -EINVAL; |
1708 | } else { | 1649 | } else { |
1709 | /* the chunk must be a subset of the VMA found */ | 1650 | /* the chunk must be a subset of the VMA found */ |
1710 | if (start == vma->vm_start && end == vma->vm_end) | 1651 | if (start == vma->vm_start && end == vma->vm_end) |
1711 | goto erase_whole_vma; | 1652 | goto erase_whole_vma; |
1712 | if (start < vma->vm_start || end > vma->vm_end) { | 1653 | if (start < vma->vm_start || end > vma->vm_end) |
1713 | kleave(" = -EINVAL [superset]"); | ||
1714 | return -EINVAL; | 1654 | return -EINVAL; |
1715 | } | 1655 | if (start & ~PAGE_MASK) |
1716 | if (start & ~PAGE_MASK) { | ||
1717 | kleave(" = -EINVAL [unaligned start]"); | ||
1718 | return -EINVAL; | 1656 | return -EINVAL; |
1719 | } | 1657 | if (end != vma->vm_end && end & ~PAGE_MASK) |
1720 | if (end != vma->vm_end && end & ~PAGE_MASK) { | ||
1721 | kleave(" = -EINVAL [unaligned split]"); | ||
1722 | return -EINVAL; | 1658 | return -EINVAL; |
1723 | } | ||
1724 | if (start != vma->vm_start && end != vma->vm_end) { | 1659 | if (start != vma->vm_start && end != vma->vm_end) { |
1725 | ret = split_vma(mm, vma, start, 1); | 1660 | ret = split_vma(mm, vma, start, 1); |
1726 | if (ret < 0) { | 1661 | if (ret < 0) |
1727 | kleave(" = %d [split]", ret); | ||
1728 | return ret; | 1662 | return ret; |
1729 | } | ||
1730 | } | 1663 | } |
1731 | return shrink_vma(mm, vma, start, end); | 1664 | return shrink_vma(mm, vma, start, end); |
1732 | } | 1665 | } |
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1734 | erase_whole_vma: | 1667 | erase_whole_vma: |
1735 | delete_vma_from_mm(vma); | 1668 | delete_vma_from_mm(vma); |
1736 | delete_vma(mm, vma); | 1669 | delete_vma(mm, vma); |
1737 | kleave(" = 0"); | ||
1738 | return 0; | 1670 | return 0; |
1739 | } | 1671 | } |
1740 | EXPORT_SYMBOL(do_munmap); | 1672 | EXPORT_SYMBOL(do_munmap); |
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm) | |||
1766 | if (!mm) | 1698 | if (!mm) |
1767 | return; | 1699 | return; |
1768 | 1700 | ||
1769 | kenter(""); | ||
1770 | |||
1771 | mm->total_vm = 0; | 1701 | mm->total_vm = 0; |
1772 | 1702 | ||
1773 | while ((vma = mm->mmap)) { | 1703 | while ((vma = mm->mmap)) { |
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm) | |||
1776 | delete_vma(mm, vma); | 1706 | delete_vma(mm, vma); |
1777 | cond_resched(); | 1707 | cond_resched(); |
1778 | } | 1708 | } |
1779 | |||
1780 | kleave(""); | ||
1781 | } | 1709 | } |
1782 | 1710 | ||
1783 | unsigned long vm_brk(unsigned long addr, unsigned long len) | 1711 | unsigned long vm_brk(unsigned long addr, unsigned long len) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2b665da1b3c9..dff991e0681e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -42,7 +42,8 @@ | |||
42 | int sysctl_panic_on_oom; | 42 | int sysctl_panic_on_oom; |
43 | int sysctl_oom_kill_allocating_task; | 43 | int sysctl_oom_kill_allocating_task; |
44 | int sysctl_oom_dump_tasks = 1; | 44 | int sysctl_oom_dump_tasks = 1; |
45 | static DEFINE_SPINLOCK(zone_scan_lock); | 45 | |
46 | DEFINE_MUTEX(oom_lock); | ||
46 | 47 | ||
47 | #ifdef CONFIG_NUMA | 48 | #ifdef CONFIG_NUMA |
48 | /** | 49 | /** |
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0); | |||
405 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | 406 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); |
406 | 407 | ||
407 | bool oom_killer_disabled __read_mostly; | 408 | bool oom_killer_disabled __read_mostly; |
408 | static DECLARE_RWSEM(oom_sem); | ||
409 | 409 | ||
410 | /** | 410 | /** |
411 | * mark_tsk_oom_victim - marks the given task as OOM victim. | 411 | * mark_oom_victim - mark the given task as OOM victim |
412 | * @tsk: task to mark | 412 | * @tsk: task to mark |
413 | * | 413 | * |
414 | * Has to be called with oom_sem taken for read and never after | 414 | * Has to be called with oom_lock held and never after |
415 | * oom has been disabled already. | 415 | * oom has been disabled already. |
416 | */ | 416 | */ |
417 | void mark_tsk_oom_victim(struct task_struct *tsk) | 417 | void mark_oom_victim(struct task_struct *tsk) |
418 | { | 418 | { |
419 | WARN_ON(oom_killer_disabled); | 419 | WARN_ON(oom_killer_disabled); |
420 | /* OOM killer might race with memcg OOM */ | 420 | /* OOM killer might race with memcg OOM */ |
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk) | |||
431 | } | 431 | } |
432 | 432 | ||
433 | /** | 433 | /** |
434 | * unmark_oom_victim - unmarks the current task as OOM victim. | 434 | * exit_oom_victim - note the exit of an OOM victim |
435 | * | ||
436 | * Wakes up all waiters in oom_killer_disable() | ||
437 | */ | 435 | */ |
438 | void unmark_oom_victim(void) | 436 | void exit_oom_victim(void) |
439 | { | 437 | { |
440 | if (!test_and_clear_thread_flag(TIF_MEMDIE)) | 438 | clear_thread_flag(TIF_MEMDIE); |
441 | return; | ||
442 | 439 | ||
443 | down_read(&oom_sem); | 440 | if (!atomic_dec_return(&oom_victims)) |
444 | /* | ||
445 | * There is no need to signal the last oom_victim if there | ||
446 | * is nobody who cares. | ||
447 | */ | ||
448 | if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) | ||
449 | wake_up_all(&oom_victims_wait); | 441 | wake_up_all(&oom_victims_wait); |
450 | up_read(&oom_sem); | ||
451 | } | 442 | } |
452 | 443 | ||
453 | /** | 444 | /** |
@@ -469,14 +460,14 @@ bool oom_killer_disable(void) | |||
469 | * Make sure to not race with an ongoing OOM killer | 460 | * Make sure to not race with an ongoing OOM killer |
470 | * and that the current is not the victim. | 461 | * and that the current is not the victim. |
471 | */ | 462 | */ |
472 | down_write(&oom_sem); | 463 | mutex_lock(&oom_lock); |
473 | if (test_thread_flag(TIF_MEMDIE)) { | 464 | if (test_thread_flag(TIF_MEMDIE)) { |
474 | up_write(&oom_sem); | 465 | mutex_unlock(&oom_lock); |
475 | return false; | 466 | return false; |
476 | } | 467 | } |
477 | 468 | ||
478 | oom_killer_disabled = true; | 469 | oom_killer_disabled = true; |
479 | up_write(&oom_sem); | 470 | mutex_unlock(&oom_lock); |
480 | 471 | ||
481 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | 472 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); |
482 | 473 | ||
@@ -488,9 +479,7 @@ bool oom_killer_disable(void) | |||
488 | */ | 479 | */ |
489 | void oom_killer_enable(void) | 480 | void oom_killer_enable(void) |
490 | { | 481 | { |
491 | down_write(&oom_sem); | ||
492 | oom_killer_disabled = false; | 482 | oom_killer_disabled = false; |
493 | up_write(&oom_sem); | ||
494 | } | 483 | } |
495 | 484 | ||
496 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 485 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
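The oom_kill.c hunks above replace oom_sem with a plain count-and-wake protocol: exit_oom_victim() now decrements oom_victims unconditionally and wakes the waiter when the count reaches zero, and oom_killer_disable() blocks until that happens. A rough userspace model of the same drain-and-wake pattern, using pthread primitives in place of the kernel's atomics and wait queues (all names below are invented):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_gone = PTHREAD_COND_INITIALIZER;
static int victims = 3;

static void *victim_exits(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (--victims == 0)			/* like atomic_dec_return() hitting 0 */
		pthread_cond_broadcast(&all_gone);	/* like wake_up_all() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, victim_exits, NULL);

	pthread_mutex_lock(&lock);
	while (victims)				/* like wait_event(..., !atomic_read(&oom_victims)) */
		pthread_cond_wait(&all_gone, &lock);
	pthread_mutex_unlock(&lock);
	printf("all victims exited\n");

	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}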
@@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
517 | */ | 506 | */ |
518 | task_lock(p); | 507 | task_lock(p); |
519 | if (p->mm && task_will_free_mem(p)) { | 508 | if (p->mm && task_will_free_mem(p)) { |
520 | mark_tsk_oom_victim(p); | 509 | mark_oom_victim(p); |
521 | task_unlock(p); | 510 | task_unlock(p); |
522 | put_task_struct(p); | 511 | put_task_struct(p); |
523 | return; | 512 | return; |
@@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
528 | dump_header(p, gfp_mask, order, memcg, nodemask); | 517 | dump_header(p, gfp_mask, order, memcg, nodemask); |
529 | 518 | ||
530 | task_lock(p); | 519 | task_lock(p); |
531 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", | 520 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", |
532 | message, task_pid_nr(p), p->comm, points); | 521 | message, task_pid_nr(p), p->comm, points); |
533 | task_unlock(p); | 522 | task_unlock(p); |
534 | 523 | ||
@@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
572 | 561 | ||
573 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 562 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
574 | mm = victim->mm; | 563 | mm = victim->mm; |
575 | mark_tsk_oom_victim(victim); | 564 | mark_oom_victim(victim); |
576 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 565 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
577 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 566 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
578 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 567 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
@@ -645,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb) | |||
645 | } | 634 | } |
646 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | 635 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); |
647 | 636 | ||
648 | /* | ||
649 | * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero | ||
650 | * if a parallel OOM killing is already taking place that includes a zone in | ||
651 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | ||
652 | */ | ||
653 | bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) | ||
654 | { | ||
655 | struct zoneref *z; | ||
656 | struct zone *zone; | ||
657 | bool ret = true; | ||
658 | |||
659 | spin_lock(&zone_scan_lock); | ||
660 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | ||
661 | if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { | ||
662 | ret = false; | ||
663 | goto out; | ||
664 | } | ||
665 | |||
666 | /* | ||
667 | * Lock each zone in the zonelist under zone_scan_lock so a parallel | ||
668 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. | ||
669 | */ | ||
670 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | ||
671 | set_bit(ZONE_OOM_LOCKED, &zone->flags); | ||
672 | |||
673 | out: | ||
674 | spin_unlock(&zone_scan_lock); | ||
675 | return ret; | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed | ||
680 | * allocation attempts with zonelists containing them may now recall the OOM | ||
681 | * killer, if necessary. | ||
682 | */ | ||
683 | void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | ||
684 | { | ||
685 | struct zoneref *z; | ||
686 | struct zone *zone; | ||
687 | |||
688 | spin_lock(&zone_scan_lock); | ||
689 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | ||
690 | clear_bit(ZONE_OOM_LOCKED, &zone->flags); | ||
691 | spin_unlock(&zone_scan_lock); | ||
692 | } | ||
693 | |||
694 | /** | 637 | /** |
695 | * __out_of_memory - kill the "best" process when we run out of memory | 638 | * __out_of_memory - kill the "best" process when we run out of memory |
696 | * @zonelist: zonelist pointer | 639 | * @zonelist: zonelist pointer |
@@ -704,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
704 | * OR try to be smart about which process to kill. Note that we | 647 | * OR try to be smart about which process to kill. Note that we |
705 | * don't have to be perfect here, we just have to be good. | 648 | * don't have to be perfect here, we just have to be good. |
706 | */ | 649 | */ |
707 | static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 650 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
708 | int order, nodemask_t *nodemask, bool force_kill) | 651 | int order, nodemask_t *nodemask, bool force_kill) |
709 | { | 652 | { |
710 | const nodemask_t *mpol_mask; | 653 | const nodemask_t *mpol_mask; |
711 | struct task_struct *p; | 654 | struct task_struct *p; |
@@ -715,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
715 | enum oom_constraint constraint = CONSTRAINT_NONE; | 658 | enum oom_constraint constraint = CONSTRAINT_NONE; |
716 | int killed = 0; | 659 | int killed = 0; |
717 | 660 | ||
661 | if (oom_killer_disabled) | ||
662 | return false; | ||
663 | |||
718 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 664 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
719 | if (freed > 0) | 665 | if (freed > 0) |
720 | /* Got some memory back in the last second. */ | 666 | /* Got some memory back in the last second. */ |
721 | return; | 667 | goto out; |
722 | 668 | ||
723 | /* | 669 | /* |
724 | * If current has a pending SIGKILL or is exiting, then automatically | 670 | * If current has a pending SIGKILL or is exiting, then automatically |
@@ -730,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
730 | */ | 676 | */ |
731 | if (current->mm && | 677 | if (current->mm && |
732 | (fatal_signal_pending(current) || task_will_free_mem(current))) { | 678 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
733 | mark_tsk_oom_victim(current); | 679 | mark_oom_victim(current); |
734 | return; | 680 | goto out; |
735 | } | 681 | } |
736 | 682 | ||
737 | /* | 683 | /* |
@@ -771,32 +717,8 @@ out: | |||
771 | */ | 717 | */ |
772 | if (killed) | 718 | if (killed) |
773 | schedule_timeout_killable(1); | 719 | schedule_timeout_killable(1); |
774 | } | ||
775 | |||
776 | /** | ||
777 | * out_of_memory - tries to invoke OOM killer. | ||
778 | * @zonelist: zonelist pointer | ||
779 | * @gfp_mask: memory allocation flags | ||
780 | * @order: amount of memory being requested as a power of 2 | ||
781 | * @nodemask: nodemask passed to page allocator | ||
782 | * @force_kill: true if a task must be killed, even if others are exiting | ||
783 | * | ||
784 | * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() | ||
785 | * when it returns false. Otherwise returns true. | ||
786 | */ | ||
787 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | ||
788 | int order, nodemask_t *nodemask, bool force_kill) | ||
789 | { | ||
790 | bool ret = false; | ||
791 | |||
792 | down_read(&oom_sem); | ||
793 | if (!oom_killer_disabled) { | ||
794 | __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); | ||
795 | ret = true; | ||
796 | } | ||
797 | up_read(&oom_sem); | ||
798 | 720 | ||
799 | return ret; | 721 | return true; |
800 | } | 722 | } |
801 | 723 | ||
802 | /* | 724 | /* |
@@ -806,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
806 | */ | 728 | */ |
807 | void pagefault_out_of_memory(void) | 729 | void pagefault_out_of_memory(void) |
808 | { | 730 | { |
809 | struct zonelist *zonelist; | ||
810 | |||
811 | down_read(&oom_sem); | ||
812 | if (mem_cgroup_oom_synchronize(true)) | 731 | if (mem_cgroup_oom_synchronize(true)) |
813 | goto unlock; | 732 | return; |
814 | 733 | ||
815 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); | 734 | if (!mutex_trylock(&oom_lock)) |
816 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { | 735 | return; |
817 | if (!oom_killer_disabled) | ||
818 | __out_of_memory(NULL, 0, 0, NULL, false); | ||
819 | else | ||
820 | /* | ||
821 | * There shouldn't be any user tasks runable while the | ||
822 | * OOM killer is disabled so the current task has to | ||
823 | * be a racing OOM victim for which oom_killer_disable() | ||
824 | * is waiting for. | ||
825 | */ | ||
826 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
827 | 736 | ||
828 | oom_zonelist_unlock(zonelist, GFP_KERNEL); | 737 | if (!out_of_memory(NULL, 0, 0, NULL, false)) { |
738 | /* | ||
739 | * There shouldn't be any user tasks runnable while the | ||
740 | * OOM killer is disabled, so the current task has to | ||
741 | * be a racing OOM victim for which oom_killer_disable() | ||
742 | * is waiting. | ||
743 | */ | ||
744 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
829 | } | 745 | } |
830 | unlock: | 746 | |
831 | up_read(&oom_sem); | 747 | mutex_unlock(&oom_lock); |
832 | } | 748 | } |
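Both pagefault_out_of_memory() above and __alloc_pages_may_oom() further down now serialize on the single oom_lock with mutex_trylock(): a failed trylock means another task is already handling the OOM situation, so the caller simply backs off. A small userspace sketch of that pattern, with a pthread mutex standing in for the kernel mutex (names are invented):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t oom_lock_model = PTHREAD_MUTEX_INITIALIZER;

static void try_handle_oom(void)
{
	if (pthread_mutex_trylock(&oom_lock_model) != 0) {
		/* somebody else is making progress for us; retry later */
		printf("lock busy, backing off\n");
		return;
	}
	printf("handling OOM\n");
	pthread_mutex_unlock(&oom_lock_model);
}

int main(void)
{
	try_handle_oom();			/* acquires and releases the lock */

	pthread_mutex_lock(&oom_lock_model);	/* simulate a concurrent holder */
	try_handle_oom();			/* prints "lock busy, backing off" */
	pthread_mutex_unlock(&oom_lock_model);
	return 0;
}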
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2fd31aebef30..5e6fa06f2784 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
380 | } | 380 | } |
381 | } | 381 | } |
382 | 382 | ||
383 | static inline void prep_zero_page(struct page *page, unsigned int order, | ||
384 | gfp_t gfp_flags) | ||
385 | { | ||
386 | int i; | ||
387 | |||
388 | /* | ||
389 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | ||
390 | * and __GFP_HIGHMEM from hard or soft interrupt context. | ||
391 | */ | ||
392 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | ||
393 | for (i = 0; i < (1 << order); i++) | ||
394 | clear_highpage(page + i); | ||
395 | } | ||
396 | |||
397 | #ifdef CONFIG_DEBUG_PAGEALLOC | 383 | #ifdef CONFIG_DEBUG_PAGEALLOC |
398 | unsigned int _debug_guardpage_minorder; | 384 | unsigned int _debug_guardpage_minorder; |
399 | bool _debug_pagealloc_enabled __read_mostly; | 385 | bool _debug_pagealloc_enabled __read_mostly; |
@@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | |||
975 | kasan_alloc_pages(page, order); | 961 | kasan_alloc_pages(page, order); |
976 | 962 | ||
977 | if (gfp_flags & __GFP_ZERO) | 963 | if (gfp_flags & __GFP_ZERO) |
978 | prep_zero_page(page, order, gfp_flags); | 964 | for (i = 0; i < (1 << order); i++) |
965 | clear_highpage(page + i); | ||
979 | 966 | ||
980 | if (order && (gfp_flags & __GFP_COMP)) | 967 | if (order && (gfp_flags & __GFP_COMP)) |
981 | prep_compound_page(page, order); | 968 | prep_compound_page(page, order); |
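With prep_zero_page() removed, the __GFP_ZERO case above simply clears each of the 1 << order pages of the allocation in place. A userspace sketch of the same loop, assuming a 4 KiB page size and using memset() where the kernel uses clear_highpage():

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned int order = 2;			/* an order-2 block covers 4 pages */
	size_t nr = 1UL << order;
	unsigned char *block = malloc(nr * PAGE_SIZE);
	if (!block)
		return 1;

	for (size_t i = 0; i < nr; i++)		/* one page at a time, like clear_highpage() */
		memset(block + i * PAGE_SIZE, 0, PAGE_SIZE);

	printf("zeroed %zu pages\n", nr);
	free(block);
	return 0;
}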
@@ -2322,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2322 | show_mem(filter); | 2309 | show_mem(filter); |
2323 | } | 2310 | } |
2324 | 2311 | ||
2325 | static inline int | ||
2326 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
2327 | unsigned long did_some_progress, | ||
2328 | unsigned long pages_reclaimed) | ||
2329 | { | ||
2330 | /* Do not loop if specifically requested */ | ||
2331 | if (gfp_mask & __GFP_NORETRY) | ||
2332 | return 0; | ||
2333 | |||
2334 | /* Always retry if specifically requested */ | ||
2335 | if (gfp_mask & __GFP_NOFAIL) | ||
2336 | return 1; | ||
2337 | |||
2338 | /* | ||
2339 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim | ||
2340 | * making forward progress without invoking OOM. Suspend also disables | ||
2341 | * storage devices so kswapd will not help. Bail if we are suspending. | ||
2342 | */ | ||
2343 | if (!did_some_progress && pm_suspended_storage()) | ||
2344 | return 0; | ||
2345 | |||
2346 | /* | ||
2347 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
2348 | * means __GFP_NOFAIL, but that may not be true in other | ||
2349 | * implementations. | ||
2350 | */ | ||
2351 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
2352 | return 1; | ||
2353 | |||
2354 | /* | ||
2355 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
2356 | * specified, then we retry until we no longer reclaim any pages | ||
2357 | * (above), or we've reclaimed an order of pages at least as | ||
2358 | * large as the allocation's order. In both cases, if the | ||
2359 | * allocation still fails, we stop retrying. | ||
2360 | */ | ||
2361 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
2362 | return 1; | ||
2363 | |||
2364 | return 0; | ||
2365 | } | ||
2366 | |||
2367 | static inline struct page * | 2312 | static inline struct page * |
2368 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2313 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2369 | const struct alloc_context *ac, unsigned long *did_some_progress) | 2314 | const struct alloc_context *ac, unsigned long *did_some_progress) |
@@ -2373,10 +2318,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2373 | *did_some_progress = 0; | 2318 | *did_some_progress = 0; |
2374 | 2319 | ||
2375 | /* | 2320 | /* |
2376 | * Acquire the per-zone oom lock for each zone. If that | 2321 | * Acquire the oom lock. If that fails, somebody else is |
2377 | * fails, somebody else is making progress for us. | 2322 | * making progress for us. |
2378 | */ | 2323 | */ |
2379 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { | 2324 | if (!mutex_trylock(&oom_lock)) { |
2380 | *did_some_progress = 1; | 2325 | *did_some_progress = 1; |
2381 | schedule_timeout_uninterruptible(1); | 2326 | schedule_timeout_uninterruptible(1); |
2382 | return NULL; | 2327 | return NULL; |
@@ -2402,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2402 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2347 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2403 | if (ac->high_zoneidx < ZONE_NORMAL) | 2348 | if (ac->high_zoneidx < ZONE_NORMAL) |
2404 | goto out; | 2349 | goto out; |
2405 | /* The OOM killer does not compensate for light reclaim */ | 2350 | /* The OOM killer does not compensate for IO-less reclaim */ |
2406 | if (!(gfp_mask & __GFP_FS)) { | 2351 | if (!(gfp_mask & __GFP_FS)) { |
2407 | /* | 2352 | /* |
2408 | * XXX: Page reclaim didn't yield anything, | 2353 | * XXX: Page reclaim didn't yield anything, |
2409 | * and the OOM killer can't be invoked, but | 2354 | * and the OOM killer can't be invoked, but |
2410 | * keep looping as per should_alloc_retry(). | 2355 | * keep looping as per tradition. |
2411 | */ | 2356 | */ |
2412 | *did_some_progress = 1; | 2357 | *did_some_progress = 1; |
2413 | goto out; | 2358 | goto out; |
2414 | } | 2359 | } |
2360 | if (pm_suspended_storage()) | ||
2361 | goto out; | ||
2415 | /* The OOM killer may not free memory on a specific node */ | 2362 | /* The OOM killer may not free memory on a specific node */ |
2416 | if (gfp_mask & __GFP_THISNODE) | 2363 | if (gfp_mask & __GFP_THISNODE) |
2417 | goto out; | 2364 | goto out; |
@@ -2421,7 +2368,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2421 | || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) | 2368 | || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) |
2422 | *did_some_progress = 1; | 2369 | *did_some_progress = 1; |
2423 | out: | 2370 | out: |
2424 | oom_zonelist_unlock(ac->zonelist, gfp_mask); | 2371 | mutex_unlock(&oom_lock); |
2425 | return page; | 2372 | return page; |
2426 | } | 2373 | } |
2427 | 2374 | ||
@@ -2794,40 +2741,40 @@ retry: | |||
2794 | if (page) | 2741 | if (page) |
2795 | goto got_pg; | 2742 | goto got_pg; |
2796 | 2743 | ||
2797 | /* Check if we should retry the allocation */ | 2744 | /* Do not loop if specifically requested */ |
2745 | if (gfp_mask & __GFP_NORETRY) | ||
2746 | goto noretry; | ||
2747 | |||
2748 | /* Keep reclaiming pages as long as there is reasonable progress */ | ||
2798 | pages_reclaimed += did_some_progress; | 2749 | pages_reclaimed += did_some_progress; |
2799 | if (should_alloc_retry(gfp_mask, order, did_some_progress, | 2750 | if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || |
2800 | pages_reclaimed)) { | 2751 | ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { |
2801 | /* | ||
2802 | * If we fail to make progress by freeing individual | ||
2803 | * pages, but the allocation wants us to keep going, | ||
2804 | * start OOM killing tasks. | ||
2805 | */ | ||
2806 | if (!did_some_progress) { | ||
2807 | page = __alloc_pages_may_oom(gfp_mask, order, ac, | ||
2808 | &did_some_progress); | ||
2809 | if (page) | ||
2810 | goto got_pg; | ||
2811 | if (!did_some_progress) | ||
2812 | goto nopage; | ||
2813 | } | ||
2814 | /* Wait for some write requests to complete then retry */ | 2752 | /* Wait for some write requests to complete then retry */ |
2815 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); | 2753 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
2816 | goto retry; | 2754 | goto retry; |
2817 | } else { | ||
2818 | /* | ||
2819 | * High-order allocations do not necessarily loop after | ||
2820 | * direct reclaim and reclaim/compaction depends on compaction | ||
2821 | * being called after reclaim so call directly if necessary | ||
2822 | */ | ||
2823 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2824 | alloc_flags, ac, migration_mode, | ||
2825 | &contended_compaction, | ||
2826 | &deferred_compaction); | ||
2827 | if (page) | ||
2828 | goto got_pg; | ||
2829 | } | 2755 | } |
2830 | 2756 | ||
2757 | /* Reclaim has failed us, start killing things */ | ||
2758 | page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); | ||
2759 | if (page) | ||
2760 | goto got_pg; | ||
2761 | |||
2762 | /* Retry as long as the OOM killer is making progress */ | ||
2763 | if (did_some_progress) | ||
2764 | goto retry; | ||
2765 | |||
2766 | noretry: | ||
2767 | /* | ||
2768 | * High-order allocations do not necessarily loop after | ||
2769 | * direct reclaim and reclaim/compaction depends on compaction | ||
2770 | * being called after reclaim so call directly if necessary | ||
2771 | */ | ||
2772 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, | ||
2773 | ac, migration_mode, | ||
2774 | &contended_compaction, | ||
2775 | &deferred_compaction); | ||
2776 | if (page) | ||
2777 | goto got_pg; | ||
2831 | nopage: | 2778 | nopage: |
2832 | warn_alloc_failed(gfp_mask, order, NULL); | 2779 | warn_alloc_failed(gfp_mask, order, NULL); |
2833 | got_pg: | 2780 | got_pg: |
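The rewritten slowpath above folds the old should_alloc_retry() into two explicit conditions: keep reclaiming while progress is being made for cheap orders, or, for __GFP_REPEAT callers, until an order's worth of pages has been reclaimed; only then is the OOM killer tried. A standalone sketch of that decision (flag and constant names below are local stand-ins, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define FLAG_NORETRY (1u << 0)	/* stand-in for __GFP_NORETRY */
#define FLAG_REPEAT  (1u << 1)	/* stand-in for __GFP_REPEAT */
#define COSTLY_ORDER 3		/* stand-in for PAGE_ALLOC_COSTLY_ORDER */

static bool keep_reclaiming(unsigned int flags, unsigned int order,
			    unsigned long progress, unsigned long reclaimed)
{
	if (flags & FLAG_NORETRY)
		return false;
	/* retry while reclaim makes progress on cheap orders, or until
	 * FLAG_REPEAT callers have reclaimed a whole block's worth */
	return (progress && order <= COSTLY_ORDER) ||
	       ((flags & FLAG_REPEAT) && reclaimed < (1UL << order));
}

int main(void)
{
	printf("%d\n", keep_reclaiming(0, 0, 32, 32));		/* 1: progress, cheap order */
	printf("%d\n", keep_reclaiming(0, 4, 32, 32));		/* 0: costly order, no REPEAT */
	printf("%d\n", keep_reclaiming(FLAG_REPEAT, 4, 0, 8));	/* 1: reclaimed < 1<<4 */
	return 0;
}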
@@ -4867,22 +4814,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
4867 | unsigned long *zones_size, | 4814 | unsigned long *zones_size, |
4868 | unsigned long *zholes_size) | 4815 | unsigned long *zholes_size) |
4869 | { | 4816 | { |
4870 | unsigned long realtotalpages, totalpages = 0; | 4817 | unsigned long realtotalpages = 0, totalpages = 0; |
4871 | enum zone_type i; | 4818 | enum zone_type i; |
4872 | 4819 | ||
4873 | for (i = 0; i < MAX_NR_ZONES; i++) | 4820 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4874 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 4821 | struct zone *zone = pgdat->node_zones + i; |
4875 | node_start_pfn, | 4822 | unsigned long size, real_size; |
4876 | node_end_pfn, | 4823 | |
4877 | zones_size); | 4824 | size = zone_spanned_pages_in_node(pgdat->node_id, i, |
4878 | pgdat->node_spanned_pages = totalpages; | 4825 | node_start_pfn, |
4879 | 4826 | node_end_pfn, | |
4880 | realtotalpages = totalpages; | 4827 | zones_size); |
4881 | for (i = 0; i < MAX_NR_ZONES; i++) | 4828 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
4882 | realtotalpages -= | ||
4883 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
4884 | node_start_pfn, node_end_pfn, | 4829 | node_start_pfn, node_end_pfn, |
4885 | zholes_size); | 4830 | zholes_size); |
4831 | zone->spanned_pages = size; | ||
4832 | zone->present_pages = real_size; | ||
4833 | |||
4834 | totalpages += size; | ||
4835 | realtotalpages += real_size; | ||
4836 | } | ||
4837 | |||
4838 | pgdat->node_spanned_pages = totalpages; | ||
4886 | pgdat->node_present_pages = realtotalpages; | 4839 | pgdat->node_present_pages = realtotalpages; |
4887 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 4840 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
4888 | realtotalpages); | 4841 | realtotalpages); |
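calculate_node_totalpages() above now computes each zone's spanned and present page counts in a single pass and stores them in the zone, so free_area_init_core() can simply read them back. A simplified model of that single pass (the struct below is invented for illustration):

#include <stdio.h>

struct zone_model {
	unsigned long spanned;	/* pages covered by the zone's PFN range */
	unsigned long present;	/* spanned minus holes */
};

int main(void)
{
	unsigned long spans[] = { 4096, 1048576, 262144 };
	unsigned long holes[] = {    0,    8192,   1024 };
	struct zone_model zones[3];
	unsigned long totalpages = 0, realtotalpages = 0;

	for (int i = 0; i < 3; i++) {
		zones[i].spanned = spans[i];
		zones[i].present = spans[i] - holes[i];
		totalpages += zones[i].spanned;
		realtotalpages += zones[i].present;
	}
	printf("spanned %lu, present %lu\n", totalpages, realtotalpages);
	return 0;
}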
@@ -4992,8 +4945,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | |||
4992 | * NOTE: pgdat should get zeroed by caller. | 4945 | * NOTE: pgdat should get zeroed by caller. |
4993 | */ | 4946 | */ |
4994 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4947 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4995 | unsigned long node_start_pfn, unsigned long node_end_pfn, | 4948 | unsigned long node_start_pfn, unsigned long node_end_pfn) |
4996 | unsigned long *zones_size, unsigned long *zholes_size) | ||
4997 | { | 4949 | { |
4998 | enum zone_type j; | 4950 | enum zone_type j; |
4999 | int nid = pgdat->node_id; | 4951 | int nid = pgdat->node_id; |
@@ -5014,12 +4966,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
5014 | struct zone *zone = pgdat->node_zones + j; | 4966 | struct zone *zone = pgdat->node_zones + j; |
5015 | unsigned long size, realsize, freesize, memmap_pages; | 4967 | unsigned long size, realsize, freesize, memmap_pages; |
5016 | 4968 | ||
5017 | size = zone_spanned_pages_in_node(nid, j, node_start_pfn, | 4969 | size = zone->spanned_pages; |
5018 | node_end_pfn, zones_size); | 4970 | realsize = freesize = zone->present_pages; |
5019 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, | ||
5020 | node_start_pfn, | ||
5021 | node_end_pfn, | ||
5022 | zholes_size); | ||
5023 | 4971 | ||
5024 | /* | 4972 | /* |
5025 | * Adjust freesize so that it accounts for how much memory | 4973 | * Adjust freesize so that it accounts for how much memory |
@@ -5054,8 +5002,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
5054 | nr_kernel_pages -= memmap_pages; | 5002 | nr_kernel_pages -= memmap_pages; |
5055 | nr_all_pages += freesize; | 5003 | nr_all_pages += freesize; |
5056 | 5004 | ||
5057 | zone->spanned_pages = size; | ||
5058 | zone->present_pages = realsize; | ||
5059 | /* | 5005 | /* |
5060 | * Set an approximate value for lowmem here, it will be adjusted | 5006 | * Set an approximate value for lowmem here, it will be adjusted |
5061 | * when the bootmem allocator frees pages into the buddy system. | 5007 | * when the bootmem allocator frees pages into the buddy system. |
@@ -5161,8 +5107,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5161 | (unsigned long)pgdat->node_mem_map); | 5107 | (unsigned long)pgdat->node_mem_map); |
5162 | #endif | 5108 | #endif |
5163 | 5109 | ||
5164 | free_area_init_core(pgdat, start_pfn, end_pfn, | 5110 | free_area_init_core(pgdat, start_pfn, end_pfn); |
5165 | zones_size, zholes_size); | ||
5166 | } | 5111 | } |
5167 | 5112 | ||
5168 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5113 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
@@ -6111,9 +6056,9 @@ out: | |||
6111 | return ret; | 6056 | return ret; |
6112 | } | 6057 | } |
6113 | 6058 | ||
6059 | #ifdef CONFIG_NUMA | ||
6114 | int hashdist = HASHDIST_DEFAULT; | 6060 | int hashdist = HASHDIST_DEFAULT; |
6115 | 6061 | ||
6116 | #ifdef CONFIG_NUMA | ||
6117 | static int __init set_hashdist(char *str) | 6062 | static int __init set_hashdist(char *str) |
6118 | { | 6063 | { |
6119 | if (!str) | 6064 | if (!str) |
diff --git a/mm/percpu.c b/mm/percpu.c index dfd02484e8de..2dd74487a0af 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1030,7 +1030,7 @@ area_found: | |||
1030 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | 1030 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
1031 | 1031 | ||
1032 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); | 1032 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); |
1033 | kmemleak_alloc_percpu(ptr, size); | 1033 | kmemleak_alloc_percpu(ptr, size, gfp); |
1034 | return ptr; | 1034 | return ptr; |
1035 | 1035 | ||
1036 | fail_unlock: | 1036 | fail_unlock: |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c25f94b33811..6b674e00153c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
119 | } | 119 | } |
120 | #endif | 120 | #endif |
121 | 121 | ||
122 | #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH | 122 | #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH |
123 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 123 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
124 | pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | 124 | pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, |
125 | pmd_t *pmdp) | 125 | pmd_t *pmdp) |
126 | { | 126 | { |
127 | pmd_t pmd; | 127 | pmd_t pmd; |
128 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 128 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
129 | pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); | 129 | VM_BUG_ON(!pmd_trans_huge(*pmdp)); |
130 | pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); | ||
130 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 131 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
131 | return pmd; | 132 | return pmd; |
132 | } | 133 | } |
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |||
198 | } | 199 | } |
199 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 200 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
200 | #endif | 201 | #endif |
202 | |||
203 | #ifndef pmdp_collapse_flush | ||
204 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
205 | pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, | ||
206 | pmd_t *pmdp) | ||
207 | { | ||
208 | /* | ||
209 | * pmd and hugepage pte format are same. So we could | ||
210 | * use the same function. | ||
211 | */ | ||
212 | pmd_t pmd; | ||
213 | |||
214 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
215 | VM_BUG_ON(pmd_trans_huge(*pmdp)); | ||
216 | pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); | ||
217 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
218 | return pmd; | ||
219 | } | ||
220 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
221 | #endif | ||
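pmdp_huge_clear_flush() and the new pmdp_collapse_flush() above follow the same shape: atomically fetch and clear the entry, then flush the TLB range so no stale translation survives. A loose userspace analogue of the fetch-and-clear step using a C11 atomic exchange (the flush has no userspace equivalent and is only marked with a comment; all names are invented):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long entry = 0x1234;

static unsigned long get_and_clear(_Atomic unsigned long *p)
{
	return atomic_exchange(p, 0);	/* roughly what pmdp_huge_get_and_clear() does */
}

int main(void)
{
	unsigned long old = get_and_clear(&entry);
	/* here the kernel would call flush_tlb_range() for the huge-page range */
	printf("old entry %#lx, now %#lx\n", old, (unsigned long)entry);
	return 0;
}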
@@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | |||
625 | 625 | ||
626 | pmd = pmd_offset(pud, address); | 626 | pmd = pmd_offset(pud, address); |
627 | /* | 627 | /* |
628 | * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() | 628 | * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() |
629 | * without holding anon_vma lock for write. So when looking for a | 629 | * without holding anon_vma lock for write. So when looking for a |
630 | * genuine pmde (in which to find pte), test present and !THP together. | 630 | * genuine pmde (in which to find pte), test present and !THP together. |
631 | */ | 631 | */ |
@@ -950,7 +950,12 @@ void page_move_anon_rmap(struct page *page, | |||
950 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); | 950 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); |
951 | 951 | ||
952 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 952 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
953 | page->mapping = (struct address_space *) anon_vma; | 953 | /* |
954 | * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written | ||
955 | * simultaneously, so a concurrent reader (eg page_referenced()'s | ||
956 | * PageAnon()) will not see one without the other. | ||
957 | */ | ||
958 | WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); | ||
954 | } | 959 | } |
955 | 960 | ||
956 | /** | 961 | /** |
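The page_move_anon_rmap() change above publishes the anon_vma pointer and the PAGE_MAPPING_ANON tag with one WRITE_ONCE() so a racing reader never sees the pointer without its tag bit. A userspace analogue using a C11 atomic store in place of WRITE_ONCE() (types and names below are invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAPPING_ANON 0x1UL	/* low-bit tag, like PAGE_MAPPING_ANON */

struct page_model {
	_Atomic uintptr_t mapping;
};

static void publish(struct page_model *page, void *anon_vma)
{
	/* single store of pointer + tag, analogous to WRITE_ONCE(page->mapping, ...) */
	atomic_store(&page->mapping, (uintptr_t)anon_vma | MAPPING_ANON);
}

int main(void)
{
	static int dummy_anon_vma;
	struct page_model page = { 0 };

	publish(&page, &dummy_anon_vma);
	uintptr_t m = atomic_load(&page.mapping);
	printf("anon? %d\n", (int)(m & MAPPING_ANON));	/* prints 1 */
	return 0;
}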
diff --git a/mm/shmem.c b/mm/shmem.c index 3759099d8ce4..4caf8ed24d65 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
569 | i_size_write(inode, newsize); | 569 | i_size_write(inode, newsize); |
570 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 570 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
571 | } | 571 | } |
572 | if (newsize < oldsize) { | 572 | if (newsize <= oldsize) { |
573 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | 573 | loff_t holebegin = round_up(newsize, PAGE_SIZE); |
574 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 574 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); |
575 | shmem_truncate_range(inode, newsize, (loff_t)-1); | 575 | shmem_truncate_range(inode, newsize, (loff_t)-1); |
@@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void) | |||
1454 | kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", | 1454 | kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", |
1455 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); | 1455 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); |
1456 | slab_state = PARTIAL_NODE; | 1456 | slab_state = PARTIAL_NODE; |
1457 | setup_kmalloc_cache_index_table(); | ||
1457 | 1458 | ||
1458 | slab_early_init = 0; | 1459 | slab_early_init = 0; |
1459 | 1460 | ||
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, | |||
71 | 71 | ||
72 | #ifndef CONFIG_SLOB | 72 | #ifndef CONFIG_SLOB |
73 | /* Kmalloc array related functions */ | 73 | /* Kmalloc array related functions */ |
74 | void setup_kmalloc_cache_index_table(void); | ||
74 | void create_kmalloc_caches(unsigned long); | 75 | void create_kmalloc_caches(unsigned long); |
75 | 76 | ||
76 | /* Find the kmalloc slab corresponding for a certain size */ | 77 | /* Find the kmalloc slab corresponding for a certain size */ |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 999bb3424d44..9f8d71f78404 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -784,25 +784,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) | |||
784 | } | 784 | } |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * Create the kmalloc array. Some of the regular kmalloc arrays | 787 | * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. |
788 | * may already have been created because they were needed to | 788 | * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is |
789 | * enable allocations for slab creation. | 789 | * kmalloc-67108864. |
790 | */ | 790 | */ |
791 | void __init create_kmalloc_caches(unsigned long flags) | 791 | static struct { |
792 | const char *name; | ||
793 | unsigned long size; | ||
794 | } const kmalloc_info[] __initconst = { | ||
795 | {NULL, 0}, {"kmalloc-96", 96}, | ||
796 | {"kmalloc-192", 192}, {"kmalloc-8", 8}, | ||
797 | {"kmalloc-16", 16}, {"kmalloc-32", 32}, | ||
798 | {"kmalloc-64", 64}, {"kmalloc-128", 128}, | ||
799 | {"kmalloc-256", 256}, {"kmalloc-512", 512}, | ||
800 | {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, | ||
801 | {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, | ||
802 | {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, | ||
803 | {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, | ||
804 | {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, | ||
805 | {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, | ||
806 | {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, | ||
807 | {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, | ||
808 | {"kmalloc-67108864", 67108864} | ||
809 | }; | ||
810 | |||
811 | /* | ||
812 | * Patch up the size_index table if we have strange large alignment | ||
813 | * requirements for the kmalloc array. This is only the case for | ||
814 | * MIPS it seems. The standard arches will not generate any code here. | ||
815 | * | ||
816 | * Largest permitted alignment is 256 bytes due to the way we | ||
817 | * handle the index determination for the smaller caches. | ||
818 | * | ||
819 | * Make sure that nothing crazy happens if someone starts tinkering | ||
820 | * around with ARCH_KMALLOC_MINALIGN | ||
821 | */ | ||
822 | void __init setup_kmalloc_cache_index_table(void) | ||
792 | { | 823 | { |
793 | int i; | 824 | int i; |
794 | 825 | ||
795 | /* | ||
796 | * Patch up the size_index table if we have strange large alignment | ||
797 | * requirements for the kmalloc array. This is only the case for | ||
798 | * MIPS it seems. The standard arches will not generate any code here. | ||
799 | * | ||
800 | * Largest permitted alignment is 256 bytes due to the way we | ||
801 | * handle the index determination for the smaller caches. | ||
802 | * | ||
803 | * Make sure that nothing crazy happens if someone starts tinkering | ||
804 | * around with ARCH_KMALLOC_MINALIGN | ||
805 | */ | ||
806 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || | 826 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || |
807 | (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); | 827 | (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); |
808 | 828 | ||
@@ -833,39 +853,41 @@ void __init create_kmalloc_caches(unsigned long flags) | |||
833 | for (i = 128 + 8; i <= 192; i += 8) | 853 | for (i = 128 + 8; i <= 192; i += 8) |
834 | size_index[size_index_elem(i)] = 8; | 854 | size_index[size_index_elem(i)] = 8; |
835 | } | 855 | } |
836 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 856 | } |
857 | |||
858 | /* | ||
859 | * Create the kmalloc array. Some of the regular kmalloc arrays | ||
860 | * may already have been created because they were needed to | ||
861 | * enable allocations for slab creation. | ||
862 | */ | ||
863 | void __init create_kmalloc_caches(unsigned long flags) | ||
864 | { | ||
865 | int i; | ||
866 | |||
867 | for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | ||
837 | if (!kmalloc_caches[i]) { | 868 | if (!kmalloc_caches[i]) { |
838 | kmalloc_caches[i] = create_kmalloc_cache(NULL, | 869 | kmalloc_caches[i] = create_kmalloc_cache( |
839 | 1 << i, flags); | 870 | kmalloc_info[i].name, |
871 | kmalloc_info[i].size, | ||
872 | flags); | ||
840 | } | 873 | } |
841 | 874 | ||
842 | /* | 875 | /* |
843 | * Caches that are not of the two-to-the-power-of size. | 876 | * "i == 2" is the "kmalloc-192" case which is the last special |
844 | * These have to be created immediately after the | 877 | * case for initialization and it's the point to jump to |
845 | * earlier power of two caches | 878 | * allocate the minimize size of the object. In slab allocator, |
879 | * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 | ||
880 | * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is | ||
881 | * defined, it may be larger than 2^5 and here is also the | ||
882 | * trick to skip the empty gap. | ||
846 | */ | 883 | */ |
847 | if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) | 884 | if (i == 2) |
848 | kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); | 885 | i = (KMALLOC_SHIFT_LOW - 1); |
849 | |||
850 | if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) | ||
851 | kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); | ||
852 | } | 886 | } |
853 | 887 | ||
854 | /* Kmalloc array is now usable */ | 888 | /* Kmalloc array is now usable */ |
855 | slab_state = UP; | 889 | slab_state = UP; |
856 | 890 | ||
857 | for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { | ||
858 | struct kmem_cache *s = kmalloc_caches[i]; | ||
859 | char *n; | ||
860 | |||
861 | if (s) { | ||
862 | n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); | ||
863 | |||
864 | BUG_ON(!n); | ||
865 | s->name = n; | ||
866 | } | ||
867 | } | ||
868 | |||
869 | #ifdef CONFIG_ZONE_DMA | 891 | #ifdef CONFIG_ZONE_DMA |
870 | for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { | 892 | for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { |
871 | struct kmem_cache *s = kmalloc_caches[i]; | 893 | struct kmem_cache *s = kmalloc_caches[i]; |
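In the slab_common.c hunk above, the kmalloc cache names and sizes now come from the static kmalloc_info[] table instead of being formatted with kasprintf() after the caches exist, which is what lets slub_debug=,kmalloc-xx match at boot. A toy model of the table-driven creation loop (the cache descriptor type below is invented):

#include <stdio.h>

struct cache_desc {
	const char *name;
	unsigned long size;
};

static const struct cache_desc kmalloc_info_model[] = {
	{ "kmalloc-8", 8 },   { "kmalloc-16", 16 },   { "kmalloc-32", 32 },
	{ "kmalloc-64", 64 }, { "kmalloc-128", 128 }, { "kmalloc-256", 256 },
};

int main(void)
{
	for (unsigned int i = 0;
	     i < sizeof(kmalloc_info_model) / sizeof(kmalloc_info_model[0]); i++)
		printf("creating %s (%lu bytes)\n",
		       kmalloc_info_model[i].name, kmalloc_info_model[i].size);
	return 0;
}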
@@ -3700,6 +3700,7 @@ void __init kmem_cache_init(void) | |||
3700 | kmem_cache_node = bootstrap(&boot_kmem_cache_node); | 3700 | kmem_cache_node = bootstrap(&boot_kmem_cache_node); |
3701 | 3701 | ||
3702 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ | 3702 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ |
3703 | setup_kmalloc_cache_index_table(); | ||
3703 | create_kmalloc_caches(0); | 3704 | create_kmalloc_caches(0); |
3704 | 3705 | ||
3705 | #ifdef CONFIG_SMP | 3706 | #ifdef CONFIG_SMP |
@@ -131,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page) | |||
131 | * here, see the comment above this function. | 131 | * here, see the comment above this function. |
132 | */ | 132 | */ |
133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | 133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); | ||
135 | if (put_page_testzero(page_head)) { | 134 | if (put_page_testzero(page_head)) { |
136 | /* | 135 | /* |
137 | * If this is the tail of a slab THP page, | 136 | * If this is the tail of a slab THP page, |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd71bac..19ef01e90ac4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2646,7 +2646,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2646 | 2646 | ||
2647 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2647 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2648 | zone = &pgdat->node_zones[i]; | 2648 | zone = &pgdat->node_zones[i]; |
2649 | if (!populated_zone(zone)) | 2649 | if (!populated_zone(zone) || |
2650 | zone_reclaimable_pages(zone) == 0) | ||
2650 | continue; | 2651 | continue; |
2651 | 2652 | ||
2652 | pfmemalloc_reserve += min_wmark_pages(zone); | 2653 | pfmemalloc_reserve += min_wmark_pages(zone); |
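pfmemalloc_watermark_ok() above now skips zones that are unpopulated or have nothing reclaimable when it sums the reserve and the free pages it compares against. A simplified sketch of that calculation (the zone fields below are invented, but the comparison has the same shape as the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned long managed;		/* populated if non-zero */
	unsigned long reclaimable;
	unsigned long min_wmark;
	unsigned long free;
};

static bool watermark_ok(const struct zone_model *zones, int nr)
{
	unsigned long reserve = 0, free = 0;

	for (int i = 0; i < nr; i++) {
		if (!zones[i].managed || zones[i].reclaimable == 0)
			continue;	/* mirrors the new skip in the hunk above */
		reserve += zones[i].min_wmark;
		free += zones[i].free;
	}
	return free > reserve / 2;
}

int main(void)
{
	struct zone_model zones[] = {
		{ 0, 0, 100, 0 },	/* unpopulated zone: ignored */
		{ 1000, 0, 100, 0 },	/* nothing reclaimable: ignored */
		{ 4000, 2000, 100, 80 },
	};
	printf("ok=%d\n", watermark_ok(zones, 3));	/* 80 > 100/2 -> 1 */
	return 0;
}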
@@ -3596,7 +3597,7 @@ int zone_reclaim_mode __read_mostly; | |||
3596 | #define RECLAIM_OFF 0 | 3597 | #define RECLAIM_OFF 0 |
3597 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ | 3598 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
3598 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 3599 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
3599 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 3600 | #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ |
3600 | 3601 | ||
3601 | /* | 3602 | /* |
3602 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 3603 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
@@ -3638,12 +3639,12 @@ static long zone_pagecache_reclaimable(struct zone *zone) | |||
3638 | long delta = 0; | 3639 | long delta = 0; |
3639 | 3640 | ||
3640 | /* | 3641 | /* |
3641 | * If RECLAIM_SWAP is set, then all file pages are considered | 3642 | * If RECLAIM_UNMAP is set, then all file pages are considered |
3642 | * potentially reclaimable. Otherwise, we have to worry about | 3643 | * potentially reclaimable. Otherwise, we have to worry about |
3643 | * pages like swapcache and zone_unmapped_file_pages() provides | 3644 | * pages like swapcache and zone_unmapped_file_pages() provides |
3644 | * a better estimate | 3645 | * a better estimate |
3645 | */ | 3646 | */ |
3646 | if (zone_reclaim_mode & RECLAIM_SWAP) | 3647 | if (zone_reclaim_mode & RECLAIM_UNMAP) |
3647 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | 3648 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); |
3648 | else | 3649 | else |
3649 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | 3650 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); |
@@ -3674,15 +3675,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3674 | .order = order, | 3675 | .order = order, |
3675 | .priority = ZONE_RECLAIM_PRIORITY, | 3676 | .priority = ZONE_RECLAIM_PRIORITY, |
3676 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3677 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
3677 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3678 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), |
3678 | .may_swap = 1, | 3679 | .may_swap = 1, |
3679 | }; | 3680 | }; |
3680 | 3681 | ||
3681 | cond_resched(); | 3682 | cond_resched(); |
3682 | /* | 3683 | /* |
3683 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 3684 | * We need to be able to allocate from the reserves for RECLAIM_UNMAP |
3684 | * and we also need to be able to write out pages for RECLAIM_WRITE | 3685 | * and we also need to be able to write out pages for RECLAIM_WRITE |
3685 | * and RECLAIM_SWAP. | 3686 | * and RECLAIM_UNMAP. |
3686 | */ | 3687 | */ |
3687 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 3688 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
3688 | lockdep_set_current_reclaim_state(gfp_mask); | 3689 | lockdep_set_current_reclaim_state(gfp_mask); |
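The remaining vmscan.c hunks rename RECLAIM_SWAP to RECLAIM_UNMAP; the bit value and the way it is tested against zone_reclaim_mode are unchanged. A minimal illustration of that bit test (the flag values are copied locally for the example):

#include <stdio.h>

#define RECLAIM_ZONE  (1 << 0)	/* run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1 << 1)	/* write out pages during reclaim */
#define RECLAIM_UNMAP (1 << 2)	/* unmap pages during reclaim */

int main(void)
{
	int zone_reclaim_mode = RECLAIM_ZONE | RECLAIM_UNMAP;

	printf("may_writepage=%d may_unmap=%d\n",
	       !!(zone_reclaim_mode & RECLAIM_WRITE),
	       !!(zone_reclaim_mode & RECLAIM_UNMAP));	/* prints: 0 1 */
	return 0;
}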