154 files changed, 3209 insertions, 1364 deletions
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 3c94ff3f9693..f2235a162529 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt | |||
@@ -445,7 +445,7 @@ across partially overlapping sets of CPUs would risk unstable dynamics | |||
445 | that would be beyond our understanding. So if each of two partially | 445 | that would be beyond our understanding. So if each of two partially |
446 | overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we | 446 | overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we |
447 | form a single sched domain that is a superset of both. We won't move | 447 | form a single sched domain that is a superset of both. We won't move |
448 | a task to a CPU outside it cpuset, but the scheduler load balancing | 448 | a task to a CPU outside its cpuset, but the scheduler load balancing |
449 | code might waste some compute cycles considering that possibility. | 449 | code might waste some compute cycles considering that possibility. |
450 | 450 | ||
451 | This mismatch is why there is not a simple one-to-one relation | 451 | This mismatch is why there is not a simple one-to-one relation |
@@ -552,8 +552,8 @@ otherwise initial value -1 that indicates the cpuset has no request. | |||
552 | 1 : search siblings (hyperthreads in a core). | 552 | 1 : search siblings (hyperthreads in a core). |
553 | 2 : search cores in a package. | 553 | 2 : search cores in a package. |
554 | 3 : search cpus in a node [= system wide on non-NUMA system] | 554 | 3 : search cpus in a node [= system wide on non-NUMA system] |
555 | ( 4 : search nodes in a chunk of node [on NUMA system] ) | 555 | 4 : search nodes in a chunk of node [on NUMA system] |
556 | ( 5 : search system wide [on NUMA system] ) | 556 | 5 : search system wide [on NUMA system] |
557 | 557 | ||
558 | The system default is architecture dependent. The system default | 558 | The system default is architecture dependent. The system default |
559 | can be changed using the relax_domain_level= boot parameter. | 559 | can be changed using the relax_domain_level= boot parameter. |
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 46b2b5080317..a22df3ad35ff 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -326,7 +326,7 @@ per cgroup, instead of globally. | |||
326 | 326 | ||
327 | * tcp memory pressure: sockets memory pressure for the tcp protocol. | 327 | * tcp memory pressure: sockets memory pressure for the tcp protocol. |
328 | 328 | ||
329 | 2.7.3 Common use cases | 329 | 2.7.2 Common use cases |
330 | 330 | ||
331 | Because the "kmem" counter is fed to the main user counter, kernel memory can | 331 | Because the "kmem" counter is fed to the main user counter, kernel memory can |
332 | never be limited completely independently of user memory. Say "U" is the user | 332 | never be limited completely independently of user memory. Say "U" is the user |
@@ -354,19 +354,19 @@ set: | |||
354 | 354 | ||
355 | 3. User Interface | 355 | 3. User Interface |
356 | 356 | ||
357 | 0. Configuration | 357 | 3.0. Configuration |
358 | 358 | ||
359 | a. Enable CONFIG_CGROUPS | 359 | a. Enable CONFIG_CGROUPS |
360 | b. Enable CONFIG_MEMCG | 360 | b. Enable CONFIG_MEMCG |
361 | c. Enable CONFIG_MEMCG_SWAP (to use swap extension) | 361 | c. Enable CONFIG_MEMCG_SWAP (to use swap extension) |
362 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) | 362 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) |
363 | 363 | ||
364 | 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) | 364 | 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) |
365 | # mount -t tmpfs none /sys/fs/cgroup | 365 | # mount -t tmpfs none /sys/fs/cgroup |
366 | # mkdir /sys/fs/cgroup/memory | 366 | # mkdir /sys/fs/cgroup/memory |
367 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory | 367 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory |
368 | 368 | ||
369 | 2. Make the new group and move bash into it | 369 | 3.2. Make the new group and move bash into it |
370 | # mkdir /sys/fs/cgroup/memory/0 | 370 | # mkdir /sys/fs/cgroup/memory/0 |
371 | # echo $$ > /sys/fs/cgroup/memory/0/tasks | 371 | # echo $$ > /sys/fs/cgroup/memory/0/tasks |
372 | 372 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43ecdcd39df2..4a337daf0c09 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -829,6 +829,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
829 | CONFIG_DEBUG_PAGEALLOC, hence this option will not help | 829 | CONFIG_DEBUG_PAGEALLOC, hence this option will not help |
830 | tracking down these problems. | 830 | tracking down these problems. |
831 | 831 | ||
832 | debug_pagealloc= | ||
833 | [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this | ||
834 | parameter enables the feature at boot time. It is | ||
835 | disabled by default. Not enabling it at boot avoids | ||
836 | allocating a huge chunk of memory for debug pagealloc, | ||
837 | and the system then behaves much the same as a kernel | ||
838 | built without CONFIG_DEBUG_PAGEALLOC. | ||
839 | on: enable the feature | ||
840 | |||
832 | debugpat [X86] Enable PAT debugging | 841 | debugpat [X86] Enable PAT debugging |
833 | 842 | ||
834 | decnet.addr= [HW,NET] | 843 | decnet.addr= [HW,NET] |
@@ -1228,9 +1237,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1228 | multiple times interleaved with hugepages= to reserve | 1237 | multiple times interleaved with hugepages= to reserve |
1229 | huge pages of different sizes. Valid pages sizes on | 1238 | huge pages of different sizes. Valid pages sizes on |
1230 | x86-64 are 2M (when the CPU supports "pse") and 1G | 1239 | x86-64 are 2M (when the CPU supports "pse") and 1G |
1231 | (when the CPU supports the "pdpe1gb" cpuinfo flag) | 1240 | (when the CPU supports the "pdpe1gb" cpuinfo flag). |
1232 | Note that 1GB pages can only be allocated at boot time | ||
1233 | using hugepages= and not freed afterwards. | ||
1234 | 1241 | ||
1235 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) | 1242 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) |
1236 | terminal devices. Valid values: 0..8 | 1243 | terminal devices. Valid values: 0..8 |
@@ -2506,6 +2513,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2506 | OSS [HW,OSS] | 2513 | OSS [HW,OSS] |
2507 | See Documentation/sound/oss/oss-parameters.txt | 2514 | See Documentation/sound/oss/oss-parameters.txt |
2508 | 2515 | ||
2516 | page_owner= [KNL] Boot-time page_owner enabling option. | ||
2517 | Storage of information about who allocated each page | ||
2518 | is disabled by default. With this switch, we can turn | ||
2519 | it on. | ||
2520 | on: enable the feature | ||
2521 | |||
2509 | panic= [KNL] Kernel behaviour on panic: delay <timeout> | 2522 | panic= [KNL] Kernel behaviour on panic: delay <timeout> |
2510 | timeout > 0: seconds before rebooting | 2523 | timeout > 0: seconds before rebooting |
2511 | timeout = 0: wait forever | 2524 | timeout = 0: wait forever |
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt index 300da4bdfdbd..407576a23317 100644 --- a/Documentation/local_ops.txt +++ b/Documentation/local_ops.txt | |||
@@ -8,6 +8,11 @@ to implement them for any given architecture and shows how they can be used | |||
8 | properly. It also stresses on the precautions that must be taken when reading | 8 | properly. It also stresses on the precautions that must be taken when reading |
9 | those local variables across CPUs when the order of memory writes matters. | 9 | those local variables across CPUs when the order of memory writes matters. |
10 | 10 | ||
11 | Note that local_t based operations are not recommended for general kernel use. | ||
12 | Please use this_cpu operations instead, unless there is really a special purpose. | ||
13 | Most uses of local_t in the kernel have been replaced by this_cpu operations, | ||
14 | which combine the per-cpu address relocation with local_t-like semantics in a | ||
15 | single instruction and yield more compact and faster executing code. | ||
11 | 16 | ||
12 | 17 | ||
13 | * Purpose of local atomic operations | 18 | * Purpose of local atomic operations |
@@ -87,10 +92,10 @@ the per cpu variable. For instance : | |||
87 | local_inc(&get_cpu_var(counters)); | 92 | local_inc(&get_cpu_var(counters)); |
88 | put_cpu_var(counters); | 93 | put_cpu_var(counters); |
89 | 94 | ||
90 | If you are already in a preemption-safe context, you can directly use | 95 | If you are already in a preemption-safe context, you can use |
91 | __get_cpu_var() instead. | 96 | this_cpu_ptr() instead. |
92 | 97 | ||
93 | local_inc(&__get_cpu_var(counters)); | 98 | local_inc(this_cpu_ptr(&counters)); |
94 | 99 | ||
95 | 100 | ||
96 | 101 | ||
@@ -134,7 +139,7 @@ static void test_each(void *info) | |||
134 | { | 139 | { |
135 | /* Increment the counter from a non preemptible context */ | 140 | /* Increment the counter from a non preemptible context */ |
136 | printk("Increment on cpu %d\n", smp_processor_id()); | 141 | printk("Increment on cpu %d\n", smp_processor_id()); |
137 | local_inc(&__get_cpu_var(counters)); | 142 | local_inc(this_cpu_ptr(&counters)); |
138 | 143 | ||
139 | /* This is what incrementing the variable would look like within a | 144 | /* This is what incrementing the variable would look like within a |
140 | * preemptible context (it disables preemption) : | 145 | * preemptible context (it disables preemption) : |
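The converted examples boil down to the following condensed sketch (not part of the patch) of the local_t plus this_cpu_ptr() pattern that Documentation/local_ops.txt describes:

    #include <linux/percpu.h>
    #include <asm/local.h>

    /* One counter per CPU; local_t makes the update atomic w.r.t. the owning CPU. */
    static DEFINE_PER_CPU(local_t, counters) = LOCAL_INIT(0);

    /* Call from a non-preemptible context (e.g. an interrupt handler). */
    static void count_event(void)
    {
            local_inc(this_cpu_ptr(&counters));
    }

    /* Any CPU may read another CPU's counter with a plain local_read(). */
    static long read_counter(int cpu)
    {
            return local_read(&per_cpu(counters, cpu));
    }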
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index b5d0c8501a18..75511efefc64 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -116,10 +116,12 @@ set during run time. | |||
116 | 116 | ||
117 | auto_msgmni: | 117 | auto_msgmni: |
118 | 118 | ||
119 | Enables/Disables automatic recomputing of msgmni upon memory add/remove | 119 | This variable has no effect and may be removed in future kernel |
120 | or upon ipc namespace creation/removal (see the msgmni description | 120 | releases. Reading it always returns 0. |
121 | above). Echoing "1" into this file enables msgmni automatic recomputing. | 121 | Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni |
122 | Echoing "0" turns it off. auto_msgmni default value is 1. | 122 | upon memory add/remove or upon ipc namespace creation/removal. |
123 | Echoing "1" into this file enabled msgmni automatic recomputing. | ||
124 | Echoing "0" turned it off. auto_msgmni default value was 1. | ||
123 | 125 | ||
124 | 126 | ||
125 | ============================================================== | 127 | ============================================================== |
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt new file mode 100644 index 000000000000..8f3ce9b3aa11 --- /dev/null +++ b/Documentation/vm/page_owner.txt | |||
@@ -0,0 +1,81 @@ | |||
1 | page owner: Tracking who allocated each page | ||
2 | ----------------------------------------------------------- | ||
3 | |||
4 | * Introduction | ||
5 | |||
6 | page owner tracks who allocated each page. It can be used to debug | ||
7 | memory leaks or to find memory hogs. When an allocation happens, | ||
8 | information about the allocation, such as the call stack and the | ||
9 | order of the pages, is stored for each page. When we need to know | ||
10 | the status of all pages, we can retrieve and analyze this | ||
11 | information. | ||
12 | |||
13 | Although we already have tracepoints for page allocation/free, using | ||
14 | them to analyze who allocated each page is rather complex. We would | ||
15 | need to enlarge the trace buffer to avoid overwriting events before the | ||
16 | userspace program is launched, and that program would then have to | ||
17 | dump the trace buffer continually for later analysis, which is more | ||
18 | likely to change system behaviour than just keeping the data in memory, so it is bad for debugging. | ||
19 | |||
20 | page owner can also be used for various purposes. For example, accurate | ||
21 | fragmentation statistics can be obtained through gfp flag information of | ||
22 | each page. It is already implemented and activated if page owner is | ||
23 | enabled. Other usages are more than welcome. | ||
24 | |||
25 | page owner is disabled by default, so if you'd like to use it you need | ||
26 | to add "page_owner=on" to your boot cmdline. If the kernel is built | ||
27 | with page owner but the feature is not enabled at boot, the runtime | ||
28 | overhead is marginal. When disabled at runtime it does not require | ||
29 | memory to store owner information, so there is no runtime memory | ||
30 | overhead, and page owner inserts just two unlikely branches into the | ||
31 | page allocator hotpath; if they evaluate to false, allocation proceeds | ||
32 | as in a kernel without page owner. These two unlikely branches should | ||
33 | not affect allocation performance. The following is the kernel's | ||
34 | code size change due to this facility. | ||
35 | |||
36 | - Without page owner | ||
37 | text data bss dec hex filename | ||
38 | 40662 1493 644 42799 a72f mm/page_alloc.o | ||
39 | |||
40 | - With page owner | ||
41 | text data bss dec hex filename | ||
42 | 40892 1493 644 43029 a815 mm/page_alloc.o | ||
43 | 1427 24 8 1459 5b3 mm/page_ext.o | ||
44 | 2722 50 0 2772 ad4 mm/page_owner.o | ||
45 | |||
46 | Although roughly 4 KB of code is added in total, page_alloc.o grows by | ||
47 | only 230 bytes and only half of that is in the hotpath. Building the | ||
48 | kernel with page owner and turning it on only when needed is a good | ||
49 | option for debugging kernel memory problems. | ||
50 | |||
51 | There is one caveat caused by an implementation detail. page owner | ||
52 | stores its information in memory that comes from the struct page | ||
53 | extension. On sparse memory systems this memory is initialized some | ||
54 | time after the page allocator starts, so until then many pages can be | ||
55 | allocated without any owner information. To handle this, those early | ||
56 | allocated pages are detected and marked as allocated during the | ||
57 | initialization phase. This does not give them correct owner | ||
58 | information, but at least we can tell, more accurately, whether a page | ||
59 | is allocated or not. On an x86-64 VM with 2GB of memory, 13343 early | ||
60 | allocated pages are caught and marked, although most of them are | ||
61 | allocated by the struct page extension feature itself. After that, no | ||
62 | page is left in an untracked state. | ||
63 | |||
64 | * Usage | ||
65 | |||
66 | 1) Build user-space helper | ||
67 | cd tools/vm | ||
68 | make page_owner_sort | ||
69 | |||
70 | 2) Enable page owner | ||
71 | Add "page_owner=on" to boot cmdline. | ||
72 | |||
73 | 3) Do the job you want to debug | ||
74 | |||
75 | 4) Analyze information from page owner | ||
76 | cat /sys/kernel/debug/page_owner > page_owner_full.txt | ||
77 | grep -v ^PFN page_owner_full.txt > page_owner.txt | ||
78 | ./page_owner_sort page_owner.txt sorted_page_owner.txt | ||
79 | |||
80 | See sorted_page_owner.txt for the result showing who | ||
81 | allocated each page. | ||
diff --git a/MAINTAINERS b/MAINTAINERS index 326dc2d1652d..1f0ef48830f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -4045,7 +4045,7 @@ F: drivers/tty/serial/ucc_uart.c | |||
4045 | FREESCALE SOC SOUND DRIVERS | 4045 | FREESCALE SOC SOUND DRIVERS |
4046 | M: Timur Tabi <timur@tabi.org> | 4046 | M: Timur Tabi <timur@tabi.org> |
4047 | M: Nicolin Chen <nicoleotsuka@gmail.com> | 4047 | M: Nicolin Chen <nicoleotsuka@gmail.com> |
4048 | M: Xiubo Li <Li.Xiubo@freescale.com> | 4048 | M: Xiubo Li <Xiubo.Lee@gmail.com> |
4049 | L: alsa-devel@alsa-project.org (moderated for non-subscribers) | 4049 | L: alsa-devel@alsa-project.org (moderated for non-subscribers) |
4050 | L: linuxppc-dev@lists.ozlabs.org | 4050 | L: linuxppc-dev@lists.ozlabs.org |
4051 | S: Maintained | 4051 | S: Maintained |
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0bee1fe209b1..97d07ed60a0b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig | |||
@@ -5,6 +5,7 @@ config ARM | |||
5 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 5 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST | 6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST |
7 | select ARCH_HAVE_CUSTOM_GPIO_H | 7 | select ARCH_HAVE_CUSTOM_GPIO_H |
8 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
8 | select ARCH_MIGHT_HAVE_PC_PARPORT | 9 | select ARCH_MIGHT_HAVE_PC_PARPORT |
9 | select ARCH_SUPPORTS_ATOMIC_RMW | 10 | select ARCH_SUPPORTS_ATOMIC_RMW |
10 | select ARCH_USE_BUILTIN_BSWAP | 11 | select ARCH_USE_BUILTIN_BSWAP |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6b1ebd964c10..688db03ef5b8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
@@ -2,6 +2,7 @@ config ARM64 | |||
2 | def_bool y | 2 | def_bool y |
3 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 3 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE |
4 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 4 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
5 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
5 | select ARCH_HAS_SG_CHAIN | 6 | select ARCH_HAS_SG_CHAIN |
6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST | 7 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST |
7 | select ARCH_USE_CMPXCHG_LOCKREF | 8 | select ARCH_USE_CMPXCHG_LOCKREF |
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a7736fa0580c..0bce820428fc 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config MICROBLAZE | 1 | config MICROBLAZE |
2 | def_bool y | 2 | def_bool y |
3 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
3 | select ARCH_MIGHT_HAVE_PC_PARPORT | 4 | select ARCH_MIGHT_HAVE_PC_PARPORT |
4 | select ARCH_WANT_IPC_PARSE_VERSION | 5 | select ARCH_WANT_IPC_PARSE_VERSION |
5 | select ARCH_WANT_OPTIONAL_GPIOLIB | 6 | select ARCH_WANT_OPTIONAL_GPIOLIB |
diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S index f8c45cc2947d..536ef66bb94b 100644 --- a/arch/parisc/lib/fixup.S +++ b/arch/parisc/lib/fixup.S | |||
@@ -38,14 +38,14 @@ | |||
38 | LDREGX \t2(\t1),\t2 | 38 | LDREGX \t2(\t1),\t2 |
39 | addil LT%exception_data,%r27 | 39 | addil LT%exception_data,%r27 |
40 | LDREG RT%exception_data(%r1),\t1 | 40 | LDREG RT%exception_data(%r1),\t1 |
41 | /* t1 = &__get_cpu_var(exception_data) */ | 41 | /* t1 = this_cpu_ptr(&exception_data) */ |
42 | add,l \t1,\t2,\t1 | 42 | add,l \t1,\t2,\t1 |
43 | /* t1 = t1->fault_ip */ | 43 | /* t1 = t1->fault_ip */ |
44 | LDREG EXCDATA_IP(\t1), \t1 | 44 | LDREG EXCDATA_IP(\t1), \t1 |
45 | .endm | 45 | .endm |
46 | #else | 46 | #else |
47 | .macro get_fault_ip t1 t2 | 47 | .macro get_fault_ip t1 t2 |
48 | /* t1 = &__get_cpu_var(exception_data) */ | 48 | /* t1 = this_cpu_ptr(&exception_data) */ |
49 | addil LT%exception_data,%r27 | 49 | addil LT%exception_data,%r27 |
50 | LDREG RT%exception_data(%r1),\t2 | 50 | LDREG RT%exception_data(%r1),\t2 |
51 | /* t1 = t2->fault_ip */ | 51 | /* t1 = t2->fault_ip */ |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index af696874248b..a2a168e2dfe7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
@@ -129,6 +129,7 @@ config PPC | |||
129 | select HAVE_BPF_JIT if PPC64 | 129 | select HAVE_BPF_JIT if PPC64 |
130 | select HAVE_ARCH_JUMP_LABEL | 130 | select HAVE_ARCH_JUMP_LABEL |
131 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 131 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
132 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
132 | select GENERIC_SMP_IDLE_THREAD | 133 | select GENERIC_SMP_IDLE_THREAD |
133 | select GENERIC_CMOS_UPDATE | 134 | select GENERIC_CMOS_UPDATE |
134 | select GENERIC_TIME_VSYSCALL_OLD | 135 | select GENERIC_TIME_VSYSCALL_OLD |
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e56a307bc676..2c2022d16059 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c | |||
@@ -1514,7 +1514,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) | |||
1514 | mmu_kernel_ssize, 0); | 1514 | mmu_kernel_ssize, 0); |
1515 | } | 1515 | } |
1516 | 1516 | ||
1517 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1517 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
1518 | { | 1518 | { |
1519 | unsigned long flags, vaddr, lmi; | 1519 | unsigned long flags, vaddr, lmi; |
1520 | int i; | 1520 | int i; |
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d545b1231594..50fad3801f30 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c | |||
@@ -429,7 +429,7 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) | |||
429 | } | 429 | } |
430 | 430 | ||
431 | 431 | ||
432 | void kernel_map_pages(struct page *page, int numpages, int enable) | 432 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
433 | { | 433 | { |
434 | if (PageHighMem(page)) | 434 | if (PageHighMem(page)) |
435 | return; | 435 | return; |
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f2cf1f90295b..68b68d755fdf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
@@ -65,6 +65,7 @@ config S390 | |||
65 | def_bool y | 65 | def_bool y |
66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS | 67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS |
68 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
68 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 69 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
69 | select ARCH_INLINE_READ_LOCK | 70 | select ARCH_INLINE_READ_LOCK |
70 | select ARCH_INLINE_READ_LOCK_BH | 71 | select ARCH_INLINE_READ_LOCK_BH |
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 3fef3b299665..426c9d462d1c 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c | |||
@@ -120,7 +120,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) | |||
120 | } | 120 | } |
121 | } | 121 | } |
122 | 122 | ||
123 | void kernel_map_pages(struct page *page, int numpages, int enable) | 123 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
124 | { | 124 | { |
125 | unsigned long address; | 125 | unsigned long address; |
126 | int nr, i, j; | 126 | int nr, i, j; |
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a1403470f80e..c6b6ee5f38b2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig | |||
@@ -16,6 +16,7 @@ config SUPERH | |||
16 | select HAVE_DEBUG_BUGVERBOSE | 16 | select HAVE_DEBUG_BUGVERBOSE |
17 | select ARCH_HAVE_CUSTOM_GPIO_H | 17 | select ARCH_HAVE_CUSTOM_GPIO_H |
18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) | 18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) |
19 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
19 | select PERF_USE_VMALLOC | 20 | select PERF_USE_VMALLOC |
20 | select HAVE_DEBUG_KMEMLEAK | 21 | select HAVE_DEBUG_KMEMLEAK |
21 | select HAVE_KERNEL_GZIP | 22 | select HAVE_KERNEL_GZIP |
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index 46d83842eddc..6f35f4df17f2 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h | |||
@@ -415,8 +415,9 @@ | |||
415 | #define __NR_getrandom 347 | 415 | #define __NR_getrandom 347 |
416 | #define __NR_memfd_create 348 | 416 | #define __NR_memfd_create 348 |
417 | #define __NR_bpf 349 | 417 | #define __NR_bpf 349 |
418 | #define __NR_execveat 350 | ||
418 | 419 | ||
419 | #define NR_syscalls 350 | 420 | #define NR_syscalls 351 |
420 | 421 | ||
421 | /* Bitmask values returned from kern_features system call. */ | 422 | /* Bitmask values returned from kern_features system call. */ |
422 | #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 | 423 | #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 |
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index 33a17e7b3ccd..bb0008927598 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S | |||
@@ -6,6 +6,11 @@ sys64_execve: | |||
6 | jmpl %g1, %g0 | 6 | jmpl %g1, %g0 |
7 | flushw | 7 | flushw |
8 | 8 | ||
9 | sys64_execveat: | ||
10 | set sys_execveat, %g1 | ||
11 | jmpl %g1, %g0 | ||
12 | flushw | ||
13 | |||
9 | #ifdef CONFIG_COMPAT | 14 | #ifdef CONFIG_COMPAT |
10 | sunos_execv: | 15 | sunos_execv: |
11 | mov %g0, %o2 | 16 | mov %g0, %o2 |
@@ -13,6 +18,11 @@ sys32_execve: | |||
13 | set compat_sys_execve, %g1 | 18 | set compat_sys_execve, %g1 |
14 | jmpl %g1, %g0 | 19 | jmpl %g1, %g0 |
15 | flushw | 20 | flushw |
21 | |||
22 | sys32_execveat: | ||
23 | set compat_sys_execveat, %g1 | ||
24 | jmpl %g1, %g0 | ||
25 | flushw | ||
16 | #endif | 26 | #endif |
17 | 27 | ||
18 | .align 32 | 28 | .align 32 |
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index ad0cdf497b78..e31a9056a303 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S | |||
@@ -87,3 +87,4 @@ sys_call_table: | |||
87 | /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev | 87 | /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev |
88 | /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 88 | /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
89 | /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 89 | /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
90 | /*350*/ .long sys_execveat | ||
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 580cde9370c9..d72f76ae70eb 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S | |||
@@ -88,6 +88,7 @@ sys_call_table32: | |||
88 | .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev | 88 | .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev |
89 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 89 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
90 | .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 90 | .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
91 | /*350*/ .word sys32_execveat | ||
91 | 92 | ||
92 | #endif /* CONFIG_COMPAT */ | 93 | #endif /* CONFIG_COMPAT */ |
93 | 94 | ||
@@ -167,3 +168,4 @@ sys_call_table: | |||
167 | .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev | 168 | .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev |
168 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 169 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
169 | .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 170 | .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
171 | /*350*/ .word sys64_execveat | ||
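The sparc entries above, like the x86 ones further down, wire up the same five-argument system call. A minimal userspace check (illustrative only, assuming headers that already define __NR_execveat; glibc has no wrapper at this point) could look like:

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>          /* AT_FDCWD */
    #include <sys/syscall.h>    /* __NR_execveat */

    int main(void)
    {
            char *argv[] = { "true", NULL };
            char *envp[] = { NULL };

            /* execveat(dirfd, pathname, argv, envp, flags), invoked via syscall(2) */
            syscall(__NR_execveat, AT_FDCWD, "/bin/true", argv, envp, 0);

            /* Only reached if the call failed (e.g. ENOSYS on an older kernel). */
            perror("execveat");
            return 1;
    }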
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 2d91c62f7f5f..3ea267c53320 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -1621,7 +1621,7 @@ static void __init kernel_physical_mapping_init(void) | |||
1621 | } | 1621 | } |
1622 | 1622 | ||
1623 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1623 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1624 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1624 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
1625 | { | 1625 | { |
1626 | unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; | 1626 | unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; |
1627 | unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); | 1627 | unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bea3a0159496..d69f1cd87fd9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -24,6 +24,7 @@ config X86 | |||
24 | select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI | 24 | select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI |
25 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS | 25 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS |
26 | select ARCH_HAS_FAST_MULTIPLIER | 26 | select ARCH_HAS_FAST_MULTIPLIER |
27 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
27 | select ARCH_MIGHT_HAVE_PC_PARPORT | 28 | select ARCH_MIGHT_HAVE_PC_PARPORT |
28 | select ARCH_MIGHT_HAVE_PC_SERIO | 29 | select ARCH_MIGHT_HAVE_PC_SERIO |
29 | select HAVE_AOUT if X86_32 | 30 | select HAVE_AOUT if X86_32 |
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692..2eccc8932ae6 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c | |||
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) | |||
35 | case __NR_socketcall: | 35 | case __NR_socketcall: |
36 | return 4; | 36 | return 4; |
37 | case __NR_execve: | 37 | case __NR_execve: |
38 | case __NR_execveat: | ||
38 | return 5; | 39 | return 5; |
39 | default: | 40 | default: |
40 | return 1; | 41 | return 1; |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffe71228fc10..82e8a1d44658 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -480,6 +480,7 @@ GLOBAL(\label) | |||
480 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn | 480 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn |
481 | PTREGSCALL stub32_sigreturn, sys32_sigreturn | 481 | PTREGSCALL stub32_sigreturn, sys32_sigreturn |
482 | PTREGSCALL stub32_execve, compat_sys_execve | 482 | PTREGSCALL stub32_execve, compat_sys_execve |
483 | PTREGSCALL stub32_execveat, compat_sys_execveat | ||
483 | PTREGSCALL stub32_fork, sys_fork | 484 | PTREGSCALL stub32_fork, sys_fork |
484 | PTREGSCALL stub32_vfork, sys_vfork | 485 | PTREGSCALL stub32_vfork, sys_vfork |
485 | 486 | ||
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c | |||
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) | |||
50 | case __NR_openat: | 50 | case __NR_openat: |
51 | return 3; | 51 | return 3; |
52 | case __NR_execve: | 52 | case __NR_execve: |
53 | case __NR_execveat: | ||
53 | return 5; | 54 | return 5; |
54 | default: | 55 | default: |
55 | return 0; | 56 | return 0; |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab54106..90878aa38dbd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -652,6 +652,20 @@ ENTRY(stub_execve) | |||
652 | CFI_ENDPROC | 652 | CFI_ENDPROC |
653 | END(stub_execve) | 653 | END(stub_execve) |
654 | 654 | ||
655 | ENTRY(stub_execveat) | ||
656 | CFI_STARTPROC | ||
657 | addq $8, %rsp | ||
658 | PARTIAL_FRAME 0 | ||
659 | SAVE_REST | ||
660 | FIXUP_TOP_OF_STACK %r11 | ||
661 | call sys_execveat | ||
662 | RESTORE_TOP_OF_STACK %r11 | ||
663 | movq %rax,RAX(%rsp) | ||
664 | RESTORE_REST | ||
665 | jmp int_ret_from_sys_call | ||
666 | CFI_ENDPROC | ||
667 | END(stub_execveat) | ||
668 | |||
655 | /* | 669 | /* |
656 | * sigreturn is special because it needs to restore all registers on return. | 670 | * sigreturn is special because it needs to restore all registers on return. |
657 | * This cannot be done with SYSRET, so use the IRET return path instead. | 671 | * This cannot be done with SYSRET, so use the IRET return path instead. |
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) | |||
697 | CFI_ENDPROC | 711 | CFI_ENDPROC |
698 | END(stub_x32_execve) | 712 | END(stub_x32_execve) |
699 | 713 | ||
714 | ENTRY(stub_x32_execveat) | ||
715 | CFI_STARTPROC | ||
716 | addq $8, %rsp | ||
717 | PARTIAL_FRAME 0 | ||
718 | SAVE_REST | ||
719 | FIXUP_TOP_OF_STACK %r11 | ||
720 | call compat_sys_execveat | ||
721 | RESTORE_TOP_OF_STACK %r11 | ||
722 | movq %rax,RAX(%rsp) | ||
723 | RESTORE_REST | ||
724 | jmp int_ret_from_sys_call | ||
725 | CFI_ENDPROC | ||
726 | END(stub_x32_execveat) | ||
727 | |||
700 | #endif | 728 | #endif |
701 | 729 | ||
702 | /* | 730 | /* |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3a5d46605d2..dfaf2e0f5f8f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -1817,7 +1817,7 @@ static int __set_pages_np(struct page *page, int numpages) | |||
1817 | return __change_page_attr_set_clr(&cpa, 0); | 1817 | return __change_page_attr_set_clr(&cpa, 0); |
1818 | } | 1818 | } |
1819 | 1819 | ||
1820 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1820 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
1821 | { | 1821 | { |
1822 | if (PageHighMem(page)) | 1822 | if (PageHighMem(page)) |
1823 | return; | 1823 | return; |
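The per-architecture kernel_map_pages() implementations are renamed to __kernel_map_pages(), presumably so that generic code can decide at runtime whether to call them, per the new debug_pagealloc= boot parameter. A sketch of that kind of wrapper, assuming a debug_pagealloc_enabled() predicate (the corresponding include/linux/mm.h change is not shown in this section):

    static inline void kernel_map_pages(struct page *page, int numpages,
                                        int enable)
    {
            /* Skip the expensive mapping changes unless debug_pagealloc=on was given. */
            if (!debug_pagealloc_enabled())
                    return;
            __kernel_map_pages(page, numpages, enable);
    }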
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 9fe1b5d002f0..b3560ece1c9f 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl | |||
@@ -364,3 +364,4 @@ | |||
364 | 355 i386 getrandom sys_getrandom | 364 | 355 i386 getrandom sys_getrandom |
365 | 356 i386 memfd_create sys_memfd_create | 365 | 356 i386 memfd_create sys_memfd_create |
366 | 357 i386 bpf sys_bpf | 366 | 357 i386 bpf sys_bpf |
367 | 358 i386 execveat sys_execveat stub32_execveat | ||
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 281150b539a2..8d656fbb57aa 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl | |||
@@ -328,6 +328,7 @@ | |||
328 | 319 common memfd_create sys_memfd_create | 328 | 319 common memfd_create sys_memfd_create |
329 | 320 common kexec_file_load sys_kexec_file_load | 329 | 320 common kexec_file_load sys_kexec_file_load |
330 | 321 common bpf sys_bpf | 330 | 321 common bpf sys_bpf |
331 | 322 64 execveat stub_execveat | ||
331 | 332 | ||
332 | # | 333 | # |
333 | # x32-specific system call numbers start at 512 to avoid cache impact | 334 | # x32-specific system call numbers start at 512 to avoid cache impact |
@@ -366,3 +367,4 @@ | |||
366 | 542 x32 getsockopt compat_sys_getsockopt | 367 | 542 x32 getsockopt compat_sys_getsockopt |
367 | 543 x32 io_setup compat_sys_io_setup | 368 | 543 x32 io_setup compat_sys_io_setup |
368 | 544 x32 io_submit compat_sys_io_submit | 369 | 544 x32 io_submit compat_sys_io_submit |
370 | 545 x32 execveat stub_x32_execveat | ||
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070ca..20c3649d0691 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #define stub_fork sys_fork | 31 | #define stub_fork sys_fork |
32 | #define stub_vfork sys_vfork | 32 | #define stub_vfork sys_vfork |
33 | #define stub_execve sys_execve | 33 | #define stub_execve sys_execve |
34 | #define stub_execveat sys_execveat | ||
34 | #define stub_rt_sigreturn sys_rt_sigreturn | 35 | #define stub_rt_sigreturn sys_rt_sigreturn |
35 | 36 | ||
36 | #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) | 37 | #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7c5d87191b28..85be040a21c8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -228,8 +228,8 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t | |||
228 | struct page *first_page; | 228 | struct page *first_page; |
229 | int ret; | 229 | int ret; |
230 | 230 | ||
231 | first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); | 231 | start_pfn = phys_index << PFN_SECTION_SHIFT; |
232 | start_pfn = page_to_pfn(first_page); | 232 | first_page = pfn_to_page(start_pfn); |
233 | 233 | ||
234 | switch (action) { | 234 | switch (action) { |
235 | case MEM_ONLINE: | 235 | case MEM_ONLINE: |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3920ee45aa59..bd8bda386e02 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; | |||
44 | static unsigned int num_devices = 1; | 44 | static unsigned int num_devices = 1; |
45 | 45 | ||
46 | #define ZRAM_ATTR_RO(name) \ | 46 | #define ZRAM_ATTR_RO(name) \ |
47 | static ssize_t zram_attr_##name##_show(struct device *d, \ | 47 | static ssize_t name##_show(struct device *d, \ |
48 | struct device_attribute *attr, char *b) \ | 48 | struct device_attribute *attr, char *b) \ |
49 | { \ | 49 | { \ |
50 | struct zram *zram = dev_to_zram(d); \ | 50 | struct zram *zram = dev_to_zram(d); \ |
51 | return scnprintf(b, PAGE_SIZE, "%llu\n", \ | 51 | return scnprintf(b, PAGE_SIZE, "%llu\n", \ |
52 | (u64)atomic64_read(&zram->stats.name)); \ | 52 | (u64)atomic64_read(&zram->stats.name)); \ |
53 | } \ | 53 | } \ |
54 | static struct device_attribute dev_attr_##name = \ | 54 | static DEVICE_ATTR_RO(name); |
55 | __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); | ||
56 | 55 | ||
57 | static inline int init_done(struct zram *zram) | 56 | static inline int init_done(struct zram *zram) |
58 | { | 57 | { |
@@ -287,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec) | |||
287 | /* | 286 | /* |
288 | * Check if request is within bounds and aligned on zram logical blocks. | 287 | * Check if request is within bounds and aligned on zram logical blocks. |
289 | */ | 288 | */ |
290 | static inline int valid_io_request(struct zram *zram, struct bio *bio) | 289 | static inline int valid_io_request(struct zram *zram, |
290 | sector_t start, unsigned int size) | ||
291 | { | 291 | { |
292 | u64 start, end, bound; | 292 | u64 end, bound; |
293 | 293 | ||
294 | /* unaligned request */ | 294 | /* unaligned request */ |
295 | if (unlikely(bio->bi_iter.bi_sector & | 295 | if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) |
296 | (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) | ||
297 | return 0; | 296 | return 0; |
298 | if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) | 297 | if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) |
299 | return 0; | 298 | return 0; |
300 | 299 | ||
301 | start = bio->bi_iter.bi_sector; | 300 | end = start + (size >> SECTOR_SHIFT); |
302 | end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); | ||
303 | bound = zram->disksize >> SECTOR_SHIFT; | 301 | bound = zram->disksize >> SECTOR_SHIFT; |
304 | /* out of range range */ | 302 | /* out of range range */ |
305 | if (unlikely(start >= bound || end > bound || start > end)) | 303 | if (unlikely(start >= bound || end > bound || start > end)) |
@@ -453,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) | |||
453 | } | 451 | } |
454 | 452 | ||
455 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, | 453 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, |
456 | u32 index, int offset, struct bio *bio) | 454 | u32 index, int offset) |
457 | { | 455 | { |
458 | int ret; | 456 | int ret; |
459 | struct page *page; | 457 | struct page *page; |
@@ -645,14 +643,13 @@ out: | |||
645 | } | 643 | } |
646 | 644 | ||
647 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | 645 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, |
648 | int offset, struct bio *bio) | 646 | int offset, int rw) |
649 | { | 647 | { |
650 | int ret; | 648 | int ret; |
651 | int rw = bio_data_dir(bio); | ||
652 | 649 | ||
653 | if (rw == READ) { | 650 | if (rw == READ) { |
654 | atomic64_inc(&zram->stats.num_reads); | 651 | atomic64_inc(&zram->stats.num_reads); |
655 | ret = zram_bvec_read(zram, bvec, index, offset, bio); | 652 | ret = zram_bvec_read(zram, bvec, index, offset); |
656 | } else { | 653 | } else { |
657 | atomic64_inc(&zram->stats.num_writes); | 654 | atomic64_inc(&zram->stats.num_writes); |
658 | ret = zram_bvec_write(zram, bvec, index, offset); | 655 | ret = zram_bvec_write(zram, bvec, index, offset); |
@@ -853,7 +850,7 @@ out: | |||
853 | 850 | ||
854 | static void __zram_make_request(struct zram *zram, struct bio *bio) | 851 | static void __zram_make_request(struct zram *zram, struct bio *bio) |
855 | { | 852 | { |
856 | int offset; | 853 | int offset, rw; |
857 | u32 index; | 854 | u32 index; |
858 | struct bio_vec bvec; | 855 | struct bio_vec bvec; |
859 | struct bvec_iter iter; | 856 | struct bvec_iter iter; |
@@ -868,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
868 | return; | 865 | return; |
869 | } | 866 | } |
870 | 867 | ||
868 | rw = bio_data_dir(bio); | ||
871 | bio_for_each_segment(bvec, bio, iter) { | 869 | bio_for_each_segment(bvec, bio, iter) { |
872 | int max_transfer_size = PAGE_SIZE - offset; | 870 | int max_transfer_size = PAGE_SIZE - offset; |
873 | 871 | ||
@@ -882,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
882 | bv.bv_len = max_transfer_size; | 880 | bv.bv_len = max_transfer_size; |
883 | bv.bv_offset = bvec.bv_offset; | 881 | bv.bv_offset = bvec.bv_offset; |
884 | 882 | ||
885 | if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) | 883 | if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) |
886 | goto out; | 884 | goto out; |
887 | 885 | ||
888 | bv.bv_len = bvec.bv_len - max_transfer_size; | 886 | bv.bv_len = bvec.bv_len - max_transfer_size; |
889 | bv.bv_offset += max_transfer_size; | 887 | bv.bv_offset += max_transfer_size; |
890 | if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) | 888 | if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) |
891 | goto out; | 889 | goto out; |
892 | } else | 890 | } else |
893 | if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) | 891 | if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) |
894 | goto out; | 892 | goto out; |
895 | 893 | ||
896 | update_position(&index, &offset, &bvec); | 894 | update_position(&index, &offset, &bvec); |
@@ -915,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) | |||
915 | if (unlikely(!init_done(zram))) | 913 | if (unlikely(!init_done(zram))) |
916 | goto error; | 914 | goto error; |
917 | 915 | ||
918 | if (!valid_io_request(zram, bio)) { | 916 | if (!valid_io_request(zram, bio->bi_iter.bi_sector, |
917 | bio->bi_iter.bi_size)) { | ||
919 | atomic64_inc(&zram->stats.invalid_io); | 918 | atomic64_inc(&zram->stats.invalid_io); |
920 | goto error; | 919 | goto error; |
921 | } | 920 | } |
@@ -945,25 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev, | |||
945 | atomic64_inc(&zram->stats.notify_free); | 944 | atomic64_inc(&zram->stats.notify_free); |
946 | } | 945 | } |
947 | 946 | ||
947 | static int zram_rw_page(struct block_device *bdev, sector_t sector, | ||
948 | struct page *page, int rw) | ||
949 | { | ||
950 | int offset, err; | ||
951 | u32 index; | ||
952 | struct zram *zram; | ||
953 | struct bio_vec bv; | ||
954 | |||
955 | zram = bdev->bd_disk->private_data; | ||
956 | if (!valid_io_request(zram, sector, PAGE_SIZE)) { | ||
957 | atomic64_inc(&zram->stats.invalid_io); | ||
958 | return -EINVAL; | ||
959 | } | ||
960 | |||
961 | down_read(&zram->init_lock); | ||
962 | if (unlikely(!init_done(zram))) { | ||
963 | err = -EIO; | ||
964 | goto out_unlock; | ||
965 | } | ||
966 | |||
967 | index = sector >> SECTORS_PER_PAGE_SHIFT; | ||
968 | offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; | ||
969 | |||
970 | bv.bv_page = page; | ||
971 | bv.bv_len = PAGE_SIZE; | ||
972 | bv.bv_offset = 0; | ||
973 | |||
974 | err = zram_bvec_rw(zram, &bv, index, offset, rw); | ||
975 | out_unlock: | ||
976 | up_read(&zram->init_lock); | ||
977 | /* | ||
978 | * If I/O fails, just return error(ie, non-zero) without | ||
979 | * calling page_endio. | ||
980 | * It causes resubmit the I/O with bio request by upper functions | ||
981 | * of rw_page(e.g., swap_readpage, __swap_writepage) and | ||
982 | * bio->bi_end_io does things to handle the error | ||
983 | * (e.g., SetPageError, set_page_dirty and extra works). | ||
984 | */ | ||
985 | if (err == 0) | ||
986 | page_endio(page, rw, 0); | ||
987 | return err; | ||
988 | } | ||
989 | |||
948 | static const struct block_device_operations zram_devops = { | 990 | static const struct block_device_operations zram_devops = { |
949 | .swap_slot_free_notify = zram_slot_free_notify, | 991 | .swap_slot_free_notify = zram_slot_free_notify, |
992 | .rw_page = zram_rw_page, | ||
950 | .owner = THIS_MODULE | 993 | .owner = THIS_MODULE |
951 | }; | 994 | }; |
952 | 995 | ||
953 | static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, | 996 | static DEVICE_ATTR_RW(disksize); |
954 | disksize_show, disksize_store); | 997 | static DEVICE_ATTR_RO(initstate); |
955 | static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); | 998 | static DEVICE_ATTR_WO(reset); |
956 | static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); | 999 | static DEVICE_ATTR_RO(orig_data_size); |
957 | static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); | 1000 | static DEVICE_ATTR_RO(mem_used_total); |
958 | static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); | 1001 | static DEVICE_ATTR_RW(mem_limit); |
959 | static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, | 1002 | static DEVICE_ATTR_RW(mem_used_max); |
960 | mem_limit_store); | 1003 | static DEVICE_ATTR_RW(max_comp_streams); |
961 | static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, | 1004 | static DEVICE_ATTR_RW(comp_algorithm); |
962 | mem_used_max_store); | ||
963 | static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, | ||
964 | max_comp_streams_show, max_comp_streams_store); | ||
965 | static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, | ||
966 | comp_algorithm_show, comp_algorithm_store); | ||
967 | 1005 | ||
968 | ZRAM_ATTR_RO(num_reads); | 1006 | ZRAM_ATTR_RO(num_reads); |
969 | ZRAM_ATTR_RO(num_writes); | 1007 | ZRAM_ATTR_RO(num_writes); |
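The sysfs cleanup above relies on the DEVICE_ATTR_RO()/RW()/WO() helpers, which declare dev_attr_<name> and expect callbacks named <name>_show and/or <name>_store; that is why ZRAM_ATTR_RO() now generates a plain name##_show() function. In isolation, with a made-up attribute, the read-only pattern looks like:

    #include <linux/kernel.h>
    #include <linux/device.h>

    /* DEVICE_ATTR_RO(foo) expects a show callback named foo_show ... */
    static ssize_t foo_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
    {
            return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
    }

    /* ... and declares struct device_attribute dev_attr_foo with 0444 permissions. */
    static DEVICE_ATTR_RO(foo);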
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h | |||
@@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; | |||
66 | /* Flags for zram pages (table[page_no].value) */ | 66 | /* Flags for zram pages (table[page_no].value) */ |
67 | enum zram_pageflags { | 67 | enum zram_pageflags { |
68 | /* Page consists entirely of zeros */ | 68 | /* Page consists entirely of zeros */ |
69 | ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, | 69 | ZRAM_ZERO = ZRAM_FLAG_SHIFT, |
70 | ZRAM_ACCESS, /* page in now accessed */ | 70 | ZRAM_ACCESS, /* page is now accessed */ |
71 | 71 | ||
72 | __NR_ZRAM_PAGEFLAGS, | 72 | __NR_ZRAM_PAGEFLAGS, |
73 | }; | 73 | }; |
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index a2d87a60c27f..bea878f8e7d3 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c | |||
@@ -509,45 +509,67 @@ static void finish_pri_tag(struct device_state *dev_state, | |||
509 | spin_unlock_irqrestore(&pasid_state->lock, flags); | 509 | spin_unlock_irqrestore(&pasid_state->lock, flags); |
510 | } | 510 | } |
511 | 511 | ||
512 | static void handle_fault_error(struct fault *fault) | ||
513 | { | ||
514 | int status; | ||
515 | |||
516 | if (!fault->dev_state->inv_ppr_cb) { | ||
517 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
518 | return; | ||
519 | } | ||
520 | |||
521 | status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, | ||
522 | fault->pasid, | ||
523 | fault->address, | ||
524 | fault->flags); | ||
525 | switch (status) { | ||
526 | case AMD_IOMMU_INV_PRI_RSP_SUCCESS: | ||
527 | set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); | ||
528 | break; | ||
529 | case AMD_IOMMU_INV_PRI_RSP_INVALID: | ||
530 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
531 | break; | ||
532 | case AMD_IOMMU_INV_PRI_RSP_FAIL: | ||
533 | set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); | ||
534 | break; | ||
535 | default: | ||
536 | BUG(); | ||
537 | } | ||
538 | } | ||
539 | |||
512 | static void do_fault(struct work_struct *work) | 540 | static void do_fault(struct work_struct *work) |
513 | { | 541 | { |
514 | struct fault *fault = container_of(work, struct fault, work); | 542 | struct fault *fault = container_of(work, struct fault, work); |
515 | int npages, write; | 543 | struct mm_struct *mm; |
516 | struct page *page; | 544 | struct vm_area_struct *vma; |
545 | u64 address; | ||
546 | int ret, write; | ||
517 | 547 | ||
518 | write = !!(fault->flags & PPR_FAULT_WRITE); | 548 | write = !!(fault->flags & PPR_FAULT_WRITE); |
519 | 549 | ||
520 | down_read(&fault->state->mm->mmap_sem); | 550 | mm = fault->state->mm; |
521 | npages = get_user_pages(NULL, fault->state->mm, | 551 | address = fault->address; |
522 | fault->address, 1, write, 0, &page, NULL); | 552 | |
523 | up_read(&fault->state->mm->mmap_sem); | 553 | down_read(&mm->mmap_sem); |
524 | 554 | vma = find_extend_vma(mm, address); | |
525 | if (npages == 1) { | 555 | if (!vma || address < vma->vm_start) { |
526 | put_page(page); | 556 | /* failed to get a vma in the right range */ |
527 | } else if (fault->dev_state->inv_ppr_cb) { | 557 | up_read(&mm->mmap_sem); |
528 | int status; | 558 | handle_fault_error(fault); |
529 | 559 | goto out; | |
530 | status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, | 560 | } |
531 | fault->pasid, | 561 | |
532 | fault->address, | 562 | ret = handle_mm_fault(mm, vma, address, write); |
533 | fault->flags); | 563 | if (ret & VM_FAULT_ERROR) { |
534 | switch (status) { | 564 | /* failed to service fault */ |
535 | case AMD_IOMMU_INV_PRI_RSP_SUCCESS: | 565 | up_read(&mm->mmap_sem); |
536 | set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); | 566 | handle_fault_error(fault); |
537 | break; | 567 | goto out; |
538 | case AMD_IOMMU_INV_PRI_RSP_INVALID: | ||
539 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
540 | break; | ||
541 | case AMD_IOMMU_INV_PRI_RSP_FAIL: | ||
542 | set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); | ||
543 | break; | ||
544 | default: | ||
545 | BUG(); | ||
546 | } | ||
547 | } else { | ||
548 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
549 | } | 568 | } |
550 | 569 | ||
570 | up_read(&mm->mmap_sem); | ||
571 | |||
572 | out: | ||
551 | finish_pri_tag(fault->dev_state, fault->state, fault->tag); | 573 | finish_pri_tag(fault->dev_state, fault->state, fault->tag); |
552 | 574 | ||
553 | put_pasid_state(fault->state); | 575 | put_pasid_state(fault->state); |
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index 2cd8ffe5c698..942b267c6271 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c | |||
@@ -344,13 +344,20 @@ static int snvs_rtc_resume(struct device *dev) | |||
344 | 344 | ||
345 | return 0; | 345 | return 0; |
346 | } | 346 | } |
347 | #endif | ||
348 | 347 | ||
349 | static const struct dev_pm_ops snvs_rtc_pm_ops = { | 348 | static const struct dev_pm_ops snvs_rtc_pm_ops = { |
350 | .suspend_noirq = snvs_rtc_suspend, | 349 | .suspend_noirq = snvs_rtc_suspend, |
351 | .resume_noirq = snvs_rtc_resume, | 350 | .resume_noirq = snvs_rtc_resume, |
352 | }; | 351 | }; |
353 | 352 | ||
353 | #define SNVS_RTC_PM_OPS (&snvs_rtc_pm_ops) | ||
354 | |||
355 | #else | ||
356 | |||
357 | #define SNVS_RTC_PM_OPS NULL | ||
358 | |||
359 | #endif | ||
360 | |||
354 | static const struct of_device_id snvs_dt_ids[] = { | 361 | static const struct of_device_id snvs_dt_ids[] = { |
355 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, | 362 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, |
356 | { /* sentinel */ } | 363 | { /* sentinel */ } |
@@ -361,7 +368,7 @@ static struct platform_driver snvs_rtc_driver = { | |||
361 | .driver = { | 368 | .driver = { |
362 | .name = "snvs_rtc", | 369 | .name = "snvs_rtc", |
363 | .owner = THIS_MODULE, | 370 | .owner = THIS_MODULE, |
364 | .pm = &snvs_rtc_pm_ops, | 371 | .pm = SNVS_RTC_PM_OPS, |
365 | .of_match_table = snvs_dt_ids, | 372 | .of_match_table = snvs_dt_ids, |
366 | }, | 373 | }, |
367 | .probe = snvs_rtc_probe, | 374 | .probe = snvs_rtc_probe, |
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ad4f5790a76f..46f8ef42559e 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c | |||
@@ -418,7 +418,7 @@ out: | |||
418 | } | 418 | } |
419 | 419 | ||
420 | /* | 420 | /* |
421 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab | 421 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c |
422 | * | 422 | * |
423 | * 'nr_to_scan' is the number of objects to scan for freeing. | 423 | * 'nr_to_scan' is the number of objects to scan for freeing. |
424 | * | 424 | * |
@@ -785,7 +785,6 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
785 | .nr_to_scan = LONG_MAX, | 785 | .nr_to_scan = LONG_MAX, |
786 | }; | 786 | }; |
787 | ret = ashmem_shrink_count(&ashmem_shrinker, &sc); | 787 | ret = ashmem_shrink_count(&ashmem_shrinker, &sc); |
788 | nodes_setall(sc.nodes_to_scan); | ||
789 | ashmem_shrink_scan(&ashmem_shrinker, &sc); | 788 | ashmem_shrink_scan(&ashmem_shrinker, &sc); |
790 | } | 789 | } |
791 | break; | 790 | break; |
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 9bca88159725..ff44ff3ff015 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h | |||
@@ -135,8 +135,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); | |||
135 | extern void secs_to_datestamp(time_t secs, struct affs_date *ds); | 135 | extern void secs_to_datestamp(time_t secs, struct affs_date *ds); |
136 | extern umode_t prot_to_mode(u32 prot); | 136 | extern umode_t prot_to_mode(u32 prot); |
137 | extern void mode_to_prot(struct inode *inode); | 137 | extern void mode_to_prot(struct inode *inode); |
138 | __printf(3, 4) | ||
138 | extern void affs_error(struct super_block *sb, const char *function, | 139 | extern void affs_error(struct super_block *sb, const char *function, |
139 | const char *fmt, ...); | 140 | const char *fmt, ...); |
141 | __printf(3, 4) | ||
140 | extern void affs_warning(struct super_block *sb, const char *function, | 142 | extern void affs_warning(struct super_block *sb, const char *function, |
141 | const char *fmt, ...); | 143 | const char *fmt, ...); |
142 | extern bool affs_nofilenametruncate(const struct dentry *dentry); | 144 | extern bool affs_nofilenametruncate(const struct dentry *dentry); |
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 937ce8754b24..c852f2fa1710 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c | |||
@@ -10,8 +10,6 @@ | |||
10 | 10 | ||
11 | #include "affs.h" | 11 | #include "affs.h" |
12 | 12 | ||
13 | static char ErrorBuffer[256]; | ||
14 | |||
15 | /* | 13 | /* |
16 | * Functions for accessing Amiga-FFS structures. | 14 | * Functions for accessing Amiga-FFS structures. |
17 | */ | 15 | */ |
@@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode) | |||
444 | void | 442 | void |
445 | affs_error(struct super_block *sb, const char *function, const char *fmt, ...) | 443 | affs_error(struct super_block *sb, const char *function, const char *fmt, ...) |
446 | { | 444 | { |
447 | va_list args; | 445 | struct va_format vaf; |
448 | 446 | va_list args; | |
449 | va_start(args,fmt); | ||
450 | vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); | ||
451 | va_end(args); | ||
452 | 447 | ||
453 | pr_crit("error (device %s): %s(): %s\n", sb->s_id, | 448 | va_start(args, fmt); |
454 | function,ErrorBuffer); | 449 | vaf.fmt = fmt; |
450 | vaf.va = &args; | ||
451 | pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); | ||
455 | if (!(sb->s_flags & MS_RDONLY)) | 452 | if (!(sb->s_flags & MS_RDONLY)) |
456 | pr_warn("Remounting filesystem read-only\n"); | 453 | pr_warn("Remounting filesystem read-only\n"); |
457 | sb->s_flags |= MS_RDONLY; | 454 | sb->s_flags |= MS_RDONLY; |
455 | va_end(args); | ||
458 | } | 456 | } |
459 | 457 | ||
460 | void | 458 | void |
461 | affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) | 459 | affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) |
462 | { | 460 | { |
463 | va_list args; | 461 | struct va_format vaf; |
462 | va_list args; | ||
464 | 463 | ||
465 | va_start(args,fmt); | 464 | va_start(args, fmt); |
466 | vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); | 465 | vaf.fmt = fmt; |
466 | vaf.va = &args; | ||
467 | pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); | ||
467 | va_end(args); | 468 | va_end(args); |
468 | |||
469 | pr_warn("(device %s): %s(): %s\n", sb->s_id, | ||
470 | function,ErrorBuffer); | ||
471 | } | 469 | } |
472 | 470 | ||
473 | bool | 471 | bool |
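In amigaffs.c the fixed 256-byte ErrorBuffer goes away: affs_error() and affs_warning() now wrap the format string and va_list in a struct va_format and let printk expand it via %pV, so messages are never truncated and the helpers no longer share writable static state. A rough userspace analogue of the new shape, with vfprintf standing in for %pV (the function and device names are illustrative, not from the patch):

    #include <stdarg.h>
    #include <stdio.h>

    /* New shape: no static buffer; the va_list is consumed directly by the
     * final output call, so the helper is reentrant and never truncates. */
    static void fs_error(const char *dev, const char *func, const char *fmt, ...)
    {
            va_list args;

            va_start(args, fmt);
            fprintf(stderr, "error (device %s): %s(): ", dev, func);
            vfprintf(stderr, fmt, args);
            fputc('\n', stderr);
            va_end(args);
    }

    int main(void)
    {
            fs_error("sda1", "affs_error", "bitmap checksum invalid (%u)", 42u);
            return 0;
    }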
diff --git a/fs/affs/file.c b/fs/affs/file.c index 1ed590aafecf..8faa6593ca6d 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c | |||
@@ -12,35 +12,10 @@ | |||
12 | * affs regular file handling primitives | 12 | * affs regular file handling primitives |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/aio.h> | ||
15 | #include "affs.h" | 16 | #include "affs.h" |
16 | 17 | ||
17 | #if PAGE_SIZE < 4096 | ||
18 | #error PAGE_SIZE must be at least 4096 | ||
19 | #endif | ||
20 | |||
21 | static int affs_grow_extcache(struct inode *inode, u32 lc_idx); | ||
22 | static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); | ||
23 | static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); | ||
24 | static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); | 18 | static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); |
25 | static int affs_file_open(struct inode *inode, struct file *filp); | ||
26 | static int affs_file_release(struct inode *inode, struct file *filp); | ||
27 | |||
28 | const struct file_operations affs_file_operations = { | ||
29 | .llseek = generic_file_llseek, | ||
30 | .read = new_sync_read, | ||
31 | .read_iter = generic_file_read_iter, | ||
32 | .write = new_sync_write, | ||
33 | .write_iter = generic_file_write_iter, | ||
34 | .mmap = generic_file_mmap, | ||
35 | .open = affs_file_open, | ||
36 | .release = affs_file_release, | ||
37 | .fsync = affs_file_fsync, | ||
38 | .splice_read = generic_file_splice_read, | ||
39 | }; | ||
40 | |||
41 | const struct inode_operations affs_file_inode_operations = { | ||
42 | .setattr = affs_notify_change, | ||
43 | }; | ||
44 | 19 | ||
45 | static int | 20 | static int |
46 | affs_file_open(struct inode *inode, struct file *filp) | 21 | affs_file_open(struct inode *inode, struct file *filp) |
@@ -355,7 +330,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul | |||
355 | 330 | ||
356 | /* store new block */ | 331 | /* store new block */ |
357 | if (bh_result->b_blocknr) | 332 | if (bh_result->b_blocknr) |
358 | affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); | 333 | affs_warning(sb, "get_block", "block already set (%lx)", |
334 | (unsigned long)bh_result->b_blocknr); | ||
359 | AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); | 335 | AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); |
360 | AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); | 336 | AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); |
361 | affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); | 337 | affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); |
@@ -377,7 +353,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul | |||
377 | return 0; | 353 | return 0; |
378 | 354 | ||
379 | err_big: | 355 | err_big: |
380 | affs_error(inode->i_sb,"get_block","strange block request %d", block); | 356 | affs_error(inode->i_sb, "get_block", "strange block request %d", |
357 | (int)block); | ||
381 | return -EIO; | 358 | return -EIO; |
382 | err_ext: | 359 | err_ext: |
383 | // unlock cache | 360 | // unlock cache |
@@ -412,6 +389,22 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) | |||
412 | } | 389 | } |
413 | } | 390 | } |
414 | 391 | ||
392 | static ssize_t | ||
393 | affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, | ||
394 | loff_t offset) | ||
395 | { | ||
396 | struct file *file = iocb->ki_filp; | ||
397 | struct address_space *mapping = file->f_mapping; | ||
398 | struct inode *inode = mapping->host; | ||
399 | size_t count = iov_iter_count(iter); | ||
400 | ssize_t ret; | ||
401 | |||
402 | ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); | ||
403 | if (ret < 0 && (rw & WRITE)) | ||
404 | affs_write_failed(mapping, offset + count); | ||
405 | return ret; | ||
406 | } | ||
407 | |||
415 | static int affs_write_begin(struct file *file, struct address_space *mapping, | 408 | static int affs_write_begin(struct file *file, struct address_space *mapping, |
416 | loff_t pos, unsigned len, unsigned flags, | 409 | loff_t pos, unsigned len, unsigned flags, |
417 | struct page **pagep, void **fsdata) | 410 | struct page **pagep, void **fsdata) |
@@ -438,6 +431,7 @@ const struct address_space_operations affs_aops = { | |||
438 | .writepage = affs_writepage, | 431 | .writepage = affs_writepage, |
439 | .write_begin = affs_write_begin, | 432 | .write_begin = affs_write_begin, |
440 | .write_end = generic_write_end, | 433 | .write_end = generic_write_end, |
434 | .direct_IO = affs_direct_IO, | ||
441 | .bmap = _affs_bmap | 435 | .bmap = _affs_bmap |
442 | }; | 436 | }; |
443 | 437 | ||
@@ -867,8 +861,9 @@ affs_truncate(struct inode *inode) | |||
867 | // lock cache | 861 | // lock cache |
868 | ext_bh = affs_get_extblock(inode, ext); | 862 | ext_bh = affs_get_extblock(inode, ext); |
869 | if (IS_ERR(ext_bh)) { | 863 | if (IS_ERR(ext_bh)) { |
870 | affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", | 864 | affs_warning(sb, "truncate", |
871 | ext, PTR_ERR(ext_bh)); | 865 | "unexpected read error for ext block %u (%ld)", |
866 | (unsigned int)ext, PTR_ERR(ext_bh)); | ||
872 | return; | 867 | return; |
873 | } | 868 | } |
874 | if (AFFS_I(inode)->i_lc) { | 869 | if (AFFS_I(inode)->i_lc) { |
@@ -914,8 +909,9 @@ affs_truncate(struct inode *inode) | |||
914 | struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); | 909 | struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); |
915 | u32 tmp; | 910 | u32 tmp; |
916 | if (IS_ERR(bh)) { | 911 | if (IS_ERR(bh)) { |
917 | affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", | 912 | affs_warning(sb, "truncate", |
918 | ext, PTR_ERR(bh)); | 913 | "unexpected read error for last block %u (%ld)", |
914 | (unsigned int)ext, PTR_ERR(bh)); | ||
919 | return; | 915 | return; |
920 | } | 916 | } |
921 | tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); | 917 | tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); |
@@ -961,3 +957,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) | |||
961 | mutex_unlock(&inode->i_mutex); | 957 | mutex_unlock(&inode->i_mutex); |
962 | return ret; | 958 | return ret; |
963 | } | 959 | } |
960 | const struct file_operations affs_file_operations = { | ||
961 | .llseek = generic_file_llseek, | ||
962 | .read = new_sync_read, | ||
963 | .read_iter = generic_file_read_iter, | ||
964 | .write = new_sync_write, | ||
965 | .write_iter = generic_file_write_iter, | ||
966 | .mmap = generic_file_mmap, | ||
967 | .open = affs_file_open, | ||
968 | .release = affs_file_release, | ||
969 | .fsync = affs_file_fsync, | ||
970 | .splice_read = generic_file_splice_read, | ||
971 | }; | ||
972 | |||
973 | const struct inode_operations affs_file_inode_operations = { | ||
974 | .setattr = affs_notify_change, | ||
975 | }; | ||
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index b94d1cc9cd30..edf47774b03d 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c | |||
@@ -269,10 +269,6 @@ more: | |||
269 | } | 269 | } |
270 | ctx->pos++; | 270 | ctx->pos++; |
271 | goto more; | 271 | goto more; |
272 | |||
273 | befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); | ||
274 | |||
275 | return 0; | ||
276 | } | 272 | } |
277 | 273 | ||
278 | static struct inode * | 274 | static struct inode * |
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f37b08cea1f7..490538536cb4 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c | |||
@@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm) | |||
42 | return -ENOEXEC; | 42 | return -ENOEXEC; |
43 | } | 43 | } |
44 | 44 | ||
45 | /* Need to be able to load the file after exec */ | ||
46 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
47 | return -ENOENT; | ||
48 | |||
45 | allow_write_access(bprm->file); | 49 | allow_write_access(bprm->file); |
46 | fput(bprm->file); | 50 | fput(bprm->file); |
47 | bprm->file = NULL; | 51 | bprm->file = NULL; |
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 70789e198dea..c04ef1d4f18a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c | |||
@@ -144,6 +144,10 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
144 | if (!fmt) | 144 | if (!fmt) |
145 | goto ret; | 145 | goto ret; |
146 | 146 | ||
147 | /* Need to be able to load the file after exec */ | ||
148 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
149 | return -ENOENT; | ||
150 | |||
147 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { | 151 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { |
148 | retval = remove_arg_zero(bprm); | 152 | retval = remove_arg_zero(bprm); |
149 | if (retval) | 153 | if (retval) |
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 5027a3e14922..afdf4e3cafc2 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c | |||
@@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm) | |||
24 | 24 | ||
25 | if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) | 25 | if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) |
26 | return -ENOEXEC; | 26 | return -ENOEXEC; |
27 | |||
28 | /* | ||
29 | * If the script filename will be inaccessible after exec, typically | ||
30 | * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give | ||
31 | * up now (on the assumption that the interpreter will want to load | ||
32 | * this file). | ||
33 | */ | ||
34 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
35 | return -ENOENT; | ||
36 | |||
27 | /* | 37 | /* |
28 | * This section does the #! interpretation. | 38 | * This section does the #! interpretation. |
29 | * Sorta complicated, but hopefully it will work. -TYT | 39 | * Sorta complicated, but hopefully it will work. -TYT |
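binfmt_em86, binfmt_misc and binfmt_script now all bail out with -ENOENT when BINPRM_FLAGS_PATH_INACCESSIBLE is set, i.e. when the program was reached through a name such as /dev/fd/<fd> that will no longer resolve once the fd is closed at exec time. The typical trigger is execveat() (added later in this series) on a script opened with O_CLOEXEC; a small demonstration of that failure mode, assuming an executable ./script.sh and a libc that defines SYS_execveat:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            char *argv[] = { "script.sh", NULL };
            char *envp[] = { NULL };
            /* O_CLOEXEC means /dev/fd/<fd> vanishes at exec time, so the #!
             * interpreter could never reload the script it is meant to run. */
            int fd = open("./script.sh", O_RDONLY | O_CLOEXEC);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
            /* load_script() now refuses up front instead of exec'ing an
             * interpreter that would fail later: expect ENOENT here. */
            printf("execveat: %s\n", strerror(errno));
            return 0;
    }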
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 1de7294aad20..2bc2c87f35e7 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c | |||
@@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) | |||
40 | static void drop_slab(void) | 40 | static void drop_slab(void) |
41 | { | 41 | { |
42 | int nr_objects; | 42 | int nr_objects; |
43 | struct shrink_control shrink = { | ||
44 | .gfp_mask = GFP_KERNEL, | ||
45 | }; | ||
46 | 43 | ||
47 | nodes_setall(shrink.nodes_to_scan); | ||
48 | do { | 44 | do { |
49 | nr_objects = shrink_slab(&shrink, 1000, 1000); | 45 | int nid; |
46 | |||
47 | nr_objects = 0; | ||
48 | for_each_online_node(nid) | ||
49 | nr_objects += shrink_node_slabs(GFP_KERNEL, nid, | ||
50 | 1000, 1000); | ||
50 | } while (nr_objects > 10); | 51 | } while (nr_objects > 10); |
51 | } | 52 | } |
52 | 53 | ||
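drop_slab() no longer builds a shrink_control with nodes_to_scan (the same field whose setup was dropped from the ashmem ioctl above); it now walks every online node, sums what shrink_node_slabs() frees per node, and repeats until a whole pass frees only a handful of objects. A schematic userspace model of that control flow, with reclaim_node() as a purely hypothetical stand-in for the per-node reclaim call:

    #include <stdio.h>

    #define NR_NODES 4

    /* Hypothetical stand-in for shrink_node_slabs(): each node holds a pool
     * of reclaimable objects and each pass frees at most 1000 of them. */
    static int pool[NR_NODES] = { 2500, 300, 0, 4200 };

    static int reclaim_node(int nid)
    {
            int freed = pool[nid] > 1000 ? 1000 : pool[nid];

            pool[nid] -= freed;
            return freed;
    }

    int main(void)
    {
            int nr_objects;

            /* Same shape as the new drop_slab(): per-node passes, repeated
             * while a full pass still frees more than a small threshold. */
            do {
                    int nid;

                    nr_objects = 0;
                    for (nid = 0; nid < NR_NODES; nid++)
                            nr_objects += reclaim_node(nid);
                    printf("pass freed %d objects\n", nr_objects);
            } while (nr_objects > 10);

            return 0;
    }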
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -748,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages); | |||
748 | 748 | ||
749 | #endif /* CONFIG_MMU */ | 749 | #endif /* CONFIG_MMU */ |
750 | 750 | ||
751 | static struct file *do_open_exec(struct filename *name) | 751 | static struct file *do_open_execat(int fd, struct filename *name, int flags) |
752 | { | 752 | { |
753 | struct file *file; | 753 | struct file *file; |
754 | int err; | 754 | int err; |
755 | static const struct open_flags open_exec_flags = { | 755 | struct open_flags open_exec_flags = { |
756 | .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, | 756 | .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, |
757 | .acc_mode = MAY_EXEC | MAY_OPEN, | 757 | .acc_mode = MAY_EXEC | MAY_OPEN, |
758 | .intent = LOOKUP_OPEN, | 758 | .intent = LOOKUP_OPEN, |
759 | .lookup_flags = LOOKUP_FOLLOW, | 759 | .lookup_flags = LOOKUP_FOLLOW, |
760 | }; | 760 | }; |
761 | 761 | ||
762 | file = do_filp_open(AT_FDCWD, name, &open_exec_flags); | 762 | if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) |
763 | return ERR_PTR(-EINVAL); | ||
764 | if (flags & AT_SYMLINK_NOFOLLOW) | ||
765 | open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; | ||
766 | if (flags & AT_EMPTY_PATH) | ||
767 | open_exec_flags.lookup_flags |= LOOKUP_EMPTY; | ||
768 | |||
769 | file = do_filp_open(fd, name, &open_exec_flags); | ||
763 | if (IS_ERR(file)) | 770 | if (IS_ERR(file)) |
764 | goto out; | 771 | goto out; |
765 | 772 | ||
@@ -770,12 +777,13 @@ static struct file *do_open_exec(struct filename *name) | |||
770 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 777 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
771 | goto exit; | 778 | goto exit; |
772 | 779 | ||
773 | fsnotify_open(file); | ||
774 | |||
775 | err = deny_write_access(file); | 780 | err = deny_write_access(file); |
776 | if (err) | 781 | if (err) |
777 | goto exit; | 782 | goto exit; |
778 | 783 | ||
784 | if (name->name[0] != '\0') | ||
785 | fsnotify_open(file); | ||
786 | |||
779 | out: | 787 | out: |
780 | return file; | 788 | return file; |
781 | 789 | ||
@@ -787,7 +795,7 @@ exit: | |||
787 | struct file *open_exec(const char *name) | 795 | struct file *open_exec(const char *name) |
788 | { | 796 | { |
789 | struct filename tmp = { .name = name }; | 797 | struct filename tmp = { .name = name }; |
790 | return do_open_exec(&tmp); | 798 | return do_open_execat(AT_FDCWD, &tmp, 0); |
791 | } | 799 | } |
792 | EXPORT_SYMBOL(open_exec); | 800 | EXPORT_SYMBOL(open_exec); |
793 | 801 | ||
@@ -1428,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm) | |||
1428 | /* | 1436 | /* |
1429 | * sys_execve() executes a new program. | 1437 | * sys_execve() executes a new program. |
1430 | */ | 1438 | */ |
1431 | static int do_execve_common(struct filename *filename, | 1439 | static int do_execveat_common(int fd, struct filename *filename, |
1432 | struct user_arg_ptr argv, | 1440 | struct user_arg_ptr argv, |
1433 | struct user_arg_ptr envp) | 1441 | struct user_arg_ptr envp, |
1442 | int flags) | ||
1434 | { | 1443 | { |
1444 | char *pathbuf = NULL; | ||
1435 | struct linux_binprm *bprm; | 1445 | struct linux_binprm *bprm; |
1436 | struct file *file; | 1446 | struct file *file; |
1437 | struct files_struct *displaced; | 1447 | struct files_struct *displaced; |
@@ -1472,7 +1482,7 @@ static int do_execve_common(struct filename *filename, | |||
1472 | check_unsafe_exec(bprm); | 1482 | check_unsafe_exec(bprm); |
1473 | current->in_execve = 1; | 1483 | current->in_execve = 1; |
1474 | 1484 | ||
1475 | file = do_open_exec(filename); | 1485 | file = do_open_execat(fd, filename, flags); |
1476 | retval = PTR_ERR(file); | 1486 | retval = PTR_ERR(file); |
1477 | if (IS_ERR(file)) | 1487 | if (IS_ERR(file)) |
1478 | goto out_unmark; | 1488 | goto out_unmark; |
@@ -1480,7 +1490,28 @@ static int do_execve_common(struct filename *filename, | |||
1480 | sched_exec(); | 1490 | sched_exec(); |
1481 | 1491 | ||
1482 | bprm->file = file; | 1492 | bprm->file = file; |
1483 | bprm->filename = bprm->interp = filename->name; | 1493 | if (fd == AT_FDCWD || filename->name[0] == '/') { |
1494 | bprm->filename = filename->name; | ||
1495 | } else { | ||
1496 | if (filename->name[0] == '\0') | ||
1497 | pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); | ||
1498 | else | ||
1499 | pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", | ||
1500 | fd, filename->name); | ||
1501 | if (!pathbuf) { | ||
1502 | retval = -ENOMEM; | ||
1503 | goto out_unmark; | ||
1504 | } | ||
1505 | /* | ||
1506 | * Record that a name derived from an O_CLOEXEC fd will be | ||
1507 | * inaccessible after exec. Relies on having exclusive access to | ||
1508 | * current->files (due to unshare_files above). | ||
1509 | */ | ||
1510 | if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) | ||
1511 | bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; | ||
1512 | bprm->filename = pathbuf; | ||
1513 | } | ||
1514 | bprm->interp = bprm->filename; | ||
1484 | 1515 | ||
1485 | retval = bprm_mm_init(bprm); | 1516 | retval = bprm_mm_init(bprm); |
1486 | if (retval) | 1517 | if (retval) |
@@ -1521,6 +1552,7 @@ static int do_execve_common(struct filename *filename, | |||
1521 | acct_update_integrals(current); | 1552 | acct_update_integrals(current); |
1522 | task_numa_free(current); | 1553 | task_numa_free(current); |
1523 | free_bprm(bprm); | 1554 | free_bprm(bprm); |
1555 | kfree(pathbuf); | ||
1524 | putname(filename); | 1556 | putname(filename); |
1525 | if (displaced) | 1557 | if (displaced) |
1526 | put_files_struct(displaced); | 1558 | put_files_struct(displaced); |
@@ -1538,6 +1570,7 @@ out_unmark: | |||
1538 | 1570 | ||
1539 | out_free: | 1571 | out_free: |
1540 | free_bprm(bprm); | 1572 | free_bprm(bprm); |
1573 | kfree(pathbuf); | ||
1541 | 1574 | ||
1542 | out_files: | 1575 | out_files: |
1543 | if (displaced) | 1576 | if (displaced) |
@@ -1553,7 +1586,18 @@ int do_execve(struct filename *filename, | |||
1553 | { | 1586 | { |
1554 | struct user_arg_ptr argv = { .ptr.native = __argv }; | 1587 | struct user_arg_ptr argv = { .ptr.native = __argv }; |
1555 | struct user_arg_ptr envp = { .ptr.native = __envp }; | 1588 | struct user_arg_ptr envp = { .ptr.native = __envp }; |
1556 | return do_execve_common(filename, argv, envp); | 1589 | return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); |
1590 | } | ||
1591 | |||
1592 | int do_execveat(int fd, struct filename *filename, | ||
1593 | const char __user *const __user *__argv, | ||
1594 | const char __user *const __user *__envp, | ||
1595 | int flags) | ||
1596 | { | ||
1597 | struct user_arg_ptr argv = { .ptr.native = __argv }; | ||
1598 | struct user_arg_ptr envp = { .ptr.native = __envp }; | ||
1599 | |||
1600 | return do_execveat_common(fd, filename, argv, envp, flags); | ||
1557 | } | 1601 | } |
1558 | 1602 | ||
1559 | #ifdef CONFIG_COMPAT | 1603 | #ifdef CONFIG_COMPAT |
@@ -1569,7 +1613,23 @@ static int compat_do_execve(struct filename *filename, | |||
1569 | .is_compat = true, | 1613 | .is_compat = true, |
1570 | .ptr.compat = __envp, | 1614 | .ptr.compat = __envp, |
1571 | }; | 1615 | }; |
1572 | return do_execve_common(filename, argv, envp); | 1616 | return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); |
1617 | } | ||
1618 | |||
1619 | static int compat_do_execveat(int fd, struct filename *filename, | ||
1620 | const compat_uptr_t __user *__argv, | ||
1621 | const compat_uptr_t __user *__envp, | ||
1622 | int flags) | ||
1623 | { | ||
1624 | struct user_arg_ptr argv = { | ||
1625 | .is_compat = true, | ||
1626 | .ptr.compat = __argv, | ||
1627 | }; | ||
1628 | struct user_arg_ptr envp = { | ||
1629 | .is_compat = true, | ||
1630 | .ptr.compat = __envp, | ||
1631 | }; | ||
1632 | return do_execveat_common(fd, filename, argv, envp, flags); | ||
1573 | } | 1633 | } |
1574 | #endif | 1634 | #endif |
1575 | 1635 | ||
@@ -1609,6 +1669,20 @@ SYSCALL_DEFINE3(execve, | |||
1609 | { | 1669 | { |
1610 | return do_execve(getname(filename), argv, envp); | 1670 | return do_execve(getname(filename), argv, envp); |
1611 | } | 1671 | } |
1672 | |||
1673 | SYSCALL_DEFINE5(execveat, | ||
1674 | int, fd, const char __user *, filename, | ||
1675 | const char __user *const __user *, argv, | ||
1676 | const char __user *const __user *, envp, | ||
1677 | int, flags) | ||
1678 | { | ||
1679 | int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | ||
1680 | |||
1681 | return do_execveat(fd, | ||
1682 | getname_flags(filename, lookup_flags, NULL), | ||
1683 | argv, envp, flags); | ||
1684 | } | ||
1685 | |||
1612 | #ifdef CONFIG_COMPAT | 1686 | #ifdef CONFIG_COMPAT |
1613 | COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, | 1687 | COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, |
1614 | const compat_uptr_t __user *, argv, | 1688 | const compat_uptr_t __user *, argv, |
@@ -1616,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, | |||
1616 | { | 1690 | { |
1617 | return compat_do_execve(getname(filename), argv, envp); | 1691 | return compat_do_execve(getname(filename), argv, envp); |
1618 | } | 1692 | } |
1693 | |||
1694 | COMPAT_SYSCALL_DEFINE5(execveat, int, fd, | ||
1695 | const char __user *, filename, | ||
1696 | const compat_uptr_t __user *, argv, | ||
1697 | const compat_uptr_t __user *, envp, | ||
1698 | int, flags) | ||
1699 | { | ||
1700 | int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | ||
1701 | |||
1702 | return compat_do_execveat(fd, | ||
1703 | getname_flags(filename, lookup_flags, NULL), | ||
1704 | argv, envp, flags); | ||
1705 | } | ||
1619 | #endif | 1706 | #endif |
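fs/exec.c is where the new execveat(2) entry points land: do_execveat_common() takes a dirfd and flags, synthesizes a /dev/fd/<fd>[/<name>] string for bprm->filename when the path is fd-relative, and SYSCALL_DEFINE5(execveat) plus the compat variant expose it. Basic use mirrors fexecve(3): open a binary, then exec it by descriptor with an empty pathname and AT_EMPTY_PATH. A minimal sketch via a raw syscall (glibc of this era has no wrapper; /bin/true is just an example target):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            char *argv[] = { "true", NULL };
            char *envp[] = { NULL };
            int fd = open("/bin/true", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* Empty pathname + AT_EMPTY_PATH execs the object the fd refers
             * to; the kernel records "/dev/fd/<n>" as bprm->filename. */
            syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
            perror("execveat");     /* only reached on failure */
            return 1;
    }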
diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..64e295e8ff38 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h | |||
@@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, | |||
370 | int datasync); | 370 | int datasync); |
371 | 371 | ||
372 | /* fat/inode.c */ | 372 | /* fat/inode.c */ |
373 | extern int fat_block_truncate_page(struct inode *inode, loff_t from); | ||
373 | extern void fat_attach(struct inode *inode, loff_t i_pos); | 374 | extern void fat_attach(struct inode *inode, loff_t i_pos); |
374 | extern void fat_detach(struct inode *inode); | 375 | extern void fat_detach(struct inode *inode); |
375 | extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); | 376 | extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); |
diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..8429c68e3057 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
@@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
443 | } | 443 | } |
444 | 444 | ||
445 | if (attr->ia_valid & ATTR_SIZE) { | 445 | if (attr->ia_valid & ATTR_SIZE) { |
446 | error = fat_block_truncate_page(inode, attr->ia_size); | ||
447 | if (error) | ||
448 | goto out; | ||
446 | down_write(&MSDOS_I(inode)->truncate_lock); | 449 | down_write(&MSDOS_I(inode)->truncate_lock); |
447 | truncate_setsize(inode, attr->ia_size); | 450 | truncate_setsize(inode, attr->ia_size); |
448 | fat_truncate_blocks(inode, attr->ia_size); | 451 | fat_truncate_blocks(inode, attr->ia_size); |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d96..7b41a2dcdd76 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
@@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) | |||
294 | return blocknr; | 294 | return blocknr; |
295 | } | 295 | } |
296 | 296 | ||
297 | /* | ||
298 | * fat_block_truncate_page() zeroes out a mapping from file offset `from' | ||
299 | * up to the end of the block which corresponds to `from'. | ||
300 | * This is required during truncate to physically zeroout the tail end | ||
301 | * of that block so it doesn't yield old data if the file is later grown. | ||
302 | * Also, avoid causing failure from fsx for cases of "data past EOF" | ||
303 | */ | ||
304 | int fat_block_truncate_page(struct inode *inode, loff_t from) | ||
305 | { | ||
306 | return block_truncate_page(inode->i_mapping, from, fat_get_block); | ||
307 | } | ||
308 | |||
297 | static const struct address_space_operations fat_aops = { | 309 | static const struct address_space_operations fat_aops = { |
298 | .readpage = fat_readpage, | 310 | .readpage = fat_readpage, |
299 | .readpages = fat_readpages, | 311 | .readpages = fat_readpages, |
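The FAT changes make fat_setattr() call the new fat_block_truncate_page() before shrinking the file, so the tail of the block containing the new EOF is zeroed and stale bytes cannot resurface if the file is grown again (the fsx case mentioned in the comment). As a loose userspace analogue of the invariant being enforced, this sketch zeroes from a cut-off offset to the end of its block before truncating; the 512-byte block size and the file name are assumptions, not from the patch:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define BLOCK_SIZE 512  /* assumed logical block size */

    /* Zero from 'from' up to the end of the block containing it, the same
     * region block_truncate_page() cleans inside the kernel. */
    static int zero_block_tail(int fd, off_t from)
    {
            char zeros[BLOCK_SIZE];
            size_t tail = BLOCK_SIZE - (from % BLOCK_SIZE);

            if (tail == BLOCK_SIZE)
                    return 0;       /* already block aligned */
            memset(zeros, 0, sizeof(zeros));
            return pwrite(fd, zeros, tail, from) == (ssize_t)tail ? 0 : -1;
    }

    int main(void)
    {
            int fd = open("testfile", O_RDWR);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (zero_block_tail(fd, 1000) || ftruncate(fd, 1000))
                    perror("truncate");
            close(fd);
            return 0;
    }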
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e2872b25343..5eba47f593f8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | |||
412 | pgoff = offset >> PAGE_SHIFT; | 412 | pgoff = offset >> PAGE_SHIFT; |
413 | 413 | ||
414 | i_size_write(inode, offset); | 414 | i_size_write(inode, offset); |
415 | mutex_lock(&mapping->i_mmap_mutex); | 415 | i_mmap_lock_write(mapping); |
416 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) | 416 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) |
417 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); | 417 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); |
418 | mutex_unlock(&mapping->i_mmap_mutex); | 418 | i_mmap_unlock_write(mapping); |
419 | truncate_hugepages(inode, offset); | 419 | truncate_hugepages(inode, offset); |
420 | return 0; | 420 | return 0; |
421 | } | 421 | } |
@@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, | |||
472 | } | 472 | } |
473 | 473 | ||
474 | /* | 474 | /* |
475 | * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never | 475 | * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never |
476 | * be taken from reclaim -- unlike regular filesystems. This needs an | 476 | * be taken from reclaim -- unlike regular filesystems. This needs an |
477 | * annotation because huge_pmd_share() does an allocation under | 477 | * annotation because huge_pmd_share() does an allocation under |
478 | * i_mmap_mutex. | 478 | * i_mmap_rwsem. |
479 | */ | 479 | */ |
480 | static struct lock_class_key hugetlbfs_i_mmap_mutex_key; | 480 | static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; |
481 | 481 | ||
482 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, | 482 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, |
483 | struct inode *dir, | 483 | struct inode *dir, |
@@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, | |||
495 | struct hugetlbfs_inode_info *info; | 495 | struct hugetlbfs_inode_info *info; |
496 | inode->i_ino = get_next_ino(); | 496 | inode->i_ino = get_next_ino(); |
497 | inode_init_owner(inode, dir, mode); | 497 | inode_init_owner(inode, dir, mode); |
498 | lockdep_set_class(&inode->i_mapping->i_mmap_mutex, | 498 | lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, |
499 | &hugetlbfs_i_mmap_mutex_key); | 499 | &hugetlbfs_i_mmap_rwsem_key); |
500 | inode->i_mapping->a_ops = &hugetlbfs_aops; | 500 | inode->i_mapping->a_ops = &hugetlbfs_aops; |
501 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; | 501 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; |
502 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 502 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
diff --git a/fs/inode.c b/fs/inode.c index 2ed95f7caa4f..ad60555b4768 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -346,7 +346,7 @@ void address_space_init_once(struct address_space *mapping) | |||
346 | memset(mapping, 0, sizeof(*mapping)); | 346 | memset(mapping, 0, sizeof(*mapping)); |
347 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); | 347 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); |
348 | spin_lock_init(&mapping->tree_lock); | 348 | spin_lock_init(&mapping->tree_lock); |
349 | mutex_init(&mapping->i_mmap_mutex); | 349 | init_rwsem(&mapping->i_mmap_rwsem); |
350 | INIT_LIST_HEAD(&mapping->private_list); | 350 | INIT_LIST_HEAD(&mapping->private_list); |
351 | spin_lock_init(&mapping->private_lock); | 351 | spin_lock_init(&mapping->private_lock); |
352 | mapping->i_mmap = RB_ROOT; | 352 | mapping->i_mmap = RB_ROOT; |
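The hugetlbfs and fs/inode.c hunks are part of converting mapping->i_mmap_mutex into mapping->i_mmap_rwsem: truncation still takes it exclusively via i_mmap_lock_write(), while paths that only walk the i_mmap interval tree can later share the lock as readers. A userspace analogue of that locking split using a pthread read/write lock (structure and function names are illustrative only; link with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-in for an address_space whose mapping tree is protected by a
     * read/write lock instead of a mutex. */
    struct mapping {
            pthread_rwlock_t i_mmap_rwsem;
            int nr_mappings;        /* pretend interval tree */
    };

    static void truncate_mappings(struct mapping *m)
    {
            /* Writers (truncate, unmap) take the lock exclusively, like
             * i_mmap_lock_write() in the patch. */
            pthread_rwlock_wrlock(&m->i_mmap_rwsem);
            m->nr_mappings = 0;
            pthread_rwlock_unlock(&m->i_mmap_rwsem);
    }

    static int count_mappings(struct mapping *m)
    {
            int n;

            /* Lookups can share the lock, which a plain mutex never allowed. */
            pthread_rwlock_rdlock(&m->i_mmap_rwsem);
            n = m->nr_mappings;
            pthread_rwlock_unlock(&m->i_mmap_rwsem);
            return n;
    }

    int main(void)
    {
            struct mapping m = { .nr_mappings = 3 };

            pthread_rwlock_init(&m.i_mmap_rwsem, NULL);
            truncate_mappings(&m);
            printf("mappings left: %d\n", count_mappings(&m));
            pthread_rwlock_destroy(&m.i_mmap_rwsem);
            return 0;
    }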
diff --git a/fs/namei.c b/fs/namei.c index db5fe86319e6..ca814165d84c 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -130,7 +130,7 @@ void final_putname(struct filename *name) | |||
130 | 130 | ||
131 | #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) | 131 | #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) |
132 | 132 | ||
133 | static struct filename * | 133 | struct filename * |
134 | getname_flags(const char __user *filename, int flags, int *empty) | 134 | getname_flags(const char __user *filename, int flags, int *empty) |
135 | { | 135 | { |
136 | struct filename *result, *err; | 136 | struct filename *result, *err; |
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index caaaf9dfe353..44523f4a6084 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c | |||
@@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) | |||
69 | if (old_mask == new_mask) | 69 | if (old_mask == new_mask) |
70 | return; | 70 | return; |
71 | 71 | ||
72 | if (fsn_mark->i.inode) | 72 | if (fsn_mark->inode) |
73 | fsnotify_recalc_inode_mask(fsn_mark->i.inode); | 73 | fsnotify_recalc_inode_mask(fsn_mark->inode); |
74 | } | 74 | } |
75 | 75 | ||
76 | /* | 76 | /* |
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6ffd220eb14d..58b7cdb63da9 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c | |||
@@ -80,7 +80,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
80 | return; | 80 | return; |
81 | 81 | ||
82 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); | 82 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); |
83 | inode = igrab(mark->i.inode); | 83 | inode = igrab(mark->inode); |
84 | if (inode) { | 84 | if (inode) { |
85 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", | 85 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", |
86 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, | 86 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, |
@@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
112 | mflags |= FAN_MARK_IGNORED_SURV_MODIFY; | 112 | mflags |= FAN_MARK_IGNORED_SURV_MODIFY; |
113 | 113 | ||
114 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { | 114 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { |
115 | inode = igrab(mark->i.inode); | 115 | inode = igrab(mark->inode); |
116 | if (!inode) | 116 | if (!inode) |
117 | return; | 117 | return; |
118 | seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", | 118 | seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", |
@@ -122,7 +122,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
122 | seq_putc(m, '\n'); | 122 | seq_putc(m, '\n'); |
123 | iput(inode); | 123 | iput(inode); |
124 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { | 124 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { |
125 | struct mount *mnt = real_mount(mark->m.mnt); | 125 | struct mount *mnt = real_mount(mark->mnt); |
126 | 126 | ||
127 | seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", | 127 | seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", |
128 | mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); | 128 | mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); |
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 41e39102743a..dd3fb0b17be7 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c | |||
@@ -242,13 +242,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
242 | 242 | ||
243 | if (inode_node) { | 243 | if (inode_node) { |
244 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), | 244 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), |
245 | struct fsnotify_mark, i.i_list); | 245 | struct fsnotify_mark, obj_list); |
246 | inode_group = inode_mark->group; | 246 | inode_group = inode_mark->group; |
247 | } | 247 | } |
248 | 248 | ||
249 | if (vfsmount_node) { | 249 | if (vfsmount_node) { |
250 | vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), | 250 | vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), |
251 | struct fsnotify_mark, m.m_list); | 251 | struct fsnotify_mark, obj_list); |
252 | vfsmount_group = vfsmount_mark->group; | 252 | vfsmount_group = vfsmount_mark->group; |
253 | } | 253 | } |
254 | 254 | ||
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 3b68b0ae0a97..13a00be516d2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h | |||
@@ -12,12 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); | |||
12 | /* protects reads of inode and vfsmount marks list */ | 12 | /* protects reads of inode and vfsmount marks list */ |
13 | extern struct srcu_struct fsnotify_mark_srcu; | 13 | extern struct srcu_struct fsnotify_mark_srcu; |
14 | 14 | ||
15 | /* Calculate mask of events for a list of marks */ | ||
16 | extern u32 fsnotify_recalc_mask(struct hlist_head *head); | ||
17 | |||
15 | /* compare two groups for sorting of marks lists */ | 18 | /* compare two groups for sorting of marks lists */ |
16 | extern int fsnotify_compare_groups(struct fsnotify_group *a, | 19 | extern int fsnotify_compare_groups(struct fsnotify_group *a, |
17 | struct fsnotify_group *b); | 20 | struct fsnotify_group *b); |
18 | 21 | ||
19 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, | 22 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, |
20 | __u32 mask); | 23 | __u32 mask); |
24 | /* Add mark to a proper place in mark list */ | ||
25 | extern int fsnotify_add_mark_list(struct hlist_head *head, | ||
26 | struct fsnotify_mark *mark, | ||
27 | int allow_dups); | ||
21 | /* add a mark to an inode */ | 28 | /* add a mark to an inode */ |
22 | extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | 29 | extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, |
23 | struct fsnotify_group *group, struct inode *inode, | 30 | struct fsnotify_group *group, struct inode *inode, |
@@ -31,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
31 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); | 38 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); |
32 | /* inode specific destruction of a mark */ | 39 | /* inode specific destruction of a mark */ |
33 | extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); | 40 | extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); |
41 | /* Destroy all marks in the given list */ | ||
42 | extern void fsnotify_destroy_marks(struct list_head *to_free); | ||
43 | /* Find mark belonging to given group in the list of marks */ | ||
44 | extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, | ||
45 | struct fsnotify_group *group); | ||
34 | /* run the list of all marks associated with inode and flag them to be freed */ | 46 | /* run the list of all marks associated with inode and flag them to be freed */ |
35 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); | 47 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); |
36 | /* run the list of all marks associated with vfsmount and flag them to be freed */ | 48 | /* run the list of all marks associated with vfsmount and flag them to be freed */ |
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index dfbf5447eea4..3daf513ee99e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c | |||
@@ -31,28 +31,13 @@ | |||
31 | #include "../internal.h" | 31 | #include "../internal.h" |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * Recalculate the mask of events relevant to a given inode locked. | ||
35 | */ | ||
36 | static void fsnotify_recalc_inode_mask_locked(struct inode *inode) | ||
37 | { | ||
38 | struct fsnotify_mark *mark; | ||
39 | __u32 new_mask = 0; | ||
40 | |||
41 | assert_spin_locked(&inode->i_lock); | ||
42 | |||
43 | hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) | ||
44 | new_mask |= mark->mask; | ||
45 | inode->i_fsnotify_mask = new_mask; | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types | 34 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types |
50 | * any notifier is interested in hearing for this inode. | 35 | * any notifier is interested in hearing for this inode. |
51 | */ | 36 | */ |
52 | void fsnotify_recalc_inode_mask(struct inode *inode) | 37 | void fsnotify_recalc_inode_mask(struct inode *inode) |
53 | { | 38 | { |
54 | spin_lock(&inode->i_lock); | 39 | spin_lock(&inode->i_lock); |
55 | fsnotify_recalc_inode_mask_locked(inode); | 40 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
56 | spin_unlock(&inode->i_lock); | 41 | spin_unlock(&inode->i_lock); |
57 | 42 | ||
58 | __fsnotify_update_child_dentry_flags(inode); | 43 | __fsnotify_update_child_dentry_flags(inode); |
@@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode) | |||
60 | 45 | ||
61 | void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) | 46 | void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) |
62 | { | 47 | { |
63 | struct inode *inode = mark->i.inode; | 48 | struct inode *inode = mark->inode; |
64 | 49 | ||
65 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); | 50 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); |
66 | assert_spin_locked(&mark->lock); | 51 | assert_spin_locked(&mark->lock); |
67 | 52 | ||
68 | spin_lock(&inode->i_lock); | 53 | spin_lock(&inode->i_lock); |
69 | 54 | ||
70 | hlist_del_init_rcu(&mark->i.i_list); | 55 | hlist_del_init_rcu(&mark->obj_list); |
71 | mark->i.inode = NULL; | 56 | mark->inode = NULL; |
72 | 57 | ||
73 | /* | 58 | /* |
74 | * this mark is now off the inode->i_fsnotify_marks list and we | 59 | * this mark is now off the inode->i_fsnotify_marks list and we |
75 | * hold the inode->i_lock, so this is the perfect time to update the | 60 | * hold the inode->i_lock, so this is the perfect time to update the |
76 | * inode->i_fsnotify_mask | 61 | * inode->i_fsnotify_mask |
77 | */ | 62 | */ |
78 | fsnotify_recalc_inode_mask_locked(inode); | 63 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
79 | |||
80 | spin_unlock(&inode->i_lock); | 64 | spin_unlock(&inode->i_lock); |
81 | } | 65 | } |
82 | 66 | ||
@@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) | |||
85 | */ | 69 | */ |
86 | void fsnotify_clear_marks_by_inode(struct inode *inode) | 70 | void fsnotify_clear_marks_by_inode(struct inode *inode) |
87 | { | 71 | { |
88 | struct fsnotify_mark *mark, *lmark; | 72 | struct fsnotify_mark *mark; |
89 | struct hlist_node *n; | 73 | struct hlist_node *n; |
90 | LIST_HEAD(free_list); | 74 | LIST_HEAD(free_list); |
91 | 75 | ||
92 | spin_lock(&inode->i_lock); | 76 | spin_lock(&inode->i_lock); |
93 | hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { | 77 | hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { |
94 | list_add(&mark->i.free_i_list, &free_list); | 78 | list_add(&mark->free_list, &free_list); |
95 | hlist_del_init_rcu(&mark->i.i_list); | 79 | hlist_del_init_rcu(&mark->obj_list); |
96 | fsnotify_get_mark(mark); | 80 | fsnotify_get_mark(mark); |
97 | } | 81 | } |
98 | spin_unlock(&inode->i_lock); | 82 | spin_unlock(&inode->i_lock); |
99 | 83 | ||
100 | list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { | 84 | fsnotify_destroy_marks(&free_list); |
101 | struct fsnotify_group *group; | ||
102 | |||
103 | spin_lock(&mark->lock); | ||
104 | fsnotify_get_group(mark->group); | ||
105 | group = mark->group; | ||
106 | spin_unlock(&mark->lock); | ||
107 | |||
108 | fsnotify_destroy_mark(mark, group); | ||
109 | fsnotify_put_mark(mark); | ||
110 | fsnotify_put_group(group); | ||
111 | } | ||
112 | } | 85 | } |
113 | 86 | ||
114 | /* | 87 | /* |
@@ -123,34 +96,13 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) | |||
123 | * given a group and inode, find the mark associated with that combination. | 96 | * given a group and inode, find the mark associated with that combination. |
124 | * if found take a reference to that mark and return it, else return NULL | 97 | * if found take a reference to that mark and return it, else return NULL |
125 | */ | 98 | */ |
126 | static struct fsnotify_mark *fsnotify_find_inode_mark_locked( | ||
127 | struct fsnotify_group *group, | ||
128 | struct inode *inode) | ||
129 | { | ||
130 | struct fsnotify_mark *mark; | ||
131 | |||
132 | assert_spin_locked(&inode->i_lock); | ||
133 | |||
134 | hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { | ||
135 | if (mark->group == group) { | ||
136 | fsnotify_get_mark(mark); | ||
137 | return mark; | ||
138 | } | ||
139 | } | ||
140 | return NULL; | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * given a group and inode, find the mark associated with that combination. | ||
145 | * if found take a reference to that mark and return it, else return NULL | ||
146 | */ | ||
147 | struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, | 99 | struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, |
148 | struct inode *inode) | 100 | struct inode *inode) |
149 | { | 101 | { |
150 | struct fsnotify_mark *mark; | 102 | struct fsnotify_mark *mark; |
151 | 103 | ||
152 | spin_lock(&inode->i_lock); | 104 | spin_lock(&inode->i_lock); |
153 | mark = fsnotify_find_inode_mark_locked(group, inode); | 105 | mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); |
154 | spin_unlock(&inode->i_lock); | 106 | spin_unlock(&inode->i_lock); |
155 | 107 | ||
156 | return mark; | 108 | return mark; |
@@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, | |||
168 | assert_spin_locked(&mark->lock); | 120 | assert_spin_locked(&mark->lock); |
169 | 121 | ||
170 | if (mask && | 122 | if (mask && |
171 | mark->i.inode && | 123 | mark->inode && |
172 | !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { | 124 | !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { |
173 | mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; | 125 | mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; |
174 | inode = igrab(mark->i.inode); | 126 | inode = igrab(mark->inode); |
175 | /* | 127 | /* |
176 | * we shouldn't be able to get here if the inode wasn't | 128 | * we shouldn't be able to get here if the inode wasn't |
177 | * already safely held in memory. But bug in case it | 129 | * already safely held in memory. But bug in case it |
@@ -192,9 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
192 | struct fsnotify_group *group, struct inode *inode, | 144 | struct fsnotify_group *group, struct inode *inode, |
193 | int allow_dups) | 145 | int allow_dups) |
194 | { | 146 | { |
195 | struct fsnotify_mark *lmark, *last = NULL; | 147 | int ret; |
196 | int ret = 0; | ||
197 | int cmp; | ||
198 | 148 | ||
199 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; | 149 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; |
200 | 150 | ||
@@ -202,37 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
202 | assert_spin_locked(&mark->lock); | 152 | assert_spin_locked(&mark->lock); |
203 | 153 | ||
204 | spin_lock(&inode->i_lock); | 154 | spin_lock(&inode->i_lock); |
205 | 155 | mark->inode = inode; | |
206 | mark->i.inode = inode; | 156 | ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, |
207 | 157 | allow_dups); | |
208 | /* is mark the first mark? */ | 158 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
209 | if (hlist_empty(&inode->i_fsnotify_marks)) { | ||
210 | hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); | ||
211 | goto out; | ||
212 | } | ||
213 | |||
214 | /* should mark be in the middle of the current list? */ | ||
215 | hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { | ||
216 | last = lmark; | ||
217 | |||
218 | if ((lmark->group == group) && !allow_dups) { | ||
219 | ret = -EEXIST; | ||
220 | goto out; | ||
221 | } | ||
222 | |||
223 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
224 | if (cmp < 0) | ||
225 | continue; | ||
226 | |||
227 | hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); | ||
228 | goto out; | ||
229 | } | ||
230 | |||
231 | BUG_ON(last == NULL); | ||
232 | /* mark should be the last entry. last is the current last entry */ | ||
233 | hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); | ||
234 | out: | ||
235 | fsnotify_recalc_inode_mask_locked(inode); | ||
236 | spin_unlock(&inode->i_lock); | 159 | spin_unlock(&inode->i_lock); |
237 | 160 | ||
238 | return ret; | 161 | return ret; |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 7d888d77d59a..2cd900c2c737 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
@@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data) | |||
156 | */ | 156 | */ |
157 | if (fsn_mark) | 157 | if (fsn_mark) |
158 | printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", | 158 | printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", |
159 | fsn_mark->group, fsn_mark->i.inode, i_mark->wd); | 159 | fsn_mark->group, fsn_mark->inode, i_mark->wd); |
160 | return 0; | 160 | return 0; |
161 | } | 161 | } |
162 | 162 | ||
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 283aa312d745..450648697433 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -433,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
433 | if (wd == -1) { | 433 | if (wd == -1) { |
434 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 434 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
435 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 435 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
436 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 436 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
437 | goto out; | 437 | goto out; |
438 | } | 438 | } |
439 | 439 | ||
@@ -442,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
442 | if (unlikely(!found_i_mark)) { | 442 | if (unlikely(!found_i_mark)) { |
443 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 443 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
444 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 444 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
445 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 445 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
446 | goto out; | 446 | goto out; |
447 | } | 447 | } |
448 | 448 | ||
@@ -456,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
456 | "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " | 456 | "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " |
457 | "found_i_mark->group=%p found_i_mark->inode=%p\n", | 457 | "found_i_mark->group=%p found_i_mark->inode=%p\n", |
458 | __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, | 458 | __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, |
459 | i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, | 459 | i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, |
460 | found_i_mark->fsn_mark.group, | 460 | found_i_mark->fsn_mark.group, |
461 | found_i_mark->fsn_mark.i.inode); | 461 | found_i_mark->fsn_mark.inode); |
462 | goto out; | 462 | goto out; |
463 | } | 463 | } |
464 | 464 | ||
@@ -470,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
470 | if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { | 470 | if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { |
471 | printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 471 | printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
472 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 472 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
473 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 473 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
474 | /* we can't really recover with bad ref cnting.. */ | 474 | /* we can't really recover with bad ref cnting.. */ |
475 | BUG(); | 475 | BUG(); |
476 | } | 476 | } |
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 34c38fabf514..92e48c70f0f0 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c | |||
@@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) | |||
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
113 | /* Calculate mask of events for a list of marks */ | ||
114 | u32 fsnotify_recalc_mask(struct hlist_head *head) | ||
115 | { | ||
116 | u32 new_mask = 0; | ||
117 | struct fsnotify_mark *mark; | ||
118 | |||
119 | hlist_for_each_entry(mark, head, obj_list) | ||
120 | new_mask |= mark->mask; | ||
121 | return new_mask; | ||
122 | } | ||
123 | |||
113 | /* | 124 | /* |
114 | * Any time a mark is getting freed we end up here. | 125 | * Any time a mark is getting freed we end up here. |
115 | * The caller had better be holding a reference to this mark so we don't actually | 126 | * The caller had better be holding a reference to this mark so we don't actually |
@@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, | |||
133 | mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; | 144 | mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; |
134 | 145 | ||
135 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { | 146 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { |
136 | inode = mark->i.inode; | 147 | inode = mark->inode; |
137 | fsnotify_destroy_inode_mark(mark); | 148 | fsnotify_destroy_inode_mark(mark); |
138 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) | 149 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) |
139 | fsnotify_destroy_vfsmount_mark(mark); | 150 | fsnotify_destroy_vfsmount_mark(mark); |
@@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, | |||
150 | mutex_unlock(&group->mark_mutex); | 161 | mutex_unlock(&group->mark_mutex); |
151 | 162 | ||
152 | spin_lock(&destroy_lock); | 163 | spin_lock(&destroy_lock); |
153 | list_add(&mark->destroy_list, &destroy_list); | 164 | list_add(&mark->g_list, &destroy_list); |
154 | spin_unlock(&destroy_lock); | 165 | spin_unlock(&destroy_lock); |
155 | wake_up(&destroy_waitq); | 166 | wake_up(&destroy_waitq); |
156 | /* | 167 | /* |
@@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, | |||
192 | mutex_unlock(&group->mark_mutex); | 203 | mutex_unlock(&group->mark_mutex); |
193 | } | 204 | } |
194 | 205 | ||
206 | /* | ||
207 | * Destroy all marks in the given list. The marks must be already detached from | ||
208 | * the original inode / vfsmount. | ||
209 | */ | ||
210 | void fsnotify_destroy_marks(struct list_head *to_free) | ||
211 | { | ||
212 | struct fsnotify_mark *mark, *lmark; | ||
213 | struct fsnotify_group *group; | ||
214 | |||
215 | list_for_each_entry_safe(mark, lmark, to_free, free_list) { | ||
216 | spin_lock(&mark->lock); | ||
217 | fsnotify_get_group(mark->group); | ||
218 | group = mark->group; | ||
219 | spin_unlock(&mark->lock); | ||
220 | |||
221 | fsnotify_destroy_mark(mark, group); | ||
222 | fsnotify_put_mark(mark); | ||
223 | fsnotify_put_group(group); | ||
224 | } | ||
225 | } | ||
226 | |||
195 | void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) | 227 | void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) |
196 | { | 228 | { |
197 | assert_spin_locked(&mark->lock); | 229 | assert_spin_locked(&mark->lock); |
@@ -245,6 +277,39 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) | |||
245 | return -1; | 277 | return -1; |
246 | } | 278 | } |
247 | 279 | ||
280 | /* Add mark into proper place in given list of marks */ | ||
281 | int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, | ||
282 | int allow_dups) | ||
283 | { | ||
284 | struct fsnotify_mark *lmark, *last = NULL; | ||
285 | int cmp; | ||
286 | |||
287 | /* is mark the first mark? */ | ||
288 | if (hlist_empty(head)) { | ||
289 | hlist_add_head_rcu(&mark->obj_list, head); | ||
290 | return 0; | ||
291 | } | ||
292 | |||
293 | /* should mark be in the middle of the current list? */ | ||
294 | hlist_for_each_entry(lmark, head, obj_list) { | ||
295 | last = lmark; | ||
296 | |||
297 | if ((lmark->group == mark->group) && !allow_dups) | ||
298 | return -EEXIST; | ||
299 | |||
300 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
301 | if (cmp >= 0) { | ||
302 | hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); | ||
303 | return 0; | ||
304 | } | ||
305 | } | ||
306 | |||
307 | BUG_ON(last == NULL); | ||
308 | /* mark should be the last entry. last is the current last entry */ | ||
309 | hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); | ||
310 | return 0; | ||
311 | } | ||
312 | |||
248 | /* | 313 | /* |
249 | * Attach an initialized mark to a given group and fs object. | 314 | * Attach an initialized mark to a given group and fs object. |
250 | * These marks may be used for the fsnotify backend to determine which | 315 | * These marks may be used for the fsnotify backend to determine which |
@@ -305,7 +370,7 @@ err: | |||
305 | spin_unlock(&mark->lock); | 370 | spin_unlock(&mark->lock); |
306 | 371 | ||
307 | spin_lock(&destroy_lock); | 372 | spin_lock(&destroy_lock); |
308 | list_add(&mark->destroy_list, &destroy_list); | 373 | list_add(&mark->g_list, &destroy_list); |
309 | spin_unlock(&destroy_lock); | 374 | spin_unlock(&destroy_lock); |
310 | wake_up(&destroy_waitq); | 375 | wake_up(&destroy_waitq); |
311 | 376 | ||
@@ -323,6 +388,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, | |||
323 | } | 388 | } |
324 | 389 | ||
325 | /* | 390 | /* |
391 | * Given a list of marks, find the mark associated with given group. If found | ||
392 | * take a reference to that mark and return it, else return NULL. | ||
393 | */ | ||
394 | struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, | ||
395 | struct fsnotify_group *group) | ||
396 | { | ||
397 | struct fsnotify_mark *mark; | ||
398 | |||
399 | hlist_for_each_entry(mark, head, obj_list) { | ||
400 | if (mark->group == group) { | ||
401 | fsnotify_get_mark(mark); | ||
402 | return mark; | ||
403 | } | ||
404 | } | ||
405 | return NULL; | ||
406 | } | ||
407 | |||
408 | /* | ||
326 | * clear any marks in a group in which mark->flags & flags is true | 409 | * clear any marks in a group in which mark->flags & flags is true |
327 | */ | 410 | */ |
328 | void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, | 411 | void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, |
@@ -352,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group) | |||
352 | void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) | 435 | void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) |
353 | { | 436 | { |
354 | assert_spin_locked(&old->lock); | 437 | assert_spin_locked(&old->lock); |
355 | new->i.inode = old->i.inode; | 438 | new->inode = old->inode; |
356 | new->m.mnt = old->m.mnt; | 439 | new->mnt = old->mnt; |
357 | if (old->group) | 440 | if (old->group) |
358 | fsnotify_get_group(old->group); | 441 | fsnotify_get_group(old->group); |
359 | new->group = old->group; | 442 | new->group = old->group; |
@@ -386,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored) | |||
386 | 469 | ||
387 | synchronize_srcu(&fsnotify_mark_srcu); | 470 | synchronize_srcu(&fsnotify_mark_srcu); |
388 | 471 | ||
389 | list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { | 472 | list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { |
390 | list_del_init(&mark->destroy_list); | 473 | list_del_init(&mark->g_list); |
391 | fsnotify_put_mark(mark); | 474 | fsnotify_put_mark(mark); |
392 | } | 475 | } |
393 | 476 | ||
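The mark.c hunks introduce generic helpers, fsnotify_recalc_mask(), fsnotify_add_mark_list(), fsnotify_find_mark() and fsnotify_destroy_marks(), that operate on a plain hlist of marks, so the inode and vfsmount code above stops duplicating the same loops over i.i_list/m.m_list and simply passes in the right list head. A compact userspace model of the two central helpers, using a singly linked list and an integer priority in place of fsnotify_compare_groups() (all names here are illustrative):

    #include <stdio.h>

    struct mark {
            unsigned int mask;      /* events this mark wants */
            int prio;               /* stands in for the group ordering */
            struct mark *next;
    };

    /* Analogue of fsnotify_recalc_mask(): OR together every mark's mask. */
    static unsigned int recalc_mask(struct mark *head)
    {
            unsigned int mask = 0;

            for (; head; head = head->next)
                    mask |= head->mask;
            return mask;
    }

    /* Analogue of fsnotify_add_mark_list(): keep the list sorted so that
     * higher-priority marks are visited first. */
    static void add_mark(struct mark **head, struct mark *new)
    {
            while (*head && (*head)->prio >= new->prio)
                    head = &(*head)->next;
            new->next = *head;
            *head = new;
    }

    int main(void)
    {
            struct mark a = { .mask = 0x1, .prio = 0 };
            struct mark b = { .mask = 0x8, .prio = 2 };
            struct mark *list = NULL;

            add_mark(&list, &a);
            add_mark(&list, &b);    /* inserted ahead of 'a' */
            printf("combined mask: 0x%x\n", recalc_mask(list));
            return 0;
    }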
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index faefa72a11eb..326b148e623c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c | |||
@@ -32,31 +32,20 @@ | |||
32 | 32 | ||
33 | void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) | 33 | void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) |
34 | { | 34 | { |
35 | struct fsnotify_mark *mark, *lmark; | 35 | struct fsnotify_mark *mark; |
36 | struct hlist_node *n; | 36 | struct hlist_node *n; |
37 | struct mount *m = real_mount(mnt); | 37 | struct mount *m = real_mount(mnt); |
38 | LIST_HEAD(free_list); | 38 | LIST_HEAD(free_list); |
39 | 39 | ||
40 | spin_lock(&mnt->mnt_root->d_lock); | 40 | spin_lock(&mnt->mnt_root->d_lock); |
41 | hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { | 41 | hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { |
42 | list_add(&mark->m.free_m_list, &free_list); | 42 | list_add(&mark->free_list, &free_list); |
43 | hlist_del_init_rcu(&mark->m.m_list); | 43 | hlist_del_init_rcu(&mark->obj_list); |
44 | fsnotify_get_mark(mark); | 44 | fsnotify_get_mark(mark); |
45 | } | 45 | } |
46 | spin_unlock(&mnt->mnt_root->d_lock); | 46 | spin_unlock(&mnt->mnt_root->d_lock); |
47 | 47 | ||
48 | list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { | 48 | fsnotify_destroy_marks(&free_list); |
49 | struct fsnotify_group *group; | ||
50 | |||
51 | spin_lock(&mark->lock); | ||
52 | fsnotify_get_group(mark->group); | ||
53 | group = mark->group; | ||
54 | spin_unlock(&mark->lock); | ||
55 | |||
56 | fsnotify_destroy_mark(mark, group); | ||
57 | fsnotify_put_mark(mark); | ||
58 | fsnotify_put_group(group); | ||
59 | } | ||
60 | } | 49 | } |
61 | 50 | ||
62 | void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) | 51 | void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) |
@@ -65,66 +54,35 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) | |||
65 | } | 54 | } |
66 | 55 | ||
67 | /* | 56 | /* |
68 | * Recalculate the mask of events relevant to a given vfsmount locked. | ||
69 | */ | ||
70 | static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) | ||
71 | { | ||
72 | struct mount *m = real_mount(mnt); | ||
73 | struct fsnotify_mark *mark; | ||
74 | __u32 new_mask = 0; | ||
75 | |||
76 | assert_spin_locked(&mnt->mnt_root->d_lock); | ||
77 | |||
78 | hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) | ||
79 | new_mask |= mark->mask; | ||
80 | m->mnt_fsnotify_mask = new_mask; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types | 57 | * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types |
85 | * any notifier is interested in hearing for this mount point | 58 | * any notifier is interested in hearing for this mount point |
86 | */ | 59 | */ |
87 | void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) | 60 | void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) |
88 | { | 61 | { |
62 | struct mount *m = real_mount(mnt); | ||
63 | |||
89 | spin_lock(&mnt->mnt_root->d_lock); | 64 | spin_lock(&mnt->mnt_root->d_lock); |
90 | fsnotify_recalc_vfsmount_mask_locked(mnt); | 65 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); |
91 | spin_unlock(&mnt->mnt_root->d_lock); | 66 | spin_unlock(&mnt->mnt_root->d_lock); |
92 | } | 67 | } |
93 | 68 | ||
94 | void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) | 69 | void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) |
95 | { | 70 | { |
96 | struct vfsmount *mnt = mark->m.mnt; | 71 | struct vfsmount *mnt = mark->mnt; |
72 | struct mount *m = real_mount(mnt); | ||
97 | 73 | ||
98 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); | 74 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); |
99 | assert_spin_locked(&mark->lock); | 75 | assert_spin_locked(&mark->lock); |
100 | 76 | ||
101 | spin_lock(&mnt->mnt_root->d_lock); | 77 | spin_lock(&mnt->mnt_root->d_lock); |
102 | 78 | ||
103 | hlist_del_init_rcu(&mark->m.m_list); | 79 | hlist_del_init_rcu(&mark->obj_list); |
104 | mark->m.mnt = NULL; | 80 | mark->mnt = NULL; |
105 | |||
106 | fsnotify_recalc_vfsmount_mask_locked(mnt); | ||
107 | 81 | ||
82 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); | ||
108 | spin_unlock(&mnt->mnt_root->d_lock); | 83 | spin_unlock(&mnt->mnt_root->d_lock); |
109 | } | 84 | } |
110 | 85 | ||
111 | static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, | ||
112 | struct vfsmount *mnt) | ||
113 | { | ||
114 | struct mount *m = real_mount(mnt); | ||
115 | struct fsnotify_mark *mark; | ||
116 | |||
117 | assert_spin_locked(&mnt->mnt_root->d_lock); | ||
118 | |||
119 | hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { | ||
120 | if (mark->group == group) { | ||
121 | fsnotify_get_mark(mark); | ||
122 | return mark; | ||
123 | } | ||
124 | } | ||
125 | return NULL; | ||
126 | } | ||
127 | |||
128 | /* | 86 | /* |
129 | * given a group and vfsmount, find the mark associated with that combination. | 87 | * given a group and vfsmount, find the mark associated with that combination. |
130 | * if found take a reference to that mark and return it, else return NULL | 88 | * if found take a reference to that mark and return it, else return NULL |
@@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ | |||
132 | struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, | 90 | struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, |
133 | struct vfsmount *mnt) | 91 | struct vfsmount *mnt) |
134 | { | 92 | { |
93 | struct mount *m = real_mount(mnt); | ||
135 | struct fsnotify_mark *mark; | 94 | struct fsnotify_mark *mark; |
136 | 95 | ||
137 | spin_lock(&mnt->mnt_root->d_lock); | 96 | spin_lock(&mnt->mnt_root->d_lock); |
138 | mark = fsnotify_find_vfsmount_mark_locked(group, mnt); | 97 | mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); |
139 | spin_unlock(&mnt->mnt_root->d_lock); | 98 | spin_unlock(&mnt->mnt_root->d_lock); |
140 | 99 | ||
141 | return mark; | 100 | return mark; |
@@ -151,9 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
151 | int allow_dups) | 110 | int allow_dups) |
152 | { | 111 | { |
153 | struct mount *m = real_mount(mnt); | 112 | struct mount *m = real_mount(mnt); |
154 | struct fsnotify_mark *lmark, *last = NULL; | 113 | int ret; |
155 | int ret = 0; | ||
156 | int cmp; | ||
157 | 114 | ||
158 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; | 115 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; |
159 | 116 | ||
@@ -161,37 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
161 | assert_spin_locked(&mark->lock); | 118 | assert_spin_locked(&mark->lock); |
162 | 119 | ||
163 | spin_lock(&mnt->mnt_root->d_lock); | 120 | spin_lock(&mnt->mnt_root->d_lock); |
164 | 121 | mark->mnt = mnt; | |
165 | mark->m.mnt = mnt; | 122 | ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); |
166 | 123 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); | |
167 | /* is mark the first mark? */ | ||
168 | if (hlist_empty(&m->mnt_fsnotify_marks)) { | ||
169 | hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); | ||
170 | goto out; | ||
171 | } | ||
172 | |||
173 | /* should mark be in the middle of the current list? */ | ||
174 | hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { | ||
175 | last = lmark; | ||
176 | |||
177 | if ((lmark->group == group) && !allow_dups) { | ||
178 | ret = -EEXIST; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
182 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
183 | if (cmp < 0) | ||
184 | continue; | ||
185 | |||
186 | hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); | ||
187 | goto out; | ||
188 | } | ||
189 | |||
190 | BUG_ON(last == NULL); | ||
191 | /* mark should be the last entry. last is the current last entry */ | ||
192 | hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); | ||
193 | out: | ||
194 | fsnotify_recalc_vfsmount_mask_locked(mnt); | ||
195 | spin_unlock(&mnt->mnt_root->d_lock); | 124 | spin_unlock(&mnt->mnt_root->d_lock); |
196 | 125 | ||
197 | return ret; | 126 | return ret; |
@@ -295,6 +295,17 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | |||
295 | 295 | ||
296 | sb_start_write(inode->i_sb); | 296 | sb_start_write(inode->i_sb); |
297 | ret = file->f_op->fallocate(file, mode, offset, len); | 297 | ret = file->f_op->fallocate(file, mode, offset, len); |
298 | |||
299 | /* | ||
300 | * Create inotify and fanotify events. | ||
301 | * | ||
302 | * To keep the logic simple always create events if fallocate succeeds. | ||
303 | * This implies that events are even created if the file size remains | ||
304 | * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. | ||
305 | */ | ||
306 | if (ret == 0) | ||
307 | fsnotify_modify(file); | ||
308 | |||
298 | sb_end_write(inode->i_sb); | 309 | sb_end_write(inode->i_sb); |
299 | return ret; | 310 | return ret; |
300 | } | 311 | } |
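The comment added above spells out the policy: any successful fallocate() now raises a modify event, even when FALLOC_FL_KEEP_SIZE leaves the file size untouched. A small userspace sketch (error handling trimmed; assumes a glibc that exposes FALLOC_FL_KEEP_SIZE under _GNU_SOURCE) that observes the new behaviour:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int ifd = inotify_init1(IN_CLOEXEC);
	int fd = open("testfile", O_RDWR | O_CREAT, 0600);

	inotify_add_watch(ifd, "testfile", IN_MODIFY);
	/* Size stays the same, but an IN_MODIFY event is still queued. */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
	if (read(ifd, buf, sizeof(buf)) > 0)
		printf("fallocate() produced IN_MODIFY\n");
	return 0;
}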
diff --git a/fs/seq_file.c b/fs/seq_file.c index 353948ba1c5b..dbf3a59c86bb 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
@@ -25,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size) | |||
25 | { | 25 | { |
26 | void *buf; | 26 | void *buf; |
27 | 27 | ||
28 | buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); | 28 | /* |
29 | * __GFP_NORETRY to avoid oom-killings with high-order allocations - | ||
30 | * it's better to fall back to vmalloc() than to kill things. | ||
31 | */ | ||
32 | buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); | ||
29 | if (!buf && size > PAGE_SIZE) | 33 | if (!buf && size > PAGE_SIZE) |
30 | buf = vmalloc(size); | 34 | buf = vmalloc(size); |
31 | return buf; | 35 | return buf; |
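The new comment documents the allocation policy: try kmalloc() without retries or warnings, then quietly fall back to vmalloc() for large buffers. For reference, the matching free side of this pattern is the pre-existing kvfree() helper; a self-contained sketch of the pair (function names hypothetical):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Try a physically contiguous buffer first, quietly fall back to vmalloc. */
static void *example_buf_alloc(unsigned long size)
{
	void *buf;

	buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (!buf && size > PAGE_SIZE)
		buf = vmalloc(size);
	return buf;
}

/* kvfree() handles both kmalloc'ed and vmalloc'ed buffers. */
static void example_buf_free(void *buf)
{
	kvfree(buf);
}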
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61f29e5ea840..576e4639ca60 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h | |||
@@ -53,6 +53,10 @@ struct linux_binprm { | |||
53 | #define BINPRM_FLAGS_EXECFD_BIT 1 | 53 | #define BINPRM_FLAGS_EXECFD_BIT 1 |
54 | #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) | 54 | #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) |
55 | 55 | ||
56 | /* filename of the binary will be inaccessible after exec */ | ||
57 | #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 | ||
58 | #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) | ||
59 | |||
56 | /* Function parameter for binfmt->coredump */ | 60 | /* Function parameter for binfmt->coredump */ |
57 | struct coredump_params { | 61 | struct coredump_params { |
58 | const siginfo_t *siginfo; | 62 | const siginfo_t *siginfo; |
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index e1c8d080c427..34e020c23644 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h | |||
@@ -45,6 +45,7 @@ | |||
45 | * bitmap_set(dst, pos, nbits) Set specified bit area | 45 | * bitmap_set(dst, pos, nbits) Set specified bit area |
46 | * bitmap_clear(dst, pos, nbits) Clear specified bit area | 46 | * bitmap_clear(dst, pos, nbits) Clear specified bit area |
47 | * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area | 47 | * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area |
48 | * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, off) as above, with an alignment offset | ||
48 | * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n | 49 | * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n |
49 | * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n | 50 | * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n |
50 | * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) | 51 | * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) |
@@ -114,11 +115,36 @@ extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); | |||
114 | 115 | ||
115 | extern void bitmap_set(unsigned long *map, unsigned int start, int len); | 116 | extern void bitmap_set(unsigned long *map, unsigned int start, int len); |
116 | extern void bitmap_clear(unsigned long *map, unsigned int start, int len); | 117 | extern void bitmap_clear(unsigned long *map, unsigned int start, int len); |
117 | extern unsigned long bitmap_find_next_zero_area(unsigned long *map, | 118 | |
118 | unsigned long size, | 119 | extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, |
119 | unsigned long start, | 120 | unsigned long size, |
120 | unsigned int nr, | 121 | unsigned long start, |
121 | unsigned long align_mask); | 122 | unsigned int nr, |
123 | unsigned long align_mask, | ||
124 | unsigned long align_offset); | ||
125 | |||
126 | /** | ||
127 | * bitmap_find_next_zero_area - find a contiguous aligned zero area | ||
128 | * @map: The address to base the search on | ||
129 | * @size: The bitmap size in bits | ||
130 | * @start: The bitnumber to start searching at | ||
131 | * @nr: The number of zeroed bits we're looking for | ||
132 | * @align_mask: Alignment mask for zero area | ||
133 | * | ||
134 | * The @align_mask should be one less than a power of 2; the effect is that | ||
135 | * the bit offset of every zero area this function finds is a multiple of that | ||
136 | * power of 2. An @align_mask of 0 means no alignment is required. | ||
137 | */ | ||
138 | static inline unsigned long | ||
139 | bitmap_find_next_zero_area(unsigned long *map, | ||
140 | unsigned long size, | ||
141 | unsigned long start, | ||
142 | unsigned int nr, | ||
143 | unsigned long align_mask) | ||
144 | { | ||
145 | return bitmap_find_next_zero_area_off(map, size, start, nr, | ||
146 | align_mask, 0); | ||
147 | } | ||
122 | 148 | ||
123 | extern int bitmap_scnprintf(char *buf, unsigned int len, | 149 | extern int bitmap_scnprintf(char *buf, unsigned int len, |
124 | const unsigned long *src, int nbits); | 150 | const unsigned long *src, int nbits); |
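The kerneldoc added above defines @align_mask as the required power-of-two alignment minus one. A short, hypothetical caller that reserves an aligned run of bits (sizes arbitrary):

#include <linux/bitmap.h>
#include <linux/errno.h>

/* Reserve 8 consecutive clear bits whose offset is a multiple of 4. */
static long example_reserve_bits(unsigned long *map, unsigned long nbits)
{
	unsigned long pos;

	pos = bitmap_find_next_zero_area(map, nbits, 0, 8, 4 - 1);
	if (pos >= nbits)
		return -ENOMEM;		/* no suitable zero area found */

	bitmap_set(map, pos, 8);
	return pos;
}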
diff --git a/include/linux/compat.h b/include/linux/compat.h index e6494261eaff..7450ca2ac1fc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h | |||
@@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); | |||
357 | 357 | ||
358 | asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, | 358 | asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, |
359 | const compat_uptr_t __user *envp); | 359 | const compat_uptr_t __user *envp); |
360 | asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, | ||
361 | const compat_uptr_t __user *argv, | ||
362 | const compat_uptr_t __user *envp, int flags); | ||
360 | 363 | ||
361 | asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, | 364 | asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, |
362 | compat_ulong_t __user *outp, compat_ulong_t __user *exp, | 365 | compat_ulong_t __user *outp, compat_ulong_t __user *exp, |
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index c6f996f2abb6..798fad9e420d 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/types.h> | 6 | #include <linux/types.h> |
7 | #include <linux/debugfs.h> | 7 | #include <linux/debugfs.h> |
8 | #include <linux/ratelimit.h> | ||
8 | #include <linux/atomic.h> | 9 | #include <linux/atomic.h> |
9 | 10 | ||
10 | /* | 11 | /* |
@@ -25,14 +26,18 @@ struct fault_attr { | |||
25 | unsigned long reject_end; | 26 | unsigned long reject_end; |
26 | 27 | ||
27 | unsigned long count; | 28 | unsigned long count; |
29 | struct ratelimit_state ratelimit_state; | ||
30 | struct dentry *dname; | ||
28 | }; | 31 | }; |
29 | 32 | ||
30 | #define FAULT_ATTR_INITIALIZER { \ | 33 | #define FAULT_ATTR_INITIALIZER { \ |
31 | .interval = 1, \ | 34 | .interval = 1, \ |
32 | .times = ATOMIC_INIT(1), \ | 35 | .times = ATOMIC_INIT(1), \ |
33 | .require_end = ULONG_MAX, \ | 36 | .require_end = ULONG_MAX, \ |
34 | .stacktrace_depth = 32, \ | 37 | .stacktrace_depth = 32, \ |
35 | .verbose = 2, \ | 38 | .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ |
39 | .verbose = 2, \ | ||
40 | .dname = NULL, \ | ||
36 | } | 41 | } |
37 | 42 | ||
38 | #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER | 43 | #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER |
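The initializer now also carries a disabled ratelimit_state (and a dname handle for the debugfs directory). Client code is unchanged by this; a typical, hedged usage sketch:

#include <linux/errno.h>
#include <linux/fault-inject.h>

static DECLARE_FAULT_ATTR(fail_example);	/* picks up FAULT_ATTR_INITIALIZER */

static int example_do_io(size_t bytes)
{
	/* Inject an artificial failure according to the configured attributes. */
	if (should_fail(&fail_example, bytes))
		return -EIO;

	/* ... real work ... */
	return 0;
}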
diff --git a/include/linux/fs.h b/include/linux/fs.h index bb29b02d9bb6..4193a0bd99b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/pid.h> | 18 | #include <linux/pid.h> |
19 | #include <linux/bug.h> | 19 | #include <linux/bug.h> |
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include <linux/rwsem.h> | ||
21 | #include <linux/capability.h> | 22 | #include <linux/capability.h> |
22 | #include <linux/semaphore.h> | 23 | #include <linux/semaphore.h> |
23 | #include <linux/fiemap.h> | 24 | #include <linux/fiemap.h> |
@@ -401,7 +402,7 @@ struct address_space { | |||
401 | atomic_t i_mmap_writable;/* count VM_SHARED mappings */ | 402 | atomic_t i_mmap_writable;/* count VM_SHARED mappings */ |
402 | struct rb_root i_mmap; /* tree of private and shared mappings */ | 403 | struct rb_root i_mmap; /* tree of private and shared mappings */ |
403 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 404 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
404 | struct mutex i_mmap_mutex; /* protect tree, count, list */ | 405 | struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ |
405 | /* Protected by tree_lock together with the radix tree */ | 406 | /* Protected by tree_lock together with the radix tree */ |
406 | unsigned long nrpages; /* number of total pages */ | 407 | unsigned long nrpages; /* number of total pages */ |
407 | unsigned long nrshadows; /* number of shadow entries */ | 408 | unsigned long nrshadows; /* number of shadow entries */ |
@@ -467,6 +468,26 @@ struct block_device { | |||
467 | 468 | ||
468 | int mapping_tagged(struct address_space *mapping, int tag); | 469 | int mapping_tagged(struct address_space *mapping, int tag); |
469 | 470 | ||
471 | static inline void i_mmap_lock_write(struct address_space *mapping) | ||
472 | { | ||
473 | down_write(&mapping->i_mmap_rwsem); | ||
474 | } | ||
475 | |||
476 | static inline void i_mmap_unlock_write(struct address_space *mapping) | ||
477 | { | ||
478 | up_write(&mapping->i_mmap_rwsem); | ||
479 | } | ||
480 | |||
481 | static inline void i_mmap_lock_read(struct address_space *mapping) | ||
482 | { | ||
483 | down_read(&mapping->i_mmap_rwsem); | ||
484 | } | ||
485 | |||
486 | static inline void i_mmap_unlock_read(struct address_space *mapping) | ||
487 | { | ||
488 | up_read(&mapping->i_mmap_rwsem); | ||
489 | } | ||
490 | |||
470 | /* | 491 | /* |
471 | * Might pages of this file be mapped into userspace? | 492 | * Might pages of this file be mapped into userspace? |
472 | */ | 493 | */ |
@@ -2075,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); | |||
2075 | extern struct file * dentry_open(const struct path *, int, const struct cred *); | 2096 | extern struct file * dentry_open(const struct path *, int, const struct cred *); |
2076 | extern int filp_close(struct file *, fl_owner_t id); | 2097 | extern int filp_close(struct file *, fl_owner_t id); |
2077 | 2098 | ||
2099 | extern struct filename *getname_flags(const char __user *, int, int *); | ||
2078 | extern struct filename *getname(const char __user *); | 2100 | extern struct filename *getname(const char __user *); |
2079 | extern struct filename *getname_kernel(const char *); | 2101 | extern struct filename *getname_kernel(const char *); |
2080 | 2102 | ||
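The i_mmap_mutex becomes an rwsem, and callers are expected to go through the wrappers above rather than touching the lock directly. A sketch of a read-side user (hypothetical function; the real conversions are in mm/):

#include <linux/fs.h>

/*
 * Shared (read) access to the tree of mappings; writers use the
 * i_mmap_lock_write()/i_mmap_unlock_write() pair instead.
 */
static void example_walk_i_mmap(struct address_space *mapping)
{
	i_mmap_lock_read(mapping);
	/* ... read-only traversal of mapping->i_mmap ... */
	i_mmap_unlock_read(mapping);
}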
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index ca060d7c4fa6..0f313f93c586 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h | |||
@@ -197,24 +197,6 @@ struct fsnotify_group { | |||
197 | #define FSNOTIFY_EVENT_INODE 2 | 197 | #define FSNOTIFY_EVENT_INODE 2 |
198 | 198 | ||
199 | /* | 199 | /* |
200 | * Inode specific fields in an fsnotify_mark | ||
201 | */ | ||
202 | struct fsnotify_inode_mark { | ||
203 | struct inode *inode; /* inode this mark is associated with */ | ||
204 | struct hlist_node i_list; /* list of marks by inode->i_fsnotify_marks */ | ||
205 | struct list_head free_i_list; /* tmp list used when freeing this mark */ | ||
206 | }; | ||
207 | |||
208 | /* | ||
209 | * Mount point specific fields in an fsnotify_mark | ||
210 | */ | ||
211 | struct fsnotify_vfsmount_mark { | ||
212 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ | ||
213 | struct hlist_node m_list; /* list of marks by inode->i_fsnotify_marks */ | ||
214 | struct list_head free_m_list; /* tmp list used when freeing this mark */ | ||
215 | }; | ||
216 | |||
217 | /* | ||
218 | * a mark is simply an object attached to an in core inode which allows an | 200 | * a mark is simply an object attached to an in core inode which allows an |
219 | * fsnotify listener to indicate they are either no longer interested in events | 201 | * fsnotify listener to indicate they are either no longer interested in events |
220 | * of a type matching mask or only interested in those events. | 202 | * of a type matching mask or only interested in those events. |
@@ -230,11 +212,17 @@ struct fsnotify_mark { | |||
230 | * in kernel that found and may be using this mark. */ | 212 | * in kernel that found and may be using this mark. */ |
231 | atomic_t refcnt; /* active things looking at this mark */ | 213 | atomic_t refcnt; /* active things looking at this mark */ |
232 | struct fsnotify_group *group; /* group this mark is for */ | 214 | struct fsnotify_group *group; /* group this mark is for */ |
233 | struct list_head g_list; /* list of marks by group->i_fsnotify_marks */ | 215 | struct list_head g_list; /* list of marks by group->i_fsnotify_marks |
216 | * Also reused for queueing mark into | ||
217 | * destroy_list when it's waiting for | ||
218 | * the end of SRCU period before it can | ||
219 | * be freed */ | ||
234 | spinlock_t lock; /* protect group and inode */ | 220 | spinlock_t lock; /* protect group and inode */ |
221 | struct hlist_node obj_list; /* list of marks for inode / vfsmount */ | ||
222 | struct list_head free_list; /* tmp list used when freeing this mark */ | ||
235 | union { | 223 | union { |
236 | struct fsnotify_inode_mark i; | 224 | struct inode *inode; /* inode this mark is associated with */ |
237 | struct fsnotify_vfsmount_mark m; | 225 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ |
238 | }; | 226 | }; |
239 | __u32 ignored_mask; /* events types to ignore */ | 227 | __u32 ignored_mask; /* events types to ignore */ |
240 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 | 228 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 |
@@ -243,7 +231,6 @@ struct fsnotify_mark { | |||
243 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 | 231 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 |
244 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 | 232 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 |
245 | unsigned int flags; /* vfsmount or inode mark? */ | 233 | unsigned int flags; /* vfsmount or inode mark? */ |
246 | struct list_head destroy_list; | ||
247 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ | 234 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ |
248 | }; | 235 | }; |
249 | 236 | ||
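With fsnotify_inode_mark and fsnotify_vfsmount_mark gone, the union now holds the inode or vfsmount pointer directly, and the list linkage (obj_list, free_list) is shared between the two mark types. A sketch of how a backend distinguishes the cases after the change (illustrative helper, not part of the API):

#include <linux/fsnotify_backend.h>

/* Old layout: mark->i.inode / mark->m.mnt; new layout: a plain union member. */
static struct inode *example_mark_inode(struct fsnotify_mark *mark)
{
	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
		return mark->inode;	/* was mark->i.inode */
	return NULL;			/* vfsmount marks carry mark->mnt instead */
}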
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 07d2699cdb51..b840e3b2770d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -110,11 +110,8 @@ struct vm_area_struct; | |||
110 | #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ | 110 | #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ |
111 | __GFP_RECLAIMABLE) | 111 | __GFP_RECLAIMABLE) |
112 | #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) | 112 | #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) |
113 | #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ | 113 | #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) |
114 | __GFP_HIGHMEM) | 114 | #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) |
115 | #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ | ||
116 | __GFP_HARDWALL | __GFP_HIGHMEM | \ | ||
117 | __GFP_MOVABLE) | ||
118 | #define GFP_IOFS (__GFP_IO | __GFP_FS) | 115 | #define GFP_IOFS (__GFP_IO | __GFP_FS) |
119 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ | 116 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ |
120 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ | 117 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ |
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 35e7eca4e33b..e365d5ec69cb 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h | |||
@@ -7,15 +7,6 @@ | |||
7 | #include <linux/notifier.h> | 7 | #include <linux/notifier.h> |
8 | #include <linux/nsproxy.h> | 8 | #include <linux/nsproxy.h> |
9 | 9 | ||
10 | /* | ||
11 | * ipc namespace events | ||
12 | */ | ||
13 | #define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */ | ||
14 | #define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */ | ||
15 | #define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */ | ||
16 | |||
17 | #define IPCNS_CALLBACK_PRI 0 | ||
18 | |||
19 | struct user_namespace; | 10 | struct user_namespace; |
20 | 11 | ||
21 | struct ipc_ids { | 12 | struct ipc_ids { |
@@ -38,7 +29,6 @@ struct ipc_namespace { | |||
38 | unsigned int msg_ctlmni; | 29 | unsigned int msg_ctlmni; |
39 | atomic_t msg_bytes; | 30 | atomic_t msg_bytes; |
40 | atomic_t msg_hdrs; | 31 | atomic_t msg_hdrs; |
41 | int auto_msgmni; | ||
42 | 32 | ||
43 | size_t shm_ctlmax; | 33 | size_t shm_ctlmax; |
44 | size_t shm_ctlall; | 34 | size_t shm_ctlall; |
@@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns; | |||
77 | extern spinlock_t mq_lock; | 67 | extern spinlock_t mq_lock; |
78 | 68 | ||
79 | #ifdef CONFIG_SYSVIPC | 69 | #ifdef CONFIG_SYSVIPC |
80 | extern int register_ipcns_notifier(struct ipc_namespace *); | ||
81 | extern int cond_register_ipcns_notifier(struct ipc_namespace *); | ||
82 | extern void unregister_ipcns_notifier(struct ipc_namespace *); | ||
83 | extern int ipcns_notify(unsigned long); | ||
84 | extern void shm_destroy_orphaned(struct ipc_namespace *ns); | 70 | extern void shm_destroy_orphaned(struct ipc_namespace *ns); |
85 | #else /* CONFIG_SYSVIPC */ | 71 | #else /* CONFIG_SYSVIPC */ |
86 | static inline int register_ipcns_notifier(struct ipc_namespace *ns) | ||
87 | { return 0; } | ||
88 | static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) | ||
89 | { return 0; } | ||
90 | static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } | ||
91 | static inline int ipcns_notify(unsigned long l) { return 0; } | ||
92 | static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} | 72 | static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} |
93 | #endif /* CONFIG_SYSVIPC */ | 73 | #endif /* CONFIG_SYSVIPC */ |
94 | 74 | ||
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 057e95971014..e705467ddb47 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h | |||
@@ -21,6 +21,8 @@ | |||
21 | #ifndef __KMEMLEAK_H | 21 | #ifndef __KMEMLEAK_H |
22 | #define __KMEMLEAK_H | 22 | #define __KMEMLEAK_H |
23 | 23 | ||
24 | #include <linux/slab.h> | ||
25 | |||
24 | #ifdef CONFIG_DEBUG_KMEMLEAK | 26 | #ifdef CONFIG_DEBUG_KMEMLEAK |
25 | 27 | ||
26 | extern void kmemleak_init(void) __ref; | 28 | extern void kmemleak_init(void) __ref; |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6ea9f919e888..7c95af8d552c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); | |||
400 | 400 | ||
401 | void memcg_update_array_size(int num_groups); | 401 | void memcg_update_array_size(int num_groups); |
402 | 402 | ||
403 | struct kmem_cache * | 403 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); |
404 | __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); | 404 | void __memcg_kmem_put_cache(struct kmem_cache *cachep); |
405 | 405 | ||
406 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); | 406 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); |
407 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); | 407 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); |
@@ -492,7 +492,13 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
492 | if (unlikely(fatal_signal_pending(current))) | 492 | if (unlikely(fatal_signal_pending(current))) |
493 | return cachep; | 493 | return cachep; |
494 | 494 | ||
495 | return __memcg_kmem_get_cache(cachep, gfp); | 495 | return __memcg_kmem_get_cache(cachep); |
496 | } | ||
497 | |||
498 | static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
499 | { | ||
500 | if (memcg_kmem_enabled()) | ||
501 | __memcg_kmem_put_cache(cachep); | ||
496 | } | 502 | } |
497 | #else | 503 | #else |
498 | #define for_each_memcg_cache_index(_idx) \ | 504 | #define for_each_memcg_cache_index(_idx) \ |
@@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
528 | { | 534 | { |
529 | return cachep; | 535 | return cachep; |
530 | } | 536 | } |
537 | |||
538 | static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
539 | { | ||
540 | } | ||
531 | #endif /* CONFIG_MEMCG_KMEM */ | 541 | #endif /* CONFIG_MEMCG_KMEM */ |
532 | #endif /* _LINUX_MEMCONTROL_H */ | 542 | #endif /* _LINUX_MEMCONTROL_H */ |
533 | 543 | ||
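__memcg_kmem_get_cache() loses its gfp argument and gains a put counterpart. As a rough sketch of how an allocation path is expected to bracket the per-memcg cache selection with the get/put pair (simplified; the real hooks live in the slab allocators):

#include <linux/memcontrol.h>
#include <linux/slab.h>

static void *example_alloc_from(struct kmem_cache *cachep, gfp_t gfp)
{
	void *obj;

	/* May redirect the allocation to the current task's memcg cache. */
	cachep = memcg_kmem_get_cache(cachep, gfp);
	obj = kmem_cache_alloc(cachep, gfp);
	/* Drop the reference taken by memcg_kmem_get_cache(). */
	memcg_kmem_put_cache(cachep);
	return obj;
}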
diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b337efbe533..c0a67b894c4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/bit_spinlock.h> | 19 | #include <linux/bit_spinlock.h> |
20 | #include <linux/shrinker.h> | 20 | #include <linux/shrinker.h> |
21 | #include <linux/resource.h> | 21 | #include <linux/resource.h> |
22 | #include <linux/page_ext.h> | ||
22 | 23 | ||
23 | struct mempolicy; | 24 | struct mempolicy; |
24 | struct anon_vma; | 25 | struct anon_vma; |
@@ -2060,7 +2061,22 @@ static inline void vm_stat_account(struct mm_struct *mm, | |||
2060 | #endif /* CONFIG_PROC_FS */ | 2061 | #endif /* CONFIG_PROC_FS */ |
2061 | 2062 | ||
2062 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2063 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2063 | extern void kernel_map_pages(struct page *page, int numpages, int enable); | 2064 | extern bool _debug_pagealloc_enabled; |
2065 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
2066 | |||
2067 | static inline bool debug_pagealloc_enabled(void) | ||
2068 | { | ||
2069 | return _debug_pagealloc_enabled; | ||
2070 | } | ||
2071 | |||
2072 | static inline void | ||
2073 | kernel_map_pages(struct page *page, int numpages, int enable) | ||
2074 | { | ||
2075 | if (!debug_pagealloc_enabled()) | ||
2076 | return; | ||
2077 | |||
2078 | __kernel_map_pages(page, numpages, enable); | ||
2079 | } | ||
2064 | #ifdef CONFIG_HIBERNATION | 2080 | #ifdef CONFIG_HIBERNATION |
2065 | extern bool kernel_page_present(struct page *page); | 2081 | extern bool kernel_page_present(struct page *page); |
2066 | #endif /* CONFIG_HIBERNATION */ | 2082 | #endif /* CONFIG_HIBERNATION */ |
@@ -2094,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, | |||
2094 | void __user *, size_t *, loff_t *); | 2110 | void __user *, size_t *, loff_t *); |
2095 | #endif | 2111 | #endif |
2096 | 2112 | ||
2097 | unsigned long shrink_slab(struct shrink_control *shrink, | 2113 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, |
2098 | unsigned long nr_pages_scanned, | 2114 | unsigned long nr_scanned, |
2099 | unsigned long lru_pages); | 2115 | unsigned long nr_eligible); |
2100 | 2116 | ||
2101 | #ifndef CONFIG_MMU | 2117 | #ifndef CONFIG_MMU |
2102 | #define randomize_va_space 0 | 2118 | #define randomize_va_space 0 |
@@ -2155,20 +2171,36 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, | |||
2155 | unsigned int pages_per_huge_page); | 2171 | unsigned int pages_per_huge_page); |
2156 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 2172 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
2157 | 2173 | ||
2174 | extern struct page_ext_operations debug_guardpage_ops; | ||
2175 | extern struct page_ext_operations page_poisoning_ops; | ||
2176 | |||
2158 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2177 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2159 | extern unsigned int _debug_guardpage_minorder; | 2178 | extern unsigned int _debug_guardpage_minorder; |
2179 | extern bool _debug_guardpage_enabled; | ||
2160 | 2180 | ||
2161 | static inline unsigned int debug_guardpage_minorder(void) | 2181 | static inline unsigned int debug_guardpage_minorder(void) |
2162 | { | 2182 | { |
2163 | return _debug_guardpage_minorder; | 2183 | return _debug_guardpage_minorder; |
2164 | } | 2184 | } |
2165 | 2185 | ||
2186 | static inline bool debug_guardpage_enabled(void) | ||
2187 | { | ||
2188 | return _debug_guardpage_enabled; | ||
2189 | } | ||
2190 | |||
2166 | static inline bool page_is_guard(struct page *page) | 2191 | static inline bool page_is_guard(struct page *page) |
2167 | { | 2192 | { |
2168 | return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 2193 | struct page_ext *page_ext; |
2194 | |||
2195 | if (!debug_guardpage_enabled()) | ||
2196 | return false; | ||
2197 | |||
2198 | page_ext = lookup_page_ext(page); | ||
2199 | return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
2169 | } | 2200 | } |
2170 | #else | 2201 | #else |
2171 | static inline unsigned int debug_guardpage_minorder(void) { return 0; } | 2202 | static inline unsigned int debug_guardpage_minorder(void) { return 0; } |
2203 | static inline bool debug_guardpage_enabled(void) { return false; } | ||
2172 | static inline bool page_is_guard(struct page *page) { return false; } | 2204 | static inline bool page_is_guard(struct page *page) { return false; } |
2173 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | 2205 | #endif /* CONFIG_DEBUG_PAGEALLOC */ |
2174 | 2206 | ||
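kernel_map_pages() is now a static inline that bails out unless debug_pagealloc was actually enabled at boot, with the real work in the arch-provided __kernel_map_pages(). Call sites keep their existing shape, roughly:

#include <linux/mm.h>

/* Sketch of the allocator-side calls; the real ones are in mm/page_alloc.c. */
static void example_unmap_on_free(struct page *page, unsigned int order)
{
	kernel_map_pages(page, 1 << order, 0);	/* unmap when freed */
}

static void example_map_on_alloc(struct page *page, unsigned int order)
{
	kernel_map_pages(page, 1 << order, 1);	/* map again on allocation */
}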
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf9f57529dcf..6d34aa266a8c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/rwsem.h> | 10 | #include <linux/rwsem.h> |
11 | #include <linux/completion.h> | 11 | #include <linux/completion.h> |
12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
13 | #include <linux/page-debug-flags.h> | ||
14 | #include <linux/uprobes.h> | 13 | #include <linux/uprobes.h> |
15 | #include <linux/page-flags-layout.h> | 14 | #include <linux/page-flags-layout.h> |
16 | #include <asm/page.h> | 15 | #include <asm/page.h> |
@@ -186,9 +185,6 @@ struct page { | |||
186 | void *virtual; /* Kernel virtual address (NULL if | 185 | void *virtual; /* Kernel virtual address (NULL if |
187 | not kmapped, ie. highmem) */ | 186 | not kmapped, ie. highmem) */ |
188 | #endif /* WANT_PAGE_VIRTUAL */ | 187 | #endif /* WANT_PAGE_VIRTUAL */ |
189 | #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS | ||
190 | unsigned long debug_flags; /* Use atomic bitops on this */ | ||
191 | #endif | ||
192 | 188 | ||
193 | #ifdef CONFIG_KMEMCHECK | 189 | #ifdef CONFIG_KMEMCHECK |
194 | /* | 190 | /* |
@@ -534,4 +530,12 @@ enum tlb_flush_reason { | |||
534 | NR_TLB_FLUSH_REASONS, | 530 | NR_TLB_FLUSH_REASONS, |
535 | }; | 531 | }; |
536 | 532 | ||
533 | /* | ||
534 | * A swap entry has to fit into a "unsigned long", as the entry is hidden | ||
535 | * in the "index" field of the swapper address space. | ||
536 | */ | ||
537 | typedef struct { | ||
538 | unsigned long val; | ||
539 | } swp_entry_t; | ||
540 | |||
537 | #endif /* _LINUX_MM_TYPES_H */ | 541 | #endif /* _LINUX_MM_TYPES_H */ |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 88787bb4b3b9..ab8564b03468 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
@@ -154,7 +154,7 @@ struct mmu_notifier_ops { | |||
154 | * Therefore notifier chains can only be traversed when either | 154 | * Therefore notifier chains can only be traversed when either |
155 | * | 155 | * |
156 | * 1. mmap_sem is held. | 156 | * 1. mmap_sem is held. |
157 | * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem). | 157 | * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). |
158 | * 3. No other concurrent thread can access the list (release) | 158 | * 3. No other concurrent thread can access the list (release) |
159 | */ | 159 | */ |
160 | struct mmu_notifier { | 160 | struct mmu_notifier { |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3879d7664dfc..2f0856d14b21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -722,6 +722,9 @@ typedef struct pglist_data { | |||
722 | int nr_zones; | 722 | int nr_zones; |
723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ | 723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ |
724 | struct page *node_mem_map; | 724 | struct page *node_mem_map; |
725 | #ifdef CONFIG_PAGE_EXTENSION | ||
726 | struct page_ext *node_page_ext; | ||
727 | #endif | ||
725 | #endif | 728 | #endif |
726 | #ifndef CONFIG_NO_BOOTMEM | 729 | #ifndef CONFIG_NO_BOOTMEM |
727 | struct bootmem_data *bdata; | 730 | struct bootmem_data *bdata; |
@@ -1075,6 +1078,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) | |||
1075 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) | 1078 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) |
1076 | 1079 | ||
1077 | struct page; | 1080 | struct page; |
1081 | struct page_ext; | ||
1078 | struct mem_section { | 1082 | struct mem_section { |
1079 | /* | 1083 | /* |
1080 | * This is, logically, a pointer to an array of struct | 1084 | * This is, logically, a pointer to an array of struct |
@@ -1092,6 +1096,14 @@ struct mem_section { | |||
1092 | 1096 | ||
1093 | /* See declaration of similar field in struct zone */ | 1097 | /* See declaration of similar field in struct zone */ |
1094 | unsigned long *pageblock_flags; | 1098 | unsigned long *pageblock_flags; |
1099 | #ifdef CONFIG_PAGE_EXTENSION | ||
1100 | /* | ||
1101 | * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use | ||
1102 | * section. (see page_ext.h about this.) | ||
1103 | */ | ||
1104 | struct page_ext *page_ext; | ||
1105 | unsigned long pad; | ||
1106 | #endif | ||
1095 | /* | 1107 | /* |
1096 | * WARNING: mem_section must be a power-of-2 in size for the | 1108 | * WARNING: mem_section must be a power-of-2 in size for the |
1097 | * calculation and use of SECTION_ROOT_MASK to make sense. | 1109 | * calculation and use of SECTION_ROOT_MASK to make sense. |
diff --git a/include/linux/oom.h b/include/linux/oom.h index e8d6e1058723..853698c721f7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -92,6 +92,17 @@ static inline bool oom_gfp_allowed(gfp_t gfp_mask) | |||
92 | 92 | ||
93 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); | 93 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); |
94 | 94 | ||
95 | static inline bool task_will_free_mem(struct task_struct *task) | ||
96 | { | ||
97 | /* | ||
98 | * A coredumping process may sleep for an extended period in exit_mm(), | ||
99 | * so the oom killer cannot assume that the process will promptly exit | ||
100 | * and release memory. | ||
101 | */ | ||
102 | return (task->flags & PF_EXITING) && | ||
103 | !(task->signal->flags & SIGNAL_GROUP_COREDUMP); | ||
104 | } | ||
105 | |||
95 | /* sysctls */ | 106 | /* sysctls */ |
96 | extern int sysctl_oom_dump_tasks; | 107 | extern int sysctl_oom_dump_tasks; |
97 | extern int sysctl_oom_kill_allocating_task; | 108 | extern int sysctl_oom_kill_allocating_task; |
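task_will_free_mem() folds the PF_EXITING / coredump check into one helper; a hedged sketch of the short-circuit an oom path can take with it (not the exact oom_kill.c logic):

#include <linux/oom.h>
#include <linux/sched.h>

/*
 * Skip killing a victim that is already exiting and not stuck in a
 * coredump: it is expected to release its memory on its own shortly.
 */
static bool example_should_skip_kill(struct task_struct *task)
{
	return task_will_free_mem(task);
}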
diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h deleted file mode 100644 index 22691f614043..000000000000 --- a/include/linux/page-debug-flags.h +++ /dev/null | |||
@@ -1,32 +0,0 @@ | |||
1 | #ifndef LINUX_PAGE_DEBUG_FLAGS_H | ||
2 | #define LINUX_PAGE_DEBUG_FLAGS_H | ||
3 | |||
4 | /* | ||
5 | * page->debug_flags bits: | ||
6 | * | ||
7 | * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to | ||
8 | * implement generic debug pagealloc feature. The pages are filled with | ||
9 | * poison patterns and set this flag after free_pages(). The poisoned | ||
10 | * pages are verified whether the patterns are not corrupted and clear | ||
11 | * the flag before alloc_pages(). | ||
12 | */ | ||
13 | |||
14 | enum page_debug_flags { | ||
15 | PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ | ||
16 | PAGE_DEBUG_FLAG_GUARD, | ||
17 | }; | ||
18 | |||
19 | /* | ||
20 | * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably | ||
21 | * gets turned off when no debug features are enabling it! | ||
22 | */ | ||
23 | |||
24 | #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS | ||
25 | #if !defined(CONFIG_PAGE_POISONING) && \ | ||
26 | !defined(CONFIG_PAGE_GUARD) \ | ||
27 | /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ | ||
28 | #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! | ||
29 | #endif | ||
30 | #endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ | ||
31 | |||
32 | #endif /* LINUX_PAGE_DEBUG_FLAGS_H */ | ||
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h new file mode 100644 index 000000000000..d2a2c84c72d0 --- /dev/null +++ b/include/linux/page_ext.h | |||
@@ -0,0 +1,84 @@ | |||
1 | #ifndef __LINUX_PAGE_EXT_H | ||
2 | #define __LINUX_PAGE_EXT_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <linux/stacktrace.h> | ||
6 | |||
7 | struct pglist_data; | ||
8 | struct page_ext_operations { | ||
9 | bool (*need)(void); | ||
10 | void (*init)(void); | ||
11 | }; | ||
12 | |||
13 | #ifdef CONFIG_PAGE_EXTENSION | ||
14 | |||
15 | /* | ||
16 | * page_ext->flags bits: | ||
17 | * | ||
18 | * PAGE_EXT_DEBUG_POISON is set for poisoned pages. It is used to | ||
19 | * implement the generic debug pagealloc feature: pages are filled with | ||
20 | * poison patterns and the flag is set after free_pages(); before | ||
21 | * alloc_pages() the poisoned pages are checked for corrupted patterns | ||
22 | * and the flag is cleared. | ||
23 | */ | ||
24 | |||
25 | enum page_ext_flags { | ||
26 | PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ | ||
27 | PAGE_EXT_DEBUG_GUARD, | ||
28 | PAGE_EXT_OWNER, | ||
29 | }; | ||
30 | |||
31 | /* | ||
32 | * Page Extension can be considered as an extended mem_map. | ||
33 | * A page_ext page is associated with every page descriptor. The | ||
34 | * page_ext helps us add more information about the page. | ||
35 | * All page_ext are allocated at boot or memory hotplug event, | ||
36 | * then the page_ext for pfn always exists. | ||
37 | */ | ||
38 | struct page_ext { | ||
39 | unsigned long flags; | ||
40 | #ifdef CONFIG_PAGE_OWNER | ||
41 | unsigned int order; | ||
42 | gfp_t gfp_mask; | ||
43 | struct stack_trace trace; | ||
44 | unsigned long trace_entries[8]; | ||
45 | #endif | ||
46 | }; | ||
47 | |||
48 | extern void pgdat_page_ext_init(struct pglist_data *pgdat); | ||
49 | |||
50 | #ifdef CONFIG_SPARSEMEM | ||
51 | static inline void page_ext_init_flatmem(void) | ||
52 | { | ||
53 | } | ||
54 | extern void page_ext_init(void); | ||
55 | #else | ||
56 | extern void page_ext_init_flatmem(void); | ||
57 | static inline void page_ext_init(void) | ||
58 | { | ||
59 | } | ||
60 | #endif | ||
61 | |||
62 | struct page_ext *lookup_page_ext(struct page *page); | ||
63 | |||
64 | #else /* !CONFIG_PAGE_EXTENSION */ | ||
65 | struct page_ext; | ||
66 | |||
67 | static inline void pgdat_page_ext_init(struct pglist_data *pgdat) | ||
68 | { | ||
69 | } | ||
70 | |||
71 | static inline struct page_ext *lookup_page_ext(struct page *page) | ||
72 | { | ||
73 | return NULL; | ||
74 | } | ||
75 | |||
76 | static inline void page_ext_init(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | static inline void page_ext_init_flatmem(void) | ||
81 | { | ||
82 | } | ||
83 | #endif /* CONFIG_PAGE_EXTENSION */ | ||
84 | #endif /* __LINUX_PAGE_EXT_H */ | ||
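A page_ext client advertises itself through a page_ext_operations pair: need() decides at boot whether the extra per-page storage is required at all, and init() runs once that storage exists. A minimal hypothetical client (the debug_guardpage and page_owner users added in this series have the same shape):

#include <linux/page_ext.h>
#include <linux/printk.h>

static bool example_feature_enabled;	/* e.g. set from a boot parameter */

/* Only ask for page_ext space when the feature was requested. */
static bool need_example_feature(void)
{
	return example_feature_enabled;
}

/* Runs after the page_ext storage for all present pages is allocated. */
static void init_example_feature(void)
{
	pr_info("example page_ext client initialised\n");
}

struct page_ext_operations example_feature_ops = {
	.need = need_example_feature,
	.init = init_example_feature,
};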
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h new file mode 100644 index 000000000000..b48c3471c254 --- /dev/null +++ b/include/linux/page_owner.h | |||
@@ -0,0 +1,38 @@ | |||
1 | #ifndef __LINUX_PAGE_OWNER_H | ||
2 | #define __LINUX_PAGE_OWNER_H | ||
3 | |||
4 | #ifdef CONFIG_PAGE_OWNER | ||
5 | extern bool page_owner_inited; | ||
6 | extern struct page_ext_operations page_owner_ops; | ||
7 | |||
8 | extern void __reset_page_owner(struct page *page, unsigned int order); | ||
9 | extern void __set_page_owner(struct page *page, | ||
10 | unsigned int order, gfp_t gfp_mask); | ||
11 | |||
12 | static inline void reset_page_owner(struct page *page, unsigned int order) | ||
13 | { | ||
14 | if (likely(!page_owner_inited)) | ||
15 | return; | ||
16 | |||
17 | __reset_page_owner(page, order); | ||
18 | } | ||
19 | |||
20 | static inline void set_page_owner(struct page *page, | ||
21 | unsigned int order, gfp_t gfp_mask) | ||
22 | { | ||
23 | if (likely(!page_owner_inited)) | ||
24 | return; | ||
25 | |||
26 | __set_page_owner(page, order, gfp_mask); | ||
27 | } | ||
28 | #else | ||
29 | static inline void reset_page_owner(struct page *page, unsigned int order) | ||
30 | { | ||
31 | } | ||
32 | static inline void set_page_owner(struct page *page, | ||
33 | unsigned int order, gfp_t gfp_mask) | ||
34 | { | ||
35 | } | ||
36 | |||
37 | #endif /* CONFIG_PAGE_OWNER */ | ||
38 | #endif /* __LINUX_PAGE_OWNER_H */ | ||
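The static inline wrappers reduce to a page_owner_inited test, so the allocator can call them unconditionally. A sketch of the intended hook placement (the real calls are added to the buddy allocator elsewhere in this series):

#include <linux/page_owner.h>

static void example_after_alloc(struct page *page, unsigned int order,
				gfp_t gfp_mask)
{
	set_page_owner(page, order, gfp_mask);	/* no-op until page_owner_inited */
}

static void example_before_free(struct page *page, unsigned int order)
{
	reset_page_owner(page, order);
}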
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 420032d41d27..57f3a1c550dc 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h | |||
@@ -254,8 +254,6 @@ do { \ | |||
254 | #endif /* CONFIG_SMP */ | 254 | #endif /* CONFIG_SMP */ |
255 | 255 | ||
256 | #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) | 256 | #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) |
257 | #define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var))) | ||
258 | #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) | ||
259 | 257 | ||
260 | /* | 258 | /* |
261 | * Must be an lvalue. Since @var must be a simple identifier, | 259 | * Must be an lvalue. Since @var must be a simple identifier, |
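__get_cpu_var() and __raw_get_cpu_var() are finally removed; remaining users are expected to have been converted to the this_cpu/raw_cpu accessors. The usual conversion looks like this (illustrative variable):

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);

static void example_bump(void)
{
	/* Old style (removed): __get_cpu_var(example_counter)++;     */
	/* Lvalue replacement:  (*this_cpu_ptr(&example_counter))++;  */
	/* Preferred accessor:                                        */
	this_cpu_inc(example_counter);
}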
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 0a260d8a18bf..18102529254e 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h | |||
@@ -17,14 +17,20 @@ struct ratelimit_state { | |||
17 | unsigned long begin; | 17 | unsigned long begin; |
18 | }; | 18 | }; |
19 | 19 | ||
20 | #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ | 20 | #define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ |
21 | \ | ||
22 | struct ratelimit_state name = { \ | ||
23 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ | 21 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ |
24 | .interval = interval_init, \ | 22 | .interval = interval_init, \ |
25 | .burst = burst_init, \ | 23 | .burst = burst_init, \ |
26 | } | 24 | } |
27 | 25 | ||
26 | #define RATELIMIT_STATE_INIT_DISABLED \ | ||
27 | RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST) | ||
28 | |||
29 | #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ | ||
30 | \ | ||
31 | struct ratelimit_state name = \ | ||
32 | RATELIMIT_STATE_INIT(name, interval_init, burst_init) \ | ||
33 | |||
28 | static inline void ratelimit_state_init(struct ratelimit_state *rs, | 34 | static inline void ratelimit_state_init(struct ratelimit_state *rs, |
29 | int interval, int burst) | 35 | int interval, int burst) |
30 | { | 36 | { |
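Splitting out RATELIMIT_STATE_INIT lets a ratelimit_state be initialised as a struct member rather than only as a standalone definition; RATELIMIT_STATE_INIT_DISABLED, used by FAULT_ATTR_INITIALIZER above, is the same macro with an interval of 0, i.e. ratelimiting effectively off until configured. An illustrative embedded use (names hypothetical):

#include <linux/ratelimit.h>

struct example_dev {
	int id;
	struct ratelimit_state rs;
};

static struct example_dev edev = {
	.id = 0,
	.rs = RATELIMIT_STATE_INIT(edev.rs, HZ, 5),	/* at most 5 messages per second */
};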
diff --git a/include/linux/sched.h b/include/linux/sched.h index 55f5ee7cc3d3..8db31ef98d2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1364,6 +1364,10 @@ struct task_struct { | |||
1364 | unsigned sched_reset_on_fork:1; | 1364 | unsigned sched_reset_on_fork:1; |
1365 | unsigned sched_contributes_to_load:1; | 1365 | unsigned sched_contributes_to_load:1; |
1366 | 1366 | ||
1367 | #ifdef CONFIG_MEMCG_KMEM | ||
1368 | unsigned memcg_kmem_skip_account:1; | ||
1369 | #endif | ||
1370 | |||
1367 | unsigned long atomic_flags; /* Flags needing atomic access. */ | 1371 | unsigned long atomic_flags; /* Flags needing atomic access. */ |
1368 | 1372 | ||
1369 | pid_t pid; | 1373 | pid_t pid; |
@@ -1679,8 +1683,7 @@ struct task_struct { | |||
1679 | /* bitmask and counter of trace recursion */ | 1683 | /* bitmask and counter of trace recursion */ |
1680 | unsigned long trace_recursion; | 1684 | unsigned long trace_recursion; |
1681 | #endif /* CONFIG_TRACING */ | 1685 | #endif /* CONFIG_TRACING */ |
1682 | #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ | 1686 | #ifdef CONFIG_MEMCG |
1683 | unsigned int memcg_kmem_skip_account; | ||
1684 | struct memcg_oom_info { | 1687 | struct memcg_oom_info { |
1685 | struct mem_cgroup *memcg; | 1688 | struct mem_cgroup *memcg; |
1686 | gfp_t gfp_mask; | 1689 | gfp_t gfp_mask; |
@@ -2482,6 +2485,10 @@ extern void do_group_exit(int); | |||
2482 | extern int do_execve(struct filename *, | 2485 | extern int do_execve(struct filename *, |
2483 | const char __user * const __user *, | 2486 | const char __user * const __user *, |
2484 | const char __user * const __user *); | 2487 | const char __user * const __user *); |
2488 | extern int do_execveat(int, struct filename *, | ||
2489 | const char __user * const __user *, | ||
2490 | const char __user * const __user *, | ||
2491 | int); | ||
2485 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); | 2492 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); |
2486 | struct task_struct *fork_idle(int); | 2493 | struct task_struct *fork_idle(int); |
2487 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); | 2494 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); |
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 68c097077ef0..f4aee75f00b1 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h | |||
@@ -18,8 +18,6 @@ struct shrink_control { | |||
18 | */ | 18 | */ |
19 | unsigned long nr_to_scan; | 19 | unsigned long nr_to_scan; |
20 | 20 | ||
21 | /* shrink from these nodes */ | ||
22 | nodemask_t nodes_to_scan; | ||
23 | /* current node being shrunk (for NUMA aware shrinkers) */ | 21 | /* current node being shrunk (for NUMA aware shrinkers) */ |
24 | int nid; | 22 | int nid; |
25 | }; | 23 | }; |
diff --git a/include/linux/slab.h b/include/linux/slab.h index 8a2457d42fc8..9a139b637069 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
493 | * @memcg: pointer to the memcg this cache belongs to | 493 | * @memcg: pointer to the memcg this cache belongs to |
494 | * @list: list_head for the list of all caches in this memcg | 494 | * @list: list_head for the list of all caches in this memcg |
495 | * @root_cache: pointer to the global, root cache, this cache was derived from | 495 | * @root_cache: pointer to the global, root cache, this cache was derived from |
496 | * @nr_pages: number of pages that belongs to this cache. | ||
497 | */ | 496 | */ |
498 | struct memcg_cache_params { | 497 | struct memcg_cache_params { |
499 | bool is_root_cache; | 498 | bool is_root_cache; |
@@ -506,7 +505,6 @@ struct memcg_cache_params { | |||
506 | struct mem_cgroup *memcg; | 505 | struct mem_cgroup *memcg; |
507 | struct list_head list; | 506 | struct list_head list; |
508 | struct kmem_cache *root_cache; | 507 | struct kmem_cache *root_cache; |
509 | atomic_t nr_pages; | ||
510 | }; | 508 | }; |
511 | }; | 509 | }; |
512 | }; | 510 | }; |
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 115b570e3bff..669045ab73f3 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef __LINUX_STACKTRACE_H | 1 | #ifndef __LINUX_STACKTRACE_H |
2 | #define __LINUX_STACKTRACE_H | 2 | #define __LINUX_STACKTRACE_H |
3 | 3 | ||
4 | #include <linux/types.h> | ||
5 | |||
4 | struct task_struct; | 6 | struct task_struct; |
5 | struct pt_regs; | 7 | struct pt_regs; |
6 | 8 | ||
@@ -20,6 +22,8 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, | |||
20 | struct stack_trace *trace); | 22 | struct stack_trace *trace); |
21 | 23 | ||
22 | extern void print_stack_trace(struct stack_trace *trace, int spaces); | 24 | extern void print_stack_trace(struct stack_trace *trace, int spaces); |
25 | extern int snprint_stack_trace(char *buf, size_t size, | ||
26 | struct stack_trace *trace, int spaces); | ||
23 | 27 | ||
24 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT | 28 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT |
25 | extern void save_stack_trace_user(struct stack_trace *trace); | 29 | extern void save_stack_trace_user(struct stack_trace *trace); |
@@ -32,6 +36,7 @@ extern void save_stack_trace_user(struct stack_trace *trace); | |||
32 | # define save_stack_trace_tsk(tsk, trace) do { } while (0) | 36 | # define save_stack_trace_tsk(tsk, trace) do { } while (0) |
33 | # define save_stack_trace_user(trace) do { } while (0) | 37 | # define save_stack_trace_user(trace) do { } while (0) |
34 | # define print_stack_trace(trace, spaces) do { } while (0) | 38 | # define print_stack_trace(trace, spaces) do { } while (0) |
39 | # define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) | ||
35 | #endif | 40 | #endif |
36 | 41 | ||
37 | #endif | 42 | #endif |
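snprint_stack_trace() writes into a caller-supplied buffer instead of the log, which is what the page_owner code added elsewhere in this series needs. A sketch of capturing and formatting the current stack (buffer handling left to the caller):

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void example_dump_stack_to_buf(char *buf, size_t len)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.max_entries	= ARRAY_SIZE(entries),
		.entries	= entries,
		.skip		= 1,	/* skip this helper itself */
	};

	save_stack_trace(&trace);
	snprint_stack_trace(buf, len, &trace, 0);
}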
diff --git a/include/linux/swap.h b/include/linux/swap.h index 37a585beef5c..34e8b60ab973 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -102,14 +102,6 @@ union swap_header { | |||
102 | } info; | 102 | } info; |
103 | }; | 103 | }; |
104 | 104 | ||
105 | /* A swap entry has to fit into a "unsigned long", as | ||
106 | * the entry is hidden in the "index" field of the | ||
107 | * swapper address space. | ||
108 | */ | ||
109 | typedef struct { | ||
110 | unsigned long val; | ||
111 | } swp_entry_t; | ||
112 | |||
113 | /* | 105 | /* |
114 | * current->reclaim_state points to one of these when a task is running | 106 | * current->reclaim_state points to one of these when a task is running |
115 | * memory reclaim | 107 | * memory reclaim |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9afdc7a7f84..85893d744901 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, | |||
877 | asmlinkage long sys_getrandom(char __user *buf, size_t count, | 877 | asmlinkage long sys_getrandom(char __user *buf, size_t count, |
878 | unsigned int flags); | 878 | unsigned int flags); |
879 | asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); | 879 | asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); |
880 | |||
881 | asmlinkage long sys_execveat(int dfd, const char __user *filename, | ||
882 | const char __user *const __user *argv, | ||
883 | const char __user *const __user *envp, int flags); | ||
884 | |||
880 | #endif | 885 | #endif |
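The kernel-side pieces of execveat(2) are the sys_execveat()/compat_sys_execveat() prototypes here and in compat.h, do_execveat() in sched.h, and the syscall number assignment below. From userspace, until libc grows a wrapper, the call goes through syscall(2); a hedged sketch (assumes headers updated so that __NR_execveat and AT_EMPTY_PATH are defined):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Execute the binary referenced by an O_PATH fd, fexecve(3)-style. */
int main(void)
{
	char *argv[] = { "true", NULL };
	char *envp[] = { NULL };
	int fd = open("/bin/true", O_PATH | O_CLOEXEC);

	if (fd < 0)
		return 1;
	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	perror("execveat");	/* reached only if the call failed */
	return 1;
}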
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 730334cdf037..9246d32dc973 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -90,6 +90,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
90 | #ifdef CONFIG_DEBUG_VM_VMACACHE | 90 | #ifdef CONFIG_DEBUG_VM_VMACACHE |
91 | VMACACHE_FIND_CALLS, | 91 | VMACACHE_FIND_CALLS, |
92 | VMACACHE_FIND_HITS, | 92 | VMACACHE_FIND_HITS, |
93 | VMACACHE_FULL_FLUSHES, | ||
93 | #endif | 94 | #endif |
94 | NR_VM_EVENT_ITEMS | 95 | NR_VM_EVENT_ITEMS |
95 | }; | 96 | }; |
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 22749c134117..e016bd9b1a04 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h | |||
@@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom) | |||
707 | __SYSCALL(__NR_memfd_create, sys_memfd_create) | 707 | __SYSCALL(__NR_memfd_create, sys_memfd_create) |
708 | #define __NR_bpf 280 | 708 | #define __NR_bpf 280 |
709 | __SYSCALL(__NR_bpf, sys_bpf) | 709 | __SYSCALL(__NR_bpf, sys_bpf) |
710 | #define __NR_execveat 281 | ||
711 | __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) | ||
710 | 712 | ||
711 | #undef __NR_syscalls | 713 | #undef __NR_syscalls |
712 | #define __NR_syscalls 281 | 714 | #define __NR_syscalls 282 |
713 | 715 | ||
714 | /* | 716 | /* |
715 | * All syscalls below here should go away really, | 717 | * All syscalls below here should go away really, |
diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index a70375526578..f51c8001dbe5 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h | |||
@@ -51,16 +51,28 @@ struct msginfo { | |||
51 | }; | 51 | }; |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Scaling factor to compute msgmni: | 54 | * MSGMNI, MSGMAX and MSGMNB are default values which can be |
55 | * the memory dedicated to msg queues (msgmni * msgmnb) should occupy | 55 | * modified by sysctl. |
56 | * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): | 56 | * |
57 | * up to 8MB : msgmni = 16 (MSGMNI) | 57 | * MSGMNI is the upper limit for the number of message queues per |
58 | * 4 GB : msgmni = 8K | 58 | * namespace. |
59 | * more than 16 GB : msgmni = 32K (IPCMNI) | 59 | * It has been chosen to be as large as possible without facilitating |
60 | * scenarios where userspace causes overflows when adjusting the limits via | ||
61 | * operations of the form "retrieve current limit; add X; update limit". | ||
62 | * | ||
63 | * MSGMNB is the default size of a new message queue. Non-root tasks can | ||
64 | * decrease the size with msgctl(IPC_SET), root tasks | ||
65 | * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue | ||
66 | * size. The optimal value is application dependent. | ||
67 | * 16384 is used because it was always used (since 0.99.10) | ||
68 | * | ||
69 | * MSGMAX is the maximum size of an individual message; it's a global | ||
70 | * (per-namespace) limit that applies to all message queues. | ||
71 | * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into | ||
72 | * the queue. This is also an arbitrary choice (since 2.6.0). | ||
60 | */ | 73 | */ |
61 | #define MSG_MEM_SCALE 32 | ||
62 | 74 | ||
63 | #define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ | 75 | #define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */ |
64 | #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ | 76 | #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ |
65 | #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ | 77 | #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ |
66 | 78 | ||
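Since all three constants above are now plain sysctl defaults, a program that cares about the effective limits should query the kernel rather than hard-code them. A small sketch, using msgctl(IPC_INFO) as documented in msgctl(2); _GNU_SOURCE is assumed so that struct msginfo and IPC_INFO are visible.

/* Sketch: read the live msg limits instead of relying on the defaults above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	struct msginfo info;

	/* IPC_INFO fills a struct msginfo; the cast follows msgctl(2). */
	if (msgctl(0, IPC_INFO, (struct msqid_ds *)&info) < 0) {
		perror("msgctl");
		return 1;
	}
	printf("msgmni=%d msgmax=%d msgmnb=%d\n",
	       info.msgmni, info.msgmax, info.msgmnb);
	return 0;
}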
diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 541fce03b50c..dd73b908b2f3 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h | |||
@@ -63,10 +63,22 @@ struct seminfo { | |||
63 | int semaem; | 63 | int semaem; |
64 | }; | 64 | }; |
65 | 65 | ||
66 | #define SEMMNI 128 /* <= IPCMNI max # of semaphore identifiers */ | 66 | /* |
67 | #define SEMMSL 250 /* <= 8 000 max num of semaphores per id */ | 67 | * SEMMNI, SEMMSL and SEMMNS are default values which can be |
68 | * modified by sysctl. | ||
69 | * The values have been chosen to be larger than necessary for any | ||
70 | * known configuration. | ||
71 | * | ||
72 | * SEMOPM should not be increased beyond 1000, otherwise there is the | ||
73 | * risk that semop()/semtimedop() fails due to kernel memory fragmentation when | ||
74 | * allocating the sop array. | ||
75 | */ | ||
76 | |||
77 | |||
78 | #define SEMMNI 32000 /* <= IPCMNI max # of semaphore identifiers */ | ||
79 | #define SEMMSL 32000 /* <= INT_MAX max num of semaphores per id */ | ||
68 | #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ | 80 | #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ |
69 | #define SEMOPM 32 /* <= 1 000 max num of ops per semop call */ | 81 | #define SEMOPM 500 /* <= 1 000 max num of ops per semop call */ |
70 | #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ | 82 | #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ |
71 | #define SEMAEM SEMVMX /* adjust on exit max value */ | 83 | #define SEMAEM SEMVMX /* adjust on exit max value */ |
72 | 84 | ||
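A rough size check behind the SEMOPM comment above, assuming struct sembuf is 6 bytes on Linux: 500 operations keep the sop array that semop()/semtimedop() must kmalloc at about 3 KB, a single small slab object, while values far above 1000 push the allocation into multi-page contiguous territory that can fail on a fragmented system.

/* Sketch: the per-call sop-array size for different operation counts. */
#include <stdio.h>
#include <sys/sem.h>

int main(void)
{
	printf("500 ops  -> %zu bytes\n", 500 * sizeof(struct sembuf));	/* ~3000 */
	printf("5000 ops -> %zu bytes\n", 5000 * sizeof(struct sembuf));	/* ~30000 */
	return 0;
}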
diff --git a/init/main.c b/init/main.c index ca380ec685de..ed7e7ad5fee0 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/mempolicy.h> | 51 | #include <linux/mempolicy.h> |
52 | #include <linux/key.h> | 52 | #include <linux/key.h> |
53 | #include <linux/buffer_head.h> | 53 | #include <linux/buffer_head.h> |
54 | #include <linux/page_ext.h> | ||
54 | #include <linux/debug_locks.h> | 55 | #include <linux/debug_locks.h> |
55 | #include <linux/debugobjects.h> | 56 | #include <linux/debugobjects.h> |
56 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
@@ -484,6 +485,11 @@ void __init __weak thread_info_cache_init(void) | |||
484 | */ | 485 | */ |
485 | static void __init mm_init(void) | 486 | static void __init mm_init(void) |
486 | { | 487 | { |
488 | /* | ||
489 | * page_ext requires contiguous pages, | ||
490 | * bigger than MAX_ORDER unless SPARSEMEM. | ||
491 | */ | ||
492 | page_ext_init_flatmem(); | ||
487 | mem_init(); | 493 | mem_init(); |
488 | kmem_cache_init(); | 494 | kmem_cache_init(); |
489 | percpu_init_late(); | 495 | percpu_init_late(); |
@@ -621,6 +627,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
621 | initrd_start = 0; | 627 | initrd_start = 0; |
622 | } | 628 | } |
623 | #endif | 629 | #endif |
630 | page_ext_init(); | ||
624 | debug_objects_mem_init(); | 631 | debug_objects_mem_init(); |
625 | kmemleak_init(); | 632 | kmemleak_init(); |
626 | setup_per_cpu_pageset(); | 633 | setup_per_cpu_pageset(); |
diff --git a/ipc/Makefile b/ipc/Makefile index 9075e172e52c..86c7300ecdf5 100644 --- a/ipc/Makefile +++ b/ipc/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o | 5 | obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o |
6 | obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o | 6 | obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o |
7 | obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o | 7 | obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o |
8 | obj_mq-$(CONFIG_COMPAT) += compat_mq.o | 8 | obj_mq-$(CONFIG_COMPAT) += compat_mq.o |
9 | obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) | 9 | obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) |
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index e8075b247497..8ad93c29f511 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c | |||
@@ -62,29 +62,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, | |||
62 | return err; | 62 | return err; |
63 | } | 63 | } |
64 | 64 | ||
65 | static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, | ||
66 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
67 | { | ||
68 | struct ctl_table ipc_table; | ||
69 | size_t lenp_bef = *lenp; | ||
70 | int rc; | ||
71 | |||
72 | memcpy(&ipc_table, table, sizeof(ipc_table)); | ||
73 | ipc_table.data = get_ipc(table); | ||
74 | |||
75 | rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); | ||
76 | |||
77 | if (write && !rc && lenp_bef == *lenp) | ||
78 | /* | ||
79 | * Tunable has successfully been changed by hand. Disable its | ||
80 | * automatic adjustment. This simply requires unregistering | ||
81 | * the notifiers that trigger recalculation. | ||
82 | */ | ||
83 | unregister_ipcns_notifier(current->nsproxy->ipc_ns); | ||
84 | |||
85 | return rc; | ||
86 | } | ||
87 | |||
88 | static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, | 65 | static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, |
89 | void __user *buffer, size_t *lenp, loff_t *ppos) | 66 | void __user *buffer, size_t *lenp, loff_t *ppos) |
90 | { | 67 | { |
@@ -96,54 +73,19 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, | |||
96 | lenp, ppos); | 73 | lenp, ppos); |
97 | } | 74 | } |
98 | 75 | ||
99 | /* | 76 | static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, |
100 | * Routine that is called when the file "auto_msgmni" has successfully been | ||
101 | * written. | ||
102 | * Two values are allowed: | ||
103 | * 0: unregister msgmni's callback routine from the ipc namespace notifier | ||
104 | * chain. This means that msgmni won't be recomputed anymore upon memory | ||
105 | * add/remove or ipc namespace creation/removal. | ||
106 | * 1: register back the callback routine. | ||
107 | */ | ||
108 | static void ipc_auto_callback(int val) | ||
109 | { | ||
110 | if (!val) | ||
111 | unregister_ipcns_notifier(current->nsproxy->ipc_ns); | ||
112 | else { | ||
113 | /* | ||
114 | * Re-enable automatic recomputing only if not already | ||
115 | * enabled. | ||
116 | */ | ||
117 | recompute_msgmni(current->nsproxy->ipc_ns); | ||
118 | cond_register_ipcns_notifier(current->nsproxy->ipc_ns); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, | ||
123 | void __user *buffer, size_t *lenp, loff_t *ppos) | 77 | void __user *buffer, size_t *lenp, loff_t *ppos) |
124 | { | 78 | { |
125 | struct ctl_table ipc_table; | 79 | struct ctl_table ipc_table; |
126 | int oldval; | 80 | int dummy = 0; |
127 | int rc; | ||
128 | 81 | ||
129 | memcpy(&ipc_table, table, sizeof(ipc_table)); | 82 | memcpy(&ipc_table, table, sizeof(ipc_table)); |
130 | ipc_table.data = get_ipc(table); | 83 | ipc_table.data = &dummy; |
131 | oldval = *((int *)(ipc_table.data)); | ||
132 | 84 | ||
133 | rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); | 85 | if (write) |
86 | pr_info_once("writing to auto_msgmni has no effect"); | ||
134 | 87 | ||
135 | if (write && !rc) { | 88 | return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); |
136 | int newval = *((int *)(ipc_table.data)); | ||
137 | /* | ||
138 | * The file "auto_msgmni" has correctly been set. | ||
139 | * React by (un)registering the corresponding tunable, if the | ||
140 | * value has changed. | ||
141 | */ | ||
142 | if (newval != oldval) | ||
143 | ipc_auto_callback(newval); | ||
144 | } | ||
145 | |||
146 | return rc; | ||
147 | } | 89 | } |
148 | 90 | ||
149 | #else | 91 | #else |
@@ -151,8 +93,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, | |||
151 | #define proc_ipc_dointvec NULL | 93 | #define proc_ipc_dointvec NULL |
152 | #define proc_ipc_dointvec_minmax NULL | 94 | #define proc_ipc_dointvec_minmax NULL |
153 | #define proc_ipc_dointvec_minmax_orphans NULL | 95 | #define proc_ipc_dointvec_minmax_orphans NULL |
154 | #define proc_ipc_callback_dointvec_minmax NULL | 96 | #define proc_ipc_auto_msgmni NULL |
155 | #define proc_ipcauto_dointvec_minmax NULL | ||
156 | #endif | 97 | #endif |
157 | 98 | ||
158 | static int zero; | 99 | static int zero; |
@@ -204,11 +145,20 @@ static struct ctl_table ipc_kern_table[] = { | |||
204 | .data = &init_ipc_ns.msg_ctlmni, | 145 | .data = &init_ipc_ns.msg_ctlmni, |
205 | .maxlen = sizeof(init_ipc_ns.msg_ctlmni), | 146 | .maxlen = sizeof(init_ipc_ns.msg_ctlmni), |
206 | .mode = 0644, | 147 | .mode = 0644, |
207 | .proc_handler = proc_ipc_callback_dointvec_minmax, | 148 | .proc_handler = proc_ipc_dointvec_minmax, |
208 | .extra1 = &zero, | 149 | .extra1 = &zero, |
209 | .extra2 = &int_max, | 150 | .extra2 = &int_max, |
210 | }, | 151 | }, |
211 | { | 152 | { |
153 | .procname = "auto_msgmni", | ||
154 | .data = NULL, | ||
155 | .maxlen = sizeof(int), | ||
156 | .mode = 0644, | ||
157 | .proc_handler = proc_ipc_auto_msgmni, | ||
158 | .extra1 = &zero, | ||
159 | .extra2 = &one, | ||
160 | }, | ||
161 | { | ||
212 | .procname = "msgmnb", | 162 | .procname = "msgmnb", |
213 | .data = &init_ipc_ns.msg_ctlmnb, | 163 | .data = &init_ipc_ns.msg_ctlmnb, |
214 | .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), | 164 | .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), |
@@ -224,15 +174,6 @@ static struct ctl_table ipc_kern_table[] = { | |||
224 | .mode = 0644, | 174 | .mode = 0644, |
225 | .proc_handler = proc_ipc_dointvec, | 175 | .proc_handler = proc_ipc_dointvec, |
226 | }, | 176 | }, |
227 | { | ||
228 | .procname = "auto_msgmni", | ||
229 | .data = &init_ipc_ns.auto_msgmni, | ||
230 | .maxlen = sizeof(int), | ||
231 | .mode = 0644, | ||
232 | .proc_handler = proc_ipcauto_dointvec_minmax, | ||
233 | .extra1 = &zero, | ||
234 | .extra2 = &one, | ||
235 | }, | ||
236 | #ifdef CONFIG_CHECKPOINT_RESTORE | 177 | #ifdef CONFIG_CHECKPOINT_RESTORE |
237 | { | 178 | { |
238 | .procname = "sem_next_id", | 179 | .procname = "sem_next_id", |
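The net effect of the new handler is that kernel.auto_msgmni survives as a sysctl for compatibility but is inert: writes are still range-checked against 0/1 and logged once, and reads return the dummy value. A userspace sketch; the /proc path is the standard sysctl mapping and the read-back value of 0 follows from the dummy variable in the handler above.

/* Sketch: demonstrate that auto_msgmni is now a no-op stub. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/auto_msgmni";
	FILE *f = fopen(path, "w");
	int val = -1;

	if (f) {
		fprintf(f, "1\n");	/* accepted, triggers pr_info_once(), changes nothing */
		fclose(f);
	}
	f = fopen(path, "r");
	if (f) {
		fscanf(f, "%d", &val);	/* always reads the dummy value 0 */
		fclose(f);
	}
	printf("auto_msgmni = %d\n", val);
	return 0;
}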
diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c deleted file mode 100644 index b9b31a4f77e1..000000000000 --- a/ipc/ipcns_notifier.c +++ /dev/null | |||
@@ -1,92 +0,0 @@ | |||
1 | /* | ||
2 | * linux/ipc/ipcns_notifier.c | ||
3 | * Copyright (C) 2007 BULL SA. Nadia Derbey | ||
4 | * | ||
5 | * Notification mechanism for ipc namespaces: | ||
6 | * The callback routine registered in the memory chain invokes the ipcns | ||
7 | * notifier chain with the IPCNS_MEMCHANGED event. | ||
8 | * Each callback routine registered in the ipcns namespace recomputes msgmni | ||
9 | * for the owning namespace. | ||
10 | */ | ||
11 | |||
12 | #include <linux/msg.h> | ||
13 | #include <linux/rcupdate.h> | ||
14 | #include <linux/notifier.h> | ||
15 | #include <linux/nsproxy.h> | ||
16 | #include <linux/ipc_namespace.h> | ||
17 | |||
18 | #include "util.h" | ||
19 | |||
20 | |||
21 | |||
22 | static BLOCKING_NOTIFIER_HEAD(ipcns_chain); | ||
23 | |||
24 | |||
25 | static int ipcns_callback(struct notifier_block *self, | ||
26 | unsigned long action, void *arg) | ||
27 | { | ||
28 | struct ipc_namespace *ns; | ||
29 | |||
30 | switch (action) { | ||
31 | case IPCNS_MEMCHANGED: /* amount of lowmem has changed */ | ||
32 | case IPCNS_CREATED: | ||
33 | case IPCNS_REMOVED: | ||
34 | /* | ||
35 | * It's time to recompute msgmni | ||
36 | */ | ||
37 | ns = container_of(self, struct ipc_namespace, ipcns_nb); | ||
38 | /* | ||
39 | * No need to get a reference on the ns: the 1st job of | ||
40 | * free_ipc_ns() is to unregister the callback routine. | ||
41 | * blocking_notifier_chain_unregister takes the wr lock to do | ||
42 | * it. | ||
43 | * When this callback routine is called the rd lock is held by | ||
44 | * blocking_notifier_call_chain. | ||
45 | * So the ipc ns cannot be freed while we are here. | ||
46 | */ | ||
47 | recompute_msgmni(ns); | ||
48 | break; | ||
49 | default: | ||
50 | break; | ||
51 | } | ||
52 | |||
53 | return NOTIFY_OK; | ||
54 | } | ||
55 | |||
56 | int register_ipcns_notifier(struct ipc_namespace *ns) | ||
57 | { | ||
58 | int rc; | ||
59 | |||
60 | memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); | ||
61 | ns->ipcns_nb.notifier_call = ipcns_callback; | ||
62 | ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; | ||
63 | rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); | ||
64 | if (!rc) | ||
65 | ns->auto_msgmni = 1; | ||
66 | return rc; | ||
67 | } | ||
68 | |||
69 | int cond_register_ipcns_notifier(struct ipc_namespace *ns) | ||
70 | { | ||
71 | int rc; | ||
72 | |||
73 | memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); | ||
74 | ns->ipcns_nb.notifier_call = ipcns_callback; | ||
75 | ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; | ||
76 | rc = blocking_notifier_chain_cond_register(&ipcns_chain, | ||
77 | &ns->ipcns_nb); | ||
78 | if (!rc) | ||
79 | ns->auto_msgmni = 1; | ||
80 | return rc; | ||
81 | } | ||
82 | |||
83 | void unregister_ipcns_notifier(struct ipc_namespace *ns) | ||
84 | { | ||
85 | blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); | ||
86 | ns->auto_msgmni = 0; | ||
87 | } | ||
88 | |||
89 | int ipcns_notify(unsigned long val) | ||
90 | { | ||
91 | return blocking_notifier_call_chain(&ipcns_chain, val, NULL); | ||
92 | } | ||
@@ -989,43 +989,12 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, | |||
989 | return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); | 989 | return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); |
990 | } | 990 | } |
991 | 991 | ||
992 | /* | ||
993 | * Scale msgmni with the available lowmem size: the memory dedicated to msg | ||
994 | * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. | ||
995 | * Also take into account the number of nsproxies created so far. | ||
996 | * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. | ||
997 | */ | ||
998 | void recompute_msgmni(struct ipc_namespace *ns) | ||
999 | { | ||
1000 | struct sysinfo i; | ||
1001 | unsigned long allowed; | ||
1002 | int nb_ns; | ||
1003 | |||
1004 | si_meminfo(&i); | ||
1005 | allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) | ||
1006 | / MSGMNB; | ||
1007 | nb_ns = atomic_read(&nr_ipc_ns); | ||
1008 | allowed /= nb_ns; | ||
1009 | |||
1010 | if (allowed < MSGMNI) { | ||
1011 | ns->msg_ctlmni = MSGMNI; | ||
1012 | return; | ||
1013 | } | ||
1014 | |||
1015 | if (allowed > IPCMNI / nb_ns) { | ||
1016 | ns->msg_ctlmni = IPCMNI / nb_ns; | ||
1017 | return; | ||
1018 | } | ||
1019 | |||
1020 | ns->msg_ctlmni = allowed; | ||
1021 | } | ||
1022 | 992 | ||
1023 | void msg_init_ns(struct ipc_namespace *ns) | 993 | void msg_init_ns(struct ipc_namespace *ns) |
1024 | { | 994 | { |
1025 | ns->msg_ctlmax = MSGMAX; | 995 | ns->msg_ctlmax = MSGMAX; |
1026 | ns->msg_ctlmnb = MSGMNB; | 996 | ns->msg_ctlmnb = MSGMNB; |
1027 | 997 | ns->msg_ctlmni = MSGMNI; | |
1028 | recompute_msgmni(ns); | ||
1029 | 998 | ||
1030 | atomic_set(&ns->msg_bytes, 0); | 999 | atomic_set(&ns->msg_bytes, 0); |
1031 | atomic_set(&ns->msg_hdrs, 0); | 1000 | atomic_set(&ns->msg_hdrs, 0); |
@@ -1069,9 +1038,6 @@ void __init msg_init(void) | |||
1069 | { | 1038 | { |
1070 | msg_init_ns(&init_ipc_ns); | 1039 | msg_init_ns(&init_ipc_ns); |
1071 | 1040 | ||
1072 | printk(KERN_INFO "msgmni has been set to %d\n", | ||
1073 | init_ipc_ns.msg_ctlmni); | ||
1074 | |||
1075 | ipc_init_proc_interface("sysvipc/msg", | 1041 | ipc_init_proc_interface("sysvipc/msg", |
1076 | " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", | 1042 | " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", |
1077 | IPC_MSG_IDS, sysvipc_msg_proc_show); | 1043 | IPC_MSG_IDS, sysvipc_msg_proc_show); |
diff --git a/ipc/namespace.c b/ipc/namespace.c index b54468e48e32..1a3ffd40356e 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c | |||
@@ -45,14 +45,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, | |||
45 | msg_init_ns(ns); | 45 | msg_init_ns(ns); |
46 | shm_init_ns(ns); | 46 | shm_init_ns(ns); |
47 | 47 | ||
48 | /* | ||
49 | * msgmni has already been computed for the new ipc ns. | ||
50 | * Thus, do the ipcns creation notification before registering that | ||
51 | * new ipcns in the chain. | ||
52 | */ | ||
53 | ipcns_notify(IPCNS_CREATED); | ||
54 | register_ipcns_notifier(ns); | ||
55 | |||
56 | ns->user_ns = get_user_ns(user_ns); | 48 | ns->user_ns = get_user_ns(user_ns); |
57 | 49 | ||
58 | return ns; | 50 | return ns; |
@@ -99,25 +91,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, | |||
99 | 91 | ||
100 | static void free_ipc_ns(struct ipc_namespace *ns) | 92 | static void free_ipc_ns(struct ipc_namespace *ns) |
101 | { | 93 | { |
102 | /* | ||
103 | * Unregistering the hotplug notifier at the beginning guarantees | ||
104 | * that the ipc namespace won't be freed while we are inside the | ||
105 | * callback routine. Since the blocking_notifier_chain_XXX routines | ||
106 | * hold a rw lock on the notifier list, unregister_ipcns_notifier() | ||
107 | * won't take the rw lock before blocking_notifier_call_chain() has | ||
108 | * released the rd lock. | ||
109 | */ | ||
110 | unregister_ipcns_notifier(ns); | ||
111 | sem_exit_ns(ns); | 94 | sem_exit_ns(ns); |
112 | msg_exit_ns(ns); | 95 | msg_exit_ns(ns); |
113 | shm_exit_ns(ns); | 96 | shm_exit_ns(ns); |
114 | atomic_dec(&nr_ipc_ns); | 97 | atomic_dec(&nr_ipc_ns); |
115 | 98 | ||
116 | /* | ||
117 | * Do the ipcns removal notification after decrementing nr_ipc_ns in | ||
118 | * order to have a correct value when recomputing msgmni. | ||
119 | */ | ||
120 | ipcns_notify(IPCNS_REMOVED); | ||
121 | put_user_ns(ns->user_ns); | 99 | put_user_ns(ns->user_ns); |
122 | proc_free_inum(ns->proc_inum); | 100 | proc_free_inum(ns->proc_inum); |
123 | kfree(ns); | 101 | kfree(ns); |
@@ -326,10 +326,17 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, | |||
326 | 326 | ||
327 | /* Then check that the global lock is free */ | 327 | /* Then check that the global lock is free */ |
328 | if (!spin_is_locked(&sma->sem_perm.lock)) { | 328 | if (!spin_is_locked(&sma->sem_perm.lock)) { |
329 | /* spin_is_locked() is not a memory barrier */ | 329 | /* |
330 | smp_mb(); | 330 | * The ipc object lock check must be visible on all |
331 | * cores before rechecking the complex count. Otherwise | ||
332 | * we can race with another thread that does: | ||
333 | * complex_count++; | ||
334 | * spin_unlock(sem_perm.lock); | ||
335 | */ | ||
336 | smp_rmb(); | ||
331 | 337 | ||
332 | /* Now repeat the test of complex_count: | 338 | /* |
339 | * Now repeat the test of complex_count: | ||
333 | * It can't change anymore until we drop sem->lock. | 340 | * It can't change anymore until we drop sem->lock. |
334 | * Thus: if it is now 0, then it will stay 0. | 341 | * Thus: if it is now 0, then it will stay 0. |
335 | */ | 342 | */ |
@@ -219,7 +219,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) | |||
219 | if (!is_file_hugepages(shm_file)) | 219 | if (!is_file_hugepages(shm_file)) |
220 | shmem_lock(shm_file, 0, shp->mlock_user); | 220 | shmem_lock(shm_file, 0, shp->mlock_user); |
221 | else if (shp->mlock_user) | 221 | else if (shp->mlock_user) |
222 | user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); | 222 | user_shm_unlock(i_size_read(file_inode(shm_file)), |
223 | shp->mlock_user); | ||
223 | fput(shm_file); | 224 | fput(shm_file); |
224 | ipc_rcu_putref(shp, shm_rcu_free); | 225 | ipc_rcu_putref(shp, shm_rcu_free); |
225 | } | 226 | } |
@@ -1229,6 +1230,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
1229 | int retval = -EINVAL; | 1230 | int retval = -EINVAL; |
1230 | #ifdef CONFIG_MMU | 1231 | #ifdef CONFIG_MMU |
1231 | loff_t size = 0; | 1232 | loff_t size = 0; |
1233 | struct file *file; | ||
1232 | struct vm_area_struct *next; | 1234 | struct vm_area_struct *next; |
1233 | #endif | 1235 | #endif |
1234 | 1236 | ||
@@ -1245,7 +1247,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
1245 | * started at address shmaddr. It records its size and then unmaps | 1247 | * started at address shmaddr. It records its size and then unmaps |
1246 | * it. | 1248 | * it. |
1247 | * - Then it unmaps all shm vmas that started at shmaddr and that | 1249 | * - Then it unmaps all shm vmas that started at shmaddr and that |
1248 | * are within the initially determined size. | 1250 | * are within the initially determined size and that are from the |
1251 | * same shm segment from which we determined the size. | ||
1249 | * Errors from do_munmap are ignored: the function only fails if | 1252 | * Errors from do_munmap are ignored: the function only fails if |
1250 | * it's called with invalid parameters or if it's called to unmap | 1253 | * it's called with invalid parameters or if it's called to unmap |
1251 | * a part of a vma. Both calls in this function are for full vmas, | 1254 | * a part of a vma. Both calls in this function are for full vmas, |
@@ -1271,8 +1274,14 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
1271 | if ((vma->vm_ops == &shm_vm_ops) && | 1274 | if ((vma->vm_ops == &shm_vm_ops) && |
1272 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { | 1275 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { |
1273 | 1276 | ||
1274 | 1277 | /* | |
1275 | size = file_inode(vma->vm_file)->i_size; | 1278 | * Record the file of the shm segment being |
1279 | * unmapped. With mremap(), someone could place | ||
1280 | * page from another segment but with equal offsets | ||
1281 | * in the range we are unmapping. | ||
1282 | */ | ||
1283 | file = vma->vm_file; | ||
1284 | size = i_size_read(file_inode(vma->vm_file)); | ||
1276 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1285 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1277 | /* | 1286 | /* |
1278 | * We discovered the size of the shm segment, so | 1287 | * We discovered the size of the shm segment, so |
@@ -1298,8 +1307,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
1298 | 1307 | ||
1299 | /* finding a matching vma now does not alter retval */ | 1308 | /* finding a matching vma now does not alter retval */ |
1300 | if ((vma->vm_ops == &shm_vm_ops) && | 1309 | if ((vma->vm_ops == &shm_vm_ops) && |
1301 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) | 1310 | ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && |
1302 | 1311 | (vma->vm_file == file)) | |
1303 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1312 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1304 | vma = next; | 1313 | vma = next; |
1305 | } | 1314 | } |
diff --git a/ipc/util.c b/ipc/util.c index 88adc329888c..106bed0378ab 100644 --- a/ipc/util.c +++ b/ipc/util.c | |||
@@ -71,44 +71,6 @@ struct ipc_proc_iface { | |||
71 | int (*show)(struct seq_file *, void *); | 71 | int (*show)(struct seq_file *, void *); |
72 | }; | 72 | }; |
73 | 73 | ||
74 | static void ipc_memory_notifier(struct work_struct *work) | ||
75 | { | ||
76 | ipcns_notify(IPCNS_MEMCHANGED); | ||
77 | } | ||
78 | |||
79 | static int ipc_memory_callback(struct notifier_block *self, | ||
80 | unsigned long action, void *arg) | ||
81 | { | ||
82 | static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier); | ||
83 | |||
84 | switch (action) { | ||
85 | case MEM_ONLINE: /* memory successfully brought online */ | ||
86 | case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */ | ||
87 | /* | ||
88 | * This is done by invoking the ipcns notifier chain with the | ||
89 | * IPC_MEMCHANGED event. | ||
90 | * In order not to keep the lock on the hotplug memory chain | ||
91 | * for too long, queue a work item that will, when waken up, | ||
92 | * activate the ipcns notification chain. | ||
93 | */ | ||
94 | schedule_work(&ipc_memory_wq); | ||
95 | break; | ||
96 | case MEM_GOING_ONLINE: | ||
97 | case MEM_GOING_OFFLINE: | ||
98 | case MEM_CANCEL_ONLINE: | ||
99 | case MEM_CANCEL_OFFLINE: | ||
100 | default: | ||
101 | break; | ||
102 | } | ||
103 | |||
104 | return NOTIFY_OK; | ||
105 | } | ||
106 | |||
107 | static struct notifier_block ipc_memory_nb = { | ||
108 | .notifier_call = ipc_memory_callback, | ||
109 | .priority = IPC_CALLBACK_PRI, | ||
110 | }; | ||
111 | |||
112 | /** | 74 | /** |
113 | * ipc_init - initialise ipc subsystem | 75 | * ipc_init - initialise ipc subsystem |
114 | * | 76 | * |
@@ -124,8 +86,6 @@ static int __init ipc_init(void) | |||
124 | sem_init(); | 86 | sem_init(); |
125 | msg_init(); | 87 | msg_init(); |
126 | shm_init(); | 88 | shm_init(); |
127 | register_hotmemory_notifier(&ipc_memory_nb); | ||
128 | register_ipcns_notifier(&init_ipc_ns); | ||
129 | return 0; | 89 | return 0; |
130 | } | 90 | } |
131 | device_initcall(ipc_init); | 91 | device_initcall(ipc_init); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e015570..2e0c97427b33 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) | |||
174 | struct fsnotify_mark *entry = &chunk->mark; | 174 | struct fsnotify_mark *entry = &chunk->mark; |
175 | struct list_head *list; | 175 | struct list_head *list; |
176 | 176 | ||
177 | if (!entry->i.inode) | 177 | if (!entry->inode) |
178 | return; | 178 | return; |
179 | list = chunk_hash(entry->i.inode); | 179 | list = chunk_hash(entry->inode); |
180 | list_add_rcu(&chunk->hash, list); | 180 | list_add_rcu(&chunk->hash, list); |
181 | } | 181 | } |
182 | 182 | ||
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
188 | 188 | ||
189 | list_for_each_entry_rcu(p, list, hash) { | 189 | list_for_each_entry_rcu(p, list, hash) { |
190 | /* mark.inode may have gone NULL, but who cares? */ | 190 | /* mark.inode may have gone NULL, but who cares? */ |
191 | if (p->mark.i.inode == inode) { | 191 | if (p->mark.inode == inode) { |
192 | atomic_long_inc(&p->refs); | 192 | atomic_long_inc(&p->refs); |
193 | return p; | 193 | return p; |
194 | } | 194 | } |
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) | |||
231 | new = alloc_chunk(size); | 231 | new = alloc_chunk(size); |
232 | 232 | ||
233 | spin_lock(&entry->lock); | 233 | spin_lock(&entry->lock); |
234 | if (chunk->dead || !entry->i.inode) { | 234 | if (chunk->dead || !entry->inode) { |
235 | spin_unlock(&entry->lock); | 235 | spin_unlock(&entry->lock); |
236 | if (new) | 236 | if (new) |
237 | free_chunk(new); | 237 | free_chunk(new); |
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
258 | goto Fallback; | 258 | goto Fallback; |
259 | 259 | ||
260 | fsnotify_duplicate_mark(&new->mark, entry); | 260 | fsnotify_duplicate_mark(&new->mark, entry); |
261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { |
262 | fsnotify_put_mark(&new->mark); | 262 | fsnotify_put_mark(&new->mark); |
263 | goto Fallback; | 263 | goto Fallback; |
264 | } | 264 | } |
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
386 | chunk_entry = &chunk->mark; | 386 | chunk_entry = &chunk->mark; |
387 | 387 | ||
388 | spin_lock(&old_entry->lock); | 388 | spin_lock(&old_entry->lock); |
389 | if (!old_entry->i.inode) { | 389 | if (!old_entry->inode) { |
390 | /* old_entry is being shot, lets just lie */ | 390 | /* old_entry is being shot, lets just lie */ |
391 | spin_unlock(&old_entry->lock); | 391 | spin_unlock(&old_entry->lock); |
392 | fsnotify_put_mark(old_entry); | 392 | fsnotify_put_mark(old_entry); |
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
395 | } | 395 | } |
396 | 396 | ||
397 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { |
399 | spin_unlock(&old_entry->lock); | 399 | spin_unlock(&old_entry->lock); |
400 | fsnotify_put_mark(chunk_entry); | 400 | fsnotify_put_mark(chunk_entry); |
401 | fsnotify_put_mark(old_entry); | 401 | fsnotify_put_mark(old_entry); |
@@ -611,7 +611,7 @@ void audit_trim_trees(void) | |||
611 | list_for_each_entry(node, &tree->chunks, list) { | 611 | list_for_each_entry(node, &tree->chunks, list) { |
612 | struct audit_chunk *chunk = find_chunk(node); | 612 | struct audit_chunk *chunk = find_chunk(node); |
613 | /* this could be NULL if the watch is dying else where... */ | 613 | /* this could be NULL if the watch is dying else where... */ |
614 | struct inode *inode = chunk->mark.i.inode; | 614 | struct inode *inode = chunk->mark.inode; |
615 | node->index |= 1U<<31; | 615 | node->index |= 1U<<31; |
616 | if (iterate_mounts(compare_root, inode, root_mnt)) | 616 | if (iterate_mounts(compare_root, inode, root_mnt)) |
617 | node->index &= ~(1U<<31); | 617 | node->index &= ~(1U<<31); |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c5..995a95f61a19 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
724 | int more = 0; | 724 | int more = 0; |
725 | 725 | ||
726 | again: | 726 | again: |
727 | mutex_lock(&mapping->i_mmap_mutex); | 727 | i_mmap_lock_read(mapping); |
728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
729 | if (!valid_vma(vma, is_register)) | 729 | if (!valid_vma(vma, is_register)) |
730 | continue; | 730 | continue; |
731 | 731 | ||
732 | if (!prev && !more) { | 732 | if (!prev && !more) { |
733 | /* | 733 | /* |
734 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | 734 | * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through |
735 | * reclaim. This is optimistic, no harm done if it fails. | 735 | * reclaim. This is optimistic, no harm done if it fails. |
736 | */ | 736 | */ |
737 | prev = kmalloc(sizeof(struct map_info), | 737 | prev = kmalloc(sizeof(struct map_info), |
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
755 | info->mm = vma->vm_mm; | 755 | info->mm = vma->vm_mm; |
756 | info->vaddr = offset_to_vaddr(vma, offset); | 756 | info->vaddr = offset_to_vaddr(vma, offset); |
757 | } | 757 | } |
758 | mutex_unlock(&mapping->i_mmap_mutex); | 758 | i_mmap_unlock_read(mapping); |
759 | 759 | ||
760 | if (!more) | 760 | if (!more) |
761 | goto out; | 761 | goto out; |
diff --git a/kernel/fork.c b/kernel/fork.c index 9ca84189cfc2..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
433 | get_file(file); | 433 | get_file(file); |
434 | if (tmp->vm_flags & VM_DENYWRITE) | 434 | if (tmp->vm_flags & VM_DENYWRITE) |
435 | atomic_dec(&inode->i_writecount); | 435 | atomic_dec(&inode->i_writecount); |
436 | mutex_lock(&mapping->i_mmap_mutex); | 436 | i_mmap_lock_write(mapping); |
437 | if (tmp->vm_flags & VM_SHARED) | 437 | if (tmp->vm_flags & VM_SHARED) |
438 | atomic_inc(&mapping->i_mmap_writable); | 438 | atomic_inc(&mapping->i_mmap_writable); |
439 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
445 | vma_interval_tree_insert_after(tmp, mpnt, | 445 | vma_interval_tree_insert_after(tmp, mpnt, |
446 | &mapping->i_mmap); | 446 | &mapping->i_mmap); |
447 | flush_dcache_mmap_unlock(mapping); | 447 | flush_dcache_mmap_unlock(mapping); |
448 | mutex_unlock(&mapping->i_mmap_mutex); | 448 | i_mmap_unlock_write(mapping); |
449 | } | 449 | } |
450 | 450 | ||
451 | /* | 451 | /* |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bdf..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -32,10 +32,13 @@ config GCOV_KERNEL | |||
32 | Note that the debugfs filesystem has to be mounted to access | 32 | Note that the debugfs filesystem has to be mounted to access |
33 | profiling data. | 33 | profiling data. |
34 | 34 | ||
35 | config ARCH_HAS_GCOV_PROFILE_ALL | ||
36 | def_bool n | ||
37 | |||
35 | config GCOV_PROFILE_ALL | 38 | config GCOV_PROFILE_ALL |
36 | bool "Profile entire Kernel" | 39 | bool "Profile entire Kernel" |
37 | depends on GCOV_KERNEL | 40 | depends on GCOV_KERNEL |
38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 | 41 | depends on ARCH_HAS_GCOV_PROFILE_ALL |
39 | default n | 42 | default n |
40 | ---help--- | 43 | ---help--- |
41 | This option activates profiling for the entire kernel. | 44 | This option activates profiling for the entire kernel. |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | |||
600 | if (!kexec_on_panic) { | 600 | if (!kexec_on_panic) { |
601 | image->swap_page = kimage_alloc_control_pages(image, 0); | 601 | image->swap_page = kimage_alloc_control_pages(image, 0); |
602 | if (!image->swap_page) { | 602 | if (!image->swap_page) { |
603 | pr_err(KERN_ERR "Could not allocate swap buffer\n"); | 603 | pr_err("Could not allocate swap buffer\n"); |
604 | goto out_free_control_pages; | 604 | goto out_free_control_pages; |
605 | } | 605 | } |
606 | } | 606 | } |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
25 | } | 25 | } |
26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
27 | 27 | ||
28 | int snprint_stack_trace(char *buf, size_t size, | ||
29 | struct stack_trace *trace, int spaces) | ||
30 | { | ||
31 | int i; | ||
32 | unsigned long ip; | ||
33 | int generated; | ||
34 | int total = 0; | ||
35 | |||
36 | if (WARN_ON(!trace->entries)) | ||
37 | return 0; | ||
38 | |||
39 | for (i = 0; i < trace->nr_entries; i++) { | ||
40 | ip = trace->entries[i]; | ||
41 | generated = snprintf(buf, size, "%*c[<%p>] %pS\n", | ||
42 | 1 + spaces, ' ', (void *) ip, (void *) ip); | ||
43 | |||
44 | total += generated; | ||
45 | |||
46 | /* Assume that generated isn't a negative number */ | ||
47 | if (generated >= size) { | ||
48 | buf += size; | ||
49 | size = 0; | ||
50 | } else { | ||
51 | buf += generated; | ||
52 | size -= generated; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | return total; | ||
57 | } | ||
58 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | ||
59 | |||
28 | /* | 60 | /* |
29 | * Architectures that do not implement save_stack_trace_tsk or | 61 | * Architectures that do not implement save_stack_trace_tsk or |
30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning | 62 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
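A minimal in-kernel usage sketch for the new helper; the function name and signature come from the hunk above, while the caller, buffer sizes and skip depth are illustrative, and CONFIG_STACKTRACE is assumed so that save_stack_trace() is available.

/* Sketch: render the current call chain into a buffer instead of the log. */
#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void report_call_chain(void)
{
	unsigned long entries[16];
	char buf[512];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 1,	/* drop this helper from the trace */
	};

	save_stack_trace(&trace);
	snprint_stack_trace(buf, sizeof(buf), &trace, 1);
	pr_info("call chain:\n%s", buf);
}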
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 61eea02b53f5..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -226,3 +226,6 @@ cond_syscall(sys_seccomp); | |||
226 | 226 | ||
227 | /* access BPF programs and maps */ | 227 | /* access BPF programs and maps */ |
228 | cond_syscall(sys_bpf); | 228 | cond_syscall(sys_bpf); |
229 | |||
230 | /* execveat */ | ||
231 | cond_syscall(sys_execveat); | ||
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d780351835e9..5f2ce616c046 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -227,6 +227,22 @@ config UNUSED_SYMBOLS | |||
227 | you really need it, and what the merge plan to the mainline kernel for | 227 | you really need it, and what the merge plan to the mainline kernel for |
228 | your module is. | 228 | your module is. |
229 | 229 | ||
230 | config PAGE_OWNER | ||
231 | bool "Track page owner" | ||
232 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT | ||
233 | select DEBUG_FS | ||
234 | select STACKTRACE | ||
235 | select PAGE_EXTENSION | ||
236 | help | ||
237 | This keeps track of which call chain is the owner of a page; it may | ||
238 | help to find bare alloc_page(s) leaks. Even if you include this | ||
239 | feature in your build, it is disabled by default. You should pass | ||
240 | "page_owner=on" as a boot parameter to enable it. It eats | ||
241 | a fair amount of memory when enabled. See tools/vm/page_owner_sort.c | ||
242 | for a user-space helper. | ||
243 | |||
244 | If unsure, say N. | ||
245 | |||
230 | config DEBUG_FS | 246 | config DEBUG_FS |
231 | bool "Debug Filesystem" | 247 | bool "Debug Filesystem" |
232 | help | 248 | help |
diff --git a/lib/audit.c b/lib/audit.c index 1d726a22565b..b8fb5ee81e26 100644 --- a/lib/audit.c +++ b/lib/audit.c | |||
@@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall) | |||
54 | case __NR_socketcall: | 54 | case __NR_socketcall: |
55 | return 4; | 55 | return 4; |
56 | #endif | 56 | #endif |
57 | #ifdef __NR_execveat | ||
58 | case __NR_execveat: | ||
59 | #endif | ||
57 | case __NR_execve: | 60 | case __NR_execve: |
58 | return 5; | 61 | return 5; |
59 | default: | 62 | default: |
diff --git a/lib/bitmap.c b/lib/bitmap.c index b499ab6ada29..969ae8fbc85b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c | |||
@@ -326,30 +326,32 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len) | |||
326 | } | 326 | } |
327 | EXPORT_SYMBOL(bitmap_clear); | 327 | EXPORT_SYMBOL(bitmap_clear); |
328 | 328 | ||
329 | /* | 329 | /** |
330 | * bitmap_find_next_zero_area - find a contiguous aligned zero area | 330 | * bitmap_find_next_zero_area_off - find a contiguous aligned zero area |
331 | * @map: The address to base the search on | 331 | * @map: The address to base the search on |
332 | * @size: The bitmap size in bits | 332 | * @size: The bitmap size in bits |
333 | * @start: The bitnumber to start searching at | 333 | * @start: The bitnumber to start searching at |
334 | * @nr: The number of zeroed bits we're looking for | 334 | * @nr: The number of zeroed bits we're looking for |
335 | * @align_mask: Alignment mask for zero area | 335 | * @align_mask: Alignment mask for zero area |
336 | * @align_offset: Alignment offset for zero area. | ||
336 | * | 337 | * |
337 | * The @align_mask should be one less than a power of 2; the effect is that | 338 | * The @align_mask should be one less than a power of 2; the effect is that |
338 | * the bit offset of all zero areas this function finds is multiples of that | 339 | * the bit offset of all zero areas this function finds plus @align_offset |
339 | * power of 2. A @align_mask of 0 means no alignment is required. | 340 | * is a multiple of that power of 2. |
340 | */ | 341 | */ |
341 | unsigned long bitmap_find_next_zero_area(unsigned long *map, | 342 | unsigned long bitmap_find_next_zero_area_off(unsigned long *map, |
342 | unsigned long size, | 343 | unsigned long size, |
343 | unsigned long start, | 344 | unsigned long start, |
344 | unsigned int nr, | 345 | unsigned int nr, |
345 | unsigned long align_mask) | 346 | unsigned long align_mask, |
347 | unsigned long align_offset) | ||
346 | { | 348 | { |
347 | unsigned long index, end, i; | 349 | unsigned long index, end, i; |
348 | again: | 350 | again: |
349 | index = find_next_zero_bit(map, size, start); | 351 | index = find_next_zero_bit(map, size, start); |
350 | 352 | ||
351 | /* Align allocation */ | 353 | /* Align allocation */ |
352 | index = __ALIGN_MASK(index, align_mask); | 354 | index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; |
353 | 355 | ||
354 | end = index + nr; | 356 | end = index + nr; |
355 | if (end > size) | 357 | if (end > size) |
@@ -361,7 +363,7 @@ again: | |||
361 | } | 363 | } |
362 | return index; | 364 | return index; |
363 | } | 365 | } |
364 | EXPORT_SYMBOL(bitmap_find_next_zero_area); | 366 | EXPORT_SYMBOL(bitmap_find_next_zero_area_off); |
365 | 367 | ||
366 | /* | 368 | /* |
367 | * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, | 369 | * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, |
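A worked example of the new @align_offset parameter, reusing the alignment step from the function above with illustrative numbers: the returned index itself need not be aligned, only index + align_offset is. The macro below is a local stand-in for the kernel's __ALIGN_MASK().

/* Sketch: where the offset-aware alignment lands. */
#include <stdio.h>

#define __ALIGN_MASK(x, mask)	(((x) + (mask)) & ~(mask))

int main(void)
{
	unsigned long index = 5;	/* raw find_next_zero_bit() result */
	unsigned long align_mask = 3;	/* align to 4 bits */
	unsigned long align_offset = 2;

	index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
	printf("index=%lu, index+offset=%lu\n", index, index + align_offset);
	/* prints index=6, index+offset=8: 8 is the multiple of 4, not 6 */
	return 0;
}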
diff --git a/lib/decompress.c b/lib/decompress.c index 37f3c786348f..528ff932d8e4 100644 --- a/lib/decompress.c +++ b/lib/decompress.c | |||
@@ -44,8 +44,8 @@ struct compress_format { | |||
44 | }; | 44 | }; |
45 | 45 | ||
46 | static const struct compress_format compressed_formats[] __initconst = { | 46 | static const struct compress_format compressed_formats[] __initconst = { |
47 | { {037, 0213}, "gzip", gunzip }, | 47 | { {0x1f, 0x8b}, "gzip", gunzip }, |
48 | { {037, 0236}, "gzip", gunzip }, | 48 | { {0x1f, 0x9e}, "gzip", gunzip }, |
49 | { {0x42, 0x5a}, "bzip2", bunzip2 }, | 49 | { {0x42, 0x5a}, "bzip2", bunzip2 }, |
50 | { {0x5d, 0x00}, "lzma", unlzma }, | 50 | { {0x5d, 0x00}, "lzma", unlzma }, |
51 | { {0xfd, 0x37}, "xz", unxz }, | 51 | { {0xfd, 0x37}, "xz", unxz }, |
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 8290e0bef7ea..6dd0335ea61b 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c | |||
@@ -184,7 +184,7 @@ static int INIT get_next_block(struct bunzip_data *bd) | |||
184 | if (get_bits(bd, 1)) | 184 | if (get_bits(bd, 1)) |
185 | return RETVAL_OBSOLETE_INPUT; | 185 | return RETVAL_OBSOLETE_INPUT; |
186 | origPtr = get_bits(bd, 24); | 186 | origPtr = get_bits(bd, 24); |
187 | if (origPtr > dbufSize) | 187 | if (origPtr >= dbufSize) |
188 | return RETVAL_DATA_ERROR; | 188 | return RETVAL_DATA_ERROR; |
189 | /* mapping table: if some byte values are never used (encoding things | 189 | /* mapping table: if some byte values are never used (encoding things |
190 | like ascii text), the compression code removes the gaps to have fewer | 190 | like ascii text), the compression code removes the gaps to have fewer |
diff --git a/lib/fault-inject.c b/lib/fault-inject.c index d7d501ea856d..f1cdeb024d17 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c | |||
@@ -40,10 +40,16 @@ EXPORT_SYMBOL_GPL(setup_fault_attr); | |||
40 | 40 | ||
41 | static void fail_dump(struct fault_attr *attr) | 41 | static void fail_dump(struct fault_attr *attr) |
42 | { | 42 | { |
43 | if (attr->verbose > 0) | 43 | if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) { |
44 | printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure\n"); | 44 | printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n" |
45 | if (attr->verbose > 1) | 45 | "name %pd, interval %lu, probability %lu, " |
46 | dump_stack(); | 46 | "space %d, times %d\n", attr->dname, |
47 | attr->interval, attr->probability, | ||
48 | atomic_read(&attr->space), | ||
49 | atomic_read(&attr->times)); | ||
50 | if (attr->verbose > 1) | ||
51 | dump_stack(); | ||
52 | } | ||
47 | } | 53 | } |
48 | 54 | ||
49 | #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) | 55 | #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) |
@@ -202,6 +208,12 @@ struct dentry *fault_create_debugfs_attr(const char *name, | |||
202 | goto fail; | 208 | goto fail; |
203 | if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) | 209 | if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) |
204 | goto fail; | 210 | goto fail; |
211 | if (!debugfs_create_u32("verbose_ratelimit_interval_ms", mode, dir, | ||
212 | &attr->ratelimit_state.interval)) | ||
213 | goto fail; | ||
214 | if (!debugfs_create_u32("verbose_ratelimit_burst", mode, dir, | ||
215 | &attr->ratelimit_state.burst)) | ||
216 | goto fail; | ||
205 | if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) | 217 | if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) |
206 | goto fail; | 218 | goto fail; |
207 | 219 | ||
@@ -222,6 +234,7 @@ struct dentry *fault_create_debugfs_attr(const char *name, | |||
222 | 234 | ||
223 | #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ | 235 | #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ |
224 | 236 | ||
237 | attr->dname = dget(dir); | ||
225 | return dir; | 238 | return dir; |
226 | fail: | 239 | fail: |
227 | debugfs_remove_recursive(dir); | 240 | debugfs_remove_recursive(dir); |
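The two new debugfs files map directly onto ratelimit_state.interval and .burst. A userspace sketch for tuning them follows; the /sys/kernel/debug/failslab path is an assumption (any fault_create_debugfs_attr() user exposes the same pair of files), and the interval unit is taken from the file name.

/* Sketch: cap verbose fault-injection dumps for failslab. */
#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/failslab";
	char path[256];

	snprintf(path, sizeof(path), "%s/verbose_ratelimit_interval_ms", dir);
	write_knob(path, "1000");
	snprintf(path, sizeof(path), "%s/verbose_ratelimit_burst", dir);
	write_knob(path, "10");
	return 0;
}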
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 4b2443254de2..56badfc4810a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -1,8 +1,18 @@ | |||
1 | config PAGE_EXTENSION | ||
2 | bool "Extend memmap on extra space for more information on page" | ||
3 | ---help--- | ||
4 | Extend memmap on extra space for more information on page. This | ||
5 | could be used for debugging features that need to insert an extra | ||
6 | field for every page. This extension enables us to save memory, | ||
7 | since the extra memory is only allocated when the boottime | ||
8 | configuration requests it. | ||
9 | |||
1 | config DEBUG_PAGEALLOC | 10 | config DEBUG_PAGEALLOC |
2 | bool "Debug page memory allocations" | 11 | bool "Debug page memory allocations" |
3 | depends on DEBUG_KERNEL | 12 | depends on DEBUG_KERNEL |
4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC | 13 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
5 | depends on !KMEMCHECK | 14 | depends on !KMEMCHECK |
15 | select PAGE_EXTENSION | ||
6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
7 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC | 17 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC |
8 | ---help--- | 18 | ---help--- |
diff --git a/mm/Makefile b/mm/Makefile index b3c6ce932c64..4bf586e66378 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | |||
63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
64 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 64 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
65 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 65 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
66 | obj-$(CONFIG_PAGE_OWNER) += page_owner.o | ||
66 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 67 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
67 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | 68 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o |
68 | obj-$(CONFIG_ZPOOL) += zpool.o | 69 | obj-$(CONFIG_ZPOOL) += zpool.o |
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | |||
71 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | 72 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o |
72 | obj-$(CONFIG_CMA) += cma.o | 73 | obj-$(CONFIG_CMA) += cma.o |
73 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | 74 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
75 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | ||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/log2.h> | 33 | #include <linux/log2.h> |
34 | #include <linux/cma.h> | 34 | #include <linux/cma.h> |
35 | #include <linux/highmem.h> | 35 | #include <linux/highmem.h> |
36 | #include <linux/io.h> | ||
36 | 37 | ||
37 | struct cma { | 38 | struct cma { |
38 | unsigned long base_pfn; | 39 | unsigned long base_pfn; |
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) | |||
63 | return (1UL << (align_order - cma->order_per_bit)) - 1; | 64 | return (1UL << (align_order - cma->order_per_bit)) - 1; |
64 | } | 65 | } |
65 | 66 | ||
67 | static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) | ||
68 | { | ||
69 | unsigned int alignment; | ||
70 | |||
71 | if (align_order <= cma->order_per_bit) | ||
72 | return 0; | ||
73 | alignment = 1UL << (align_order - cma->order_per_bit); | ||
74 | return ALIGN(cma->base_pfn, alignment) - | ||
75 | (cma->base_pfn >> cma->order_per_bit); | ||
76 | } | ||
77 | |||
66 | static unsigned long cma_bitmap_maxno(struct cma *cma) | 78 | static unsigned long cma_bitmap_maxno(struct cma *cma) |
67 | { | 79 | { |
68 | return cma->count >> cma->order_per_bit; | 80 | return cma->count >> cma->order_per_bit; |
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
313 | } | 325 | } |
314 | } | 326 | } |
315 | 327 | ||
328 | /* | ||
329 | * kmemleak scans/reads tracked objects for pointers to other | ||
330 | * objects but this address isn't mapped and accessible | ||
331 | */ | ||
332 | kmemleak_ignore(phys_to_virt(addr)); | ||
316 | base = addr; | 333 | base = addr; |
317 | } | 334 | } |
318 | 335 | ||
@@ -340,7 +357,7 @@ err: | |||
340 | */ | 357 | */ |
341 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | 358 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) |
342 | { | 359 | { |
343 | unsigned long mask, pfn, start = 0; | 360 | unsigned long mask, offset, pfn, start = 0; |
344 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 361 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
345 | struct page *page = NULL; | 362 | struct page *page = NULL; |
346 | int ret; | 363 | int ret; |
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | |||
355 | return NULL; | 372 | return NULL; |
356 | 373 | ||
357 | mask = cma_bitmap_aligned_mask(cma, align); | 374 | mask = cma_bitmap_aligned_mask(cma, align); |
375 | offset = cma_bitmap_aligned_offset(cma, align); | ||
358 | bitmap_maxno = cma_bitmap_maxno(cma); | 376 | bitmap_maxno = cma_bitmap_maxno(cma); |
359 | bitmap_count = cma_bitmap_pages_to_bits(cma, count); | 377 | bitmap_count = cma_bitmap_pages_to_bits(cma, count); |
360 | 378 | ||
361 | for (;;) { | 379 | for (;;) { |
362 | mutex_lock(&cma->lock); | 380 | mutex_lock(&cma->lock); |
363 | bitmap_no = bitmap_find_next_zero_area(cma->bitmap, | 381 | bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, |
364 | bitmap_maxno, start, bitmap_count, mask); | 382 | bitmap_maxno, start, bitmap_count, mask, |
383 | offset); | ||
365 | if (bitmap_no >= bitmap_maxno) { | 384 | if (bitmap_no >= bitmap_maxno) { |
366 | mutex_unlock(&cma->lock); | 385 | mutex_unlock(&cma->lock); |
367 | break; | 386 | break; |
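A worked example of cma_bitmap_aligned_offset() with illustrative numbers: base_pfn = 261, order_per_bit = 0 and a request with align_order = 2, so returned pfns must be 4-aligned even though the CMA region itself does not start on a 4-pfn boundary. ALIGN() below is a local stand-in for the kernel macro.

/* Sketch: recompute the bitmap offset the new helper would return. */
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long base_pfn = 261, order_per_bit = 0, align_order = 2;
	unsigned long alignment = 1UL << (align_order - order_per_bit);
	unsigned long offset = ALIGN(base_pfn, alignment) -
			       (base_pfn >> order_per_bit);

	/* offset = 264 - 261 = 3: bit 3 of the bitmap maps to pfn 264,
	 * the first properly aligned pfn inside the area. */
	printf("bitmap offset = %lu\n", offset);
	return 0;
}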
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 789ff70c8a4a..5bf5906ce13b 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c | |||
@@ -2,23 +2,55 @@ | |||
2 | #include <linux/string.h> | 2 | #include <linux/string.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/highmem.h> | 4 | #include <linux/highmem.h> |
5 | #include <linux/page-debug-flags.h> | 5 | #include <linux/page_ext.h> |
6 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
7 | #include <linux/ratelimit.h> | 7 | #include <linux/ratelimit.h> |
8 | 8 | ||
9 | static bool page_poisoning_enabled __read_mostly; | ||
10 | |||
11 | static bool need_page_poisoning(void) | ||
12 | { | ||
13 | if (!debug_pagealloc_enabled()) | ||
14 | return false; | ||
15 | |||
16 | return true; | ||
17 | } | ||
18 | |||
19 | static void init_page_poisoning(void) | ||
20 | { | ||
21 | if (!debug_pagealloc_enabled()) | ||
22 | return; | ||
23 | |||
24 | page_poisoning_enabled = true; | ||
25 | } | ||
26 | |||
27 | struct page_ext_operations page_poisoning_ops = { | ||
28 | .need = need_page_poisoning, | ||
29 | .init = init_page_poisoning, | ||
30 | }; | ||
31 | |||
9 | static inline void set_page_poison(struct page *page) | 32 | static inline void set_page_poison(struct page *page) |
10 | { | 33 | { |
11 | __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 34 | struct page_ext *page_ext; |
35 | |||
36 | page_ext = lookup_page_ext(page); | ||
37 | __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
12 | } | 38 | } |
13 | 39 | ||
14 | static inline void clear_page_poison(struct page *page) | 40 | static inline void clear_page_poison(struct page *page) |
15 | { | 41 | { |
16 | __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 42 | struct page_ext *page_ext; |
43 | |||
44 | page_ext = lookup_page_ext(page); | ||
45 | __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
17 | } | 46 | } |
18 | 47 | ||
19 | static inline bool page_poison(struct page *page) | 48 | static inline bool page_poison(struct page *page) |
20 | { | 49 | { |
21 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 50 | struct page_ext *page_ext; |
51 | |||
52 | page_ext = lookup_page_ext(page); | ||
53 | return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
22 | } | 54 | } |
23 | 55 | ||
24 | static void poison_page(struct page *page) | 56 | static void poison_page(struct page *page) |
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n) | |||
93 | unpoison_page(page + i); | 125 | unpoison_page(page + i); |
94 | } | 126 | } |
95 | 127 | ||
96 | void kernel_map_pages(struct page *page, int numpages, int enable) | 128 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
97 | { | 129 | { |
130 | if (!page_poisoning_enabled) | ||
131 | return; | ||
132 | |||
98 | if (enable) | 133 | if (enable) |
99 | unpoison_pages(page, numpages); | 134 | unpoison_pages(page, numpages); |
100 | else | 135 | else |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 3bcfd81db45e..2ad7adf4f0a4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
117 | __filemap_fdatawrite_range(mapping, offset, endbyte, | 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
118 | WB_SYNC_NONE); | 118 | WB_SYNC_NONE); |
119 | 119 | ||
120 | /* First and last FULL page! */ | 120 | /* |
121 | * First and last FULL page! Partial pages are deliberately | ||
122 | * preserved on the expectation that it is better to preserve | ||
123 | * needed memory than to discard unneeded memory. | ||
124 | */ | ||
121 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 125 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
122 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 126 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
123 | 127 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index 14b4642279f1..e8905bc3cbd7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -62,16 +62,16 @@ | |||
62 | /* | 62 | /* |
63 | * Lock ordering: | 63 | * Lock ordering: |
64 | * | 64 | * |
65 | * ->i_mmap_mutex (truncate_pagecache) | 65 | * ->i_mmap_rwsem (truncate_pagecache) |
66 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 66 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
67 | * ->swap_lock (exclusive_swap_page, others) | 67 | * ->swap_lock (exclusive_swap_page, others) |
68 | * ->mapping->tree_lock | 68 | * ->mapping->tree_lock |
69 | * | 69 | * |
70 | * ->i_mutex | 70 | * ->i_mutex |
71 | * ->i_mmap_mutex (truncate->unmap_mapping_range) | 71 | * ->i_mmap_rwsem (truncate->unmap_mapping_range) |
72 | * | 72 | * |
73 | * ->mmap_sem | 73 | * ->mmap_sem |
74 | * ->i_mmap_mutex | 74 | * ->i_mmap_rwsem |
75 | * ->page_table_lock or pte_lock (various, mainly in memory.c) | 75 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
76 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 76 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
77 | * | 77 | * |
@@ -85,7 +85,7 @@ | |||
85 | * sb_lock (fs/fs-writeback.c) | 85 | * sb_lock (fs/fs-writeback.c) |
86 | * ->mapping->tree_lock (__sync_single_inode) | 86 | * ->mapping->tree_lock (__sync_single_inode) |
87 | * | 87 | * |
88 | * ->i_mmap_mutex | 88 | * ->i_mmap_rwsem |
89 | * ->anon_vma.lock (vma_adjust) | 89 | * ->anon_vma.lock (vma_adjust) |
90 | * | 90 | * |
91 | * ->anon_vma.lock | 91 | * ->anon_vma.lock |
@@ -105,7 +105,7 @@ | |||
105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
107 | * | 107 | * |
108 | * ->i_mmap_mutex | 108 | * ->i_mmap_rwsem |
109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
110 | */ | 110 | */ |
111 | 111 | ||
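Throughout this series i_mmap_mutex becomes i_mmap_rwsem, taken through i_mmap_lock_read()/i_mmap_lock_write() helpers. As a hedged sketch (the wrappers are defined elsewhere in the series, presumably as thin rwsem calls), they amount to:

    static inline void i_mmap_lock_read(struct address_space *mapping)
    {
            down_read(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_read(struct address_space *mapping)
    {
            up_read(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_lock_write(struct address_space *mapping)
    {
            down_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_write(struct address_space *mapping)
    {
            up_write(&mapping->i_mmap_rwsem);
    }

Read-mostly walks such as unmap_mapping_range() and the memory-failure proc collection can then share the lock, while paths that modify the interval tree still take it for writing.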
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index d8d9fe3f685c..0d105aeff82f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | |||
155 | EXPORT_SYMBOL_GPL(xip_file_read); | 155 | EXPORT_SYMBOL_GPL(xip_file_read); |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * __xip_unmap is invoked from xip_unmap and | 158 | * __xip_unmap is invoked from xip_unmap and xip_write |
159 | * xip_write | ||
160 | * | 159 | * |
161 | * This function walks all vmas of the address_space and unmaps the | 160 | * This function walks all vmas of the address_space and unmaps the |
162 | * __xip_sparse_page when found at pgoff. | 161 | * __xip_sparse_page when found at pgoff. |
163 | */ | 162 | */ |
164 | static void | 163 | static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) |
165 | __xip_unmap (struct address_space * mapping, | ||
166 | unsigned long pgoff) | ||
167 | { | 164 | { |
168 | struct vm_area_struct *vma; | 165 | struct vm_area_struct *vma; |
169 | struct mm_struct *mm; | ||
170 | unsigned long address; | ||
171 | pte_t *pte; | ||
172 | pte_t pteval; | ||
173 | spinlock_t *ptl; | ||
174 | struct page *page; | 166 | struct page *page; |
175 | unsigned count; | 167 | unsigned count; |
176 | int locked = 0; | 168 | int locked = 0; |
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping, | |||
182 | return; | 174 | return; |
183 | 175 | ||
184 | retry: | 176 | retry: |
185 | mutex_lock(&mapping->i_mmap_mutex); | 177 | i_mmap_lock_read(mapping); |
186 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 178 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
187 | mm = vma->vm_mm; | 179 | pte_t *pte, pteval; |
188 | address = vma->vm_start + | 180 | spinlock_t *ptl; |
181 | struct mm_struct *mm = vma->vm_mm; | ||
182 | unsigned long address = vma->vm_start + | ||
189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 183 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
184 | |||
190 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
191 | pte = page_check_address(page, mm, address, &ptl, 1); | 186 | pte = page_check_address(page, mm, address, &ptl, 1); |
192 | if (pte) { | 187 | if (pte) { |
@@ -202,7 +197,7 @@ retry: | |||
202 | page_cache_release(page); | 197 | page_cache_release(page); |
203 | } | 198 | } |
204 | } | 199 | } |
205 | mutex_unlock(&mapping->i_mmap_mutex); | 200 | i_mmap_unlock_read(mapping); |
206 | 201 | ||
207 | if (locked) { | 202 | if (locked) { |
208 | mutex_unlock(&xip_sparse_mutex); | 203 | mutex_unlock(&xip_sparse_mutex); |
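With the per-VMA locals moved inside the loop, the address computation is easier to read. A worked example with made-up values:

    /* vm_start = 0x700000000000, vm_pgoff = 10, pgoff = 13, PAGE_SHIFT = 12:
     * address  = 0x700000000000 + ((13 - 10) << 12) = 0x700000003000
     */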
diff --git a/mm/fremap.c b/mm/fremap.c index 72b8fa361433..11ef7ec40d13 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -238,13 +238,13 @@ get_write_lock: | |||
238 | } | 238 | } |
239 | goto out_freed; | 239 | goto out_freed; |
240 | } | 240 | } |
241 | mutex_lock(&mapping->i_mmap_mutex); | 241 | i_mmap_lock_write(mapping); |
242 | flush_dcache_mmap_lock(mapping); | 242 | flush_dcache_mmap_lock(mapping); |
243 | vma->vm_flags |= VM_NONLINEAR; | 243 | vma->vm_flags |= VM_NONLINEAR; |
244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 244 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
246 | flush_dcache_mmap_unlock(mapping); | 246 | flush_dcache_mmap_unlock(mapping); |
247 | mutex_unlock(&mapping->i_mmap_mutex); | 247 | i_mmap_unlock_write(mapping); |
248 | } | 248 | } |
249 | 249 | ||
250 | if (vma->vm_flags & VM_LOCKED) { | 250 | if (vma->vm_flags & VM_LOCKED) { |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 919b86a2164d..47f6070d7c46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1457 | return 0; | 1457 | return 0; |
1458 | 1458 | ||
1459 | found: | 1459 | found: |
1460 | BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); | 1460 | BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); |
1461 | /* Put them into a private list first because mem_map is not up yet */ | 1461 | /* Put them into a private list first because mem_map is not up yet */ |
1462 | list_add(&m->list, &huge_boot_pages); | 1462 | list_add(&m->list, &huge_boot_pages); |
1463 | m->hstate = h; | 1463 | m->hstate = h; |
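The IS_ALIGNED() form is equivalent to the open-coded mask test, since huge_page_size() is a power of two; only readability changes:

    /* IS_ALIGNED(x, a) expands to (((x) & ((typeof(x))(a) - 1)) == 0), so with a 2 MB
     * huge page size both versions reject e.g. 0x200800 and accept 0x400000. */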
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node) | |||
2083 | * devices of nodes that have memory. All on-line nodes should have | 2083 | * devices of nodes that have memory. All on-line nodes should have |
2084 | * registered their associated device by this time. | 2084 | * registered their associated device by this time. |
2085 | */ | 2085 | */ |
2086 | static void hugetlb_register_all_nodes(void) | 2086 | static void __init hugetlb_register_all_nodes(void) |
2087 | { | 2087 | { |
2088 | int nid; | 2088 | int nid; |
2089 | 2089 | ||
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, | |||
2726 | * on its way out. We're lucky that the flag has such an appropriate | 2726 | * on its way out. We're lucky that the flag has such an appropriate |
2727 | * name, and can in fact be safely cleared here. We could clear it | 2727 | * name, and can in fact be safely cleared here. We could clear it |
2728 | * before the __unmap_hugepage_range above, but all that's necessary | 2728 | * before the __unmap_hugepage_range above, but all that's necessary |
2729 | * is to clear it before releasing the i_mmap_mutex. This works | 2729 | * is to clear it before releasing the i_mmap_rwsem. This works |
2730 | * because in the context this is called, the VMA is about to be | 2730 | * because in the context this is called, the VMA is about to be |
2731 | * destroyed and the i_mmap_mutex is held. | 2731 | * destroyed and the i_mmap_rwsem is held. |
2732 | */ | 2732 | */ |
2733 | vma->vm_flags &= ~VM_MAYSHARE; | 2733 | vma->vm_flags &= ~VM_MAYSHARE; |
2734 | } | 2734 | } |
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2774 | * this mapping should be shared between all the VMAs, | 2774 | * this mapping should be shared between all the VMAs, |
2775 | * __unmap_hugepage_range() is called as the lock is already held | 2775 | * __unmap_hugepage_range() is called as the lock is already held |
2776 | */ | 2776 | */ |
2777 | mutex_lock(&mapping->i_mmap_mutex); | 2777 | i_mmap_lock_write(mapping); |
2778 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { | 2778 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
2779 | /* Do not unmap the current VMA */ | 2779 | /* Do not unmap the current VMA */ |
2780 | if (iter_vma == vma) | 2780 | if (iter_vma == vma) |
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2791 | unmap_hugepage_range(iter_vma, address, | 2791 | unmap_hugepage_range(iter_vma, address, |
2792 | address + huge_page_size(h), page); | 2792 | address + huge_page_size(h), page); |
2793 | } | 2793 | } |
2794 | mutex_unlock(&mapping->i_mmap_mutex); | 2794 | i_mmap_unlock_write(mapping); |
2795 | } | 2795 | } |
2796 | 2796 | ||
2797 | /* | 2797 | /* |
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3348 | flush_cache_range(vma, address, end); | 3348 | flush_cache_range(vma, address, end); |
3349 | 3349 | ||
3350 | mmu_notifier_invalidate_range_start(mm, start, end); | 3350 | mmu_notifier_invalidate_range_start(mm, start, end); |
3351 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3351 | i_mmap_lock_write(vma->vm_file->f_mapping); |
3352 | for (; address < end; address += huge_page_size(h)) { | 3352 | for (; address < end; address += huge_page_size(h)) { |
3353 | spinlock_t *ptl; | 3353 | spinlock_t *ptl; |
3354 | ptep = huge_pte_offset(mm, address); | 3354 | ptep = huge_pte_offset(mm, address); |
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3370 | spin_unlock(ptl); | 3370 | spin_unlock(ptl); |
3371 | } | 3371 | } |
3372 | /* | 3372 | /* |
3373 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | 3373 | * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare |
3374 | * may have cleared our pud entry and done put_page on the page table: | 3374 | * may have cleared our pud entry and done put_page on the page table: |
3375 | * once we release i_mmap_mutex, another task can do the final put_page | 3375 | * once we release i_mmap_rwsem, another task can do the final put_page |
3376 | * and that page table be reused and filled with junk. | 3376 | * and that page table be reused and filled with junk. |
3377 | */ | 3377 | */ |
3378 | flush_tlb_range(vma, start, end); | 3378 | flush_tlb_range(vma, start, end); |
3379 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3379 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
3380 | mmu_notifier_invalidate_range_end(mm, start, end); | 3380 | mmu_notifier_invalidate_range_end(mm, start, end); |
3381 | 3381 | ||
3382 | return pages << h->order; | 3382 | return pages << h->order; |
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | |||
3525 | * and returns the corresponding pte. While this is not necessary for the | 3525 | * and returns the corresponding pte. While this is not necessary for the |
3526 | * !shared pmd case because we can allocate the pmd later as well, it makes the | 3526 | * !shared pmd case because we can allocate the pmd later as well, it makes the |
3527 | * code much cleaner. pmd allocation is essential for the shared case because | 3527 | * code much cleaner. pmd allocation is essential for the shared case because |
3528 | * pud has to be populated inside the same i_mmap_mutex section - otherwise | 3528 | * pud has to be populated inside the same i_mmap_rwsem section - otherwise |
3529 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a | 3529 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a |
3530 | * bad pmd for sharing. | 3530 | * bad pmd for sharing. |
3531 | */ | 3531 | */ |
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3544 | if (!vma_shareable(vma, addr)) | 3544 | if (!vma_shareable(vma, addr)) |
3545 | return (pte_t *)pmd_alloc(mm, pud, addr); | 3545 | return (pte_t *)pmd_alloc(mm, pud, addr); |
3546 | 3546 | ||
3547 | mutex_lock(&mapping->i_mmap_mutex); | 3547 | i_mmap_lock_write(mapping); |
3548 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { | 3548 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
3549 | if (svma == vma) | 3549 | if (svma == vma) |
3550 | continue; | 3550 | continue; |
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3572 | spin_unlock(ptl); | 3572 | spin_unlock(ptl); |
3573 | out: | 3573 | out: |
3574 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3574 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
3575 | mutex_unlock(&mapping->i_mmap_mutex); | 3575 | i_mmap_unlock_write(mapping); |
3576 | return pte; | 3576 | return pte; |
3577 | } | 3577 | } |
3578 | 3578 | ||
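The comment above states the constraint that shapes this conversion: the pmd_alloc() that populates the pud must still run under the mapping lock, or a racing task could miss the sharing or pick a bad pmd. Reduced to its locking skeleton, huge_pmd_share() now looks like:

    i_mmap_lock_write(mapping);
    vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
            /* look for a shareable pmd in a sibling mapping of the same file range */
    }
    pte = (pte_t *)pmd_alloc(mm, pud, addr);    /* still under i_mmap_rwsem */
    i_mmap_unlock_write(mapping);
    return pte;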
diff --git a/mm/memblock.c b/mm/memblock.c index 6ecb0d937fb5..252b77bdf65e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
715 | } | 715 | } |
716 | 716 | ||
717 | /** | 717 | /** |
718 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
719 | * @base: the base phys addr of the region | ||
720 | * @size: the size of the region | ||
721 | * | 718 | * |
722 | * This function isolates region [@base, @base + @size), and mark it with flag | 719 | * This function isolates region [@base, @base + @size), and sets/clears flag |
723 | * MEMBLOCK_HOTPLUG. | ||
724 | * | 720 | * |
725 | * Return 0 on success, -errno on failure. | 721 | * Return 0 on success, -errno on failure. |
726 | */ | 722 | */ |
727 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | 723 | static int __init_memblock memblock_setclr_flag(phys_addr_t base, |
724 | phys_addr_t size, int set, int flag) | ||
728 | { | 725 | { |
729 | struct memblock_type *type = &memblock.memory; | 726 | struct memblock_type *type = &memblock.memory; |
730 | int i, ret, start_rgn, end_rgn; | 727 | int i, ret, start_rgn, end_rgn; |
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | |||
734 | return ret; | 731 | return ret; |
735 | 732 | ||
736 | for (i = start_rgn; i < end_rgn; i++) | 733 | for (i = start_rgn; i < end_rgn; i++) |
737 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | 734 | if (set) |
735 | memblock_set_region_flags(&type->regions[i], flag); | ||
736 | else | ||
737 | memblock_clear_region_flags(&type->regions[i], flag); | ||
738 | 738 | ||
739 | memblock_merge_regions(type); | 739 | memblock_merge_regions(type); |
740 | return 0; | 740 | return 0; |
741 | } | 741 | } |
742 | 742 | ||
743 | /** | 743 | /** |
744 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | 744 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. |
745 | * @base: the base phys addr of the region | 745 | * @base: the base phys addr of the region |
746 | * @size: the size of the region | 746 | * @size: the size of the region |
747 | * | 747 | * |
748 | * This function isolates region [@base, @base + @size), and clear flag | 748 | * Return 0 on success, -errno on failure. |
749 | * MEMBLOCK_HOTPLUG for the isolated regions. | 749 | */ |
750 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
751 | { | ||
752 | return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); | ||
753 | } | ||
754 | |||
755 | /** | ||
756 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
757 | * @base: the base phys addr of the region | ||
758 | * @size: the size of the region | ||
750 | * | 759 | * |
751 | * Return 0 on success, -errno on failure. | 760 | * Return 0 on success, -errno on failure. |
752 | */ | 761 | */ |
753 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | 762 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) |
754 | { | 763 | { |
755 | struct memblock_type *type = &memblock.memory; | 764 | return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); |
756 | int i, ret, start_rgn, end_rgn; | ||
757 | |||
758 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
759 | if (ret) | ||
760 | return ret; | ||
761 | |||
762 | for (i = start_rgn; i < end_rgn; i++) | ||
763 | memblock_clear_region_flags(&type->regions[i], | ||
764 | MEMBLOCK_HOTPLUG); | ||
765 | |||
766 | memblock_merge_regions(type); | ||
767 | return 0; | ||
768 | } | 765 | } |
769 | 766 | ||
770 | /** | 767 | /** |
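With memblock_setclr_flag() factored out, marking and clearing MEMBLOCK_HOTPLUG become one-line wrappers, and any future region flag can be wired up the same way. A purely hypothetical example of the shape (MEMBLOCK_MIRROR is not part of this patch):

    int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
    {
            return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
    }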
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85df503ec023..ef91e856c7e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -296,7 +296,6 @@ struct mem_cgroup { | |||
296 | * Should the accounting and control be hierarchical, per subtree? | 296 | * Should the accounting and control be hierarchical, per subtree? |
297 | */ | 297 | */ |
298 | bool use_hierarchy; | 298 | bool use_hierarchy; |
299 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | ||
300 | 299 | ||
301 | bool oom_lock; | 300 | bool oom_lock; |
302 | atomic_t under_oom; | 301 | atomic_t under_oom; |
@@ -366,22 +365,11 @@ struct mem_cgroup { | |||
366 | /* WARNING: nodeinfo must be the last member here */ | 365 | /* WARNING: nodeinfo must be the last member here */ |
367 | }; | 366 | }; |
368 | 367 | ||
369 | /* internal only representation about the status of kmem accounting. */ | ||
370 | enum { | ||
371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ | ||
372 | }; | ||
373 | |||
374 | #ifdef CONFIG_MEMCG_KMEM | 368 | #ifdef CONFIG_MEMCG_KMEM |
375 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | ||
376 | { | ||
377 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
378 | } | ||
379 | |||
380 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 369 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
381 | { | 370 | { |
382 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 371 | return memcg->kmemcg_id >= 0; |
383 | } | 372 | } |
384 | |||
385 | #endif | 373 | #endif |
386 | 374 | ||
387 | /* Stuffs for move charges at task migration. */ | 375 | /* Stuffs for move charges at task migration. */ |
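The dedicated KMEM_ACCOUNTED_ACTIVE bit goes away because kmemcg_id already carries the same information: it starts at -1 and only becomes non-negative once kmem accounting is activated. In outline:

    /* mem_cgroup_css_alloc():  memcg->kmemcg_id = -1;        memcg_kmem_is_active() == false
     * memcg_activate_kmem():   memcg->kmemcg_id = memcg_id;  memcg_kmem_is_active() == true
     */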
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1571 | * select it. The goal is to allow it to allocate so that it may | 1559 | * select it. The goal is to allow it to allocate so that it may |
1572 | * quickly exit and free its memory. | 1560 | * quickly exit and free its memory. |
1573 | */ | 1561 | */ |
1574 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 1562 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
1575 | set_thread_flag(TIF_MEMDIE); | 1563 | set_thread_flag(TIF_MEMDIE); |
1576 | return; | 1564 | return; |
1577 | } | 1565 | } |
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1628 | NULL, "Memory cgroup out of memory"); | 1616 | NULL, "Memory cgroup out of memory"); |
1629 | } | 1617 | } |
1630 | 1618 | ||
1619 | #if MAX_NUMNODES > 1 | ||
1620 | |||
1631 | /** | 1621 | /** |
1632 | * test_mem_cgroup_node_reclaimable | 1622 | * test_mem_cgroup_node_reclaimable |
1633 | * @memcg: the target memcg | 1623 | * @memcg: the target memcg |
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
1650 | return false; | 1640 | return false; |
1651 | 1641 | ||
1652 | } | 1642 | } |
1653 | #if MAX_NUMNODES > 1 | ||
1654 | 1643 | ||
1655 | /* | 1644 | /* |
1656 | * Always updating the nodemask is not very good - even if we have an empty | 1645 | * Always updating the nodemask is not very good - even if we have an empty |
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg, | |||
2646 | if (!cachep) | 2635 | if (!cachep) |
2647 | return; | 2636 | return; |
2648 | 2637 | ||
2649 | css_get(&memcg->css); | ||
2650 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | 2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
2651 | 2639 | ||
2652 | /* | 2640 | /* |
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) | |||
2680 | list_del(&cachep->memcg_params->list); | 2668 | list_del(&cachep->memcg_params->list); |
2681 | 2669 | ||
2682 | kmem_cache_destroy(cachep); | 2670 | kmem_cache_destroy(cachep); |
2683 | |||
2684 | /* drop the reference taken in memcg_register_cache */ | ||
2685 | css_put(&memcg->css); | ||
2686 | } | ||
2687 | |||
2688 | /* | ||
2689 | * During the creation a new cache, we need to disable our accounting mechanism | ||
2690 | * altogether. This is true even if we are not creating, but rather just | ||
2691 | * enqueing new caches to be created. | ||
2692 | * | ||
2693 | * This is because that process will trigger allocations; some visible, like | ||
2694 | * explicit kmallocs to auxiliary data structures, name strings and internal | ||
2695 | * cache structures; some well concealed, like INIT_WORK() that can allocate | ||
2696 | * objects during debug. | ||
2697 | * | ||
2698 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | ||
2699 | * to it. This may not be a bounded recursion: since the first cache creation | ||
2700 | * failed to complete (waiting on the allocation), we'll just try to create the | ||
2701 | * cache again, failing at the same point. | ||
2702 | * | ||
2703 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | ||
2704 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | ||
2705 | * inside the following two functions. | ||
2706 | */ | ||
2707 | static inline void memcg_stop_kmem_account(void) | ||
2708 | { | ||
2709 | VM_BUG_ON(!current->mm); | ||
2710 | current->memcg_kmem_skip_account++; | ||
2711 | } | ||
2712 | |||
2713 | static inline void memcg_resume_kmem_account(void) | ||
2714 | { | ||
2715 | VM_BUG_ON(!current->mm); | ||
2716 | current->memcg_kmem_skip_account--; | ||
2717 | } | 2671 | } |
2718 | 2672 | ||
2719 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | 2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | |||
2747 | mutex_lock(&memcg_slab_mutex); | 2701 | mutex_lock(&memcg_slab_mutex); |
2748 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | 2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
2749 | cachep = memcg_params_to_cache(params); | 2703 | cachep = memcg_params_to_cache(params); |
2750 | kmem_cache_shrink(cachep); | 2704 | memcg_unregister_cache(cachep); |
2751 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | ||
2752 | memcg_unregister_cache(cachep); | ||
2753 | } | 2705 | } |
2754 | mutex_unlock(&memcg_slab_mutex); | 2706 | mutex_unlock(&memcg_slab_mutex); |
2755 | } | 2707 | } |
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2784 | struct memcg_register_cache_work *cw; | 2736 | struct memcg_register_cache_work *cw; |
2785 | 2737 | ||
2786 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
2787 | if (cw == NULL) { | 2739 | if (!cw) |
2788 | css_put(&memcg->css); | ||
2789 | return; | 2740 | return; |
2790 | } | 2741 | |
2742 | css_get(&memcg->css); | ||
2791 | 2743 | ||
2792 | cw->memcg = memcg; | 2744 | cw->memcg = memcg; |
2793 | cw->cachep = cachep; | 2745 | cw->cachep = cachep; |
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2810 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | 2762 | * this point we can't allow ourselves back into memcg_kmem_get_cache, |
2811 | * the safest choice is to do it like this, wrapping the whole function. | 2763 | * the safest choice is to do it like this, wrapping the whole function. |
2812 | */ | 2764 | */ |
2813 | memcg_stop_kmem_account(); | 2765 | current->memcg_kmem_skip_account = 1; |
2814 | __memcg_schedule_register_cache(memcg, cachep); | 2766 | __memcg_schedule_register_cache(memcg, cachep); |
2815 | memcg_resume_kmem_account(); | 2767 | current->memcg_kmem_skip_account = 0; |
2816 | } | 2768 | } |
2817 | 2769 | ||
2818 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | 2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) |
2819 | { | 2771 | { |
2820 | unsigned int nr_pages = 1 << order; | 2772 | unsigned int nr_pages = 1 << order; |
2821 | int res; | ||
2822 | 2773 | ||
2823 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | 2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); |
2824 | if (!res) | ||
2825 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); | ||
2826 | return res; | ||
2827 | } | 2775 | } |
2828 | 2776 | ||
2829 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | 2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) |
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | |||
2831 | unsigned int nr_pages = 1 << order; | 2779 | unsigned int nr_pages = 1 << order; |
2832 | 2780 | ||
2833 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | 2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); |
2834 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); | ||
2835 | } | 2782 | } |
2836 | 2783 | ||
2837 | /* | 2784 | /* |
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | |||
2847 | * Can't be called in interrupt context or from kernel threads. | 2794 | * Can't be called in interrupt context or from kernel threads. |
2848 | * This function needs to be called with rcu_read_lock() held. | 2795 | * This function needs to be called with rcu_read_lock() held. |
2849 | */ | 2796 | */ |
2850 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | 2797 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) |
2851 | gfp_t gfp) | ||
2852 | { | 2798 | { |
2853 | struct mem_cgroup *memcg; | 2799 | struct mem_cgroup *memcg; |
2854 | struct kmem_cache *memcg_cachep; | 2800 | struct kmem_cache *memcg_cachep; |
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
2856 | VM_BUG_ON(!cachep->memcg_params); | 2802 | VM_BUG_ON(!cachep->memcg_params); |
2857 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | 2803 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); |
2858 | 2804 | ||
2859 | if (!current->mm || current->memcg_kmem_skip_account) | 2805 | if (current->memcg_kmem_skip_account) |
2860 | return cachep; | 2806 | return cachep; |
2861 | 2807 | ||
2862 | rcu_read_lock(); | 2808 | memcg = get_mem_cgroup_from_mm(current->mm); |
2863 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | ||
2864 | |||
2865 | if (!memcg_kmem_is_active(memcg)) | 2809 | if (!memcg_kmem_is_active(memcg)) |
2866 | goto out; | 2810 | goto out; |
2867 | 2811 | ||
2868 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 2812 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
2869 | if (likely(memcg_cachep)) { | 2813 | if (likely(memcg_cachep)) |
2870 | cachep = memcg_cachep; | 2814 | return memcg_cachep; |
2871 | goto out; | ||
2872 | } | ||
2873 | |||
2874 | /* The corresponding put will be done in the workqueue. */ | ||
2875 | if (!css_tryget_online(&memcg->css)) | ||
2876 | goto out; | ||
2877 | rcu_read_unlock(); | ||
2878 | 2815 | ||
2879 | /* | 2816 | /* |
2880 | * If we are in a safe context (can wait, and not in interrupt | 2817 | * If we are in a safe context (can wait, and not in interrupt |
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
2889 | * defer everything. | 2826 | * defer everything. |
2890 | */ | 2827 | */ |
2891 | memcg_schedule_register_cache(memcg, cachep); | 2828 | memcg_schedule_register_cache(memcg, cachep); |
2892 | return cachep; | ||
2893 | out: | 2829 | out: |
2894 | rcu_read_unlock(); | 2830 | css_put(&memcg->css); |
2895 | return cachep; | 2831 | return cachep; |
2896 | } | 2832 | } |
2897 | 2833 | ||
2834 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
2835 | { | ||
2836 | if (!is_root_cache(cachep)) | ||
2837 | css_put(&cachep->memcg_params->memcg->css); | ||
2838 | } | ||
2839 | |||
2898 | /* | 2840 | /* |
2899 | * We need to verify if the allocation against current->mm->owner's memcg is | 2841 | * We need to verify if the allocation against current->mm->owner's memcg is |
2900 | * possible for the given order. But the page is not allocated yet, so we'll | 2842 | * possible for the given order. But the page is not allocated yet, so we'll |
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
2917 | 2859 | ||
2918 | *_memcg = NULL; | 2860 | *_memcg = NULL; |
2919 | 2861 | ||
2920 | /* | ||
2921 | * Disabling accounting is only relevant for some specific memcg | ||
2922 | * internal allocations. Therefore we would initially not have such | ||
2923 | * check here, since direct calls to the page allocator that are | ||
2924 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen | ||
2925 | * outside memcg core. We are mostly concerned with cache allocations, | ||
2926 | * and by having this test at memcg_kmem_get_cache, we are already able | ||
2927 | * to relay the allocation to the root cache and bypass the memcg cache | ||
2928 | * altogether. | ||
2929 | * | ||
2930 | * There is one exception, though: the SLUB allocator does not create | ||
2931 | * large order caches, but rather service large kmallocs directly from | ||
2932 | * the page allocator. Therefore, the following sequence when backed by | ||
2933 | * the SLUB allocator: | ||
2934 | * | ||
2935 | * memcg_stop_kmem_account(); | ||
2936 | * kmalloc(<large_number>) | ||
2937 | * memcg_resume_kmem_account(); | ||
2938 | * | ||
2939 | * would effectively ignore the fact that we should skip accounting, | ||
2940 | * since it will drive us directly to this function without passing | ||
2941 | * through the cache selector memcg_kmem_get_cache. Such large | ||
2942 | * allocations are extremely rare but can happen, for instance, for the | ||
2943 | * cache arrays. We bring this test here. | ||
2944 | */ | ||
2945 | if (!current->mm || current->memcg_kmem_skip_account) | ||
2946 | return true; | ||
2947 | |||
2948 | memcg = get_mem_cgroup_from_mm(current->mm); | 2862 | memcg = get_mem_cgroup_from_mm(current->mm); |
2949 | 2863 | ||
2950 | if (!memcg_kmem_is_active(memcg)) { | 2864 | if (!memcg_kmem_is_active(memcg)) { |
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
2985 | memcg_uncharge_kmem(memcg, 1 << order); | 2899 | memcg_uncharge_kmem(memcg, 1 << order); |
2986 | page->mem_cgroup = NULL; | 2900 | page->mem_cgroup = NULL; |
2987 | } | 2901 | } |
2988 | #else | ||
2989 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
2990 | { | ||
2991 | } | ||
2992 | #endif /* CONFIG_MEMCG_KMEM */ | 2902 | #endif /* CONFIG_MEMCG_KMEM */ |
2993 | 2903 | ||
2994 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2904 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
3539 | return 0; | 3449 | return 0; |
3540 | 3450 | ||
3541 | /* | 3451 | /* |
3542 | * We are going to allocate memory for data shared by all memory | ||
3543 | * cgroups so let's stop accounting here. | ||
3544 | */ | ||
3545 | memcg_stop_kmem_account(); | ||
3546 | |||
3547 | /* | ||
3548 | * For simplicity, we won't allow this to be disabled. It also can't | 3452 | * For simplicity, we won't allow this to be disabled. It also can't |
3549 | * be changed if the cgroup has children already, or if tasks had | 3453 | * be changed if the cgroup has children already, or if tasks had |
3550 | * already joined. | 3454 | * already joined. |
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
3570 | goto out; | 3474 | goto out; |
3571 | } | 3475 | } |
3572 | 3476 | ||
3573 | memcg->kmemcg_id = memcg_id; | ||
3574 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
3575 | |||
3576 | /* | 3477 | /* |
3577 | * We couldn't have accounted to this cgroup, because it hasn't got the | 3478 | * We couldn't have accounted to this cgroup, because it hasn't got |
3578 | * active bit set yet, so this should succeed. | 3479 | * activated yet, so this should succeed. |
3579 | */ | 3480 | */ |
3580 | err = page_counter_limit(&memcg->kmem, nr_pages); | 3481 | err = page_counter_limit(&memcg->kmem, nr_pages); |
3581 | VM_BUG_ON(err); | 3482 | VM_BUG_ON(err); |
3582 | 3483 | ||
3583 | static_key_slow_inc(&memcg_kmem_enabled_key); | 3484 | static_key_slow_inc(&memcg_kmem_enabled_key); |
3584 | /* | 3485 | /* |
3585 | * Setting the active bit after enabling static branching will | 3486 | * A memory cgroup is considered kmem-active as soon as it gets |
3487 | * kmemcg_id. Setting the id after enabling static branching will | ||
3586 | * guarantee no one starts accounting before all call sites are | 3488 | * guarantee no one starts accounting before all call sites are |
3587 | * patched. | 3489 | * patched. |
3588 | */ | 3490 | */ |
3589 | memcg_kmem_set_active(memcg); | 3491 | memcg->kmemcg_id = memcg_id; |
3590 | out: | 3492 | out: |
3591 | memcg_resume_kmem_account(); | ||
3592 | return err; | 3493 | return err; |
3593 | } | 3494 | } |
3594 | 3495 | ||
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) | |||
3791 | } | 3692 | } |
3792 | #endif /* CONFIG_NUMA */ | 3693 | #endif /* CONFIG_NUMA */ |
3793 | 3694 | ||
3794 | static inline void mem_cgroup_lru_names_not_uptodate(void) | ||
3795 | { | ||
3796 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
3797 | } | ||
3798 | |||
3799 | static int memcg_stat_show(struct seq_file *m, void *v) | 3695 | static int memcg_stat_show(struct seq_file *m, void *v) |
3800 | { | 3696 | { |
3801 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3697 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3803 | struct mem_cgroup *mi; | 3699 | struct mem_cgroup *mi; |
3804 | unsigned int i; | 3700 | unsigned int i; |
3805 | 3701 | ||
3702 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
3703 | |||
3806 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3704 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
3807 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 3705 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
3808 | continue; | 3706 | continue; |
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4259 | { | 4157 | { |
4260 | int ret; | 4158 | int ret; |
4261 | 4159 | ||
4262 | memcg->kmemcg_id = -1; | ||
4263 | ret = memcg_propagate_kmem(memcg); | 4160 | ret = memcg_propagate_kmem(memcg); |
4264 | if (ret) | 4161 | if (ret) |
4265 | return ret; | 4162 | return ret; |
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4269 | 4166 | ||
4270 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4167 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4271 | { | 4168 | { |
4169 | memcg_unregister_all_caches(memcg); | ||
4272 | mem_cgroup_sockets_destroy(memcg); | 4170 | mem_cgroup_sockets_destroy(memcg); |
4273 | } | 4171 | } |
4274 | #else | 4172 | #else |
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4724 | 4622 | ||
4725 | free_percpu(memcg->stat); | 4623 | free_percpu(memcg->stat); |
4726 | 4624 | ||
4727 | /* | ||
4728 | * We need to make sure that (at least for now), the jump label | ||
4729 | * destruction code runs outside of the cgroup lock. This is because | ||
4730 | * get_online_cpus(), which is called from the static_branch update, | ||
4731 | * can't be called inside the cgroup_lock. cpusets are the ones | ||
4732 | * enforcing this dependency, so if they ever change, we might as well. | ||
4733 | * | ||
4734 | * schedule_work() will guarantee this happens. Be careful if you need | ||
4735 | * to move this code around, and make sure it is outside | ||
4736 | * the cgroup_lock. | ||
4737 | */ | ||
4738 | disarm_static_keys(memcg); | 4625 | disarm_static_keys(memcg); |
4739 | kfree(memcg); | 4626 | kfree(memcg); |
4740 | } | 4627 | } |
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4804 | vmpressure_init(&memcg->vmpressure); | 4691 | vmpressure_init(&memcg->vmpressure); |
4805 | INIT_LIST_HEAD(&memcg->event_list); | 4692 | INIT_LIST_HEAD(&memcg->event_list); |
4806 | spin_lock_init(&memcg->event_list_lock); | 4693 | spin_lock_init(&memcg->event_list_lock); |
4694 | #ifdef CONFIG_MEMCG_KMEM | ||
4695 | memcg->kmemcg_id = -1; | ||
4696 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
4697 | #endif | ||
4807 | 4698 | ||
4808 | return &memcg->css; | 4699 | return &memcg->css; |
4809 | 4700 | ||
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
4885 | } | 4776 | } |
4886 | spin_unlock(&memcg->event_list_lock); | 4777 | spin_unlock(&memcg->event_list_lock); |
4887 | 4778 | ||
4888 | memcg_unregister_all_caches(memcg); | ||
4889 | vmpressure_cleanup(&memcg->vmpressure); | 4779 | vmpressure_cleanup(&memcg->vmpressure); |
4890 | } | 4780 | } |
4891 | 4781 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e5ee0ca7ae85..feb803bf3443 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access) | |||
239 | } | 239 | } |
240 | 240 | ||
241 | /* | 241 | /* |
242 | * Only call shrink_slab here (which would also shrink other caches) if | 242 | * Only call shrink_node_slabs here (which would also shrink |
243 | * access is not potentially fatal. | 243 | * other caches) if access is not potentially fatal. |
244 | */ | 244 | */ |
245 | if (access) { | 245 | if (access) { |
246 | int nr; | 246 | int nr; |
247 | int nid = page_to_nid(p); | 247 | int nid = page_to_nid(p); |
248 | do { | 248 | do { |
249 | struct shrink_control shrink = { | 249 | nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); |
250 | .gfp_mask = GFP_KERNEL, | ||
251 | }; | ||
252 | node_set(nid, shrink.nodes_to_scan); | ||
253 | |||
254 | nr = shrink_slab(&shrink, 1000, 1000); | ||
255 | if (page_count(p) == 1) | 250 | if (page_count(p) == 1) |
256 | break; | 251 | break; |
257 | } while (nr > 10); | 252 | } while (nr > 10); |
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
466 | struct task_struct *tsk; | 461 | struct task_struct *tsk; |
467 | struct address_space *mapping = page->mapping; | 462 | struct address_space *mapping = page->mapping; |
468 | 463 | ||
469 | mutex_lock(&mapping->i_mmap_mutex); | 464 | i_mmap_lock_read(mapping); |
470 | read_lock(&tasklist_lock); | 465 | read_lock(&tasklist_lock); |
471 | for_each_process(tsk) { | 466 | for_each_process(tsk) { |
472 | pgoff_t pgoff = page_to_pgoff(page); | 467 | pgoff_t pgoff = page_to_pgoff(page); |
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
488 | } | 483 | } |
489 | } | 484 | } |
490 | read_unlock(&tasklist_lock); | 485 | read_unlock(&tasklist_lock); |
491 | mutex_unlock(&mapping->i_mmap_mutex); | 486 | i_mmap_unlock_read(mapping); |
492 | } | 487 | } |
493 | 488 | ||
494 | /* | 489 | /* |
diff --git a/mm/memory.c b/mm/memory.c index 4b5a282e1107..fbf74112de5b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1326 | * safe to do nothing in this case. | 1326 | * safe to do nothing in this case. |
1327 | */ | 1327 | */ |
1328 | if (vma->vm_file) { | 1328 | if (vma->vm_file) { |
1329 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1329 | i_mmap_lock_write(vma->vm_file->f_mapping); |
1330 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | 1330 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); |
1331 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1331 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
1332 | } | 1332 | } |
1333 | } else | 1333 | } else |
1334 | unmap_page_range(tlb, vma, start, end, details); | 1334 | unmap_page_range(tlb, vma, start, end, details); |
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2377 | details.last_index = ULONG_MAX; | 2377 | details.last_index = ULONG_MAX; |
2378 | 2378 | ||
2379 | 2379 | ||
2380 | mutex_lock(&mapping->i_mmap_mutex); | 2380 | i_mmap_lock_read(mapping); |
2381 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2381 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2382 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2382 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2383 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2383 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2384 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2384 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
2385 | mutex_unlock(&mapping->i_mmap_mutex); | 2385 | i_mmap_unlock_read(mapping); |
2386 | } | 2386 | } |
2387 | EXPORT_SYMBOL(unmap_mapping_range); | 2387 | EXPORT_SYMBOL(unmap_mapping_range); |
2388 | 2388 | ||
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3365 | 3365 | ||
3366 | return ret; | 3366 | return ret; |
3367 | } | 3367 | } |
3368 | EXPORT_SYMBOL_GPL(handle_mm_fault); | ||
3368 | 3369 | ||
3369 | #ifndef __PAGETABLE_PUD_FOLDED | 3370 | #ifndef __PAGETABLE_PUD_FOLDED |
3370 | /* | 3371 | /* |
diff --git a/mm/migrate.c b/mm/migrate.c index 01439953abf5..253474c22239 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
746 | * MIGRATEPAGE_SUCCESS - success | 746 | * MIGRATEPAGE_SUCCESS - success |
747 | */ | 747 | */ |
748 | static int move_to_new_page(struct page *newpage, struct page *page, | 748 | static int move_to_new_page(struct page *newpage, struct page *page, |
749 | int remap_swapcache, enum migrate_mode mode) | 749 | int page_was_mapped, enum migrate_mode mode) |
750 | { | 750 | { |
751 | struct address_space *mapping; | 751 | struct address_space *mapping; |
752 | int rc; | 752 | int rc; |
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
784 | newpage->mapping = NULL; | 784 | newpage->mapping = NULL; |
785 | } else { | 785 | } else { |
786 | mem_cgroup_migrate(page, newpage, false); | 786 | mem_cgroup_migrate(page, newpage, false); |
787 | if (remap_swapcache) | 787 | if (page_was_mapped) |
788 | remove_migration_ptes(page, newpage); | 788 | remove_migration_ptes(page, newpage); |
789 | page->mapping = NULL; | 789 | page->mapping = NULL; |
790 | } | 790 | } |
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
798 | int force, enum migrate_mode mode) | 798 | int force, enum migrate_mode mode) |
799 | { | 799 | { |
800 | int rc = -EAGAIN; | 800 | int rc = -EAGAIN; |
801 | int remap_swapcache = 1; | 801 | int page_was_mapped = 0; |
802 | struct anon_vma *anon_vma = NULL; | 802 | struct anon_vma *anon_vma = NULL; |
803 | 803 | ||
804 | if (!trylock_page(page)) { | 804 | if (!trylock_page(page)) { |
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
870 | * migrated but are not remapped when migration | 870 | * migrated but are not remapped when migration |
871 | * completes | 871 | * completes |
872 | */ | 872 | */ |
873 | remap_swapcache = 0; | ||
874 | } else { | 873 | } else { |
875 | goto out_unlock; | 874 | goto out_unlock; |
876 | } | 875 | } |
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
910 | } | 909 | } |
911 | 910 | ||
912 | /* Establish migration ptes or remove ptes */ | 911 | /* Establish migration ptes or remove ptes */ |
913 | try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 912 | if (page_mapped(page)) { |
913 | try_to_unmap(page, | ||
914 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
915 | page_was_mapped = 1; | ||
916 | } | ||
914 | 917 | ||
915 | skip_unmap: | 918 | skip_unmap: |
916 | if (!page_mapped(page)) | 919 | if (!page_mapped(page)) |
917 | rc = move_to_new_page(newpage, page, remap_swapcache, mode); | 920 | rc = move_to_new_page(newpage, page, page_was_mapped, mode); |
918 | 921 | ||
919 | if (rc && remap_swapcache) | 922 | if (rc && page_was_mapped) |
920 | remove_migration_ptes(page, page); | 923 | remove_migration_ptes(page, page); |
921 | 924 | ||
922 | /* Drop an anon_vma reference if we took one */ | 925 | /* Drop an anon_vma reference if we took one */ |
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1017 | { | 1020 | { |
1018 | int rc = 0; | 1021 | int rc = 0; |
1019 | int *result = NULL; | 1022 | int *result = NULL; |
1023 | int page_was_mapped = 0; | ||
1020 | struct page *new_hpage; | 1024 | struct page *new_hpage; |
1021 | struct anon_vma *anon_vma = NULL; | 1025 | struct anon_vma *anon_vma = NULL; |
1022 | 1026 | ||
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1047 | if (PageAnon(hpage)) | 1051 | if (PageAnon(hpage)) |
1048 | anon_vma = page_get_anon_vma(hpage); | 1052 | anon_vma = page_get_anon_vma(hpage); |
1049 | 1053 | ||
1050 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 1054 | if (page_mapped(hpage)) { |
1055 | try_to_unmap(hpage, | ||
1056 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
1057 | page_was_mapped = 1; | ||
1058 | } | ||
1051 | 1059 | ||
1052 | if (!page_mapped(hpage)) | 1060 | if (!page_mapped(hpage)) |
1053 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 1061 | rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); |
1054 | 1062 | ||
1055 | if (rc != MIGRATEPAGE_SUCCESS) | 1063 | if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) |
1056 | remove_migration_ptes(hpage, hpage); | 1064 | remove_migration_ptes(hpage, hpage); |
1057 | 1065 | ||
1058 | if (anon_vma) | 1066 | if (anon_vma) |
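The rename from remap_swapcache to page_was_mapped matches what the flag now records: try_to_unmap() only runs when the page actually had mappings, and remove_migration_ptes() is only called on failure if migration PTEs were installed in the first place. The resulting control flow, condensed:

    int page_was_mapped = 0;

    if (page_mapped(page)) {
            try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
            page_was_mapped = 1;
    }
    if (!page_mapped(page))
            rc = move_to_new_page(newpage, page, page_was_mapped, mode);
    if (rc && page_was_mapped)
            remove_migration_ptes(page, page);  /* undo the migration ptes we installed */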
diff --git a/mm/mincore.c b/mm/mincore.c index 725c80961048..c8c528b36641 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
137 | } else { /* pte is a swap entry */ | 137 | } else { /* pte is a swap entry */ |
138 | swp_entry_t entry = pte_to_swp_entry(pte); | 138 | swp_entry_t entry = pte_to_swp_entry(pte); |
139 | 139 | ||
140 | if (is_migration_entry(entry)) { | 140 | if (non_swap_entry(entry)) { |
141 | /* migration entries are always uptodate */ | 141 | /* |
142 | * migration or hwpoison entries are always | ||
143 | * uptodate | ||
144 | */ | ||
142 | *vec = 1; | 145 | *vec = 1; |
143 | } else { | 146 | } else { |
144 | #ifdef CONFIG_SWAP | 147 | #ifdef CONFIG_SWAP |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -232,7 +232,7 @@ error: | |||
232 | } | 232 | } |
233 | 233 | ||
234 | /* | 234 | /* |
235 | * Requires inode->i_mapping->i_mmap_mutex | 235 | * Requires inode->i_mapping->i_mmap_rwsem |
236 | */ | 236 | */ |
237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
238 | struct file *file, struct address_space *mapping) | 238 | struct file *file, struct address_space *mapping) |
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma) | |||
260 | 260 | ||
261 | if (file) { | 261 | if (file) { |
262 | struct address_space *mapping = file->f_mapping; | 262 | struct address_space *mapping = file->f_mapping; |
263 | mutex_lock(&mapping->i_mmap_mutex); | 263 | i_mmap_lock_write(mapping); |
264 | __remove_shared_vm_struct(vma, file, mapping); | 264 | __remove_shared_vm_struct(vma, file, mapping); |
265 | mutex_unlock(&mapping->i_mmap_mutex); | 265 | i_mmap_unlock_write(mapping); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
674 | 674 | ||
675 | if (vma->vm_file) { | 675 | if (vma->vm_file) { |
676 | mapping = vma->vm_file->f_mapping; | 676 | mapping = vma->vm_file->f_mapping; |
677 | mutex_lock(&mapping->i_mmap_mutex); | 677 | i_mmap_lock_write(mapping); |
678 | } | 678 | } |
679 | 679 | ||
680 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 680 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
681 | __vma_link_file(vma); | 681 | __vma_link_file(vma); |
682 | 682 | ||
683 | if (mapping) | 683 | if (mapping) |
684 | mutex_unlock(&mapping->i_mmap_mutex); | 684 | i_mmap_unlock_write(mapping); |
685 | 685 | ||
686 | mm->map_count++; | 686 | mm->map_count++; |
687 | validate_mm(mm); | 687 | validate_mm(mm); |
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
796 | next->vm_end); | 796 | next->vm_end); |
797 | } | 797 | } |
798 | 798 | ||
799 | mutex_lock(&mapping->i_mmap_mutex); | 799 | i_mmap_lock_write(mapping); |
800 | if (insert) { | 800 | if (insert) { |
801 | /* | 801 | /* |
802 | * Put into interval tree now, so instantiated pages | 802 | * Put into interval tree now, so instantiated pages |
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
883 | anon_vma_unlock_write(anon_vma); | 883 | anon_vma_unlock_write(anon_vma); |
884 | } | 884 | } |
885 | if (mapping) | 885 | if (mapping) |
886 | mutex_unlock(&mapping->i_mmap_mutex); | 886 | i_mmap_unlock_write(mapping); |
887 | 887 | ||
888 | if (root) { | 888 | if (root) { |
889 | uprobe_mmap(vma); | 889 | uprobe_mmap(vma); |
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
2362 | } | 2362 | } |
2363 | #endif | 2363 | #endif |
2364 | 2364 | ||
2365 | EXPORT_SYMBOL_GPL(find_extend_vma); | ||
2366 | |||
2365 | /* | 2367 | /* |
2366 | * Ok - we have the memory areas we should free on the vma list, | 2368 | * Ok - we have the memory areas we should free on the vma list, |
2367 | * so release them, and do the vma updates. | 2369 | * so release them, and do the vma updates. |
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2791 | 2793 | ||
2792 | /* Insert vm structure into process list sorted by address | 2794 | /* Insert vm structure into process list sorted by address |
2793 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2795 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2794 | * then i_mmap_mutex is taken here. | 2796 | * then i_mmap_rwsem is taken here. |
2795 | */ | 2797 | */ |
2796 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 2798 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2797 | { | 2799 | { |
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3086 | */ | 3088 | */ |
3087 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 3089 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
3088 | BUG(); | 3090 | BUG(); |
3089 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); | 3091 | down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); |
3090 | } | 3092 | } |
3091 | } | 3093 | } |
3092 | 3094 | ||
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3113 | * vma in this mm is backed by the same anon_vma or address_space. | 3115 | * vma in this mm is backed by the same anon_vma or address_space. |
3114 | * | 3116 | * |
3115 | * We can take all the locks in random order because the VM code | 3117 | * We can take all the locks in random order because the VM code |
3116 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never | 3118 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never |
3117 | * takes more than one of them in a row. Secondly we're protected | 3119 | * takes more than one of them in a row. Secondly we're protected |
3118 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3120 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
3119 | * | 3121 | * |
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
3182 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 3184 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
3183 | * because we hold the mm_all_locks_mutex. | 3185 | * because we hold the mm_all_locks_mutex. |
3184 | */ | 3186 | */ |
3185 | mutex_unlock(&mapping->i_mmap_mutex); | 3187 | i_mmap_unlock_write(mapping); |
3186 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 3188 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
3187 | &mapping->flags)) | 3189 | &mapping->flags)) |
3188 | BUG(); | 3190 | BUG(); |
diff --git a/mm/mremap.c b/mm/mremap.c index b147f66f4c40..84aa36f9f308 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
99 | spinlock_t *old_ptl, *new_ptl; | 99 | spinlock_t *old_ptl, *new_ptl; |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma | 102 | * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma |
103 | * locks to ensure that rmap will always observe either the old or the | 103 | * locks to ensure that rmap will always observe either the old or the |
104 | * new ptes. This is the easiest way to avoid races with | 104 | * new ptes. This is the easiest way to avoid races with |
105 | * truncate_pagecache(), page migration, etc... | 105 | * truncate_pagecache(), page migration, etc... |
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
119 | if (need_rmap_locks) { | 119 | if (need_rmap_locks) { |
120 | if (vma->vm_file) { | 120 | if (vma->vm_file) { |
121 | mapping = vma->vm_file->f_mapping; | 121 | mapping = vma->vm_file->f_mapping; |
122 | mutex_lock(&mapping->i_mmap_mutex); | 122 | i_mmap_lock_write(mapping); |
123 | } | 123 | } |
124 | if (vma->anon_vma) { | 124 | if (vma->anon_vma) { |
125 | anon_vma = vma->anon_vma; | 125 | anon_vma = vma->anon_vma; |
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
156 | if (anon_vma) | 156 | if (anon_vma) |
157 | anon_vma_unlock_write(anon_vma); | 157 | anon_vma_unlock_write(anon_vma); |
158 | if (mapping) | 158 | if (mapping) |
159 | mutex_unlock(&mapping->i_mmap_mutex); | 159 | i_mmap_unlock_write(mapping); |
160 | } | 160 | } |
161 | 161 | ||
162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/nommu.c b/mm/nommu.c index bd1808e194a7..b51eadf6d952 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
722 | if (vma->vm_file) { | 722 | if (vma->vm_file) { |
723 | mapping = vma->vm_file->f_mapping; | 723 | mapping = vma->vm_file->f_mapping; |
724 | 724 | ||
725 | mutex_lock(&mapping->i_mmap_mutex); | 725 | i_mmap_lock_write(mapping); |
726 | flush_dcache_mmap_lock(mapping); | 726 | flush_dcache_mmap_lock(mapping); |
727 | vma_interval_tree_insert(vma, &mapping->i_mmap); | 727 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
728 | flush_dcache_mmap_unlock(mapping); | 728 | flush_dcache_mmap_unlock(mapping); |
729 | mutex_unlock(&mapping->i_mmap_mutex); | 729 | i_mmap_unlock_write(mapping); |
730 | } | 730 | } |
731 | 731 | ||
732 | /* add the VMA to the tree */ | 732 | /* add the VMA to the tree */ |
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
795 | if (vma->vm_file) { | 795 | if (vma->vm_file) { |
796 | mapping = vma->vm_file->f_mapping; | 796 | mapping = vma->vm_file->f_mapping; |
797 | 797 | ||
798 | mutex_lock(&mapping->i_mmap_mutex); | 798 | i_mmap_lock_write(mapping); |
799 | flush_dcache_mmap_lock(mapping); | 799 | flush_dcache_mmap_lock(mapping); |
800 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 800 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
801 | flush_dcache_mmap_unlock(mapping); | 801 | flush_dcache_mmap_unlock(mapping); |
802 | mutex_unlock(&mapping->i_mmap_mutex); | 802 | i_mmap_unlock_write(mapping); |
803 | } | 803 | } |
804 | 804 | ||
805 | /* remove from the MM's tree and list */ | 805 | /* remove from the MM's tree and list */ |
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1149 | unsigned long len, | 1149 | unsigned long len, |
1150 | unsigned long capabilities) | 1150 | unsigned long capabilities) |
1151 | { | 1151 | { |
1152 | struct page *pages; | 1152 | unsigned long total, point; |
1153 | unsigned long total, point, n; | ||
1154 | void *base; | 1153 | void *base; |
1155 | int ret, order; | 1154 | int ret, order; |
1156 | 1155 | ||
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1182 | order = get_order(len); | 1181 | order = get_order(len); |
1183 | kdebug("alloc order %d for %lx", order, len); | 1182 | kdebug("alloc order %d for %lx", order, len); |
1184 | 1183 | ||
1185 | pages = alloc_pages(GFP_KERNEL, order); | ||
1186 | if (!pages) | ||
1187 | goto enomem; | ||
1188 | |||
1189 | total = 1 << order; | 1184 | total = 1 << order; |
1190 | atomic_long_add(total, &mmap_pages_allocated); | ||
1191 | |||
1192 | point = len >> PAGE_SHIFT; | 1185 | point = len >> PAGE_SHIFT; |
1193 | 1186 | ||
1194 | /* we allocated a power-of-2 sized page set, so we may want to trim off | 1187 | /* we don't want to allocate a power-of-2 sized page set */ |
1195 | * the excess */ | ||
1196 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1188 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { |
1197 | while (total > point) { | 1189 | total = point; |
1198 | order = ilog2(total - point); | 1190 | kdebug("try to alloc exact %lu pages", total); |
1199 | n = 1 << order; | 1191 | base = alloc_pages_exact(len, GFP_KERNEL); |
1200 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | 1192 | } else { |
1201 | atomic_long_sub(n, &mmap_pages_allocated); | 1193 | base = (void *)__get_free_pages(GFP_KERNEL, order); |
1202 | total -= n; | ||
1203 | set_page_refcounted(pages + total); | ||
1204 | __free_pages(pages + total, order); | ||
1205 | } | ||
1206 | } | 1194 | } |
1207 | 1195 | ||
1208 | for (point = 1; point < total; point++) | 1196 | if (!base) |
1209 | set_page_refcounted(&pages[point]); | 1197 | goto enomem; |
1198 | |||
1199 | atomic_long_add(total, &mmap_pages_allocated); | ||
1210 | 1200 | ||
1211 | base = page_address(pages); | ||
1212 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; | 1201 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
1213 | region->vm_start = (unsigned long) base; | 1202 | region->vm_start = (unsigned long) base; |
1214 | region->vm_end = region->vm_start + len; | 1203 | region->vm_end = region->vm_start + len; |
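The do_mmap_private() rewrite above drops the allocate-then-trim loop in favour of choosing the allocator up front. A worked instance of that decision, with illustrative numbers only:

	/* Illustrative numbers: a 5-page private mapping under the new code path. */
	unsigned long len   = 5 * PAGE_SIZE;
	int order           = get_order(len);		/* 3				*/
	unsigned long total = 1 << order;		/* 8 pages if rounded to 2^order */
	unsigned long point = len >> PAGE_SHIFT;	/* 5 pages actually needed	*/
	void *base;

	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
		/* the 3-page excess is worth avoiding: ask for exactly 5 pages */
		base = alloc_pages_exact(len, GFP_KERNEL);
	else
		/* excess is below the trim threshold: take the whole 2^order block */
		base = (void *)__get_free_pages(GFP_KERNEL, order);

Either way mmap_pages_allocated is only bumped once the allocation succeeds, which is what lets the old shave-the-tail loop and its per-page refcount fixups go away.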
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2094 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2083 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2095 | 2084 | ||
2096 | down_write(&nommu_region_sem); | 2085 | down_write(&nommu_region_sem); |
2097 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2086 | i_mmap_lock_read(inode->i_mapping); |
2098 | 2087 | ||
2099 | /* search for VMAs that fall within the dead zone */ | 2088 | /* search for VMAs that fall within the dead zone */ |
2100 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { | 2089 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2101 | /* found one - only interested if it's shared out of the page | 2090 | /* found one - only interested if it's shared out of the page |
2102 | * cache */ | 2091 | * cache */ |
2103 | if (vma->vm_flags & VM_SHARED) { | 2092 | if (vma->vm_flags & VM_SHARED) { |
2104 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2093 | i_mmap_unlock_read(inode->i_mapping); |
2105 | up_write(&nommu_region_sem); | 2094 | up_write(&nommu_region_sem); |
2106 | return -ETXTBSY; /* not quite true, but near enough */ | 2095 | return -ETXTBSY; /* not quite true, but near enough */ |
2107 | } | 2096 | } |
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2113 | * we don't check for any regions that start beyond the EOF as there | 2102 | * we don't check for any regions that start beyond the EOF as there |
2114 | * shouldn't be any | 2103 | * shouldn't be any |
2115 | */ | 2104 | */ |
2116 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, | 2105 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { |
2117 | 0, ULONG_MAX) { | ||
2118 | if (!(vma->vm_flags & VM_SHARED)) | 2106 | if (!(vma->vm_flags & VM_SHARED)) |
2119 | continue; | 2107 | continue; |
2120 | 2108 | ||
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2129 | } | 2117 | } |
2130 | } | 2118 | } |
2131 | 2119 | ||
2132 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2120 | i_mmap_unlock_read(inode->i_mapping); |
2133 | up_write(&nommu_region_sem); | 2121 | up_write(&nommu_region_sem); |
2134 | return 0; | 2122 | return 0; |
2135 | } | 2123 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 864bba992735..d503e9ce1c7b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
281 | if (oom_task_origin(task)) | 281 | if (oom_task_origin(task)) |
282 | return OOM_SCAN_SELECT; | 282 | return OOM_SCAN_SELECT; |
283 | 283 | ||
284 | if (task->flags & PF_EXITING && !force_kill) { | 284 | if (task_will_free_mem(task) && !force_kill) |
285 | /* | 285 | return OOM_SCAN_ABORT; |
286 | * If this task is not being ptraced on exit, then wait for it | 286 | |
287 | * to finish before killing some other task unnecessarily. | ||
288 | */ | ||
289 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
290 | return OOM_SCAN_ABORT; | ||
291 | } | ||
292 | return OOM_SCAN_OK; | 287 | return OOM_SCAN_OK; |
293 | } | 288 | } |
294 | 289 | ||
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
443 | * If the task is already exiting, don't alarm the sysadmin or kill | 438 | * If the task is already exiting, don't alarm the sysadmin or kill |
444 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 439 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
445 | */ | 440 | */ |
446 | if (p->flags & PF_EXITING) { | 441 | if (task_will_free_mem(p)) { |
447 | set_tsk_thread_flag(p, TIF_MEMDIE); | 442 | set_tsk_thread_flag(p, TIF_MEMDIE); |
448 | put_task_struct(p); | 443 | put_task_struct(p); |
449 | return; | 444 | return; |
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
649 | * select it. The goal is to allow it to allocate so that it may | 644 | * select it. The goal is to allow it to allocate so that it may |
650 | * quickly exit and free its memory. | 645 | * quickly exit and free its memory. |
651 | */ | 646 | */ |
652 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
653 | set_thread_flag(TIF_MEMDIE); | 648 | set_thread_flag(TIF_MEMDIE); |
654 | return; | 649 | return; |
655 | } | 650 | } |
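The three oom_kill.c hunks above fold the open-coded PF_EXITING checks into a task_will_free_mem() helper whose definition is not part of this excerpt. A sketch of what such a helper plausibly looks like (presumably added to include/linux/oom.h elsewhere in this series), based on the old code plus the apparent intent of not trusting a coredumping task to exit promptly:

	/* Hedged sketch -- the real helper may differ in detail. */
	static inline bool task_will_free_mem(struct task_struct *task)
	{
		/*
		 * An exiting task is expected to release its memory soon, but a
		 * coredumping one can stay in exit_mm() for a long time, so
		 * don't treat it as "about to free memory".
		 */
		return (task->flags & PF_EXITING) &&
		       !(task->signal->flags & SIGNAL_GROUP_COREDUMP);
	}

Note that the OOM_SCAN_ABORT path also loses the PT_TRACE_EXIT special case; whether that check moved into the helper or was dropped outright is not visible from this hunk alone.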
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df542feaac3b..fa974d87f60d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_ext.h> | ||
51 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
52 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
53 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
@@ -55,9 +56,10 @@ | |||
55 | #include <linux/prefetch.h> | 56 | #include <linux/prefetch.h> |
56 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
57 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
58 | #include <linux/page-debug-flags.h> | 59 | #include <linux/page_ext.h> |
59 | #include <linux/hugetlb.h> | 60 | #include <linux/hugetlb.h> |
60 | #include <linux/sched/rt.h> | 61 | #include <linux/sched/rt.h> |
62 | #include <linux/page_owner.h> | ||
61 | 63 | ||
62 | #include <asm/sections.h> | 64 | #include <asm/sections.h> |
63 | #include <asm/tlbflush.h> | 65 | #include <asm/tlbflush.h> |
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order, | |||
424 | 426 | ||
425 | #ifdef CONFIG_DEBUG_PAGEALLOC | 427 | #ifdef CONFIG_DEBUG_PAGEALLOC |
426 | unsigned int _debug_guardpage_minorder; | 428 | unsigned int _debug_guardpage_minorder; |
429 | bool _debug_pagealloc_enabled __read_mostly; | ||
430 | bool _debug_guardpage_enabled __read_mostly; | ||
431 | |||
432 | static int __init early_debug_pagealloc(char *buf) | ||
433 | { | ||
434 | if (!buf) | ||
435 | return -EINVAL; | ||
436 | |||
437 | if (strcmp(buf, "on") == 0) | ||
438 | _debug_pagealloc_enabled = true; | ||
439 | |||
440 | return 0; | ||
441 | } | ||
442 | early_param("debug_pagealloc", early_debug_pagealloc); | ||
443 | |||
444 | static bool need_debug_guardpage(void) | ||
445 | { | ||
446 | /* If we don't use debug_pagealloc, we don't need guard page */ | ||
447 | if (!debug_pagealloc_enabled()) | ||
448 | return false; | ||
449 | |||
450 | return true; | ||
451 | } | ||
452 | |||
453 | static void init_debug_guardpage(void) | ||
454 | { | ||
455 | if (!debug_pagealloc_enabled()) | ||
456 | return; | ||
457 | |||
458 | _debug_guardpage_enabled = true; | ||
459 | } | ||
460 | |||
461 | struct page_ext_operations debug_guardpage_ops = { | ||
462 | .need = need_debug_guardpage, | ||
463 | .init = init_debug_guardpage, | ||
464 | }; | ||
427 | 465 | ||
428 | static int __init debug_guardpage_minorder_setup(char *buf) | 466 | static int __init debug_guardpage_minorder_setup(char *buf) |
429 | { | 467 | { |
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
439 | } | 477 | } |
440 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 478 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
441 | 479 | ||
442 | static inline void set_page_guard_flag(struct page *page) | 480 | static inline void set_page_guard(struct zone *zone, struct page *page, |
481 | unsigned int order, int migratetype) | ||
443 | { | 482 | { |
444 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 483 | struct page_ext *page_ext; |
484 | |||
485 | if (!debug_guardpage_enabled()) | ||
486 | return; | ||
487 | |||
488 | page_ext = lookup_page_ext(page); | ||
489 | __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
490 | |||
491 | INIT_LIST_HEAD(&page->lru); | ||
492 | set_page_private(page, order); | ||
493 | /* Guard pages are not available for any usage */ | ||
494 | __mod_zone_freepage_state(zone, -(1 << order), migratetype); | ||
445 | } | 495 | } |
446 | 496 | ||
447 | static inline void clear_page_guard_flag(struct page *page) | 497 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
498 | unsigned int order, int migratetype) | ||
448 | { | 499 | { |
449 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 500 | struct page_ext *page_ext; |
501 | |||
502 | if (!debug_guardpage_enabled()) | ||
503 | return; | ||
504 | |||
505 | page_ext = lookup_page_ext(page); | ||
506 | __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
507 | |||
508 | set_page_private(page, 0); | ||
509 | if (!is_migrate_isolate(migratetype)) | ||
510 | __mod_zone_freepage_state(zone, (1 << order), migratetype); | ||
450 | } | 511 | } |
451 | #else | 512 | #else |
452 | static inline void set_page_guard_flag(struct page *page) { } | 513 | struct page_ext_operations debug_guardpage_ops = { NULL, }; |
453 | static inline void clear_page_guard_flag(struct page *page) { } | 514 | static inline void set_page_guard(struct zone *zone, struct page *page, |
515 | unsigned int order, int migratetype) {} | ||
516 | static inline void clear_page_guard(struct zone *zone, struct page *page, | ||
517 | unsigned int order, int migratetype) {} | ||
454 | #endif | 518 | #endif |
455 | 519 | ||
456 | static inline void set_page_order(struct page *page, unsigned int order) | 520 | static inline void set_page_order(struct page *page, unsigned int order) |
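With the hunks above, the guard-page marker moves from page->debug_flags into the page_ext bit PAGE_EXT_DEBUG_GUARD, and guard pages are only armed when the kernel is booted with debug_pagealloc=on (parsed by early_debug_pagealloc() above) in addition to CONFIG_DEBUG_PAGEALLOC. The page_is_guard() test used by __free_one_page() is not shown in this excerpt; a hedged sketch of how it presumably looks after the conversion (names follow the hunks, the actual header change lives elsewhere in the series):

	static inline bool page_is_guard(struct page *page)
	{
		struct page_ext *page_ext;

		if (!debug_guardpage_enabled())
			return false;

		page_ext = lookup_page_ext(page);
		return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
	}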
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page, | |||
581 | * merge with it and move up one order. | 645 | * merge with it and move up one order. |
582 | */ | 646 | */ |
583 | if (page_is_guard(buddy)) { | 647 | if (page_is_guard(buddy)) { |
584 | clear_page_guard_flag(buddy); | 648 | clear_page_guard(zone, buddy, order, migratetype); |
585 | set_page_private(buddy, 0); | ||
586 | if (!is_migrate_isolate(migratetype)) { | ||
587 | __mod_zone_freepage_state(zone, 1 << order, | ||
588 | migratetype); | ||
589 | } | ||
590 | } else { | 649 | } else { |
591 | list_del(&buddy->lru); | 650 | list_del(&buddy->lru); |
592 | zone->free_area[order].nr_free--; | 651 | zone->free_area[order].nr_free--; |
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
755 | if (bad) | 814 | if (bad) |
756 | return false; | 815 | return false; |
757 | 816 | ||
817 | reset_page_owner(page, order); | ||
818 | |||
758 | if (!PageHighMem(page)) { | 819 | if (!PageHighMem(page)) { |
759 | debug_check_no_locks_freed(page_address(page), | 820 | debug_check_no_locks_freed(page_address(page), |
760 | PAGE_SIZE << order); | 821 | PAGE_SIZE << order); |
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page, | |||
861 | size >>= 1; | 922 | size >>= 1; |
862 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); | 923 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
863 | 924 | ||
864 | #ifdef CONFIG_DEBUG_PAGEALLOC | 925 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && |
865 | if (high < debug_guardpage_minorder()) { | 926 | debug_guardpage_enabled() && |
927 | high < debug_guardpage_minorder()) { | ||
866 | /* | 928 | /* |
867 | * Mark as guard pages (or page), that will allow to | 929 | * Mark as guard pages (or page), that will allow to |
868 | * merge back to allocator when buddy will be freed. | 930 | * merge back to allocator when buddy will be freed. |
869 | * Corresponding page table entries will not be touched, | 931 | * Corresponding page table entries will not be touched, |
870 | * pages will stay not present in virtual address space | 932 | * pages will stay not present in virtual address space |
871 | */ | 933 | */ |
872 | INIT_LIST_HEAD(&page[size].lru); | 934 | set_page_guard(zone, &page[size], high, migratetype); |
873 | set_page_guard_flag(&page[size]); | ||
874 | set_page_private(&page[size], high); | ||
875 | /* Guard pages are not available for any usage */ | ||
876 | __mod_zone_freepage_state(zone, -(1 << high), | ||
877 | migratetype); | ||
878 | continue; | 935 | continue; |
879 | } | 936 | } |
880 | #endif | ||
881 | list_add(&page[size].lru, &area->free_list[migratetype]); | 937 | list_add(&page[size].lru, &area->free_list[migratetype]); |
882 | area->nr_free++; | 938 | area->nr_free++; |
883 | set_page_order(&page[size], high); | 939 | set_page_order(&page[size], high); |
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
935 | if (order && (gfp_flags & __GFP_COMP)) | 991 | if (order && (gfp_flags & __GFP_COMP)) |
936 | prep_compound_page(page, order); | 992 | prep_compound_page(page, order); |
937 | 993 | ||
994 | set_page_owner(page, order, gfp_flags); | ||
995 | |||
938 | return 0; | 996 | return 0; |
939 | } | 997 | } |
940 | 998 | ||
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order) | |||
1507 | split_page(virt_to_page(page[0].shadow), order); | 1565 | split_page(virt_to_page(page[0].shadow), order); |
1508 | #endif | 1566 | #endif |
1509 | 1567 | ||
1510 | for (i = 1; i < (1 << order); i++) | 1568 | set_page_owner(page, 0, 0); |
1569 | for (i = 1; i < (1 << order); i++) { | ||
1511 | set_page_refcounted(page + i); | 1570 | set_page_refcounted(page + i); |
1571 | set_page_owner(page + i, 0, 0); | ||
1572 | } | ||
1512 | } | 1573 | } |
1513 | EXPORT_SYMBOL_GPL(split_page); | 1574 | EXPORT_SYMBOL_GPL(split_page); |
1514 | 1575 | ||
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
1548 | } | 1609 | } |
1549 | } | 1610 | } |
1550 | 1611 | ||
1612 | set_page_owner(page, order, 0); | ||
1551 | return 1UL << order; | 1613 | return 1UL << order; |
1552 | } | 1614 | } |
1553 | 1615 | ||
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4856 | #endif | 4918 | #endif |
4857 | init_waitqueue_head(&pgdat->kswapd_wait); | 4919 | init_waitqueue_head(&pgdat->kswapd_wait); |
4858 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4920 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4921 | pgdat_page_ext_init(pgdat); | ||
4859 | 4922 | ||
4860 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4923 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4861 | struct zone *zone = pgdat->node_zones + j; | 4924 | struct zone *zone = pgdat->node_zones + j; |
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4874 | * and per-cpu initialisations | 4937 | * and per-cpu initialisations |
4875 | */ | 4938 | */ |
4876 | memmap_pages = calc_memmap_size(size, realsize); | 4939 | memmap_pages = calc_memmap_size(size, realsize); |
4877 | if (freesize >= memmap_pages) { | 4940 | if (!is_highmem_idx(j)) { |
4878 | freesize -= memmap_pages; | 4941 | if (freesize >= memmap_pages) { |
4879 | if (memmap_pages) | 4942 | freesize -= memmap_pages; |
4880 | printk(KERN_DEBUG | 4943 | if (memmap_pages) |
4881 | " %s zone: %lu pages used for memmap\n", | 4944 | printk(KERN_DEBUG |
4882 | zone_names[j], memmap_pages); | 4945 | " %s zone: %lu pages used for memmap\n", |
4883 | } else | 4946 | zone_names[j], memmap_pages); |
4884 | printk(KERN_WARNING | 4947 | } else |
4885 | " %s zone: %lu pages exceeds freesize %lu\n", | 4948 | printk(KERN_WARNING |
4886 | zone_names[j], memmap_pages, freesize); | 4949 | " %s zone: %lu pages exceeds freesize %lu\n", |
4950 | zone_names[j], memmap_pages, freesize); | ||
4951 | } | ||
4887 | 4952 | ||
4888 | /* Account for reserved pages */ | 4953 | /* Account for reserved pages */ |
4889 | if (j == 0 && freesize > dma_reserve) { | 4954 | if (j == 0 && freesize > dma_reserve) { |
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
6221 | if (!PageLRU(page)) | 6286 | if (!PageLRU(page)) |
6222 | found++; | 6287 | found++; |
6223 | /* | 6288 | /* |
6224 | * If there are RECLAIMABLE pages, we need to check it. | 6289 | * If there are RECLAIMABLE pages, we need to check |
6225 | * But now, memory offline itself doesn't call shrink_slab() | 6290 | * it. But now, memory offline itself doesn't call |
6226 | * and it still to be fixed. | 6291 | * shrink_node_slabs() and that still needs to be fixed. |
6227 | */ | 6292 | */ |
6228 | /* | 6293 | /* |
6229 | * If the page is not RAM, page_count()should be 0. | 6294 | * If the page is not RAM, page_count()should be 0. |
diff --git a/mm/page_ext.c b/mm/page_ext.c new file mode 100644 index 000000000000..d86fd2f5353f --- /dev/null +++ b/mm/page_ext.c | |||
@@ -0,0 +1,403 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/page_ext.h> | ||
5 | #include <linux/memory.h> | ||
6 | #include <linux/vmalloc.h> | ||
7 | #include <linux/kmemleak.h> | ||
8 | #include <linux/page_owner.h> | ||
9 | |||
10 | /* | ||
11 | * struct page extension | ||
12 | * | ||
13 | * This is the feature to manage memory for extended data per page. | ||
14 | * | ||
15 | * Until now, we must modify struct page itself to store extra data per page. | ||
16 | * This requires rebuilding the kernel, which is a really time-consuming process. | ||
17 | * And, sometimes, a rebuild is impossible due to third-party module dependencies. | ||
18 | * Lastly, enlarging struct page could cause unwanted system behaviour changes. | ||
19 | * | ||
20 | * This feature is intended to overcome the above problems. It | ||
21 | * allocates memory for extended data per page in a certain place rather than | ||
22 | * in struct page itself. This memory can be accessed by the accessor | ||
23 | * functions provided by this code. During the boot process, it checks whether | ||
24 | * allocation of a huge chunk of memory is needed or not. If not, it avoids | ||
25 | * allocating memory at all. With this advantage, we can include this feature | ||
26 | * in the kernel by default and can avoid rebuilds and solve the related problems. | ||
27 | * | ||
28 | * To help these things work well, there are two callbacks for clients. One | ||
29 | * is the need callback, which is mandatory if the user wants to avoid useless | ||
30 | * memory allocation at boot time. The other is the optional init callback, which | ||
31 | * is used to do proper initialization after memory is allocated. | ||
32 | * | ||
33 | * The need callback is used to decide whether extended memory allocation is | ||
34 | * needed or not. Sometimes users want to deactivate some features for a given | ||
35 | * boot and the extra memory would be unnecessary. In this case, to avoid | ||
36 | * allocating a huge chunk of memory, each client expresses its need for | ||
37 | * extra memory through the need callback. If one of the need callbacks | ||
38 | * returns true, it means that someone needs extra memory, so the | ||
39 | * page extension core should allocate memory for page extension. If | ||
40 | * none of the need callbacks returns true, memory isn't needed at all for this | ||
41 | * boot and the page extension core can skip the allocation. As a result, | ||
42 | * no memory is wasted. | ||
43 | * | ||
44 | * The init callback is used to do proper initialization after page extension | ||
45 | * is completely initialized. In a sparse memory system, the extra memory is | ||
46 | * allocated some time later than the memmap. In other words, the lifetime of | ||
47 | * the page extension memory isn't the same as that of the memmap for struct page. | ||
48 | * Therefore, clients can't store extra data until page extension is | ||
49 | * initialized, even though pages may already be allocated and in use. This could | ||
50 | * leave the per-page extra data in an inadequate state, so, to prevent that, a | ||
51 | * client can use this callback to initialize the state correctly. | ||
52 | */ | ||
53 | |||
54 | static struct page_ext_operations *page_ext_ops[] = { | ||
55 | &debug_guardpage_ops, | ||
56 | #ifdef CONFIG_PAGE_POISONING | ||
57 | &page_poisoning_ops, | ||
58 | #endif | ||
59 | #ifdef CONFIG_PAGE_OWNER | ||
60 | &page_owner_ops, | ||
61 | #endif | ||
62 | }; | ||
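The long comment above describes the need/init contract in the abstract; a hypothetical client (not part of this series, names made up for illustration) would plug in roughly like this, mirroring the shape of debug_guardpage_ops and page_owner_ops:

	/* Hypothetical example client. */
	static bool my_feature_enabled;		/* set from a boot parameter */

	static bool need_my_feature(void)
	{
		/* returning false keeps the page_ext arrays from being
		 * allocated on behalf of this client */
		return my_feature_enabled;
	}

	static void init_my_feature(void)
	{
		/* called once the page_ext arrays exist; safe to start
		 * using lookup_page_ext() from here on */
	}

	struct page_ext_operations my_feature_ops = {
		.need = need_my_feature,
		.init = init_my_feature,
	};

	/* ...and &my_feature_ops would be listed in page_ext_ops[] above. */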
63 | |||
64 | static unsigned long total_usage; | ||
65 | |||
66 | static bool __init invoke_need_callbacks(void) | ||
67 | { | ||
68 | int i; | ||
69 | int entries = ARRAY_SIZE(page_ext_ops); | ||
70 | |||
71 | for (i = 0; i < entries; i++) { | ||
72 | if (page_ext_ops[i]->need && page_ext_ops[i]->need()) | ||
73 | return true; | ||
74 | } | ||
75 | |||
76 | return false; | ||
77 | } | ||
78 | |||
79 | static void __init invoke_init_callbacks(void) | ||
80 | { | ||
81 | int i; | ||
82 | int entries = ARRAY_SIZE(page_ext_ops); | ||
83 | |||
84 | for (i = 0; i < entries; i++) { | ||
85 | if (page_ext_ops[i]->init) | ||
86 | page_ext_ops[i]->init(); | ||
87 | } | ||
88 | } | ||
89 | |||
90 | #if !defined(CONFIG_SPARSEMEM) | ||
91 | |||
92 | |||
93 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
94 | { | ||
95 | pgdat->node_page_ext = NULL; | ||
96 | } | ||
97 | |||
98 | struct page_ext *lookup_page_ext(struct page *page) | ||
99 | { | ||
100 | unsigned long pfn = page_to_pfn(page); | ||
101 | unsigned long offset; | ||
102 | struct page_ext *base; | ||
103 | |||
104 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | ||
105 | #ifdef CONFIG_DEBUG_VM | ||
106 | /* | ||
107 | * The sanity checks the page allocator does upon freeing a | ||
108 | * page can reach here before the page_ext arrays are | ||
109 | * allocated when feeding a range of pages to the allocator | ||
110 | * for the first time during bootup or memory hotplug. | ||
111 | */ | ||
112 | if (unlikely(!base)) | ||
113 | return NULL; | ||
114 | #endif | ||
115 | offset = pfn - round_down(node_start_pfn(page_to_nid(page)), | ||
116 | MAX_ORDER_NR_PAGES); | ||
117 | return base + offset; | ||
118 | } | ||
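A quick worked example of the flatmem lookup above, with illustrative numbers: the node may start at a pfn that is not MAX_ORDER aligned, which is exactly why alloc_node_page_ext() below pads the table by MAX_ORDER_NR_PAGES.

	/* Illustrative values only (assume MAX_ORDER_NR_PAGES == 0x400). */
	unsigned long node_start = 0x1234;	/* node_start_pfn() of the page's node */
	unsigned long pfn        = 0x1500;	/* pfn being looked up                 */
	unsigned long offset     = pfn - round_down(node_start, MAX_ORDER_NR_PAGES);
						/* round_down -> 0x1000, offset = 0x500 */
	/* lookup_page_ext() then returns node_page_ext + 0x500 */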
119 | |||
120 | static int __init alloc_node_page_ext(int nid) | ||
121 | { | ||
122 | struct page_ext *base; | ||
123 | unsigned long table_size; | ||
124 | unsigned long nr_pages; | ||
125 | |||
126 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
127 | if (!nr_pages) | ||
128 | return 0; | ||
129 | |||
130 | /* | ||
131 | * Need extra space if node range is not aligned with | ||
132 | * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm | ||
133 | * checks buddy's status, range could be out of exact node range. | ||
134 | */ | ||
135 | if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || | ||
136 | !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) | ||
137 | nr_pages += MAX_ORDER_NR_PAGES; | ||
138 | |||
139 | table_size = sizeof(struct page_ext) * nr_pages; | ||
140 | |||
141 | base = memblock_virt_alloc_try_nid_nopanic( | ||
142 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | ||
143 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
144 | if (!base) | ||
145 | return -ENOMEM; | ||
146 | NODE_DATA(nid)->node_page_ext = base; | ||
147 | total_usage += table_size; | ||
148 | return 0; | ||
149 | } | ||
150 | |||
151 | void __init page_ext_init_flatmem(void) | ||
152 | { | ||
153 | |||
154 | int nid, fail; | ||
155 | |||
156 | if (!invoke_need_callbacks()) | ||
157 | return; | ||
158 | |||
159 | for_each_online_node(nid) { | ||
160 | fail = alloc_node_page_ext(nid); | ||
161 | if (fail) | ||
162 | goto fail; | ||
163 | } | ||
164 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
165 | invoke_init_callbacks(); | ||
166 | return; | ||
167 | |||
168 | fail: | ||
169 | pr_crit("allocation of page_ext failed.\n"); | ||
170 | panic("Out of memory"); | ||
171 | } | ||
172 | |||
173 | #else /* CONFIG_SPARSEMEM */ | ||
174 | |||
175 | struct page_ext *lookup_page_ext(struct page *page) | ||
176 | { | ||
177 | unsigned long pfn = page_to_pfn(page); | ||
178 | struct mem_section *section = __pfn_to_section(pfn); | ||
179 | #ifdef CONFIG_DEBUG_VM | ||
180 | /* | ||
181 | * The sanity checks the page allocator does upon freeing a | ||
182 | * page can reach here before the page_ext arrays are | ||
183 | * allocated when feeding a range of pages to the allocator | ||
184 | * for the first time during bootup or memory hotplug. | ||
185 | */ | ||
186 | if (!section->page_ext) | ||
187 | return NULL; | ||
188 | #endif | ||
189 | return section->page_ext + pfn; | ||
190 | } | ||
191 | |||
192 | static void *__meminit alloc_page_ext(size_t size, int nid) | ||
193 | { | ||
194 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
195 | void *addr = NULL; | ||
196 | |||
197 | addr = alloc_pages_exact_nid(nid, size, flags); | ||
198 | if (addr) { | ||
199 | kmemleak_alloc(addr, size, 1, flags); | ||
200 | return addr; | ||
201 | } | ||
202 | |||
203 | if (node_state(nid, N_HIGH_MEMORY)) | ||
204 | addr = vzalloc_node(size, nid); | ||
205 | else | ||
206 | addr = vzalloc(size); | ||
207 | |||
208 | return addr; | ||
209 | } | ||
210 | |||
211 | static int __meminit init_section_page_ext(unsigned long pfn, int nid) | ||
212 | { | ||
213 | struct mem_section *section; | ||
214 | struct page_ext *base; | ||
215 | unsigned long table_size; | ||
216 | |||
217 | section = __pfn_to_section(pfn); | ||
218 | |||
219 | if (section->page_ext) | ||
220 | return 0; | ||
221 | |||
222 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
223 | base = alloc_page_ext(table_size, nid); | ||
224 | |||
225 | /* | ||
226 | * The value stored in section->page_ext is (base - pfn) | ||
227 | * and it does not point to the memory block allocated above, | ||
228 | * causing kmemleak false positives. | ||
229 | */ | ||
230 | kmemleak_not_leak(base); | ||
231 | |||
232 | if (!base) { | ||
233 | pr_err("page ext allocation failure\n"); | ||
234 | return -ENOMEM; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
239 | * we need to apply a mask. | ||
240 | */ | ||
241 | pfn &= PAGE_SECTION_MASK; | ||
242 | section->page_ext = base - pfn; | ||
243 | total_usage += table_size; | ||
244 | return 0; | ||
245 | } | ||
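The (base - pfn) bias stored above is what keeps the sparsemem lookup in lookup_page_ext() a single addition. A sketch of the round trip, writing pfn0 for the section-aligned pfn computed just before the store:

	section->page_ext = base - pfn0;	/* pfn0 = pfn & PAGE_SECTION_MASK */
	/* later, for any pfn inside the same section: */
	ext = section->page_ext + pfn;		/* == base + (pfn - pfn0)         */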
246 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
247 | static void free_page_ext(void *addr) | ||
248 | { | ||
249 | if (is_vmalloc_addr(addr)) { | ||
250 | vfree(addr); | ||
251 | } else { | ||
252 | struct page *page = virt_to_page(addr); | ||
253 | size_t table_size; | ||
254 | |||
255 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
256 | |||
257 | BUG_ON(PageReserved(page)); | ||
258 | free_pages_exact(addr, table_size); | ||
259 | } | ||
260 | } | ||
261 | |||
262 | static void __free_page_ext(unsigned long pfn) | ||
263 | { | ||
264 | struct mem_section *ms; | ||
265 | struct page_ext *base; | ||
266 | |||
267 | ms = __pfn_to_section(pfn); | ||
268 | if (!ms || !ms->page_ext) | ||
269 | return; | ||
270 | base = ms->page_ext + pfn; | ||
271 | free_page_ext(base); | ||
272 | ms->page_ext = NULL; | ||
273 | } | ||
274 | |||
275 | static int __meminit online_page_ext(unsigned long start_pfn, | ||
276 | unsigned long nr_pages, | ||
277 | int nid) | ||
278 | { | ||
279 | unsigned long start, end, pfn; | ||
280 | int fail = 0; | ||
281 | |||
282 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
283 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
284 | |||
285 | if (nid == -1) { | ||
286 | /* | ||
287 | * In this case, "nid" already exists and contains valid memory. | ||
288 | * "start_pfn" passed to us is a pfn which is an arg for | ||
289 | * online__pages(), and start_pfn should exist. | ||
290 | */ | ||
291 | nid = pfn_to_nid(start_pfn); | ||
292 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
293 | } | ||
294 | |||
295 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
296 | if (!pfn_present(pfn)) | ||
297 | continue; | ||
298 | fail = init_section_page_ext(pfn, nid); | ||
299 | } | ||
300 | if (!fail) | ||
301 | return 0; | ||
302 | |||
303 | /* rollback */ | ||
304 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
305 | __free_page_ext(pfn); | ||
306 | |||
307 | return -ENOMEM; | ||
308 | } | ||
309 | |||
310 | static int __meminit offline_page_ext(unsigned long start_pfn, | ||
311 | unsigned long nr_pages, int nid) | ||
312 | { | ||
313 | unsigned long start, end, pfn; | ||
314 | |||
315 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
316 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
317 | |||
318 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
319 | __free_page_ext(pfn); | ||
320 | return 0; | ||
321 | |||
322 | } | ||
323 | |||
324 | static int __meminit page_ext_callback(struct notifier_block *self, | ||
325 | unsigned long action, void *arg) | ||
326 | { | ||
327 | struct memory_notify *mn = arg; | ||
328 | int ret = 0; | ||
329 | |||
330 | switch (action) { | ||
331 | case MEM_GOING_ONLINE: | ||
332 | ret = online_page_ext(mn->start_pfn, | ||
333 | mn->nr_pages, mn->status_change_nid); | ||
334 | break; | ||
335 | case MEM_OFFLINE: | ||
336 | offline_page_ext(mn->start_pfn, | ||
337 | mn->nr_pages, mn->status_change_nid); | ||
338 | break; | ||
339 | case MEM_CANCEL_ONLINE: | ||
340 | offline_page_ext(mn->start_pfn, | ||
341 | mn->nr_pages, mn->status_change_nid); | ||
342 | break; | ||
343 | case MEM_GOING_OFFLINE: | ||
344 | break; | ||
345 | case MEM_ONLINE: | ||
346 | case MEM_CANCEL_OFFLINE: | ||
347 | break; | ||
348 | } | ||
349 | |||
350 | return notifier_from_errno(ret); | ||
351 | } | ||
352 | |||
353 | #endif | ||
354 | |||
355 | void __init page_ext_init(void) | ||
356 | { | ||
357 | unsigned long pfn; | ||
358 | int nid; | ||
359 | |||
360 | if (!invoke_need_callbacks()) | ||
361 | return; | ||
362 | |||
363 | for_each_node_state(nid, N_MEMORY) { | ||
364 | unsigned long start_pfn, end_pfn; | ||
365 | |||
366 | start_pfn = node_start_pfn(nid); | ||
367 | end_pfn = node_end_pfn(nid); | ||
368 | /* | ||
369 | * start_pfn and end_pfn may not be aligned to SECTION, and the | ||
370 | * page->flags of out-of-node pages are not initialized. So we | ||
371 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. | ||
372 | */ | ||
373 | for (pfn = start_pfn; pfn < end_pfn; | ||
374 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
375 | |||
376 | if (!pfn_valid(pfn)) | ||
377 | continue; | ||
378 | /* | ||
379 | * Nodes' pfns can overlap. | ||
380 | * We know some architectures can have a node layout such as | ||
381 | * -------------pfn--------------> | ||
382 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
383 | */ | ||
384 | if (pfn_to_nid(pfn) != nid) | ||
385 | continue; | ||
386 | if (init_section_page_ext(pfn, nid)) | ||
387 | goto oom; | ||
388 | } | ||
389 | } | ||
390 | hotplug_memory_notifier(page_ext_callback, 0); | ||
391 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
392 | invoke_init_callbacks(); | ||
393 | return; | ||
394 | |||
395 | oom: | ||
396 | panic("Out of memory"); | ||
397 | } | ||
398 | |||
399 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
400 | { | ||
401 | } | ||
402 | |||
403 | #endif | ||
diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 000000000000..9ab4a9b5bc09 --- /dev/null +++ b/mm/page_owner.c | |||
@@ -0,0 +1,311 @@ | |||
1 | #include <linux/debugfs.h> | ||
2 | #include <linux/mm.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/uaccess.h> | ||
5 | #include <linux/bootmem.h> | ||
6 | #include <linux/stacktrace.h> | ||
7 | #include <linux/page_owner.h> | ||
8 | #include "internal.h" | ||
9 | |||
10 | static bool page_owner_disabled = true; | ||
11 | bool page_owner_inited __read_mostly; | ||
12 | |||
13 | static void init_early_allocated_pages(void); | ||
14 | |||
15 | static int early_page_owner_param(char *buf) | ||
16 | { | ||
17 | if (!buf) | ||
18 | return -EINVAL; | ||
19 | |||
20 | if (strcmp(buf, "on") == 0) | ||
21 | page_owner_disabled = false; | ||
22 | |||
23 | return 0; | ||
24 | } | ||
25 | early_param("page_owner", early_page_owner_param); | ||
26 | |||
27 | static bool need_page_owner(void) | ||
28 | { | ||
29 | if (page_owner_disabled) | ||
30 | return false; | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | static void init_page_owner(void) | ||
36 | { | ||
37 | if (page_owner_disabled) | ||
38 | return; | ||
39 | |||
40 | page_owner_inited = true; | ||
41 | init_early_allocated_pages(); | ||
42 | } | ||
43 | |||
44 | struct page_ext_operations page_owner_ops = { | ||
45 | .need = need_page_owner, | ||
46 | .init = init_page_owner, | ||
47 | }; | ||
48 | |||
49 | void __reset_page_owner(struct page *page, unsigned int order) | ||
50 | { | ||
51 | int i; | ||
52 | struct page_ext *page_ext; | ||
53 | |||
54 | for (i = 0; i < (1 << order); i++) { | ||
55 | page_ext = lookup_page_ext(page + i); | ||
56 | __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | ||
61 | { | ||
62 | struct page_ext *page_ext; | ||
63 | struct stack_trace *trace; | ||
64 | |||
65 | page_ext = lookup_page_ext(page); | ||
66 | |||
67 | trace = &page_ext->trace; | ||
68 | trace->nr_entries = 0; | ||
69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
70 | trace->entries = &page_ext->trace_entries[0]; | ||
71 | trace->skip = 3; | ||
72 | save_stack_trace(&page_ext->trace); | ||
73 | |||
74 | page_ext->order = order; | ||
75 | page_ext->gfp_mask = gfp_mask; | ||
76 | |||
77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
78 | } | ||
79 | |||
80 | static ssize_t | ||
81 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | ||
82 | struct page *page, struct page_ext *page_ext) | ||
83 | { | ||
84 | int ret; | ||
85 | int pageblock_mt, page_mt; | ||
86 | char *kbuf; | ||
87 | |||
88 | kbuf = kmalloc(count, GFP_KERNEL); | ||
89 | if (!kbuf) | ||
90 | return -ENOMEM; | ||
91 | |||
92 | ret = snprintf(kbuf, count, | ||
93 | "Page allocated via order %u, mask 0x%x\n", | ||
94 | page_ext->order, page_ext->gfp_mask); | ||
95 | |||
96 | if (ret >= count) | ||
97 | goto err; | ||
98 | |||
99 | /* Print information relevant to grouping pages by mobility */ | ||
100 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
101 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
102 | ret += snprintf(kbuf + ret, count - ret, | ||
103 | "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", | ||
104 | pfn, | ||
105 | pfn >> pageblock_order, | ||
106 | pageblock_mt, | ||
107 | pageblock_mt != page_mt ? "Fallback" : " ", | ||
108 | PageLocked(page) ? "K" : " ", | ||
109 | PageError(page) ? "E" : " ", | ||
110 | PageReferenced(page) ? "R" : " ", | ||
111 | PageUptodate(page) ? "U" : " ", | ||
112 | PageDirty(page) ? "D" : " ", | ||
113 | PageLRU(page) ? "L" : " ", | ||
114 | PageActive(page) ? "A" : " ", | ||
115 | PageSlab(page) ? "S" : " ", | ||
116 | PageWriteback(page) ? "W" : " ", | ||
117 | PageCompound(page) ? "C" : " ", | ||
118 | PageSwapCache(page) ? "B" : " ", | ||
119 | PageMappedToDisk(page) ? "M" : " "); | ||
120 | |||
121 | if (ret >= count) | ||
122 | goto err; | ||
123 | |||
124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | ||
125 | &page_ext->trace, 0); | ||
126 | if (ret >= count) | ||
127 | goto err; | ||
128 | |||
129 | ret += snprintf(kbuf + ret, count - ret, "\n"); | ||
130 | if (ret >= count) | ||
131 | goto err; | ||
132 | |||
133 | if (copy_to_user(buf, kbuf, ret)) | ||
134 | ret = -EFAULT; | ||
135 | |||
136 | kfree(kbuf); | ||
137 | return ret; | ||
138 | |||
139 | err: | ||
140 | kfree(kbuf); | ||
141 | return -ENOMEM; | ||
142 | } | ||
143 | |||
144 | static ssize_t | ||
145 | read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
146 | { | ||
147 | unsigned long pfn; | ||
148 | struct page *page; | ||
149 | struct page_ext *page_ext; | ||
150 | |||
151 | if (!page_owner_inited) | ||
152 | return -EINVAL; | ||
153 | |||
154 | page = NULL; | ||
155 | pfn = min_low_pfn + *ppos; | ||
156 | |||
157 | /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ | ||
158 | while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) | ||
159 | pfn++; | ||
160 | |||
161 | drain_all_pages(NULL); | ||
162 | |||
163 | /* Find an allocated page */ | ||
164 | for (; pfn < max_pfn; pfn++) { | ||
165 | /* | ||
166 | * If the new page is in a new MAX_ORDER_NR_PAGES area, | ||
167 | * validate the area as existing, skip it if not | ||
168 | */ | ||
169 | if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { | ||
170 | pfn += MAX_ORDER_NR_PAGES - 1; | ||
171 | continue; | ||
172 | } | ||
173 | |||
174 | /* Check for holes within a MAX_ORDER area */ | ||
175 | if (!pfn_valid_within(pfn)) | ||
176 | continue; | ||
177 | |||
178 | page = pfn_to_page(pfn); | ||
179 | if (PageBuddy(page)) { | ||
180 | unsigned long freepage_order = page_order_unsafe(page); | ||
181 | |||
182 | if (freepage_order < MAX_ORDER) | ||
183 | pfn += (1UL << freepage_order) - 1; | ||
184 | continue; | ||
185 | } | ||
186 | |||
187 | page_ext = lookup_page_ext(page); | ||
188 | |||
189 | /* | ||
190 | * Some pages could be missed by concurrent allocation or free, | ||
191 | * because we don't hold the zone lock. | ||
192 | */ | ||
193 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
194 | continue; | ||
195 | |||
196 | /* Record the next PFN to read in the file offset */ | ||
197 | *ppos = (pfn - min_low_pfn) + 1; | ||
198 | |||
199 | return print_page_owner(buf, count, pfn, page, page_ext); | ||
200 | } | ||
201 | |||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | ||
206 | { | ||
207 | struct page *page; | ||
208 | struct page_ext *page_ext; | ||
209 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
210 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
211 | unsigned long count = 0; | ||
212 | |||
213 | /* Scan block by block. First and last block may be incomplete */ | ||
214 | pfn = zone->zone_start_pfn; | ||
215 | |||
216 | /* | ||
217 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
218 | * a zone boundary, it will be double counted between zones. This does | ||
219 | * not matter as the mixed block count will still be correct | ||
220 | */ | ||
221 | for (; pfn < end_pfn; ) { | ||
222 | if (!pfn_valid(pfn)) { | ||
223 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
224 | continue; | ||
225 | } | ||
226 | |||
227 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
228 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
229 | |||
230 | page = pfn_to_page(pfn); | ||
231 | |||
232 | for (; pfn < block_end_pfn; pfn++) { | ||
233 | if (!pfn_valid_within(pfn)) | ||
234 | continue; | ||
235 | |||
236 | page = pfn_to_page(pfn); | ||
237 | |||
238 | /* | ||
239 | * We are safe to check the buddy flag and order, because | ||
240 | * this is the init stage and only a single thread runs. | ||
241 | */ | ||
242 | if (PageBuddy(page)) { | ||
243 | pfn += (1UL << page_order(page)) - 1; | ||
244 | continue; | ||
245 | } | ||
246 | |||
247 | if (PageReserved(page)) | ||
248 | continue; | ||
249 | |||
250 | page_ext = lookup_page_ext(page); | ||
251 | |||
252 | /* Maybe an overlapping zone */ | ||
253 | if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
254 | continue; | ||
255 | |||
256 | /* Found early allocated page */ | ||
257 | set_page_owner(page, 0, 0); | ||
258 | count++; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", | ||
263 | pgdat->node_id, zone->name, count); | ||
264 | } | ||
265 | |||
266 | static void init_zones_in_node(pg_data_t *pgdat) | ||
267 | { | ||
268 | struct zone *zone; | ||
269 | struct zone *node_zones = pgdat->node_zones; | ||
270 | unsigned long flags; | ||
271 | |||
272 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
273 | if (!populated_zone(zone)) | ||
274 | continue; | ||
275 | |||
276 | spin_lock_irqsave(&zone->lock, flags); | ||
277 | init_pages_in_zone(pgdat, zone); | ||
278 | spin_unlock_irqrestore(&zone->lock, flags); | ||
279 | } | ||
280 | } | ||
281 | |||
282 | static void init_early_allocated_pages(void) | ||
283 | { | ||
284 | pg_data_t *pgdat; | ||
285 | |||
286 | drain_all_pages(NULL); | ||
287 | for_each_online_pgdat(pgdat) | ||
288 | init_zones_in_node(pgdat); | ||
289 | } | ||
290 | |||
291 | static const struct file_operations proc_page_owner_operations = { | ||
292 | .read = read_page_owner, | ||
293 | }; | ||
294 | |||
295 | static int __init pageowner_init(void) | ||
296 | { | ||
297 | struct dentry *dentry; | ||
298 | |||
299 | if (!page_owner_inited) { | ||
300 | pr_info("page_owner is disabled\n"); | ||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, | ||
305 | NULL, &proc_page_owner_operations); | ||
306 | if (IS_ERR(dentry)) | ||
307 | return PTR_ERR(dentry); | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | module_init(pageowner_init) | ||
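The page_alloc.c hunks earlier call set_page_owner()/reset_page_owner() directly; the __set_page_owner()/__reset_page_owner() functions above are presumably reached through thin wrappers (most likely in include/linux/page_owner.h, not shown in this excerpt) that bail out until page_owner_inited is set. A hedged sketch of those assumed wrappers:

	extern bool page_owner_inited;

	static inline void set_page_owner(struct page *page,
					  unsigned int order, gfp_t gfp_mask)
	{
		if (likely(!page_owner_inited))
			return;
		__set_page_owner(page, order, gfp_mask);
	}

	static inline void reset_page_owner(struct page *page, unsigned int order)
	{
		if (likely(!page_owner_inited))
			return;
		__reset_page_owner(page, order);
	}

In use, the feature is enabled by booting with page_owner=on (see early_page_owner_param() above) and inspected by reading the debugfs file registered by pageowner_init(), which walks every valid pfn and emits one record per tracked page: the allocation order and gfp mask, a pfn/pageblock/migratetype line, a set of page-flag letters, and the saved stack trace.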
@@ -23,7 +23,7 @@ | |||
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_rwsem |
27 | * anon_vma->rwsem | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
@@ -1260,7 +1260,7 @@ out_mlock: | |||
1260 | /* | 1260 | /* |
1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1262 | * unstable result and race. Plus, We can't wait here because | 1262 | * unstable result and race. Plus, We can't wait here because |
1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. | 1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. |
1264 | * if trylock failed, the page remain in evictable lru and later | 1264 | * if trylock failed, the page remain in evictable lru and later |
1265 | * vmscan could retry to move the page to unevictable lru if the | 1265 | * vmscan could retry to move the page to unevictable lru if the |
1266 | * page is actually mlocked. | 1266 | * page is actually mlocked. |
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, | |||
1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | 1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) |
1636 | { | 1636 | { |
1637 | struct anon_vma *anon_vma; | 1637 | struct anon_vma *anon_vma; |
1638 | pgoff_t pgoff = page_to_pgoff(page); | 1638 | pgoff_t pgoff; |
1639 | struct anon_vma_chain *avc; | 1639 | struct anon_vma_chain *avc; |
1640 | int ret = SWAP_AGAIN; | 1640 | int ret = SWAP_AGAIN; |
1641 | 1641 | ||
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
1643 | if (!anon_vma) | 1643 | if (!anon_vma) |
1644 | return ret; | 1644 | return ret; |
1645 | 1645 | ||
1646 | pgoff = page_to_pgoff(page); | ||
1646 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1647 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1647 | struct vm_area_struct *vma = avc->vma; | 1648 | struct vm_area_struct *vma = avc->vma; |
1648 | unsigned long address = vma_address(page, vma); | 1649 | unsigned long address = vma_address(page, vma); |
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
1676 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | 1677 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) |
1677 | { | 1678 | { |
1678 | struct address_space *mapping = page->mapping; | 1679 | struct address_space *mapping = page->mapping; |
1679 | pgoff_t pgoff = page_to_pgoff(page); | 1680 | pgoff_t pgoff; |
1680 | struct vm_area_struct *vma; | 1681 | struct vm_area_struct *vma; |
1681 | int ret = SWAP_AGAIN; | 1682 | int ret = SWAP_AGAIN; |
1682 | 1683 | ||
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1684 | * The page lock not only makes sure that page->mapping cannot | 1685 | * The page lock not only makes sure that page->mapping cannot |
1685 | * suddenly be NULLified by truncation, it makes sure that the | 1686 | * suddenly be NULLified by truncation, it makes sure that the |
1686 | * structure at mapping cannot be freed and reused yet, | 1687 | * structure at mapping cannot be freed and reused yet, |
1687 | * so we can safely take mapping->i_mmap_mutex. | 1688 | * so we can safely take mapping->i_mmap_rwsem. |
1688 | */ | 1689 | */ |
1689 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1690 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1690 | 1691 | ||
1691 | if (!mapping) | 1692 | if (!mapping) |
1692 | return ret; | 1693 | return ret; |
1693 | mutex_lock(&mapping->i_mmap_mutex); | 1694 | |
1695 | pgoff = page_to_pgoff(page); | ||
1696 | i_mmap_lock_read(mapping); | ||
1694 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1697 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1695 | unsigned long address = vma_address(page, vma); | 1698 | unsigned long address = vma_address(page, vma); |
1696 | 1699 | ||
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1711 | goto done; | 1714 | goto done; |
1712 | 1715 | ||
1713 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | 1716 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); |
1714 | |||
1715 | done: | 1717 | done: |
1716 | mutex_unlock(&mapping->i_mmap_mutex); | 1718 | i_mmap_unlock_read(mapping); |
1717 | return ret; | 1719 | return ret; |
1718 | } | 1720 | } |
1719 | 1721 | ||
@@ -3015,7 +3015,7 @@ retry: | |||
3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
3016 | nid = zone_to_nid(zone); | 3016 | nid = zone_to_nid(zone); |
3017 | 3017 | ||
3018 | if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && | 3018 | if (cpuset_zone_allowed(zone, flags) && |
3019 | get_node(cache, nid) && | 3019 | get_node(cache, nid) && |
3020 | get_node(cache, nid)->free_objects) { | 3020 | get_node(cache, nid)->free_objects) { |
3021 | obj = ____cache_alloc_node(cache, | 3021 | obj = ____cache_alloc_node(cache, |
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3182 | memset(ptr, 0, cachep->object_size); | 3182 | memset(ptr, 0, cachep->object_size); |
3183 | } | 3183 | } |
3184 | 3184 | ||
3185 | memcg_kmem_put_cache(cachep); | ||
3185 | return ptr; | 3186 | return ptr; |
3186 | } | 3187 | } |
3187 | 3188 | ||
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3247 | memset(objp, 0, cachep->object_size); | 3248 | memset(objp, 0, cachep->object_size); |
3248 | } | 3249 | } |
3249 | 3250 | ||
3251 | memcg_kmem_put_cache(cachep); | ||
3250 | return objp; | 3252 | return objp; |
3251 | } | 3253 | } |
3252 | 3254 | ||
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) | |||
1233 | kmemleak_free(x); | 1233 | kmemleak_free(x); |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | 1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
1237 | gfp_t flags) | ||
1237 | { | 1238 | { |
1238 | flags &= gfp_allowed_mask; | 1239 | flags &= gfp_allowed_mask; |
1239 | lockdep_trace_alloc(flags); | 1240 | lockdep_trace_alloc(flags); |
1240 | might_sleep_if(flags & __GFP_WAIT); | 1241 | might_sleep_if(flags & __GFP_WAIT); |
1241 | 1242 | ||
1242 | return should_failslab(s->object_size, flags, s->flags); | 1243 | if (should_failslab(s->object_size, flags, s->flags)) |
1244 | return NULL; | ||
1245 | |||
1246 | return memcg_kmem_get_cache(s, flags); | ||
1243 | } | 1247 | } |
1244 | 1248 | ||
1245 | static inline void slab_post_alloc_hook(struct kmem_cache *s, | 1249 | static inline void slab_post_alloc_hook(struct kmem_cache *s, |
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
1248 | flags &= gfp_allowed_mask; | 1252 | flags &= gfp_allowed_mask; |
1249 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
1250 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
1255 | memcg_kmem_put_cache(s); | ||
1251 | } | 1256 | } |
1252 | 1257 | ||
1253 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
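The net effect of the two hook changes above is a new pairing rule: a successful slab_pre_alloc_hook() now returns the cache to allocate from (possibly a per-memcg child cache), and every such success is matched by the memcg_kmem_put_cache() that slab_post_alloc_hook() performs. Condensed from the slab_alloc_node() hunk below:

	s = slab_pre_alloc_hook(s, gfpflags);	/* failslab check, then memcg_kmem_get_cache() */
	if (!s)
		return NULL;			/* should_failslab() asked this allocation to fail */
	/* ... normal fast/slow path allocation from s ... */
	slab_post_alloc_hook(s, gfpflags, object);	/* kmemcheck/kmemleak hooks + memcg_kmem_put_cache(s) */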
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1665 | 1670 | ||
1666 | n = get_node(s, zone_to_nid(zone)); | 1671 | n = get_node(s, zone_to_nid(zone)); |
1667 | 1672 | ||
1668 | if (n && cpuset_zone_allowed(zone, | 1673 | if (n && cpuset_zone_allowed(zone, flags) && |
1669 | flags | __GFP_HARDWALL) && | ||
1670 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
1671 | object = get_partial_node(s, n, c, flags); | 1675 | object = get_partial_node(s, n, c, flags); |
1672 | if (object) { | 1676 | if (object) { |
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
2384 | struct page *page; | 2388 | struct page *page; |
2385 | unsigned long tid; | 2389 | unsigned long tid; |
2386 | 2390 | ||
2387 | if (slab_pre_alloc_hook(s, gfpflags)) | 2391 | s = slab_pre_alloc_hook(s, gfpflags); |
2392 | if (!s) | ||
2388 | return NULL; | 2393 | return NULL; |
2389 | |||
2390 | s = memcg_kmem_get_cache(s, gfpflags); | ||
2391 | redo: | 2394 | redo: |
2392 | /* | 2395 | /* |
2393 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is | 2396 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is |
diff --git a/mm/vmacache.c b/mm/vmacache.c index 9f25af825dec..b6e3662fe339 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
17 | { | 17 | { |
18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
19 | 19 | ||
20 | count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); | ||
21 | |||
20 | /* | 22 | /* |
21 | * Single threaded tasks need not iterate the entire | 23 | * Single threaded tasks need not iterate the entire |
22 | * list of process. We can avoid the flushing as well | 24 | * list of process. We can avoid the flushing as well |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8a18196fcdff..39c338896416 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2574 | if (!counters) | 2574 | if (!counters) |
2575 | return; | 2575 | return; |
2576 | 2576 | ||
2577 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2578 | smp_rmb(); | ||
2579 | if (v->flags & VM_UNINITIALIZED) | 2577 | if (v->flags & VM_UNINITIALIZED) |
2580 | return; | 2578 | return; |
2579 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2580 | smp_rmb(); | ||
2581 | 2581 | ||
2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2583 | 2583 | ||
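The vmalloc.c hunk moves the smp_rmb() after the VM_UNINITIALIZED test so the barrier actually orders the flag load against the later reads of the vm_struct fields; before the change it ordered nothing useful. For reference, a sketch of the writer side it pairs with (clear_vm_uninitialized_flag() lives elsewhere in mm/vmalloc.c; reproduced from memory, so treat the exact body as an assumption):

	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
	{
		/* Publish the fully initialized vm_struct fields before
		 * clearing the flag; pairs with the smp_rmb() placed after
		 * the flag test above. */
		smp_wmb();
		vm->flags &= ~VM_UNINITIALIZED;
	}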
diff --git a/mm/vmscan.c b/mm/vmscan.c index a384339bf718..bd9a72bc4a1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
229 | 229 | ||
230 | #define SHRINK_BATCH 128 | 230 | #define SHRINK_BATCH 128 |
231 | 231 | ||
232 | static unsigned long | 232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, |
233 | shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | 233 | struct shrinker *shrinker, |
234 | unsigned long nr_pages_scanned, unsigned long lru_pages) | 234 | unsigned long nr_scanned, |
235 | unsigned long nr_eligible) | ||
235 | { | 236 | { |
236 | unsigned long freed = 0; | 237 | unsigned long freed = 0; |
237 | unsigned long long delta; | 238 | unsigned long long delta; |
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
255 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); | 256 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); |
256 | 257 | ||
257 | total_scan = nr; | 258 | total_scan = nr; |
258 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 259 | delta = (4 * nr_scanned) / shrinker->seeks; |
259 | delta *= freeable; | 260 | delta *= freeable; |
260 | do_div(delta, lru_pages + 1); | 261 | do_div(delta, nr_eligible + 1); |
261 | total_scan += delta; | 262 | total_scan += delta; |
262 | if (total_scan < 0) { | 263 | if (total_scan < 0) { |
263 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", | 264 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
289 | total_scan = freeable * 2; | 290 | total_scan = freeable * 2; |
290 | 291 | ||
291 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, | 292 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, |
292 | nr_pages_scanned, lru_pages, | 293 | nr_scanned, nr_eligible, |
293 | freeable, delta, total_scan); | 294 | freeable, delta, total_scan); |
294 | 295 | ||
295 | /* | 296 | /* |
296 | * Normally, we should not scan less than batch_size objects in one | 297 | * Normally, we should not scan less than batch_size objects in one |
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
339 | return freed; | 340 | return freed; |
340 | } | 341 | } |
341 | 342 | ||
342 | /* | 343 | /** |
343 | * Call the shrink functions to age shrinkable caches | 344 | * shrink_node_slabs - shrink slab caches of a given node |
344 | * | 345 | * @gfp_mask: allocation context |
345 | * Here we assume it costs one seek to replace a lru page and that it also | 346 | * @nid: node whose slab caches to target |
346 | * takes a seek to recreate a cache object. With this in mind we age equal | 347 | * @nr_scanned: pressure numerator |
347 | * percentages of the lru and ageable caches. This should balance the seeks | 348 | * @nr_eligible: pressure denominator |
348 | * generated by these structures. | ||
349 | * | 349 | * |
350 | * If the vm encountered mapped pages on the LRU it increase the pressure on | 350 | * Call the shrink functions to age shrinkable caches. |
351 | * slab to avoid swapping. | ||
352 | * | 351 | * |
353 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | 352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
353 | * unaware shrinkers will receive a node id of 0 instead. | ||
354 | * | 354 | * |
355 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 355 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of |
356 | * are eligible for the caller's allocation attempt. It is used for balancing | 356 | * the available objects should be scanned. Page reclaim for example |
357 | * slab reclaim versus page reclaim. | 357 | * passes the number of pages scanned and the number of pages on the |
358 | * LRU lists that it considered on @nid, plus a bias in @nr_scanned | ||
359 | * when it encountered mapped pages. The ratio is further biased by | ||
360 | * the ->seeks setting of the shrink function, which indicates the | ||
361 | * cost to recreate an object relative to that of an LRU page. | ||
358 | * | 362 | * |
359 | * Returns the number of slab objects which we shrunk. | 363 | * Returns the number of reclaimed slab objects. |
360 | */ | 364 | */ |
361 | unsigned long shrink_slab(struct shrink_control *shrinkctl, | 365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, |
362 | unsigned long nr_pages_scanned, | 366 | unsigned long nr_scanned, |
363 | unsigned long lru_pages) | 367 | unsigned long nr_eligible) |
364 | { | 368 | { |
365 | struct shrinker *shrinker; | 369 | struct shrinker *shrinker; |
366 | unsigned long freed = 0; | 370 | unsigned long freed = 0; |
367 | 371 | ||
368 | if (nr_pages_scanned == 0) | 372 | if (nr_scanned == 0) |
369 | nr_pages_scanned = SWAP_CLUSTER_MAX; | 373 | nr_scanned = SWAP_CLUSTER_MAX; |
370 | 374 | ||
371 | if (!down_read_trylock(&shrinker_rwsem)) { | 375 | if (!down_read_trylock(&shrinker_rwsem)) { |
372 | /* | 376 | /* |
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
380 | } | 384 | } |
381 | 385 | ||
382 | list_for_each_entry(shrinker, &shrinker_list, list) { | 386 | list_for_each_entry(shrinker, &shrinker_list, list) { |
383 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { | 387 | struct shrink_control sc = { |
384 | shrinkctl->nid = 0; | 388 | .gfp_mask = gfp_mask, |
385 | freed += shrink_slab_node(shrinkctl, shrinker, | 389 | .nid = nid, |
386 | nr_pages_scanned, lru_pages); | 390 | }; |
387 | continue; | ||
388 | } | ||
389 | 391 | ||
390 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
391 | if (node_online(shrinkctl->nid)) | 393 | sc.nid = 0; |
392 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
393 | nr_pages_scanned, lru_pages); | ||
394 | 394 | ||
395 | } | 395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); |
396 | } | 396 | } |
397 | |||
397 | up_read(&shrinker_rwsem); | 398 | up_read(&shrinker_rwsem); |
398 | out: | 399 | out: |
399 | cond_resched(); | 400 | cond_resched(); |
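The arithmetic in the shrink_slabs() hunks above boils down to delta = (4 * nr_scanned / seeks) * freeable / (nr_eligible + 1): slab objects are scanned in roughly the same proportion as the eligible LRU pages, biased by the shrinker's ->seeks cost. A standalone sketch with made-up numbers; only DEFAULT_SEEKS == 2 is taken from the kernel, everything else is illustrative:

#include <stdio.h>

int main(void)
{
	unsigned long nr_scanned = 512;		/* LRU pages scanned this cycle */
	unsigned long nr_eligible = 100000;	/* eligible LRU pages on the node */
	unsigned long freeable = 20000;		/* objects reported by ->count_objects() */
	unsigned long seeks = 2;		/* DEFAULT_SEEKS */
	unsigned long long delta;

	delta = (4ULL * nr_scanned) / seeks;	/* scan ratio, scaled by seek cost */
	delta *= freeable;
	delta /= nr_eligible + 1;		/* stands in for do_div() */

	printf("scan %llu of %lu freeable objects for %lu/%lu LRU pages scanned\n",
	       delta, freeable, nr_scanned, nr_eligible);
	return 0;
}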
@@ -1876,7 +1877,8 @@ enum scan_balance { | |||
1876 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | 1877 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan |
1877 | */ | 1878 | */ |
1878 | static void get_scan_count(struct lruvec *lruvec, int swappiness, | 1879 | static void get_scan_count(struct lruvec *lruvec, int swappiness, |
1879 | struct scan_control *sc, unsigned long *nr) | 1880 | struct scan_control *sc, unsigned long *nr, |
1881 | unsigned long *lru_pages) | ||
1880 | { | 1882 | { |
1881 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1883 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1882 | u64 fraction[2]; | 1884 | u64 fraction[2]; |
@@ -2022,6 +2024,7 @@ out: | |||
2022 | some_scanned = false; | 2024 | some_scanned = false; |
2023 | /* Only use force_scan on second pass. */ | 2025 | /* Only use force_scan on second pass. */ |
2024 | for (pass = 0; !some_scanned && pass < 2; pass++) { | 2026 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
2027 | *lru_pages = 0; | ||
2025 | for_each_evictable_lru(lru) { | 2028 | for_each_evictable_lru(lru) { |
2026 | int file = is_file_lru(lru); | 2029 | int file = is_file_lru(lru); |
2027 | unsigned long size; | 2030 | unsigned long size; |
@@ -2048,14 +2051,19 @@ out: | |||
2048 | case SCAN_FILE: | 2051 | case SCAN_FILE: |
2049 | case SCAN_ANON: | 2052 | case SCAN_ANON: |
2050 | /* Scan one type exclusively */ | 2053 | /* Scan one type exclusively */ |
2051 | if ((scan_balance == SCAN_FILE) != file) | 2054 | if ((scan_balance == SCAN_FILE) != file) { |
2055 | size = 0; | ||
2052 | scan = 0; | 2056 | scan = 0; |
2057 | } | ||
2053 | break; | 2058 | break; |
2054 | default: | 2059 | default: |
2055 | /* Look ma, no brain */ | 2060 | /* Look ma, no brain */ |
2056 | BUG(); | 2061 | BUG(); |
2057 | } | 2062 | } |
2063 | |||
2064 | *lru_pages += size; | ||
2058 | nr[lru] = scan; | 2065 | nr[lru] = scan; |
2066 | |||
2059 | /* | 2067 | /* |
2060 | * Skip the second pass and don't force_scan, | 2068 | * Skip the second pass and don't force_scan, |
2061 | * if we found something to scan. | 2069 | * if we found something to scan. |
@@ -2069,7 +2077,7 @@ out: | |||
2069 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2077 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2070 | */ | 2078 | */ |
2071 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | 2079 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, |
2072 | struct scan_control *sc) | 2080 | struct scan_control *sc, unsigned long *lru_pages) |
2073 | { | 2081 | { |
2074 | unsigned long nr[NR_LRU_LISTS]; | 2082 | unsigned long nr[NR_LRU_LISTS]; |
2075 | unsigned long targets[NR_LRU_LISTS]; | 2083 | unsigned long targets[NR_LRU_LISTS]; |
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | |||
2080 | struct blk_plug plug; | 2088 | struct blk_plug plug; |
2081 | bool scan_adjusted; | 2089 | bool scan_adjusted; |
2082 | 2090 | ||
2083 | get_scan_count(lruvec, swappiness, sc, nr); | 2091 | get_scan_count(lruvec, swappiness, sc, nr, lru_pages); |
2084 | 2092 | ||
2085 | /* Record the original scan target for proportional adjustments later */ | 2093 | /* Record the original scan target for proportional adjustments later */ |
2086 | memcpy(targets, nr, sizeof(nr)); | 2094 | memcpy(targets, nr, sizeof(nr)); |
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2258 | } | 2266 | } |
2259 | } | 2267 | } |
2260 | 2268 | ||
2261 | static bool shrink_zone(struct zone *zone, struct scan_control *sc) | 2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
2270 | bool is_classzone) | ||
2262 | { | 2271 | { |
2263 | unsigned long nr_reclaimed, nr_scanned; | 2272 | unsigned long nr_reclaimed, nr_scanned; |
2264 | bool reclaimable = false; | 2273 | bool reclaimable = false; |
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2269 | .zone = zone, | 2278 | .zone = zone, |
2270 | .priority = sc->priority, | 2279 | .priority = sc->priority, |
2271 | }; | 2280 | }; |
2281 | unsigned long zone_lru_pages = 0; | ||
2272 | struct mem_cgroup *memcg; | 2282 | struct mem_cgroup *memcg; |
2273 | 2283 | ||
2274 | nr_reclaimed = sc->nr_reclaimed; | 2284 | nr_reclaimed = sc->nr_reclaimed; |
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2276 | 2286 | ||
2277 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2278 | do { | 2288 | do { |
2289 | unsigned long lru_pages; | ||
2279 | struct lruvec *lruvec; | 2290 | struct lruvec *lruvec; |
2280 | int swappiness; | 2291 | int swappiness; |
2281 | 2292 | ||
2282 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2283 | swappiness = mem_cgroup_swappiness(memcg); | 2294 | swappiness = mem_cgroup_swappiness(memcg); |
2284 | 2295 | ||
2285 | shrink_lruvec(lruvec, swappiness, sc); | 2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
2297 | zone_lru_pages += lru_pages; | ||
2286 | 2298 | ||
2287 | /* | 2299 | /* |
2288 | * Direct reclaim and kswapd have to scan all memory | 2300 | * Direct reclaim and kswapd have to scan all memory |
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2302 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
2303 | } while (memcg); | 2315 | } while (memcg); |
2304 | 2316 | ||
2317 | /* | ||
2318 | * Shrink the slab caches in the same proportion that | ||
2319 | * the eligible LRU pages were scanned. | ||
2320 | */ | ||
2321 | if (global_reclaim(sc) && is_classzone) { | ||
2322 | struct reclaim_state *reclaim_state; | ||
2323 | |||
2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | ||
2325 | sc->nr_scanned - nr_scanned, | ||
2326 | zone_lru_pages); | ||
2327 | |||
2328 | reclaim_state = current->reclaim_state; | ||
2329 | if (reclaim_state) { | ||
2330 | sc->nr_reclaimed += | ||
2331 | reclaim_state->reclaimed_slab; | ||
2332 | reclaim_state->reclaimed_slab = 0; | ||
2333 | } | ||
2334 | } | ||
2335 | |||
2305 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2306 | sc->nr_scanned - nr_scanned, | 2337 | sc->nr_scanned - nr_scanned, |
2307 | sc->nr_reclaimed - nr_reclaimed); | 2338 | sc->nr_reclaimed - nr_reclaimed); |
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2376 | struct zone *zone; | 2407 | struct zone *zone; |
2377 | unsigned long nr_soft_reclaimed; | 2408 | unsigned long nr_soft_reclaimed; |
2378 | unsigned long nr_soft_scanned; | 2409 | unsigned long nr_soft_scanned; |
2379 | unsigned long lru_pages = 0; | ||
2380 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2381 | gfp_t orig_mask; | 2410 | gfp_t orig_mask; |
2382 | struct shrink_control shrink = { | ||
2383 | .gfp_mask = sc->gfp_mask, | ||
2384 | }; | ||
2385 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | 2411 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); |
2386 | bool reclaimable = false; | 2412 | bool reclaimable = false; |
2387 | 2413 | ||
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2394 | if (buffer_heads_over_limit) | 2420 | if (buffer_heads_over_limit) |
2395 | sc->gfp_mask |= __GFP_HIGHMEM; | 2421 | sc->gfp_mask |= __GFP_HIGHMEM; |
2396 | 2422 | ||
2397 | nodes_clear(shrink.nodes_to_scan); | ||
2398 | |||
2399 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2423 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2400 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2424 | requested_highidx, sc->nodemask) { |
2425 | enum zone_type classzone_idx; | ||
2426 | |||
2401 | if (!populated_zone(zone)) | 2427 | if (!populated_zone(zone)) |
2402 | continue; | 2428 | continue; |
2429 | |||
2430 | classzone_idx = requested_highidx; | ||
2431 | while (!populated_zone(zone->zone_pgdat->node_zones + | ||
2432 | classzone_idx)) | ||
2433 | classzone_idx--; | ||
2434 | |||
2403 | /* | 2435 | /* |
2404 | * Take care memory controller reclaiming has small influence | 2436 | * Take care memory controller reclaiming has small influence |
2405 | * to global LRU. | 2437 | * to global LRU. |
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2409 | GFP_KERNEL | __GFP_HARDWALL)) | 2441 | GFP_KERNEL | __GFP_HARDWALL)) |
2410 | continue; | 2442 | continue; |
2411 | 2443 | ||
2412 | lru_pages += zone_reclaimable_pages(zone); | ||
2413 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
2414 | |||
2415 | if (sc->priority != DEF_PRIORITY && | 2444 | if (sc->priority != DEF_PRIORITY && |
2416 | !zone_reclaimable(zone)) | 2445 | !zone_reclaimable(zone)) |
2417 | continue; /* Let kswapd poll it */ | 2446 | continue; /* Let kswapd poll it */ |
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2450 | /* need some check for avoid more shrink_zone() */ | 2479 | /* need some check for avoid more shrink_zone() */ |
2451 | } | 2480 | } |
2452 | 2481 | ||
2453 | if (shrink_zone(zone, sc)) | 2482 | if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) |
2454 | reclaimable = true; | 2483 | reclaimable = true; |
2455 | 2484 | ||
2456 | if (global_reclaim(sc) && | 2485 | if (global_reclaim(sc) && |
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2459 | } | 2488 | } |
2460 | 2489 | ||
2461 | /* | 2490 | /* |
2462 | * Don't shrink slabs when reclaiming memory from over limit cgroups | ||
2463 | * but do shrink slab at least once when aborting reclaim for | ||
2464 | * compaction to avoid unevenly scanning file/anon LRU pages over slab | ||
2465 | * pages. | ||
2466 | */ | ||
2467 | if (global_reclaim(sc)) { | ||
2468 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2469 | if (reclaim_state) { | ||
2470 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2471 | reclaim_state->reclaimed_slab = 0; | ||
2472 | } | ||
2473 | } | ||
2474 | |||
2475 | /* | ||
2476 | * Restore to original mask to avoid the impact on the caller if we | 2491 | * Restore to original mask to avoid the impact on the caller if we |
2477 | * promoted it to __GFP_HIGHMEM. | 2492 | * promoted it to __GFP_HIGHMEM. |
2478 | */ | 2493 | */ |
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2736 | }; | 2751 | }; |
2737 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2752 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2738 | int swappiness = mem_cgroup_swappiness(memcg); | 2753 | int swappiness = mem_cgroup_swappiness(memcg); |
2754 | unsigned long lru_pages; | ||
2739 | 2755 | ||
2740 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2756 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2741 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2757 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2751 | * will pick up pages from other mem cgroup's as well. We hack | 2767 | * will pick up pages from other mem cgroup's as well. We hack |
2752 | * the priority and make it zero. | 2768 | * the priority and make it zero. |
2753 | */ | 2769 | */ |
2754 | shrink_lruvec(lruvec, swappiness, &sc); | 2770 | shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); |
2755 | 2771 | ||
2756 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2772 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2757 | 2773 | ||
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2932 | static bool kswapd_shrink_zone(struct zone *zone, | 2948 | static bool kswapd_shrink_zone(struct zone *zone, |
2933 | int classzone_idx, | 2949 | int classzone_idx, |
2934 | struct scan_control *sc, | 2950 | struct scan_control *sc, |
2935 | unsigned long lru_pages, | ||
2936 | unsigned long *nr_attempted) | 2951 | unsigned long *nr_attempted) |
2937 | { | 2952 | { |
2938 | int testorder = sc->order; | 2953 | int testorder = sc->order; |
2939 | unsigned long balance_gap; | 2954 | unsigned long balance_gap; |
2940 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2941 | struct shrink_control shrink = { | ||
2942 | .gfp_mask = sc->gfp_mask, | ||
2943 | }; | ||
2944 | bool lowmem_pressure; | 2955 | bool lowmem_pressure; |
2945 | 2956 | ||
2946 | /* Reclaim above the high watermark. */ | 2957 | /* Reclaim above the high watermark. */ |
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2975 | balance_gap, classzone_idx)) | 2986 | balance_gap, classzone_idx)) |
2976 | return true; | 2987 | return true; |
2977 | 2988 | ||
2978 | shrink_zone(zone, sc); | 2989 | shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); |
2979 | nodes_clear(shrink.nodes_to_scan); | ||
2980 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
2981 | |||
2982 | reclaim_state->reclaimed_slab = 0; | ||
2983 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2984 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2985 | 2990 | ||
2986 | /* Account for the number of pages attempted to reclaim */ | 2991 | /* Account for the number of pages attempted to reclaim */ |
2987 | *nr_attempted += sc->nr_to_reclaim; | 2992 | *nr_attempted += sc->nr_to_reclaim; |
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3042 | count_vm_event(PAGEOUTRUN); | 3047 | count_vm_event(PAGEOUTRUN); |
3043 | 3048 | ||
3044 | do { | 3049 | do { |
3045 | unsigned long lru_pages = 0; | ||
3046 | unsigned long nr_attempted = 0; | 3050 | unsigned long nr_attempted = 0; |
3047 | bool raise_priority = true; | 3051 | bool raise_priority = true; |
3048 | bool pgdat_needs_compaction = (order > 0); | 3052 | bool pgdat_needs_compaction = (order > 0); |
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3102 | if (!populated_zone(zone)) | 3106 | if (!populated_zone(zone)) |
3103 | continue; | 3107 | continue; |
3104 | 3108 | ||
3105 | lru_pages += zone_reclaimable_pages(zone); | ||
3106 | |||
3107 | /* | 3109 | /* |
3108 | * If any zone is currently balanced then kswapd will | 3110 | * If any zone is currently balanced then kswapd will |
3109 | * not call compaction as it is expected that the | 3111 | * not call compaction as it is expected that the |
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3159 | * that that high watermark would be met at 100% | 3161 | * that that high watermark would be met at 100% |
3160 | * efficiency. | 3162 | * efficiency. |
3161 | */ | 3163 | */ |
3162 | if (kswapd_shrink_zone(zone, end_zone, &sc, | 3164 | if (kswapd_shrink_zone(zone, end_zone, |
3163 | lru_pages, &nr_attempted)) | 3165 | &sc, &nr_attempted)) |
3164 | raise_priority = false; | 3166 | raise_priority = false; |
3165 | } | 3167 | } |
3166 | 3168 | ||
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3612 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3614 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
3613 | .may_swap = 1, | 3615 | .may_swap = 1, |
3614 | }; | 3616 | }; |
3615 | struct shrink_control shrink = { | ||
3616 | .gfp_mask = sc.gfp_mask, | ||
3617 | }; | ||
3618 | unsigned long nr_slab_pages0, nr_slab_pages1; | ||
3619 | 3617 | ||
3620 | cond_resched(); | 3618 | cond_resched(); |
3621 | /* | 3619 | /* |
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3634 | * priorities until we have enough memory freed. | 3632 | * priorities until we have enough memory freed. |
3635 | */ | 3633 | */ |
3636 | do { | 3634 | do { |
3637 | shrink_zone(zone, &sc); | 3635 | shrink_zone(zone, &sc, true); |
3638 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); | 3636 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
3639 | } | 3637 | } |
3640 | 3638 | ||
3641 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
3642 | if (nr_slab_pages0 > zone->min_slab_pages) { | ||
3643 | /* | ||
3644 | * shrink_slab() does not currently allow us to determine how | ||
3645 | * many pages were freed in this zone. So we take the current | ||
3646 | * number of slab pages and shake the slab until it is reduced | ||
3647 | * by the same nr_pages that we used for reclaiming unmapped | ||
3648 | * pages. | ||
3649 | */ | ||
3650 | nodes_clear(shrink.nodes_to_scan); | ||
3651 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
3652 | for (;;) { | ||
3653 | unsigned long lru_pages = zone_reclaimable_pages(zone); | ||
3654 | |||
3655 | /* No reclaimable slab or very low memory pressure */ | ||
3656 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) | ||
3657 | break; | ||
3658 | |||
3659 | /* Freed enough memory */ | ||
3660 | nr_slab_pages1 = zone_page_state(zone, | ||
3661 | NR_SLAB_RECLAIMABLE); | ||
3662 | if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) | ||
3663 | break; | ||
3664 | } | ||
3665 | |||
3666 | /* | ||
3667 | * Update nr_reclaimed by the number of slab pages we | ||
3668 | * reclaimed from this zone. | ||
3669 | */ | ||
3670 | nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
3671 | if (nr_slab_pages1 < nr_slab_pages0) | ||
3672 | sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; | ||
3673 | } | ||
3674 | |||
3675 | p->reclaim_state = NULL; | 3639 | p->reclaim_state = NULL; |
3676 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3640 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
3677 | lockdep_clear_current_reclaim_state(); | 3641 | lockdep_clear_current_reclaim_state(); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc68..1284f89fca08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/compaction.h> | 23 | #include <linux/compaction.h> |
24 | #include <linux/mm_inline.h> | 24 | #include <linux/mm_inline.h> |
25 | #include <linux/page_ext.h> | ||
26 | #include <linux/page_owner.h> | ||
25 | 27 | ||
26 | #include "internal.h" | 28 | #include "internal.h" |
27 | 29 | ||
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = { | |||
898 | #ifdef CONFIG_DEBUG_VM_VMACACHE | 900 | #ifdef CONFIG_DEBUG_VM_VMACACHE |
899 | "vmacache_find_calls", | 901 | "vmacache_find_calls", |
900 | "vmacache_find_hits", | 902 | "vmacache_find_hits", |
903 | "vmacache_full_flushes", | ||
901 | #endif | 904 | #endif |
902 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 905 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
903 | }; | 906 | }; |
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | |||
1017 | return 0; | 1020 | return 0; |
1018 | } | 1021 | } |
1019 | 1022 | ||
1023 | #ifdef CONFIG_PAGE_OWNER | ||
1024 | static void pagetypeinfo_showmixedcount_print(struct seq_file *m, | ||
1025 | pg_data_t *pgdat, | ||
1026 | struct zone *zone) | ||
1027 | { | ||
1028 | struct page *page; | ||
1029 | struct page_ext *page_ext; | ||
1030 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
1031 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
1032 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
1033 | int pageblock_mt, page_mt; | ||
1034 | int i; | ||
1035 | |||
1036 | /* Scan block by block. First and last block may be incomplete */ | ||
1037 | pfn = zone->zone_start_pfn; | ||
1038 | |||
1039 | /* | ||
1040 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
1041 | * a zone boundary, it will be double counted between zones. This does | ||
1042 | * not matter as the mixed block count will still be correct | ||
1043 | */ | ||
1044 | for (; pfn < end_pfn; ) { | ||
1045 | if (!pfn_valid(pfn)) { | ||
1046 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
1047 | continue; | ||
1048 | } | ||
1049 | |||
1050 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
1051 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
1052 | |||
1053 | page = pfn_to_page(pfn); | ||
1054 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
1055 | |||
1056 | for (; pfn < block_end_pfn; pfn++) { | ||
1057 | if (!pfn_valid_within(pfn)) | ||
1058 | continue; | ||
1059 | |||
1060 | page = pfn_to_page(pfn); | ||
1061 | if (PageBuddy(page)) { | ||
1062 | pfn += (1UL << page_order(page)) - 1; | ||
1063 | continue; | ||
1064 | } | ||
1065 | |||
1066 | if (PageReserved(page)) | ||
1067 | continue; | ||
1068 | |||
1069 | page_ext = lookup_page_ext(page); | ||
1070 | |||
1071 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
1072 | continue; | ||
1073 | |||
1074 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
1075 | if (pageblock_mt != page_mt) { | ||
1076 | if (is_migrate_cma(pageblock_mt)) | ||
1077 | count[MIGRATE_MOVABLE]++; | ||
1078 | else | ||
1079 | count[pageblock_mt]++; | ||
1080 | |||
1081 | pfn = block_end_pfn; | ||
1082 | break; | ||
1083 | } | ||
1084 | pfn += (1UL << page_ext->order) - 1; | ||
1085 | } | ||
1086 | } | ||
1087 | |||
1088 | /* Print counts */ | ||
1089 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
1090 | for (i = 0; i < MIGRATE_TYPES; i++) | ||
1091 | seq_printf(m, "%12lu ", count[i]); | ||
1092 | seq_putc(m, '\n'); | ||
1093 | } | ||
1094 | #endif /* CONFIG_PAGE_OWNER */ | ||
1095 | |||
1096 | /* | ||
1097 | * Print out the number of pageblocks for each migratetype that contain pages | ||
1098 | * of other types. This gives an indication of how well fallbacks are being | ||
1099 | * contained by rmqueue_fallback(). It requires information from PAGE_OWNER | ||
1100 | * to determine what is going on | ||
1101 | */ | ||
1102 | static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) | ||
1103 | { | ||
1104 | #ifdef CONFIG_PAGE_OWNER | ||
1105 | int mtype; | ||
1106 | |||
1107 | if (!page_owner_inited) | ||
1108 | return; | ||
1109 | |||
1110 | drain_all_pages(NULL); | ||
1111 | |||
1112 | seq_printf(m, "\n%-23s", "Number of mixed blocks "); | ||
1113 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
1114 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
1115 | seq_putc(m, '\n'); | ||
1116 | |||
1117 | walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); | ||
1118 | #endif /* CONFIG_PAGE_OWNER */ | ||
1119 | } | ||
1120 | |||
1020 | /* | 1121 | /* |
1021 | * This prints out statistics in relation to grouping pages by mobility. | 1122 | * This prints out statistics in relation to grouping pages by mobility. |
1022 | * It is expensive to collect so do not constantly read the file. | 1123 | * It is expensive to collect so do not constantly read the file. |
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
1034 | seq_putc(m, '\n'); | 1135 | seq_putc(m, '\n'); |
1035 | pagetypeinfo_showfree(m, pgdat); | 1136 | pagetypeinfo_showfree(m, pgdat); |
1036 | pagetypeinfo_showblockcount(m, pgdat); | 1137 | pagetypeinfo_showblockcount(m, pgdat); |
1138 | pagetypeinfo_showmixedcount(m, pgdat); | ||
1037 | 1139 | ||
1038 | return 0; | 1140 | return 0; |
1039 | } | 1141 | } |
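pagetypeinfo_showmixedcount_print() above walks each zone in pageblock-sized steps and counts a block as mixed when it holds at least one page whose recorded allocation migratetype differs from the block's own. A toy standalone analogue of that counting loop; array sizes and type values are made up, not kernel structures:

#include <stdio.h>

#define NR_PAGES	32
#define BLOCK_PAGES	8	/* stand-in for pageblock_nr_pages */

int main(void)
{
	int block_type[NR_PAGES / BLOCK_PAGES] = { 0, 0, 1, 1 };	/* declared type per block */
	int page_type[NR_PAGES];					/* type each page was allocated as */
	int mixed = 0;

	for (int i = 0; i < NR_PAGES; i++)
		page_type[i] = block_type[i / BLOCK_PAGES];
	page_type[13] = 1;	/* one page in a type-0 block fell back to type 1 */

	for (int b = 0; b < NR_PAGES / BLOCK_PAGES; b++) {
		for (int i = b * BLOCK_PAGES; i < (b + 1) * BLOCK_PAGES; i++) {
			if (page_type[i] != block_type[b]) {
				mixed++;	/* count the block once and move on */
				break;
			}
		}
	}
	printf("mixed blocks: %d of %d\n", mixed, NR_PAGES / BLOCK_PAGES);
	return 0;
}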
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = { | |||
132 | 132 | ||
133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) |
134 | { | 134 | { |
135 | return zbud_create_pool(gfp, &zbud_zpool_ops); | 135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
136 | } | 136 | } |
137 | 137 | ||
138 | static void zbud_zpool_destroy(void *pool) | 138 | static void zbud_zpool_destroy(void *pool) |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 839a48c3ca27..4d0a063145ec 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -155,8 +155,6 @@ | |||
155 | * (reason above) | 155 | * (reason above) |
156 | */ | 156 | */ |
157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | 157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) |
158 | #define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ | ||
159 | ZS_SIZE_CLASS_DELTA + 1) | ||
160 | 158 | ||
161 | /* | 159 | /* |
162 | * We do not maintain any list for completely empty or full pages | 160 | * We do not maintain any list for completely empty or full pages |
@@ -171,6 +169,11 @@ enum fullness_group { | |||
171 | }; | 169 | }; |
172 | 170 | ||
173 | /* | 171 | /* |
172 | * number of size_classes | ||
173 | */ | ||
174 | static int zs_size_classes; | ||
175 | |||
176 | /* | ||
174 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: | 177 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: |
175 | * n <= N / f, where | 178 | * n <= N / f, where |
176 | * n = number of allocated objects | 179 | * n = number of allocated objects |
@@ -214,7 +217,7 @@ struct link_free { | |||
214 | }; | 217 | }; |
215 | 218 | ||
216 | struct zs_pool { | 219 | struct zs_pool { |
217 | struct size_class size_class[ZS_SIZE_CLASSES]; | 220 | struct size_class **size_class; |
218 | 221 | ||
219 | gfp_t flags; /* allocation flags used when growing pool */ | 222 | gfp_t flags; /* allocation flags used when growing pool */ |
220 | atomic_long_t pages_allocated; | 223 | atomic_long_t pages_allocated; |
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
468 | if (newfg == currfg) | 471 | if (newfg == currfg) |
469 | goto out; | 472 | goto out; |
470 | 473 | ||
471 | class = &pool->size_class[class_idx]; | 474 | class = pool->size_class[class_idx]; |
472 | remove_zspage(page, class, currfg); | 475 | remove_zspage(page, class, currfg); |
473 | insert_zspage(page, class, newfg); | 476 | insert_zspage(page, class, newfg); |
474 | set_zspage_mapping(page, class_idx, newfg); | 477 | set_zspage_mapping(page, class_idx, newfg); |
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
629 | struct page *next_page; | 632 | struct page *next_page; |
630 | struct link_free *link; | 633 | struct link_free *link; |
631 | unsigned int i = 1; | 634 | unsigned int i = 1; |
635 | void *vaddr; | ||
632 | 636 | ||
633 | /* | 637 | /* |
634 | * page->index stores offset of first object starting | 638 | * page->index stores offset of first object starting |
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
639 | if (page != first_page) | 643 | if (page != first_page) |
640 | page->index = off; | 644 | page->index = off; |
641 | 645 | ||
642 | link = (struct link_free *)kmap_atomic(page) + | 646 | vaddr = kmap_atomic(page); |
643 | off / sizeof(*link); | 647 | link = (struct link_free *)vaddr + off / sizeof(*link); |
644 | 648 | ||
645 | while ((off += class->size) < PAGE_SIZE) { | 649 | while ((off += class->size) < PAGE_SIZE) { |
646 | link->next = obj_location_to_handle(page, i++); | 650 | link->next = obj_location_to_handle(page, i++); |
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
654 | */ | 658 | */ |
655 | next_page = get_next_page(page); | 659 | next_page = get_next_page(page); |
656 | link->next = obj_location_to_handle(next_page, 0); | 660 | link->next = obj_location_to_handle(next_page, 0); |
657 | kunmap_atomic(link); | 661 | kunmap_atomic(vaddr); |
658 | page = next_page; | 662 | page = next_page; |
659 | off %= PAGE_SIZE; | 663 | off %= PAGE_SIZE; |
660 | } | 664 | } |
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
784 | */ | 788 | */ |
785 | if (area->vm_buf) | 789 | if (area->vm_buf) |
786 | return 0; | 790 | return 0; |
787 | area->vm_buf = (char *)__get_free_page(GFP_KERNEL); | 791 | area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); |
788 | if (!area->vm_buf) | 792 | if (!area->vm_buf) |
789 | return -ENOMEM; | 793 | return -ENOMEM; |
790 | return 0; | 794 | return 0; |
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
792 | 796 | ||
793 | static inline void __zs_cpu_down(struct mapping_area *area) | 797 | static inline void __zs_cpu_down(struct mapping_area *area) |
794 | { | 798 | { |
795 | if (area->vm_buf) | 799 | kfree(area->vm_buf); |
796 | free_page((unsigned long)area->vm_buf); | ||
797 | area->vm_buf = NULL; | 800 | area->vm_buf = NULL; |
798 | } | 801 | } |
799 | 802 | ||
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = { | |||
881 | .notifier_call = zs_cpu_notifier | 884 | .notifier_call = zs_cpu_notifier |
882 | }; | 885 | }; |
883 | 886 | ||
884 | static void zs_exit(void) | 887 | static void zs_unregister_cpu_notifier(void) |
885 | { | 888 | { |
886 | int cpu; | 889 | int cpu; |
887 | 890 | ||
888 | #ifdef CONFIG_ZPOOL | ||
889 | zpool_unregister_driver(&zs_zpool_driver); | ||
890 | #endif | ||
891 | |||
892 | cpu_notifier_register_begin(); | 891 | cpu_notifier_register_begin(); |
893 | 892 | ||
894 | for_each_online_cpu(cpu) | 893 | for_each_online_cpu(cpu) |
@@ -898,31 +897,74 @@ static void zs_exit(void) | |||
898 | cpu_notifier_register_done(); | 897 | cpu_notifier_register_done(); |
899 | } | 898 | } |
900 | 899 | ||
901 | static int zs_init(void) | 900 | static int zs_register_cpu_notifier(void) |
902 | { | 901 | { |
903 | int cpu, ret; | 902 | int cpu, uninitialized_var(ret); |
904 | 903 | ||
905 | cpu_notifier_register_begin(); | 904 | cpu_notifier_register_begin(); |
906 | 905 | ||
907 | __register_cpu_notifier(&zs_cpu_nb); | 906 | __register_cpu_notifier(&zs_cpu_nb); |
908 | for_each_online_cpu(cpu) { | 907 | for_each_online_cpu(cpu) { |
909 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 908 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
910 | if (notifier_to_errno(ret)) { | 909 | if (notifier_to_errno(ret)) |
911 | cpu_notifier_register_done(); | 910 | break; |
912 | goto fail; | ||
913 | } | ||
914 | } | 911 | } |
915 | 912 | ||
916 | cpu_notifier_register_done(); | 913 | cpu_notifier_register_done(); |
914 | return notifier_to_errno(ret); | ||
915 | } | ||
916 | |||
917 | static void init_zs_size_classes(void) | ||
918 | { | ||
919 | int nr; | ||
917 | 920 | ||
921 | nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; | ||
922 | if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) | ||
923 | nr += 1; | ||
924 | |||
925 | zs_size_classes = nr; | ||
926 | } | ||
927 | |||
928 | static void __exit zs_exit(void) | ||
929 | { | ||
918 | #ifdef CONFIG_ZPOOL | 930 | #ifdef CONFIG_ZPOOL |
919 | zpool_register_driver(&zs_zpool_driver); | 931 | zpool_unregister_driver(&zs_zpool_driver); |
920 | #endif | 932 | #endif |
933 | zs_unregister_cpu_notifier(); | ||
934 | } | ||
921 | 935 | ||
936 | static int __init zs_init(void) | ||
937 | { | ||
938 | int ret = zs_register_cpu_notifier(); | ||
939 | |||
940 | if (ret) { | ||
941 | zs_unregister_cpu_notifier(); | ||
942 | return ret; | ||
943 | } | ||
944 | |||
945 | init_zs_size_classes(); | ||
946 | |||
947 | #ifdef CONFIG_ZPOOL | ||
948 | zpool_register_driver(&zs_zpool_driver); | ||
949 | #endif | ||
922 | return 0; | 950 | return 0; |
923 | fail: | 951 | } |
924 | zs_exit(); | 952 | |
925 | return notifier_to_errno(ret); | 953 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) |
954 | { | ||
955 | return pages_per_zspage * PAGE_SIZE / size; | ||
956 | } | ||
957 | |||
958 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | ||
959 | { | ||
960 | if (prev->pages_per_zspage != pages_per_zspage) | ||
961 | return false; | ||
962 | |||
963 | if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) | ||
964 | != get_maxobj_per_zspage(size, pages_per_zspage)) | ||
965 | return false; | ||
966 | |||
967 | return true; | ||
926 | } | 968 | } |
927 | 969 | ||
928 | /** | 970 | /** |
@@ -937,33 +979,71 @@ fail: | |||
937 | */ | 979 | */ |
938 | struct zs_pool *zs_create_pool(gfp_t flags) | 980 | struct zs_pool *zs_create_pool(gfp_t flags) |
939 | { | 981 | { |
940 | int i, ovhd_size; | 982 | int i; |
941 | struct zs_pool *pool; | 983 | struct zs_pool *pool; |
984 | struct size_class *prev_class = NULL; | ||
942 | 985 | ||
943 | ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); | 986 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); |
944 | pool = kzalloc(ovhd_size, GFP_KERNEL); | ||
945 | if (!pool) | 987 | if (!pool) |
946 | return NULL; | 988 | return NULL; |
947 | 989 | ||
948 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 990 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
991 | GFP_KERNEL); | ||
992 | if (!pool->size_class) { | ||
993 | kfree(pool); | ||
994 | return NULL; | ||
995 | } | ||
996 | |||
997 | /* | ||
998 | * Iterate reversely, because the size of the size_class that we want | ||
999 | * to use for merging should be larger than or equal to the current size. | ||
1000 | */ | ||
1001 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
949 | int size; | 1002 | int size; |
1003 | int pages_per_zspage; | ||
950 | struct size_class *class; | 1004 | struct size_class *class; |
951 | 1005 | ||
952 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | 1006 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; |
953 | if (size > ZS_MAX_ALLOC_SIZE) | 1007 | if (size > ZS_MAX_ALLOC_SIZE) |
954 | size = ZS_MAX_ALLOC_SIZE; | 1008 | size = ZS_MAX_ALLOC_SIZE; |
1009 | pages_per_zspage = get_pages_per_zspage(size); | ||
1010 | |||
1011 | /* | ||
1012 | * size_class is used for normal zsmalloc operation such | ||
1013 | * as alloc/free for that size. Although it is natural that we | ||
1014 | * have one size_class for each size, there is a chance that we | ||
1015 | * can get more memory utilization if we use one size_class for | ||
1016 | * many different sizes whose size_classes have the same | ||
1017 | * characteristics. So, we make size_class point to the | ||
1018 | * previous size_class if possible. | ||
1019 | */ | ||
1020 | if (prev_class) { | ||
1021 | if (can_merge(prev_class, size, pages_per_zspage)) { | ||
1022 | pool->size_class[i] = prev_class; | ||
1023 | continue; | ||
1024 | } | ||
1025 | } | ||
1026 | |||
1027 | class = kzalloc(sizeof(struct size_class), GFP_KERNEL); | ||
1028 | if (!class) | ||
1029 | goto err; | ||
955 | 1030 | ||
956 | class = &pool->size_class[i]; | ||
957 | class->size = size; | 1031 | class->size = size; |
958 | class->index = i; | 1032 | class->index = i; |
1033 | class->pages_per_zspage = pages_per_zspage; | ||
959 | spin_lock_init(&class->lock); | 1034 | spin_lock_init(&class->lock); |
960 | class->pages_per_zspage = get_pages_per_zspage(size); | 1035 | pool->size_class[i] = class; |
961 | 1036 | ||
1037 | prev_class = class; | ||
962 | } | 1038 | } |
963 | 1039 | ||
964 | pool->flags = flags; | 1040 | pool->flags = flags; |
965 | 1041 | ||
966 | return pool; | 1042 | return pool; |
1043 | |||
1044 | err: | ||
1045 | zs_destroy_pool(pool); | ||
1046 | return NULL; | ||
967 | } | 1047 | } |
968 | EXPORT_SYMBOL_GPL(zs_create_pool); | 1048 | EXPORT_SYMBOL_GPL(zs_create_pool); |
969 | 1049 | ||
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
971 | { | 1051 | { |
972 | int i; | 1052 | int i; |
973 | 1053 | ||
974 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 1054 | for (i = 0; i < zs_size_classes; i++) { |
975 | int fg; | 1055 | int fg; |
976 | struct size_class *class = &pool->size_class[i]; | 1056 | struct size_class *class = pool->size_class[i]; |
1057 | |||
1058 | if (!class) | ||
1059 | continue; | ||
1060 | |||
1061 | if (class->index != i) | ||
1062 | continue; | ||
977 | 1063 | ||
978 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | 1064 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { |
979 | if (class->fullness_list[fg]) { | 1065 | if (class->fullness_list[fg]) { |
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
981 | class->size, fg); | 1067 | class->size, fg); |
982 | } | 1068 | } |
983 | } | 1069 | } |
1070 | kfree(class); | ||
984 | } | 1071 | } |
1072 | |||
1073 | kfree(pool->size_class); | ||
985 | kfree(pool); | 1074 | kfree(pool); |
986 | } | 1075 | } |
987 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1076 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
999 | { | 1088 | { |
1000 | unsigned long obj; | 1089 | unsigned long obj; |
1001 | struct link_free *link; | 1090 | struct link_free *link; |
1002 | int class_idx; | ||
1003 | struct size_class *class; | 1091 | struct size_class *class; |
1092 | void *vaddr; | ||
1004 | 1093 | ||
1005 | struct page *first_page, *m_page; | 1094 | struct page *first_page, *m_page; |
1006 | unsigned long m_objidx, m_offset; | 1095 | unsigned long m_objidx, m_offset; |
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1008 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1097 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
1009 | return 0; | 1098 | return 0; |
1010 | 1099 | ||
1011 | class_idx = get_size_class_index(size); | 1100 | class = pool->size_class[get_size_class_index(size)]; |
1012 | class = &pool->size_class[class_idx]; | ||
1013 | BUG_ON(class_idx != class->index); | ||
1014 | 1101 | ||
1015 | spin_lock(&class->lock); | 1102 | spin_lock(&class->lock); |
1016 | first_page = find_get_zspage(class); | 1103 | first_page = find_get_zspage(class); |
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1031 | obj_handle_to_location(obj, &m_page, &m_objidx); | 1118 | obj_handle_to_location(obj, &m_page, &m_objidx); |
1032 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | 1119 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); |
1033 | 1120 | ||
1034 | link = (struct link_free *)kmap_atomic(m_page) + | 1121 | vaddr = kmap_atomic(m_page); |
1035 | m_offset / sizeof(*link); | 1122 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); |
1036 | first_page->freelist = link->next; | 1123 | first_page->freelist = link->next; |
1037 | memset(link, POISON_INUSE, sizeof(*link)); | 1124 | memset(link, POISON_INUSE, sizeof(*link)); |
1038 | kunmap_atomic(link); | 1125 | kunmap_atomic(vaddr); |
1039 | 1126 | ||
1040 | first_page->inuse++; | 1127 | first_page->inuse++; |
1041 | /* Now move the zspage to another fullness group, if required */ | 1128 | /* Now move the zspage to another fullness group, if required */ |
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1051 | struct link_free *link; | 1138 | struct link_free *link; |
1052 | struct page *first_page, *f_page; | 1139 | struct page *first_page, *f_page; |
1053 | unsigned long f_objidx, f_offset; | 1140 | unsigned long f_objidx, f_offset; |
1141 | void *vaddr; | ||
1054 | 1142 | ||
1055 | int class_idx; | 1143 | int class_idx; |
1056 | struct size_class *class; | 1144 | struct size_class *class; |
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1063 | first_page = get_first_page(f_page); | 1151 | first_page = get_first_page(f_page); |
1064 | 1152 | ||
1065 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1153 | get_zspage_mapping(first_page, &class_idx, &fullness); |
1066 | class = &pool->size_class[class_idx]; | 1154 | class = pool->size_class[class_idx]; |
1067 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1155 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
1068 | 1156 | ||
1069 | spin_lock(&class->lock); | 1157 | spin_lock(&class->lock); |
1070 | 1158 | ||
1071 | /* Insert this object in containing zspage's freelist */ | 1159 | /* Insert this object in containing zspage's freelist */ |
1072 | link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) | 1160 | vaddr = kmap_atomic(f_page); |
1073 | + f_offset); | 1161 | link = (struct link_free *)(vaddr + f_offset); |
1074 | link->next = first_page->freelist; | 1162 | link->next = first_page->freelist; |
1075 | kunmap_atomic(link); | 1163 | kunmap_atomic(vaddr); |
1076 | first_page->freelist = (void *)obj; | 1164 | first_page->freelist = (void *)obj; |
1077 | 1165 | ||
1078 | first_page->inuse--; | 1166 | first_page->inuse--; |
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1124 | 1212 | ||
1125 | obj_handle_to_location(handle, &page, &obj_idx); | 1213 | obj_handle_to_location(handle, &page, &obj_idx); |
1126 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1214 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1127 | class = &pool->size_class[class_idx]; | 1215 | class = pool->size_class[class_idx]; |
1128 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1216 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1129 | 1217 | ||
1130 | area = &get_cpu_var(zs_map_area); | 1218 | area = &get_cpu_var(zs_map_area); |
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1158 | 1246 | ||
1159 | obj_handle_to_location(handle, &page, &obj_idx); | 1247 | obj_handle_to_location(handle, &page, &obj_idx); |
1160 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1248 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1161 | class = &pool->size_class[class_idx]; | 1249 | class = pool->size_class[class_idx]; |
1162 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1250 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1163 | 1251 | ||
1164 | area = this_cpu_ptr(&zs_map_area); | 1252 | area = this_cpu_ptr(&zs_map_area); |
diff --git a/mm/zswap.c b/mm/zswap.c index c1543061a192..0cfce9bc51e4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void) | |||
149 | return 0; | 149 | return 0; |
150 | } | 150 | } |
151 | 151 | ||
152 | static void zswap_comp_exit(void) | 152 | static void __init zswap_comp_exit(void) |
153 | { | 153 | { |
154 | /* free percpu transforms */ | 154 | /* free percpu transforms */ |
155 | if (zswap_comp_pcpu_tfms) | 155 | free_percpu(zswap_comp_pcpu_tfms); |
156 | free_percpu(zswap_comp_pcpu_tfms); | ||
157 | } | 156 | } |
158 | 157 | ||
159 | /********************************* | 158 | /********************************* |
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | |||
206 | **********************************/ | 205 | **********************************/ |
207 | static struct kmem_cache *zswap_entry_cache; | 206 | static struct kmem_cache *zswap_entry_cache; |
208 | 207 | ||
209 | static int zswap_entry_cache_create(void) | 208 | static int __init zswap_entry_cache_create(void) |
210 | { | 209 | { |
211 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | 210 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); |
212 | return zswap_entry_cache == NULL; | 211 | return zswap_entry_cache == NULL; |
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = { | |||
389 | .notifier_call = zswap_cpu_notifier | 388 | .notifier_call = zswap_cpu_notifier |
390 | }; | 389 | }; |
391 | 390 | ||
392 | static int zswap_cpu_init(void) | 391 | static int __init zswap_cpu_init(void) |
393 | { | 392 | { |
394 | unsigned long cpu; | 393 | unsigned long cpu; |
395 | 394 | ||
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 45f145c6f843..c14893b501a9 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile | |||
@@ -15,6 +15,7 @@ TARGETS += user | |||
15 | TARGETS += sysctl | 15 | TARGETS += sysctl |
16 | TARGETS += firmware | 16 | TARGETS += firmware |
17 | TARGETS += ftrace | 17 | TARGETS += ftrace |
18 | TARGETS += exec | ||
18 | 19 | ||
19 | TARGETS_HOTPLUG = cpu-hotplug | 20 | TARGETS_HOTPLUG = cpu-hotplug |
20 | TARGETS_HOTPLUG += memory-hotplug | 21 | TARGETS_HOTPLUG += memory-hotplug |
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore new file mode 100644 index 000000000000..64073e050c6a --- /dev/null +++ b/tools/testing/selftests/exec/.gitignore | |||
@@ -0,0 +1,9 @@ | |||
1 | subdir* | ||
2 | script* | ||
3 | execveat | ||
4 | execveat.symlink | ||
5 | execveat.moved | ||
6 | execveat.path.ephemeral | ||
7 | execveat.ephemeral | ||
8 | execveat.denatured | ||
9 | xxxxxxxx* \ No newline at end of file | ||
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile new file mode 100644 index 000000000000..66dfc2ce1788 --- /dev/null +++ b/tools/testing/selftests/exec/Makefile | |||
@@ -0,0 +1,25 @@ | |||
1 | CC = $(CROSS_COMPILE)gcc | ||
2 | CFLAGS = -Wall | ||
3 | BINARIES = execveat | ||
4 | DEPS = execveat.symlink execveat.denatured script subdir | ||
5 | all: $(BINARIES) $(DEPS) | ||
6 | |||
7 | subdir: | ||
8 | mkdir -p $@ | ||
9 | script: | ||
10 | echo '#!/bin/sh' > $@ | ||
11 | echo 'exit $$*' >> $@ | ||
12 | chmod +x $@ | ||
13 | execveat.symlink: execveat | ||
14 | ln -s -f $< $@ | ||
15 | execveat.denatured: execveat | ||
16 | cp $< $@ | ||
17 | chmod -x $@ | ||
18 | %: %.c | ||
19 | $(CC) $(CFLAGS) -o $@ $^ | ||
20 | |||
21 | run_tests: all | ||
22 | ./execveat | ||
23 | |||
24 | clean: | ||
25 | rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx* | ||
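Ahead of the new selftest below, a minimal standalone sketch of the execveat(2) call it exercises: executing an already-open file descriptor with AT_EMPTY_PATH through raw syscall(2), since glibc provided no wrapper at the time. The choice of /bin/ls is purely illustrative:

/* Execute an open fd with execveat(fd, "", argv, envp, AT_EMPTY_PATH). */
#define _GNU_SOURCE		/* for AT_EMPTY_PATH */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "ls", "-l", "/", NULL };
	char *envp[] = { NULL };
	int fd = open("/bin/ls", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
#ifdef __NR_execveat
	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
#endif
	perror("execveat");	/* reached only if the call failed or is unavailable */
	return 1;
}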
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c new file mode 100644 index 000000000000..33a5c06d95ca --- /dev/null +++ b/tools/testing/selftests/exec/execveat.c | |||
@@ -0,0 +1,397 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Google, Inc. | ||
3 | * | ||
4 | * Licensed under the terms of the GNU GPL License version 2 | ||
5 | * | ||
6 | * Selftests for execveat(2). | ||
7 | */ | ||
8 | |||
9 | #define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ | ||
10 | #include <sys/sendfile.h> | ||
11 | #include <sys/stat.h> | ||
12 | #include <sys/syscall.h> | ||
13 | #include <sys/types.h> | ||
14 | #include <sys/wait.h> | ||
15 | #include <errno.h> | ||
16 | #include <fcntl.h> | ||
17 | #include <limits.h> | ||
18 | #include <stdio.h> | ||
19 | #include <stdlib.h> | ||
20 | #include <string.h> | ||
21 | #include <unistd.h> | ||
22 | |||
23 | static char longpath[2 * PATH_MAX] = ""; | ||
24 | static char *envp[] = { "IN_TEST=yes", NULL, NULL }; | ||
25 | static char *argv[] = { "execveat", "99", NULL }; | ||
26 | |||
27 | static int execveat_(int fd, const char *path, char **argv, char **envp, | ||
28 | int flags) | ||
29 | { | ||
30 | #ifdef __NR_execveat | ||
31 | return syscall(__NR_execveat, fd, path, argv, envp, flags); | ||
32 | #else | ||
33 | errno = ENOSYS; | ||
34 | return -1; | ||
35 | #endif | ||
36 | } | ||
37 | |||
38 | #define check_execveat_fail(fd, path, flags, errno) \ | ||
39 | _check_execveat_fail(fd, path, flags, errno, #errno) | ||
40 | static int _check_execveat_fail(int fd, const char *path, int flags, | ||
41 | int expected_errno, const char *errno_str) | ||
42 | { | ||
43 | int rc; | ||
44 | |||
45 | errno = 0; | ||
46 | printf("Check failure of execveat(%d, '%s', %d) with %s... ", | ||
47 | fd, path?:"(null)", flags, errno_str); | ||
48 | rc = execveat_(fd, path, argv, envp, flags); | ||
49 | |||
50 | if (rc > 0) { | ||
51 | printf("[FAIL] (unexpected success from execveat(2))\n"); | ||
52 | return 1; | ||
53 | } | ||
54 | if (errno != expected_errno) { | ||
55 | printf("[FAIL] (expected errno %d (%s) not %d (%s)\n", | ||
56 | expected_errno, strerror(expected_errno), | ||
57 | errno, strerror(errno)); | ||
58 | return 1; | ||
59 | } | ||
60 | printf("[OK]\n"); | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static int check_execveat_invoked_rc(int fd, const char *path, int flags, | ||
65 | int expected_rc) | ||
66 | { | ||
67 | int status; | ||
68 | int rc; | ||
69 | pid_t child; | ||
70 | int pathlen = path ? strlen(path) : 0; | ||
71 | |||
72 | if (pathlen > 40) | ||
73 | printf("Check success of execveat(%d, '%.20s...%s', %d)... ", | ||
74 | fd, path, (path + pathlen - 20), flags); | ||
75 | else | ||
76 | printf("Check success of execveat(%d, '%s', %d)... ", | ||
77 | fd, path?:"(null)", flags); | ||
78 | child = fork(); | ||
79 | if (child < 0) { | ||
80 | printf("[FAIL] (fork() failed)\n"); | ||
81 | return 1; | ||
82 | } | ||
83 | if (child == 0) { | ||
84 | /* Child: do execveat(). */ | ||
85 | rc = execveat_(fd, path, argv, envp, flags); | ||
86 | printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n", | ||
87 | rc, errno, strerror(errno)); | ||
88 | exit(1); /* should not reach here */ | ||
89 | } | ||
90 | /* Parent: wait for & check child's exit status. */ | ||
91 | rc = waitpid(child, &status, 0); | ||
92 | if (rc != child) { | ||
93 | printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc); | ||
94 | return 1; | ||
95 | } | ||
96 | if (!WIFEXITED(status)) { | ||
97 | printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n", | ||
98 | child, status); | ||
99 | return 1; | ||
100 | } | ||
101 | if (WEXITSTATUS(status) != expected_rc) { | ||
102 | printf("[FAIL] (child %d exited with %d not %d)\n", | ||
103 | child, WEXITSTATUS(status), expected_rc); | ||
104 | return 1; | ||
105 | } | ||
106 | printf("[OK]\n"); | ||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static int check_execveat(int fd, const char *path, int flags) | ||
111 | { | ||
112 | return check_execveat_invoked_rc(fd, path, flags, 99); | ||
113 | } | ||
114 | |||
115 | static char *concat(const char *left, const char *right) | ||
116 | { | ||
117 | char *result = malloc(strlen(left) + strlen(right) + 1); | ||
118 | |||
119 | strcpy(result, left); | ||
120 | strcat(result, right); | ||
121 | return result; | ||
122 | } | ||
123 | |||
124 | static int open_or_die(const char *filename, int flags) | ||
125 | { | ||
126 | int fd = open(filename, flags); | ||
127 | |||
128 | if (fd < 0) { | ||
129 | printf("Failed to open '%s'; " | ||
130 | "check prerequisites are available\n", filename); | ||
131 | exit(1); | ||
132 | } | ||
133 | return fd; | ||
134 | } | ||
135 | |||
136 | static void exe_cp(const char *src, const char *dest) | ||
137 | { | ||
138 | int in_fd = open_or_die(src, O_RDONLY); | ||
139 | int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755); | ||
140 | struct stat info; | ||
141 | |||
142 | fstat(in_fd, &info); | ||
143 | sendfile(out_fd, in_fd, NULL, info.st_size); | ||
144 | close(in_fd); | ||
145 | close(out_fd); | ||
146 | } | ||
147 | |||
148 | #define XX_DIR_LEN 200 | ||
149 | static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script) | ||
150 | { | ||
151 | int fail = 0; | ||
152 | int ii, count, len; | ||
153 | char longname[XX_DIR_LEN + 1]; | ||
154 | int fd; | ||
155 | |||
156 | if (*longpath == '\0') { | ||
157 | /* Create a filename close to PATH_MAX in length */ | ||
158 | memset(longname, 'x', XX_DIR_LEN - 1); | ||
159 | longname[XX_DIR_LEN - 1] = '/'; | ||
160 | longname[XX_DIR_LEN] = '\0'; | ||
161 | count = (PATH_MAX - 3) / XX_DIR_LEN; | ||
162 | for (ii = 0; ii < count; ii++) { | ||
163 | strcat(longpath, longname); | ||
164 | mkdir(longpath, 0755); | ||
165 | } | ||
166 | len = (PATH_MAX - 3) - (count * XX_DIR_LEN); | ||
167 | if (len <= 0) | ||
168 | len = 1; | ||
169 | memset(longname, 'y', len); | ||
170 | longname[len] = '\0'; | ||
171 | strcat(longpath, longname); | ||
172 | } | ||
173 | exe_cp(src, longpath); | ||
174 | |||
175 | /* | ||
176 | * Execute as a pre-opened file descriptor, which works whether this is | ||
177 | * a script or not (because the interpreter sees a filename like | ||
178 | * "/dev/fd/20"). | ||
179 | */ | ||
180 | fd = open(longpath, O_RDONLY); | ||
181 | if (fd > 0) { | ||
182 | printf("Invoke copy of '%s' via filename of length %lu:\n", | ||
183 | src, strlen(longpath)); | ||
184 | fail += check_execveat(fd, "", AT_EMPTY_PATH); | ||
185 | } else { | ||
186 | printf("Failed to open length %lu filename, errno=%d (%s)\n", | ||
187 | strlen(longpath), errno, strerror(errno)); | ||
188 | fail++; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Execute as a long pathname relative to ".". If this is a script, | ||
193 | * the interpreter will launch but fail to open the script because its | ||
194 | * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX. | ||
195 | */ | ||
196 | if (is_script) | ||
197 | fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127); | ||
198 | else | ||
199 | fail += check_execveat(dot_dfd, longpath, 0); | ||
200 | |||
201 | return fail; | ||
202 | } | ||
203 | |||
204 | static int run_tests(void) | ||
205 | { | ||
206 | int fail = 0; | ||
207 | char *fullname = realpath("execveat", NULL); | ||
208 | char *fullname_script = realpath("script", NULL); | ||
209 | char *fullname_symlink = concat(fullname, ".symlink"); | ||
210 | int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY); | ||
211 | int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral", | ||
212 | O_DIRECTORY|O_RDONLY); | ||
213 | int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY); | ||
214 | int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH); | ||
215 | int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC); | ||
216 | int fd = open_or_die("execveat", O_RDONLY); | ||
217 | int fd_path = open_or_die("execveat", O_RDONLY|O_PATH); | ||
218 | int fd_symlink = open_or_die("execveat.symlink", O_RDONLY); | ||
219 | int fd_denatured = open_or_die("execveat.denatured", O_RDONLY); | ||
220 | int fd_denatured_path = open_or_die("execveat.denatured", | ||
221 | O_RDONLY|O_PATH); | ||
222 | int fd_script = open_or_die("script", O_RDONLY); | ||
223 | int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY); | ||
224 | int fd_ephemeral_path = open_or_die("execveat.path.ephemeral", | ||
225 | O_RDONLY|O_PATH); | ||
226 | int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY); | ||
227 | int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC); | ||
228 | int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC); | ||
229 | |||
230 | /* Change file position to confirm it doesn't affect anything */ | ||
231 | lseek(fd, 10, SEEK_SET); | ||
232 | |||
233 | /* Normal executable file: */ | ||
234 | /* dfd + path */ | ||
235 | fail += check_execveat(subdir_dfd, "../execveat", 0); | ||
236 | fail += check_execveat(dot_dfd, "execveat", 0); | ||
237 | fail += check_execveat(dot_dfd_path, "execveat", 0); | ||
238 | /* absolute path */ | ||
239 | fail += check_execveat(AT_FDCWD, fullname, 0); | ||
240 | /* absolute path with nonsense dfd */ | ||
241 | fail += check_execveat(99, fullname, 0); | ||
242 | /* fd + no path */ | ||
243 | fail += check_execveat(fd, "", AT_EMPTY_PATH); | ||
244 | /* O_CLOEXEC fd + no path */ | ||
245 | fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH); | ||
246 | /* O_PATH fd */ | ||
247 | fail += check_execveat(fd_path, "", AT_EMPTY_PATH); | ||
248 | |||
249 | /* Mess with executable file that's already open: */ | ||
250 | /* fd + no path to a file that's been renamed */ | ||
251 | rename("execveat.ephemeral", "execveat.moved"); | ||
252 | fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); | ||
253 | /* fd + no path to a file that's been deleted */ | ||
254 | unlink("execveat.moved"); /* remove the file now fd open */ | ||
255 | fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); | ||
256 | |||
257 | /* Mess with executable file that's already open with O_PATH */ | ||
258 | /* fd + no path to a file that's been deleted */ | ||
259 | unlink("execveat.path.ephemeral"); | ||
260 | fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH); | ||
261 | |||
262 | /* Invalid argument failures */ | ||
263 | fail += check_execveat_fail(fd, "", 0, ENOENT); | ||
264 | fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT); | ||
265 | |||
266 | /* Symlink to executable file: */ | ||
267 | /* dfd + path */ | ||
268 | fail += check_execveat(dot_dfd, "execveat.symlink", 0); | ||
269 | fail += check_execveat(dot_dfd_path, "execveat.symlink", 0); | ||
270 | /* absolute path */ | ||
271 | fail += check_execveat(AT_FDCWD, fullname_symlink, 0); | ||
272 | /* fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */ | ||
273 | fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH); | ||
274 | fail += check_execveat(fd_symlink, "", | ||
275 | AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); | ||
276 | |||
277 | /* Symlink fails when AT_SYMLINK_NOFOLLOW set: */ | ||
278 | /* dfd + path */ | ||
279 | fail += check_execveat_fail(dot_dfd, "execveat.symlink", | ||
280 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
281 | fail += check_execveat_fail(dot_dfd_path, "execveat.symlink", | ||
282 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
283 | /* absolute path */ | ||
284 | fail += check_execveat_fail(AT_FDCWD, fullname_symlink, | ||
285 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
286 | |||
287 | /* Shell script wrapping executable file: */ | ||
288 | /* dfd + path */ | ||
289 | fail += check_execveat(subdir_dfd, "../script", 0); | ||
290 | fail += check_execveat(dot_dfd, "script", 0); | ||
291 | fail += check_execveat(dot_dfd_path, "script", 0); | ||
292 | /* absolute path */ | ||
293 | fail += check_execveat(AT_FDCWD, fullname_script, 0); | ||
294 | /* fd + no path */ | ||
295 | fail += check_execveat(fd_script, "", AT_EMPTY_PATH); | ||
296 | fail += check_execveat(fd_script, "", | ||
297 | AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); | ||
298 | /* O_CLOEXEC fd fails for a script (as script file inaccessible) */ | ||
299 | fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH, | ||
300 | ENOENT); | ||
301 | fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT); | ||
302 | |||
303 | /* Mess with script file that's already open: */ | ||
304 | /* fd + no path to a file that's been renamed */ | ||
305 | rename("script.ephemeral", "script.moved"); | ||
306 | fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); | ||
307 | /* fd + no path to a file that's been deleted */ | ||
308 | unlink("script.moved"); /* remove the file while fd open */ | ||
309 | fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); | ||
310 | |||
311 | /* Rename a subdirectory in the path: */ | ||
312 | rename("subdir.ephemeral", "subdir.moved"); | ||
313 | fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); | ||
314 | fail += check_execveat(subdir_dfd_ephemeral, "script", 0); | ||
315 | /* Remove the subdir and its contents */ | ||
316 | unlink("subdir.moved/script"); | ||
317 | unlink("subdir.moved"); | ||
318 | /* Shell loads via deleted subdir OK because name starts with .. */ | ||
319 | fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); | ||
320 | fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT); | ||
321 | |||
322 | /* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */ | ||
323 | fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL); | ||
324 | /* Invalid path => ENOENT */ | ||
325 | fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT); | ||
326 | fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT); | ||
327 | fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT); | ||
328 | /* Attempt to execute directory => EACCES */ | ||
329 | fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES); | ||
330 | /* Attempt to execute non-executable => EACCES */ | ||
331 | fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES); | ||
332 | fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES); | ||
333 | fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH, | ||
334 | EACCES); | ||
335 | /* Attempt to execute nonsense FD => EBADF */ | ||
336 | fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF); | ||
337 | fail += check_execveat_fail(99, "execveat", 0, EBADF); | ||
338 | /* Attempt to execute relative to non-directory => ENOTDIR */ | ||
339 | fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR); | ||
340 | |||
341 | fail += check_execveat_pathmax(dot_dfd, "execveat", 0); | ||
342 | fail += check_execveat_pathmax(dot_dfd, "script", 1); | ||
343 | return fail; | ||
344 | } | ||
345 | |||
346 | static void prerequisites(void) | ||
347 | { | ||
348 | int fd; | ||
349 | const char *script = "#!/bin/sh\nexit $*\n"; | ||
350 | |||
351 | /* Create ephemeral copies of files */ | ||
352 | exe_cp("execveat", "execveat.ephemeral"); | ||
353 | exe_cp("execveat", "execveat.path.ephemeral"); | ||
354 | exe_cp("script", "script.ephemeral"); | ||
355 | mkdir("subdir.ephemeral", 0755); | ||
356 | |||
357 | fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); | ||
358 | write(fd, script, strlen(script)); | ||
359 | close(fd); | ||
360 | } | ||
361 | |||
362 | int main(int argc, char **argv) | ||
363 | { | ||
364 | int ii; | ||
365 | int rc; | ||
366 | const char *verbose = getenv("VERBOSE"); | ||
367 | |||
368 | if (argc >= 2) { | ||
369 | /* If we are invoked with an argument, don't run tests. */ | ||
370 | const char *in_test = getenv("IN_TEST"); | ||
371 | |||
372 | if (verbose) { | ||
373 | printf(" invoked with:"); | ||
374 | for (ii = 0; ii < argc; ii++) | ||
375 | printf(" [%d]='%s'", ii, argv[ii]); | ||
376 | printf("\n"); | ||
377 | } | ||
378 | |||
379 | /* Check expected environment transferred. */ | ||
380 | if (!in_test || strcmp(in_test, "yes") != 0) { | ||
381 | printf("[FAIL] (no IN_TEST=yes in env)\n"); | ||
382 | return 1; | ||
383 | } | ||
384 | |||
385 | /* Use the final argument as an exit code. */ | ||
386 | rc = atoi(argv[argc - 1]); | ||
387 | fflush(stdout); | ||
388 | } else { | ||
389 | prerequisites(); | ||
390 | if (verbose) | ||
391 | envp[1] = "VERBOSE=1"; | ||
392 | rc = run_tests(); | ||
393 | if (rc > 0) | ||
394 | printf("%d tests failed\n", rc); | ||
395 | } | ||
396 | return rc; | ||
397 | } | ||
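The check_execveat_* helpers above ultimately issue the new execveat() system call; assuming the C library ships no wrapper for it yet, the call has to go through syscall(2) directly. A minimal sketch of such an invocation follows (the execveat_raw() name, the __NR_execveat guard and the sample command are illustrative, not taken from the patch); the argument order follows execveat(2): dirfd, pathname, argv, envp, flags.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/*
 * Hypothetical raw wrapper: invoke execveat() via syscall(2), falling
 * back to ENOSYS where the syscall number is not defined.
 */
static int execveat_raw(int dirfd, const char *pathname,
			char *const argv[], char *const envp[], int flags)
{
#ifdef __NR_execveat
	return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags);
#else
	errno = ENOSYS;
	return -1;
#endif
}

int main(int argc, char **argv, char **envp)
{
	char *args[] = { "echo", "hello", NULL };

	/* With a non-empty path and flags=0 this behaves like execve(). */
	execveat_raw(AT_FDCWD, "/bin/echo", args, envp, 0);
	perror("execveat");	/* only reached on failure */
	return 1;
}

On success execveat() does not return in the caller; on failure it returns -1 with errno set, which is what the failure checks in the selftest rely on.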
diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 3d907dacf2ac..ac884b65a072 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | # Makefile for vm tools | 1 | # Makefile for vm tools |
2 | # | 2 | # |
3 | TARGETS=page-types slabinfo | 3 | TARGETS=page-types slabinfo page_owner_sort |
4 | 4 | ||
5 | LIB_DIR = ../lib/api | 5 | LIB_DIR = ../lib/api |
6 | LIBS = $(LIB_DIR)/libapikfs.a | 6 | LIBS = $(LIB_DIR)/libapikfs.a |
@@ -18,5 +18,5 @@ $(LIBS): | |||
18 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) | 18 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) |
19 | 19 | ||
20 | clean: | 20 | clean: |
21 | $(RM) page-types slabinfo | 21 | $(RM) page-types slabinfo page_owner_sort |
22 | make -C $(LIB_DIR) clean | 22 | make -C $(LIB_DIR) clean |
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c new file mode 100644 index 000000000000..77147b42d598 --- /dev/null +++ b/tools/vm/page_owner_sort.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * User-space helper to sort the output of /sys/kernel/debug/page_owner | ||
3 | * | ||
4 | * Example use: | ||
5 | * cat /sys/kernel/debug/page_owner > page_owner_full.txt | ||
6 | * grep -v ^PFN page_owner_full.txt > page_owner.txt | ||
7 | * ./page_owner_sort page_owner.txt sorted_page_owner.txt | ||
8 | */ | ||
9 | |||
10 | #include <stdio.h> | ||
11 | #include <stdlib.h> | ||
12 | #include <sys/types.h> | ||
13 | #include <sys/stat.h> | ||
14 | #include <fcntl.h> | ||
15 | #include <unistd.h> | ||
16 | #include <string.h> | ||
17 | |||
18 | struct block_list { | ||
19 | char *txt; | ||
20 | int len; | ||
21 | int num; | ||
22 | }; | ||
23 | |||
24 | |||
25 | static struct block_list *list; | ||
26 | static int list_size; | ||
27 | static int max_size; | ||
28 | |||
29 | struct block_list *block_head; | ||
30 | |||
31 | int read_block(char *buf, int buf_size, FILE *fin) | ||
32 | { | ||
33 | char *curr = buf, *const buf_end = buf + buf_size; | ||
34 | |||
35 | while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { | ||
36 | if (*curr == '\n') /* empty line */ | ||
37 | return curr - buf; | ||
38 | curr += strlen(curr); | ||
39 | } | ||
40 | |||
41 | return -1; /* EOF or no space left in buf. */ | ||
42 | } | ||
43 | |||
44 | static int compare_txt(const void *p1, const void *p2) | ||
45 | { | ||
46 | const struct block_list *l1 = p1, *l2 = p2; | ||
47 | |||
48 | return strcmp(l1->txt, l2->txt); | ||
49 | } | ||
50 | |||
51 | static int compare_num(const void *p1, const void *p2) | ||
52 | { | ||
53 | const struct block_list *l1 = p1, *l2 = p2; | ||
54 | |||
55 | return l2->num - l1->num; | ||
56 | } | ||
57 | |||
58 | static void add_list(char *buf, int len) | ||
59 | { | ||
60 | if (list_size != 0 && | ||
61 | len == list[list_size-1].len && | ||
62 | memcmp(buf, list[list_size-1].txt, len) == 0) { | ||
63 | list[list_size-1].num++; | ||
64 | return; | ||
65 | } | ||
66 | if (list_size == max_size) { | ||
67 | printf("max_size too small??\n"); | ||
68 | exit(1); | ||
69 | } | ||
70 | list[list_size].txt = malloc(len+1); | ||
71 | list[list_size].len = len; | ||
72 | list[list_size].num = 1; | ||
73 | memcpy(list[list_size].txt, buf, len); | ||
74 | list[list_size].txt[len] = 0; | ||
75 | list_size++; | ||
76 | if (list_size % 1000 == 0) { | ||
77 | printf("loaded %d\r", list_size); | ||
78 | fflush(stdout); | ||
79 | } | ||
80 | } | ||
81 | |||
82 | #define BUF_SIZE 1024 | ||
83 | |||
84 | int main(int argc, char **argv) | ||
85 | { | ||
86 | FILE *fin, *fout; | ||
87 | char buf[BUF_SIZE]; | ||
88 | int ret, i, count; | ||
89 | struct block_list *list2; | ||
90 | struct stat st; | ||
91 | |||
92 | if (argc < 3) { | ||
93 | printf("Usage: ./program <input> <output>\n"); | ||
94 | perror("open: "); | ||
95 | exit(1); | ||
96 | } | ||
97 | |||
98 | fin = fopen(argv[1], "r"); | ||
99 | fout = fopen(argv[2], "w"); | ||
100 | if (!fin || !fout) { | ||
101 | printf("Usage: ./program <input> <output>\n"); | ||
102 | perror("open: "); | ||
103 | exit(1); | ||
104 | } | ||
105 | |||
106 | fstat(fileno(fin), &st); | ||
107 | max_size = st.st_size / 100; /* hack ... */ | ||
108 | |||
109 | list = malloc(max_size * sizeof(*list)); | ||
110 | |||
111 | for ( ; ; ) { | ||
112 | ret = read_block(buf, BUF_SIZE, fin); | ||
113 | if (ret < 0) | ||
114 | break; | ||
115 | |||
116 | add_list(buf, ret); | ||
117 | } | ||
118 | |||
119 | printf("loaded %d\n", list_size); | ||
120 | |||
121 | printf("sorting ....\n"); | ||
122 | |||
123 | qsort(list, list_size, sizeof(list[0]), compare_txt); | ||
124 | |||
125 | list2 = malloc(sizeof(*list) * list_size); | ||
126 | |||
127 | printf("culling\n"); | ||
128 | |||
129 | for (i = count = 0; i < list_size; i++) { | ||
130 | if (count == 0 || | ||
131 | strcmp(list2[count-1].txt, list[i].txt) != 0) { | ||
132 | list2[count++] = list[i]; | ||
133 | } else { | ||
134 | list2[count-1].num += list[i].num; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | qsort(list2, count, sizeof(list[0]), compare_num); | ||
139 | |||
140 | for (i = 0; i < count; i++) | ||
141 | fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt); | ||
142 | |||
143 | return 0; | ||
144 | } | ||
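page_owner_sort collapses duplicate page_owner records by sorting the text blocks lexicographically (compare_txt), merging identical neighbours while summing their counts, and then re-sorting by count (compare_num) so the most common allocation stacks come first. A stripped-down sketch of that same sort, merge and count pattern on a plain string array is shown below; the sample strings and names are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { const char *txt; int num; };

static int by_txt(const void *a, const void *b)
{
	return strcmp(((const struct entry *)a)->txt,
		      ((const struct entry *)b)->txt);
}

static int by_num(const void *a, const void *b)
{
	return ((const struct entry *)b)->num - ((const struct entry *)a)->num;
}

int main(void)
{
	/* Stand-ins for the text blocks read by read_block(). */
	struct entry e[] = {
		{ "stack A", 1 }, { "stack B", 1 }, { "stack A", 1 },
		{ "stack A", 1 }, { "stack C", 1 },
	};
	int n = sizeof(e) / sizeof(e[0]), i, count = 0;

	qsort(e, n, sizeof(e[0]), by_txt);	/* group identical blocks */
	for (i = 0; i < n; i++) {		/* merge equal neighbours */
		if (count && !strcmp(e[count - 1].txt, e[i].txt))
			e[count - 1].num += e[i].num;
		else
			e[count++] = e[i];
	}
	qsort(e, count, sizeof(e[0]), by_num);	/* most frequent first */
	for (i = 0; i < count; i++)
		printf("%d times: %s\n", e[i].num, e[i].txt);
	return 0;
}

Sorting first makes duplicate records adjacent, so the merge is a single linear scan rather than a quadratic search over all blocks.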
diff --git a/usr/Kconfig b/usr/Kconfig index 2d4c77eecf2e..572dcf7b6a44 100644 --- a/usr/Kconfig +++ b/usr/Kconfig | |||
@@ -46,17 +46,17 @@ config INITRAMFS_ROOT_GID | |||
46 | If you are not sure, leave it set to "0". | 46 | If you are not sure, leave it set to "0". |
47 | 47 | ||
48 | config RD_GZIP | 48 | config RD_GZIP |
49 | bool "Support initial ramdisks compressed using gzip" if EXPERT | 49 | bool "Support initial ramdisks compressed using gzip" |
50 | default y | ||
51 | depends on BLK_DEV_INITRD | 50 | depends on BLK_DEV_INITRD |
51 | default y | ||
52 | select DECOMPRESS_GZIP | 52 | select DECOMPRESS_GZIP |
53 | help | 53 | help |
54 | Support loading of a gzip encoded initial ramdisk or cpio buffer. | 54 | Support loading of a gzip encoded initial ramdisk or cpio buffer. |
55 | If unsure, say Y. | 55 | If unsure, say Y. |
56 | 56 | ||
57 | config RD_BZIP2 | 57 | config RD_BZIP2 |
58 | bool "Support initial ramdisks compressed using bzip2" if EXPERT | 58 | bool "Support initial ramdisks compressed using bzip2" |
59 | default !EXPERT | 59 | default y |
60 | depends on BLK_DEV_INITRD | 60 | depends on BLK_DEV_INITRD |
61 | select DECOMPRESS_BZIP2 | 61 | select DECOMPRESS_BZIP2 |
62 | help | 62 | help |
@@ -64,8 +64,8 @@ config RD_BZIP2 | |||
64 | If unsure, say N. | 64 | If unsure, say N. |
65 | 65 | ||
66 | config RD_LZMA | 66 | config RD_LZMA |
67 | bool "Support initial ramdisks compressed using LZMA" if EXPERT | 67 | bool "Support initial ramdisks compressed using LZMA" |
68 | default !EXPERT | 68 | default y |
69 | depends on BLK_DEV_INITRD | 69 | depends on BLK_DEV_INITRD |
70 | select DECOMPRESS_LZMA | 70 | select DECOMPRESS_LZMA |
71 | help | 71 | help |
@@ -73,17 +73,17 @@ config RD_LZMA | |||
73 | If unsure, say N. | 73 | If unsure, say N. |
74 | 74 | ||
75 | config RD_XZ | 75 | config RD_XZ |
76 | bool "Support initial ramdisks compressed using XZ" if EXPERT | 76 | bool "Support initial ramdisks compressed using XZ" |
77 | default !EXPERT | ||
78 | depends on BLK_DEV_INITRD | 77 | depends on BLK_DEV_INITRD |
78 | default y | ||
79 | select DECOMPRESS_XZ | 79 | select DECOMPRESS_XZ |
80 | help | 80 | help |
81 | Support loading of a XZ encoded initial ramdisk or cpio buffer. | 81 | Support loading of a XZ encoded initial ramdisk or cpio buffer. |
82 | If unsure, say N. | 82 | If unsure, say N. |
83 | 83 | ||
84 | config RD_LZO | 84 | config RD_LZO |
85 | bool "Support initial ramdisks compressed using LZO" if EXPERT | 85 | bool "Support initial ramdisks compressed using LZO" |
86 | default !EXPERT | 86 | default y |
87 | depends on BLK_DEV_INITRD | 87 | depends on BLK_DEV_INITRD |
88 | select DECOMPRESS_LZO | 88 | select DECOMPRESS_LZO |
89 | help | 89 | help |
@@ -91,8 +91,8 @@ config RD_LZO | |||
91 | If unsure, say N. | 91 | If unsure, say N. |
92 | 92 | ||
93 | config RD_LZ4 | 93 | config RD_LZ4 |
94 | bool "Support initial ramdisks compressed using LZ4" if EXPERT | 94 | bool "Support initial ramdisks compressed using LZ4" |
95 | default !EXPERT | 95 | default y |
96 | depends on BLK_DEV_INITRD | 96 | depends on BLK_DEV_INITRD |
97 | select DECOMPRESS_LZ4 | 97 | select DECOMPRESS_LZ4 |
98 | help | 98 | help |