154 files changed, 3209 insertions, 1364 deletions
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 3c94ff3f9693..f2235a162529 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt | |||
| @@ -445,7 +445,7 @@ across partially overlapping sets of CPUs would risk unstable dynamics | |||
| 445 | that would be beyond our understanding. So if each of two partially | 445 | that would be beyond our understanding. So if each of two partially |
| 446 | overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we | 446 | overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we |
| 447 | form a single sched domain that is a superset of both. We won't move | 447 | form a single sched domain that is a superset of both. We won't move |
| 448 | a task to a CPU outside it cpuset, but the scheduler load balancing | 448 | a task to a CPU outside its cpuset, but the scheduler load balancing |
| 449 | code might waste some compute cycles considering that possibility. | 449 | code might waste some compute cycles considering that possibility. |
| 450 | 450 | ||
| 451 | This mismatch is why there is not a simple one-to-one relation | 451 | This mismatch is why there is not a simple one-to-one relation |
| @@ -552,8 +552,8 @@ otherwise initial value -1 that indicates the cpuset has no request. | |||
| 552 | 1 : search siblings (hyperthreads in a core). | 552 | 1 : search siblings (hyperthreads in a core). |
| 553 | 2 : search cores in a package. | 553 | 2 : search cores in a package. |
| 554 | 3 : search cpus in a node [= system wide on non-NUMA system] | 554 | 3 : search cpus in a node [= system wide on non-NUMA system] |
| 555 | ( 4 : search nodes in a chunk of node [on NUMA system] ) | 555 | 4 : search nodes in a chunk of node [on NUMA system] |
| 556 | ( 5 : search system wide [on NUMA system] ) | 556 | 5 : search system wide [on NUMA system] |
| 557 | 557 | ||
| 558 | The system default is architecture dependent. The system default | 558 | The system default is architecture dependent. The system default |
| 559 | can be changed using the relax_domain_level= boot parameter. | 559 | can be changed using the relax_domain_level= boot parameter. |
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 46b2b5080317..a22df3ad35ff 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
| @@ -326,7 +326,7 @@ per cgroup, instead of globally. | |||
| 326 | 326 | ||
| 327 | * tcp memory pressure: sockets memory pressure for the tcp protocol. | 327 | * tcp memory pressure: sockets memory pressure for the tcp protocol. |
| 328 | 328 | ||
| 329 | 2.7.3 Common use cases | 329 | 2.7.2 Common use cases |
| 330 | 330 | ||
| 331 | Because the "kmem" counter is fed to the main user counter, kernel memory can | 331 | Because the "kmem" counter is fed to the main user counter, kernel memory can |
| 332 | never be limited completely independently of user memory. Say "U" is the user | 332 | never be limited completely independently of user memory. Say "U" is the user |
| @@ -354,19 +354,19 @@ set: | |||
| 354 | 354 | ||
| 355 | 3. User Interface | 355 | 3. User Interface |
| 356 | 356 | ||
| 357 | 0. Configuration | 357 | 3.0. Configuration |
| 358 | 358 | ||
| 359 | a. Enable CONFIG_CGROUPS | 359 | a. Enable CONFIG_CGROUPS |
| 360 | b. Enable CONFIG_MEMCG | 360 | b. Enable CONFIG_MEMCG |
| 361 | c. Enable CONFIG_MEMCG_SWAP (to use swap extension) | 361 | c. Enable CONFIG_MEMCG_SWAP (to use swap extension) |
| 362 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) | 362 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) |
| 363 | 363 | ||
| 364 | 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) | 364 | 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) |
| 365 | # mount -t tmpfs none /sys/fs/cgroup | 365 | # mount -t tmpfs none /sys/fs/cgroup |
| 366 | # mkdir /sys/fs/cgroup/memory | 366 | # mkdir /sys/fs/cgroup/memory |
| 367 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory | 367 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory |
| 368 | 368 | ||
| 369 | 2. Make the new group and move bash into it | 369 | 3.2. Make the new group and move bash into it |
| 370 | # mkdir /sys/fs/cgroup/memory/0 | 370 | # mkdir /sys/fs/cgroup/memory/0 |
| 371 | # echo $$ > /sys/fs/cgroup/memory/0/tasks | 371 | # echo $$ > /sys/fs/cgroup/memory/0/tasks |
| 372 | 372 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43ecdcd39df2..4a337daf0c09 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -829,6 +829,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 829 | CONFIG_DEBUG_PAGEALLOC, hence this option will not help | 829 | CONFIG_DEBUG_PAGEALLOC, hence this option will not help |
| 830 | tracking down these problems. | 830 | tracking down these problems. |
| 831 | 831 | ||
| 832 | debug_pagealloc= | ||
| 833 | [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this | ||
| 834 | parameter enables the feature at boot time. By | ||
| 835 | default, it is disabled. If it is not enabled at boot | ||
| 836 | time, no huge chunk of memory has to be allocated for | ||
| 837 | debug pagealloc, and the system behaves much the same | ||
| 838 | as a kernel built without CONFIG_DEBUG_PAGEALLOC. | ||
| 839 | on: enable the feature | ||
| 840 | |||
| 832 | debugpat [X86] Enable PAT debugging | 841 | debugpat [X86] Enable PAT debugging |
| 833 | 842 | ||
| 834 | decnet.addr= [HW,NET] | 843 | decnet.addr= [HW,NET] |
| @@ -1228,9 +1237,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 1228 | multiple times interleaved with hugepages= to reserve | 1237 | multiple times interleaved with hugepages= to reserve |
| 1229 | huge pages of different sizes. Valid pages sizes on | 1238 | huge pages of different sizes. Valid pages sizes on |
| 1230 | x86-64 are 2M (when the CPU supports "pse") and 1G | 1239 | x86-64 are 2M (when the CPU supports "pse") and 1G |
| 1231 | (when the CPU supports the "pdpe1gb" cpuinfo flag) | 1240 | (when the CPU supports the "pdpe1gb" cpuinfo flag). |
| 1232 | Note that 1GB pages can only be allocated at boot time | ||
| 1233 | using hugepages= and not freed afterwards. | ||
| 1234 | 1241 | ||
| 1235 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) | 1242 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) |
| 1236 | terminal devices. Valid values: 0..8 | 1243 | terminal devices. Valid values: 0..8 |
| @@ -2506,6 +2513,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 2506 | OSS [HW,OSS] | 2513 | OSS [HW,OSS] |
| 2507 | See Documentation/sound/oss/oss-parameters.txt | 2514 | See Documentation/sound/oss/oss-parameters.txt |
| 2508 | 2515 | ||
| 2516 | page_owner= [KNL] Boot-time page_owner enabling option. | ||
| 2517 | Storage of the information about who allocated | ||
| 2518 | each page is disabled by default. With this switch, | ||
| 2519 | it can be turned on. | ||
| 2520 | on: enable the feature | ||
| 2521 | |||
| 2509 | panic= [KNL] Kernel behaviour on panic: delay <timeout> | 2522 | panic= [KNL] Kernel behaviour on panic: delay <timeout> |
| 2510 | timeout > 0: seconds before rebooting | 2523 | timeout > 0: seconds before rebooting |
| 2511 | timeout = 0: wait forever | 2524 | timeout = 0: wait forever |
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt index 300da4bdfdbd..407576a23317 100644 --- a/Documentation/local_ops.txt +++ b/Documentation/local_ops.txt | |||
| @@ -8,6 +8,11 @@ to implement them for any given architecture and shows how they can be used | |||
| 8 | properly. It also stresses on the precautions that must be taken when reading | 8 | properly. It also stresses on the precautions that must be taken when reading |
| 9 | those local variables across CPUs when the order of memory writes matters. | 9 | those local variables across CPUs when the order of memory writes matters. |
| 10 | 10 | ||
| 11 | Note that local_t based operations are not recommended for general kernel use. | ||
| 12 | Please use the this_cpu operations instead unless there is a genuinely special purpose. | ||
| 13 | Most uses of local_t in the kernel have been replaced by this_cpu operations. | ||
| 14 | this_cpu operations combine the per-cpu relocation with local_t-like semantics in | ||
| 15 | a single instruction and yield more compact and faster executing code. | ||
| 11 | 16 | ||
| 12 | 17 | ||
| 13 | * Purpose of local atomic operations | 18 | * Purpose of local atomic operations |
| @@ -87,10 +92,10 @@ the per cpu variable. For instance : | |||
| 87 | local_inc(&get_cpu_var(counters)); | 92 | local_inc(&get_cpu_var(counters)); |
| 88 | put_cpu_var(counters); | 93 | put_cpu_var(counters); |
| 89 | 94 | ||
| 90 | If you are already in a preemption-safe context, you can directly use | 95 | If you are already in a preemption-safe context, you can use |
| 91 | __get_cpu_var() instead. | 96 | this_cpu_ptr() instead. |
| 92 | 97 | ||
| 93 | local_inc(&__get_cpu_var(counters)); | 98 | local_inc(this_cpu_ptr(&counters)); |
| 94 | 99 | ||
| 95 | 100 | ||
| 96 | 101 | ||
| @@ -134,7 +139,7 @@ static void test_each(void *info) | |||
| 134 | { | 139 | { |
| 135 | /* Increment the counter from a non preemptible context */ | 140 | /* Increment the counter from a non preemptible context */ |
| 136 | printk("Increment on cpu %d\n", smp_processor_id()); | 141 | printk("Increment on cpu %d\n", smp_processor_id()); |
| 137 | local_inc(&__get_cpu_var(counters)); | 142 | local_inc(this_cpu_ptr(&counters)); |
| 138 | 143 | ||
| 139 | /* This is what incrementing the variable would look like within a | 144 | /* This is what incrementing the variable would look like within a |
| 140 | * preemptible context (it disables preemption) : | 145 | * preemptible context (it disables preemption) : |
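The note added to local_ops.txt above recommends this_cpu operations over local_t for new code. As a rough illustration of the difference (not part of the patch; the variable names are made up), a per-cpu counter can be kept either way:

	/* Illustrative sketch only: a per-cpu event counter kept with a
	 * local_t versus a plain this_cpu operation.
	 */
	#include <linux/percpu.h>
	#include <asm/local.h>

	static DEFINE_PER_CPU(local_t, legacy_counter);      /* local_t style  */
	static DEFINE_PER_CPU(unsigned long, fast_counter);  /* this_cpu style */

	static void count_event(void)
	{
		/* local_t: the per-cpu relocation (this_cpu_ptr) and the
		 * local atomic op are two separate steps.
		 */
		local_inc(this_cpu_ptr(&legacy_counter));

		/* this_cpu op: relocation and increment folded into a single
		 * instruction on architectures that support it, safe against
		 * preemption and interrupts without get_cpu_var/put_cpu_var.
		 */
		this_cpu_inc(fast_counter);
	}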
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index b5d0c8501a18..75511efefc64 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
| @@ -116,10 +116,12 @@ set during run time. | |||
| 116 | 116 | ||
| 117 | auto_msgmni: | 117 | auto_msgmni: |
| 118 | 118 | ||
| 119 | Enables/Disables automatic recomputing of msgmni upon memory add/remove | 119 | This variable has no effect and may be removed in future kernel |
| 120 | or upon ipc namespace creation/removal (see the msgmni description | 120 | releases. Reading it always returns 0. |
| 121 | above). Echoing "1" into this file enables msgmni automatic recomputing. | 121 | Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni |
| 122 | Echoing "0" turns it off. auto_msgmni default value is 1. | 122 | upon memory add/remove or upon ipc namespace creation/removal. |
| 123 | Echoing "1" into this file enabled msgmni automatic recomputing. | ||
| 124 | Echoing "0" turned it off. auto_msgmni default value was 1. | ||
| 123 | 125 | ||
| 124 | 126 | ||
| 125 | ============================================================== | 127 | ============================================================== |
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt new file mode 100644 index 000000000000..8f3ce9b3aa11 --- /dev/null +++ b/Documentation/vm/page_owner.txt | |||
| @@ -0,0 +1,81 @@ | |||
| 1 | page owner: Tracking who allocated each page | ||
| 2 | ----------------------------------------------------------- | ||
| 3 | |||
| 4 | * Introduction | ||
| 5 | |||
| 6 | page owner is for tracking who allocated each page. | ||
| 7 | It can be used to debug memory leaks or to find a memory hogger. | ||
| 8 | When an allocation happens, information about it, such as the call stack | ||
| 9 | and the order of the pages, is stored in per-page storage. | ||
| 10 | When we need to know the status of all pages, we can retrieve and analyze | ||
| 11 | this information. | ||
| 12 | |||
| 13 | Although we already have tracepoints for page allocation/free, | ||
| 14 | using them to analyze who allocated each page is rather complex. The | ||
| 15 | trace buffer must be enlarged so events are not overwritten before the | ||
| 16 | userspace program launches, and that program must continually dump the | ||
| 17 | trace buffer for later analysis, which changes system behaviour far more | ||
| 18 | than just keeping the data in memory, so it is bad for debugging. | ||
| 19 | |||
| 20 | page owner can also be used for various purposes. For example, accurate | ||
| 21 | fragmentation statistics can be obtained through gfp flag information of | ||
| 22 | each page. It is already implemented and activated if page owner is | ||
| 23 | enabled. Other usages are more than welcome. | ||
| 24 | |||
| 25 | page owner is disabled by default, so if you'd like to use it you need | ||
| 26 | to add "page_owner=on" to your boot cmdline. If the kernel is built | ||
| 27 | with page owner but it is left disabled at runtime because the boot | ||
| 28 | option is not given, the runtime overhead is marginal. When disabled, it | ||
| 29 | requires no memory to store owner information, so there is no runtime | ||
| 30 | memory overhead. page owner inserts just two unlikely branches into | ||
| 31 | the page allocator hotpath, and if they are not taken the allocation is | ||
| 32 | done just as in a kernel without page owner. These two unlikely branches | ||
| 33 | should not affect allocation performance. The following is the kernel's | ||
| 34 | code size change due to this facility. | ||
| 35 | |||
| 36 | - Without page owner | ||
| 37 | text data bss dec hex filename | ||
| 38 | 40662 1493 644 42799 a72f mm/page_alloc.o | ||
| 39 | |||
| 40 | - With page owner | ||
| 41 | text data bss dec hex filename | ||
| 42 | 40892 1493 644 43029 a815 mm/page_alloc.o | ||
| 43 | 1427 24 8 1459 5b3 mm/page_ext.o | ||
| 44 | 2722 50 0 2772 ad4 mm/page_owner.o | ||
| 45 | |||
| 46 | Although roughly 4 KB of code is added in total, page_alloc.o grows by | ||
| 47 | only 230 bytes and only half of that is in the hotpath. Building the | ||
| 48 | kernel with page owner and turning it on when needed is a good option | ||
| 49 | for debugging kernel memory problems. | ||
| 50 | |||
| 51 | There is one caveat caused by an implementation detail. page owner | ||
| 52 | stores its information in memory from the struct page extension. On a | ||
| 53 | sparse memory system this memory is initialized some time after the page | ||
| 54 | allocator starts, so until then many pages can be allocated without any | ||
| 55 | owner information. To fix this up, these early allocated pages are | ||
| 56 | investigated and marked as allocated during the initialization phase. | ||
| 57 | Although this does not mean that they have the right owner information, | ||
| 58 | at least we can tell more accurately whether a page is allocated or not. | ||
| 59 | On a 2GB memory x86-64 VM box, 13343 early allocated pages are caught | ||
| 60 | and marked, although they are mostly allocated by the struct page | ||
| 61 | extension feature itself. After that, no page is left in an | ||
| 62 | untracked state. | ||
| 63 | |||
| 64 | * Usage | ||
| 65 | |||
| 66 | 1) Build user-space helper | ||
| 67 | cd tools/vm | ||
| 68 | make page_owner_sort | ||
| 69 | |||
| 70 | 2) Enable page owner | ||
| 71 | Add "page_owner=on" to boot cmdline. | ||
| 72 | |||
| 73 | 3) Do the job what you want to debug | ||
| 74 | |||
| 75 | 4) Analyze information from page owner | ||
| 76 | cat /sys/kernel/debug/page_owner > page_owner_full.txt | ||
| 77 | grep -v ^PFN page_owner_full.txt > page_owner.txt | ||
| 78 | ./page_owner_sort page_owner.txt sorted_page_owner.txt | ||
| 79 | |||
| 80 | See the result showing who allocated each page | ||
| 81 | in sorted_page_owner.txt. | ||
diff --git a/MAINTAINERS b/MAINTAINERS index 326dc2d1652d..1f0ef48830f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -4045,7 +4045,7 @@ F: drivers/tty/serial/ucc_uart.c | |||
| 4045 | FREESCALE SOC SOUND DRIVERS | 4045 | FREESCALE SOC SOUND DRIVERS |
| 4046 | M: Timur Tabi <timur@tabi.org> | 4046 | M: Timur Tabi <timur@tabi.org> |
| 4047 | M: Nicolin Chen <nicoleotsuka@gmail.com> | 4047 | M: Nicolin Chen <nicoleotsuka@gmail.com> |
| 4048 | M: Xiubo Li <Li.Xiubo@freescale.com> | 4048 | M: Xiubo Li <Xiubo.Lee@gmail.com> |
| 4049 | L: alsa-devel@alsa-project.org (moderated for non-subscribers) | 4049 | L: alsa-devel@alsa-project.org (moderated for non-subscribers) |
| 4050 | L: linuxppc-dev@lists.ozlabs.org | 4050 | L: linuxppc-dev@lists.ozlabs.org |
| 4051 | S: Maintained | 4051 | S: Maintained |
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0bee1fe209b1..97d07ed60a0b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig | |||
| @@ -5,6 +5,7 @@ config ARM | |||
| 5 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 5 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
| 6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST | 6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST |
| 7 | select ARCH_HAVE_CUSTOM_GPIO_H | 7 | select ARCH_HAVE_CUSTOM_GPIO_H |
| 8 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 8 | select ARCH_MIGHT_HAVE_PC_PARPORT | 9 | select ARCH_MIGHT_HAVE_PC_PARPORT |
| 9 | select ARCH_SUPPORTS_ATOMIC_RMW | 10 | select ARCH_SUPPORTS_ATOMIC_RMW |
| 10 | select ARCH_USE_BUILTIN_BSWAP | 11 | select ARCH_USE_BUILTIN_BSWAP |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6b1ebd964c10..688db03ef5b8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
| @@ -2,6 +2,7 @@ config ARM64 | |||
| 2 | def_bool y | 2 | def_bool y |
| 3 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 3 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE |
| 4 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 4 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
| 5 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 5 | select ARCH_HAS_SG_CHAIN | 6 | select ARCH_HAS_SG_CHAIN |
| 6 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST | 7 | select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST |
| 7 | select ARCH_USE_CMPXCHG_LOCKREF | 8 | select ARCH_USE_CMPXCHG_LOCKREF |
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a7736fa0580c..0bce820428fc 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | config MICROBLAZE | 1 | config MICROBLAZE |
| 2 | def_bool y | 2 | def_bool y |
| 3 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 3 | select ARCH_MIGHT_HAVE_PC_PARPORT | 4 | select ARCH_MIGHT_HAVE_PC_PARPORT |
| 4 | select ARCH_WANT_IPC_PARSE_VERSION | 5 | select ARCH_WANT_IPC_PARSE_VERSION |
| 5 | select ARCH_WANT_OPTIONAL_GPIOLIB | 6 | select ARCH_WANT_OPTIONAL_GPIOLIB |
diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S index f8c45cc2947d..536ef66bb94b 100644 --- a/arch/parisc/lib/fixup.S +++ b/arch/parisc/lib/fixup.S | |||
| @@ -38,14 +38,14 @@ | |||
| 38 | LDREGX \t2(\t1),\t2 | 38 | LDREGX \t2(\t1),\t2 |
| 39 | addil LT%exception_data,%r27 | 39 | addil LT%exception_data,%r27 |
| 40 | LDREG RT%exception_data(%r1),\t1 | 40 | LDREG RT%exception_data(%r1),\t1 |
| 41 | /* t1 = &__get_cpu_var(exception_data) */ | 41 | /* t1 = this_cpu_ptr(&exception_data) */ |
| 42 | add,l \t1,\t2,\t1 | 42 | add,l \t1,\t2,\t1 |
| 43 | /* t1 = t1->fault_ip */ | 43 | /* t1 = t1->fault_ip */ |
| 44 | LDREG EXCDATA_IP(\t1), \t1 | 44 | LDREG EXCDATA_IP(\t1), \t1 |
| 45 | .endm | 45 | .endm |
| 46 | #else | 46 | #else |
| 47 | .macro get_fault_ip t1 t2 | 47 | .macro get_fault_ip t1 t2 |
| 48 | /* t1 = &__get_cpu_var(exception_data) */ | 48 | /* t1 = this_cpu_ptr(&exception_data) */ |
| 49 | addil LT%exception_data,%r27 | 49 | addil LT%exception_data,%r27 |
| 50 | LDREG RT%exception_data(%r1),\t2 | 50 | LDREG RT%exception_data(%r1),\t2 |
| 51 | /* t1 = t2->fault_ip */ | 51 | /* t1 = t2->fault_ip */ |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index af696874248b..a2a168e2dfe7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
| @@ -129,6 +129,7 @@ config PPC | |||
| 129 | select HAVE_BPF_JIT if PPC64 | 129 | select HAVE_BPF_JIT if PPC64 |
| 130 | select HAVE_ARCH_JUMP_LABEL | 130 | select HAVE_ARCH_JUMP_LABEL |
| 131 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 131 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
| 132 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 132 | select GENERIC_SMP_IDLE_THREAD | 133 | select GENERIC_SMP_IDLE_THREAD |
| 133 | select GENERIC_CMOS_UPDATE | 134 | select GENERIC_CMOS_UPDATE |
| 134 | select GENERIC_TIME_VSYSCALL_OLD | 135 | select GENERIC_TIME_VSYSCALL_OLD |
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e56a307bc676..2c2022d16059 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c | |||
| @@ -1514,7 +1514,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) | |||
| 1514 | mmu_kernel_ssize, 0); | 1514 | mmu_kernel_ssize, 0); |
| 1515 | } | 1515 | } |
| 1516 | 1516 | ||
| 1517 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1517 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 1518 | { | 1518 | { |
| 1519 | unsigned long flags, vaddr, lmi; | 1519 | unsigned long flags, vaddr, lmi; |
| 1520 | int i; | 1520 | int i; |
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d545b1231594..50fad3801f30 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c | |||
| @@ -429,7 +429,7 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) | |||
| 429 | } | 429 | } |
| 430 | 430 | ||
| 431 | 431 | ||
| 432 | void kernel_map_pages(struct page *page, int numpages, int enable) | 432 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 433 | { | 433 | { |
| 434 | if (PageHighMem(page)) | 434 | if (PageHighMem(page)) |
| 435 | return; | 435 | return; |
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f2cf1f90295b..68b68d755fdf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
| @@ -65,6 +65,7 @@ config S390 | |||
| 65 | def_bool y | 65 | def_bool y |
| 66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
| 67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS | 67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS |
| 68 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 68 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 69 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
| 69 | select ARCH_INLINE_READ_LOCK | 70 | select ARCH_INLINE_READ_LOCK |
| 70 | select ARCH_INLINE_READ_LOCK_BH | 71 | select ARCH_INLINE_READ_LOCK_BH |
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 3fef3b299665..426c9d462d1c 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c | |||
| @@ -120,7 +120,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) | |||
| 120 | } | 120 | } |
| 121 | } | 121 | } |
| 122 | 122 | ||
| 123 | void kernel_map_pages(struct page *page, int numpages, int enable) | 123 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 124 | { | 124 | { |
| 125 | unsigned long address; | 125 | unsigned long address; |
| 126 | int nr, i, j; | 126 | int nr, i, j; |
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a1403470f80e..c6b6ee5f38b2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig | |||
| @@ -16,6 +16,7 @@ config SUPERH | |||
| 16 | select HAVE_DEBUG_BUGVERBOSE | 16 | select HAVE_DEBUG_BUGVERBOSE |
| 17 | select ARCH_HAVE_CUSTOM_GPIO_H | 17 | select ARCH_HAVE_CUSTOM_GPIO_H |
| 18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) | 18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) |
| 19 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 19 | select PERF_USE_VMALLOC | 20 | select PERF_USE_VMALLOC |
| 20 | select HAVE_DEBUG_KMEMLEAK | 21 | select HAVE_DEBUG_KMEMLEAK |
| 21 | select HAVE_KERNEL_GZIP | 22 | select HAVE_KERNEL_GZIP |
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index 46d83842eddc..6f35f4df17f2 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h | |||
| @@ -415,8 +415,9 @@ | |||
| 415 | #define __NR_getrandom 347 | 415 | #define __NR_getrandom 347 |
| 416 | #define __NR_memfd_create 348 | 416 | #define __NR_memfd_create 348 |
| 417 | #define __NR_bpf 349 | 417 | #define __NR_bpf 349 |
| 418 | #define __NR_execveat 350 | ||
| 418 | 419 | ||
| 419 | #define NR_syscalls 350 | 420 | #define NR_syscalls 351 |
| 420 | 421 | ||
| 421 | /* Bitmask values returned from kern_features system call. */ | 422 | /* Bitmask values returned from kern_features system call. */ |
| 422 | #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 | 423 | #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 |
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index 33a17e7b3ccd..bb0008927598 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S | |||
| @@ -6,6 +6,11 @@ sys64_execve: | |||
| 6 | jmpl %g1, %g0 | 6 | jmpl %g1, %g0 |
| 7 | flushw | 7 | flushw |
| 8 | 8 | ||
| 9 | sys64_execveat: | ||
| 10 | set sys_execveat, %g1 | ||
| 11 | jmpl %g1, %g0 | ||
| 12 | flushw | ||
| 13 | |||
| 9 | #ifdef CONFIG_COMPAT | 14 | #ifdef CONFIG_COMPAT |
| 10 | sunos_execv: | 15 | sunos_execv: |
| 11 | mov %g0, %o2 | 16 | mov %g0, %o2 |
| @@ -13,6 +18,11 @@ sys32_execve: | |||
| 13 | set compat_sys_execve, %g1 | 18 | set compat_sys_execve, %g1 |
| 14 | jmpl %g1, %g0 | 19 | jmpl %g1, %g0 |
| 15 | flushw | 20 | flushw |
| 21 | |||
| 22 | sys32_execveat: | ||
| 23 | set compat_sys_execveat, %g1 | ||
| 24 | jmpl %g1, %g0 | ||
| 25 | flushw | ||
| 16 | #endif | 26 | #endif |
| 17 | 27 | ||
| 18 | .align 32 | 28 | .align 32 |
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index ad0cdf497b78..e31a9056a303 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S | |||
| @@ -87,3 +87,4 @@ sys_call_table: | |||
| 87 | /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev | 87 | /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev |
| 88 | /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 88 | /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
| 89 | /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 89 | /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
| 90 | /*350*/ .long sys_execveat | ||
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 580cde9370c9..d72f76ae70eb 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S | |||
| @@ -88,6 +88,7 @@ sys_call_table32: | |||
| 88 | .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev | 88 | .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev |
| 89 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 89 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
| 90 | .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 90 | .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
| 91 | /*350*/ .word sys32_execveat | ||
| 91 | 92 | ||
| 92 | #endif /* CONFIG_COMPAT */ | 93 | #endif /* CONFIG_COMPAT */ |
| 93 | 94 | ||
| @@ -167,3 +168,4 @@ sys_call_table: | |||
| 167 | .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev | 168 | .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev |
| 168 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr | 169 | /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr |
| 169 | .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf | 170 | .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf |
| 171 | /*350*/ .word sys64_execveat | ||
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 2d91c62f7f5f..3ea267c53320 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
| @@ -1621,7 +1621,7 @@ static void __init kernel_physical_mapping_init(void) | |||
| 1621 | } | 1621 | } |
| 1622 | 1622 | ||
| 1623 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1623 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 1624 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1624 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 1625 | { | 1625 | { |
| 1626 | unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; | 1626 | unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; |
| 1627 | unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); | 1627 | unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bea3a0159496..d69f1cd87fd9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -24,6 +24,7 @@ config X86 | |||
| 24 | select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI | 24 | select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI |
| 25 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS | 25 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS |
| 26 | select ARCH_HAS_FAST_MULTIPLIER | 26 | select ARCH_HAS_FAST_MULTIPLIER |
| 27 | select ARCH_HAS_GCOV_PROFILE_ALL | ||
| 27 | select ARCH_MIGHT_HAVE_PC_PARPORT | 28 | select ARCH_MIGHT_HAVE_PC_PARPORT |
| 28 | select ARCH_MIGHT_HAVE_PC_SERIO | 29 | select ARCH_MIGHT_HAVE_PC_SERIO |
| 29 | select HAVE_AOUT if X86_32 | 30 | select HAVE_AOUT if X86_32 |
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692..2eccc8932ae6 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c | |||
| @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) | |||
| 35 | case __NR_socketcall: | 35 | case __NR_socketcall: |
| 36 | return 4; | 36 | return 4; |
| 37 | case __NR_execve: | 37 | case __NR_execve: |
| 38 | case __NR_execveat: | ||
| 38 | return 5; | 39 | return 5; |
| 39 | default: | 40 | default: |
| 40 | return 1; | 41 | return 1; |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffe71228fc10..82e8a1d44658 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
| @@ -480,6 +480,7 @@ GLOBAL(\label) | |||
| 480 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn | 480 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn |
| 481 | PTREGSCALL stub32_sigreturn, sys32_sigreturn | 481 | PTREGSCALL stub32_sigreturn, sys32_sigreturn |
| 482 | PTREGSCALL stub32_execve, compat_sys_execve | 482 | PTREGSCALL stub32_execve, compat_sys_execve |
| 483 | PTREGSCALL stub32_execveat, compat_sys_execveat | ||
| 483 | PTREGSCALL stub32_fork, sys_fork | 484 | PTREGSCALL stub32_fork, sys_fork |
| 484 | PTREGSCALL stub32_vfork, sys_vfork | 485 | PTREGSCALL stub32_vfork, sys_vfork |
| 485 | 486 | ||
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c | |||
| @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) | |||
| 50 | case __NR_openat: | 50 | case __NR_openat: |
| 51 | return 3; | 51 | return 3; |
| 52 | case __NR_execve: | 52 | case __NR_execve: |
| 53 | case __NR_execveat: | ||
| 53 | return 5; | 54 | return 5; |
| 54 | default: | 55 | default: |
| 55 | return 0; | 56 | return 0; |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab54106..90878aa38dbd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -652,6 +652,20 @@ ENTRY(stub_execve) | |||
| 652 | CFI_ENDPROC | 652 | CFI_ENDPROC |
| 653 | END(stub_execve) | 653 | END(stub_execve) |
| 654 | 654 | ||
| 655 | ENTRY(stub_execveat) | ||
| 656 | CFI_STARTPROC | ||
| 657 | addq $8, %rsp | ||
| 658 | PARTIAL_FRAME 0 | ||
| 659 | SAVE_REST | ||
| 660 | FIXUP_TOP_OF_STACK %r11 | ||
| 661 | call sys_execveat | ||
| 662 | RESTORE_TOP_OF_STACK %r11 | ||
| 663 | movq %rax,RAX(%rsp) | ||
| 664 | RESTORE_REST | ||
| 665 | jmp int_ret_from_sys_call | ||
| 666 | CFI_ENDPROC | ||
| 667 | END(stub_execveat) | ||
| 668 | |||
| 655 | /* | 669 | /* |
| 656 | * sigreturn is special because it needs to restore all registers on return. | 670 | * sigreturn is special because it needs to restore all registers on return. |
| 657 | * This cannot be done with SYSRET, so use the IRET return path instead. | 671 | * This cannot be done with SYSRET, so use the IRET return path instead. |
| @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) | |||
| 697 | CFI_ENDPROC | 711 | CFI_ENDPROC |
| 698 | END(stub_x32_execve) | 712 | END(stub_x32_execve) |
| 699 | 713 | ||
| 714 | ENTRY(stub_x32_execveat) | ||
| 715 | CFI_STARTPROC | ||
| 716 | addq $8, %rsp | ||
| 717 | PARTIAL_FRAME 0 | ||
| 718 | SAVE_REST | ||
| 719 | FIXUP_TOP_OF_STACK %r11 | ||
| 720 | call compat_sys_execveat | ||
| 721 | RESTORE_TOP_OF_STACK %r11 | ||
| 722 | movq %rax,RAX(%rsp) | ||
| 723 | RESTORE_REST | ||
| 724 | jmp int_ret_from_sys_call | ||
| 725 | CFI_ENDPROC | ||
| 726 | END(stub_x32_execveat) | ||
| 727 | |||
| 700 | #endif | 728 | #endif |
| 701 | 729 | ||
| 702 | /* | 730 | /* |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3a5d46605d2..dfaf2e0f5f8f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
| @@ -1817,7 +1817,7 @@ static int __set_pages_np(struct page *page, int numpages) | |||
| 1817 | return __change_page_attr_set_clr(&cpa, 0); | 1817 | return __change_page_attr_set_clr(&cpa, 0); |
| 1818 | } | 1818 | } |
| 1819 | 1819 | ||
| 1820 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1820 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 1821 | { | 1821 | { |
| 1822 | if (PageHighMem(page)) | 1822 | if (PageHighMem(page)) |
| 1823 | return; | 1823 | return; |
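The kernel_map_pages() to __kernel_map_pages() renames above (powerpc, s390, sparc and x86) make room for a generic wrapper that honours the new debug_pagealloc= boot parameter documented earlier. The wrapper itself is not in this excerpt; the sketch below is an assumption of how such a gate typically looks and is not taken from the patch:

	/* Hypothetical sketch of a debug_pagealloc gate (not from this diff):
	 * the arch hook is only called when the feature was enabled at boot.
	 */
	static inline void kernel_map_pages(struct page *page,
					    int numpages, int enable)
	{
		if (!debug_pagealloc_enabled())	/* reflects "debug_pagealloc=on" */
			return;
		__kernel_map_pages(page, numpages, enable);
	}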
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 9fe1b5d002f0..b3560ece1c9f 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl | |||
| @@ -364,3 +364,4 @@ | |||
| 364 | 355 i386 getrandom sys_getrandom | 364 | 355 i386 getrandom sys_getrandom |
| 365 | 356 i386 memfd_create sys_memfd_create | 365 | 356 i386 memfd_create sys_memfd_create |
| 366 | 357 i386 bpf sys_bpf | 366 | 357 i386 bpf sys_bpf |
| 367 | 358 i386 execveat sys_execveat stub32_execveat | ||
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 281150b539a2..8d656fbb57aa 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl | |||
| @@ -328,6 +328,7 @@ | |||
| 328 | 319 common memfd_create sys_memfd_create | 328 | 319 common memfd_create sys_memfd_create |
| 329 | 320 common kexec_file_load sys_kexec_file_load | 329 | 320 common kexec_file_load sys_kexec_file_load |
| 330 | 321 common bpf sys_bpf | 330 | 321 common bpf sys_bpf |
| 331 | 322 64 execveat stub_execveat | ||
| 331 | 332 | ||
| 332 | # | 333 | # |
| 333 | # x32-specific system call numbers start at 512 to avoid cache impact | 334 | # x32-specific system call numbers start at 512 to avoid cache impact |
| @@ -366,3 +367,4 @@ | |||
| 366 | 542 x32 getsockopt compat_sys_getsockopt | 367 | 542 x32 getsockopt compat_sys_getsockopt |
| 367 | 543 x32 io_setup compat_sys_io_setup | 368 | 543 x32 io_setup compat_sys_io_setup |
| 368 | 544 x32 io_submit compat_sys_io_submit | 369 | 544 x32 io_submit compat_sys_io_submit |
| 370 | 545 x32 execveat stub_x32_execveat | ||
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070ca..20c3649d0691 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #define stub_fork sys_fork | 31 | #define stub_fork sys_fork |
| 32 | #define stub_vfork sys_vfork | 32 | #define stub_vfork sys_vfork |
| 33 | #define stub_execve sys_execve | 33 | #define stub_execve sys_execve |
| 34 | #define stub_execveat sys_execveat | ||
| 34 | #define stub_rt_sigreturn sys_rt_sigreturn | 35 | #define stub_rt_sigreturn sys_rt_sigreturn |
| 35 | 36 | ||
| 36 | #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) | 37 | #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) |
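The table and stub changes above wire the new execveat() system call into the sparc and x86 syscall tables, audit classifiers and UML. Purely as a hypothetical userspace illustration (not part of the patch), the call can be issued through syscall(2); older libc headers may not define __NR_execveat, so the fallback below uses the x86-64 number 322 from the table above:

	/* Hypothetical example: execute a binary through an O_PATH fd via
	 * execveat(). Minimal error handling, x86-64 syscall number assumed.
	 */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_execveat
	#define __NR_execveat 322		/* x86-64, from syscall_64.tbl above */
	#endif

	int main(void)
	{
		int fd = open("/bin/echo", O_PATH | O_CLOEXEC);
		char *argv[] = { "echo", "hello from execveat", NULL };
		char *envp[] = { NULL };

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* AT_EMPTY_PATH + empty pathname: execute the file fd refers to */
		syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
		perror("execveat");		/* only reached on failure */
		return 1;
	}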
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7c5d87191b28..85be040a21c8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
| @@ -228,8 +228,8 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t | |||
| 228 | struct page *first_page; | 228 | struct page *first_page; |
| 229 | int ret; | 229 | int ret; |
| 230 | 230 | ||
| 231 | first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); | 231 | start_pfn = phys_index << PFN_SECTION_SHIFT; |
| 232 | start_pfn = page_to_pfn(first_page); | 232 | first_page = pfn_to_page(start_pfn); |
| 233 | 233 | ||
| 234 | switch (action) { | 234 | switch (action) { |
| 235 | case MEM_ONLINE: | 235 | case MEM_ONLINE: |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3920ee45aa59..bd8bda386e02 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
| @@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; | |||
| 44 | static unsigned int num_devices = 1; | 44 | static unsigned int num_devices = 1; |
| 45 | 45 | ||
| 46 | #define ZRAM_ATTR_RO(name) \ | 46 | #define ZRAM_ATTR_RO(name) \ |
| 47 | static ssize_t zram_attr_##name##_show(struct device *d, \ | 47 | static ssize_t name##_show(struct device *d, \ |
| 48 | struct device_attribute *attr, char *b) \ | 48 | struct device_attribute *attr, char *b) \ |
| 49 | { \ | 49 | { \ |
| 50 | struct zram *zram = dev_to_zram(d); \ | 50 | struct zram *zram = dev_to_zram(d); \ |
| 51 | return scnprintf(b, PAGE_SIZE, "%llu\n", \ | 51 | return scnprintf(b, PAGE_SIZE, "%llu\n", \ |
| 52 | (u64)atomic64_read(&zram->stats.name)); \ | 52 | (u64)atomic64_read(&zram->stats.name)); \ |
| 53 | } \ | 53 | } \ |
| 54 | static struct device_attribute dev_attr_##name = \ | 54 | static DEVICE_ATTR_RO(name); |
| 55 | __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); | ||
| 56 | 55 | ||
| 57 | static inline int init_done(struct zram *zram) | 56 | static inline int init_done(struct zram *zram) |
| 58 | { | 57 | { |
| @@ -287,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec) | |||
| 287 | /* | 286 | /* |
| 288 | * Check if request is within bounds and aligned on zram logical blocks. | 287 | * Check if request is within bounds and aligned on zram logical blocks. |
| 289 | */ | 288 | */ |
| 290 | static inline int valid_io_request(struct zram *zram, struct bio *bio) | 289 | static inline int valid_io_request(struct zram *zram, |
| 290 | sector_t start, unsigned int size) | ||
| 291 | { | 291 | { |
| 292 | u64 start, end, bound; | 292 | u64 end, bound; |
| 293 | 293 | ||
| 294 | /* unaligned request */ | 294 | /* unaligned request */ |
| 295 | if (unlikely(bio->bi_iter.bi_sector & | 295 | if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) |
| 296 | (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) | ||
| 297 | return 0; | 296 | return 0; |
| 298 | if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) | 297 | if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) |
| 299 | return 0; | 298 | return 0; |
| 300 | 299 | ||
| 301 | start = bio->bi_iter.bi_sector; | 300 | end = start + (size >> SECTOR_SHIFT); |
| 302 | end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); | ||
| 303 | bound = zram->disksize >> SECTOR_SHIFT; | 301 | bound = zram->disksize >> SECTOR_SHIFT; |
| 304 | /* out of range range */ | 302 | /* out of range range */ |
| 305 | if (unlikely(start >= bound || end > bound || start > end)) | 303 | if (unlikely(start >= bound || end > bound || start > end)) |
| @@ -453,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) | |||
| 453 | } | 451 | } |
| 454 | 452 | ||
| 455 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, | 453 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, |
| 456 | u32 index, int offset, struct bio *bio) | 454 | u32 index, int offset) |
| 457 | { | 455 | { |
| 458 | int ret; | 456 | int ret; |
| 459 | struct page *page; | 457 | struct page *page; |
| @@ -645,14 +643,13 @@ out: | |||
| 645 | } | 643 | } |
| 646 | 644 | ||
| 647 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | 645 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, |
| 648 | int offset, struct bio *bio) | 646 | int offset, int rw) |
| 649 | { | 647 | { |
| 650 | int ret; | 648 | int ret; |
| 651 | int rw = bio_data_dir(bio); | ||
| 652 | 649 | ||
| 653 | if (rw == READ) { | 650 | if (rw == READ) { |
| 654 | atomic64_inc(&zram->stats.num_reads); | 651 | atomic64_inc(&zram->stats.num_reads); |
| 655 | ret = zram_bvec_read(zram, bvec, index, offset, bio); | 652 | ret = zram_bvec_read(zram, bvec, index, offset); |
| 656 | } else { | 653 | } else { |
| 657 | atomic64_inc(&zram->stats.num_writes); | 654 | atomic64_inc(&zram->stats.num_writes); |
| 658 | ret = zram_bvec_write(zram, bvec, index, offset); | 655 | ret = zram_bvec_write(zram, bvec, index, offset); |
| @@ -853,7 +850,7 @@ out: | |||
| 853 | 850 | ||
| 854 | static void __zram_make_request(struct zram *zram, struct bio *bio) | 851 | static void __zram_make_request(struct zram *zram, struct bio *bio) |
| 855 | { | 852 | { |
| 856 | int offset; | 853 | int offset, rw; |
| 857 | u32 index; | 854 | u32 index; |
| 858 | struct bio_vec bvec; | 855 | struct bio_vec bvec; |
| 859 | struct bvec_iter iter; | 856 | struct bvec_iter iter; |
| @@ -868,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
| 868 | return; | 865 | return; |
| 869 | } | 866 | } |
| 870 | 867 | ||
| 868 | rw = bio_data_dir(bio); | ||
| 871 | bio_for_each_segment(bvec, bio, iter) { | 869 | bio_for_each_segment(bvec, bio, iter) { |
| 872 | int max_transfer_size = PAGE_SIZE - offset; | 870 | int max_transfer_size = PAGE_SIZE - offset; |
| 873 | 871 | ||
| @@ -882,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
| 882 | bv.bv_len = max_transfer_size; | 880 | bv.bv_len = max_transfer_size; |
| 883 | bv.bv_offset = bvec.bv_offset; | 881 | bv.bv_offset = bvec.bv_offset; |
| 884 | 882 | ||
| 885 | if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) | 883 | if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) |
| 886 | goto out; | 884 | goto out; |
| 887 | 885 | ||
| 888 | bv.bv_len = bvec.bv_len - max_transfer_size; | 886 | bv.bv_len = bvec.bv_len - max_transfer_size; |
| 889 | bv.bv_offset += max_transfer_size; | 887 | bv.bv_offset += max_transfer_size; |
| 890 | if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) | 888 | if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) |
| 891 | goto out; | 889 | goto out; |
| 892 | } else | 890 | } else |
| 893 | if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) | 891 | if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) |
| 894 | goto out; | 892 | goto out; |
| 895 | 893 | ||
| 896 | update_position(&index, &offset, &bvec); | 894 | update_position(&index, &offset, &bvec); |
| @@ -915,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) | |||
| 915 | if (unlikely(!init_done(zram))) | 913 | if (unlikely(!init_done(zram))) |
| 916 | goto error; | 914 | goto error; |
| 917 | 915 | ||
| 918 | if (!valid_io_request(zram, bio)) { | 916 | if (!valid_io_request(zram, bio->bi_iter.bi_sector, |
| 917 | bio->bi_iter.bi_size)) { | ||
| 919 | atomic64_inc(&zram->stats.invalid_io); | 918 | atomic64_inc(&zram->stats.invalid_io); |
| 920 | goto error; | 919 | goto error; |
| 921 | } | 920 | } |
| @@ -945,25 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev, | |||
| 945 | atomic64_inc(&zram->stats.notify_free); | 944 | atomic64_inc(&zram->stats.notify_free); |
| 946 | } | 945 | } |
| 947 | 946 | ||
| 947 | static int zram_rw_page(struct block_device *bdev, sector_t sector, | ||
| 948 | struct page *page, int rw) | ||
| 949 | { | ||
| 950 | int offset, err; | ||
| 951 | u32 index; | ||
| 952 | struct zram *zram; | ||
| 953 | struct bio_vec bv; | ||
| 954 | |||
| 955 | zram = bdev->bd_disk->private_data; | ||
| 956 | if (!valid_io_request(zram, sector, PAGE_SIZE)) { | ||
| 957 | atomic64_inc(&zram->stats.invalid_io); | ||
| 958 | return -EINVAL; | ||
| 959 | } | ||
| 960 | |||
| 961 | down_read(&zram->init_lock); | ||
| 962 | if (unlikely(!init_done(zram))) { | ||
| 963 | err = -EIO; | ||
| 964 | goto out_unlock; | ||
| 965 | } | ||
| 966 | |||
| 967 | index = sector >> SECTORS_PER_PAGE_SHIFT; | ||
| 968 | offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; | ||
| 969 | |||
| 970 | bv.bv_page = page; | ||
| 971 | bv.bv_len = PAGE_SIZE; | ||
| 972 | bv.bv_offset = 0; | ||
| 973 | |||
| 974 | err = zram_bvec_rw(zram, &bv, index, offset, rw); | ||
| 975 | out_unlock: | ||
| 976 | up_read(&zram->init_lock); | ||
| 977 | /* | ||
| 978 | * If I/O fails, just return error(ie, non-zero) without | ||
| 979 | * calling page_endio. | ||
| 980 | * It causes resubmit the I/O with bio request by upper functions | ||
| 981 | * of rw_page(e.g., swap_readpage, __swap_writepage) and | ||
| 982 | * bio->bi_end_io does things to handle the error | ||
| 983 | * (e.g., SetPageError, set_page_dirty and extra works). | ||
| 984 | */ | ||
| 985 | if (err == 0) | ||
| 986 | page_endio(page, rw, 0); | ||
| 987 | return err; | ||
| 988 | } | ||
| 989 | |||
| 948 | static const struct block_device_operations zram_devops = { | 990 | static const struct block_device_operations zram_devops = { |
| 949 | .swap_slot_free_notify = zram_slot_free_notify, | 991 | .swap_slot_free_notify = zram_slot_free_notify, |
| 992 | .rw_page = zram_rw_page, | ||
| 950 | .owner = THIS_MODULE | 993 | .owner = THIS_MODULE |
| 951 | }; | 994 | }; |
| 952 | 995 | ||
| 953 | static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, | 996 | static DEVICE_ATTR_RW(disksize); |
| 954 | disksize_show, disksize_store); | 997 | static DEVICE_ATTR_RO(initstate); |
| 955 | static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); | 998 | static DEVICE_ATTR_WO(reset); |
| 956 | static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); | 999 | static DEVICE_ATTR_RO(orig_data_size); |
| 957 | static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); | 1000 | static DEVICE_ATTR_RO(mem_used_total); |
| 958 | static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); | 1001 | static DEVICE_ATTR_RW(mem_limit); |
| 959 | static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, | 1002 | static DEVICE_ATTR_RW(mem_used_max); |
| 960 | mem_limit_store); | 1003 | static DEVICE_ATTR_RW(max_comp_streams); |
| 961 | static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, | 1004 | static DEVICE_ATTR_RW(comp_algorithm); |
| 962 | mem_used_max_store); | ||
| 963 | static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, | ||
| 964 | max_comp_streams_show, max_comp_streams_store); | ||
| 965 | static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, | ||
| 966 | comp_algorithm_show, comp_algorithm_store); | ||
| 967 | 1005 | ||
| 968 | ZRAM_ATTR_RO(num_reads); | 1006 | ZRAM_ATTR_RO(num_reads); |
| 969 | ZRAM_ATTR_RO(num_writes); | 1007 | ZRAM_ATTR_RO(num_writes); |
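The zram hunk above swaps the open-coded __ATTR()/DEVICE_ATTR() initializers for the DEVICE_ATTR_RO/RW/WO helpers, which works because the show/store callbacks already follow the <name>_show/<name>_store naming convention those macros expect. Roughly, and only as a sketch of the driver-core macros rather than text from this patch, DEVICE_ATTR_RO(initstate) expands along these lines:

	/* Approximate expansion of DEVICE_ATTR_RO(initstate); the real
	 * definitions live in <linux/device.h> and <linux/sysfs.h>.
	 */
	static ssize_t initstate_show(struct device *dev,
				      struct device_attribute *attr, char *buf);

	static struct device_attribute dev_attr_initstate = {
		.attr	= { .name = "initstate", .mode = 0444 },  /* S_IRUGO */
		.show	= initstate_show,
		/* .store stays NULL for a read-only attribute */
	};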
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h | |||
| @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; | |||
| 66 | /* Flags for zram pages (table[page_no].value) */ | 66 | /* Flags for zram pages (table[page_no].value) */ |
| 67 | enum zram_pageflags { | 67 | enum zram_pageflags { |
| 68 | /* Page consists entirely of zeros */ | 68 | /* Page consists entirely of zeros */ |
| 69 | ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, | 69 | ZRAM_ZERO = ZRAM_FLAG_SHIFT, |
| 70 | ZRAM_ACCESS, /* page in now accessed */ | 70 | ZRAM_ACCESS, /* page is now accessed */ |
| 71 | 71 | ||
| 72 | __NR_ZRAM_PAGEFLAGS, | 72 | __NR_ZRAM_PAGEFLAGS, |
| 73 | }; | 73 | }; |
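The flag renumbering above relies on how zram packs each entry: table[].value holds the compressed object size in its low ZRAM_FLAG_SHIFT bits and the zram_pageflags starting at bit ZRAM_FLAG_SHIFT, so the old "+ 1" simply wasted a bit. A simplified sketch of that packing, modelled on the accessors in zram_drv.c (helper names here are illustrative):

	/* Illustrative accessors for the packed table[].value word:
	 *   bits [0 .. ZRAM_FLAG_SHIFT-1] : compressed object size
	 *   bits [ZRAM_FLAG_SHIFT .. ]    : zram_pageflags (ZRAM_ZERO, ...)
	 */
	static unsigned long obj_size_of(unsigned long value)
	{
		return value & (BIT(ZRAM_FLAG_SHIFT) - 1);
	}

	static bool flag_is_set(unsigned long value, enum zram_pageflags flag)
	{
		return value & BIT(flag);
	}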
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index a2d87a60c27f..bea878f8e7d3 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c | |||
| @@ -509,45 +509,67 @@ static void finish_pri_tag(struct device_state *dev_state, | |||
| 509 | spin_unlock_irqrestore(&pasid_state->lock, flags); | 509 | spin_unlock_irqrestore(&pasid_state->lock, flags); |
| 510 | } | 510 | } |
| 511 | 511 | ||
| 512 | static void handle_fault_error(struct fault *fault) | ||
| 513 | { | ||
| 514 | int status; | ||
| 515 | |||
| 516 | if (!fault->dev_state->inv_ppr_cb) { | ||
| 517 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
| 518 | return; | ||
| 519 | } | ||
| 520 | |||
| 521 | status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, | ||
| 522 | fault->pasid, | ||
| 523 | fault->address, | ||
| 524 | fault->flags); | ||
| 525 | switch (status) { | ||
| 526 | case AMD_IOMMU_INV_PRI_RSP_SUCCESS: | ||
| 527 | set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); | ||
| 528 | break; | ||
| 529 | case AMD_IOMMU_INV_PRI_RSP_INVALID: | ||
| 530 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
| 531 | break; | ||
| 532 | case AMD_IOMMU_INV_PRI_RSP_FAIL: | ||
| 533 | set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); | ||
| 534 | break; | ||
| 535 | default: | ||
| 536 | BUG(); | ||
| 537 | } | ||
| 538 | } | ||
| 539 | |||
| 512 | static void do_fault(struct work_struct *work) | 540 | static void do_fault(struct work_struct *work) |
| 513 | { | 541 | { |
| 514 | struct fault *fault = container_of(work, struct fault, work); | 542 | struct fault *fault = container_of(work, struct fault, work); |
| 515 | int npages, write; | 543 | struct mm_struct *mm; |
| 516 | struct page *page; | 544 | struct vm_area_struct *vma; |
| 545 | u64 address; | ||
| 546 | int ret, write; | ||
| 517 | 547 | ||
| 518 | write = !!(fault->flags & PPR_FAULT_WRITE); | 548 | write = !!(fault->flags & PPR_FAULT_WRITE); |
| 519 | 549 | ||
| 520 | down_read(&fault->state->mm->mmap_sem); | 550 | mm = fault->state->mm; |
| 521 | npages = get_user_pages(NULL, fault->state->mm, | 551 | address = fault->address; |
| 522 | fault->address, 1, write, 0, &page, NULL); | 552 | |
| 523 | up_read(&fault->state->mm->mmap_sem); | 553 | down_read(&mm->mmap_sem); |
| 524 | 554 | vma = find_extend_vma(mm, address); | |
| 525 | if (npages == 1) { | 555 | if (!vma || address < vma->vm_start) { |
| 526 | put_page(page); | 556 | /* failed to get a vma in the right range */ |
| 527 | } else if (fault->dev_state->inv_ppr_cb) { | 557 | up_read(&mm->mmap_sem); |
| 528 | int status; | 558 | handle_fault_error(fault); |
| 529 | 559 | goto out; | |
| 530 | status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, | 560 | } |
| 531 | fault->pasid, | 561 | |
| 532 | fault->address, | 562 | ret = handle_mm_fault(mm, vma, address, write); |
| 533 | fault->flags); | 563 | if (ret & VM_FAULT_ERROR) { |
| 534 | switch (status) { | 564 | /* failed to service fault */ |
| 535 | case AMD_IOMMU_INV_PRI_RSP_SUCCESS: | 565 | up_read(&mm->mmap_sem); |
| 536 | set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); | 566 | handle_fault_error(fault); |
| 537 | break; | 567 | goto out; |
| 538 | case AMD_IOMMU_INV_PRI_RSP_INVALID: | ||
| 539 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
| 540 | break; | ||
| 541 | case AMD_IOMMU_INV_PRI_RSP_FAIL: | ||
| 542 | set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); | ||
| 543 | break; | ||
| 544 | default: | ||
| 545 | BUG(); | ||
| 546 | } | ||
| 547 | } else { | ||
| 548 | set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); | ||
| 549 | } | 568 | } |
| 550 | 569 | ||
| 570 | up_read(&mm->mmap_sem); | ||
| 571 | |||
| 572 | out: | ||
| 551 | finish_pri_tag(fault->dev_state, fault->state, fault->tag); | 573 | finish_pri_tag(fault->dev_state, fault->state, fault->tag); |
| 552 | 574 | ||
| 553 | put_pasid_state(fault->state); | 575 | put_pasid_state(fault->state); |
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index 2cd8ffe5c698..942b267c6271 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c | |||
| @@ -344,13 +344,20 @@ static int snvs_rtc_resume(struct device *dev) | |||
| 344 | 344 | ||
| 345 | return 0; | 345 | return 0; |
| 346 | } | 346 | } |
| 347 | #endif | ||
| 348 | 347 | ||
| 349 | static const struct dev_pm_ops snvs_rtc_pm_ops = { | 348 | static const struct dev_pm_ops snvs_rtc_pm_ops = { |
| 350 | .suspend_noirq = snvs_rtc_suspend, | 349 | .suspend_noirq = snvs_rtc_suspend, |
| 351 | .resume_noirq = snvs_rtc_resume, | 350 | .resume_noirq = snvs_rtc_resume, |
| 352 | }; | 351 | }; |
| 353 | 352 | ||
| 353 | #define SNVS_RTC_PM_OPS (&snvs_rtc_pm_ops) | ||
| 354 | |||
| 355 | #else | ||
| 356 | |||
| 357 | #define SNVS_RTC_PM_OPS NULL | ||
| 358 | |||
| 359 | #endif | ||
| 360 | |||
| 354 | static const struct of_device_id snvs_dt_ids[] = { | 361 | static const struct of_device_id snvs_dt_ids[] = { |
| 355 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, | 362 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, |
| 356 | { /* sentinel */ } | 363 | { /* sentinel */ } |
| @@ -361,7 +368,7 @@ static struct platform_driver snvs_rtc_driver = { | |||
| 361 | .driver = { | 368 | .driver = { |
| 362 | .name = "snvs_rtc", | 369 | .name = "snvs_rtc", |
| 363 | .owner = THIS_MODULE, | 370 | .owner = THIS_MODULE, |
| 364 | .pm = &snvs_rtc_pm_ops, | 371 | .pm = SNVS_RTC_PM_OPS, |
| 365 | .of_match_table = snvs_dt_ids, | 372 | .of_match_table = snvs_dt_ids, |
| 366 | }, | 373 | }, |
| 367 | .probe = snvs_rtc_probe, | 374 | .probe = snvs_rtc_probe, |
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ad4f5790a76f..46f8ef42559e 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c | |||
| @@ -418,7 +418,7 @@ out: | |||
| 418 | } | 418 | } |
| 419 | 419 | ||
| 420 | /* | 420 | /* |
| 421 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab | 421 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c |
| 422 | * | 422 | * |
| 423 | * 'nr_to_scan' is the number of objects to scan for freeing. | 423 | * 'nr_to_scan' is the number of objects to scan for freeing. |
| 424 | * | 424 | * |
| @@ -785,7 +785,6 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 785 | .nr_to_scan = LONG_MAX, | 785 | .nr_to_scan = LONG_MAX, |
| 786 | }; | 786 | }; |
| 787 | ret = ashmem_shrink_count(&ashmem_shrinker, &sc); | 787 | ret = ashmem_shrink_count(&ashmem_shrinker, &sc); |
| 788 | nodes_setall(sc.nodes_to_scan); | ||
| 789 | ashmem_shrink_scan(&ashmem_shrinker, &sc); | 788 | ashmem_shrink_scan(&ashmem_shrinker, &sc); |
| 790 | } | 789 | } |
| 791 | break; | 790 | break; |
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 9bca88159725..ff44ff3ff015 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h | |||
| @@ -135,8 +135,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); | |||
| 135 | extern void secs_to_datestamp(time_t secs, struct affs_date *ds); | 135 | extern void secs_to_datestamp(time_t secs, struct affs_date *ds); |
| 136 | extern umode_t prot_to_mode(u32 prot); | 136 | extern umode_t prot_to_mode(u32 prot); |
| 137 | extern void mode_to_prot(struct inode *inode); | 137 | extern void mode_to_prot(struct inode *inode); |
| 138 | __printf(3, 4) | ||
| 138 | extern void affs_error(struct super_block *sb, const char *function, | 139 | extern void affs_error(struct super_block *sb, const char *function, |
| 139 | const char *fmt, ...); | 140 | const char *fmt, ...); |
| 141 | __printf(3, 4) | ||
| 140 | extern void affs_warning(struct super_block *sb, const char *function, | 142 | extern void affs_warning(struct super_block *sb, const char *function, |
| 141 | const char *fmt, ...); | 143 | const char *fmt, ...); |
| 142 | extern bool affs_nofilenametruncate(const struct dentry *dentry); | 144 | extern bool affs_nofilenametruncate(const struct dentry *dentry); |
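The __printf(3, 4) annotations added above tell the compiler that the third parameter of affs_error()/affs_warning() is a printf-style format string whose variadic arguments start at position four, so mismatched format specifiers are reported at build time. A small hypothetical example of the same attribute on a made-up helper:

	/* __printf(n, m) is the kernel shorthand for
	 * __attribute__((format(printf, n, m))).
	 */
	__printf(2, 3)
	void demo_log(int level, const char *fmt, ...);

	static void caller(void)
	{
		/* With the annotation, gcc warns here: %d given a pointer. */
		demo_log(1, "bad value %d\n", (void *)0);
	}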
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 937ce8754b24..c852f2fa1710 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c | |||
| @@ -10,8 +10,6 @@ | |||
| 10 | 10 | ||
| 11 | #include "affs.h" | 11 | #include "affs.h" |
| 12 | 12 | ||
| 13 | static char ErrorBuffer[256]; | ||
| 14 | |||
| 15 | /* | 13 | /* |
| 16 | * Functions for accessing Amiga-FFS structures. | 14 | * Functions for accessing Amiga-FFS structures. |
| 17 | */ | 15 | */ |
| @@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode) | |||
| 444 | void | 442 | void |
| 445 | affs_error(struct super_block *sb, const char *function, const char *fmt, ...) | 443 | affs_error(struct super_block *sb, const char *function, const char *fmt, ...) |
| 446 | { | 444 | { |
| 447 | va_list args; | 445 | struct va_format vaf; |
| 448 | 446 | va_list args; | |
| 449 | va_start(args,fmt); | ||
| 450 | vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); | ||
| 451 | va_end(args); | ||
| 452 | 447 | ||
| 453 | pr_crit("error (device %s): %s(): %s\n", sb->s_id, | 448 | va_start(args, fmt); |
| 454 | function,ErrorBuffer); | 449 | vaf.fmt = fmt; |
| 450 | vaf.va = &args; | ||
| 451 | pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); | ||
| 455 | if (!(sb->s_flags & MS_RDONLY)) | 452 | if (!(sb->s_flags & MS_RDONLY)) |
| 456 | pr_warn("Remounting filesystem read-only\n"); | 453 | pr_warn("Remounting filesystem read-only\n"); |
| 457 | sb->s_flags |= MS_RDONLY; | 454 | sb->s_flags |= MS_RDONLY; |
| 455 | va_end(args); | ||
| 458 | } | 456 | } |
| 459 | 457 | ||
| 460 | void | 458 | void |
| 461 | affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) | 459 | affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) |
| 462 | { | 460 | { |
| 463 | va_list args; | 461 | struct va_format vaf; |
| 462 | va_list args; | ||
| 464 | 463 | ||
| 465 | va_start(args,fmt); | 464 | va_start(args, fmt); |
| 466 | vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); | 465 | vaf.fmt = fmt; |
| 466 | vaf.va = &args; | ||
| 467 | pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); | ||
| 467 | va_end(args); | 468 | va_end(args); |
| 468 | |||
| 469 | pr_warn("(device %s): %s(): %s\n", sb->s_id, | ||
| 470 | function,ErrorBuffer); | ||
| 471 | } | 469 | } |
| 472 | 470 | ||
| 473 | bool | 471 | bool |
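The affs_error()/affs_warning() rewrite above drops the shared static ErrorBuffer, which concurrent callers could race on and which silently truncated messages longer than 256 bytes, in favour of printk's %pV extension. The idiom, as a rough sketch with a hypothetical helper name:

    static void my_warn(struct super_block *sb, const char *function,
                        const char *fmt, ...)
    {
            struct va_format vaf = { .fmt = fmt };
            va_list args;

            va_start(args, fmt);
            vaf.va = &args;
            /* %pV expands the (fmt, va_list) pair inside vsnprintf itself,
             * so no shared intermediate buffer is needed */
            pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf);
            va_end(args);
    }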
diff --git a/fs/affs/file.c b/fs/affs/file.c index 1ed590aafecf..8faa6593ca6d 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c | |||
| @@ -12,35 +12,10 @@ | |||
| 12 | * affs regular file handling primitives | 12 | * affs regular file handling primitives |
| 13 | */ | 13 | */ |
| 14 | 14 | ||
| 15 | #include <linux/aio.h> | ||
| 15 | #include "affs.h" | 16 | #include "affs.h" |
| 16 | 17 | ||
| 17 | #if PAGE_SIZE < 4096 | ||
| 18 | #error PAGE_SIZE must be at least 4096 | ||
| 19 | #endif | ||
| 20 | |||
| 21 | static int affs_grow_extcache(struct inode *inode, u32 lc_idx); | ||
| 22 | static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); | ||
| 23 | static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); | ||
| 24 | static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); | 18 | static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); |
| 25 | static int affs_file_open(struct inode *inode, struct file *filp); | ||
| 26 | static int affs_file_release(struct inode *inode, struct file *filp); | ||
| 27 | |||
| 28 | const struct file_operations affs_file_operations = { | ||
| 29 | .llseek = generic_file_llseek, | ||
| 30 | .read = new_sync_read, | ||
| 31 | .read_iter = generic_file_read_iter, | ||
| 32 | .write = new_sync_write, | ||
| 33 | .write_iter = generic_file_write_iter, | ||
| 34 | .mmap = generic_file_mmap, | ||
| 35 | .open = affs_file_open, | ||
| 36 | .release = affs_file_release, | ||
| 37 | .fsync = affs_file_fsync, | ||
| 38 | .splice_read = generic_file_splice_read, | ||
| 39 | }; | ||
| 40 | |||
| 41 | const struct inode_operations affs_file_inode_operations = { | ||
| 42 | .setattr = affs_notify_change, | ||
| 43 | }; | ||
| 44 | 19 | ||
| 45 | static int | 20 | static int |
| 46 | affs_file_open(struct inode *inode, struct file *filp) | 21 | affs_file_open(struct inode *inode, struct file *filp) |
| @@ -355,7 +330,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul | |||
| 355 | 330 | ||
| 356 | /* store new block */ | 331 | /* store new block */ |
| 357 | if (bh_result->b_blocknr) | 332 | if (bh_result->b_blocknr) |
| 358 | affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); | 333 | affs_warning(sb, "get_block", "block already set (%lx)", |
| 334 | (unsigned long)bh_result->b_blocknr); | ||
| 359 | AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); | 335 | AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); |
| 360 | AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); | 336 | AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); |
| 361 | affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); | 337 | affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); |
| @@ -377,7 +353,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul | |||
| 377 | return 0; | 353 | return 0; |
| 378 | 354 | ||
| 379 | err_big: | 355 | err_big: |
| 380 | affs_error(inode->i_sb,"get_block","strange block request %d", block); | 356 | affs_error(inode->i_sb, "get_block", "strange block request %d", |
| 357 | (int)block); | ||
| 381 | return -EIO; | 358 | return -EIO; |
| 382 | err_ext: | 359 | err_ext: |
| 383 | // unlock cache | 360 | // unlock cache |
| @@ -412,6 +389,22 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) | |||
| 412 | } | 389 | } |
| 413 | } | 390 | } |
| 414 | 391 | ||
| 392 | static ssize_t | ||
| 393 | affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, | ||
| 394 | loff_t offset) | ||
| 395 | { | ||
| 396 | struct file *file = iocb->ki_filp; | ||
| 397 | struct address_space *mapping = file->f_mapping; | ||
| 398 | struct inode *inode = mapping->host; | ||
| 399 | size_t count = iov_iter_count(iter); | ||
| 400 | ssize_t ret; | ||
| 401 | |||
| 402 | ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); | ||
| 403 | if (ret < 0 && (rw & WRITE)) | ||
| 404 | affs_write_failed(mapping, offset + count); | ||
| 405 | return ret; | ||
| 406 | } | ||
| 407 | |||
| 415 | static int affs_write_begin(struct file *file, struct address_space *mapping, | 408 | static int affs_write_begin(struct file *file, struct address_space *mapping, |
| 416 | loff_t pos, unsigned len, unsigned flags, | 409 | loff_t pos, unsigned len, unsigned flags, |
| 417 | struct page **pagep, void **fsdata) | 410 | struct page **pagep, void **fsdata) |
| @@ -438,6 +431,7 @@ const struct address_space_operations affs_aops = { | |||
| 438 | .writepage = affs_writepage, | 431 | .writepage = affs_writepage, |
| 439 | .write_begin = affs_write_begin, | 432 | .write_begin = affs_write_begin, |
| 440 | .write_end = generic_write_end, | 433 | .write_end = generic_write_end, |
| 434 | .direct_IO = affs_direct_IO, | ||
| 441 | .bmap = _affs_bmap | 435 | .bmap = _affs_bmap |
| 442 | }; | 436 | }; |
| 443 | 437 | ||
| @@ -867,8 +861,9 @@ affs_truncate(struct inode *inode) | |||
| 867 | // lock cache | 861 | // lock cache |
| 868 | ext_bh = affs_get_extblock(inode, ext); | 862 | ext_bh = affs_get_extblock(inode, ext); |
| 869 | if (IS_ERR(ext_bh)) { | 863 | if (IS_ERR(ext_bh)) { |
| 870 | affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", | 864 | affs_warning(sb, "truncate", |
| 871 | ext, PTR_ERR(ext_bh)); | 865 | "unexpected read error for ext block %u (%ld)", |
| 866 | (unsigned int)ext, PTR_ERR(ext_bh)); | ||
| 872 | return; | 867 | return; |
| 873 | } | 868 | } |
| 874 | if (AFFS_I(inode)->i_lc) { | 869 | if (AFFS_I(inode)->i_lc) { |
| @@ -914,8 +909,9 @@ affs_truncate(struct inode *inode) | |||
| 914 | struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); | 909 | struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); |
| 915 | u32 tmp; | 910 | u32 tmp; |
| 916 | if (IS_ERR(bh)) { | 911 | if (IS_ERR(bh)) { |
| 917 | affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", | 912 | affs_warning(sb, "truncate", |
| 918 | ext, PTR_ERR(bh)); | 913 | "unexpected read error for last block %u (%ld)", |
| 914 | (unsigned int)ext, PTR_ERR(bh)); | ||
| 919 | return; | 915 | return; |
| 920 | } | 916 | } |
| 921 | tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); | 917 | tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); |
| @@ -961,3 +957,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) | |||
| 961 | mutex_unlock(&inode->i_mutex); | 957 | mutex_unlock(&inode->i_mutex); |
| 962 | return ret; | 958 | return ret; |
| 963 | } | 959 | } |
| 960 | const struct file_operations affs_file_operations = { | ||
| 961 | .llseek = generic_file_llseek, | ||
| 962 | .read = new_sync_read, | ||
| 963 | .read_iter = generic_file_read_iter, | ||
| 964 | .write = new_sync_write, | ||
| 965 | .write_iter = generic_file_write_iter, | ||
| 966 | .mmap = generic_file_mmap, | ||
| 967 | .open = affs_file_open, | ||
| 968 | .release = affs_file_release, | ||
| 969 | .fsync = affs_file_fsync, | ||
| 970 | .splice_read = generic_file_splice_read, | ||
| 971 | }; | ||
| 972 | |||
| 973 | const struct inode_operations affs_file_inode_operations = { | ||
| 974 | .setattr = affs_notify_change, | ||
| 975 | }; | ||
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index b94d1cc9cd30..edf47774b03d 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c | |||
| @@ -269,10 +269,6 @@ more: | |||
| 269 | } | 269 | } |
| 270 | ctx->pos++; | 270 | ctx->pos++; |
| 271 | goto more; | 271 | goto more; |
| 272 | |||
| 273 | befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); | ||
| 274 | |||
| 275 | return 0; | ||
| 276 | } | 272 | } |
| 277 | 273 | ||
| 278 | static struct inode * | 274 | static struct inode * |
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f37b08cea1f7..490538536cb4 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c | |||
| @@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm) | |||
| 42 | return -ENOEXEC; | 42 | return -ENOEXEC; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | /* Need to be able to load the file after exec */ | ||
| 46 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
| 47 | return -ENOENT; | ||
| 48 | |||
| 45 | allow_write_access(bprm->file); | 49 | allow_write_access(bprm->file); |
| 46 | fput(bprm->file); | 50 | fput(bprm->file); |
| 47 | bprm->file = NULL; | 51 | bprm->file = NULL; |
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 70789e198dea..c04ef1d4f18a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c | |||
| @@ -144,6 +144,10 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
| 144 | if (!fmt) | 144 | if (!fmt) |
| 145 | goto ret; | 145 | goto ret; |
| 146 | 146 | ||
| 147 | /* Need to be able to load the file after exec */ | ||
| 148 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
| 149 | return -ENOENT; | ||
| 150 | |||
| 147 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { | 151 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { |
| 148 | retval = remove_arg_zero(bprm); | 152 | retval = remove_arg_zero(bprm); |
| 149 | if (retval) | 153 | if (retval) |
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 5027a3e14922..afdf4e3cafc2 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c | |||
| @@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm) | |||
| 24 | 24 | ||
| 25 | if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) | 25 | if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) |
| 26 | return -ENOEXEC; | 26 | return -ENOEXEC; |
| 27 | |||
| 28 | /* | ||
| 29 | * If the script filename will be inaccessible after exec, typically | ||
| 30 | * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give | ||
| 31 | * up now (on the assumption that the interpreter will want to load | ||
| 32 | * this file). | ||
| 33 | */ | ||
| 34 | if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) | ||
| 35 | return -ENOENT; | ||
| 36 | |||
| 27 | /* | 37 | /* |
| 28 | * This section does the #! interpretation. | 38 | * This section does the #! interpretation. |
| 29 | * Sorta complicated, but hopefully it will work. -TYT | 39 | * Sorta complicated, but hopefully it will work. -TYT |
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 1de7294aad20..2bc2c87f35e7 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c | |||
| @@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) | |||
| 40 | static void drop_slab(void) | 40 | static void drop_slab(void) |
| 41 | { | 41 | { |
| 42 | int nr_objects; | 42 | int nr_objects; |
| 43 | struct shrink_control shrink = { | ||
| 44 | .gfp_mask = GFP_KERNEL, | ||
| 45 | }; | ||
| 46 | 43 | ||
| 47 | nodes_setall(shrink.nodes_to_scan); | ||
| 48 | do { | 44 | do { |
| 49 | nr_objects = shrink_slab(&shrink, 1000, 1000); | 45 | int nid; |
| 46 | |||
| 47 | nr_objects = 0; | ||
| 48 | for_each_online_node(nid) | ||
| 49 | nr_objects += shrink_node_slabs(GFP_KERNEL, nid, | ||
| 50 | 1000, 1000); | ||
| 50 | } while (nr_objects > 10); | 51 | } while (nr_objects > 10); |
| 51 | } | 52 | } |
| 52 | 53 | ||
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
| @@ -748,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages); | |||
| 748 | 748 | ||
| 749 | #endif /* CONFIG_MMU */ | 749 | #endif /* CONFIG_MMU */ |
| 750 | 750 | ||
| 751 | static struct file *do_open_exec(struct filename *name) | 751 | static struct file *do_open_execat(int fd, struct filename *name, int flags) |
| 752 | { | 752 | { |
| 753 | struct file *file; | 753 | struct file *file; |
| 754 | int err; | 754 | int err; |
| 755 | static const struct open_flags open_exec_flags = { | 755 | struct open_flags open_exec_flags = { |
| 756 | .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, | 756 | .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, |
| 757 | .acc_mode = MAY_EXEC | MAY_OPEN, | 757 | .acc_mode = MAY_EXEC | MAY_OPEN, |
| 758 | .intent = LOOKUP_OPEN, | 758 | .intent = LOOKUP_OPEN, |
| 759 | .lookup_flags = LOOKUP_FOLLOW, | 759 | .lookup_flags = LOOKUP_FOLLOW, |
| 760 | }; | 760 | }; |
| 761 | 761 | ||
| 762 | file = do_filp_open(AT_FDCWD, name, &open_exec_flags); | 762 | if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) |
| 763 | return ERR_PTR(-EINVAL); | ||
| 764 | if (flags & AT_SYMLINK_NOFOLLOW) | ||
| 765 | open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; | ||
| 766 | if (flags & AT_EMPTY_PATH) | ||
| 767 | open_exec_flags.lookup_flags |= LOOKUP_EMPTY; | ||
| 768 | |||
| 769 | file = do_filp_open(fd, name, &open_exec_flags); | ||
| 763 | if (IS_ERR(file)) | 770 | if (IS_ERR(file)) |
| 764 | goto out; | 771 | goto out; |
| 765 | 772 | ||
| @@ -770,12 +777,13 @@ static struct file *do_open_exec(struct filename *name) | |||
| 770 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 777 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
| 771 | goto exit; | 778 | goto exit; |
| 772 | 779 | ||
| 773 | fsnotify_open(file); | ||
| 774 | |||
| 775 | err = deny_write_access(file); | 780 | err = deny_write_access(file); |
| 776 | if (err) | 781 | if (err) |
| 777 | goto exit; | 782 | goto exit; |
| 778 | 783 | ||
| 784 | if (name->name[0] != '\0') | ||
| 785 | fsnotify_open(file); | ||
| 786 | |||
| 779 | out: | 787 | out: |
| 780 | return file; | 788 | return file; |
| 781 | 789 | ||
| @@ -787,7 +795,7 @@ exit: | |||
| 787 | struct file *open_exec(const char *name) | 795 | struct file *open_exec(const char *name) |
| 788 | { | 796 | { |
| 789 | struct filename tmp = { .name = name }; | 797 | struct filename tmp = { .name = name }; |
| 790 | return do_open_exec(&tmp); | 798 | return do_open_execat(AT_FDCWD, &tmp, 0); |
| 791 | } | 799 | } |
| 792 | EXPORT_SYMBOL(open_exec); | 800 | EXPORT_SYMBOL(open_exec); |
| 793 | 801 | ||
| @@ -1428,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm) | |||
| 1428 | /* | 1436 | /* |
| 1429 | * sys_execve() executes a new program. | 1437 | * sys_execve() executes a new program. |
| 1430 | */ | 1438 | */ |
| 1431 | static int do_execve_common(struct filename *filename, | 1439 | static int do_execveat_common(int fd, struct filename *filename, |
| 1432 | struct user_arg_ptr argv, | 1440 | struct user_arg_ptr argv, |
| 1433 | struct user_arg_ptr envp) | 1441 | struct user_arg_ptr envp, |
| 1442 | int flags) | ||
| 1434 | { | 1443 | { |
| 1444 | char *pathbuf = NULL; | ||
| 1435 | struct linux_binprm *bprm; | 1445 | struct linux_binprm *bprm; |
| 1436 | struct file *file; | 1446 | struct file *file; |
| 1437 | struct files_struct *displaced; | 1447 | struct files_struct *displaced; |
| @@ -1472,7 +1482,7 @@ static int do_execve_common(struct filename *filename, | |||
| 1472 | check_unsafe_exec(bprm); | 1482 | check_unsafe_exec(bprm); |
| 1473 | current->in_execve = 1; | 1483 | current->in_execve = 1; |
| 1474 | 1484 | ||
| 1475 | file = do_open_exec(filename); | 1485 | file = do_open_execat(fd, filename, flags); |
| 1476 | retval = PTR_ERR(file); | 1486 | retval = PTR_ERR(file); |
| 1477 | if (IS_ERR(file)) | 1487 | if (IS_ERR(file)) |
| 1478 | goto out_unmark; | 1488 | goto out_unmark; |
| @@ -1480,7 +1490,28 @@ static int do_execve_common(struct filename *filename, | |||
| 1480 | sched_exec(); | 1490 | sched_exec(); |
| 1481 | 1491 | ||
| 1482 | bprm->file = file; | 1492 | bprm->file = file; |
| 1483 | bprm->filename = bprm->interp = filename->name; | 1493 | if (fd == AT_FDCWD || filename->name[0] == '/') { |
| 1494 | bprm->filename = filename->name; | ||
| 1495 | } else { | ||
| 1496 | if (filename->name[0] == '\0') | ||
| 1497 | pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); | ||
| 1498 | else | ||
| 1499 | pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", | ||
| 1500 | fd, filename->name); | ||
| 1501 | if (!pathbuf) { | ||
| 1502 | retval = -ENOMEM; | ||
| 1503 | goto out_unmark; | ||
| 1504 | } | ||
| 1505 | /* | ||
| 1506 | * Record that a name derived from an O_CLOEXEC fd will be | ||
| 1507 | * inaccessible after exec. Relies on having exclusive access to | ||
| 1508 | * current->files (due to unshare_files above). | ||
| 1509 | */ | ||
| 1510 | if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) | ||
| 1511 | bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; | ||
| 1512 | bprm->filename = pathbuf; | ||
| 1513 | } | ||
| 1514 | bprm->interp = bprm->filename; | ||
| 1484 | 1515 | ||
| 1485 | retval = bprm_mm_init(bprm); | 1516 | retval = bprm_mm_init(bprm); |
| 1486 | if (retval) | 1517 | if (retval) |
| @@ -1521,6 +1552,7 @@ static int do_execve_common(struct filename *filename, | |||
| 1521 | acct_update_integrals(current); | 1552 | acct_update_integrals(current); |
| 1522 | task_numa_free(current); | 1553 | task_numa_free(current); |
| 1523 | free_bprm(bprm); | 1554 | free_bprm(bprm); |
| 1555 | kfree(pathbuf); | ||
| 1524 | putname(filename); | 1556 | putname(filename); |
| 1525 | if (displaced) | 1557 | if (displaced) |
| 1526 | put_files_struct(displaced); | 1558 | put_files_struct(displaced); |
| @@ -1538,6 +1570,7 @@ out_unmark: | |||
| 1538 | 1570 | ||
| 1539 | out_free: | 1571 | out_free: |
| 1540 | free_bprm(bprm); | 1572 | free_bprm(bprm); |
| 1573 | kfree(pathbuf); | ||
| 1541 | 1574 | ||
| 1542 | out_files: | 1575 | out_files: |
| 1543 | if (displaced) | 1576 | if (displaced) |
| @@ -1553,7 +1586,18 @@ int do_execve(struct filename *filename, | |||
| 1553 | { | 1586 | { |
| 1554 | struct user_arg_ptr argv = { .ptr.native = __argv }; | 1587 | struct user_arg_ptr argv = { .ptr.native = __argv }; |
| 1555 | struct user_arg_ptr envp = { .ptr.native = __envp }; | 1588 | struct user_arg_ptr envp = { .ptr.native = __envp }; |
| 1556 | return do_execve_common(filename, argv, envp); | 1589 | return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); |
| 1590 | } | ||
| 1591 | |||
| 1592 | int do_execveat(int fd, struct filename *filename, | ||
| 1593 | const char __user *const __user *__argv, | ||
| 1594 | const char __user *const __user *__envp, | ||
| 1595 | int flags) | ||
| 1596 | { | ||
| 1597 | struct user_arg_ptr argv = { .ptr.native = __argv }; | ||
| 1598 | struct user_arg_ptr envp = { .ptr.native = __envp }; | ||
| 1599 | |||
| 1600 | return do_execveat_common(fd, filename, argv, envp, flags); | ||
| 1557 | } | 1601 | } |
| 1558 | 1602 | ||
| 1559 | #ifdef CONFIG_COMPAT | 1603 | #ifdef CONFIG_COMPAT |
| @@ -1569,7 +1613,23 @@ static int compat_do_execve(struct filename *filename, | |||
| 1569 | .is_compat = true, | 1613 | .is_compat = true, |
| 1570 | .ptr.compat = __envp, | 1614 | .ptr.compat = __envp, |
| 1571 | }; | 1615 | }; |
| 1572 | return do_execve_common(filename, argv, envp); | 1616 | return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); |
| 1617 | } | ||
| 1618 | |||
| 1619 | static int compat_do_execveat(int fd, struct filename *filename, | ||
| 1620 | const compat_uptr_t __user *__argv, | ||
| 1621 | const compat_uptr_t __user *__envp, | ||
| 1622 | int flags) | ||
| 1623 | { | ||
| 1624 | struct user_arg_ptr argv = { | ||
| 1625 | .is_compat = true, | ||
| 1626 | .ptr.compat = __argv, | ||
| 1627 | }; | ||
| 1628 | struct user_arg_ptr envp = { | ||
| 1629 | .is_compat = true, | ||
| 1630 | .ptr.compat = __envp, | ||
| 1631 | }; | ||
| 1632 | return do_execveat_common(fd, filename, argv, envp, flags); | ||
| 1573 | } | 1633 | } |
| 1574 | #endif | 1634 | #endif |
| 1575 | 1635 | ||
| @@ -1609,6 +1669,20 @@ SYSCALL_DEFINE3(execve, | |||
| 1609 | { | 1669 | { |
| 1610 | return do_execve(getname(filename), argv, envp); | 1670 | return do_execve(getname(filename), argv, envp); |
| 1611 | } | 1671 | } |
| 1672 | |||
| 1673 | SYSCALL_DEFINE5(execveat, | ||
| 1674 | int, fd, const char __user *, filename, | ||
| 1675 | const char __user *const __user *, argv, | ||
| 1676 | const char __user *const __user *, envp, | ||
| 1677 | int, flags) | ||
| 1678 | { | ||
| 1679 | int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | ||
| 1680 | |||
| 1681 | return do_execveat(fd, | ||
| 1682 | getname_flags(filename, lookup_flags, NULL), | ||
| 1683 | argv, envp, flags); | ||
| 1684 | } | ||
| 1685 | |||
| 1612 | #ifdef CONFIG_COMPAT | 1686 | #ifdef CONFIG_COMPAT |
| 1613 | COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, | 1687 | COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, |
| 1614 | const compat_uptr_t __user *, argv, | 1688 | const compat_uptr_t __user *, argv, |
| @@ -1616,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, | |||
| 1616 | { | 1690 | { |
| 1617 | return compat_do_execve(getname(filename), argv, envp); | 1691 | return compat_do_execve(getname(filename), argv, envp); |
| 1618 | } | 1692 | } |
| 1693 | |||
| 1694 | COMPAT_SYSCALL_DEFINE5(execveat, int, fd, | ||
| 1695 | const char __user *, filename, | ||
| 1696 | const compat_uptr_t __user *, argv, | ||
| 1697 | const compat_uptr_t __user *, envp, | ||
| 1698 | int, flags) | ||
| 1699 | { | ||
| 1700 | int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; | ||
| 1701 | |||
| 1702 | return compat_do_execveat(fd, | ||
| 1703 | getname_flags(filename, lookup_flags, NULL), | ||
| 1704 | argv, envp, flags); | ||
| 1705 | } | ||
| 1619 | #endif | 1706 | #endif |
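Taken together with the binfmt_* checks earlier, the fs/exec.c changes above add an execveat() system call: an fd-relative, fexecve-style variant of execve(). A rough userspace illustration, assuming headers new enough to define __NR_execveat and AT_EMPTY_PATH; glibc has no dedicated wrapper, hence the raw syscall():

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            char *argv[] = { "true", NULL };
            char *envp[] = { NULL };
            int fd = open("/bin/true", O_RDONLY | O_CLOEXEC);

            /*
             * An empty pathname plus AT_EMPTY_PATH executes the file behind
             * fd (the fexecve() case); a relative pathname would instead be
             * resolved against fd as a directory.  For a "#!" script an
             * O_CLOEXEC fd now fails with ENOENT, per the binfmt_script.c
             * check earlier in this diff.
             */
            syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
            return 1;               /* reached only if execveat failed */
    }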
diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..64e295e8ff38 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h | |||
| @@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, | |||
| 370 | int datasync); | 370 | int datasync); |
| 371 | 371 | ||
| 372 | /* fat/inode.c */ | 372 | /* fat/inode.c */ |
| 373 | extern int fat_block_truncate_page(struct inode *inode, loff_t from); | ||
| 373 | extern void fat_attach(struct inode *inode, loff_t i_pos); | 374 | extern void fat_attach(struct inode *inode, loff_t i_pos); |
| 374 | extern void fat_detach(struct inode *inode); | 375 | extern void fat_detach(struct inode *inode); |
| 375 | extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); | 376 | extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); |
diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..8429c68e3057 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
| @@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 443 | } | 443 | } |
| 444 | 444 | ||
| 445 | if (attr->ia_valid & ATTR_SIZE) { | 445 | if (attr->ia_valid & ATTR_SIZE) { |
| 446 | error = fat_block_truncate_page(inode, attr->ia_size); | ||
| 447 | if (error) | ||
| 448 | goto out; | ||
| 446 | down_write(&MSDOS_I(inode)->truncate_lock); | 449 | down_write(&MSDOS_I(inode)->truncate_lock); |
| 447 | truncate_setsize(inode, attr->ia_size); | 450 | truncate_setsize(inode, attr->ia_size); |
| 448 | fat_truncate_blocks(inode, attr->ia_size); | 451 | fat_truncate_blocks(inode, attr->ia_size); |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d96..7b41a2dcdd76 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
| @@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) | |||
| 294 | return blocknr; | 294 | return blocknr; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | /* | ||
| 298 | * fat_block_truncate_page() zeroes out a mapping from file offset `from' | ||
| 299 | * up to the end of the block which corresponds to `from'. | ||
| 300 | * This is required during truncate to physically zeroout the tail end | ||
| 301 | * of that block so it doesn't yield old data if the file is later grown. | ||
| 302 | * Also, avoid causing failure from fsx for cases of "data past EOF" | ||
| 303 | */ | ||
| 304 | int fat_block_truncate_page(struct inode *inode, loff_t from) | ||
| 305 | { | ||
| 306 | return block_truncate_page(inode->i_mapping, from, fat_get_block); | ||
| 307 | } | ||
| 308 | |||
| 297 | static const struct address_space_operations fat_aops = { | 309 | static const struct address_space_operations fat_aops = { |
| 298 | .readpage = fat_readpage, | 310 | .readpage = fat_readpage, |
| 299 | .readpages = fat_readpages, | 311 | .readpages = fat_readpages, |
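fat_block_truncate_page(), wired into fat_setattr() above, zeroes the tail of the block containing the old EOF before the file is grown, so later reads cannot return stale on-disk data. A hypothetical userspace check of the behaviour being guaranteed (the mount point and file name are made up; this is an illustration, not a regression test from the patch):

    #define _GNU_SOURCE
    #include <assert.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            int fd = open("/mnt/vfat/testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);

            memset(buf, 0xaa, sizeof(buf));
            write(fd, "0123456789", 10);      /* dirty part of the first block */
            ftruncate(fd, sizeof(buf));       /* grow the file past the old EOF */
            pread(fd, buf, sizeof(buf), 0);
            for (int i = 10; i < (int)sizeof(buf); i++)
                    assert(buf[i] == 0);      /* stale block tail must not leak */
            close(fd);
            return 0;
    }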
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e2872b25343..5eba47f593f8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | |||
| 412 | pgoff = offset >> PAGE_SHIFT; | 412 | pgoff = offset >> PAGE_SHIFT; |
| 413 | 413 | ||
| 414 | i_size_write(inode, offset); | 414 | i_size_write(inode, offset); |
| 415 | mutex_lock(&mapping->i_mmap_mutex); | 415 | i_mmap_lock_write(mapping); |
| 416 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) | 416 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) |
| 417 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); | 417 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); |
| 418 | mutex_unlock(&mapping->i_mmap_mutex); | 418 | i_mmap_unlock_write(mapping); |
| 419 | truncate_hugepages(inode, offset); | 419 | truncate_hugepages(inode, offset); |
| 420 | return 0; | 420 | return 0; |
| 421 | } | 421 | } |
| @@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, | |||
| 472 | } | 472 | } |
| 473 | 473 | ||
| 474 | /* | 474 | /* |
| 475 | * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never | 475 | * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never |
| 476 | * be taken from reclaim -- unlike regular filesystems. This needs an | 476 | * be taken from reclaim -- unlike regular filesystems. This needs an |
| 477 | * annotation because huge_pmd_share() does an allocation under | 477 | * annotation because huge_pmd_share() does an allocation under |
| 478 | * i_mmap_mutex. | 478 | * i_mmap_rwsem. |
| 479 | */ | 479 | */ |
| 480 | static struct lock_class_key hugetlbfs_i_mmap_mutex_key; | 480 | static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; |
| 481 | 481 | ||
| 482 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, | 482 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, |
| 483 | struct inode *dir, | 483 | struct inode *dir, |
| @@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, | |||
| 495 | struct hugetlbfs_inode_info *info; | 495 | struct hugetlbfs_inode_info *info; |
| 496 | inode->i_ino = get_next_ino(); | 496 | inode->i_ino = get_next_ino(); |
| 497 | inode_init_owner(inode, dir, mode); | 497 | inode_init_owner(inode, dir, mode); |
| 498 | lockdep_set_class(&inode->i_mapping->i_mmap_mutex, | 498 | lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, |
| 499 | &hugetlbfs_i_mmap_mutex_key); | 499 | &hugetlbfs_i_mmap_rwsem_key); |
| 500 | inode->i_mapping->a_ops = &hugetlbfs_aops; | 500 | inode->i_mapping->a_ops = &hugetlbfs_aops; |
| 501 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; | 501 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; |
| 502 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 502 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
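The hugetlbfs hunks are part of converting mapping->i_mmap_mutex to the rw-semaphore mapping->i_mmap_rwsem. The i_mmap_lock_write()/i_mmap_unlock_write() helpers used above are presumably introduced elsewhere in the series as thin wrappers, along the lines of:

    static inline void i_mmap_lock_write(struct address_space *mapping)
    {
            down_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_write(struct address_space *mapping)
    {
            up_write(&mapping->i_mmap_rwsem);
    }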
diff --git a/fs/inode.c b/fs/inode.c index 2ed95f7caa4f..ad60555b4768 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
| @@ -346,7 +346,7 @@ void address_space_init_once(struct address_space *mapping) | |||
| 346 | memset(mapping, 0, sizeof(*mapping)); | 346 | memset(mapping, 0, sizeof(*mapping)); |
| 347 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); | 347 | INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); |
| 348 | spin_lock_init(&mapping->tree_lock); | 348 | spin_lock_init(&mapping->tree_lock); |
| 349 | mutex_init(&mapping->i_mmap_mutex); | 349 | init_rwsem(&mapping->i_mmap_rwsem); |
| 350 | INIT_LIST_HEAD(&mapping->private_list); | 350 | INIT_LIST_HEAD(&mapping->private_list); |
| 351 | spin_lock_init(&mapping->private_lock); | 351 | spin_lock_init(&mapping->private_lock); |
| 352 | mapping->i_mmap = RB_ROOT; | 352 | mapping->i_mmap = RB_ROOT; |
diff --git a/fs/namei.c b/fs/namei.c index db5fe86319e6..ca814165d84c 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -130,7 +130,7 @@ void final_putname(struct filename *name) | |||
| 130 | 130 | ||
| 131 | #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) | 131 | #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) |
| 132 | 132 | ||
| 133 | static struct filename * | 133 | struct filename * |
| 134 | getname_flags(const char __user *filename, int flags, int *empty) | 134 | getname_flags(const char __user *filename, int flags, int *empty) |
| 135 | { | 135 | { |
| 136 | struct filename *result, *err; | 136 | struct filename *result, *err; |
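Dropping the static from getname_flags() lets the new execveat() path in fs/exec.c resolve its pathname with LOOKUP_EMPTY. A matching declaration is presumably added to a shared header elsewhere in the patch, something like:

    extern struct filename *getname_flags(const char __user *filename,
                                          int flags, int *empty);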
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index caaaf9dfe353..44523f4a6084 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c | |||
| @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) | |||
| 69 | if (old_mask == new_mask) | 69 | if (old_mask == new_mask) |
| 70 | return; | 70 | return; |
| 71 | 71 | ||
| 72 | if (fsn_mark->i.inode) | 72 | if (fsn_mark->inode) |
| 73 | fsnotify_recalc_inode_mask(fsn_mark->i.inode); | 73 | fsnotify_recalc_inode_mask(fsn_mark->inode); |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | /* | 76 | /* |
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6ffd220eb14d..58b7cdb63da9 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c | |||
| @@ -80,7 +80,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
| 80 | return; | 80 | return; |
| 81 | 81 | ||
| 82 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); | 82 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); |
| 83 | inode = igrab(mark->i.inode); | 83 | inode = igrab(mark->inode); |
| 84 | if (inode) { | 84 | if (inode) { |
| 85 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", | 85 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", |
| 86 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, | 86 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, |
| @@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
| 112 | mflags |= FAN_MARK_IGNORED_SURV_MODIFY; | 112 | mflags |= FAN_MARK_IGNORED_SURV_MODIFY; |
| 113 | 113 | ||
| 114 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { | 114 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { |
| 115 | inode = igrab(mark->i.inode); | 115 | inode = igrab(mark->inode); |
| 116 | if (!inode) | 116 | if (!inode) |
| 117 | return; | 117 | return; |
| 118 | seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", | 118 | seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", |
| @@ -122,7 +122,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
| 122 | seq_putc(m, '\n'); | 122 | seq_putc(m, '\n'); |
| 123 | iput(inode); | 123 | iput(inode); |
| 124 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { | 124 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { |
| 125 | struct mount *mnt = real_mount(mark->m.mnt); | 125 | struct mount *mnt = real_mount(mark->mnt); |
| 126 | 126 | ||
| 127 | seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", | 127 | seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", |
| 128 | mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); | 128 | mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); |
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 41e39102743a..dd3fb0b17be7 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c | |||
| @@ -242,13 +242,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
| 242 | 242 | ||
| 243 | if (inode_node) { | 243 | if (inode_node) { |
| 244 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), | 244 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), |
| 245 | struct fsnotify_mark, i.i_list); | 245 | struct fsnotify_mark, obj_list); |
| 246 | inode_group = inode_mark->group; | 246 | inode_group = inode_mark->group; |
| 247 | } | 247 | } |
| 248 | 248 | ||
| 249 | if (vfsmount_node) { | 249 | if (vfsmount_node) { |
| 250 | vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), | 250 | vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), |
| 251 | struct fsnotify_mark, m.m_list); | 251 | struct fsnotify_mark, obj_list); |
| 252 | vfsmount_group = vfsmount_mark->group; | 252 | vfsmount_group = vfsmount_mark->group; |
| 253 | } | 253 | } |
| 254 | 254 | ||
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 3b68b0ae0a97..13a00be516d2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h | |||
| @@ -12,12 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); | |||
| 12 | /* protects reads of inode and vfsmount marks list */ | 12 | /* protects reads of inode and vfsmount marks list */ |
| 13 | extern struct srcu_struct fsnotify_mark_srcu; | 13 | extern struct srcu_struct fsnotify_mark_srcu; |
| 14 | 14 | ||
| 15 | /* Calculate mask of events for a list of marks */ | ||
| 16 | extern u32 fsnotify_recalc_mask(struct hlist_head *head); | ||
| 17 | |||
| 15 | /* compare two groups for sorting of marks lists */ | 18 | /* compare two groups for sorting of marks lists */ |
| 16 | extern int fsnotify_compare_groups(struct fsnotify_group *a, | 19 | extern int fsnotify_compare_groups(struct fsnotify_group *a, |
| 17 | struct fsnotify_group *b); | 20 | struct fsnotify_group *b); |
| 18 | 21 | ||
| 19 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, | 22 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, |
| 20 | __u32 mask); | 23 | __u32 mask); |
| 24 | /* Add mark to a proper place in mark list */ | ||
| 25 | extern int fsnotify_add_mark_list(struct hlist_head *head, | ||
| 26 | struct fsnotify_mark *mark, | ||
| 27 | int allow_dups); | ||
| 21 | /* add a mark to an inode */ | 28 | /* add a mark to an inode */ |
| 22 | extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | 29 | extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, |
| 23 | struct fsnotify_group *group, struct inode *inode, | 30 | struct fsnotify_group *group, struct inode *inode, |
| @@ -31,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
| 31 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); | 38 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); |
| 32 | /* inode specific destruction of a mark */ | 39 | /* inode specific destruction of a mark */ |
| 33 | extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); | 40 | extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); |
| 41 | /* Destroy all marks in the given list */ | ||
| 42 | extern void fsnotify_destroy_marks(struct list_head *to_free); | ||
| 43 | /* Find mark belonging to given group in the list of marks */ | ||
| 44 | extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, | ||
| 45 | struct fsnotify_group *group); | ||
| 34 | /* run the list of all marks associated with inode and flag them to be freed */ | 46 | /* run the list of all marks associated with inode and flag them to be freed */ |
| 35 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); | 47 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); |
| 36 | /* run the list of all marks associated with vfsmount and flag them to be freed */ | 48 | /* run the list of all marks associated with vfsmount and flag them to be freed */ |
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index dfbf5447eea4..3daf513ee99e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c | |||
| @@ -31,28 +31,13 @@ | |||
| 31 | #include "../internal.h" | 31 | #include "../internal.h" |
| 32 | 32 | ||
| 33 | /* | 33 | /* |
| 34 | * Recalculate the mask of events relevant to a given inode locked. | ||
| 35 | */ | ||
| 36 | static void fsnotify_recalc_inode_mask_locked(struct inode *inode) | ||
| 37 | { | ||
| 38 | struct fsnotify_mark *mark; | ||
| 39 | __u32 new_mask = 0; | ||
| 40 | |||
| 41 | assert_spin_locked(&inode->i_lock); | ||
| 42 | |||
| 43 | hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) | ||
| 44 | new_mask |= mark->mask; | ||
| 45 | inode->i_fsnotify_mask = new_mask; | ||
| 46 | } | ||
| 47 | |||
| 48 | /* | ||
| 49 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types | 34 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types |
| 50 | * any notifier is interested in hearing for this inode. | 35 | * any notifier is interested in hearing for this inode. |
| 51 | */ | 36 | */ |
| 52 | void fsnotify_recalc_inode_mask(struct inode *inode) | 37 | void fsnotify_recalc_inode_mask(struct inode *inode) |
| 53 | { | 38 | { |
| 54 | spin_lock(&inode->i_lock); | 39 | spin_lock(&inode->i_lock); |
| 55 | fsnotify_recalc_inode_mask_locked(inode); | 40 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
| 56 | spin_unlock(&inode->i_lock); | 41 | spin_unlock(&inode->i_lock); |
| 57 | 42 | ||
| 58 | __fsnotify_update_child_dentry_flags(inode); | 43 | __fsnotify_update_child_dentry_flags(inode); |
| @@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode) | |||
| 60 | 45 | ||
| 61 | void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) | 46 | void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) |
| 62 | { | 47 | { |
| 63 | struct inode *inode = mark->i.inode; | 48 | struct inode *inode = mark->inode; |
| 64 | 49 | ||
| 65 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); | 50 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); |
| 66 | assert_spin_locked(&mark->lock); | 51 | assert_spin_locked(&mark->lock); |
| 67 | 52 | ||
| 68 | spin_lock(&inode->i_lock); | 53 | spin_lock(&inode->i_lock); |
| 69 | 54 | ||
| 70 | hlist_del_init_rcu(&mark->i.i_list); | 55 | hlist_del_init_rcu(&mark->obj_list); |
| 71 | mark->i.inode = NULL; | 56 | mark->inode = NULL; |
| 72 | 57 | ||
| 73 | /* | 58 | /* |
| 74 | * this mark is now off the inode->i_fsnotify_marks list and we | 59 | * this mark is now off the inode->i_fsnotify_marks list and we |
| 75 | * hold the inode->i_lock, so this is the perfect time to update the | 60 | * hold the inode->i_lock, so this is the perfect time to update the |
| 76 | * inode->i_fsnotify_mask | 61 | * inode->i_fsnotify_mask |
| 77 | */ | 62 | */ |
| 78 | fsnotify_recalc_inode_mask_locked(inode); | 63 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
| 79 | |||
| 80 | spin_unlock(&inode->i_lock); | 64 | spin_unlock(&inode->i_lock); |
| 81 | } | 65 | } |
| 82 | 66 | ||
| @@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) | |||
| 85 | */ | 69 | */ |
| 86 | void fsnotify_clear_marks_by_inode(struct inode *inode) | 70 | void fsnotify_clear_marks_by_inode(struct inode *inode) |
| 87 | { | 71 | { |
| 88 | struct fsnotify_mark *mark, *lmark; | 72 | struct fsnotify_mark *mark; |
| 89 | struct hlist_node *n; | 73 | struct hlist_node *n; |
| 90 | LIST_HEAD(free_list); | 74 | LIST_HEAD(free_list); |
| 91 | 75 | ||
| 92 | spin_lock(&inode->i_lock); | 76 | spin_lock(&inode->i_lock); |
| 93 | hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { | 77 | hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { |
| 94 | list_add(&mark->i.free_i_list, &free_list); | 78 | list_add(&mark->free_list, &free_list); |
| 95 | hlist_del_init_rcu(&mark->i.i_list); | 79 | hlist_del_init_rcu(&mark->obj_list); |
| 96 | fsnotify_get_mark(mark); | 80 | fsnotify_get_mark(mark); |
| 97 | } | 81 | } |
| 98 | spin_unlock(&inode->i_lock); | 82 | spin_unlock(&inode->i_lock); |
| 99 | 83 | ||
| 100 | list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { | 84 | fsnotify_destroy_marks(&free_list); |
| 101 | struct fsnotify_group *group; | ||
| 102 | |||
| 103 | spin_lock(&mark->lock); | ||
| 104 | fsnotify_get_group(mark->group); | ||
| 105 | group = mark->group; | ||
| 106 | spin_unlock(&mark->lock); | ||
| 107 | |||
| 108 | fsnotify_destroy_mark(mark, group); | ||
| 109 | fsnotify_put_mark(mark); | ||
| 110 | fsnotify_put_group(group); | ||
| 111 | } | ||
| 112 | } | 85 | } |
| 113 | 86 | ||
| 114 | /* | 87 | /* |
| @@ -123,34 +96,13 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) | |||
| 123 | * given a group and inode, find the mark associated with that combination. | 96 | * given a group and inode, find the mark associated with that combination. |
| 124 | * if found take a reference to that mark and return it, else return NULL | 97 | * if found take a reference to that mark and return it, else return NULL |
| 125 | */ | 98 | */ |
| 126 | static struct fsnotify_mark *fsnotify_find_inode_mark_locked( | ||
| 127 | struct fsnotify_group *group, | ||
| 128 | struct inode *inode) | ||
| 129 | { | ||
| 130 | struct fsnotify_mark *mark; | ||
| 131 | |||
| 132 | assert_spin_locked(&inode->i_lock); | ||
| 133 | |||
| 134 | hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { | ||
| 135 | if (mark->group == group) { | ||
| 136 | fsnotify_get_mark(mark); | ||
| 137 | return mark; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | return NULL; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* | ||
| 144 | * given a group and inode, find the mark associated with that combination. | ||
| 145 | * if found take a reference to that mark and return it, else return NULL | ||
| 146 | */ | ||
| 147 | struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, | 99 | struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, |
| 148 | struct inode *inode) | 100 | struct inode *inode) |
| 149 | { | 101 | { |
| 150 | struct fsnotify_mark *mark; | 102 | struct fsnotify_mark *mark; |
| 151 | 103 | ||
| 152 | spin_lock(&inode->i_lock); | 104 | spin_lock(&inode->i_lock); |
| 153 | mark = fsnotify_find_inode_mark_locked(group, inode); | 105 | mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); |
| 154 | spin_unlock(&inode->i_lock); | 106 | spin_unlock(&inode->i_lock); |
| 155 | 107 | ||
| 156 | return mark; | 108 | return mark; |
| @@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, | |||
| 168 | assert_spin_locked(&mark->lock); | 120 | assert_spin_locked(&mark->lock); |
| 169 | 121 | ||
| 170 | if (mask && | 122 | if (mask && |
| 171 | mark->i.inode && | 123 | mark->inode && |
| 172 | !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { | 124 | !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { |
| 173 | mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; | 125 | mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; |
| 174 | inode = igrab(mark->i.inode); | 126 | inode = igrab(mark->inode); |
| 175 | /* | 127 | /* |
| 176 | * we shouldn't be able to get here if the inode wasn't | 128 | * we shouldn't be able to get here if the inode wasn't |
| 177 | * already safely held in memory. But bug in case it | 129 | * already safely held in memory. But bug in case it |
| @@ -192,9 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
| 192 | struct fsnotify_group *group, struct inode *inode, | 144 | struct fsnotify_group *group, struct inode *inode, |
| 193 | int allow_dups) | 145 | int allow_dups) |
| 194 | { | 146 | { |
| 195 | struct fsnotify_mark *lmark, *last = NULL; | 147 | int ret; |
| 196 | int ret = 0; | ||
| 197 | int cmp; | ||
| 198 | 148 | ||
| 199 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; | 149 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; |
| 200 | 150 | ||
| @@ -202,37 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
| 202 | assert_spin_locked(&mark->lock); | 152 | assert_spin_locked(&mark->lock); |
| 203 | 153 | ||
| 204 | spin_lock(&inode->i_lock); | 154 | spin_lock(&inode->i_lock); |
| 205 | 155 | mark->inode = inode; | |
| 206 | mark->i.inode = inode; | 156 | ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, |
| 207 | 157 | allow_dups); | |
| 208 | /* is mark the first mark? */ | 158 | inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); |
| 209 | if (hlist_empty(&inode->i_fsnotify_marks)) { | ||
| 210 | hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); | ||
| 211 | goto out; | ||
| 212 | } | ||
| 213 | |||
| 214 | /* should mark be in the middle of the current list? */ | ||
| 215 | hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { | ||
| 216 | last = lmark; | ||
| 217 | |||
| 218 | if ((lmark->group == group) && !allow_dups) { | ||
| 219 | ret = -EEXIST; | ||
| 220 | goto out; | ||
| 221 | } | ||
| 222 | |||
| 223 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
| 224 | if (cmp < 0) | ||
| 225 | continue; | ||
| 226 | |||
| 227 | hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); | ||
| 228 | goto out; | ||
| 229 | } | ||
| 230 | |||
| 231 | BUG_ON(last == NULL); | ||
| 232 | /* mark should be the last entry. last is the current last entry */ | ||
| 233 | hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); | ||
| 234 | out: | ||
| 235 | fsnotify_recalc_inode_mask_locked(inode); | ||
| 236 | spin_unlock(&inode->i_lock); | 159 | spin_unlock(&inode->i_lock); |
| 237 | 160 | ||
| 238 | return ret; | 161 | return ret; |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 7d888d77d59a..2cd900c2c737 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
| @@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data) | |||
| 156 | */ | 156 | */ |
| 157 | if (fsn_mark) | 157 | if (fsn_mark) |
| 158 | printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", | 158 | printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", |
| 159 | fsn_mark->group, fsn_mark->i.inode, i_mark->wd); | 159 | fsn_mark->group, fsn_mark->inode, i_mark->wd); |
| 160 | return 0; | 160 | return 0; |
| 161 | } | 161 | } |
| 162 | 162 | ||
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 283aa312d745..450648697433 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
| @@ -433,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
| 433 | if (wd == -1) { | 433 | if (wd == -1) { |
| 434 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 434 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
| 435 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 435 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
| 436 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 436 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
| 437 | goto out; | 437 | goto out; |
| 438 | } | 438 | } |
| 439 | 439 | ||
| @@ -442,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
| 442 | if (unlikely(!found_i_mark)) { | 442 | if (unlikely(!found_i_mark)) { |
| 443 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 443 | WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
| 444 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 444 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
| 445 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 445 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
| 446 | goto out; | 446 | goto out; |
| 447 | } | 447 | } |
| 448 | 448 | ||
| @@ -456,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
| 456 | "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " | 456 | "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " |
| 457 | "found_i_mark->group=%p found_i_mark->inode=%p\n", | 457 | "found_i_mark->group=%p found_i_mark->inode=%p\n", |
| 458 | __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, | 458 | __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, |
| 459 | i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, | 459 | i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, |
| 460 | found_i_mark->fsn_mark.group, | 460 | found_i_mark->fsn_mark.group, |
| 461 | found_i_mark->fsn_mark.i.inode); | 461 | found_i_mark->fsn_mark.inode); |
| 462 | goto out; | 462 | goto out; |
| 463 | } | 463 | } |
| 464 | 464 | ||
| @@ -470,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, | |||
| 470 | if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { | 470 | if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { |
| 471 | printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" | 471 | printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" |
| 472 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, | 472 | " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, |
| 473 | i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); | 473 | i_mark->fsn_mark.group, i_mark->fsn_mark.inode); |
| 474 | /* we can't really recover with bad ref cnting.. */ | 474 | /* we can't really recover with bad ref cnting.. */ |
| 475 | BUG(); | 475 | BUG(); |
| 476 | } | 476 | } |
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 34c38fabf514..92e48c70f0f0 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c | |||
| @@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) | |||
| 110 | } | 110 | } |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | /* Calculate mask of events for a list of marks */ | ||
| 114 | u32 fsnotify_recalc_mask(struct hlist_head *head) | ||
| 115 | { | ||
| 116 | u32 new_mask = 0; | ||
| 117 | struct fsnotify_mark *mark; | ||
| 118 | |||
| 119 | hlist_for_each_entry(mark, head, obj_list) | ||
| 120 | new_mask |= mark->mask; | ||
| 121 | return new_mask; | ||
| 122 | } | ||
| 123 | |||
| 113 | /* | 124 | /* |
| 114 | * Any time a mark is getting freed we end up here. | 125 | * Any time a mark is getting freed we end up here. |
| 115 | * The caller had better be holding a reference to this mark so we don't actually | 126 | * The caller had better be holding a reference to this mark so we don't actually |
| @@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, | |||
| 133 | mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; | 144 | mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; |
| 134 | 145 | ||
| 135 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { | 146 | if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { |
| 136 | inode = mark->i.inode; | 147 | inode = mark->inode; |
| 137 | fsnotify_destroy_inode_mark(mark); | 148 | fsnotify_destroy_inode_mark(mark); |
| 138 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) | 149 | } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) |
| 139 | fsnotify_destroy_vfsmount_mark(mark); | 150 | fsnotify_destroy_vfsmount_mark(mark); |
| @@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, | |||
| 150 | mutex_unlock(&group->mark_mutex); | 161 | mutex_unlock(&group->mark_mutex); |
| 151 | 162 | ||
| 152 | spin_lock(&destroy_lock); | 163 | spin_lock(&destroy_lock); |
| 153 | list_add(&mark->destroy_list, &destroy_list); | 164 | list_add(&mark->g_list, &destroy_list); |
| 154 | spin_unlock(&destroy_lock); | 165 | spin_unlock(&destroy_lock); |
| 155 | wake_up(&destroy_waitq); | 166 | wake_up(&destroy_waitq); |
| 156 | /* | 167 | /* |
| @@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, | |||
| 192 | mutex_unlock(&group->mark_mutex); | 203 | mutex_unlock(&group->mark_mutex); |
| 193 | } | 204 | } |
| 194 | 205 | ||
| 206 | /* | ||
| 207 | * Destroy all marks in the given list. The marks must be already detached from | ||
| 208 | * the original inode / vfsmount. | ||
| 209 | */ | ||
| 210 | void fsnotify_destroy_marks(struct list_head *to_free) | ||
| 211 | { | ||
| 212 | struct fsnotify_mark *mark, *lmark; | ||
| 213 | struct fsnotify_group *group; | ||
| 214 | |||
| 215 | list_for_each_entry_safe(mark, lmark, to_free, free_list) { | ||
| 216 | spin_lock(&mark->lock); | ||
| 217 | fsnotify_get_group(mark->group); | ||
| 218 | group = mark->group; | ||
| 219 | spin_unlock(&mark->lock); | ||
| 220 | |||
| 221 | fsnotify_destroy_mark(mark, group); | ||
| 222 | fsnotify_put_mark(mark); | ||
| 223 | fsnotify_put_group(group); | ||
| 224 | } | ||
| 225 | } | ||
| 226 | |||
| 195 | void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) | 227 | void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) |
| 196 | { | 228 | { |
| 197 | assert_spin_locked(&mark->lock); | 229 | assert_spin_locked(&mark->lock); |
| @@ -245,6 +277,39 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) | |||
| 245 | return -1; | 277 | return -1; |
| 246 | } | 278 | } |
| 247 | 279 | ||
| 280 | /* Add mark into proper place in given list of marks */ | ||
| 281 | int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, | ||
| 282 | int allow_dups) | ||
| 283 | { | ||
| 284 | struct fsnotify_mark *lmark, *last = NULL; | ||
| 285 | int cmp; | ||
| 286 | |||
| 287 | /* is mark the first mark? */ | ||
| 288 | if (hlist_empty(head)) { | ||
| 289 | hlist_add_head_rcu(&mark->obj_list, head); | ||
| 290 | return 0; | ||
| 291 | } | ||
| 292 | |||
| 293 | /* should mark be in the middle of the current list? */ | ||
| 294 | hlist_for_each_entry(lmark, head, obj_list) { | ||
| 295 | last = lmark; | ||
| 296 | |||
| 297 | if ((lmark->group == mark->group) && !allow_dups) | ||
| 298 | return -EEXIST; | ||
| 299 | |||
| 300 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
| 301 | if (cmp >= 0) { | ||
| 302 | hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); | ||
| 303 | return 0; | ||
| 304 | } | ||
| 305 | } | ||
| 306 | |||
| 307 | BUG_ON(last == NULL); | ||
| 308 | /* mark should be the last entry. last is the current last entry */ | ||
| 309 | hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); | ||
| 310 | return 0; | ||
| 311 | } | ||
| 312 | |||
| 248 | /* | 313 | /* |
| 249 | * Attach an initialized mark to a given group and fs object. | 314 | * Attach an initialized mark to a given group and fs object. |
| 250 | * These marks may be used for the fsnotify backend to determine which | 315 | * These marks may be used for the fsnotify backend to determine which |
| @@ -305,7 +370,7 @@ err: | |||
| 305 | spin_unlock(&mark->lock); | 370 | spin_unlock(&mark->lock); |
| 306 | 371 | ||
| 307 | spin_lock(&destroy_lock); | 372 | spin_lock(&destroy_lock); |
| 308 | list_add(&mark->destroy_list, &destroy_list); | 373 | list_add(&mark->g_list, &destroy_list); |
| 309 | spin_unlock(&destroy_lock); | 374 | spin_unlock(&destroy_lock); |
| 310 | wake_up(&destroy_waitq); | 375 | wake_up(&destroy_waitq); |
| 311 | 376 | ||
| @@ -323,6 +388,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, | |||
| 323 | } | 388 | } |
| 324 | 389 | ||
| 325 | /* | 390 | /* |
| 391 | * Given a list of marks, find the mark associated with the given group. If | ||
| 392 | * found, take a reference to that mark and return it, else return NULL. | ||
| 393 | */ | ||
| 394 | struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, | ||
| 395 | struct fsnotify_group *group) | ||
| 396 | { | ||
| 397 | struct fsnotify_mark *mark; | ||
| 398 | |||
| 399 | hlist_for_each_entry(mark, head, obj_list) { | ||
| 400 | if (mark->group == group) { | ||
| 401 | fsnotify_get_mark(mark); | ||
| 402 | return mark; | ||
| 403 | } | ||
| 404 | } | ||
| 405 | return NULL; | ||
| 406 | } | ||
| 407 | |||
| 408 | /* | ||
| 326 | * clear any marks in a group in which mark->flags & flags is true | 409 | * clear any marks in a group in which mark->flags & flags is true |
| 327 | */ | 410 | */ |
| 328 | void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, | 411 | void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, |
| @@ -352,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group) | |||
| 352 | void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) | 435 | void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) |
| 353 | { | 436 | { |
| 354 | assert_spin_locked(&old->lock); | 437 | assert_spin_locked(&old->lock); |
| 355 | new->i.inode = old->i.inode; | 438 | new->inode = old->inode; |
| 356 | new->m.mnt = old->m.mnt; | 439 | new->mnt = old->mnt; |
| 357 | if (old->group) | 440 | if (old->group) |
| 358 | fsnotify_get_group(old->group); | 441 | fsnotify_get_group(old->group); |
| 359 | new->group = old->group; | 442 | new->group = old->group; |
| @@ -386,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored) | |||
| 386 | 469 | ||
| 387 | synchronize_srcu(&fsnotify_mark_srcu); | 470 | synchronize_srcu(&fsnotify_mark_srcu); |
| 388 | 471 | ||
| 389 | list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { | 472 | list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { |
| 390 | list_del_init(&mark->destroy_list); | 473 | list_del_init(&mark->g_list); |
| 391 | fsnotify_put_mark(mark); | 474 | fsnotify_put_mark(mark); |
| 392 | } | 475 | } |
| 393 | 476 | ||
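
The helpers added above (fsnotify_add_mark_list(), fsnotify_find_mark() and fsnotify_destroy_marks()) let the inode and vfsmount mark code share one path. A rough sketch of the clear-all pattern a per-object caller follows, assuming it supplies its own lock and mark list (clear_marks_sketch is an illustrative name, not from the patch):

static void clear_marks_sketch(spinlock_t *obj_lock, struct hlist_head *head)
{
        struct fsnotify_mark *mark;
        struct hlist_node *n;
        LIST_HEAD(free_list);

        /* detach every mark from the object under the object's lock */
        spin_lock(obj_lock);
        hlist_for_each_entry_safe(mark, n, head, obj_list) {
                list_add(&mark->free_list, &free_list);
                hlist_del_init_rcu(&mark->obj_list);
                fsnotify_get_mark(mark);
        }
        spin_unlock(obj_lock);

        /* the shared helper then destroys the whole batch */
        fsnotify_destroy_marks(&free_list);
}
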
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index faefa72a11eb..326b148e623c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c | |||
| @@ -32,31 +32,20 @@ | |||
| 32 | 32 | ||
| 33 | void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) | 33 | void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) |
| 34 | { | 34 | { |
| 35 | struct fsnotify_mark *mark, *lmark; | 35 | struct fsnotify_mark *mark; |
| 36 | struct hlist_node *n; | 36 | struct hlist_node *n; |
| 37 | struct mount *m = real_mount(mnt); | 37 | struct mount *m = real_mount(mnt); |
| 38 | LIST_HEAD(free_list); | 38 | LIST_HEAD(free_list); |
| 39 | 39 | ||
| 40 | spin_lock(&mnt->mnt_root->d_lock); | 40 | spin_lock(&mnt->mnt_root->d_lock); |
| 41 | hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { | 41 | hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { |
| 42 | list_add(&mark->m.free_m_list, &free_list); | 42 | list_add(&mark->free_list, &free_list); |
| 43 | hlist_del_init_rcu(&mark->m.m_list); | 43 | hlist_del_init_rcu(&mark->obj_list); |
| 44 | fsnotify_get_mark(mark); | 44 | fsnotify_get_mark(mark); |
| 45 | } | 45 | } |
| 46 | spin_unlock(&mnt->mnt_root->d_lock); | 46 | spin_unlock(&mnt->mnt_root->d_lock); |
| 47 | 47 | ||
| 48 | list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { | 48 | fsnotify_destroy_marks(&free_list); |
| 49 | struct fsnotify_group *group; | ||
| 50 | |||
| 51 | spin_lock(&mark->lock); | ||
| 52 | fsnotify_get_group(mark->group); | ||
| 53 | group = mark->group; | ||
| 54 | spin_unlock(&mark->lock); | ||
| 55 | |||
| 56 | fsnotify_destroy_mark(mark, group); | ||
| 57 | fsnotify_put_mark(mark); | ||
| 58 | fsnotify_put_group(group); | ||
| 59 | } | ||
| 60 | } | 49 | } |
| 61 | 50 | ||
| 62 | void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) | 51 | void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) |
| @@ -65,66 +54,35 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) | |||
| 65 | } | 54 | } |
| 66 | 55 | ||
| 67 | /* | 56 | /* |
| 68 | * Recalculate the mask of events relevant to a given vfsmount locked. | ||
| 69 | */ | ||
| 70 | static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) | ||
| 71 | { | ||
| 72 | struct mount *m = real_mount(mnt); | ||
| 73 | struct fsnotify_mark *mark; | ||
| 74 | __u32 new_mask = 0; | ||
| 75 | |||
| 76 | assert_spin_locked(&mnt->mnt_root->d_lock); | ||
| 77 | |||
| 78 | hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) | ||
| 79 | new_mask |= mark->mask; | ||
| 80 | m->mnt_fsnotify_mask = new_mask; | ||
| 81 | } | ||
| 82 | |||
| 83 | /* | ||
| 84 | * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types | 57 | * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types |
| 85 | * any notifier is interested in hearing for this mount point | 58 | * any notifier is interested in hearing for this mount point |
| 86 | */ | 59 | */ |
| 87 | void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) | 60 | void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) |
| 88 | { | 61 | { |
| 62 | struct mount *m = real_mount(mnt); | ||
| 63 | |||
| 89 | spin_lock(&mnt->mnt_root->d_lock); | 64 | spin_lock(&mnt->mnt_root->d_lock); |
| 90 | fsnotify_recalc_vfsmount_mask_locked(mnt); | 65 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); |
| 91 | spin_unlock(&mnt->mnt_root->d_lock); | 66 | spin_unlock(&mnt->mnt_root->d_lock); |
| 92 | } | 67 | } |
| 93 | 68 | ||
| 94 | void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) | 69 | void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) |
| 95 | { | 70 | { |
| 96 | struct vfsmount *mnt = mark->m.mnt; | 71 | struct vfsmount *mnt = mark->mnt; |
| 72 | struct mount *m = real_mount(mnt); | ||
| 97 | 73 | ||
| 98 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); | 74 | BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); |
| 99 | assert_spin_locked(&mark->lock); | 75 | assert_spin_locked(&mark->lock); |
| 100 | 76 | ||
| 101 | spin_lock(&mnt->mnt_root->d_lock); | 77 | spin_lock(&mnt->mnt_root->d_lock); |
| 102 | 78 | ||
| 103 | hlist_del_init_rcu(&mark->m.m_list); | 79 | hlist_del_init_rcu(&mark->obj_list); |
| 104 | mark->m.mnt = NULL; | 80 | mark->mnt = NULL; |
| 105 | |||
| 106 | fsnotify_recalc_vfsmount_mask_locked(mnt); | ||
| 107 | 81 | ||
| 82 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); | ||
| 108 | spin_unlock(&mnt->mnt_root->d_lock); | 83 | spin_unlock(&mnt->mnt_root->d_lock); |
| 109 | } | 84 | } |
| 110 | 85 | ||
| 111 | static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, | ||
| 112 | struct vfsmount *mnt) | ||
| 113 | { | ||
| 114 | struct mount *m = real_mount(mnt); | ||
| 115 | struct fsnotify_mark *mark; | ||
| 116 | |||
| 117 | assert_spin_locked(&mnt->mnt_root->d_lock); | ||
| 118 | |||
| 119 | hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { | ||
| 120 | if (mark->group == group) { | ||
| 121 | fsnotify_get_mark(mark); | ||
| 122 | return mark; | ||
| 123 | } | ||
| 124 | } | ||
| 125 | return NULL; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | 86 | /* |
| 129 | * given a group and vfsmount, find the mark associated with that combination. | 87 | * given a group and vfsmount, find the mark associated with that combination. |
| 130 | * if found take a reference to that mark and return it, else return NULL | 88 | * if found take a reference to that mark and return it, else return NULL |
| @@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ | |||
| 132 | struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, | 90 | struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, |
| 133 | struct vfsmount *mnt) | 91 | struct vfsmount *mnt) |
| 134 | { | 92 | { |
| 93 | struct mount *m = real_mount(mnt); | ||
| 135 | struct fsnotify_mark *mark; | 94 | struct fsnotify_mark *mark; |
| 136 | 95 | ||
| 137 | spin_lock(&mnt->mnt_root->d_lock); | 96 | spin_lock(&mnt->mnt_root->d_lock); |
| 138 | mark = fsnotify_find_vfsmount_mark_locked(group, mnt); | 97 | mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); |
| 139 | spin_unlock(&mnt->mnt_root->d_lock); | 98 | spin_unlock(&mnt->mnt_root->d_lock); |
| 140 | 99 | ||
| 141 | return mark; | 100 | return mark; |
| @@ -151,9 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
| 151 | int allow_dups) | 110 | int allow_dups) |
| 152 | { | 111 | { |
| 153 | struct mount *m = real_mount(mnt); | 112 | struct mount *m = real_mount(mnt); |
| 154 | struct fsnotify_mark *lmark, *last = NULL; | 113 | int ret; |
| 155 | int ret = 0; | ||
| 156 | int cmp; | ||
| 157 | 114 | ||
| 158 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; | 115 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; |
| 159 | 116 | ||
| @@ -161,37 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
| 161 | assert_spin_locked(&mark->lock); | 118 | assert_spin_locked(&mark->lock); |
| 162 | 119 | ||
| 163 | spin_lock(&mnt->mnt_root->d_lock); | 120 | spin_lock(&mnt->mnt_root->d_lock); |
| 164 | 121 | mark->mnt = mnt; | |
| 165 | mark->m.mnt = mnt; | 122 | ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); |
| 166 | 123 | m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); | |
| 167 | /* is mark the first mark? */ | ||
| 168 | if (hlist_empty(&m->mnt_fsnotify_marks)) { | ||
| 169 | hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); | ||
| 170 | goto out; | ||
| 171 | } | ||
| 172 | |||
| 173 | /* should mark be in the middle of the current list? */ | ||
| 174 | hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { | ||
| 175 | last = lmark; | ||
| 176 | |||
| 177 | if ((lmark->group == group) && !allow_dups) { | ||
| 178 | ret = -EEXIST; | ||
| 179 | goto out; | ||
| 180 | } | ||
| 181 | |||
| 182 | cmp = fsnotify_compare_groups(lmark->group, mark->group); | ||
| 183 | if (cmp < 0) | ||
| 184 | continue; | ||
| 185 | |||
| 186 | hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); | ||
| 187 | goto out; | ||
| 188 | } | ||
| 189 | |||
| 190 | BUG_ON(last == NULL); | ||
| 191 | /* mark should be the last entry. last is the current last entry */ | ||
| 192 | hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); | ||
| 193 | out: | ||
| 194 | fsnotify_recalc_vfsmount_mask_locked(mnt); | ||
| 195 | spin_unlock(&mnt->mnt_root->d_lock); | 124 | spin_unlock(&mnt->mnt_root->d_lock); |
| 196 | 125 | ||
| 197 | return ret; | 126 | return ret; |
| @@ -295,6 +295,17 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | |||
| 295 | 295 | ||
| 296 | sb_start_write(inode->i_sb); | 296 | sb_start_write(inode->i_sb); |
| 297 | ret = file->f_op->fallocate(file, mode, offset, len); | 297 | ret = file->f_op->fallocate(file, mode, offset, len); |
| 298 | |||
| 299 | /* | ||
| 300 | * Create inotify and fanotify events. | ||
| 301 | * | ||
| 302 | * To keep the logic simple always create events if fallocate succeeds. | ||
| 303 | * This implies that events are even created if the file size remains | ||
| 304 | * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. | ||
| 305 | */ | ||
| 306 | if (ret == 0) | ||
| 307 | fsnotify_modify(file); | ||
| 308 | |||
| 298 | sb_end_write(inode->i_sb); | 309 | sb_end_write(inode->i_sb); |
| 299 | return ret; | 310 | return ret; |
| 300 | } | 311 | } |
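
From userspace this change is observable with inotify: a successful fallocate() now queues IN_MODIFY even when FALLOC_FL_KEEP_SIZE leaves the file size untouched. A hedged sketch, with error handling omitted and /tmp/falloc-test being just an example path:

#define _GNU_SOURCE
#include <sys/inotify.h>
#include <linux/falloc.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        int fd = open("/tmp/falloc-test", O_RDWR | O_CREAT, 0600);
        int ifd = inotify_init1(0);
        char buf[4096];

        inotify_add_watch(ifd, "/tmp/falloc-test", IN_MODIFY);

        /* the file size stays the same, yet an IN_MODIFY event is generated */
        fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);

        printf("read %zd bytes of inotify events\n", read(ifd, buf, sizeof(buf)));
        return 0;
}
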
diff --git a/fs/seq_file.c b/fs/seq_file.c index 353948ba1c5b..dbf3a59c86bb 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
| @@ -25,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size) | |||
| 25 | { | 25 | { |
| 26 | void *buf; | 26 | void *buf; |
| 27 | 27 | ||
| 28 | buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); | 28 | /* |
| 29 | * __GFP_NORETRY to avoid oom-killings with high-order allocations - | ||
| 30 | * it's better to fall back to vmalloc() than to kill things. | ||
| 31 | */ | ||
| 32 | buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); | ||
| 29 | if (!buf && size > PAGE_SIZE) | 33 | if (!buf && size > PAGE_SIZE) |
| 30 | buf = vmalloc(size); | 34 | buf = vmalloc(size); |
| 31 | return buf; | 35 | return buf; |
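
The buffer can now come from either allocator, so the matching free must handle both; seq_file does this with kvfree(). A sketch of the equivalent open-coded free (not part of this hunk):

static void seq_buf_free(const void *buf)
{
        /* kvfree() encapsulates exactly this check */
        if (unlikely(is_vmalloc_addr(buf)))
                vfree(buf);
        else
                kfree(buf);
}
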
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61f29e5ea840..576e4639ca60 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h | |||
| @@ -53,6 +53,10 @@ struct linux_binprm { | |||
| 53 | #define BINPRM_FLAGS_EXECFD_BIT 1 | 53 | #define BINPRM_FLAGS_EXECFD_BIT 1 |
| 54 | #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) | 54 | #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) |
| 55 | 55 | ||
| 56 | /* filename of the binary will be inaccessible after exec */ | ||
| 57 | #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 | ||
| 58 | #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) | ||
| 59 | |||
| 56 | /* Function parameter for binfmt->coredump */ | 60 | /* Function parameter for binfmt->coredump */ |
| 57 | struct coredump_params { | 61 | struct coredump_params { |
| 58 | const siginfo_t *siginfo; | 62 | const siginfo_t *siginfo; |
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index e1c8d080c427..34e020c23644 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | * bitmap_set(dst, pos, nbits) Set specified bit area | 45 | * bitmap_set(dst, pos, nbits) Set specified bit area |
| 46 | * bitmap_clear(dst, pos, nbits) Clear specified bit area | 46 | * bitmap_clear(dst, pos, nbits) Clear specified bit area |
| 47 | * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area | 47 | * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area |
| 48 | * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, off) as above, with an alignment offset | ||
| 48 | * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n | 49 | * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n |
| 49 | * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n | 50 | * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n |
| 50 | * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) | 51 | * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) |
| @@ -114,11 +115,36 @@ extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); | |||
| 114 | 115 | ||
| 115 | extern void bitmap_set(unsigned long *map, unsigned int start, int len); | 116 | extern void bitmap_set(unsigned long *map, unsigned int start, int len); |
| 116 | extern void bitmap_clear(unsigned long *map, unsigned int start, int len); | 117 | extern void bitmap_clear(unsigned long *map, unsigned int start, int len); |
| 117 | extern unsigned long bitmap_find_next_zero_area(unsigned long *map, | 118 | |
| 118 | unsigned long size, | 119 | extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, |
| 119 | unsigned long start, | 120 | unsigned long size, |
| 120 | unsigned int nr, | 121 | unsigned long start, |
| 121 | unsigned long align_mask); | 122 | unsigned int nr, |
| 123 | unsigned long align_mask, | ||
| 124 | unsigned long align_offset); | ||
| 125 | |||
| 126 | /** | ||
| 127 | * bitmap_find_next_zero_area - find a contiguous aligned zero area | ||
| 128 | * @map: The address to base the search on | ||
| 129 | * @size: The bitmap size in bits | ||
| 130 | * @start: The bitnumber to start searching at | ||
| 131 | * @nr: The number of zeroed bits we're looking for | ||
| 132 | * @align_mask: Alignment mask for zero area | ||
| 133 | * | ||
| 134 | * The @align_mask should be one less than a power of 2; the effect is that | ||
| 135 | * the bit offsets of all zero areas this function finds are multiples of that | ||
| 136 | * power of 2. An @align_mask of 0 means no alignment is required. | ||
| 137 | */ | ||
| 138 | static inline unsigned long | ||
| 139 | bitmap_find_next_zero_area(unsigned long *map, | ||
| 140 | unsigned long size, | ||
| 141 | unsigned long start, | ||
| 142 | unsigned int nr, | ||
| 143 | unsigned long align_mask) | ||
| 144 | { | ||
| 145 | return bitmap_find_next_zero_area_off(map, size, start, nr, | ||
| 146 | align_mask, 0); | ||
| 147 | } | ||
| 122 | 148 | ||
| 123 | extern int bitmap_scnprintf(char *buf, unsigned int len, | 149 | extern int bitmap_scnprintf(char *buf, unsigned int len, |
| 124 | const unsigned long *src, int nbits); | 150 | const unsigned long *src, int nbits); |
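
A sketch of the difference between the two entry points, using an arbitrary 64-bit map: the old call aligns the start of the found area itself, while the _off variant aligns start + align_offset, which helps allocators whose managed region does not begin on a naturally aligned boundary.

static unsigned long find_aligned_run(void)
{
        DECLARE_BITMAP(map, 64);

        bitmap_zero(map, 64);
        bitmap_set(map, 0, 3);                  /* bits 0..2 are busy */

        /* old behaviour: 8 free bits whose start is a multiple of 4 */
        (void)bitmap_find_next_zero_area(map, 64, 0, 8, 3);

        /* new: the returned start satisfies (start + 1) % 4 == 0 */
        return bitmap_find_next_zero_area_off(map, 64, 0, 8, 3, 1);
}
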
diff --git a/include/linux/compat.h b/include/linux/compat.h index e6494261eaff..7450ca2ac1fc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h | |||
| @@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); | |||
| 357 | 357 | ||
| 358 | asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, | 358 | asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, |
| 359 | const compat_uptr_t __user *envp); | 359 | const compat_uptr_t __user *envp); |
| 360 | asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, | ||
| 361 | const compat_uptr_t __user *argv, | ||
| 362 | const compat_uptr_t __user *envp, int flags); | ||
| 360 | 363 | ||
| 361 | asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, | 364 | asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, |
| 362 | compat_ulong_t __user *outp, compat_ulong_t __user *exp, | 365 | compat_ulong_t __user *outp, compat_ulong_t __user *exp, |
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index c6f996f2abb6..798fad9e420d 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | 5 | ||
| 6 | #include <linux/types.h> | 6 | #include <linux/types.h> |
| 7 | #include <linux/debugfs.h> | 7 | #include <linux/debugfs.h> |
| 8 | #include <linux/ratelimit.h> | ||
| 8 | #include <linux/atomic.h> | 9 | #include <linux/atomic.h> |
| 9 | 10 | ||
| 10 | /* | 11 | /* |
| @@ -25,14 +26,18 @@ struct fault_attr { | |||
| 25 | unsigned long reject_end; | 26 | unsigned long reject_end; |
| 26 | 27 | ||
| 27 | unsigned long count; | 28 | unsigned long count; |
| 29 | struct ratelimit_state ratelimit_state; | ||
| 30 | struct dentry *dname; | ||
| 28 | }; | 31 | }; |
| 29 | 32 | ||
| 30 | #define FAULT_ATTR_INITIALIZER { \ | 33 | #define FAULT_ATTR_INITIALIZER { \ |
| 31 | .interval = 1, \ | 34 | .interval = 1, \ |
| 32 | .times = ATOMIC_INIT(1), \ | 35 | .times = ATOMIC_INIT(1), \ |
| 33 | .require_end = ULONG_MAX, \ | 36 | .require_end = ULONG_MAX, \ |
| 34 | .stacktrace_depth = 32, \ | 37 | .stacktrace_depth = 32, \ |
| 35 | .verbose = 2, \ | 38 | .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ |
| 39 | .verbose = 2, \ | ||
| 40 | .dname = NULL, \ | ||
| 36 | } | 41 | } |
| 37 | 42 | ||
| 38 | #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER | 43 | #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER |
diff --git a/include/linux/fs.h b/include/linux/fs.h index bb29b02d9bb6..4193a0bd99b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/pid.h> | 18 | #include <linux/pid.h> |
| 19 | #include <linux/bug.h> | 19 | #include <linux/bug.h> |
| 20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
| 21 | #include <linux/rwsem.h> | ||
| 21 | #include <linux/capability.h> | 22 | #include <linux/capability.h> |
| 22 | #include <linux/semaphore.h> | 23 | #include <linux/semaphore.h> |
| 23 | #include <linux/fiemap.h> | 24 | #include <linux/fiemap.h> |
| @@ -401,7 +402,7 @@ struct address_space { | |||
| 401 | atomic_t i_mmap_writable;/* count VM_SHARED mappings */ | 402 | atomic_t i_mmap_writable;/* count VM_SHARED mappings */ |
| 402 | struct rb_root i_mmap; /* tree of private and shared mappings */ | 403 | struct rb_root i_mmap; /* tree of private and shared mappings */ |
| 403 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 404 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
| 404 | struct mutex i_mmap_mutex; /* protect tree, count, list */ | 405 | struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ |
| 405 | /* Protected by tree_lock together with the radix tree */ | 406 | /* Protected by tree_lock together with the radix tree */ |
| 406 | unsigned long nrpages; /* number of total pages */ | 407 | unsigned long nrpages; /* number of total pages */ |
| 407 | unsigned long nrshadows; /* number of shadow entries */ | 408 | unsigned long nrshadows; /* number of shadow entries */ |
| @@ -467,6 +468,26 @@ struct block_device { | |||
| 467 | 468 | ||
| 468 | int mapping_tagged(struct address_space *mapping, int tag); | 469 | int mapping_tagged(struct address_space *mapping, int tag); |
| 469 | 470 | ||
| 471 | static inline void i_mmap_lock_write(struct address_space *mapping) | ||
| 472 | { | ||
| 473 | down_write(&mapping->i_mmap_rwsem); | ||
| 474 | } | ||
| 475 | |||
| 476 | static inline void i_mmap_unlock_write(struct address_space *mapping) | ||
| 477 | { | ||
| 478 | up_write(&mapping->i_mmap_rwsem); | ||
| 479 | } | ||
| 480 | |||
| 481 | static inline void i_mmap_lock_read(struct address_space *mapping) | ||
| 482 | { | ||
| 483 | down_read(&mapping->i_mmap_rwsem); | ||
| 484 | } | ||
| 485 | |||
| 486 | static inline void i_mmap_unlock_read(struct address_space *mapping) | ||
| 487 | { | ||
| 488 | up_read(&mapping->i_mmap_rwsem); | ||
| 489 | } | ||
| 490 | |||
| 470 | /* | 491 | /* |
| 471 | * Might pages of this file be mapped into userspace? | 492 | * Might pages of this file be mapped into userspace? |
| 472 | */ | 493 | */ |
| @@ -2075,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); | |||
| 2075 | extern struct file * dentry_open(const struct path *, int, const struct cred *); | 2096 | extern struct file * dentry_open(const struct path *, int, const struct cred *); |
| 2076 | extern int filp_close(struct file *, fl_owner_t id); | 2097 | extern int filp_close(struct file *, fl_owner_t id); |
| 2077 | 2098 | ||
| 2099 | extern struct filename *getname_flags(const char __user *, int, int *); | ||
| 2078 | extern struct filename *getname(const char __user *); | 2100 | extern struct filename *getname(const char __user *); |
| 2079 | extern struct filename *getname_kernel(const char *); | 2101 | extern struct filename *getname_kernel(const char *); |
| 2080 | 2102 | ||
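
A hedged sketch (not from this patch) of an i_mmap interval-tree walk under the new helpers; whether a particular path can take the read side instead of the write side is decided by what it does under the lock:

static void walk_file_vmas(struct address_space *mapping)
{
        struct vm_area_struct *vma;

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) {
                /* inspect vma; other readers may walk the tree concurrently */
        }
        i_mmap_unlock_read(mapping);
}
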
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index ca060d7c4fa6..0f313f93c586 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h | |||
| @@ -197,24 +197,6 @@ struct fsnotify_group { | |||
| 197 | #define FSNOTIFY_EVENT_INODE 2 | 197 | #define FSNOTIFY_EVENT_INODE 2 |
| 198 | 198 | ||
| 199 | /* | 199 | /* |
| 200 | * Inode specific fields in an fsnotify_mark | ||
| 201 | */ | ||
| 202 | struct fsnotify_inode_mark { | ||
| 203 | struct inode *inode; /* inode this mark is associated with */ | ||
| 204 | struct hlist_node i_list; /* list of marks by inode->i_fsnotify_marks */ | ||
| 205 | struct list_head free_i_list; /* tmp list used when freeing this mark */ | ||
| 206 | }; | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Mount point specific fields in an fsnotify_mark | ||
| 210 | */ | ||
| 211 | struct fsnotify_vfsmount_mark { | ||
| 212 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ | ||
| 213 | struct hlist_node m_list; /* list of marks by inode->i_fsnotify_marks */ | ||
| 214 | struct list_head free_m_list; /* tmp list used when freeing this mark */ | ||
| 215 | }; | ||
| 216 | |||
| 217 | /* | ||
| 218 | * a mark is simply an object attached to an in core inode which allows an | 200 | * a mark is simply an object attached to an in core inode which allows an |
| 219 | * fsnotify listener to indicate they are either no longer interested in events | 201 | * fsnotify listener to indicate they are either no longer interested in events |
| 220 | * of a type matching mask or only interested in those events. | 202 | * of a type matching mask or only interested in those events. |
| @@ -230,11 +212,17 @@ struct fsnotify_mark { | |||
| 230 | * in kernel that found and may be using this mark. */ | 212 | * in kernel that found and may be using this mark. */ |
| 231 | atomic_t refcnt; /* active things looking at this mark */ | 213 | atomic_t refcnt; /* active things looking at this mark */ |
| 232 | struct fsnotify_group *group; /* group this mark is for */ | 214 | struct fsnotify_group *group; /* group this mark is for */ |
| 233 | struct list_head g_list; /* list of marks by group->i_fsnotify_marks */ | 215 | struct list_head g_list; /* list of marks by group->i_fsnotify_marks |
| 216 | * Also reused for queueing mark into | ||
| 217 | * destroy_list when it's waiting for | ||
| 218 | * the end of the SRCU period before it can | ||
| 219 | * be freed */ | ||
| 234 | spinlock_t lock; /* protect group and inode */ | 220 | spinlock_t lock; /* protect group and inode */ |
| 221 | struct hlist_node obj_list; /* list of marks for inode / vfsmount */ | ||
| 222 | struct list_head free_list; /* tmp list used when freeing this mark */ | ||
| 235 | union { | 223 | union { |
| 236 | struct fsnotify_inode_mark i; | 224 | struct inode *inode; /* inode this mark is associated with */ |
| 237 | struct fsnotify_vfsmount_mark m; | 225 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ |
| 238 | }; | 226 | }; |
| 239 | __u32 ignored_mask; /* events types to ignore */ | 227 | __u32 ignored_mask; /* events types to ignore */ |
| 240 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 | 228 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 |
| @@ -243,7 +231,6 @@ struct fsnotify_mark { | |||
| 243 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 | 231 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 |
| 244 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 | 232 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 |
| 245 | unsigned int flags; /* vfsmount or inode mark? */ | 233 | unsigned int flags; /* vfsmount or inode mark? */ |
| 246 | struct list_head destroy_list; | ||
| 247 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ | 234 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ |
| 248 | }; | 235 | }; |
| 249 | 236 | ||
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 07d2699cdb51..b840e3b2770d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
| @@ -110,11 +110,8 @@ struct vm_area_struct; | |||
| 110 | #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ | 110 | #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ |
| 111 | __GFP_RECLAIMABLE) | 111 | __GFP_RECLAIMABLE) |
| 112 | #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) | 112 | #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) |
| 113 | #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ | 113 | #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) |
| 114 | __GFP_HIGHMEM) | 114 | #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) |
| 115 | #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ | ||
| 116 | __GFP_HARDWALL | __GFP_HIGHMEM | \ | ||
| 117 | __GFP_MOVABLE) | ||
| 118 | #define GFP_IOFS (__GFP_IO | __GFP_FS) | 115 | #define GFP_IOFS (__GFP_IO | __GFP_FS) |
| 119 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ | 116 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ |
| 120 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ | 117 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ |
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 35e7eca4e33b..e365d5ec69cb 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h | |||
| @@ -7,15 +7,6 @@ | |||
| 7 | #include <linux/notifier.h> | 7 | #include <linux/notifier.h> |
| 8 | #include <linux/nsproxy.h> | 8 | #include <linux/nsproxy.h> |
| 9 | 9 | ||
| 10 | /* | ||
| 11 | * ipc namespace events | ||
| 12 | */ | ||
| 13 | #define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */ | ||
| 14 | #define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */ | ||
| 15 | #define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */ | ||
| 16 | |||
| 17 | #define IPCNS_CALLBACK_PRI 0 | ||
| 18 | |||
| 19 | struct user_namespace; | 10 | struct user_namespace; |
| 20 | 11 | ||
| 21 | struct ipc_ids { | 12 | struct ipc_ids { |
| @@ -38,7 +29,6 @@ struct ipc_namespace { | |||
| 38 | unsigned int msg_ctlmni; | 29 | unsigned int msg_ctlmni; |
| 39 | atomic_t msg_bytes; | 30 | atomic_t msg_bytes; |
| 40 | atomic_t msg_hdrs; | 31 | atomic_t msg_hdrs; |
| 41 | int auto_msgmni; | ||
| 42 | 32 | ||
| 43 | size_t shm_ctlmax; | 33 | size_t shm_ctlmax; |
| 44 | size_t shm_ctlall; | 34 | size_t shm_ctlall; |
| @@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns; | |||
| 77 | extern spinlock_t mq_lock; | 67 | extern spinlock_t mq_lock; |
| 78 | 68 | ||
| 79 | #ifdef CONFIG_SYSVIPC | 69 | #ifdef CONFIG_SYSVIPC |
| 80 | extern int register_ipcns_notifier(struct ipc_namespace *); | ||
| 81 | extern int cond_register_ipcns_notifier(struct ipc_namespace *); | ||
| 82 | extern void unregister_ipcns_notifier(struct ipc_namespace *); | ||
| 83 | extern int ipcns_notify(unsigned long); | ||
| 84 | extern void shm_destroy_orphaned(struct ipc_namespace *ns); | 70 | extern void shm_destroy_orphaned(struct ipc_namespace *ns); |
| 85 | #else /* CONFIG_SYSVIPC */ | 71 | #else /* CONFIG_SYSVIPC */ |
| 86 | static inline int register_ipcns_notifier(struct ipc_namespace *ns) | ||
| 87 | { return 0; } | ||
| 88 | static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) | ||
| 89 | { return 0; } | ||
| 90 | static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } | ||
| 91 | static inline int ipcns_notify(unsigned long l) { return 0; } | ||
| 92 | static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} | 72 | static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} |
| 93 | #endif /* CONFIG_SYSVIPC */ | 73 | #endif /* CONFIG_SYSVIPC */ |
| 94 | 74 | ||
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 057e95971014..e705467ddb47 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h | |||
| @@ -21,6 +21,8 @@ | |||
| 21 | #ifndef __KMEMLEAK_H | 21 | #ifndef __KMEMLEAK_H |
| 22 | #define __KMEMLEAK_H | 22 | #define __KMEMLEAK_H |
| 23 | 23 | ||
| 24 | #include <linux/slab.h> | ||
| 25 | |||
| 24 | #ifdef CONFIG_DEBUG_KMEMLEAK | 26 | #ifdef CONFIG_DEBUG_KMEMLEAK |
| 25 | 27 | ||
| 26 | extern void kmemleak_init(void) __ref; | 28 | extern void kmemleak_init(void) __ref; |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6ea9f919e888..7c95af8d552c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); | |||
| 400 | 400 | ||
| 401 | void memcg_update_array_size(int num_groups); | 401 | void memcg_update_array_size(int num_groups); |
| 402 | 402 | ||
| 403 | struct kmem_cache * | 403 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); |
| 404 | __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); | 404 | void __memcg_kmem_put_cache(struct kmem_cache *cachep); |
| 405 | 405 | ||
| 406 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); | 406 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); |
| 407 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); | 407 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); |
| @@ -492,7 +492,13 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
| 492 | if (unlikely(fatal_signal_pending(current))) | 492 | if (unlikely(fatal_signal_pending(current))) |
| 493 | return cachep; | 493 | return cachep; |
| 494 | 494 | ||
| 495 | return __memcg_kmem_get_cache(cachep, gfp); | 495 | return __memcg_kmem_get_cache(cachep); |
| 496 | } | ||
| 497 | |||
| 498 | static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
| 499 | { | ||
| 500 | if (memcg_kmem_enabled()) | ||
| 501 | __memcg_kmem_put_cache(cachep); | ||
| 496 | } | 502 | } |
| 497 | #else | 503 | #else |
| 498 | #define for_each_memcg_cache_index(_idx) \ | 504 | #define for_each_memcg_cache_index(_idx) \ |
| @@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
| 528 | { | 534 | { |
| 529 | return cachep; | 535 | return cachep; |
| 530 | } | 536 | } |
| 537 | |||
| 538 | static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
| 539 | { | ||
| 540 | } | ||
| 531 | #endif /* CONFIG_MEMCG_KMEM */ | 541 | #endif /* CONFIG_MEMCG_KMEM */ |
| 532 | #endif /* _LINUX_MEMCONTROL_H */ | 542 | #endif /* _LINUX_MEMCONTROL_H */ |
| 533 | 543 | ||
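
memcg_kmem_get_cache() is now paired with memcg_kmem_put_cache(); the reference that pins the per-memcg cache is held only for the duration of the allocation. A simplified sketch of the pairing (the real hooks live inside the slab allocators, this is not verbatim slab code):

static void *memcg_charged_alloc(struct kmem_cache *cachep, gfp_t flags)
{
        void *obj;

        /* may return a per-memcg clone of cachep for the current task */
        cachep = memcg_kmem_get_cache(cachep, flags);
        obj = kmem_cache_alloc(cachep, flags);
        /* drop the reference pinning the per-memcg cache */
        memcg_kmem_put_cache(cachep);

        return obj;
}
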
diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b337efbe533..c0a67b894c4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/bit_spinlock.h> | 19 | #include <linux/bit_spinlock.h> |
| 20 | #include <linux/shrinker.h> | 20 | #include <linux/shrinker.h> |
| 21 | #include <linux/resource.h> | 21 | #include <linux/resource.h> |
| 22 | #include <linux/page_ext.h> | ||
| 22 | 23 | ||
| 23 | struct mempolicy; | 24 | struct mempolicy; |
| 24 | struct anon_vma; | 25 | struct anon_vma; |
| @@ -2060,7 +2061,22 @@ static inline void vm_stat_account(struct mm_struct *mm, | |||
| 2060 | #endif /* CONFIG_PROC_FS */ | 2061 | #endif /* CONFIG_PROC_FS */ |
| 2061 | 2062 | ||
| 2062 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2063 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 2063 | extern void kernel_map_pages(struct page *page, int numpages, int enable); | 2064 | extern bool _debug_pagealloc_enabled; |
| 2065 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
| 2066 | |||
| 2067 | static inline bool debug_pagealloc_enabled(void) | ||
| 2068 | { | ||
| 2069 | return _debug_pagealloc_enabled; | ||
| 2070 | } | ||
| 2071 | |||
| 2072 | static inline void | ||
| 2073 | kernel_map_pages(struct page *page, int numpages, int enable) | ||
| 2074 | { | ||
| 2075 | if (!debug_pagealloc_enabled()) | ||
| 2076 | return; | ||
| 2077 | |||
| 2078 | __kernel_map_pages(page, numpages, enable); | ||
| 2079 | } | ||
| 2064 | #ifdef CONFIG_HIBERNATION | 2080 | #ifdef CONFIG_HIBERNATION |
| 2065 | extern bool kernel_page_present(struct page *page); | 2081 | extern bool kernel_page_present(struct page *page); |
| 2066 | #endif /* CONFIG_HIBERNATION */ | 2082 | #endif /* CONFIG_HIBERNATION */ |
| @@ -2094,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, | |||
| 2094 | void __user *, size_t *, loff_t *); | 2110 | void __user *, size_t *, loff_t *); |
| 2095 | #endif | 2111 | #endif |
| 2096 | 2112 | ||
| 2097 | unsigned long shrink_slab(struct shrink_control *shrink, | 2113 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, |
| 2098 | unsigned long nr_pages_scanned, | 2114 | unsigned long nr_scanned, |
| 2099 | unsigned long lru_pages); | 2115 | unsigned long nr_eligible); |
| 2100 | 2116 | ||
| 2101 | #ifndef CONFIG_MMU | 2117 | #ifndef CONFIG_MMU |
| 2102 | #define randomize_va_space 0 | 2118 | #define randomize_va_space 0 |
| @@ -2155,20 +2171,36 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, | |||
| 2155 | unsigned int pages_per_huge_page); | 2171 | unsigned int pages_per_huge_page); |
| 2156 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 2172 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
| 2157 | 2173 | ||
| 2174 | extern struct page_ext_operations debug_guardpage_ops; | ||
| 2175 | extern struct page_ext_operations page_poisoning_ops; | ||
| 2176 | |||
| 2158 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2177 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 2159 | extern unsigned int _debug_guardpage_minorder; | 2178 | extern unsigned int _debug_guardpage_minorder; |
| 2179 | extern bool _debug_guardpage_enabled; | ||
| 2160 | 2180 | ||
| 2161 | static inline unsigned int debug_guardpage_minorder(void) | 2181 | static inline unsigned int debug_guardpage_minorder(void) |
| 2162 | { | 2182 | { |
| 2163 | return _debug_guardpage_minorder; | 2183 | return _debug_guardpage_minorder; |
| 2164 | } | 2184 | } |
| 2165 | 2185 | ||
| 2186 | static inline bool debug_guardpage_enabled(void) | ||
| 2187 | { | ||
| 2188 | return _debug_guardpage_enabled; | ||
| 2189 | } | ||
| 2190 | |||
| 2166 | static inline bool page_is_guard(struct page *page) | 2191 | static inline bool page_is_guard(struct page *page) |
| 2167 | { | 2192 | { |
| 2168 | return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 2193 | struct page_ext *page_ext; |
| 2194 | |||
| 2195 | if (!debug_guardpage_enabled()) | ||
| 2196 | return false; | ||
| 2197 | |||
| 2198 | page_ext = lookup_page_ext(page); | ||
| 2199 | return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
| 2169 | } | 2200 | } |
| 2170 | #else | 2201 | #else |
| 2171 | static inline unsigned int debug_guardpage_minorder(void) { return 0; } | 2202 | static inline unsigned int debug_guardpage_minorder(void) { return 0; } |
| 2203 | static inline bool debug_guardpage_enabled(void) { return false; } | ||
| 2172 | static inline bool page_is_guard(struct page *page) { return false; } | 2204 | static inline bool page_is_guard(struct page *page) { return false; } |
| 2173 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | 2205 | #endif /* CONFIG_DEBUG_PAGEALLOC */ |
| 2174 | 2206 | ||
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf9f57529dcf..6d34aa266a8c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -10,7 +10,6 @@ | |||
| 10 | #include <linux/rwsem.h> | 10 | #include <linux/rwsem.h> |
| 11 | #include <linux/completion.h> | 11 | #include <linux/completion.h> |
| 12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
| 13 | #include <linux/page-debug-flags.h> | ||
| 14 | #include <linux/uprobes.h> | 13 | #include <linux/uprobes.h> |
| 15 | #include <linux/page-flags-layout.h> | 14 | #include <linux/page-flags-layout.h> |
| 16 | #include <asm/page.h> | 15 | #include <asm/page.h> |
| @@ -186,9 +185,6 @@ struct page { | |||
| 186 | void *virtual; /* Kernel virtual address (NULL if | 185 | void *virtual; /* Kernel virtual address (NULL if |
| 187 | not kmapped, ie. highmem) */ | 186 | not kmapped, ie. highmem) */ |
| 188 | #endif /* WANT_PAGE_VIRTUAL */ | 187 | #endif /* WANT_PAGE_VIRTUAL */ |
| 189 | #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS | ||
| 190 | unsigned long debug_flags; /* Use atomic bitops on this */ | ||
| 191 | #endif | ||
| 192 | 188 | ||
| 193 | #ifdef CONFIG_KMEMCHECK | 189 | #ifdef CONFIG_KMEMCHECK |
| 194 | /* | 190 | /* |
| @@ -534,4 +530,12 @@ enum tlb_flush_reason { | |||
| 534 | NR_TLB_FLUSH_REASONS, | 530 | NR_TLB_FLUSH_REASONS, |
| 535 | }; | 531 | }; |
| 536 | 532 | ||
| 533 | /* | ||
| 534 | * A swap entry has to fit into an "unsigned long", as the entry is hidden | ||
| 535 | * in the "index" field of the swapper address space. | ||
| 536 | */ | ||
| 537 | typedef struct { | ||
| 538 | unsigned long val; | ||
| 539 | } swp_entry_t; | ||
| 540 | |||
| 537 | #endif /* _LINUX_MM_TYPES_H */ | 541 | #endif /* _LINUX_MM_TYPES_H */ |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 88787bb4b3b9..ab8564b03468 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
| @@ -154,7 +154,7 @@ struct mmu_notifier_ops { | |||
| 154 | * Therefore notifier chains can only be traversed when either | 154 | * Therefore notifier chains can only be traversed when either |
| 155 | * | 155 | * |
| 156 | * 1. mmap_sem is held. | 156 | * 1. mmap_sem is held. |
| 157 | * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem). | 157 | * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). |
| 158 | * 3. No other concurrent thread can access the list (release) | 158 | * 3. No other concurrent thread can access the list (release) |
| 159 | */ | 159 | */ |
| 160 | struct mmu_notifier { | 160 | struct mmu_notifier { |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3879d7664dfc..2f0856d14b21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
| @@ -722,6 +722,9 @@ typedef struct pglist_data { | |||
| 722 | int nr_zones; | 722 | int nr_zones; |
| 723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ | 723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ |
| 724 | struct page *node_mem_map; | 724 | struct page *node_mem_map; |
| 725 | #ifdef CONFIG_PAGE_EXTENSION | ||
| 726 | struct page_ext *node_page_ext; | ||
| 727 | #endif | ||
| 725 | #endif | 728 | #endif |
| 726 | #ifndef CONFIG_NO_BOOTMEM | 729 | #ifndef CONFIG_NO_BOOTMEM |
| 727 | struct bootmem_data *bdata; | 730 | struct bootmem_data *bdata; |
| @@ -1075,6 +1078,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) | |||
| 1075 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) | 1078 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) |
| 1076 | 1079 | ||
| 1077 | struct page; | 1080 | struct page; |
| 1081 | struct page_ext; | ||
| 1078 | struct mem_section { | 1082 | struct mem_section { |
| 1079 | /* | 1083 | /* |
| 1080 | * This is, logically, a pointer to an array of struct | 1084 | * This is, logically, a pointer to an array of struct |
| @@ -1092,6 +1096,14 @@ struct mem_section { | |||
| 1092 | 1096 | ||
| 1093 | /* See declaration of similar field in struct zone */ | 1097 | /* See declaration of similar field in struct zone */ |
| 1094 | unsigned long *pageblock_flags; | 1098 | unsigned long *pageblock_flags; |
| 1099 | #ifdef CONFIG_PAGE_EXTENSION | ||
| 1100 | /* | ||
| 1101 | * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use | ||
| 1102 | * section. (see page_ext.h about this.) | ||
| 1103 | */ | ||
| 1104 | struct page_ext *page_ext; | ||
| 1105 | unsigned long pad; | ||
| 1106 | #endif | ||
| 1095 | /* | 1107 | /* |
| 1096 | * WARNING: mem_section must be a power-of-2 in size for the | 1108 | * WARNING: mem_section must be a power-of-2 in size for the |
| 1097 | * calculation and use of SECTION_ROOT_MASK to make sense. | 1109 | * calculation and use of SECTION_ROOT_MASK to make sense. |
diff --git a/include/linux/oom.h b/include/linux/oom.h index e8d6e1058723..853698c721f7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
| @@ -92,6 +92,17 @@ static inline bool oom_gfp_allowed(gfp_t gfp_mask) | |||
| 92 | 92 | ||
| 93 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); | 93 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); |
| 94 | 94 | ||
| 95 | static inline bool task_will_free_mem(struct task_struct *task) | ||
| 96 | { | ||
| 97 | /* | ||
| 98 | * A coredumping process may sleep for an extended period in exit_mm(), | ||
| 99 | * so the oom killer cannot assume that the process will promptly exit | ||
| 100 | * and release memory. | ||
| 101 | */ | ||
| 102 | return (task->flags & PF_EXITING) && | ||
| 103 | !(task->signal->flags & SIGNAL_GROUP_COREDUMP); | ||
| 104 | } | ||
| 105 | |||
| 95 | /* sysctls */ | 106 | /* sysctls */ |
| 96 | extern int sysctl_oom_dump_tasks; | 107 | extern int sysctl_oom_dump_tasks; |
| 97 | extern int sysctl_oom_kill_allocating_task; | 108 | extern int sysctl_oom_kill_allocating_task; |
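
A hedged sketch of how the OOM paths are expected to use the helper: an exiting task that is not stuck in a coredump is granted access to memory reserves rather than killed again (simplified; the real code in mm/oom_kill.c also handles the current-task case):

static bool oom_skip_kill(struct task_struct *task)
{
        if (task_will_free_mem(task)) {
                /* let the exiting task dip into reserves and finish */
                set_tsk_thread_flag(task, TIF_MEMDIE);
                return true;
        }
        return false;   /* fall through to a normal OOM kill */
}
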
diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h deleted file mode 100644 index 22691f614043..000000000000 --- a/include/linux/page-debug-flags.h +++ /dev/null | |||
| @@ -1,32 +0,0 @@ | |||
| 1 | #ifndef LINUX_PAGE_DEBUG_FLAGS_H | ||
| 2 | #define LINUX_PAGE_DEBUG_FLAGS_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * page->debug_flags bits: | ||
| 6 | * | ||
| 7 | * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to | ||
| 8 | * implement generic debug pagealloc feature. The pages are filled with | ||
| 9 | * poison patterns and set this flag after free_pages(). The poisoned | ||
| 10 | * pages are verified whether the patterns are not corrupted and clear | ||
| 11 | * the flag before alloc_pages(). | ||
| 12 | */ | ||
| 13 | |||
| 14 | enum page_debug_flags { | ||
| 15 | PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ | ||
| 16 | PAGE_DEBUG_FLAG_GUARD, | ||
| 17 | }; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably | ||
| 21 | * gets turned off when no debug features are enabling it! | ||
| 22 | */ | ||
| 23 | |||
| 24 | #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS | ||
| 25 | #if !defined(CONFIG_PAGE_POISONING) && \ | ||
| 26 | !defined(CONFIG_PAGE_GUARD) \ | ||
| 27 | /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ | ||
| 28 | #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! | ||
| 29 | #endif | ||
| 30 | #endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ | ||
| 31 | |||
| 32 | #endif /* LINUX_PAGE_DEBUG_FLAGS_H */ | ||
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h new file mode 100644 index 000000000000..d2a2c84c72d0 --- /dev/null +++ b/include/linux/page_ext.h | |||
| @@ -0,0 +1,84 @@ | |||
| 1 | #ifndef __LINUX_PAGE_EXT_H | ||
| 2 | #define __LINUX_PAGE_EXT_H | ||
| 3 | |||
| 4 | #include <linux/types.h> | ||
| 5 | #include <linux/stacktrace.h> | ||
| 6 | |||
| 7 | struct pglist_data; | ||
| 8 | struct page_ext_operations { | ||
| 9 | bool (*need)(void); | ||
| 10 | void (*init)(void); | ||
| 11 | }; | ||
| 12 | |||
| 13 | #ifdef CONFIG_PAGE_EXTENSION | ||
| 14 | |||
| 15 | /* | ||
| 16 | * page_ext->flags bits: | ||
| 17 | * | ||
| 18 | * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to | ||
| 19 | * implement the generic debug pagealloc feature. Pages are filled with | ||
| 20 | * poison patterns and this flag is set after free_pages(). The poisoned | ||
| 21 | * pages are checked for corrupted patterns and the flag is cleared | ||
| 22 | * before alloc_pages(). | ||
| 23 | */ | ||
| 24 | |||
| 25 | enum page_ext_flags { | ||
| 26 | PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ | ||
| 27 | PAGE_EXT_DEBUG_GUARD, | ||
| 28 | PAGE_EXT_OWNER, | ||
| 29 | }; | ||
| 30 | |||
| 31 | /* | ||
| 32 | * Page Extension can be considered an extended mem_map: | ||
| 33 | * a page_ext structure is associated with every page descriptor and | ||
| 34 | * lets us attach extra information to the page. | ||
| 35 | * All page_ext structures are allocated at boot or on memory hotplug, | ||
| 36 | * so the page_ext for a pfn always exists. | ||
| 37 | */ | ||
| 38 | struct page_ext { | ||
| 39 | unsigned long flags; | ||
| 40 | #ifdef CONFIG_PAGE_OWNER | ||
| 41 | unsigned int order; | ||
| 42 | gfp_t gfp_mask; | ||
| 43 | struct stack_trace trace; | ||
| 44 | unsigned long trace_entries[8]; | ||
| 45 | #endif | ||
| 46 | }; | ||
| 47 | |||
| 48 | extern void pgdat_page_ext_init(struct pglist_data *pgdat); | ||
| 49 | |||
| 50 | #ifdef CONFIG_SPARSEMEM | ||
| 51 | static inline void page_ext_init_flatmem(void) | ||
| 52 | { | ||
| 53 | } | ||
| 54 | extern void page_ext_init(void); | ||
| 55 | #else | ||
| 56 | extern void page_ext_init_flatmem(void); | ||
| 57 | static inline void page_ext_init(void) | ||
| 58 | { | ||
| 59 | } | ||
| 60 | #endif | ||
| 61 | |||
| 62 | struct page_ext *lookup_page_ext(struct page *page); | ||
| 63 | |||
| 64 | #else /* !CONFIG_PAGE_EXTENSION */ | ||
| 65 | struct page_ext; | ||
| 66 | |||
| 67 | static inline void pgdat_page_ext_init(struct pglist_data *pgdat) | ||
| 68 | { | ||
| 69 | } | ||
| 70 | |||
| 71 | static inline struct page_ext *lookup_page_ext(struct page *page) | ||
| 72 | { | ||
| 73 | return NULL; | ||
| 74 | } | ||
| 75 | |||
| 76 | static inline void page_ext_init(void) | ||
| 77 | { | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void page_ext_init_flatmem(void) | ||
| 81 | { | ||
| 82 | } | ||
| 83 | #endif /* CONFIG_PAGE_EXTENSION */ | ||
| 84 | #endif /* __LINUX_PAGE_EXT_H */ | ||
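
A sketch of how a client of the page extension layer hooks in; the my_feature_* names are hypothetical, and the ops are registered by listing them in the static page_ext_ops[] table in mm/page_ext.c:

static bool my_feature_need(void)
{
        /* returning false means no page_ext space is reserved for us */
        return true;
}

static void my_feature_init(void)
{
        /* runs once page_ext for every pfn has been allocated */
}

struct page_ext_operations my_feature_ops = {
        .need = my_feature_need,
        .init = my_feature_init,
};

static bool my_feature_page_poisoned(struct page *page)
{
        struct page_ext *page_ext = lookup_page_ext(page);

        return page_ext && test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
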
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h new file mode 100644 index 000000000000..b48c3471c254 --- /dev/null +++ b/include/linux/page_owner.h | |||
| @@ -0,0 +1,38 @@ | |||
| 1 | #ifndef __LINUX_PAGE_OWNER_H | ||
| 2 | #define __LINUX_PAGE_OWNER_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_PAGE_OWNER | ||
| 5 | extern bool page_owner_inited; | ||
| 6 | extern struct page_ext_operations page_owner_ops; | ||
| 7 | |||
| 8 | extern void __reset_page_owner(struct page *page, unsigned int order); | ||
| 9 | extern void __set_page_owner(struct page *page, | ||
| 10 | unsigned int order, gfp_t gfp_mask); | ||
| 11 | |||
| 12 | static inline void reset_page_owner(struct page *page, unsigned int order) | ||
| 13 | { | ||
| 14 | if (likely(!page_owner_inited)) | ||
| 15 | return; | ||
| 16 | |||
| 17 | __reset_page_owner(page, order); | ||
| 18 | } | ||
| 19 | |||
| 20 | static inline void set_page_owner(struct page *page, | ||
| 21 | unsigned int order, gfp_t gfp_mask) | ||
| 22 | { | ||
| 23 | if (likely(!page_owner_inited)) | ||
| 24 | return; | ||
| 25 | |||
| 26 | __set_page_owner(page, order, gfp_mask); | ||
| 27 | } | ||
| 28 | #else | ||
| 29 | static inline void reset_page_owner(struct page *page, unsigned int order) | ||
| 30 | { | ||
| 31 | } | ||
| 32 | static inline void set_page_owner(struct page *page, | ||
| 33 | unsigned int order, gfp_t gfp_mask) | ||
| 34 | { | ||
| 35 | } | ||
| 36 | |||
| 37 | #endif /* CONFIG_PAGE_OWNER */ | ||
| 38 | #endif /* __LINUX_PAGE_OWNER_H */ | ||
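
Roughly where the page allocator is expected to place these hooks (simplified; the function names below are sketches of the mm/page_alloc.c call sites, not verbatim code):

static void prep_new_page_sketch(struct page *page, unsigned int order,
                                 gfp_t gfp_mask)
{
        /* ... existing page preparation ... */
        set_page_owner(page, order, gfp_mask);  /* record allocation stack */
}

static void free_pages_prepare_sketch(struct page *page, unsigned int order)
{
        reset_page_owner(page, order);          /* drop stale owner info */
        /* ... existing checks and poisoning ... */
}
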
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 420032d41d27..57f3a1c550dc 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h | |||
| @@ -254,8 +254,6 @@ do { \ | |||
| 254 | #endif /* CONFIG_SMP */ | 254 | #endif /* CONFIG_SMP */ |
| 255 | 255 | ||
| 256 | #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) | 256 | #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) |
| 257 | #define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var))) | ||
| 258 | #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) | ||
| 259 | 257 | ||
| 260 | /* | 258 | /* |
| 261 | * Must be an lvalue. Since @var must be a simple identifier, | 259 | * Must be an lvalue. Since @var must be a simple identifier, |
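
The removal of __get_cpu_var()/__raw_get_cpu_var() completes the tree-wide switch to the this_cpu_*/raw_cpu_* accessors. A sketch of the mechanical rewrite, with my_counter as a hypothetical per-cpu variable:

DEFINE_PER_CPU(int, my_counter);

static void bump_counter(void)
{
        /* was: __get_cpu_var(my_counter)++;  (preemption must be disabled) */
        (*this_cpu_ptr(&my_counter))++;

        /* preferred for simple scalar updates, safe against preemption: */
        this_cpu_inc(my_counter);
}
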
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 0a260d8a18bf..18102529254e 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h | |||
| @@ -17,14 +17,20 @@ struct ratelimit_state { | |||
| 17 | unsigned long begin; | 17 | unsigned long begin; |
| 18 | }; | 18 | }; |
| 19 | 19 | ||
| 20 | #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ | 20 | #define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ |
| 21 | \ | ||
| 22 | struct ratelimit_state name = { \ | ||
| 23 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ | 21 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ |
| 24 | .interval = interval_init, \ | 22 | .interval = interval_init, \ |
| 25 | .burst = burst_init, \ | 23 | .burst = burst_init, \ |
| 26 | } | 24 | } |
| 27 | 25 | ||
| 26 | #define RATELIMIT_STATE_INIT_DISABLED \ | ||
| 27 | RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST) | ||
| 28 | |||
| 29 | #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ | ||
| 30 | \ | ||
| 31 | struct ratelimit_state name = \ | ||
| 32 | RATELIMIT_STATE_INIT(name, interval_init, burst_init) \ | ||
| 33 | |||
| 28 | static inline void ratelimit_state_init(struct ratelimit_state *rs, | 34 | static inline void ratelimit_state_init(struct ratelimit_state *rs, |
| 29 | int interval, int burst) | 35 | int interval, int burst) |
| 30 | { | 36 | { |
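
DEFINE_RATELIMIT_STATE() always declares a new variable, so it cannot initialize a ratelimit_state embedded in another object, which is what the fault-injection change above needs. A sketch using the new initializer for an embedded member (my_ctx is a hypothetical structure):

static struct my_ctx {
        struct ratelimit_state rs;
        int errors;
} ctx = {
        .rs = RATELIMIT_STATE_INIT(ctx.rs, 5 * HZ, 10),
};

static void report_error(void)
{
        ctx.errors++;
        if (__ratelimit(&ctx.rs))
                pr_warn("hit %d errors so far\n", ctx.errors);
}
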
diff --git a/include/linux/sched.h b/include/linux/sched.h index 55f5ee7cc3d3..8db31ef98d2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -1364,6 +1364,10 @@ struct task_struct { | |||
| 1364 | unsigned sched_reset_on_fork:1; | 1364 | unsigned sched_reset_on_fork:1; |
| 1365 | unsigned sched_contributes_to_load:1; | 1365 | unsigned sched_contributes_to_load:1; |
| 1366 | 1366 | ||
| 1367 | #ifdef CONFIG_MEMCG_KMEM | ||
| 1368 | unsigned memcg_kmem_skip_account:1; | ||
| 1369 | #endif | ||
| 1370 | |||
| 1367 | unsigned long atomic_flags; /* Flags needing atomic access. */ | 1371 | unsigned long atomic_flags; /* Flags needing atomic access. */ |
| 1368 | 1372 | ||
| 1369 | pid_t pid; | 1373 | pid_t pid; |
| @@ -1679,8 +1683,7 @@ struct task_struct { | |||
| 1679 | /* bitmask and counter of trace recursion */ | 1683 | /* bitmask and counter of trace recursion */ |
| 1680 | unsigned long trace_recursion; | 1684 | unsigned long trace_recursion; |
| 1681 | #endif /* CONFIG_TRACING */ | 1685 | #endif /* CONFIG_TRACING */ |
| 1682 | #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ | 1686 | #ifdef CONFIG_MEMCG |
| 1683 | unsigned int memcg_kmem_skip_account; | ||
| 1684 | struct memcg_oom_info { | 1687 | struct memcg_oom_info { |
| 1685 | struct mem_cgroup *memcg; | 1688 | struct mem_cgroup *memcg; |
| 1686 | gfp_t gfp_mask; | 1689 | gfp_t gfp_mask; |
| @@ -2482,6 +2485,10 @@ extern void do_group_exit(int); | |||
| 2482 | extern int do_execve(struct filename *, | 2485 | extern int do_execve(struct filename *, |
| 2483 | const char __user * const __user *, | 2486 | const char __user * const __user *, |
| 2484 | const char __user * const __user *); | 2487 | const char __user * const __user *); |
| 2488 | extern int do_execveat(int, struct filename *, | ||
| 2489 | const char __user * const __user *, | ||
| 2490 | const char __user * const __user *, | ||
| 2491 | int); | ||
| 2485 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); | 2492 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); |
| 2486 | struct task_struct *fork_idle(int); | 2493 | struct task_struct *fork_idle(int); |
| 2487 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); | 2494 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); |
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 68c097077ef0..f4aee75f00b1 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h | |||
| @@ -18,8 +18,6 @@ struct shrink_control { | |||
| 18 | */ | 18 | */ |
| 19 | unsigned long nr_to_scan; | 19 | unsigned long nr_to_scan; |
| 20 | 20 | ||
| 21 | /* shrink from these nodes */ | ||
| 22 | nodemask_t nodes_to_scan; | ||
| 23 | /* current node being shrunk (for NUMA aware shrinkers) */ | 21 | /* current node being shrunk (for NUMA aware shrinkers) */ |
| 24 | int nid; | 22 | int nid; |
| 25 | }; | 23 | }; |
diff --git a/include/linux/slab.h b/include/linux/slab.h index 8a2457d42fc8..9a139b637069 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
| @@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 493 | * @memcg: pointer to the memcg this cache belongs to | 493 | * @memcg: pointer to the memcg this cache belongs to |
| 494 | * @list: list_head for the list of all caches in this memcg | 494 | * @list: list_head for the list of all caches in this memcg |
| 495 | * @root_cache: pointer to the global, root cache, this cache was derived from | 495 | * @root_cache: pointer to the global, root cache, this cache was derived from |
| 496 | * @nr_pages: number of pages that belongs to this cache. | ||
| 497 | */ | 496 | */ |
| 498 | struct memcg_cache_params { | 497 | struct memcg_cache_params { |
| 499 | bool is_root_cache; | 498 | bool is_root_cache; |
| @@ -506,7 +505,6 @@ struct memcg_cache_params { | |||
| 506 | struct mem_cgroup *memcg; | 505 | struct mem_cgroup *memcg; |
| 507 | struct list_head list; | 506 | struct list_head list; |
| 508 | struct kmem_cache *root_cache; | 507 | struct kmem_cache *root_cache; |
| 509 | atomic_t nr_pages; | ||
| 510 | }; | 508 | }; |
| 511 | }; | 509 | }; |
| 512 | }; | 510 | }; |
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 115b570e3bff..669045ab73f3 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | #ifndef __LINUX_STACKTRACE_H | 1 | #ifndef __LINUX_STACKTRACE_H |
| 2 | #define __LINUX_STACKTRACE_H | 2 | #define __LINUX_STACKTRACE_H |
| 3 | 3 | ||
| 4 | #include <linux/types.h> | ||
| 5 | |||
| 4 | struct task_struct; | 6 | struct task_struct; |
| 5 | struct pt_regs; | 7 | struct pt_regs; |
| 6 | 8 | ||
| @@ -20,6 +22,8 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, | |||
| 20 | struct stack_trace *trace); | 22 | struct stack_trace *trace); |
| 21 | 23 | ||
| 22 | extern void print_stack_trace(struct stack_trace *trace, int spaces); | 24 | extern void print_stack_trace(struct stack_trace *trace, int spaces); |
| 25 | extern int snprint_stack_trace(char *buf, size_t size, | ||
| 26 | struct stack_trace *trace, int spaces); | ||
| 23 | 27 | ||
| 24 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT | 28 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT |
| 25 | extern void save_stack_trace_user(struct stack_trace *trace); | 29 | extern void save_stack_trace_user(struct stack_trace *trace); |
| @@ -32,6 +36,7 @@ extern void save_stack_trace_user(struct stack_trace *trace); | |||
| 32 | # define save_stack_trace_tsk(tsk, trace) do { } while (0) | 36 | # define save_stack_trace_tsk(tsk, trace) do { } while (0) |
| 33 | # define save_stack_trace_user(trace) do { } while (0) | 37 | # define save_stack_trace_user(trace) do { } while (0) |
| 34 | # define print_stack_trace(trace, spaces) do { } while (0) | 38 | # define print_stack_trace(trace, spaces) do { } while (0) |
| 39 | # define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) | ||
| 35 | #endif | 40 | #endif |
| 36 | 41 | ||
| 37 | #endif | 42 | #endif |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 37a585beef5c..34e8b60ab973 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
| @@ -102,14 +102,6 @@ union swap_header { | |||
| 102 | } info; | 102 | } info; |
| 103 | }; | 103 | }; |
| 104 | 104 | ||
| 105 | /* A swap entry has to fit into a "unsigned long", as | ||
| 106 | * the entry is hidden in the "index" field of the | ||
| 107 | * swapper address space. | ||
| 108 | */ | ||
| 109 | typedef struct { | ||
| 110 | unsigned long val; | ||
| 111 | } swp_entry_t; | ||
| 112 | |||
| 113 | /* | 105 | /* |
| 114 | * current->reclaim_state points to one of these when a task is running | 106 | * current->reclaim_state points to one of these when a task is running |
| 115 | * memory reclaim | 107 | * memory reclaim |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9afdc7a7f84..85893d744901 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
| @@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, | |||
| 877 | asmlinkage long sys_getrandom(char __user *buf, size_t count, | 877 | asmlinkage long sys_getrandom(char __user *buf, size_t count, |
| 878 | unsigned int flags); | 878 | unsigned int flags); |
| 879 | asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); | 879 | asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); |
| 880 | |||
| 881 | asmlinkage long sys_execveat(int dfd, const char __user *filename, | ||
| 882 | const char __user *const __user *argv, | ||
| 883 | const char __user *const __user *envp, int flags); | ||
| 884 | |||
| 880 | #endif | 885 | #endif |
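For context, the sys_execveat() declaration added above is reachable from userspace via syscall(2) until a libc wrapper exists. The following is a minimal, hypothetical userspace sketch (not part of this patch; the syscall number is the asm-generic value wired up below, and the AT_EMPTY_PATH usage is an assumption about the intended fd-exec mode):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_execveat
	#define __NR_execveat 281	/* asm-generic value from this patch; other arches differ */
	#endif
	#ifndef AT_EMPTY_PATH
	#define AT_EMPTY_PATH 0x1000
	#endif

	int main(void)
	{
		char *argv[] = { "true", NULL };
		char *envp[] = { NULL };
		int fd = open("/bin/true", O_RDONLY | O_CLOEXEC);

		if (fd < 0)
			return 1;
		/* execute the already-opened fd itself: empty pathname + AT_EMPTY_PATH */
		syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
		perror("execveat");	/* reached only if the call failed */
		return 1;
	}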
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 730334cdf037..9246d32dc973 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
| @@ -90,6 +90,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
| 90 | #ifdef CONFIG_DEBUG_VM_VMACACHE | 90 | #ifdef CONFIG_DEBUG_VM_VMACACHE |
| 91 | VMACACHE_FIND_CALLS, | 91 | VMACACHE_FIND_CALLS, |
| 92 | VMACACHE_FIND_HITS, | 92 | VMACACHE_FIND_HITS, |
| 93 | VMACACHE_FULL_FLUSHES, | ||
| 93 | #endif | 94 | #endif |
| 94 | NR_VM_EVENT_ITEMS | 95 | NR_VM_EVENT_ITEMS |
| 95 | }; | 96 | }; |
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 22749c134117..e016bd9b1a04 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h | |||
| @@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom) | |||
| 707 | __SYSCALL(__NR_memfd_create, sys_memfd_create) | 707 | __SYSCALL(__NR_memfd_create, sys_memfd_create) |
| 708 | #define __NR_bpf 280 | 708 | #define __NR_bpf 280 |
| 709 | __SYSCALL(__NR_bpf, sys_bpf) | 709 | __SYSCALL(__NR_bpf, sys_bpf) |
| 710 | #define __NR_execveat 281 | ||
| 711 | __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) | ||
| 710 | 712 | ||
| 711 | #undef __NR_syscalls | 713 | #undef __NR_syscalls |
| 712 | #define __NR_syscalls 281 | 714 | #define __NR_syscalls 282 |
| 713 | 715 | ||
| 714 | /* | 716 | /* |
| 715 | * All syscalls below here should go away really, | 717 | * All syscalls below here should go away really, |
diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index a70375526578..f51c8001dbe5 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h | |||
| @@ -51,16 +51,28 @@ struct msginfo { | |||
| 51 | }; | 51 | }; |
| 52 | 52 | ||
| 53 | /* | 53 | /* |
| 54 | * Scaling factor to compute msgmni: | 54 | * MSGMNI, MSGMAX and MSGMNB are default values which can be |
| 55 | * the memory dedicated to msg queues (msgmni * msgmnb) should occupy | 55 | * modified by sysctl. |
| 56 | * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): | 56 | * |
| 57 | up to 8MB : msgmni = 16 (MSGMNI) | 57 | MSGMNI is the upper limit for the number of message queues per |
| 58 | * 4 GB : msgmni = 8K | 58 | * namespace. |
| 59 | more than 16 GB : msgmni = 32K (IPCMNI) | 59 | It has been chosen to be as large as possible without facilitating |
| 60 | * scenarios where userspace causes overflows when adjusting the limits via | ||
| 61 | operations of the form "retrieve current limit; add X; update limit". | ||
| 62 | * | ||
| 63 | * MSGMNB is the default size of a new message queue. Non-root tasks can | ||
| 64 | * decrease the size with msgctl(IPC_SET), root tasks | ||
| 65 | * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue | ||
| 66 | * size. The optimal value is application dependent. | ||
| 67 | * 16384 is used because it was always used (since 0.99.10) | ||
| 68 | * | ||
| 69 | MSGMAX is the maximum size of an individual message; it is a global | ||
| 70 | (per-namespace) limit that applies to all message queues. | ||
| 71 | * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into | ||
| 72 | * the queue. This is also an arbitrary choice (since 2.6.0). | ||
| 60 | */ | 73 | */ |
| 61 | #define MSG_MEM_SCALE 32 | ||
| 62 | 74 | ||
| 63 | #define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ | 75 | #define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */ |
| 64 | #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ | 76 | #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ |
| 65 | #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ | 77 | #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ |
| 66 | 78 | ||
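As a quick check on the relationship stated in the comment above: MSGMAX (8192) times two equals MSGMNB (16384), so with these defaults a queue can always hold at least two maximum-size messages before msgsnd() has to block.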
diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 541fce03b50c..dd73b908b2f3 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h | |||
| @@ -63,10 +63,22 @@ struct seminfo { | |||
| 63 | int semaem; | 63 | int semaem; |
| 64 | }; | 64 | }; |
| 65 | 65 | ||
| 66 | #define SEMMNI 128 /* <= IPCMNI max # of semaphore identifiers */ | 66 | /* |
| 67 | #define SEMMSL 250 /* <= 8 000 max num of semaphores per id */ | 67 | * SEMMNI, SEMMSL and SEMMNS are default values which can be |
| 68 | * modified by sysctl. | ||
| 69 | * The values have been chosen to be larger than necessary for any | ||
| 70 | * known configuration. | ||
| 71 | * | ||
| 72 | * SEMOPM should not be increased beyond 1000; otherwise there is a | ||
| 73 | * risk that semop()/semtimedop() fails due to kernel memory fragmentation when | ||
| 74 | * allocating the sop array. | ||
| 75 | */ | ||
| 76 | |||
| 77 | |||
| 78 | #define SEMMNI 32000 /* <= IPCMNI max # of semaphore identifiers */ | ||
| 79 | #define SEMMSL 32000 /* <= INT_MAX max num of semaphores per id */ | ||
| 68 | #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ | 80 | #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ |
| 69 | #define SEMOPM 32 /* <= 1 000 max num of ops per semop call */ | 81 | #define SEMOPM 500 /* <= 1 000 max num of ops per semop call */ |
| 70 | #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ | 82 | #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ |
| 71 | #define SEMAEM SEMVMX /* adjust on exit max value */ | 83 | #define SEMAEM SEMVMX /* adjust on exit max value */ |
| 72 | 84 | ||
diff --git a/init/main.c b/init/main.c index ca380ec685de..ed7e7ad5fee0 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/mempolicy.h> | 51 | #include <linux/mempolicy.h> |
| 52 | #include <linux/key.h> | 52 | #include <linux/key.h> |
| 53 | #include <linux/buffer_head.h> | 53 | #include <linux/buffer_head.h> |
| 54 | #include <linux/page_ext.h> | ||
| 54 | #include <linux/debug_locks.h> | 55 | #include <linux/debug_locks.h> |
| 55 | #include <linux/debugobjects.h> | 56 | #include <linux/debugobjects.h> |
| 56 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
| @@ -484,6 +485,11 @@ void __init __weak thread_info_cache_init(void) | |||
| 484 | */ | 485 | */ |
| 485 | static void __init mm_init(void) | 486 | static void __init mm_init(void) |
| 486 | { | 487 | { |
| 488 | /* | ||
| 489 | * page_ext requires contiguous pages | ||
| 490 | * bigger than MAX_ORDER, unless SPARSEMEM is used. | ||
| 491 | */ | ||
| 492 | page_ext_init_flatmem(); | ||
| 487 | mem_init(); | 493 | mem_init(); |
| 488 | kmem_cache_init(); | 494 | kmem_cache_init(); |
| 489 | percpu_init_late(); | 495 | percpu_init_late(); |
| @@ -621,6 +627,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
| 621 | initrd_start = 0; | 627 | initrd_start = 0; |
| 622 | } | 628 | } |
| 623 | #endif | 629 | #endif |
| 630 | page_ext_init(); | ||
| 624 | debug_objects_mem_init(); | 631 | debug_objects_mem_init(); |
| 625 | kmemleak_init(); | 632 | kmemleak_init(); |
| 626 | setup_per_cpu_pageset(); | 633 | setup_per_cpu_pageset(); |
diff --git a/ipc/Makefile b/ipc/Makefile index 9075e172e52c..86c7300ecdf5 100644 --- a/ipc/Makefile +++ b/ipc/Makefile | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o | 5 | obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o |
| 6 | obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o | 6 | obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o |
| 7 | obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o | 7 | obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o |
| 8 | obj_mq-$(CONFIG_COMPAT) += compat_mq.o | 8 | obj_mq-$(CONFIG_COMPAT) += compat_mq.o |
| 9 | obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) | 9 | obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) |
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index e8075b247497..8ad93c29f511 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c | |||
| @@ -62,29 +62,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, | |||
| 62 | return err; | 62 | return err; |
| 63 | } | 63 | } |
| 64 | 64 | ||
| 65 | static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, | ||
| 66 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 67 | { | ||
| 68 | struct ctl_table ipc_table; | ||
| 69 | size_t lenp_bef = *lenp; | ||
| 70 | int rc; | ||
| 71 | |||
| 72 | memcpy(&ipc_table, table, sizeof(ipc_table)); | ||
| 73 | ipc_table.data = get_ipc(table); | ||
| 74 | |||
| 75 | rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); | ||
| 76 | |||
| 77 | if (write && !rc && lenp_bef == *lenp) | ||
| 78 | /* | ||
| 79 | * Tunable has successfully been changed by hand. Disable its | ||
| 80 | * automatic adjustment. This simply requires unregistering | ||
| 81 | * the notifiers that trigger recalculation. | ||
| 82 | */ | ||
| 83 | unregister_ipcns_notifier(current->nsproxy->ipc_ns); | ||
| 84 | |||
| 85 | return rc; | ||
| 86 | } | ||
| 87 | |||
| 88 | static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, | 65 | static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, |
| 89 | void __user *buffer, size_t *lenp, loff_t *ppos) | 66 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 90 | { | 67 | { |
| @@ -96,54 +73,19 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, | |||
| 96 | lenp, ppos); | 73 | lenp, ppos); |
| 97 | } | 74 | } |
| 98 | 75 | ||
| 99 | /* | 76 | static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, |
| 100 | * Routine that is called when the file "auto_msgmni" has successfully been | ||
| 101 | * written. | ||
| 102 | * Two values are allowed: | ||
| 103 | * 0: unregister msgmni's callback routine from the ipc namespace notifier | ||
| 104 | * chain. This means that msgmni won't be recomputed anymore upon memory | ||
| 105 | * add/remove or ipc namespace creation/removal. | ||
| 106 | * 1: register back the callback routine. | ||
| 107 | */ | ||
| 108 | static void ipc_auto_callback(int val) | ||
| 109 | { | ||
| 110 | if (!val) | ||
| 111 | unregister_ipcns_notifier(current->nsproxy->ipc_ns); | ||
| 112 | else { | ||
| 113 | /* | ||
| 114 | * Re-enable automatic recomputing only if not already | ||
| 115 | * enabled. | ||
| 116 | */ | ||
| 117 | recompute_msgmni(current->nsproxy->ipc_ns); | ||
| 118 | cond_register_ipcns_notifier(current->nsproxy->ipc_ns); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | |||
| 122 | static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, | ||
| 123 | void __user *buffer, size_t *lenp, loff_t *ppos) | 77 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 124 | { | 78 | { |
| 125 | struct ctl_table ipc_table; | 79 | struct ctl_table ipc_table; |
| 126 | int oldval; | 80 | int dummy = 0; |
| 127 | int rc; | ||
| 128 | 81 | ||
| 129 | memcpy(&ipc_table, table, sizeof(ipc_table)); | 82 | memcpy(&ipc_table, table, sizeof(ipc_table)); |
| 130 | ipc_table.data = get_ipc(table); | 83 | ipc_table.data = &dummy; |
| 131 | oldval = *((int *)(ipc_table.data)); | ||
| 132 | 84 | ||
| 133 | rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); | 85 | if (write) |
| 86 | pr_info_once("writing to auto_msgmni has no effect"); | ||
| 134 | 87 | ||
| 135 | if (write && !rc) { | 88 | return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); |
| 136 | int newval = *((int *)(ipc_table.data)); | ||
| 137 | /* | ||
| 138 | * The file "auto_msgmni" has correctly been set. | ||
| 139 | * React by (un)registering the corresponding tunable, if the | ||
| 140 | * value has changed. | ||
| 141 | */ | ||
| 142 | if (newval != oldval) | ||
| 143 | ipc_auto_callback(newval); | ||
| 144 | } | ||
| 145 | |||
| 146 | return rc; | ||
| 147 | } | 89 | } |
| 148 | 90 | ||
| 149 | #else | 91 | #else |
| @@ -151,8 +93,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, | |||
| 151 | #define proc_ipc_dointvec NULL | 93 | #define proc_ipc_dointvec NULL |
| 152 | #define proc_ipc_dointvec_minmax NULL | 94 | #define proc_ipc_dointvec_minmax NULL |
| 153 | #define proc_ipc_dointvec_minmax_orphans NULL | 95 | #define proc_ipc_dointvec_minmax_orphans NULL |
| 154 | #define proc_ipc_callback_dointvec_minmax NULL | 96 | #define proc_ipc_auto_msgmni NULL |
| 155 | #define proc_ipcauto_dointvec_minmax NULL | ||
| 156 | #endif | 97 | #endif |
| 157 | 98 | ||
| 158 | static int zero; | 99 | static int zero; |
| @@ -204,11 +145,20 @@ static struct ctl_table ipc_kern_table[] = { | |||
| 204 | .data = &init_ipc_ns.msg_ctlmni, | 145 | .data = &init_ipc_ns.msg_ctlmni, |
| 205 | .maxlen = sizeof(init_ipc_ns.msg_ctlmni), | 146 | .maxlen = sizeof(init_ipc_ns.msg_ctlmni), |
| 206 | .mode = 0644, | 147 | .mode = 0644, |
| 207 | .proc_handler = proc_ipc_callback_dointvec_minmax, | 148 | .proc_handler = proc_ipc_dointvec_minmax, |
| 208 | .extra1 = &zero, | 149 | .extra1 = &zero, |
| 209 | .extra2 = &int_max, | 150 | .extra2 = &int_max, |
| 210 | }, | 151 | }, |
| 211 | { | 152 | { |
| 153 | .procname = "auto_msgmni", | ||
| 154 | .data = NULL, | ||
| 155 | .maxlen = sizeof(int), | ||
| 156 | .mode = 0644, | ||
| 157 | .proc_handler = proc_ipc_auto_msgmni, | ||
| 158 | .extra1 = &zero, | ||
| 159 | .extra2 = &one, | ||
| 160 | }, | ||
| 161 | { | ||
| 212 | .procname = "msgmnb", | 162 | .procname = "msgmnb", |
| 213 | .data = &init_ipc_ns.msg_ctlmnb, | 163 | .data = &init_ipc_ns.msg_ctlmnb, |
| 214 | .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), | 164 | .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), |
| @@ -224,15 +174,6 @@ static struct ctl_table ipc_kern_table[] = { | |||
| 224 | .mode = 0644, | 174 | .mode = 0644, |
| 225 | .proc_handler = proc_ipc_dointvec, | 175 | .proc_handler = proc_ipc_dointvec, |
| 226 | }, | 176 | }, |
| 227 | { | ||
| 228 | .procname = "auto_msgmni", | ||
| 229 | .data = &init_ipc_ns.auto_msgmni, | ||
| 230 | .maxlen = sizeof(int), | ||
| 231 | .mode = 0644, | ||
| 232 | .proc_handler = proc_ipcauto_dointvec_minmax, | ||
| 233 | .extra1 = &zero, | ||
| 234 | .extra2 = &one, | ||
| 235 | }, | ||
| 236 | #ifdef CONFIG_CHECKPOINT_RESTORE | 177 | #ifdef CONFIG_CHECKPOINT_RESTORE |
| 237 | { | 178 | { |
| 238 | .procname = "sem_next_id", | 179 | .procname = "sem_next_id", |
diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c deleted file mode 100644 index b9b31a4f77e1..000000000000 --- a/ipc/ipcns_notifier.c +++ /dev/null | |||
| @@ -1,92 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/ipc/ipcns_notifier.c | ||
| 3 | * Copyright (C) 2007 BULL SA. Nadia Derbey | ||
| 4 | * | ||
| 5 | * Notification mechanism for ipc namespaces: | ||
| 6 | * The callback routine registered in the memory chain invokes the ipcns | ||
| 7 | * notifier chain with the IPCNS_MEMCHANGED event. | ||
| 8 | * Each callback routine registered in the ipcns namespace recomputes msgmni | ||
| 9 | * for the owning namespace. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/msg.h> | ||
| 13 | #include <linux/rcupdate.h> | ||
| 14 | #include <linux/notifier.h> | ||
| 15 | #include <linux/nsproxy.h> | ||
| 16 | #include <linux/ipc_namespace.h> | ||
| 17 | |||
| 18 | #include "util.h" | ||
| 19 | |||
| 20 | |||
| 21 | |||
| 22 | static BLOCKING_NOTIFIER_HEAD(ipcns_chain); | ||
| 23 | |||
| 24 | |||
| 25 | static int ipcns_callback(struct notifier_block *self, | ||
| 26 | unsigned long action, void *arg) | ||
| 27 | { | ||
| 28 | struct ipc_namespace *ns; | ||
| 29 | |||
| 30 | switch (action) { | ||
| 31 | case IPCNS_MEMCHANGED: /* amount of lowmem has changed */ | ||
| 32 | case IPCNS_CREATED: | ||
| 33 | case IPCNS_REMOVED: | ||
| 34 | /* | ||
| 35 | * It's time to recompute msgmni | ||
| 36 | */ | ||
| 37 | ns = container_of(self, struct ipc_namespace, ipcns_nb); | ||
| 38 | /* | ||
| 39 | * No need to get a reference on the ns: the 1st job of | ||
| 40 | * free_ipc_ns() is to unregister the callback routine. | ||
| 41 | * blocking_notifier_chain_unregister takes the wr lock to do | ||
| 42 | * it. | ||
| 43 | * When this callback routine is called the rd lock is held by | ||
| 44 | * blocking_notifier_call_chain. | ||
| 45 | * So the ipc ns cannot be freed while we are here. | ||
| 46 | */ | ||
| 47 | recompute_msgmni(ns); | ||
| 48 | break; | ||
| 49 | default: | ||
| 50 | break; | ||
| 51 | } | ||
| 52 | |||
| 53 | return NOTIFY_OK; | ||
| 54 | } | ||
| 55 | |||
| 56 | int register_ipcns_notifier(struct ipc_namespace *ns) | ||
| 57 | { | ||
| 58 | int rc; | ||
| 59 | |||
| 60 | memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); | ||
| 61 | ns->ipcns_nb.notifier_call = ipcns_callback; | ||
| 62 | ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; | ||
| 63 | rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); | ||
| 64 | if (!rc) | ||
| 65 | ns->auto_msgmni = 1; | ||
| 66 | return rc; | ||
| 67 | } | ||
| 68 | |||
| 69 | int cond_register_ipcns_notifier(struct ipc_namespace *ns) | ||
| 70 | { | ||
| 71 | int rc; | ||
| 72 | |||
| 73 | memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); | ||
| 74 | ns->ipcns_nb.notifier_call = ipcns_callback; | ||
| 75 | ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; | ||
| 76 | rc = blocking_notifier_chain_cond_register(&ipcns_chain, | ||
| 77 | &ns->ipcns_nb); | ||
| 78 | if (!rc) | ||
| 79 | ns->auto_msgmni = 1; | ||
| 80 | return rc; | ||
| 81 | } | ||
| 82 | |||
| 83 | void unregister_ipcns_notifier(struct ipc_namespace *ns) | ||
| 84 | { | ||
| 85 | blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); | ||
| 86 | ns->auto_msgmni = 0; | ||
| 87 | } | ||
| 88 | |||
| 89 | int ipcns_notify(unsigned long val) | ||
| 90 | { | ||
| 91 | return blocking_notifier_call_chain(&ipcns_chain, val, NULL); | ||
| 92 | } | ||
| @@ -989,43 +989,12 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, | |||
| 989 | return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); | 989 | return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); |
| 990 | } | 990 | } |
| 991 | 991 | ||
| 992 | /* | ||
| 993 | * Scale msgmni with the available lowmem size: the memory dedicated to msg | ||
| 994 | * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. | ||
| 995 | * Also take into account the number of nsproxies created so far. | ||
| 996 | * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. | ||
| 997 | */ | ||
| 998 | void recompute_msgmni(struct ipc_namespace *ns) | ||
| 999 | { | ||
| 1000 | struct sysinfo i; | ||
| 1001 | unsigned long allowed; | ||
| 1002 | int nb_ns; | ||
| 1003 | |||
| 1004 | si_meminfo(&i); | ||
| 1005 | allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) | ||
| 1006 | / MSGMNB; | ||
| 1007 | nb_ns = atomic_read(&nr_ipc_ns); | ||
| 1008 | allowed /= nb_ns; | ||
| 1009 | |||
| 1010 | if (allowed < MSGMNI) { | ||
| 1011 | ns->msg_ctlmni = MSGMNI; | ||
| 1012 | return; | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | if (allowed > IPCMNI / nb_ns) { | ||
| 1016 | ns->msg_ctlmni = IPCMNI / nb_ns; | ||
| 1017 | return; | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | ns->msg_ctlmni = allowed; | ||
| 1021 | } | ||
| 1022 | 992 | ||
| 1023 | void msg_init_ns(struct ipc_namespace *ns) | 993 | void msg_init_ns(struct ipc_namespace *ns) |
| 1024 | { | 994 | { |
| 1025 | ns->msg_ctlmax = MSGMAX; | 995 | ns->msg_ctlmax = MSGMAX; |
| 1026 | ns->msg_ctlmnb = MSGMNB; | 996 | ns->msg_ctlmnb = MSGMNB; |
| 1027 | 997 | ns->msg_ctlmni = MSGMNI; | |
| 1028 | recompute_msgmni(ns); | ||
| 1029 | 998 | ||
| 1030 | atomic_set(&ns->msg_bytes, 0); | 999 | atomic_set(&ns->msg_bytes, 0); |
| 1031 | atomic_set(&ns->msg_hdrs, 0); | 1000 | atomic_set(&ns->msg_hdrs, 0); |
| @@ -1069,9 +1038,6 @@ void __init msg_init(void) | |||
| 1069 | { | 1038 | { |
| 1070 | msg_init_ns(&init_ipc_ns); | 1039 | msg_init_ns(&init_ipc_ns); |
| 1071 | 1040 | ||
| 1072 | printk(KERN_INFO "msgmni has been set to %d\n", | ||
| 1073 | init_ipc_ns.msg_ctlmni); | ||
| 1074 | |||
| 1075 | ipc_init_proc_interface("sysvipc/msg", | 1041 | ipc_init_proc_interface("sysvipc/msg", |
| 1076 | " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", | 1042 | " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", |
| 1077 | IPC_MSG_IDS, sysvipc_msg_proc_show); | 1043 | IPC_MSG_IDS, sysvipc_msg_proc_show); |
diff --git a/ipc/namespace.c b/ipc/namespace.c index b54468e48e32..1a3ffd40356e 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c | |||
| @@ -45,14 +45,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, | |||
| 45 | msg_init_ns(ns); | 45 | msg_init_ns(ns); |
| 46 | shm_init_ns(ns); | 46 | shm_init_ns(ns); |
| 47 | 47 | ||
| 48 | /* | ||
| 49 | * msgmni has already been computed for the new ipc ns. | ||
| 50 | * Thus, do the ipcns creation notification before registering that | ||
| 51 | * new ipcns in the chain. | ||
| 52 | */ | ||
| 53 | ipcns_notify(IPCNS_CREATED); | ||
| 54 | register_ipcns_notifier(ns); | ||
| 55 | |||
| 56 | ns->user_ns = get_user_ns(user_ns); | 48 | ns->user_ns = get_user_ns(user_ns); |
| 57 | 49 | ||
| 58 | return ns; | 50 | return ns; |
| @@ -99,25 +91,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, | |||
| 99 | 91 | ||
| 100 | static void free_ipc_ns(struct ipc_namespace *ns) | 92 | static void free_ipc_ns(struct ipc_namespace *ns) |
| 101 | { | 93 | { |
| 102 | /* | ||
| 103 | * Unregistering the hotplug notifier at the beginning guarantees | ||
| 104 | * that the ipc namespace won't be freed while we are inside the | ||
| 105 | * callback routine. Since the blocking_notifier_chain_XXX routines | ||
| 106 | * hold a rw lock on the notifier list, unregister_ipcns_notifier() | ||
| 107 | * won't take the rw lock before blocking_notifier_call_chain() has | ||
| 108 | * released the rd lock. | ||
| 109 | */ | ||
| 110 | unregister_ipcns_notifier(ns); | ||
| 111 | sem_exit_ns(ns); | 94 | sem_exit_ns(ns); |
| 112 | msg_exit_ns(ns); | 95 | msg_exit_ns(ns); |
| 113 | shm_exit_ns(ns); | 96 | shm_exit_ns(ns); |
| 114 | atomic_dec(&nr_ipc_ns); | 97 | atomic_dec(&nr_ipc_ns); |
| 115 | 98 | ||
| 116 | /* | ||
| 117 | * Do the ipcns removal notification after decrementing nr_ipc_ns in | ||
| 118 | * order to have a correct value when recomputing msgmni. | ||
| 119 | */ | ||
| 120 | ipcns_notify(IPCNS_REMOVED); | ||
| 121 | put_user_ns(ns->user_ns); | 99 | put_user_ns(ns->user_ns); |
| 122 | proc_free_inum(ns->proc_inum); | 100 | proc_free_inum(ns->proc_inum); |
| 123 | kfree(ns); | 101 | kfree(ns); |
| @@ -326,10 +326,17 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, | |||
| 326 | 326 | ||
| 327 | /* Then check that the global lock is free */ | 327 | /* Then check that the global lock is free */ |
| 328 | if (!spin_is_locked(&sma->sem_perm.lock)) { | 328 | if (!spin_is_locked(&sma->sem_perm.lock)) { |
| 329 | /* spin_is_locked() is not a memory barrier */ | 329 | /* |
| 330 | smp_mb(); | 330 | * The ipc object lock check must be visible on all |
| 331 | * cores before rechecking the complex count. Otherwise | ||
| 332 | * we can race with another thread that does: | ||
| 333 | * complex_count++; | ||
| 334 | * spin_unlock(sem_perm.lock); | ||
| 335 | */ | ||
| 336 | smp_rmb(); | ||
| 331 | 337 | ||
| 332 | /* Now repeat the test of complex_count: | 338 | /* |
| 339 | * Now repeat the test of complex_count: | ||
| 333 | * It can't change anymore until we drop sem->lock. | 340 | * It can't change anymore until we drop sem->lock. |
| 334 | * Thus: if is now 0, then it will stay 0. | 341 | * Thus: if is now 0, then it will stay 0. |
| 335 | */ | 342 | */ |
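To make the ordering requirement concrete, the following interleaving (an illustrative trace inferred from the comment above, not text from the patch) is what smp_rmb() forbids:

	simple op (sem_lock)                      complex-op holder
	----------------------------------        ---------------------------------
	spin_lock(&sem->lock);
	load complex_count -> reads 0             complex_count++;
	  (speculated ahead of the check)         spin_unlock(&sma->sem_perm.lock);
	spin_is_locked(&sem_perm.lock) -> false

Without the barrier, the stale complex_count value lets the simple operation proceed with only the per-semaphore lock while a complex operation is still in flight; the read barrier keeps the complex_count re-read ordered after the lock check.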
| @@ -219,7 +219,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) | |||
| 219 | if (!is_file_hugepages(shm_file)) | 219 | if (!is_file_hugepages(shm_file)) |
| 220 | shmem_lock(shm_file, 0, shp->mlock_user); | 220 | shmem_lock(shm_file, 0, shp->mlock_user); |
| 221 | else if (shp->mlock_user) | 221 | else if (shp->mlock_user) |
| 222 | user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); | 222 | user_shm_unlock(i_size_read(file_inode(shm_file)), |
| 223 | shp->mlock_user); | ||
| 223 | fput(shm_file); | 224 | fput(shm_file); |
| 224 | ipc_rcu_putref(shp, shm_rcu_free); | 225 | ipc_rcu_putref(shp, shm_rcu_free); |
| 225 | } | 226 | } |
| @@ -1229,6 +1230,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
| 1229 | int retval = -EINVAL; | 1230 | int retval = -EINVAL; |
| 1230 | #ifdef CONFIG_MMU | 1231 | #ifdef CONFIG_MMU |
| 1231 | loff_t size = 0; | 1232 | loff_t size = 0; |
| 1233 | struct file *file; | ||
| 1232 | struct vm_area_struct *next; | 1234 | struct vm_area_struct *next; |
| 1233 | #endif | 1235 | #endif |
| 1234 | 1236 | ||
| @@ -1245,7 +1247,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
| 1245 | * started at address shmaddr. It records its size and then unmaps | 1247 | * started at address shmaddr. It records its size and then unmaps |
| 1246 | * it. | 1248 | * it. |
| 1247 | * - Then it unmaps all shm vmas that started at shmaddr and that | 1249 | * - Then it unmaps all shm vmas that started at shmaddr and that |
| 1248 | * are within the initially determined size. | 1250 | * are within the initially determined size and that are from the |
| 1251 | * same shm segment from which we determined the size. | ||
| 1249 | * Errors from do_munmap are ignored: the function only fails if | 1252 | * Errors from do_munmap are ignored: the function only fails if |
| 1250 | * it's called with invalid parameters or if it's called to unmap | 1253 | * it's called with invalid parameters or if it's called to unmap |
| 1251 | * a part of a vma. Both calls in this function are for full vmas, | 1254 | * a part of a vma. Both calls in this function are for full vmas, |
| @@ -1271,8 +1274,14 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
| 1271 | if ((vma->vm_ops == &shm_vm_ops) && | 1274 | if ((vma->vm_ops == &shm_vm_ops) && |
| 1272 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { | 1275 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { |
| 1273 | 1276 | ||
| 1274 | 1277 | /* | |
| 1275 | size = file_inode(vma->vm_file)->i_size; | 1278 | * Record the file of the shm segment being |
| 1279 | * unmapped. With mremap(), someone could place | ||
| 1280 | * a page from another segment but with equal offsets | ||
| 1281 | * in the range we are unmapping. | ||
| 1282 | */ | ||
| 1283 | file = vma->vm_file; | ||
| 1284 | size = i_size_read(file_inode(vma->vm_file)); | ||
| 1276 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1285 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
| 1277 | /* | 1286 | /* |
| 1278 | * We discovered the size of the shm segment, so | 1287 | * We discovered the size of the shm segment, so |
| @@ -1298,8 +1307,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
| 1298 | 1307 | ||
| 1299 | /* finding a matching vma now does not alter retval */ | 1308 | /* finding a matching vma now does not alter retval */ |
| 1300 | if ((vma->vm_ops == &shm_vm_ops) && | 1309 | if ((vma->vm_ops == &shm_vm_ops) && |
| 1301 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) | 1310 | ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && |
| 1302 | 1311 | (vma->vm_file == file)) | |
| 1303 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1312 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
| 1304 | vma = next; | 1313 | vma = next; |
| 1305 | } | 1314 | } |
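The scenario the new vm_file comparison guards against can be reproduced from userspace roughly as follows (hypothetical sketch, not from the patch; error handling omitted and both segments assumed to span at least two pages):

	#define _GNU_SOURCE
	#include <sys/shm.h>
	#include <sys/mman.h>

	static void shmdt_hazard(int shmid_a, int shmid_b)
	{
		char *a = shmat(shmid_a, NULL, 0);	/* segment A at addr */
		char *b = shmat(shmid_b, NULL, 0);	/* segment B elsewhere */

		/*
		 * Move page 1 of B on top of page 1 of A's range: its vm_pgoff (1)
		 * then matches its position relative to a, which is all the old
		 * shmdt() loop compared.
		 */
		mremap(b + 4096, 4096, 4096,
		       MREMAP_MAYMOVE | MREMAP_FIXED, a + 4096);

		shmdt(a);	/* must tear down only A's vmas, not the relocated B page */
	}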
diff --git a/ipc/util.c b/ipc/util.c index 88adc329888c..106bed0378ab 100644 --- a/ipc/util.c +++ b/ipc/util.c | |||
| @@ -71,44 +71,6 @@ struct ipc_proc_iface { | |||
| 71 | int (*show)(struct seq_file *, void *); | 71 | int (*show)(struct seq_file *, void *); |
| 72 | }; | 72 | }; |
| 73 | 73 | ||
| 74 | static void ipc_memory_notifier(struct work_struct *work) | ||
| 75 | { | ||
| 76 | ipcns_notify(IPCNS_MEMCHANGED); | ||
| 77 | } | ||
| 78 | |||
| 79 | static int ipc_memory_callback(struct notifier_block *self, | ||
| 80 | unsigned long action, void *arg) | ||
| 81 | { | ||
| 82 | static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier); | ||
| 83 | |||
| 84 | switch (action) { | ||
| 85 | case MEM_ONLINE: /* memory successfully brought online */ | ||
| 86 | case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */ | ||
| 87 | /* | ||
| 88 | * This is done by invoking the ipcns notifier chain with the | ||
| 89 | * IPC_MEMCHANGED event. | ||
| 90 | * In order not to keep the lock on the hotplug memory chain | ||
| 91 | * for too long, queue a work item that will, when waken up, | ||
| 92 | * activate the ipcns notification chain. | ||
| 93 | */ | ||
| 94 | schedule_work(&ipc_memory_wq); | ||
| 95 | break; | ||
| 96 | case MEM_GOING_ONLINE: | ||
| 97 | case MEM_GOING_OFFLINE: | ||
| 98 | case MEM_CANCEL_ONLINE: | ||
| 99 | case MEM_CANCEL_OFFLINE: | ||
| 100 | default: | ||
| 101 | break; | ||
| 102 | } | ||
| 103 | |||
| 104 | return NOTIFY_OK; | ||
| 105 | } | ||
| 106 | |||
| 107 | static struct notifier_block ipc_memory_nb = { | ||
| 108 | .notifier_call = ipc_memory_callback, | ||
| 109 | .priority = IPC_CALLBACK_PRI, | ||
| 110 | }; | ||
| 111 | |||
| 112 | /** | 74 | /** |
| 113 | * ipc_init - initialise ipc subsystem | 75 | * ipc_init - initialise ipc subsystem |
| 114 | * | 76 | * |
| @@ -124,8 +86,6 @@ static int __init ipc_init(void) | |||
| 124 | sem_init(); | 86 | sem_init(); |
| 125 | msg_init(); | 87 | msg_init(); |
| 126 | shm_init(); | 88 | shm_init(); |
| 127 | register_hotmemory_notifier(&ipc_memory_nb); | ||
| 128 | register_ipcns_notifier(&init_ipc_ns); | ||
| 129 | return 0; | 89 | return 0; |
| 130 | } | 90 | } |
| 131 | device_initcall(ipc_init); | 91 | device_initcall(ipc_init); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e015570..2e0c97427b33 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
| @@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) | |||
| 174 | struct fsnotify_mark *entry = &chunk->mark; | 174 | struct fsnotify_mark *entry = &chunk->mark; |
| 175 | struct list_head *list; | 175 | struct list_head *list; |
| 176 | 176 | ||
| 177 | if (!entry->i.inode) | 177 | if (!entry->inode) |
| 178 | return; | 178 | return; |
| 179 | list = chunk_hash(entry->i.inode); | 179 | list = chunk_hash(entry->inode); |
| 180 | list_add_rcu(&chunk->hash, list); | 180 | list_add_rcu(&chunk->hash, list); |
| 181 | } | 181 | } |
| 182 | 182 | ||
| @@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
| 188 | 188 | ||
| 189 | list_for_each_entry_rcu(p, list, hash) { | 189 | list_for_each_entry_rcu(p, list, hash) { |
| 190 | /* mark.inode may have gone NULL, but who cares? */ | 190 | /* mark.inode may have gone NULL, but who cares? */ |
| 191 | if (p->mark.i.inode == inode) { | 191 | if (p->mark.inode == inode) { |
| 192 | atomic_long_inc(&p->refs); | 192 | atomic_long_inc(&p->refs); |
| 193 | return p; | 193 | return p; |
| 194 | } | 194 | } |
| @@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) | |||
| 231 | new = alloc_chunk(size); | 231 | new = alloc_chunk(size); |
| 232 | 232 | ||
| 233 | spin_lock(&entry->lock); | 233 | spin_lock(&entry->lock); |
| 234 | if (chunk->dead || !entry->i.inode) { | 234 | if (chunk->dead || !entry->inode) { |
| 235 | spin_unlock(&entry->lock); | 235 | spin_unlock(&entry->lock); |
| 236 | if (new) | 236 | if (new) |
| 237 | free_chunk(new); | 237 | free_chunk(new); |
| @@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
| 258 | goto Fallback; | 258 | goto Fallback; |
| 259 | 259 | ||
| 260 | fsnotify_duplicate_mark(&new->mark, entry); | 260 | fsnotify_duplicate_mark(&new->mark, entry); |
| 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { |
| 262 | fsnotify_put_mark(&new->mark); | 262 | fsnotify_put_mark(&new->mark); |
| 263 | goto Fallback; | 263 | goto Fallback; |
| 264 | } | 264 | } |
| @@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 386 | chunk_entry = &chunk->mark; | 386 | chunk_entry = &chunk->mark; |
| 387 | 387 | ||
| 388 | spin_lock(&old_entry->lock); | 388 | spin_lock(&old_entry->lock); |
| 389 | if (!old_entry->i.inode) { | 389 | if (!old_entry->inode) { |
| 390 | /* old_entry is being shot, lets just lie */ | 390 | /* old_entry is being shot, lets just lie */ |
| 391 | spin_unlock(&old_entry->lock); | 391 | spin_unlock(&old_entry->lock); |
| 392 | fsnotify_put_mark(old_entry); | 392 | fsnotify_put_mark(old_entry); |
| @@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
| 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { |
| 399 | spin_unlock(&old_entry->lock); | 399 | spin_unlock(&old_entry->lock); |
| 400 | fsnotify_put_mark(chunk_entry); | 400 | fsnotify_put_mark(chunk_entry); |
| 401 | fsnotify_put_mark(old_entry); | 401 | fsnotify_put_mark(old_entry); |
| @@ -611,7 +611,7 @@ void audit_trim_trees(void) | |||
| 611 | list_for_each_entry(node, &tree->chunks, list) { | 611 | list_for_each_entry(node, &tree->chunks, list) { |
| 612 | struct audit_chunk *chunk = find_chunk(node); | 612 | struct audit_chunk *chunk = find_chunk(node); |
| 613 | /* this could be NULL if the watch is dying elsewhere... */ | 613 | /* this could be NULL if the watch is dying elsewhere... */ |
| 614 | struct inode *inode = chunk->mark.i.inode; | 614 | struct inode *inode = chunk->mark.inode; |
| 615 | node->index |= 1U<<31; | 615 | node->index |= 1U<<31; |
| 616 | if (iterate_mounts(compare_root, inode, root_mnt)) | 616 | if (iterate_mounts(compare_root, inode, root_mnt)) |
| 617 | node->index &= ~(1U<<31); | 617 | node->index &= ~(1U<<31); |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c5..995a95f61a19 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 724 | int more = 0; | 724 | int more = 0; |
| 725 | 725 | ||
| 726 | again: | 726 | again: |
| 727 | mutex_lock(&mapping->i_mmap_mutex); | 727 | i_mmap_lock_read(mapping); |
| 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 729 | if (!valid_vma(vma, is_register)) | 729 | if (!valid_vma(vma, is_register)) |
| 730 | continue; | 730 | continue; |
| 731 | 731 | ||
| 732 | if (!prev && !more) { | 732 | if (!prev && !more) { |
| 733 | /* | 733 | /* |
| 734 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | 734 | * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through |
| 735 | * reclaim. This is optimistic, no harm done if it fails. | 735 | * reclaim. This is optimistic, no harm done if it fails. |
| 736 | */ | 736 | */ |
| 737 | prev = kmalloc(sizeof(struct map_info), | 737 | prev = kmalloc(sizeof(struct map_info), |
| @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 755 | info->mm = vma->vm_mm; | 755 | info->mm = vma->vm_mm; |
| 756 | info->vaddr = offset_to_vaddr(vma, offset); | 756 | info->vaddr = offset_to_vaddr(vma, offset); |
| 757 | } | 757 | } |
| 758 | mutex_unlock(&mapping->i_mmap_mutex); | 758 | i_mmap_unlock_read(mapping); |
| 759 | 759 | ||
| 760 | if (!more) | 760 | if (!more) |
| 761 | goto out; | 761 | goto out; |
diff --git a/kernel/fork.c b/kernel/fork.c index 9ca84189cfc2..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 433 | get_file(file); | 433 | get_file(file); |
| 434 | if (tmp->vm_flags & VM_DENYWRITE) | 434 | if (tmp->vm_flags & VM_DENYWRITE) |
| 435 | atomic_dec(&inode->i_writecount); | 435 | atomic_dec(&inode->i_writecount); |
| 436 | mutex_lock(&mapping->i_mmap_mutex); | 436 | i_mmap_lock_write(mapping); |
| 437 | if (tmp->vm_flags & VM_SHARED) | 437 | if (tmp->vm_flags & VM_SHARED) |
| 438 | atomic_inc(&mapping->i_mmap_writable); | 438 | atomic_inc(&mapping->i_mmap_writable); |
| 439 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
| @@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 445 | vma_interval_tree_insert_after(tmp, mpnt, | 445 | vma_interval_tree_insert_after(tmp, mpnt, |
| 446 | &mapping->i_mmap); | 446 | &mapping->i_mmap); |
| 447 | flush_dcache_mmap_unlock(mapping); | 447 | flush_dcache_mmap_unlock(mapping); |
| 448 | mutex_unlock(&mapping->i_mmap_mutex); | 448 | i_mmap_unlock_write(mapping); |
| 449 | } | 449 | } |
| 450 | 450 | ||
| 451 | /* | 451 | /* |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bdf..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -32,10 +32,13 @@ config GCOV_KERNEL | |||
| 32 | Note that the debugfs filesystem has to be mounted to access | 32 | Note that the debugfs filesystem has to be mounted to access |
| 33 | profiling data. | 33 | profiling data. |
| 34 | 34 | ||
| 35 | config ARCH_HAS_GCOV_PROFILE_ALL | ||
| 36 | def_bool n | ||
| 37 | |||
| 35 | config GCOV_PROFILE_ALL | 38 | config GCOV_PROFILE_ALL |
| 36 | bool "Profile entire Kernel" | 39 | bool "Profile entire Kernel" |
| 37 | depends on GCOV_KERNEL | 40 | depends on GCOV_KERNEL |
| 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 | 41 | depends on ARCH_HAS_GCOV_PROFILE_ALL |
| 39 | default n | 42 | default n |
| 40 | ---help--- | 43 | ---help--- |
| 41 | This option activates profiling for the entire kernel. | 44 | This option activates profiling for the entire kernel. |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | |||
| 600 | if (!kexec_on_panic) { | 600 | if (!kexec_on_panic) { |
| 601 | image->swap_page = kimage_alloc_control_pages(image, 0); | 601 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 602 | if (!image->swap_page) { | 602 | if (!image->swap_page) { |
| 603 | pr_err(KERN_ERR "Could not allocate swap buffer\n"); | 603 | pr_err("Could not allocate swap buffer\n"); |
| 604 | goto out_free_control_pages; | 604 | goto out_free_control_pages; |
| 605 | } | 605 | } |
| 606 | } | 606 | } |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
| @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
| 25 | } | 25 | } |
| 26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
| 27 | 27 | ||
| 28 | int snprint_stack_trace(char *buf, size_t size, | ||
| 29 | struct stack_trace *trace, int spaces) | ||
| 30 | { | ||
| 31 | int i; | ||
| 32 | unsigned long ip; | ||
| 33 | int generated; | ||
| 34 | int total = 0; | ||
| 35 | |||
| 36 | if (WARN_ON(!trace->entries)) | ||
| 37 | return 0; | ||
| 38 | |||
| 39 | for (i = 0; i < trace->nr_entries; i++) { | ||
| 40 | ip = trace->entries[i]; | ||
| 41 | generated = snprintf(buf, size, "%*c[<%p>] %pS\n", | ||
| 42 | 1 + spaces, ' ', (void *) ip, (void *) ip); | ||
| 43 | |||
| 44 | total += generated; | ||
| 45 | |||
| 46 | /* Assume that generated isn't a negative number */ | ||
| 47 | if (generated >= size) { | ||
| 48 | buf += size; | ||
| 49 | size = 0; | ||
| 50 | } else { | ||
| 51 | buf += generated; | ||
| 52 | size -= generated; | ||
| 53 | } | ||
| 54 | } | ||
| 55 | |||
| 56 | return total; | ||
| 57 | } | ||
| 58 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | ||
| 59 | |||
| 28 | /* | 60 | /* |
| 29 | * Architectures that do not implement save_stack_trace_tsk or | 61 | * Architectures that do not implement save_stack_trace_tsk or |
| 30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning | 62 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
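A hedged sketch of how a caller might use the new helper (not from the patch; the buffer depth and the pairing with save_stack_trace() are illustrative):

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>

	static int dump_current_stack(char *buf, size_t size)
	{
		unsigned long entries[16];
		struct stack_trace trace = {
			.max_entries	= ARRAY_SIZE(entries),
			.entries	= entries,
		};

		save_stack_trace(&trace);
		/* like snprintf(): returns the length that would have been written */
		return snprint_stack_trace(buf, size, &trace, 0);
	}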
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 61eea02b53f5..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -226,3 +226,6 @@ cond_syscall(sys_seccomp); | |||
| 226 | 226 | ||
| 227 | /* access BPF programs and maps */ | 227 | /* access BPF programs and maps */ |
| 228 | cond_syscall(sys_bpf); | 228 | cond_syscall(sys_bpf); |
| 229 | |||
| 230 | /* execveat */ | ||
| 231 | cond_syscall(sys_execveat); | ||
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d780351835e9..5f2ce616c046 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
| @@ -227,6 +227,22 @@ config UNUSED_SYMBOLS | |||
| 227 | you really need it, and what the merge plan to the mainline kernel for | 227 | you really need it, and what the merge plan to the mainline kernel for |
| 228 | your module is. | 228 | your module is. |
| 229 | 229 | ||
| 230 | config PAGE_OWNER | ||
| 231 | bool "Track page owner" | ||
| 232 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT | ||
| 233 | select DEBUG_FS | ||
| 234 | select STACKTRACE | ||
| 235 | select PAGE_EXTENSION | ||
| 236 | help | ||
| 237 | This keeps track of which call chain allocated a page, which may | ||
| 238 | help to find bare alloc_page(s) leaks. Even if you include this | ||
| 239 | feature in your build, it is disabled by default. You should pass | ||
| 240 | "page_owner=on" as a boot parameter in order to enable it. It eats | ||
| 241 | a fair amount of memory if enabled. See tools/vm/page_owner_sort.c | ||
| 242 | for the user-space helper. | ||
| 243 | |||
| 244 | If unsure, say N. | ||
| 245 | |||
| 230 | config DEBUG_FS | 246 | config DEBUG_FS |
| 231 | bool "Debug Filesystem" | 247 | bool "Debug Filesystem" |
| 232 | help | 248 | help |
diff --git a/lib/audit.c b/lib/audit.c index 1d726a22565b..b8fb5ee81e26 100644 --- a/lib/audit.c +++ b/lib/audit.c | |||
| @@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall) | |||
| 54 | case __NR_socketcall: | 54 | case __NR_socketcall: |
| 55 | return 4; | 55 | return 4; |
| 56 | #endif | 56 | #endif |
| 57 | #ifdef __NR_execveat | ||
| 58 | case __NR_execveat: | ||
| 59 | #endif | ||
| 57 | case __NR_execve: | 60 | case __NR_execve: |
| 58 | return 5; | 61 | return 5; |
| 59 | default: | 62 | default: |
diff --git a/lib/bitmap.c b/lib/bitmap.c index b499ab6ada29..969ae8fbc85b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c | |||
| @@ -326,30 +326,32 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len) | |||
| 326 | } | 326 | } |
| 327 | EXPORT_SYMBOL(bitmap_clear); | 327 | EXPORT_SYMBOL(bitmap_clear); |
| 328 | 328 | ||
| 329 | /* | 329 | /** |
| 330 | * bitmap_find_next_zero_area - find a contiguous aligned zero area | 330 | * bitmap_find_next_zero_area_off - find a contiguous aligned zero area |
| 331 | * @map: The address to base the search on | 331 | * @map: The address to base the search on |
| 332 | * @size: The bitmap size in bits | 332 | * @size: The bitmap size in bits |
| 333 | * @start: The bitnumber to start searching at | 333 | * @start: The bitnumber to start searching at |
| 334 | * @nr: The number of zeroed bits we're looking for | 334 | * @nr: The number of zeroed bits we're looking for |
| 335 | * @align_mask: Alignment mask for zero area | 335 | * @align_mask: Alignment mask for zero area |
| 336 | * @align_offset: Alignment offset for zero area. | ||
| 336 | * | 337 | * |
| 337 | * The @align_mask should be one less than a power of 2; the effect is that | 338 | * The @align_mask should be one less than a power of 2; the effect is that |
| 338 | * the bit offset of all zero areas this function finds is multiples of that | 339 | * the bit offset of all zero areas this function finds plus @align_offset |
| 339 | * power of 2. A @align_mask of 0 means no alignment is required. | 340 | * is multiple of that power of 2. |
| 340 | */ | 341 | */ |
| 341 | unsigned long bitmap_find_next_zero_area(unsigned long *map, | 342 | unsigned long bitmap_find_next_zero_area_off(unsigned long *map, |
| 342 | unsigned long size, | 343 | unsigned long size, |
| 343 | unsigned long start, | 344 | unsigned long start, |
| 344 | unsigned int nr, | 345 | unsigned int nr, |
| 345 | unsigned long align_mask) | 346 | unsigned long align_mask, |
| 347 | unsigned long align_offset) | ||
| 346 | { | 348 | { |
| 347 | unsigned long index, end, i; | 349 | unsigned long index, end, i; |
| 348 | again: | 350 | again: |
| 349 | index = find_next_zero_bit(map, size, start); | 351 | index = find_next_zero_bit(map, size, start); |
| 350 | 352 | ||
| 351 | /* Align allocation */ | 353 | /* Align allocation */ |
| 352 | index = __ALIGN_MASK(index, align_mask); | 354 | index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; |
| 353 | 355 | ||
| 354 | end = index + nr; | 356 | end = index + nr; |
| 355 | if (end > size) | 357 | if (end > size) |
| @@ -361,7 +363,7 @@ again: | |||
| 361 | } | 363 | } |
| 362 | return index; | 364 | return index; |
| 363 | } | 365 | } |
| 364 | EXPORT_SYMBOL(bitmap_find_next_zero_area); | 366 | EXPORT_SYMBOL(bitmap_find_next_zero_area_off); |
| 365 | 367 | ||
| 366 | /* | 368 | /* |
| 367 | * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, | 369 | * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, |
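A minimal sketch of the intended use of the new @align_offset argument (an assumption based on the CMA hunks later in this patch; it presumes one bitmap bit per page, i.e. order_per_bit == 0):

	#include <linux/bitmap.h>

	/*
	 * Find nr_pages free pages aligned to (1 << align_order) in a region
	 * whose first bitmap bit corresponds to base_pfn, which need not be
	 * aligned itself.
	 */
	static unsigned long find_aligned_bits(unsigned long *bitmap,
					       unsigned long bits,
					       unsigned long base_pfn,
					       unsigned int nr_pages,
					       unsigned int align_order)
	{
		unsigned long mask = (1UL << align_order) - 1;
		unsigned long offset = base_pfn & mask;	/* misalignment of the base */

		return bitmap_find_next_zero_area_off(bitmap, bits, 0, nr_pages,
						      mask, offset);
	}

Because the helper aligns (index + offset) rather than the raw index, the area it returns starts at a pfn that is itself aligned, not merely at an aligned bitmap position.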
diff --git a/lib/decompress.c b/lib/decompress.c index 37f3c786348f..528ff932d8e4 100644 --- a/lib/decompress.c +++ b/lib/decompress.c | |||
| @@ -44,8 +44,8 @@ struct compress_format { | |||
| 44 | }; | 44 | }; |
| 45 | 45 | ||
| 46 | static const struct compress_format compressed_formats[] __initconst = { | 46 | static const struct compress_format compressed_formats[] __initconst = { |
| 47 | { {037, 0213}, "gzip", gunzip }, | 47 | { {0x1f, 0x8b}, "gzip", gunzip }, |
| 48 | { {037, 0236}, "gzip", gunzip }, | 48 | { {0x1f, 0x9e}, "gzip", gunzip }, |
| 49 | { {0x42, 0x5a}, "bzip2", bunzip2 }, | 49 | { {0x42, 0x5a}, "bzip2", bunzip2 }, |
| 50 | { {0x5d, 0x00}, "lzma", unlzma }, | 50 | { {0x5d, 0x00}, "lzma", unlzma }, |
| 51 | { {0xfd, 0x37}, "xz", unxz }, | 51 | { {0xfd, 0x37}, "xz", unxz }, |
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 8290e0bef7ea..6dd0335ea61b 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c | |||
| @@ -184,7 +184,7 @@ static int INIT get_next_block(struct bunzip_data *bd) | |||
| 184 | if (get_bits(bd, 1)) | 184 | if (get_bits(bd, 1)) |
| 185 | return RETVAL_OBSOLETE_INPUT; | 185 | return RETVAL_OBSOLETE_INPUT; |
| 186 | origPtr = get_bits(bd, 24); | 186 | origPtr = get_bits(bd, 24); |
| 187 | if (origPtr > dbufSize) | 187 | if (origPtr >= dbufSize) |
| 188 | return RETVAL_DATA_ERROR; | 188 | return RETVAL_DATA_ERROR; |
| 189 | /* mapping table: if some byte values are never used (encoding things | 189 | /* mapping table: if some byte values are never used (encoding things |
| 190 | like ascii text), the compression code removes the gaps to have fewer | 190 | like ascii text), the compression code removes the gaps to have fewer |
diff --git a/lib/fault-inject.c b/lib/fault-inject.c index d7d501ea856d..f1cdeb024d17 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c | |||
| @@ -40,10 +40,16 @@ EXPORT_SYMBOL_GPL(setup_fault_attr); | |||
| 40 | 40 | ||
| 41 | static void fail_dump(struct fault_attr *attr) | 41 | static void fail_dump(struct fault_attr *attr) |
| 42 | { | 42 | { |
| 43 | if (attr->verbose > 0) | 43 | if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) { |
| 44 | printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure\n"); | 44 | printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n" |
| 45 | if (attr->verbose > 1) | 45 | "name %pd, probability %lu, interval %lu, " |
| 46 | dump_stack(); | 46 | "space %d, times %d\n", attr->dname, |
| 47 | attr->probability, attr->interval, | ||
| 48 | atomic_read(&attr->space), | ||
| 49 | atomic_read(&attr->times)); | ||
| 50 | if (attr->verbose > 1) | ||
| 51 | dump_stack(); | ||
| 52 | } | ||
| 47 | } | 53 | } |
| 48 | 54 | ||
| 49 | #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) | 55 | #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) |
| @@ -202,6 +208,12 @@ struct dentry *fault_create_debugfs_attr(const char *name, | |||
| 202 | goto fail; | 208 | goto fail; |
| 203 | if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) | 209 | if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) |
| 204 | goto fail; | 210 | goto fail; |
| 211 | if (!debugfs_create_u32("verbose_ratelimit_interval_ms", mode, dir, | ||
| 212 | &attr->ratelimit_state.interval)) | ||
| 213 | goto fail; | ||
| 214 | if (!debugfs_create_u32("verbose_ratelimit_burst", mode, dir, | ||
| 215 | &attr->ratelimit_state.burst)) | ||
| 216 | goto fail; | ||
| 205 | if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) | 217 | if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) |
| 206 | goto fail; | 218 | goto fail; |
| 207 | 219 | ||
| @@ -222,6 +234,7 @@ struct dentry *fault_create_debugfs_attr(const char *name, | |||
| 222 | 234 | ||
| 223 | #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ | 235 | #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ |
| 224 | 236 | ||
| 237 | attr->dname = dget(dir); | ||
| 225 | return dir; | 238 | return dir; |
| 226 | fail: | 239 | fail: |
| 227 | debugfs_remove_recursive(dir); | 240 | debugfs_remove_recursive(dir); |
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 4b2443254de2..56badfc4810a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
| @@ -1,8 +1,18 @@ | |||
| 1 | config PAGE_EXTENSION | ||
| 2 | bool "Extend memmap on extra space for more information on page" | ||
| 3 | ---help--- | ||
| 4 | Extend memmap on extra space for more information on page. This | ||
| 5 | could be used for debugging features that need to insert an extra | ||
| 6 | field for every page. This extension enables us to save memory | ||
| 7 | by not allocating this extra memory according to boot-time | ||
| 8 | configuration. | ||
| 9 | |||
| 1 | config DEBUG_PAGEALLOC | 10 | config DEBUG_PAGEALLOC |
| 2 | bool "Debug page memory allocations" | 11 | bool "Debug page memory allocations" |
| 3 | depends on DEBUG_KERNEL | 12 | depends on DEBUG_KERNEL |
| 4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC | 13 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
| 5 | depends on !KMEMCHECK | 14 | depends on !KMEMCHECK |
| 15 | select PAGE_EXTENSION | ||
| 6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
| 7 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC | 17 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC |
| 8 | ---help--- | 18 | ---help--- |
diff --git a/mm/Makefile b/mm/Makefile index b3c6ce932c64..4bf586e66378 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
| @@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | |||
| 63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
| 64 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 64 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
| 65 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 65 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
| 66 | obj-$(CONFIG_PAGE_OWNER) += page_owner.o | ||
| 66 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 67 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
| 67 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | 68 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o |
| 68 | obj-$(CONFIG_ZPOOL) += zpool.o | 69 | obj-$(CONFIG_ZPOOL) += zpool.o |
| @@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | |||
| 71 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | 72 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o |
| 72 | obj-$(CONFIG_CMA) += cma.o | 73 | obj-$(CONFIG_CMA) += cma.o |
| 73 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | 74 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
| 75 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | ||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/log2.h> | 33 | #include <linux/log2.h> |
| 34 | #include <linux/cma.h> | 34 | #include <linux/cma.h> |
| 35 | #include <linux/highmem.h> | 35 | #include <linux/highmem.h> |
| 36 | #include <linux/io.h> | ||
| 36 | 37 | ||
| 37 | struct cma { | 38 | struct cma { |
| 38 | unsigned long base_pfn; | 39 | unsigned long base_pfn; |
| @@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) | |||
| 63 | return (1UL << (align_order - cma->order_per_bit)) - 1; | 64 | return (1UL << (align_order - cma->order_per_bit)) - 1; |
| 64 | } | 65 | } |
| 65 | 66 | ||
| 67 | static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) | ||
| 68 | { | ||
| 69 | unsigned int alignment; | ||
| 70 | |||
| 71 | if (align_order <= cma->order_per_bit) | ||
| 72 | return 0; | ||
| 73 | alignment = 1UL << (align_order - cma->order_per_bit); | ||
| 74 | return ALIGN(cma->base_pfn, alignment) - | ||
| 75 | (cma->base_pfn >> cma->order_per_bit); | ||
| 76 | } | ||
| 77 | |||
| 66 | static unsigned long cma_bitmap_maxno(struct cma *cma) | 78 | static unsigned long cma_bitmap_maxno(struct cma *cma) |
| 67 | { | 79 | { |
| 68 | return cma->count >> cma->order_per_bit; | 80 | return cma->count >> cma->order_per_bit; |
| @@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
| 313 | } | 325 | } |
| 314 | } | 326 | } |
| 315 | 327 | ||
| 328 | /* | ||
| 329 | * kmemleak scans/reads tracked objects for pointers to other | ||
| 330 | * objects but this address isn't mapped and accessible | ||
| 331 | */ | ||
| 332 | kmemleak_ignore(phys_to_virt(addr)); | ||
| 316 | base = addr; | 333 | base = addr; |
| 317 | } | 334 | } |
| 318 | 335 | ||
| @@ -340,7 +357,7 @@ err: | |||
| 340 | */ | 357 | */ |
| 341 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | 358 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) |
| 342 | { | 359 | { |
| 343 | unsigned long mask, pfn, start = 0; | 360 | unsigned long mask, offset, pfn, start = 0; |
| 344 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 361 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
| 345 | struct page *page = NULL; | 362 | struct page *page = NULL; |
| 346 | int ret; | 363 | int ret; |
| @@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | |||
| 355 | return NULL; | 372 | return NULL; |
| 356 | 373 | ||
| 357 | mask = cma_bitmap_aligned_mask(cma, align); | 374 | mask = cma_bitmap_aligned_mask(cma, align); |
| 375 | offset = cma_bitmap_aligned_offset(cma, align); | ||
| 358 | bitmap_maxno = cma_bitmap_maxno(cma); | 376 | bitmap_maxno = cma_bitmap_maxno(cma); |
| 359 | bitmap_count = cma_bitmap_pages_to_bits(cma, count); | 377 | bitmap_count = cma_bitmap_pages_to_bits(cma, count); |
| 360 | 378 | ||
| 361 | for (;;) { | 379 | for (;;) { |
| 362 | mutex_lock(&cma->lock); | 380 | mutex_lock(&cma->lock); |
| 363 | bitmap_no = bitmap_find_next_zero_area(cma->bitmap, | 381 | bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, |
| 364 | bitmap_maxno, start, bitmap_count, mask); | 382 | bitmap_maxno, start, bitmap_count, mask, |
| 383 | offset); | ||
| 365 | if (bitmap_no >= bitmap_maxno) { | 384 | if (bitmap_no >= bitmap_maxno) { |
| 366 | mutex_unlock(&cma->lock); | 385 | mutex_unlock(&cma->lock); |
| 367 | break; | 386 | break; |
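
Editor's note: the new cma_bitmap_aligned_offset()/bitmap_find_next_zero_area_off() pair above lets the bitmap search take the region's physical base into account when the caller asks for an aligned allocation and base_pfn is not itself aligned to that order. A small userspace sketch of the offset arithmetic for the simple case order_per_bit == 0 (one bitmap bit per page); the numbers are made up.

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long base_pfn = 1152;	/* region base, deliberately not 256-page aligned */
	int align_order = 8;		/* caller asks for a 256-page aligned block */
	unsigned long alignment = 1UL << align_order;
	unsigned long offset = ALIGN(base_pfn, alignment) - base_pfn;

	/* Bit 128 of the bitmap maps to pfn 1280, the first 256-page aligned pfn. */
	printf("bitmap search offset = %lu (pfn %lu)\n", offset, base_pfn + offset);
	return 0;
}
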
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 789ff70c8a4a..5bf5906ce13b 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c | |||
| @@ -2,23 +2,55 @@ | |||
| 2 | #include <linux/string.h> | 2 | #include <linux/string.h> |
| 3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
| 4 | #include <linux/highmem.h> | 4 | #include <linux/highmem.h> |
| 5 | #include <linux/page-debug-flags.h> | 5 | #include <linux/page_ext.h> |
| 6 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
| 7 | #include <linux/ratelimit.h> | 7 | #include <linux/ratelimit.h> |
| 8 | 8 | ||
| 9 | static bool page_poisoning_enabled __read_mostly; | ||
| 10 | |||
| 11 | static bool need_page_poisoning(void) | ||
| 12 | { | ||
| 13 | if (!debug_pagealloc_enabled()) | ||
| 14 | return false; | ||
| 15 | |||
| 16 | return true; | ||
| 17 | } | ||
| 18 | |||
| 19 | static void init_page_poisoning(void) | ||
| 20 | { | ||
| 21 | if (!debug_pagealloc_enabled()) | ||
| 22 | return; | ||
| 23 | |||
| 24 | page_poisoning_enabled = true; | ||
| 25 | } | ||
| 26 | |||
| 27 | struct page_ext_operations page_poisoning_ops = { | ||
| 28 | .need = need_page_poisoning, | ||
| 29 | .init = init_page_poisoning, | ||
| 30 | }; | ||
| 31 | |||
| 9 | static inline void set_page_poison(struct page *page) | 32 | static inline void set_page_poison(struct page *page) |
| 10 | { | 33 | { |
| 11 | __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 34 | struct page_ext *page_ext; |
| 35 | |||
| 36 | page_ext = lookup_page_ext(page); | ||
| 37 | __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
| 12 | } | 38 | } |
| 13 | 39 | ||
| 14 | static inline void clear_page_poison(struct page *page) | 40 | static inline void clear_page_poison(struct page *page) |
| 15 | { | 41 | { |
| 16 | __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 42 | struct page_ext *page_ext; |
| 43 | |||
| 44 | page_ext = lookup_page_ext(page); | ||
| 45 | __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
| 17 | } | 46 | } |
| 18 | 47 | ||
| 19 | static inline bool page_poison(struct page *page) | 48 | static inline bool page_poison(struct page *page) |
| 20 | { | 49 | { |
| 21 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 50 | struct page_ext *page_ext; |
| 51 | |||
| 52 | page_ext = lookup_page_ext(page); | ||
| 53 | return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
| 22 | } | 54 | } |
| 23 | 55 | ||
| 24 | static void poison_page(struct page *page) | 56 | static void poison_page(struct page *page) |
| @@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n) | |||
| 93 | unpoison_page(page + i); | 125 | unpoison_page(page + i); |
| 94 | } | 126 | } |
| 95 | 127 | ||
| 96 | void kernel_map_pages(struct page *page, int numpages, int enable) | 128 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 97 | { | 129 | { |
| 130 | if (!page_poisoning_enabled) | ||
| 131 | return; | ||
| 132 | |||
| 98 | if (enable) | 133 | if (enable) |
| 99 | unpoison_pages(page, numpages); | 134 | unpoison_pages(page, numpages); |
| 100 | else | 135 | else |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 3bcfd81db45e..2ad7adf4f0a4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
| @@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, | 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
| 118 | WB_SYNC_NONE); | 118 | WB_SYNC_NONE); |
| 119 | 119 | ||
| 120 | /* First and last FULL page! */ | 120 | /* |
| 121 | * First and last FULL page! Partial pages are deliberately | ||
| 122 | * preserved on the expectation that it is better to preserve | ||
| 123 | * needed memory than to discard unneeded memory. | ||
| 124 | */ | ||
| 121 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 125 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
| 122 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 126 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
| 123 | 127 | ||
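
Editor's note: the expanded comment above explains the asymmetric rounding that follows it — a page only partially covered by the advice range is deliberately kept in the page cache. A throwaway userspace rendition of the index arithmetic, assuming 4 KiB pages and invented byte offsets.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12		/* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long offset = 100;	/* invented request: bytes 100..12287 */
	unsigned long long endbyte = 12287;

	unsigned long start_index = (offset + (PAGE_CACHE_SIZE - 1)) >> PAGE_CACHE_SHIFT;
	unsigned long end_index = endbyte >> PAGE_CACHE_SHIFT;

	/* Prints "1..2": page 0 is only partially covered, so it stays cached. */
	printf("invalidate pages %lu..%lu\n", start_index, end_index);
	return 0;
}
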
diff --git a/mm/filemap.c b/mm/filemap.c index 14b4642279f1..e8905bc3cbd7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -62,16 +62,16 @@ | |||
| 62 | /* | 62 | /* |
| 63 | * Lock ordering: | 63 | * Lock ordering: |
| 64 | * | 64 | * |
| 65 | * ->i_mmap_mutex (truncate_pagecache) | 65 | * ->i_mmap_rwsem (truncate_pagecache) |
| 66 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 66 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
| 67 | * ->swap_lock (exclusive_swap_page, others) | 67 | * ->swap_lock (exclusive_swap_page, others) |
| 68 | * ->mapping->tree_lock | 68 | * ->mapping->tree_lock |
| 69 | * | 69 | * |
| 70 | * ->i_mutex | 70 | * ->i_mutex |
| 71 | * ->i_mmap_mutex (truncate->unmap_mapping_range) | 71 | * ->i_mmap_rwsem (truncate->unmap_mapping_range) |
| 72 | * | 72 | * |
| 73 | * ->mmap_sem | 73 | * ->mmap_sem |
| 74 | * ->i_mmap_mutex | 74 | * ->i_mmap_rwsem |
| 75 | * ->page_table_lock or pte_lock (various, mainly in memory.c) | 75 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
| 76 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 76 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
| 77 | * | 77 | * |
| @@ -85,7 +85,7 @@ | |||
| 85 | * sb_lock (fs/fs-writeback.c) | 85 | * sb_lock (fs/fs-writeback.c) |
| 86 | * ->mapping->tree_lock (__sync_single_inode) | 86 | * ->mapping->tree_lock (__sync_single_inode) |
| 87 | * | 87 | * |
| 88 | * ->i_mmap_mutex | 88 | * ->i_mmap_rwsem |
| 89 | * ->anon_vma.lock (vma_adjust) | 89 | * ->anon_vma.lock (vma_adjust) |
| 90 | * | 90 | * |
| 91 | * ->anon_vma.lock | 91 | * ->anon_vma.lock |
| @@ -105,7 +105,7 @@ | |||
| 105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
| 106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
| 107 | * | 107 | * |
| 108 | * ->i_mmap_mutex | 108 | * ->i_mmap_rwsem |
| 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
| 110 | */ | 110 | */ |
| 111 | 111 | ||
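
Editor's note: the comment updates above, and the lock-call conversions throughout the rest of this series, assume i_mmap_mutex has become an rw_semaphore (i_mmap_rwsem) with thin wrappers defined elsewhere in the patch set, presumably in include/linux/fs.h. A sketch of what those wrappers are expected to look like, purely to make the converted call sites easier to read; these are assumed definitions, not quoted from this diff.

#include <linux/fs.h>
#include <linux/rwsem.h>

/* Expected shape of the helpers used by the hunks below (assumed, not quoted). */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
	down_write(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_write(struct address_space *mapping)
{
	up_write(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_lock_read(struct address_space *mapping)
{
	down_read(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_read(struct address_space *mapping)
{
	up_read(&mapping->i_mmap_rwsem);
}
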
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index d8d9fe3f685c..0d105aeff82f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | |||
| 155 | EXPORT_SYMBOL_GPL(xip_file_read); | 155 | EXPORT_SYMBOL_GPL(xip_file_read); |
| 156 | 156 | ||
| 157 | /* | 157 | /* |
| 158 | * __xip_unmap is invoked from xip_unmap and | 158 | * __xip_unmap is invoked from xip_unmap and xip_write |
| 159 | * xip_write | ||
| 160 | * | 159 | * |
| 161 | * This function walks all vmas of the address_space and unmaps the | 160 | * This function walks all vmas of the address_space and unmaps the |
| 162 | * __xip_sparse_page when found at pgoff. | 161 | * __xip_sparse_page when found at pgoff. |
| 163 | */ | 162 | */ |
| 164 | static void | 163 | static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) |
| 165 | __xip_unmap (struct address_space * mapping, | ||
| 166 | unsigned long pgoff) | ||
| 167 | { | 164 | { |
| 168 | struct vm_area_struct *vma; | 165 | struct vm_area_struct *vma; |
| 169 | struct mm_struct *mm; | ||
| 170 | unsigned long address; | ||
| 171 | pte_t *pte; | ||
| 172 | pte_t pteval; | ||
| 173 | spinlock_t *ptl; | ||
| 174 | struct page *page; | 166 | struct page *page; |
| 175 | unsigned count; | 167 | unsigned count; |
| 176 | int locked = 0; | 168 | int locked = 0; |
| @@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping, | |||
| 182 | return; | 174 | return; |
| 183 | 175 | ||
| 184 | retry: | 176 | retry: |
| 185 | mutex_lock(&mapping->i_mmap_mutex); | 177 | i_mmap_lock_read(mapping); |
| 186 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 178 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 187 | mm = vma->vm_mm; | 179 | pte_t *pte, pteval; |
| 188 | address = vma->vm_start + | 180 | spinlock_t *ptl; |
| 181 | struct mm_struct *mm = vma->vm_mm; | ||
| 182 | unsigned long address = vma->vm_start + | ||
| 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 183 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
| 184 | |||
| 190 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 191 | pte = page_check_address(page, mm, address, &ptl, 1); | 186 | pte = page_check_address(page, mm, address, &ptl, 1); |
| 192 | if (pte) { | 187 | if (pte) { |
| @@ -202,7 +197,7 @@ retry: | |||
| 202 | page_cache_release(page); | 197 | page_cache_release(page); |
| 203 | } | 198 | } |
| 204 | } | 199 | } |
| 205 | mutex_unlock(&mapping->i_mmap_mutex); | 200 | i_mmap_unlock_read(mapping); |
| 206 | 201 | ||
| 207 | if (locked) { | 202 | if (locked) { |
| 208 | mutex_unlock(&xip_sparse_mutex); | 203 | mutex_unlock(&xip_sparse_mutex); |
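
Editor's note: the tidied loop above derives the user virtual address of file page pgoff inside each mapping vma as vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT). A standalone userspace check of that arithmetic with invented numbers.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	unsigned long long vm_start = 0x7f0000000000ULL;	/* invented vma start */
	unsigned long vm_pgoff = 16;	/* vma begins at file page 16 */
	unsigned long pgoff = 20;	/* file page we want to unmap */

	unsigned long long address =
		vm_start + ((unsigned long long)(pgoff - vm_pgoff) << PAGE_SHIFT);

	/* File page 20 is 4 pages into the vma: prints 0x7f0000004000. */
	printf("address = 0x%llx\n", address);
	return 0;
}
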
diff --git a/mm/fremap.c b/mm/fremap.c index 72b8fa361433..11ef7ec40d13 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
| @@ -238,13 +238,13 @@ get_write_lock: | |||
| 238 | } | 238 | } |
| 239 | goto out_freed; | 239 | goto out_freed; |
| 240 | } | 240 | } |
| 241 | mutex_lock(&mapping->i_mmap_mutex); | 241 | i_mmap_lock_write(mapping); |
| 242 | flush_dcache_mmap_lock(mapping); | 242 | flush_dcache_mmap_lock(mapping); |
| 243 | vma->vm_flags |= VM_NONLINEAR; | 243 | vma->vm_flags |= VM_NONLINEAR; |
| 244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 244 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
| 245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
| 246 | flush_dcache_mmap_unlock(mapping); | 246 | flush_dcache_mmap_unlock(mapping); |
| 247 | mutex_unlock(&mapping->i_mmap_mutex); | 247 | i_mmap_unlock_write(mapping); |
| 248 | } | 248 | } |
| 249 | 249 | ||
| 250 | if (vma->vm_flags & VM_LOCKED) { | 250 | if (vma->vm_flags & VM_LOCKED) { |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 919b86a2164d..47f6070d7c46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
| 1457 | return 0; | 1457 | return 0; |
| 1458 | 1458 | ||
| 1459 | found: | 1459 | found: |
| 1460 | BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); | 1460 | BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); |
| 1461 | /* Put them into a private list first because mem_map is not up yet */ | 1461 | /* Put them into a private list first because mem_map is not up yet */ |
| 1462 | list_add(&m->list, &huge_boot_pages); | 1462 | list_add(&m->list, &huge_boot_pages); |
| 1463 | m->hstate = h; | 1463 | m->hstate = h; |
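
Editor's note: the hunk above is a readability change — IS_ALIGNED(x, a) is the same power-of-two mask test as the open-coded expression it replaces. A quick userspace check with invented values.

#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)	/* same test as the kernel macro */

int main(void)
{
	unsigned long huge_size = 1UL << 21;	/* assume 2 MiB huge pages */
	unsigned long ok = 0x40000000UL;	/* invented physical addresses */
	unsigned long bad = 0x40001000UL;

	/* Prints "ok: 1 1" and "bad: 0 0": both forms agree. */
	printf("ok:  %d %d\n", IS_ALIGNED(ok, huge_size), (ok & (huge_size - 1)) == 0);
	printf("bad: %d %d\n", IS_ALIGNED(bad, huge_size), (bad & (huge_size - 1)) == 0);
	return 0;
}
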
| @@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node) | |||
| 2083 | * devices of nodes that have memory. All on-line nodes should have | 2083 | * devices of nodes that have memory. All on-line nodes should have |
| 2084 | * registered their associated device by this time. | 2084 | * registered their associated device by this time. |
| 2085 | */ | 2085 | */ |
| 2086 | static void hugetlb_register_all_nodes(void) | 2086 | static void __init hugetlb_register_all_nodes(void) |
| 2087 | { | 2087 | { |
| 2088 | int nid; | 2088 | int nid; |
| 2089 | 2089 | ||
| @@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, | |||
| 2726 | * on its way out. We're lucky that the flag has such an appropriate | 2726 | * on its way out. We're lucky that the flag has such an appropriate |
| 2727 | * name, and can in fact be safely cleared here. We could clear it | 2727 | * name, and can in fact be safely cleared here. We could clear it |
| 2728 | * before the __unmap_hugepage_range above, but all that's necessary | 2728 | * before the __unmap_hugepage_range above, but all that's necessary |
| 2729 | * is to clear it before releasing the i_mmap_mutex. This works | 2729 | * is to clear it before releasing the i_mmap_rwsem. This works |
| 2730 | * because in the context this is called, the VMA is about to be | 2730 | * because in the context this is called, the VMA is about to be |
| 2731 | * destroyed and the i_mmap_mutex is held. | 2731 | * destroyed and the i_mmap_rwsem is held. |
| 2732 | */ | 2732 | */ |
| 2733 | vma->vm_flags &= ~VM_MAYSHARE; | 2733 | vma->vm_flags &= ~VM_MAYSHARE; |
| 2734 | } | 2734 | } |
| @@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2774 | * this mapping should be shared between all the VMAs, | 2774 | * this mapping should be shared between all the VMAs, |
| 2775 | * __unmap_hugepage_range() is called as the lock is already held | 2775 | * __unmap_hugepage_range() is called as the lock is already held |
| 2776 | */ | 2776 | */ |
| 2777 | mutex_lock(&mapping->i_mmap_mutex); | 2777 | i_mmap_lock_write(mapping); |
| 2778 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { | 2778 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
| 2779 | /* Do not unmap the current VMA */ | 2779 | /* Do not unmap the current VMA */ |
| 2780 | if (iter_vma == vma) | 2780 | if (iter_vma == vma) |
| @@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2791 | unmap_hugepage_range(iter_vma, address, | 2791 | unmap_hugepage_range(iter_vma, address, |
| 2792 | address + huge_page_size(h), page); | 2792 | address + huge_page_size(h), page); |
| 2793 | } | 2793 | } |
| 2794 | mutex_unlock(&mapping->i_mmap_mutex); | 2794 | i_mmap_unlock_write(mapping); |
| 2795 | } | 2795 | } |
| 2796 | 2796 | ||
| 2797 | /* | 2797 | /* |
| @@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 3348 | flush_cache_range(vma, address, end); | 3348 | flush_cache_range(vma, address, end); |
| 3349 | 3349 | ||
| 3350 | mmu_notifier_invalidate_range_start(mm, start, end); | 3350 | mmu_notifier_invalidate_range_start(mm, start, end); |
| 3351 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3351 | i_mmap_lock_write(vma->vm_file->f_mapping); |
| 3352 | for (; address < end; address += huge_page_size(h)) { | 3352 | for (; address < end; address += huge_page_size(h)) { |
| 3353 | spinlock_t *ptl; | 3353 | spinlock_t *ptl; |
| 3354 | ptep = huge_pte_offset(mm, address); | 3354 | ptep = huge_pte_offset(mm, address); |
| @@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 3370 | spin_unlock(ptl); | 3370 | spin_unlock(ptl); |
| 3371 | } | 3371 | } |
| 3372 | /* | 3372 | /* |
| 3373 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | 3373 | * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare |
| 3374 | * may have cleared our pud entry and done put_page on the page table: | 3374 | * may have cleared our pud entry and done put_page on the page table: |
| 3375 | * once we release i_mmap_mutex, another task can do the final put_page | 3375 | * once we release i_mmap_rwsem, another task can do the final put_page |
| 3376 | * and that page table be reused and filled with junk. | 3376 | * and that page table be reused and filled with junk. |
| 3377 | */ | 3377 | */ |
| 3378 | flush_tlb_range(vma, start, end); | 3378 | flush_tlb_range(vma, start, end); |
| 3379 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3379 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
| 3380 | mmu_notifier_invalidate_range_end(mm, start, end); | 3380 | mmu_notifier_invalidate_range_end(mm, start, end); |
| 3381 | 3381 | ||
| 3382 | return pages << h->order; | 3382 | return pages << h->order; |
| @@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | |||
| 3525 | * and returns the corresponding pte. While this is not necessary for the | 3525 | * and returns the corresponding pte. While this is not necessary for the |
| 3526 | * !shared pmd case because we can allocate the pmd later as well, it makes the | 3526 | * !shared pmd case because we can allocate the pmd later as well, it makes the |
| 3527 | * code much cleaner. pmd allocation is essential for the shared case because | 3527 | * code much cleaner. pmd allocation is essential for the shared case because |
| 3528 | * pud has to be populated inside the same i_mmap_mutex section - otherwise | 3528 | * pud has to be populated inside the same i_mmap_rwsem section - otherwise |
| 3529 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a | 3529 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a |
| 3530 | * bad pmd for sharing. | 3530 | * bad pmd for sharing. |
| 3531 | */ | 3531 | */ |
| @@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3544 | if (!vma_shareable(vma, addr)) | 3544 | if (!vma_shareable(vma, addr)) |
| 3545 | return (pte_t *)pmd_alloc(mm, pud, addr); | 3545 | return (pte_t *)pmd_alloc(mm, pud, addr); |
| 3546 | 3546 | ||
| 3547 | mutex_lock(&mapping->i_mmap_mutex); | 3547 | i_mmap_lock_write(mapping); |
| 3548 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { | 3548 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
| 3549 | if (svma == vma) | 3549 | if (svma == vma) |
| 3550 | continue; | 3550 | continue; |
| @@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3572 | spin_unlock(ptl); | 3572 | spin_unlock(ptl); |
| 3573 | out: | 3573 | out: |
| 3574 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3574 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
| 3575 | mutex_unlock(&mapping->i_mmap_mutex); | 3575 | i_mmap_unlock_write(mapping); |
| 3576 | return pte; | 3576 | return pte; |
| 3577 | } | 3577 | } |
| 3578 | 3578 | ||
diff --git a/mm/memblock.c b/mm/memblock.c index 6ecb0d937fb5..252b77bdf65e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
| 715 | } | 715 | } |
| 716 | 716 | ||
| 717 | /** | 717 | /** |
| 718 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
| 719 | * @base: the base phys addr of the region | ||
| 720 | * @size: the size of the region | ||
| 721 | * | 718 | * |
| 722 | * This function isolates region [@base, @base + @size), and mark it with flag | 719 | * This function isolates region [@base, @base + @size), and sets/clears flag |
| 723 | * MEMBLOCK_HOTPLUG. | ||
| 724 | * | 720 | * |
| 725 | * Return 0 on succees, -errno on failure. | 721 | * Return 0 on succees, -errno on failure. |
| 726 | */ | 722 | */ |
| 727 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | 723 | static int __init_memblock memblock_setclr_flag(phys_addr_t base, |
| 724 | phys_addr_t size, int set, int flag) | ||
| 728 | { | 725 | { |
| 729 | struct memblock_type *type = &memblock.memory; | 726 | struct memblock_type *type = &memblock.memory; |
| 730 | int i, ret, start_rgn, end_rgn; | 727 | int i, ret, start_rgn, end_rgn; |
| @@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | |||
| 734 | return ret; | 731 | return ret; |
| 735 | 732 | ||
| 736 | for (i = start_rgn; i < end_rgn; i++) | 733 | for (i = start_rgn; i < end_rgn; i++) |
| 737 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | 734 | if (set) |
| 735 | memblock_set_region_flags(&type->regions[i], flag); | ||
| 736 | else | ||
| 737 | memblock_clear_region_flags(&type->regions[i], flag); | ||
| 738 | 738 | ||
| 739 | memblock_merge_regions(type); | 739 | memblock_merge_regions(type); |
| 740 | return 0; | 740 | return 0; |
| 741 | } | 741 | } |
| 742 | 742 | ||
| 743 | /** | 743 | /** |
| 744 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | 744 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. |
| 745 | * @base: the base phys addr of the region | 745 | * @base: the base phys addr of the region |
| 746 | * @size: the size of the region | 746 | * @size: the size of the region |
| 747 | * | 747 | * |
| 748 | * This function isolates region [@base, @base + @size), and clear flag | 748 | * Return 0 on succees, -errno on failure. |
| 749 | * MEMBLOCK_HOTPLUG for the isolated regions. | 749 | */ |
| 750 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
| 751 | { | ||
| 752 | return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); | ||
| 753 | } | ||
| 754 | |||
| 755 | /** | ||
| 756 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
| 757 | * @base: the base phys addr of the region | ||
| 758 | * @size: the size of the region | ||
| 750 | * | 759 | * |
| 751 | * Return 0 on succees, -errno on failure. | 760 | * Return 0 on succees, -errno on failure. |
| 752 | */ | 761 | */ |
| 753 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | 762 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) |
| 754 | { | 763 | { |
| 755 | struct memblock_type *type = &memblock.memory; | 764 | return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); |
| 756 | int i, ret, start_rgn, end_rgn; | ||
| 757 | |||
| 758 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
| 759 | if (ret) | ||
| 760 | return ret; | ||
| 761 | |||
| 762 | for (i = start_rgn; i < end_rgn; i++) | ||
| 763 | memblock_clear_region_flags(&type->regions[i], | ||
| 764 | MEMBLOCK_HOTPLUG); | ||
| 765 | |||
| 766 | memblock_merge_regions(type); | ||
| 767 | return 0; | ||
| 768 | } | 765 | } |
| 769 | 766 | ||
| 770 | /** | 767 | /** |
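
Editor's note: memblock_mark_hotplug() and memblock_clear_hotplug() above now both delegate to memblock_setclr_flag(); their external behaviour is unchanged. A hedged sketch of a typical caller, loosely modelled on early NUMA code that flags firmware-reported hot-pluggable ranges — the function name, range and error handling are invented for illustration.

#include <linux/init.h>
#include <linux/memblock.h>

static int __init example_mark_hotplug_range(void)
{
	phys_addr_t base = 0x100000000ULL;	/* invented: range starting at 4 GiB */
	phys_addr_t size = 0x40000000ULL;	/* invented: 1 GiB */

	if (memblock_mark_hotplug(base, size))
		return -EINVAL;

	/* ... and if hotplug turns out to be unusable, drop the flag again. */
	memblock_clear_hotplug(base, size);
	return 0;
}
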
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85df503ec023..ef91e856c7e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -296,7 +296,6 @@ struct mem_cgroup { | |||
| 296 | * Should the accounting and control be hierarchical, per subtree? | 296 | * Should the accounting and control be hierarchical, per subtree? |
| 297 | */ | 297 | */ |
| 298 | bool use_hierarchy; | 298 | bool use_hierarchy; |
| 299 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | ||
| 300 | 299 | ||
| 301 | bool oom_lock; | 300 | bool oom_lock; |
| 302 | atomic_t under_oom; | 301 | atomic_t under_oom; |
| @@ -366,22 +365,11 @@ struct mem_cgroup { | |||
| 366 | /* WARNING: nodeinfo must be the last member here */ | 365 | /* WARNING: nodeinfo must be the last member here */ |
| 367 | }; | 366 | }; |
| 368 | 367 | ||
| 369 | /* internal only representation about the status of kmem accounting. */ | ||
| 370 | enum { | ||
| 371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ | ||
| 372 | }; | ||
| 373 | |||
| 374 | #ifdef CONFIG_MEMCG_KMEM | 368 | #ifdef CONFIG_MEMCG_KMEM |
| 375 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | ||
| 376 | { | ||
| 377 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
| 378 | } | ||
| 379 | |||
| 380 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 369 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
| 381 | { | 370 | { |
| 382 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 371 | return memcg->kmemcg_id >= 0; |
| 383 | } | 372 | } |
| 384 | |||
| 385 | #endif | 373 | #endif |
| 386 | 374 | ||
| 387 | /* Stuffs for move charges at task migration. */ | 375 | /* Stuffs for move charges at task migration. */ |
| @@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1571 | * select it. The goal is to allow it to allocate so that it may | 1559 | * select it. The goal is to allow it to allocate so that it may |
| 1572 | * quickly exit and free its memory. | 1560 | * quickly exit and free its memory. |
| 1573 | */ | 1561 | */ |
| 1574 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 1562 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
| 1575 | set_thread_flag(TIF_MEMDIE); | 1563 | set_thread_flag(TIF_MEMDIE); |
| 1576 | return; | 1564 | return; |
| 1577 | } | 1565 | } |
| @@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1628 | NULL, "Memory cgroup out of memory"); | 1616 | NULL, "Memory cgroup out of memory"); |
| 1629 | } | 1617 | } |
| 1630 | 1618 | ||
| 1619 | #if MAX_NUMNODES > 1 | ||
| 1620 | |||
| 1631 | /** | 1621 | /** |
| 1632 | * test_mem_cgroup_node_reclaimable | 1622 | * test_mem_cgroup_node_reclaimable |
| 1633 | * @memcg: the target memcg | 1623 | * @memcg: the target memcg |
| @@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
| 1650 | return false; | 1640 | return false; |
| 1651 | 1641 | ||
| 1652 | } | 1642 | } |
| 1653 | #if MAX_NUMNODES > 1 | ||
| 1654 | 1643 | ||
| 1655 | /* | 1644 | /* |
| 1656 | * Always updating the nodemask is not very good - even if we have an empty | 1645 | * Always updating the nodemask is not very good - even if we have an empty |
| @@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg, | |||
| 2646 | if (!cachep) | 2635 | if (!cachep) |
| 2647 | return; | 2636 | return; |
| 2648 | 2637 | ||
| 2649 | css_get(&memcg->css); | ||
| 2650 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | 2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
| 2651 | 2639 | ||
| 2652 | /* | 2640 | /* |
| @@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) | |||
| 2680 | list_del(&cachep->memcg_params->list); | 2668 | list_del(&cachep->memcg_params->list); |
| 2681 | 2669 | ||
| 2682 | kmem_cache_destroy(cachep); | 2670 | kmem_cache_destroy(cachep); |
| 2683 | |||
| 2684 | /* drop the reference taken in memcg_register_cache */ | ||
| 2685 | css_put(&memcg->css); | ||
| 2686 | } | ||
| 2687 | |||
| 2688 | /* | ||
| 2689 | * During the creation a new cache, we need to disable our accounting mechanism | ||
| 2690 | * altogether. This is true even if we are not creating, but rather just | ||
| 2691 | * enqueing new caches to be created. | ||
| 2692 | * | ||
| 2693 | * This is because that process will trigger allocations; some visible, like | ||
| 2694 | * explicit kmallocs to auxiliary data structures, name strings and internal | ||
| 2695 | * cache structures; some well concealed, like INIT_WORK() that can allocate | ||
| 2696 | * objects during debug. | ||
| 2697 | * | ||
| 2698 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | ||
| 2699 | * to it. This may not be a bounded recursion: since the first cache creation | ||
| 2700 | * failed to complete (waiting on the allocation), we'll just try to create the | ||
| 2701 | * cache again, failing at the same point. | ||
| 2702 | * | ||
| 2703 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | ||
| 2704 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | ||
| 2705 | * inside the following two functions. | ||
| 2706 | */ | ||
| 2707 | static inline void memcg_stop_kmem_account(void) | ||
| 2708 | { | ||
| 2709 | VM_BUG_ON(!current->mm); | ||
| 2710 | current->memcg_kmem_skip_account++; | ||
| 2711 | } | ||
| 2712 | |||
| 2713 | static inline void memcg_resume_kmem_account(void) | ||
| 2714 | { | ||
| 2715 | VM_BUG_ON(!current->mm); | ||
| 2716 | current->memcg_kmem_skip_account--; | ||
| 2717 | } | 2671 | } |
| 2718 | 2672 | ||
| 2719 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | 2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
| @@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | |||
| 2747 | mutex_lock(&memcg_slab_mutex); | 2701 | mutex_lock(&memcg_slab_mutex); |
| 2748 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | 2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
| 2749 | cachep = memcg_params_to_cache(params); | 2703 | cachep = memcg_params_to_cache(params); |
| 2750 | kmem_cache_shrink(cachep); | 2704 | memcg_unregister_cache(cachep); |
| 2751 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | ||
| 2752 | memcg_unregister_cache(cachep); | ||
| 2753 | } | 2705 | } |
| 2754 | mutex_unlock(&memcg_slab_mutex); | 2706 | mutex_unlock(&memcg_slab_mutex); |
| 2755 | } | 2707 | } |
| @@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2784 | struct memcg_register_cache_work *cw; | 2736 | struct memcg_register_cache_work *cw; |
| 2785 | 2737 | ||
| 2786 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
| 2787 | if (cw == NULL) { | 2739 | if (!cw) |
| 2788 | css_put(&memcg->css); | ||
| 2789 | return; | 2740 | return; |
| 2790 | } | 2741 | |
| 2742 | css_get(&memcg->css); | ||
| 2791 | 2743 | ||
| 2792 | cw->memcg = memcg; | 2744 | cw->memcg = memcg; |
| 2793 | cw->cachep = cachep; | 2745 | cw->cachep = cachep; |
| @@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2810 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | 2762 | * this point we can't allow ourselves back into memcg_kmem_get_cache, |
| 2811 | * the safest choice is to do it like this, wrapping the whole function. | 2763 | * the safest choice is to do it like this, wrapping the whole function. |
| 2812 | */ | 2764 | */ |
| 2813 | memcg_stop_kmem_account(); | 2765 | current->memcg_kmem_skip_account = 1; |
| 2814 | __memcg_schedule_register_cache(memcg, cachep); | 2766 | __memcg_schedule_register_cache(memcg, cachep); |
| 2815 | memcg_resume_kmem_account(); | 2767 | current->memcg_kmem_skip_account = 0; |
| 2816 | } | 2768 | } |
| 2817 | 2769 | ||
| 2818 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | 2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) |
| 2819 | { | 2771 | { |
| 2820 | unsigned int nr_pages = 1 << order; | 2772 | unsigned int nr_pages = 1 << order; |
| 2821 | int res; | ||
| 2822 | 2773 | ||
| 2823 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | 2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); |
| 2824 | if (!res) | ||
| 2825 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); | ||
| 2826 | return res; | ||
| 2827 | } | 2775 | } |
| 2828 | 2776 | ||
| 2829 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | 2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) |
| @@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | |||
| 2831 | unsigned int nr_pages = 1 << order; | 2779 | unsigned int nr_pages = 1 << order; |
| 2832 | 2780 | ||
| 2833 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | 2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); |
| 2834 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); | ||
| 2835 | } | 2782 | } |
| 2836 | 2783 | ||
| 2837 | /* | 2784 | /* |
| @@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | |||
| 2847 | * Can't be called in interrupt context or from kernel threads. | 2794 | * Can't be called in interrupt context or from kernel threads. |
| 2848 | * This function needs to be called with rcu_read_lock() held. | 2795 | * This function needs to be called with rcu_read_lock() held. |
| 2849 | */ | 2796 | */ |
| 2850 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | 2797 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) |
| 2851 | gfp_t gfp) | ||
| 2852 | { | 2798 | { |
| 2853 | struct mem_cgroup *memcg; | 2799 | struct mem_cgroup *memcg; |
| 2854 | struct kmem_cache *memcg_cachep; | 2800 | struct kmem_cache *memcg_cachep; |
| @@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
| 2856 | VM_BUG_ON(!cachep->memcg_params); | 2802 | VM_BUG_ON(!cachep->memcg_params); |
| 2857 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | 2803 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); |
| 2858 | 2804 | ||
| 2859 | if (!current->mm || current->memcg_kmem_skip_account) | 2805 | if (current->memcg_kmem_skip_account) |
| 2860 | return cachep; | 2806 | return cachep; |
| 2861 | 2807 | ||
| 2862 | rcu_read_lock(); | 2808 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2863 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | ||
| 2864 | |||
| 2865 | if (!memcg_kmem_is_active(memcg)) | 2809 | if (!memcg_kmem_is_active(memcg)) |
| 2866 | goto out; | 2810 | goto out; |
| 2867 | 2811 | ||
| 2868 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 2812 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
| 2869 | if (likely(memcg_cachep)) { | 2813 | if (likely(memcg_cachep)) |
| 2870 | cachep = memcg_cachep; | 2814 | return memcg_cachep; |
| 2871 | goto out; | ||
| 2872 | } | ||
| 2873 | |||
| 2874 | /* The corresponding put will be done in the workqueue. */ | ||
| 2875 | if (!css_tryget_online(&memcg->css)) | ||
| 2876 | goto out; | ||
| 2877 | rcu_read_unlock(); | ||
| 2878 | 2815 | ||
| 2879 | /* | 2816 | /* |
| 2880 | * If we are in a safe context (can wait, and not in interrupt | 2817 | * If we are in a safe context (can wait, and not in interrupt |
| @@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
| 2889 | * defer everything. | 2826 | * defer everything. |
| 2890 | */ | 2827 | */ |
| 2891 | memcg_schedule_register_cache(memcg, cachep); | 2828 | memcg_schedule_register_cache(memcg, cachep); |
| 2892 | return cachep; | ||
| 2893 | out: | 2829 | out: |
| 2894 | rcu_read_unlock(); | 2830 | css_put(&memcg->css); |
| 2895 | return cachep; | 2831 | return cachep; |
| 2896 | } | 2832 | } |
| 2897 | 2833 | ||
| 2834 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
| 2835 | { | ||
| 2836 | if (!is_root_cache(cachep)) | ||
| 2837 | css_put(&cachep->memcg_params->memcg->css); | ||
| 2838 | } | ||
| 2839 | |||
| 2898 | /* | 2840 | /* |
| 2899 | * We need to verify if the allocation against current->mm->owner's memcg is | 2841 | * We need to verify if the allocation against current->mm->owner's memcg is |
| 2900 | * possible for the given order. But the page is not allocated yet, so we'll | 2842 | * possible for the given order. But the page is not allocated yet, so we'll |
| @@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
| 2917 | 2859 | ||
| 2918 | *_memcg = NULL; | 2860 | *_memcg = NULL; |
| 2919 | 2861 | ||
| 2920 | /* | ||
| 2921 | * Disabling accounting is only relevant for some specific memcg | ||
| 2922 | * internal allocations. Therefore we would initially not have such | ||
| 2923 | * check here, since direct calls to the page allocator that are | ||
| 2924 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen | ||
| 2925 | * outside memcg core. We are mostly concerned with cache allocations, | ||
| 2926 | * and by having this test at memcg_kmem_get_cache, we are already able | ||
| 2927 | * to relay the allocation to the root cache and bypass the memcg cache | ||
| 2928 | * altogether. | ||
| 2929 | * | ||
| 2930 | * There is one exception, though: the SLUB allocator does not create | ||
| 2931 | * large order caches, but rather service large kmallocs directly from | ||
| 2932 | * the page allocator. Therefore, the following sequence when backed by | ||
| 2933 | * the SLUB allocator: | ||
| 2934 | * | ||
| 2935 | * memcg_stop_kmem_account(); | ||
| 2936 | * kmalloc(<large_number>) | ||
| 2937 | * memcg_resume_kmem_account(); | ||
| 2938 | * | ||
| 2939 | * would effectively ignore the fact that we should skip accounting, | ||
| 2940 | * since it will drive us directly to this function without passing | ||
| 2941 | * through the cache selector memcg_kmem_get_cache. Such large | ||
| 2942 | * allocations are extremely rare but can happen, for instance, for the | ||
| 2943 | * cache arrays. We bring this test here. | ||
| 2944 | */ | ||
| 2945 | if (!current->mm || current->memcg_kmem_skip_account) | ||
| 2946 | return true; | ||
| 2947 | |||
| 2948 | memcg = get_mem_cgroup_from_mm(current->mm); | 2862 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2949 | 2863 | ||
| 2950 | if (!memcg_kmem_is_active(memcg)) { | 2864 | if (!memcg_kmem_is_active(memcg)) { |
| @@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
| 2985 | memcg_uncharge_kmem(memcg, 1 << order); | 2899 | memcg_uncharge_kmem(memcg, 1 << order); |
| 2986 | page->mem_cgroup = NULL; | 2900 | page->mem_cgroup = NULL; |
| 2987 | } | 2901 | } |
| 2988 | #else | ||
| 2989 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
| 2990 | { | ||
| 2991 | } | ||
| 2992 | #endif /* CONFIG_MEMCG_KMEM */ | 2902 | #endif /* CONFIG_MEMCG_KMEM */ |
| 2993 | 2903 | ||
| 2994 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2904 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
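
Editor's note: together with the new __memcg_kmem_put_cache() above, the per-memcg cache lookup now pins the memcg with a css reference (taken via get_mem_cgroup_from_mm()) for the duration of an allocation instead of relying on RCU. A hedged sketch of the expected pairing from an allocator's point of view; the non-underscored wrapper names are assumed to exist in memcontrol.h, and slab_alloc_from() is an invented stand-in for the real slab fast path.

#include <linux/slab.h>
#include <linux/memcontrol.h>

/* Stub standing in for the real slab fast path; invented for this sketch. */
static void *slab_alloc_from(struct kmem_cache *cachep, gfp_t flags)
{
	return NULL;
}

static void *example_kmem_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *obj;

	cachep = memcg_kmem_get_cache(cachep, flags);	/* may return a per-memcg clone */
	obj = slab_alloc_from(cachep, flags);		/* invented allocation step */
	memcg_kmem_put_cache(cachep);			/* drops the css ref taken above */
	return obj;
}
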
| @@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3539 | return 0; | 3449 | return 0; |
| 3540 | 3450 | ||
| 3541 | /* | 3451 | /* |
| 3542 | * We are going to allocate memory for data shared by all memory | ||
| 3543 | * cgroups so let's stop accounting here. | ||
| 3544 | */ | ||
| 3545 | memcg_stop_kmem_account(); | ||
| 3546 | |||
| 3547 | /* | ||
| 3548 | * For simplicity, we won't allow this to be disabled. It also can't | 3452 | * For simplicity, we won't allow this to be disabled. It also can't |
| 3549 | * be changed if the cgroup has children already, or if tasks had | 3453 | * be changed if the cgroup has children already, or if tasks had |
| 3550 | * already joined. | 3454 | * already joined. |
| @@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3570 | goto out; | 3474 | goto out; |
| 3571 | } | 3475 | } |
| 3572 | 3476 | ||
| 3573 | memcg->kmemcg_id = memcg_id; | ||
| 3574 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 3575 | |||
| 3576 | /* | 3477 | /* |
| 3577 | * We couldn't have accounted to this cgroup, because it hasn't got the | 3478 | * We couldn't have accounted to this cgroup, because it hasn't got |
| 3578 | * active bit set yet, so this should succeed. | 3479 | * activated yet, so this should succeed. |
| 3579 | */ | 3480 | */ |
| 3580 | err = page_counter_limit(&memcg->kmem, nr_pages); | 3481 | err = page_counter_limit(&memcg->kmem, nr_pages); |
| 3581 | VM_BUG_ON(err); | 3482 | VM_BUG_ON(err); |
| 3582 | 3483 | ||
| 3583 | static_key_slow_inc(&memcg_kmem_enabled_key); | 3484 | static_key_slow_inc(&memcg_kmem_enabled_key); |
| 3584 | /* | 3485 | /* |
| 3585 | * Setting the active bit after enabling static branching will | 3486 | * A memory cgroup is considered kmem-active as soon as it gets |
| 3487 | * kmemcg_id. Setting the id after enabling static branching will | ||
| 3586 | * guarantee no one starts accounting before all call sites are | 3488 | * guarantee no one starts accounting before all call sites are |
| 3587 | * patched. | 3489 | * patched. |
| 3588 | */ | 3490 | */ |
| 3589 | memcg_kmem_set_active(memcg); | 3491 | memcg->kmemcg_id = memcg_id; |
| 3590 | out: | 3492 | out: |
| 3591 | memcg_resume_kmem_account(); | ||
| 3592 | return err; | 3493 | return err; |
| 3593 | } | 3494 | } |
| 3594 | 3495 | ||
| @@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) | |||
| 3791 | } | 3692 | } |
| 3792 | #endif /* CONFIG_NUMA */ | 3693 | #endif /* CONFIG_NUMA */ |
| 3793 | 3694 | ||
| 3794 | static inline void mem_cgroup_lru_names_not_uptodate(void) | ||
| 3795 | { | ||
| 3796 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
| 3797 | } | ||
| 3798 | |||
| 3799 | static int memcg_stat_show(struct seq_file *m, void *v) | 3695 | static int memcg_stat_show(struct seq_file *m, void *v) |
| 3800 | { | 3696 | { |
| 3801 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3697 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| @@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
| 3803 | struct mem_cgroup *mi; | 3699 | struct mem_cgroup *mi; |
| 3804 | unsigned int i; | 3700 | unsigned int i; |
| 3805 | 3701 | ||
| 3702 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
| 3703 | |||
| 3806 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3704 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
| 3807 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 3705 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
| 3808 | continue; | 3706 | continue; |
| @@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4259 | { | 4157 | { |
| 4260 | int ret; | 4158 | int ret; |
| 4261 | 4159 | ||
| 4262 | memcg->kmemcg_id = -1; | ||
| 4263 | ret = memcg_propagate_kmem(memcg); | 4160 | ret = memcg_propagate_kmem(memcg); |
| 4264 | if (ret) | 4161 | if (ret) |
| 4265 | return ret; | 4162 | return ret; |
| @@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4269 | 4166 | ||
| 4270 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4167 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
| 4271 | { | 4168 | { |
| 4169 | memcg_unregister_all_caches(memcg); | ||
| 4272 | mem_cgroup_sockets_destroy(memcg); | 4170 | mem_cgroup_sockets_destroy(memcg); |
| 4273 | } | 4171 | } |
| 4274 | #else | 4172 | #else |
| @@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 4724 | 4622 | ||
| 4725 | free_percpu(memcg->stat); | 4623 | free_percpu(memcg->stat); |
| 4726 | 4624 | ||
| 4727 | /* | ||
| 4728 | * We need to make sure that (at least for now), the jump label | ||
| 4729 | * destruction code runs outside of the cgroup lock. This is because | ||
| 4730 | * get_online_cpus(), which is called from the static_branch update, | ||
| 4731 | * can't be called inside the cgroup_lock. cpusets are the ones | ||
| 4732 | * enforcing this dependency, so if they ever change, we might as well. | ||
| 4733 | * | ||
| 4734 | * schedule_work() will guarantee this happens. Be careful if you need | ||
| 4735 | * to move this code around, and make sure it is outside | ||
| 4736 | * the cgroup_lock. | ||
| 4737 | */ | ||
| 4738 | disarm_static_keys(memcg); | 4625 | disarm_static_keys(memcg); |
| 4739 | kfree(memcg); | 4626 | kfree(memcg); |
| 4740 | } | 4627 | } |
| @@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 4804 | vmpressure_init(&memcg->vmpressure); | 4691 | vmpressure_init(&memcg->vmpressure); |
| 4805 | INIT_LIST_HEAD(&memcg->event_list); | 4692 | INIT_LIST_HEAD(&memcg->event_list); |
| 4806 | spin_lock_init(&memcg->event_list_lock); | 4693 | spin_lock_init(&memcg->event_list_lock); |
| 4694 | #ifdef CONFIG_MEMCG_KMEM | ||
| 4695 | memcg->kmemcg_id = -1; | ||
| 4696 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 4697 | #endif | ||
| 4807 | 4698 | ||
| 4808 | return &memcg->css; | 4699 | return &memcg->css; |
| 4809 | 4700 | ||
| @@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 4885 | } | 4776 | } |
| 4886 | spin_unlock(&memcg->event_list_lock); | 4777 | spin_unlock(&memcg->event_list_lock); |
| 4887 | 4778 | ||
| 4888 | memcg_unregister_all_caches(memcg); | ||
| 4889 | vmpressure_cleanup(&memcg->vmpressure); | 4779 | vmpressure_cleanup(&memcg->vmpressure); |
| 4890 | } | 4780 | } |
| 4891 | 4781 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e5ee0ca7ae85..feb803bf3443 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -239,19 +239,14 @@ void shake_page(struct page *p, int access) | |||
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | /* | 241 | /* |
| 242 | * Only call shrink_slab here (which would also shrink other caches) if | 242 | * Only call shrink_node_slabs here (which would also shrink |
| 243 | * access is not potentially fatal. | 243 | * other caches) if access is not potentially fatal. |
| 244 | */ | 244 | */ |
| 245 | if (access) { | 245 | if (access) { |
| 246 | int nr; | 246 | int nr; |
| 247 | int nid = page_to_nid(p); | 247 | int nid = page_to_nid(p); |
| 248 | do { | 248 | do { |
| 249 | struct shrink_control shrink = { | 249 | nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); |
| 250 | .gfp_mask = GFP_KERNEL, | ||
| 251 | }; | ||
| 252 | node_set(nid, shrink.nodes_to_scan); | ||
| 253 | |||
| 254 | nr = shrink_slab(&shrink, 1000, 1000); | ||
| 255 | if (page_count(p) == 1) | 250 | if (page_count(p) == 1) |
| 256 | break; | 251 | break; |
| 257 | } while (nr > 10); | 252 | } while (nr > 10); |
| @@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
| 466 | struct task_struct *tsk; | 461 | struct task_struct *tsk; |
| 467 | struct address_space *mapping = page->mapping; | 462 | struct address_space *mapping = page->mapping; |
| 468 | 463 | ||
| 469 | mutex_lock(&mapping->i_mmap_mutex); | 464 | i_mmap_lock_read(mapping); |
| 470 | read_lock(&tasklist_lock); | 465 | read_lock(&tasklist_lock); |
| 471 | for_each_process(tsk) { | 466 | for_each_process(tsk) { |
| 472 | pgoff_t pgoff = page_to_pgoff(page); | 467 | pgoff_t pgoff = page_to_pgoff(page); |
| @@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
| 488 | } | 483 | } |
| 489 | } | 484 | } |
| 490 | read_unlock(&tasklist_lock); | 485 | read_unlock(&tasklist_lock); |
| 491 | mutex_unlock(&mapping->i_mmap_mutex); | 486 | i_mmap_unlock_read(mapping); |
| 492 | } | 487 | } |
| 493 | 488 | ||
| 494 | /* | 489 | /* |
diff --git a/mm/memory.c b/mm/memory.c index 4b5a282e1107..fbf74112de5b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
| 1326 | * safe to do nothing in this case. | 1326 | * safe to do nothing in this case. |
| 1327 | */ | 1327 | */ |
| 1328 | if (vma->vm_file) { | 1328 | if (vma->vm_file) { |
| 1329 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1329 | i_mmap_lock_write(vma->vm_file->f_mapping); |
| 1330 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | 1330 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); |
| 1331 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1331 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
| 1332 | } | 1332 | } |
| 1333 | } else | 1333 | } else |
| 1334 | unmap_page_range(tlb, vma, start, end, details); | 1334 | unmap_page_range(tlb, vma, start, end, details); |
| @@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 2377 | details.last_index = ULONG_MAX; | 2377 | details.last_index = ULONG_MAX; |
| 2378 | 2378 | ||
| 2379 | 2379 | ||
| 2380 | mutex_lock(&mapping->i_mmap_mutex); | 2380 | i_mmap_lock_read(mapping); |
| 2381 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2381 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
| 2382 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2382 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
| 2383 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2383 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
| 2384 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2384 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
| 2385 | mutex_unlock(&mapping->i_mmap_mutex); | 2385 | i_mmap_unlock_read(mapping); |
| 2386 | } | 2386 | } |
| 2387 | EXPORT_SYMBOL(unmap_mapping_range); | 2387 | EXPORT_SYMBOL(unmap_mapping_range); |
| 2388 | 2388 | ||
| @@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3365 | 3365 | ||
| 3366 | return ret; | 3366 | return ret; |
| 3367 | } | 3367 | } |
| 3368 | EXPORT_SYMBOL_GPL(handle_mm_fault); | ||
| 3368 | 3369 | ||
| 3369 | #ifndef __PAGETABLE_PUD_FOLDED | 3370 | #ifndef __PAGETABLE_PUD_FOLDED |
| 3370 | /* | 3371 | /* |
diff --git a/mm/migrate.c b/mm/migrate.c index 01439953abf5..253474c22239 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
| 746 | * MIGRATEPAGE_SUCCESS - success | 746 | * MIGRATEPAGE_SUCCESS - success |
| 747 | */ | 747 | */ |
| 748 | static int move_to_new_page(struct page *newpage, struct page *page, | 748 | static int move_to_new_page(struct page *newpage, struct page *page, |
| 749 | int remap_swapcache, enum migrate_mode mode) | 749 | int page_was_mapped, enum migrate_mode mode) |
| 750 | { | 750 | { |
| 751 | struct address_space *mapping; | 751 | struct address_space *mapping; |
| 752 | int rc; | 752 | int rc; |
| @@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
| 784 | newpage->mapping = NULL; | 784 | newpage->mapping = NULL; |
| 785 | } else { | 785 | } else { |
| 786 | mem_cgroup_migrate(page, newpage, false); | 786 | mem_cgroup_migrate(page, newpage, false); |
| 787 | if (remap_swapcache) | 787 | if (page_was_mapped) |
| 788 | remove_migration_ptes(page, newpage); | 788 | remove_migration_ptes(page, newpage); |
| 789 | page->mapping = NULL; | 789 | page->mapping = NULL; |
| 790 | } | 790 | } |
| @@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 798 | int force, enum migrate_mode mode) | 798 | int force, enum migrate_mode mode) |
| 799 | { | 799 | { |
| 800 | int rc = -EAGAIN; | 800 | int rc = -EAGAIN; |
| 801 | int remap_swapcache = 1; | 801 | int page_was_mapped = 0; |
| 802 | struct anon_vma *anon_vma = NULL; | 802 | struct anon_vma *anon_vma = NULL; |
| 803 | 803 | ||
| 804 | if (!trylock_page(page)) { | 804 | if (!trylock_page(page)) { |
| @@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 870 | * migrated but are not remapped when migration | 870 | * migrated but are not remapped when migration |
| 871 | * completes | 871 | * completes |
| 872 | */ | 872 | */ |
| 873 | remap_swapcache = 0; | ||
| 874 | } else { | 873 | } else { |
| 875 | goto out_unlock; | 874 | goto out_unlock; |
| 876 | } | 875 | } |
| @@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 910 | } | 909 | } |
| 911 | 910 | ||
| 912 | /* Establish migration ptes or remove ptes */ | 911 | /* Establish migration ptes or remove ptes */ |
| 913 | try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 912 | if (page_mapped(page)) { |
| 913 | try_to_unmap(page, | ||
| 914 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
| 915 | page_was_mapped = 1; | ||
| 916 | } | ||
| 914 | 917 | ||
| 915 | skip_unmap: | 918 | skip_unmap: |
| 916 | if (!page_mapped(page)) | 919 | if (!page_mapped(page)) |
| 917 | rc = move_to_new_page(newpage, page, remap_swapcache, mode); | 920 | rc = move_to_new_page(newpage, page, page_was_mapped, mode); |
| 918 | 921 | ||
| 919 | if (rc && remap_swapcache) | 922 | if (rc && page_was_mapped) |
| 920 | remove_migration_ptes(page, page); | 923 | remove_migration_ptes(page, page); |
| 921 | 924 | ||
| 922 | /* Drop an anon_vma reference if we took one */ | 925 | /* Drop an anon_vma reference if we took one */ |
| @@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 1017 | { | 1020 | { |
| 1018 | int rc = 0; | 1021 | int rc = 0; |
| 1019 | int *result = NULL; | 1022 | int *result = NULL; |
| 1023 | int page_was_mapped = 0; | ||
| 1020 | struct page *new_hpage; | 1024 | struct page *new_hpage; |
| 1021 | struct anon_vma *anon_vma = NULL; | 1025 | struct anon_vma *anon_vma = NULL; |
| 1022 | 1026 | ||
| @@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 1047 | if (PageAnon(hpage)) | 1051 | if (PageAnon(hpage)) |
| 1048 | anon_vma = page_get_anon_vma(hpage); | 1052 | anon_vma = page_get_anon_vma(hpage); |
| 1049 | 1053 | ||
| 1050 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 1054 | if (page_mapped(hpage)) { |
| 1055 | try_to_unmap(hpage, | ||
| 1056 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
| 1057 | page_was_mapped = 1; | ||
| 1058 | } | ||
| 1051 | 1059 | ||
| 1052 | if (!page_mapped(hpage)) | 1060 | if (!page_mapped(hpage)) |
| 1053 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 1061 | rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); |
| 1054 | 1062 | ||
| 1055 | if (rc != MIGRATEPAGE_SUCCESS) | 1063 | if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) |
| 1056 | remove_migration_ptes(hpage, hpage); | 1064 | remove_migration_ptes(hpage, hpage); |
| 1057 | 1065 | ||
| 1058 | if (anon_vma) | 1066 | if (anon_vma) |
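
Editor's note: the migrate.c hunks above all serve one invariant — migration PTEs are only installed for pages that were actually mapped, and only then does a failed migration need remove_migration_ptes(). A condensed, pseudocode-level restatement of the resulting flow; locking, refcounting and the other error paths of the real __unmap_and_move() are omitted, and move_to_new_page() is the static helper from this file.

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/migrate.h>

static int example_unmap_and_move(struct page *page, struct page *newpage,
				  enum migrate_mode mode)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;

	if (page_mapped(page)) {
		try_to_unmap(page,
			TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
		page_was_mapped = 1;
	}

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, page_was_mapped, mode);

	if (rc && page_was_mapped)
		remove_migration_ptes(page, page);	/* restore the original PTEs */

	return rc;
}
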
diff --git a/mm/mincore.c b/mm/mincore.c index 725c80961048..c8c528b36641 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 137 | } else { /* pte is a swap entry */ | 137 | } else { /* pte is a swap entry */ |
| 138 | swp_entry_t entry = pte_to_swp_entry(pte); | 138 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 139 | 139 | ||
| 140 | if (is_migration_entry(entry)) { | 140 | if (non_swap_entry(entry)) { |
| 141 | /* migration entries are always uptodate */ | 141 | /* |
| 142 | * migration or hwpoison entries are always | ||
| 143 | * uptodate | ||
| 144 | */ | ||
| 142 | *vec = 1; | 145 | *vec = 1; |
| 143 | } else { | 146 | } else { |
| 144 | #ifdef CONFIG_SWAP | 147 | #ifdef CONFIG_SWAP |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -232,7 +232,7 @@ error: | |||

| 232 | } | 232 | } |
| 233 | 233 | ||
| 234 | /* | 234 | /* |
| 235 | * Requires inode->i_mapping->i_mmap_mutex | 235 | * Requires inode->i_mapping->i_mmap_rwsem |
| 236 | */ | 236 | */ |
| 237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
| 238 | struct file *file, struct address_space *mapping) | 238 | struct file *file, struct address_space *mapping) |
| @@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma) | |||
| 260 | 260 | ||
| 261 | if (file) { | 261 | if (file) { |
| 262 | struct address_space *mapping = file->f_mapping; | 262 | struct address_space *mapping = file->f_mapping; |
| 263 | mutex_lock(&mapping->i_mmap_mutex); | 263 | i_mmap_lock_write(mapping); |
| 264 | __remove_shared_vm_struct(vma, file, mapping); | 264 | __remove_shared_vm_struct(vma, file, mapping); |
| 265 | mutex_unlock(&mapping->i_mmap_mutex); | 265 | i_mmap_unlock_write(mapping); |
| 266 | } | 266 | } |
| 267 | } | 267 | } |
| 268 | 268 | ||
| @@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 674 | 674 | ||
| 675 | if (vma->vm_file) { | 675 | if (vma->vm_file) { |
| 676 | mapping = vma->vm_file->f_mapping; | 676 | mapping = vma->vm_file->f_mapping; |
| 677 | mutex_lock(&mapping->i_mmap_mutex); | 677 | i_mmap_lock_write(mapping); |
| 678 | } | 678 | } |
| 679 | 679 | ||
| 680 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 680 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
| 681 | __vma_link_file(vma); | 681 | __vma_link_file(vma); |
| 682 | 682 | ||
| 683 | if (mapping) | 683 | if (mapping) |
| 684 | mutex_unlock(&mapping->i_mmap_mutex); | 684 | i_mmap_unlock_write(mapping); |
| 685 | 685 | ||
| 686 | mm->map_count++; | 686 | mm->map_count++; |
| 687 | validate_mm(mm); | 687 | validate_mm(mm); |
| @@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 796 | next->vm_end); | 796 | next->vm_end); |
| 797 | } | 797 | } |
| 798 | 798 | ||
| 799 | mutex_lock(&mapping->i_mmap_mutex); | 799 | i_mmap_lock_write(mapping); |
| 800 | if (insert) { | 800 | if (insert) { |
| 801 | /* | 801 | /* |
| 802 | * Put into interval tree now, so instantiated pages | 802 | * Put into interval tree now, so instantiated pages |
| @@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 883 | anon_vma_unlock_write(anon_vma); | 883 | anon_vma_unlock_write(anon_vma); |
| 884 | } | 884 | } |
| 885 | if (mapping) | 885 | if (mapping) |
| 886 | mutex_unlock(&mapping->i_mmap_mutex); | 886 | i_mmap_unlock_write(mapping); |
| 887 | 887 | ||
| 888 | if (root) { | 888 | if (root) { |
| 889 | uprobe_mmap(vma); | 889 | uprobe_mmap(vma); |
| @@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2362 | } | 2362 | } |
| 2363 | #endif | 2363 | #endif |
| 2364 | 2364 | ||
| 2365 | EXPORT_SYMBOL_GPL(find_extend_vma); | ||
| 2366 | |||
| 2365 | /* | 2367 | /* |
| 2366 | * Ok - we have the memory areas we should free on the vma list, | 2368 | * Ok - we have the memory areas we should free on the vma list, |
| 2367 | * so release them, and do the vma updates. | 2369 | * so release them, and do the vma updates. |
| @@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2791 | 2793 | ||
| 2792 | /* Insert vm structure into process list sorted by address | 2794 | /* Insert vm structure into process list sorted by address |
| 2793 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2795 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
| 2794 | * then i_mmap_mutex is taken here. | 2796 | * then i_mmap_rwsem is taken here. |
| 2795 | */ | 2797 | */ |
| 2796 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 2798 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
| 2797 | { | 2799 | { |
| @@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 3086 | */ | 3088 | */ |
| 3087 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 3089 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
| 3088 | BUG(); | 3090 | BUG(); |
| 3089 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); | 3091 | down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); |
| 3090 | } | 3092 | } |
| 3091 | } | 3093 | } |
| 3092 | 3094 | ||
| @@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 3113 | * vma in this mm is backed by the same anon_vma or address_space. | 3115 | * vma in this mm is backed by the same anon_vma or address_space. |
| 3114 | * | 3116 | * |
| 3115 | * We can take all the locks in random order because the VM code | 3117 | * We can take all the locks in random order because the VM code |
| 3116 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never | 3118 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never |
| 3117 | * takes more than one of them in a row. Secondly we're protected | 3119 | * takes more than one of them in a row. Secondly we're protected |
| 3118 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3120 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
| 3119 | * | 3121 | * |
| @@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
| 3182 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 3184 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
| 3183 | * because we hold the mm_all_locks_mutex. | 3185 | * because we hold the mm_all_locks_mutex. |
| 3184 | */ | 3186 | */ |
| 3185 | mutex_unlock(&mapping->i_mmap_mutex); | 3187 | i_mmap_unlock_write(mapping); |
| 3186 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 3188 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
| 3187 | &mapping->flags)) | 3189 | &mapping->flags)) |
| 3188 | BUG(); | 3190 | BUG(); |
diff --git a/mm/mremap.c b/mm/mremap.c index b147f66f4c40..84aa36f9f308 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 99 | spinlock_t *old_ptl, *new_ptl; | 99 | spinlock_t *old_ptl, *new_ptl; |
| 100 | 100 | ||
| 101 | /* | 101 | /* |
| 102 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma | 102 | * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma |
| 103 | * locks to ensure that rmap will always observe either the old or the | 103 | * locks to ensure that rmap will always observe either the old or the |
| 104 | * new ptes. This is the easiest way to avoid races with | 104 | * new ptes. This is the easiest way to avoid races with |
| 105 | * truncate_pagecache(), page migration, etc... | 105 | * truncate_pagecache(), page migration, etc... |
| @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 119 | if (need_rmap_locks) { | 119 | if (need_rmap_locks) { |
| 120 | if (vma->vm_file) { | 120 | if (vma->vm_file) { |
| 121 | mapping = vma->vm_file->f_mapping; | 121 | mapping = vma->vm_file->f_mapping; |
| 122 | mutex_lock(&mapping->i_mmap_mutex); | 122 | i_mmap_lock_write(mapping); |
| 123 | } | 123 | } |
| 124 | if (vma->anon_vma) { | 124 | if (vma->anon_vma) { |
| 125 | anon_vma = vma->anon_vma; | 125 | anon_vma = vma->anon_vma; |
| @@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 156 | if (anon_vma) | 156 | if (anon_vma) |
| 157 | anon_vma_unlock_write(anon_vma); | 157 | anon_vma_unlock_write(anon_vma); |
| 158 | if (mapping) | 158 | if (mapping) |
| 159 | mutex_unlock(&mapping->i_mmap_mutex); | 159 | i_mmap_unlock_write(mapping); |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/nommu.c b/mm/nommu.c index bd1808e194a7..b51eadf6d952 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
| 722 | if (vma->vm_file) { | 722 | if (vma->vm_file) { |
| 723 | mapping = vma->vm_file->f_mapping; | 723 | mapping = vma->vm_file->f_mapping; |
| 724 | 724 | ||
| 725 | mutex_lock(&mapping->i_mmap_mutex); | 725 | i_mmap_lock_write(mapping); |
| 726 | flush_dcache_mmap_lock(mapping); | 726 | flush_dcache_mmap_lock(mapping); |
| 727 | vma_interval_tree_insert(vma, &mapping->i_mmap); | 727 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
| 728 | flush_dcache_mmap_unlock(mapping); | 728 | flush_dcache_mmap_unlock(mapping); |
| 729 | mutex_unlock(&mapping->i_mmap_mutex); | 729 | i_mmap_unlock_write(mapping); |
| 730 | } | 730 | } |
| 731 | 731 | ||
| 732 | /* add the VMA to the tree */ | 732 | /* add the VMA to the tree */ |
| @@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
| 795 | if (vma->vm_file) { | 795 | if (vma->vm_file) { |
| 796 | mapping = vma->vm_file->f_mapping; | 796 | mapping = vma->vm_file->f_mapping; |
| 797 | 797 | ||
| 798 | mutex_lock(&mapping->i_mmap_mutex); | 798 | i_mmap_lock_write(mapping); |
| 799 | flush_dcache_mmap_lock(mapping); | 799 | flush_dcache_mmap_lock(mapping); |
| 800 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 800 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
| 801 | flush_dcache_mmap_unlock(mapping); | 801 | flush_dcache_mmap_unlock(mapping); |
| 802 | mutex_unlock(&mapping->i_mmap_mutex); | 802 | i_mmap_unlock_write(mapping); |
| 803 | } | 803 | } |
| 804 | 804 | ||
| 805 | /* remove from the MM's tree and list */ | 805 | /* remove from the MM's tree and list */ |
| @@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1149 | unsigned long len, | 1149 | unsigned long len, |
| 1150 | unsigned long capabilities) | 1150 | unsigned long capabilities) |
| 1151 | { | 1151 | { |
| 1152 | struct page *pages; | 1152 | unsigned long total, point; |
| 1153 | unsigned long total, point, n; | ||
| 1154 | void *base; | 1153 | void *base; |
| 1155 | int ret, order; | 1154 | int ret, order; |
| 1156 | 1155 | ||
| @@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1182 | order = get_order(len); | 1181 | order = get_order(len); |
| 1183 | kdebug("alloc order %d for %lx", order, len); | 1182 | kdebug("alloc order %d for %lx", order, len); |
| 1184 | 1183 | ||
| 1185 | pages = alloc_pages(GFP_KERNEL, order); | ||
| 1186 | if (!pages) | ||
| 1187 | goto enomem; | ||
| 1188 | |||
| 1189 | total = 1 << order; | 1184 | total = 1 << order; |
| 1190 | atomic_long_add(total, &mmap_pages_allocated); | ||
| 1191 | |||
| 1192 | point = len >> PAGE_SHIFT; | 1185 | point = len >> PAGE_SHIFT; |
| 1193 | 1186 | ||
| 1194 | /* we allocated a power-of-2 sized page set, so we may want to trim off | 1187 | /* we don't want to allocate a power-of-2 sized page set */ |
| 1195 | * the excess */ | ||
| 1196 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1188 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { |
| 1197 | while (total > point) { | 1189 | total = point; |
| 1198 | order = ilog2(total - point); | 1190 | kdebug("try to alloc exact %lu pages", total); |
| 1199 | n = 1 << order; | 1191 | base = alloc_pages_exact(len, GFP_KERNEL); |
| 1200 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | 1192 | } else { |
| 1201 | atomic_long_sub(n, &mmap_pages_allocated); | 1193 | base = (void *)__get_free_pages(GFP_KERNEL, order); |
| 1202 | total -= n; | ||
| 1203 | set_page_refcounted(pages + total); | ||
| 1204 | __free_pages(pages + total, order); | ||
| 1205 | } | ||
| 1206 | } | 1194 | } |
| 1207 | 1195 | ||
| 1208 | for (point = 1; point < total; point++) | 1196 | if (!base) |
| 1209 | set_page_refcounted(&pages[point]); | 1197 | goto enomem; |
| 1198 | |||
| 1199 | atomic_long_add(total, &mmap_pages_allocated); | ||
| 1210 | 1200 | ||
| 1211 | base = page_address(pages); | ||
| 1212 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; | 1201 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
| 1213 | region->vm_start = (unsigned long) base; | 1202 | region->vm_start = (unsigned long) base; |
| 1214 | region->vm_end = region->vm_start + len; | 1203 | region->vm_end = region->vm_start + len; |
| @@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
| 2094 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2083 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 2095 | 2084 | ||
| 2096 | down_write(&nommu_region_sem); | 2085 | down_write(&nommu_region_sem); |
| 2097 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2086 | i_mmap_lock_read(inode->i_mapping); |
| 2098 | 2087 | ||
| 2099 | /* search for VMAs that fall within the dead zone */ | 2088 | /* search for VMAs that fall within the dead zone */ |
| 2100 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { | 2089 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
| 2101 | /* found one - only interested if it's shared out of the page | 2090 | /* found one - only interested if it's shared out of the page |
| 2102 | * cache */ | 2091 | * cache */ |
| 2103 | if (vma->vm_flags & VM_SHARED) { | 2092 | if (vma->vm_flags & VM_SHARED) { |
| 2104 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2093 | i_mmap_unlock_read(inode->i_mapping); |
| 2105 | up_write(&nommu_region_sem); | 2094 | up_write(&nommu_region_sem); |
| 2106 | return -ETXTBSY; /* not quite true, but near enough */ | 2095 | return -ETXTBSY; /* not quite true, but near enough */ |
| 2107 | } | 2096 | } |
| @@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
| 2113 | * we don't check for any regions that start beyond the EOF as there | 2102 | * we don't check for any regions that start beyond the EOF as there |
| 2114 | * shouldn't be any | 2103 | * shouldn't be any |
| 2115 | */ | 2104 | */ |
| 2116 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, | 2105 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { |
| 2117 | 0, ULONG_MAX) { | ||
| 2118 | if (!(vma->vm_flags & VM_SHARED)) | 2106 | if (!(vma->vm_flags & VM_SHARED)) |
| 2119 | continue; | 2107 | continue; |
| 2120 | 2108 | ||
| @@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
| 2129 | } | 2117 | } |
| 2130 | } | 2118 | } |
| 2131 | 2119 | ||
| 2132 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2120 | i_mmap_unlock_read(inode->i_mapping); |
| 2133 | up_write(&nommu_region_sem); | 2121 | up_write(&nommu_region_sem); |
| 2134 | return 0; | 2122 | return 0; |
| 2135 | } | 2123 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 864bba992735..d503e9ce1c7b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 281 | if (oom_task_origin(task)) | 281 | if (oom_task_origin(task)) |
| 282 | return OOM_SCAN_SELECT; | 282 | return OOM_SCAN_SELECT; |
| 283 | 283 | ||
| 284 | if (task->flags & PF_EXITING && !force_kill) { | 284 | if (task_will_free_mem(task) && !force_kill) |
| 285 | /* | 285 | return OOM_SCAN_ABORT; |
| 286 | * If this task is not being ptraced on exit, then wait for it | 286 | |
| 287 | * to finish before killing some other task unnecessarily. | ||
| 288 | */ | ||
| 289 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
| 290 | return OOM_SCAN_ABORT; | ||
| 291 | } | ||
| 292 | return OOM_SCAN_OK; | 287 | return OOM_SCAN_OK; |
| 293 | } | 288 | } |
| 294 | 289 | ||
| @@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 443 | * If the task is already exiting, don't alarm the sysadmin or kill | 438 | * If the task is already exiting, don't alarm the sysadmin or kill |
| 444 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 439 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
| 445 | */ | 440 | */ |
| 446 | if (p->flags & PF_EXITING) { | 441 | if (task_will_free_mem(p)) { |
| 447 | set_tsk_thread_flag(p, TIF_MEMDIE); | 442 | set_tsk_thread_flag(p, TIF_MEMDIE); |
| 448 | put_task_struct(p); | 443 | put_task_struct(p); |
| 449 | return; | 444 | return; |
| @@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 649 | * select it. The goal is to allow it to allocate so that it may | 644 | * select it. The goal is to allow it to allocate so that it may |
| 650 | * quickly exit and free its memory. | 645 | * quickly exit and free its memory. |
| 651 | */ | 646 | */ |
| 652 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
| 653 | set_thread_flag(TIF_MEMDIE); | 648 | set_thread_flag(TIF_MEMDIE); |
| 654 | return; | 649 | return; |
| 655 | } | 650 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df542feaac3b..fa974d87f60d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -48,6 +48,7 @@ | |||
| 48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
| 49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
| 50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
| 51 | #include <linux/page_ext.h> | ||
| 51 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
| 52 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
| 53 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
| @@ -55,9 +56,10 @@ | |||
| 55 | #include <linux/prefetch.h> | 56 | #include <linux/prefetch.h> |
| 56 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
| 57 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
| 58 | #include <linux/page-debug-flags.h> | 59 | #include <linux/page_ext.h> |
| 59 | #include <linux/hugetlb.h> | 60 | #include <linux/hugetlb.h> |
| 60 | #include <linux/sched/rt.h> | 61 | #include <linux/sched/rt.h> |
| 62 | #include <linux/page_owner.h> | ||
| 61 | 63 | ||
| 62 | #include <asm/sections.h> | 64 | #include <asm/sections.h> |
| 63 | #include <asm/tlbflush.h> | 65 | #include <asm/tlbflush.h> |
| @@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order, | |||
| 424 | 426 | ||
| 425 | #ifdef CONFIG_DEBUG_PAGEALLOC | 427 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 426 | unsigned int _debug_guardpage_minorder; | 428 | unsigned int _debug_guardpage_minorder; |
| 429 | bool _debug_pagealloc_enabled __read_mostly; | ||
| 430 | bool _debug_guardpage_enabled __read_mostly; | ||
| 431 | |||
| 432 | static int __init early_debug_pagealloc(char *buf) | ||
| 433 | { | ||
| 434 | if (!buf) | ||
| 435 | return -EINVAL; | ||
| 436 | |||
| 437 | if (strcmp(buf, "on") == 0) | ||
| 438 | _debug_pagealloc_enabled = true; | ||
| 439 | |||
| 440 | return 0; | ||
| 441 | } | ||
| 442 | early_param("debug_pagealloc", early_debug_pagealloc); | ||
| 443 | |||
| 444 | static bool need_debug_guardpage(void) | ||
| 445 | { | ||
| 446 | /* If we don't use debug_pagealloc, we don't need guard page */ | ||
| 447 | if (!debug_pagealloc_enabled()) | ||
| 448 | return false; | ||
| 449 | |||
| 450 | return true; | ||
| 451 | } | ||
| 452 | |||
| 453 | static void init_debug_guardpage(void) | ||
| 454 | { | ||
| 455 | if (!debug_pagealloc_enabled()) | ||
| 456 | return; | ||
| 457 | |||
| 458 | _debug_guardpage_enabled = true; | ||
| 459 | } | ||
| 460 | |||
| 461 | struct page_ext_operations debug_guardpage_ops = { | ||
| 462 | .need = need_debug_guardpage, | ||
| 463 | .init = init_debug_guardpage, | ||
| 464 | }; | ||
| 427 | 465 | ||
| 428 | static int __init debug_guardpage_minorder_setup(char *buf) | 466 | static int __init debug_guardpage_minorder_setup(char *buf) |
| 429 | { | 467 | { |
| @@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
| 439 | } | 477 | } |
| 440 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 478 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
| 441 | 479 | ||
| 442 | static inline void set_page_guard_flag(struct page *page) | 480 | static inline void set_page_guard(struct zone *zone, struct page *page, |
| 481 | unsigned int order, int migratetype) | ||
| 443 | { | 482 | { |
| 444 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 483 | struct page_ext *page_ext; |
| 484 | |||
| 485 | if (!debug_guardpage_enabled()) | ||
| 486 | return; | ||
| 487 | |||
| 488 | page_ext = lookup_page_ext(page); | ||
| 489 | __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
| 490 | |||
| 491 | INIT_LIST_HEAD(&page->lru); | ||
| 492 | set_page_private(page, order); | ||
| 493 | /* Guard pages are not available for any usage */ | ||
| 494 | __mod_zone_freepage_state(zone, -(1 << order), migratetype); | ||
| 445 | } | 495 | } |
| 446 | 496 | ||
| 447 | static inline void clear_page_guard_flag(struct page *page) | 497 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
| 498 | unsigned int order, int migratetype) | ||
| 448 | { | 499 | { |
| 449 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 500 | struct page_ext *page_ext; |
| 501 | |||
| 502 | if (!debug_guardpage_enabled()) | ||
| 503 | return; | ||
| 504 | |||
| 505 | page_ext = lookup_page_ext(page); | ||
| 506 | __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
| 507 | |||
| 508 | set_page_private(page, 0); | ||
| 509 | if (!is_migrate_isolate(migratetype)) | ||
| 510 | __mod_zone_freepage_state(zone, (1 << order), migratetype); | ||
| 450 | } | 511 | } |
| 451 | #else | 512 | #else |
| 452 | static inline void set_page_guard_flag(struct page *page) { } | 513 | struct page_ext_operations debug_guardpage_ops = { NULL, }; |
| 453 | static inline void clear_page_guard_flag(struct page *page) { } | 514 | static inline void set_page_guard(struct zone *zone, struct page *page, |
| 515 | unsigned int order, int migratetype) {} | ||
| 516 | static inline void clear_page_guard(struct zone *zone, struct page *page, | ||
| 517 | unsigned int order, int migratetype) {} | ||
| 454 | #endif | 518 | #endif |
| 455 | 519 | ||
| 456 | static inline void set_page_order(struct page *page, unsigned int order) | 520 | static inline void set_page_order(struct page *page, unsigned int order) |
| @@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page, | |||
| 581 | * merge with it and move up one order. | 645 | * merge with it and move up one order. |
| 582 | */ | 646 | */ |
| 583 | if (page_is_guard(buddy)) { | 647 | if (page_is_guard(buddy)) { |
| 584 | clear_page_guard_flag(buddy); | 648 | clear_page_guard(zone, buddy, order, migratetype); |
| 585 | set_page_private(buddy, 0); | ||
| 586 | if (!is_migrate_isolate(migratetype)) { | ||
| 587 | __mod_zone_freepage_state(zone, 1 << order, | ||
| 588 | migratetype); | ||
| 589 | } | ||
| 590 | } else { | 649 | } else { |
| 591 | list_del(&buddy->lru); | 650 | list_del(&buddy->lru); |
| 592 | zone->free_area[order].nr_free--; | 651 | zone->free_area[order].nr_free--; |
| @@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
| 755 | if (bad) | 814 | if (bad) |
| 756 | return false; | 815 | return false; |
| 757 | 816 | ||
| 817 | reset_page_owner(page, order); | ||
| 818 | |||
| 758 | if (!PageHighMem(page)) { | 819 | if (!PageHighMem(page)) { |
| 759 | debug_check_no_locks_freed(page_address(page), | 820 | debug_check_no_locks_freed(page_address(page), |
| 760 | PAGE_SIZE << order); | 821 | PAGE_SIZE << order); |
| @@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 861 | size >>= 1; | 922 | size >>= 1; |
| 862 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); | 923 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| 863 | 924 | ||
| 864 | #ifdef CONFIG_DEBUG_PAGEALLOC | 925 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && |
| 865 | if (high < debug_guardpage_minorder()) { | 926 | debug_guardpage_enabled() && |
| 927 | high < debug_guardpage_minorder()) { | ||
| 866 | /* | 928 | /* |
| 867 | * Mark as guard pages (or page), that will allow to | 929 | * Mark as guard pages (or page), that will allow to |
| 868 | * merge back to allocator when buddy will be freed. | 930 | * merge back to allocator when buddy will be freed. |
| 869 | * Corresponding page table entries will not be touched, | 931 | * Corresponding page table entries will not be touched, |
| 870 | * pages will stay not present in virtual address space | 932 | * pages will stay not present in virtual address space |
| 871 | */ | 933 | */ |
| 872 | INIT_LIST_HEAD(&page[size].lru); | 934 | set_page_guard(zone, &page[size], high, migratetype); |
| 873 | set_page_guard_flag(&page[size]); | ||
| 874 | set_page_private(&page[size], high); | ||
| 875 | /* Guard pages are not available for any usage */ | ||
| 876 | __mod_zone_freepage_state(zone, -(1 << high), | ||
| 877 | migratetype); | ||
| 878 | continue; | 935 | continue; |
| 879 | } | 936 | } |
| 880 | #endif | ||
| 881 | list_add(&page[size].lru, &area->free_list[migratetype]); | 937 | list_add(&page[size].lru, &area->free_list[migratetype]); |
| 882 | area->nr_free++; | 938 | area->nr_free++; |
| 883 | set_page_order(&page[size], high); | 939 | set_page_order(&page[size], high); |
| @@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
| 935 | if (order && (gfp_flags & __GFP_COMP)) | 991 | if (order && (gfp_flags & __GFP_COMP)) |
| 936 | prep_compound_page(page, order); | 992 | prep_compound_page(page, order); |
| 937 | 993 | ||
| 994 | set_page_owner(page, order, gfp_flags); | ||
| 995 | |||
| 938 | return 0; | 996 | return 0; |
| 939 | } | 997 | } |
| 940 | 998 | ||
| @@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order) | |||
| 1507 | split_page(virt_to_page(page[0].shadow), order); | 1565 | split_page(virt_to_page(page[0].shadow), order); |
| 1508 | #endif | 1566 | #endif |
| 1509 | 1567 | ||
| 1510 | for (i = 1; i < (1 << order); i++) | 1568 | set_page_owner(page, 0, 0); |
| 1569 | for (i = 1; i < (1 << order); i++) { | ||
| 1511 | set_page_refcounted(page + i); | 1570 | set_page_refcounted(page + i); |
| 1571 | set_page_owner(page + i, 0, 0); | ||
| 1572 | } | ||
| 1512 | } | 1573 | } |
| 1513 | EXPORT_SYMBOL_GPL(split_page); | 1574 | EXPORT_SYMBOL_GPL(split_page); |
| 1514 | 1575 | ||
| @@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
| 1548 | } | 1609 | } |
| 1549 | } | 1610 | } |
| 1550 | 1611 | ||
| 1612 | set_page_owner(page, order, 0); | ||
| 1551 | return 1UL << order; | 1613 | return 1UL << order; |
| 1552 | } | 1614 | } |
| 1553 | 1615 | ||
| @@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 4856 | #endif | 4918 | #endif |
| 4857 | init_waitqueue_head(&pgdat->kswapd_wait); | 4919 | init_waitqueue_head(&pgdat->kswapd_wait); |
| 4858 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4920 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
| 4921 | pgdat_page_ext_init(pgdat); | ||
| 4859 | 4922 | ||
| 4860 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4923 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 4861 | struct zone *zone = pgdat->node_zones + j; | 4924 | struct zone *zone = pgdat->node_zones + j; |
| @@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 4874 | * and per-cpu initialisations | 4937 | * and per-cpu initialisations |
| 4875 | */ | 4938 | */ |
| 4876 | memmap_pages = calc_memmap_size(size, realsize); | 4939 | memmap_pages = calc_memmap_size(size, realsize); |
| 4877 | if (freesize >= memmap_pages) { | 4940 | if (!is_highmem_idx(j)) { |
| 4878 | freesize -= memmap_pages; | 4941 | if (freesize >= memmap_pages) { |
| 4879 | if (memmap_pages) | 4942 | freesize -= memmap_pages; |
| 4880 | printk(KERN_DEBUG | 4943 | if (memmap_pages) |
| 4881 | " %s zone: %lu pages used for memmap\n", | 4944 | printk(KERN_DEBUG |
| 4882 | zone_names[j], memmap_pages); | 4945 | " %s zone: %lu pages used for memmap\n", |
| 4883 | } else | 4946 | zone_names[j], memmap_pages); |
| 4884 | printk(KERN_WARNING | 4947 | } else |
| 4885 | " %s zone: %lu pages exceeds freesize %lu\n", | 4948 | printk(KERN_WARNING |
| 4886 | zone_names[j], memmap_pages, freesize); | 4949 | " %s zone: %lu pages exceeds freesize %lu\n", |
| 4950 | zone_names[j], memmap_pages, freesize); | ||
| 4951 | } | ||
| 4887 | 4952 | ||
| 4888 | /* Account for reserved pages */ | 4953 | /* Account for reserved pages */ |
| 4889 | if (j == 0 && freesize > dma_reserve) { | 4954 | if (j == 0 && freesize > dma_reserve) { |
| @@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
| 6221 | if (!PageLRU(page)) | 6286 | if (!PageLRU(page)) |
| 6222 | found++; | 6287 | found++; |
| 6223 | /* | 6288 | /* |
| 6224 | * If there are RECLAIMABLE pages, we need to check it. | 6289 | * If there are RECLAIMABLE pages, we need to check |
| 6225 | * But now, memory offline itself doesn't call shrink_slab() | 6290 | * it. But now, memory offline itself doesn't call |
| 6226 | * and it still to be fixed. | 6291 | * shrink_node_slabs() and it still to be fixed. |
| 6227 | */ | 6292 | */ |
| 6228 | /* | 6293 | /* |
| 6229 | * If the page is not RAM, page_count()should be 0. | 6294 | * If the page is not RAM, page_count()should be 0. |
diff --git a/mm/page_ext.c b/mm/page_ext.c new file mode 100644 index 000000000000..d86fd2f5353f --- /dev/null +++ b/mm/page_ext.c | |||
| @@ -0,0 +1,403 @@ | |||
| 1 | #include <linux/mm.h> | ||
| 2 | #include <linux/mmzone.h> | ||
| 3 | #include <linux/bootmem.h> | ||
| 4 | #include <linux/page_ext.h> | ||
| 5 | #include <linux/memory.h> | ||
| 6 | #include <linux/vmalloc.h> | ||
| 7 | #include <linux/kmemleak.h> | ||
| 8 | #include <linux/page_owner.h> | ||
| 9 | |||
| 10 | /* | ||
| 11 | * struct page extension | ||
| 12 | * | ||
| 13 | * This is a feature to manage memory for extended data per page. | ||
| 14 | * | ||
| 15 | * Until now, we had to modify struct page itself to store extra data per page. | ||
| 16 | * This requires rebuilding the kernel, which is a really time-consuming process. | ||
| 17 | * And, sometimes, a rebuild is impossible due to third party module dependencies. | ||
| 18 | * Finally, enlarging struct page could cause unwanted changes in system behaviour. | ||
| 19 | * | ||
| 20 | * This feature is intended to overcome the above mentioned problems. It | ||
| 21 | * allocates memory for extended data per page in a certain place rather than | ||
| 22 | * in struct page itself. This memory can be accessed by the accessor | ||
| 23 | * functions provided by this code. During the boot process, it checks whether | ||
| 24 | * allocation of a huge chunk of memory is needed or not. If not, it avoids | ||
| 25 | * allocating memory at all. With this advantage, we can include this feature | ||
| 26 | * in the kernel by default, avoiding rebuilds and the problems described above. | ||
| 27 | * | ||
| 28 | * To make this work well, there are two callbacks for clients. One | ||
| 29 | * is the need callback, which is mandatory if the user wants to avoid useless | ||
| 30 | * memory allocation at boot-time. The other is the optional init callback, | ||
| 31 | * which is used to do proper initialization after memory is allocated. | ||
| 32 | * | ||
| 33 | * The need callback is used to decide whether extended memory allocation is | ||
| 34 | * needed or not. Sometimes users want to deactivate some features in this | ||
| 35 | * boot and the extra memory would be unnecessary. In this case, to avoid | ||
| 36 | * allocating a huge chunk of memory, each client states its need for | ||
| 37 | * extra memory through the need callback. If one of the need callbacks | ||
| 38 | * returns true, it means that someone needs extra memory and the | ||
| 39 | * page extension core should allocate memory for page extension. If | ||
| 40 | * none of the need callbacks returns true, memory isn't needed at all in this | ||
| 41 | * boot and the page extension core can skip the allocation. As a result, | ||
| 42 | * no memory is wasted. | ||
| 43 | * | ||
| 44 | * The init callback is used to do proper initialization after page extension | ||
| 45 | * is completely initialized. On sparse memory systems, the extra memory is | ||
| 46 | * allocated some time later than the memmap. In other words, the lifetime | ||
| 47 | * of the memory for page extension isn't the same as that of the memmap for | ||
| 48 | * struct page. Therefore, clients can't store extra data until page extension | ||
| 49 | * is initialized, even if pages are already allocated and used freely. This | ||
| 50 | * could leave the per-page extra data in an inadequate state, so, to prevent | ||
| 51 | * that, a client can use this callback to initialize that state correctly. | ||
| 52 | */ | ||
| 53 | |||
| 54 | static struct page_ext_operations *page_ext_ops[] = { | ||
| 55 | &debug_guardpage_ops, | ||
| 56 | #ifdef CONFIG_PAGE_POISONING | ||
| 57 | &page_poisoning_ops, | ||
| 58 | #endif | ||
| 59 | #ifdef CONFIG_PAGE_OWNER | ||
| 60 | &page_owner_ops, | ||
| 61 | #endif | ||
| 62 | }; | ||
| 63 | |||
| 64 | static unsigned long total_usage; | ||
| 65 | |||
| 66 | static bool __init invoke_need_callbacks(void) | ||
| 67 | { | ||
| 68 | int i; | ||
| 69 | int entries = ARRAY_SIZE(page_ext_ops); | ||
| 70 | |||
| 71 | for (i = 0; i < entries; i++) { | ||
| 72 | if (page_ext_ops[i]->need && page_ext_ops[i]->need()) | ||
| 73 | return true; | ||
| 74 | } | ||
| 75 | |||
| 76 | return false; | ||
| 77 | } | ||
| 78 | |||
| 79 | static void __init invoke_init_callbacks(void) | ||
| 80 | { | ||
| 81 | int i; | ||
| 82 | int entries = ARRAY_SIZE(page_ext_ops); | ||
| 83 | |||
| 84 | for (i = 0; i < entries; i++) { | ||
| 85 | if (page_ext_ops[i]->init) | ||
| 86 | page_ext_ops[i]->init(); | ||
| 87 | } | ||
| 88 | } | ||
| 89 | |||
| 90 | #if !defined(CONFIG_SPARSEMEM) | ||
| 91 | |||
| 92 | |||
| 93 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
| 94 | { | ||
| 95 | pgdat->node_page_ext = NULL; | ||
| 96 | } | ||
| 97 | |||
| 98 | struct page_ext *lookup_page_ext(struct page *page) | ||
| 99 | { | ||
| 100 | unsigned long pfn = page_to_pfn(page); | ||
| 101 | unsigned long offset; | ||
| 102 | struct page_ext *base; | ||
| 103 | |||
| 104 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | ||
| 105 | #ifdef CONFIG_DEBUG_VM | ||
| 106 | /* | ||
| 107 | * The sanity checks the page allocator does upon freeing a | ||
| 108 | * page can reach here before the page_ext arrays are | ||
| 109 | * allocated when feeding a range of pages to the allocator | ||
| 110 | * for the first time during bootup or memory hotplug. | ||
| 111 | */ | ||
| 112 | if (unlikely(!base)) | ||
| 113 | return NULL; | ||
| 114 | #endif | ||
| 115 | offset = pfn - round_down(node_start_pfn(page_to_nid(page)), | ||
| 116 | MAX_ORDER_NR_PAGES); | ||
| 117 | return base + offset; | ||
| 118 | } | ||
| 119 | |||
| 120 | static int __init alloc_node_page_ext(int nid) | ||
| 121 | { | ||
| 122 | struct page_ext *base; | ||
| 123 | unsigned long table_size; | ||
| 124 | unsigned long nr_pages; | ||
| 125 | |||
| 126 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
| 127 | if (!nr_pages) | ||
| 128 | return 0; | ||
| 129 | |||
| 130 | /* | ||
| 131 | * Need extra space if node range is not aligned with | ||
| 132 | * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm | ||
| 133 | * checks buddy's status, range could be out of exact node range. | ||
| 134 | */ | ||
| 135 | if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || | ||
| 136 | !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) | ||
| 137 | nr_pages += MAX_ORDER_NR_PAGES; | ||
| 138 | |||
| 139 | table_size = sizeof(struct page_ext) * nr_pages; | ||
| 140 | |||
| 141 | base = memblock_virt_alloc_try_nid_nopanic( | ||
| 142 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | ||
| 143 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
| 144 | if (!base) | ||
| 145 | return -ENOMEM; | ||
| 146 | NODE_DATA(nid)->node_page_ext = base; | ||
| 147 | total_usage += table_size; | ||
| 148 | return 0; | ||
| 149 | } | ||
| 150 | |||
| 151 | void __init page_ext_init_flatmem(void) | ||
| 152 | { | ||
| 153 | |||
| 154 | int nid, fail; | ||
| 155 | |||
| 156 | if (!invoke_need_callbacks()) | ||
| 157 | return; | ||
| 158 | |||
| 159 | for_each_online_node(nid) { | ||
| 160 | fail = alloc_node_page_ext(nid); | ||
| 161 | if (fail) | ||
| 162 | goto fail; | ||
| 163 | } | ||
| 164 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
| 165 | invoke_init_callbacks(); | ||
| 166 | return; | ||
| 167 | |||
| 168 | fail: | ||
| 169 | pr_crit("allocation of page_ext failed.\n"); | ||
| 170 | panic("Out of memory"); | ||
| 171 | } | ||
| 172 | |||
| 173 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
| 174 | |||
| 175 | struct page_ext *lookup_page_ext(struct page *page) | ||
| 176 | { | ||
| 177 | unsigned long pfn = page_to_pfn(page); | ||
| 178 | struct mem_section *section = __pfn_to_section(pfn); | ||
| 179 | #ifdef CONFIG_DEBUG_VM | ||
| 180 | /* | ||
| 181 | * The sanity checks the page allocator does upon freeing a | ||
| 182 | * page can reach here before the page_ext arrays are | ||
| 183 | * allocated when feeding a range of pages to the allocator | ||
| 184 | * for the first time during bootup or memory hotplug. | ||
| 185 | */ | ||
| 186 | if (!section->page_ext) | ||
| 187 | return NULL; | ||
| 188 | #endif | ||
| 189 | return section->page_ext + pfn; | ||
| 190 | } | ||
| 191 | |||
| 192 | static void *__meminit alloc_page_ext(size_t size, int nid) | ||
| 193 | { | ||
| 194 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
| 195 | void *addr = NULL; | ||
| 196 | |||
| 197 | addr = alloc_pages_exact_nid(nid, size, flags); | ||
| 198 | if (addr) { | ||
| 199 | kmemleak_alloc(addr, size, 1, flags); | ||
| 200 | return addr; | ||
| 201 | } | ||
| 202 | |||
| 203 | if (node_state(nid, N_HIGH_MEMORY)) | ||
| 204 | addr = vzalloc_node(size, nid); | ||
| 205 | else | ||
| 206 | addr = vzalloc(size); | ||
| 207 | |||
| 208 | return addr; | ||
| 209 | } | ||
| 210 | |||
| 211 | static int __meminit init_section_page_ext(unsigned long pfn, int nid) | ||
| 212 | { | ||
| 213 | struct mem_section *section; | ||
| 214 | struct page_ext *base; | ||
| 215 | unsigned long table_size; | ||
| 216 | |||
| 217 | section = __pfn_to_section(pfn); | ||
| 218 | |||
| 219 | if (section->page_ext) | ||
| 220 | return 0; | ||
| 221 | |||
| 222 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
| 223 | base = alloc_page_ext(table_size, nid); | ||
| 224 | |||
| 225 | /* | ||
| 226 | * The value stored in section->page_ext is (base - pfn) | ||
| 227 | * and it does not point to the memory block allocated above, | ||
| 228 | * causing kmemleak false positives. | ||
| 229 | */ | ||
| 230 | kmemleak_not_leak(base); | ||
| 231 | |||
| 232 | if (!base) { | ||
| 233 | pr_err("page ext allocation failure\n"); | ||
| 234 | return -ENOMEM; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* | ||
| 238 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
| 239 | * we need to apply a mask. | ||
| 240 | */ | ||
| 241 | pfn &= PAGE_SECTION_MASK; | ||
| 242 | section->page_ext = base - pfn; | ||
| 243 | total_usage += table_size; | ||
| 244 | return 0; | ||
| 245 | } | ||
| 246 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 247 | static void free_page_ext(void *addr) | ||
| 248 | { | ||
| 249 | if (is_vmalloc_addr(addr)) { | ||
| 250 | vfree(addr); | ||
| 251 | } else { | ||
| 252 | struct page *page = virt_to_page(addr); | ||
| 253 | size_t table_size; | ||
| 254 | |||
| 255 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
| 256 | |||
| 257 | BUG_ON(PageReserved(page)); | ||
| 258 | free_pages_exact(addr, table_size); | ||
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | static void __free_page_ext(unsigned long pfn) | ||
| 263 | { | ||
| 264 | struct mem_section *ms; | ||
| 265 | struct page_ext *base; | ||
| 266 | |||
| 267 | ms = __pfn_to_section(pfn); | ||
| 268 | if (!ms || !ms->page_ext) | ||
| 269 | return; | ||
| 270 | base = ms->page_ext + pfn; | ||
| 271 | free_page_ext(base); | ||
| 272 | ms->page_ext = NULL; | ||
| 273 | } | ||
| 274 | |||
| 275 | static int __meminit online_page_ext(unsigned long start_pfn, | ||
| 276 | unsigned long nr_pages, | ||
| 277 | int nid) | ||
| 278 | { | ||
| 279 | unsigned long start, end, pfn; | ||
| 280 | int fail = 0; | ||
| 281 | |||
| 282 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
| 283 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
| 284 | |||
| 285 | if (nid == -1) { | ||
| 286 | /* | ||
| 287 | * In this case, "nid" already exists and contains valid memory. | ||
| 288 | * "start_pfn" passed to us is a pfn which is an arg for | ||
| 289 | * online_pages(), and start_pfn should exist. | ||
| 290 | */ | ||
| 291 | nid = pfn_to_nid(start_pfn); | ||
| 292 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
| 293 | } | ||
| 294 | |||
| 295 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
| 296 | if (!pfn_present(pfn)) | ||
| 297 | continue; | ||
| 298 | fail = init_section_page_ext(pfn, nid); | ||
| 299 | } | ||
| 300 | if (!fail) | ||
| 301 | return 0; | ||
| 302 | |||
| 303 | /* rollback */ | ||
| 304 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
| 305 | __free_page_ext(pfn); | ||
| 306 | |||
| 307 | return -ENOMEM; | ||
| 308 | } | ||
| 309 | |||
| 310 | static int __meminit offline_page_ext(unsigned long start_pfn, | ||
| 311 | unsigned long nr_pages, int nid) | ||
| 312 | { | ||
| 313 | unsigned long start, end, pfn; | ||
| 314 | |||
| 315 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
| 316 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
| 317 | |||
| 318 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
| 319 | __free_page_ext(pfn); | ||
| 320 | return 0; | ||
| 321 | |||
| 322 | } | ||
| 323 | |||
| 324 | static int __meminit page_ext_callback(struct notifier_block *self, | ||
| 325 | unsigned long action, void *arg) | ||
| 326 | { | ||
| 327 | struct memory_notify *mn = arg; | ||
| 328 | int ret = 0; | ||
| 329 | |||
| 330 | switch (action) { | ||
| 331 | case MEM_GOING_ONLINE: | ||
| 332 | ret = online_page_ext(mn->start_pfn, | ||
| 333 | mn->nr_pages, mn->status_change_nid); | ||
| 334 | break; | ||
| 335 | case MEM_OFFLINE: | ||
| 336 | offline_page_ext(mn->start_pfn, | ||
| 337 | mn->nr_pages, mn->status_change_nid); | ||
| 338 | break; | ||
| 339 | case MEM_CANCEL_ONLINE: | ||
| 340 | offline_page_ext(mn->start_pfn, | ||
| 341 | mn->nr_pages, mn->status_change_nid); | ||
| 342 | break; | ||
| 343 | case MEM_GOING_OFFLINE: | ||
| 344 | break; | ||
| 345 | case MEM_ONLINE: | ||
| 346 | case MEM_CANCEL_OFFLINE: | ||
| 347 | break; | ||
| 348 | } | ||
| 349 | |||
| 350 | return notifier_from_errno(ret); | ||
| 351 | } | ||
| 352 | |||
| 353 | #endif | ||
| 354 | |||
| 355 | void __init page_ext_init(void) | ||
| 356 | { | ||
| 357 | unsigned long pfn; | ||
| 358 | int nid; | ||
| 359 | |||
| 360 | if (!invoke_need_callbacks()) | ||
| 361 | return; | ||
| 362 | |||
| 363 | for_each_node_state(nid, N_MEMORY) { | ||
| 364 | unsigned long start_pfn, end_pfn; | ||
| 365 | |||
| 366 | start_pfn = node_start_pfn(nid); | ||
| 367 | end_pfn = node_end_pfn(nid); | ||
| 368 | /* | ||
| 369 | * start_pfn and end_pfn may not be aligned to SECTION and the | ||
| 370 | * page->flags of out of node pages are not initialized. So we | ||
| 371 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. | ||
| 372 | */ | ||
| 373 | for (pfn = start_pfn; pfn < end_pfn; | ||
| 374 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
| 375 | |||
| 376 | if (!pfn_valid(pfn)) | ||
| 377 | continue; | ||
| 378 | /* | ||
| 380 | * Nodes' pfns can overlap. | ||
| 380 | * We know some arch can have a nodes layout such as | ||
| 381 | * -------------pfn--------------> | ||
| 382 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
| 383 | */ | ||
| 384 | if (pfn_to_nid(pfn) != nid) | ||
| 385 | continue; | ||
| 386 | if (init_section_page_ext(pfn, nid)) | ||
| 387 | goto oom; | ||
| 388 | } | ||
| 389 | } | ||
| 390 | hotplug_memory_notifier(page_ext_callback, 0); | ||
| 391 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
| 392 | invoke_init_callbacks(); | ||
| 393 | return; | ||
| 394 | |||
| 395 | oom: | ||
| 396 | panic("Out of memory"); | ||
| 397 | } | ||
| 398 | |||
| 399 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
| 400 | { | ||
| 401 | } | ||
| 402 | |||
| 403 | #endif | ||
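
For context, the sketch below illustrates the client contract described in the comment at the top of mm/page_ext.c: a need() callback that decides at boot whether the page_ext arrays are worth allocating, an init() callback that runs once they are usable, and per-page state reached through lookup_page_ext() instead of being stored in struct page. The "my_feature" names and the choice of flag bit are invented for illustration; only page_ext_operations, lookup_page_ext() and the flags field are taken from the code above.

#include <linux/mm.h>
#include <linux/page_ext.h>

/* Hypothetical client: all "my_feature" names are made up for this sketch. */
static bool my_feature_requested;	/* e.g. set from an early_param() handler */

static bool need_my_feature(void)
{
	/* Returning false lets the page_ext core skip the big allocation. */
	return my_feature_requested;
}

static void init_my_feature(void)
{
	/* Called once page_ext is fully set up; safe to touch per-page data. */
}

struct page_ext_operations my_feature_ops = {
	.need = need_my_feature,
	.init = init_my_feature,
};

/* Per-page state lives in struct page_ext, not in struct page itself. */
static void my_feature_mark_page(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (page_ext)
		__set_bit(PAGE_EXT_OWNER, &page_ext->flags); /* existing bit, reused for illustration */
}

A real client would also have to add &my_feature_ops to the page_ext_ops[] array above so that its callbacks are actually invoked.
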
diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 000000000000..9ab4a9b5bc09 --- /dev/null +++ b/mm/page_owner.c | |||
| @@ -0,0 +1,311 @@ | |||
| 1 | #include <linux/debugfs.h> | ||
| 2 | #include <linux/mm.h> | ||
| 3 | #include <linux/slab.h> | ||
| 4 | #include <linux/uaccess.h> | ||
| 5 | #include <linux/bootmem.h> | ||
| 6 | #include <linux/stacktrace.h> | ||
| 7 | #include <linux/page_owner.h> | ||
| 8 | #include "internal.h" | ||
| 9 | |||
| 10 | static bool page_owner_disabled = true; | ||
| 11 | bool page_owner_inited __read_mostly; | ||
| 12 | |||
| 13 | static void init_early_allocated_pages(void); | ||
| 14 | |||
| 15 | static int early_page_owner_param(char *buf) | ||
| 16 | { | ||
| 17 | if (!buf) | ||
| 18 | return -EINVAL; | ||
| 19 | |||
| 20 | if (strcmp(buf, "on") == 0) | ||
| 21 | page_owner_disabled = false; | ||
| 22 | |||
| 23 | return 0; | ||
| 24 | } | ||
| 25 | early_param("page_owner", early_page_owner_param); | ||
| 26 | |||
| 27 | static bool need_page_owner(void) | ||
| 28 | { | ||
| 29 | if (page_owner_disabled) | ||
| 30 | return false; | ||
| 31 | |||
| 32 | return true; | ||
| 33 | } | ||
| 34 | |||
| 35 | static void init_page_owner(void) | ||
| 36 | { | ||
| 37 | if (page_owner_disabled) | ||
| 38 | return; | ||
| 39 | |||
| 40 | page_owner_inited = true; | ||
| 41 | init_early_allocated_pages(); | ||
| 42 | } | ||
| 43 | |||
| 44 | struct page_ext_operations page_owner_ops = { | ||
| 45 | .need = need_page_owner, | ||
| 46 | .init = init_page_owner, | ||
| 47 | }; | ||
| 48 | |||
| 49 | void __reset_page_owner(struct page *page, unsigned int order) | ||
| 50 | { | ||
| 51 | int i; | ||
| 52 | struct page_ext *page_ext; | ||
| 53 | |||
| 54 | for (i = 0; i < (1 << order); i++) { | ||
| 55 | page_ext = lookup_page_ext(page + i); | ||
| 56 | __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
| 57 | } | ||
| 58 | } | ||
| 59 | |||
| 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | ||
| 61 | { | ||
| 62 | struct page_ext *page_ext; | ||
| 63 | struct stack_trace *trace; | ||
| 64 | |||
| 65 | page_ext = lookup_page_ext(page); | ||
| 66 | |||
| 67 | trace = &page_ext->trace; | ||
| 68 | trace->nr_entries = 0; | ||
| 69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
| 70 | trace->entries = &page_ext->trace_entries[0]; | ||
| 71 | trace->skip = 3; | ||
| 72 | save_stack_trace(&page_ext->trace); | ||
| 73 | |||
| 74 | page_ext->order = order; | ||
| 75 | page_ext->gfp_mask = gfp_mask; | ||
| 76 | |||
| 77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
| 78 | } | ||
| 79 | |||
| 80 | static ssize_t | ||
| 81 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | ||
| 82 | struct page *page, struct page_ext *page_ext) | ||
| 83 | { | ||
| 84 | int ret; | ||
| 85 | int pageblock_mt, page_mt; | ||
| 86 | char *kbuf; | ||
| 87 | |||
| 88 | kbuf = kmalloc(count, GFP_KERNEL); | ||
| 89 | if (!kbuf) | ||
| 90 | return -ENOMEM; | ||
| 91 | |||
| 92 | ret = snprintf(kbuf, count, | ||
| 93 | "Page allocated via order %u, mask 0x%x\n", | ||
| 94 | page_ext->order, page_ext->gfp_mask); | ||
| 95 | |||
| 96 | if (ret >= count) | ||
| 97 | goto err; | ||
| 98 | |||
| 99 | /* Print information relevant to grouping pages by mobility */ | ||
| 100 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
| 101 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
| 102 | ret += snprintf(kbuf + ret, count - ret, | ||
| 103 | "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", | ||
| 104 | pfn, | ||
| 105 | pfn >> pageblock_order, | ||
| 106 | pageblock_mt, | ||
| 107 | pageblock_mt != page_mt ? "Fallback" : " ", | ||
| 108 | PageLocked(page) ? "K" : " ", | ||
| 109 | PageError(page) ? "E" : " ", | ||
| 110 | PageReferenced(page) ? "R" : " ", | ||
| 111 | PageUptodate(page) ? "U" : " ", | ||
| 112 | PageDirty(page) ? "D" : " ", | ||
| 113 | PageLRU(page) ? "L" : " ", | ||
| 114 | PageActive(page) ? "A" : " ", | ||
| 115 | PageSlab(page) ? "S" : " ", | ||
| 116 | PageWriteback(page) ? "W" : " ", | ||
| 117 | PageCompound(page) ? "C" : " ", | ||
| 118 | PageSwapCache(page) ? "B" : " ", | ||
| 119 | PageMappedToDisk(page) ? "M" : " "); | ||
| 120 | |||
| 121 | if (ret >= count) | ||
| 122 | goto err; | ||
| 123 | |||
| 124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | ||
| 125 | &page_ext->trace, 0); | ||
| 126 | if (ret >= count) | ||
| 127 | goto err; | ||
| 128 | |||
| 129 | ret += snprintf(kbuf + ret, count - ret, "\n"); | ||
| 130 | if (ret >= count) | ||
| 131 | goto err; | ||
| 132 | |||
| 133 | if (copy_to_user(buf, kbuf, ret)) | ||
| 134 | ret = -EFAULT; | ||
| 135 | |||
| 136 | kfree(kbuf); | ||
| 137 | return ret; | ||
| 138 | |||
| 139 | err: | ||
| 140 | kfree(kbuf); | ||
| 141 | return -ENOMEM; | ||
| 142 | } | ||
| 143 | |||
| 144 | static ssize_t | ||
| 145 | read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
| 146 | { | ||
| 147 | unsigned long pfn; | ||
| 148 | struct page *page; | ||
| 149 | struct page_ext *page_ext; | ||
| 150 | |||
| 151 | if (!page_owner_inited) | ||
| 152 | return -EINVAL; | ||
| 153 | |||
| 154 | page = NULL; | ||
| 155 | pfn = min_low_pfn + *ppos; | ||
| 156 | |||
| 157 | /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ | ||
| 158 | while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) | ||
| 159 | pfn++; | ||
| 160 | |||
| 161 | drain_all_pages(NULL); | ||
| 162 | |||
| 163 | /* Find an allocated page */ | ||
| 164 | for (; pfn < max_pfn; pfn++) { | ||
| 165 | /* | ||
| 166 | * If the new page is in a new MAX_ORDER_NR_PAGES area, | ||
| 167 | * validate the area as existing, skip it if not | ||
| 168 | */ | ||
| 169 | if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { | ||
| 170 | pfn += MAX_ORDER_NR_PAGES - 1; | ||
| 171 | continue; | ||
| 172 | } | ||
| 173 | |||
| 174 | /* Check for holes within a MAX_ORDER area */ | ||
| 175 | if (!pfn_valid_within(pfn)) | ||
| 176 | continue; | ||
| 177 | |||
| 178 | page = pfn_to_page(pfn); | ||
| 179 | if (PageBuddy(page)) { | ||
| 180 | unsigned long freepage_order = page_order_unsafe(page); | ||
| 181 | |||
| 182 | if (freepage_order < MAX_ORDER) | ||
| 183 | pfn += (1UL << freepage_order) - 1; | ||
| 184 | continue; | ||
| 185 | } | ||
| 186 | |||
| 187 | page_ext = lookup_page_ext(page); | ||
| 188 | |||
| 189 | /* | ||
| 190 | * Some pages could be missed by concurrent allocation or free, | ||
| 191 | * because we don't hold the zone lock. | ||
| 192 | */ | ||
| 193 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
| 194 | continue; | ||
| 195 | |||
| 196 | /* Record the next PFN to read in the file offset */ | ||
| 197 | *ppos = (pfn - min_low_pfn) + 1; | ||
| 198 | |||
| 199 | return print_page_owner(buf, count, pfn, page, page_ext); | ||
| 200 | } | ||
| 201 | |||
| 202 | return 0; | ||
| 203 | } | ||
| 204 | |||
| 205 | static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | ||
| 206 | { | ||
| 207 | struct page *page; | ||
| 208 | struct page_ext *page_ext; | ||
| 209 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
| 210 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
| 211 | unsigned long count = 0; | ||
| 212 | |||
| 213 | /* Scan block by block. First and last block may be incomplete */ | ||
| 214 | pfn = zone->zone_start_pfn; | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
| 218 | * a zone boundary, it will be double counted between zones. This does | ||
| 219 | * not matter as the mixed block count will still be correct | ||
| 220 | */ | ||
| 221 | for (; pfn < end_pfn; ) { | ||
| 222 | if (!pfn_valid(pfn)) { | ||
| 223 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
| 224 | continue; | ||
| 225 | } | ||
| 226 | |||
| 227 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
| 228 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
| 229 | |||
| 230 | page = pfn_to_page(pfn); | ||
| 231 | |||
| 232 | for (; pfn < block_end_pfn; pfn++) { | ||
| 233 | if (!pfn_valid_within(pfn)) | ||
| 234 | continue; | ||
| 235 | |||
| 236 | page = pfn_to_page(pfn); | ||
| 237 | |||
| 238 | /* | ||
| 239 | * We are safe to check the buddy flag and order, because | ||
| 240 | * this is the init stage and only a single thread runs. | ||
| 241 | */ | ||
| 242 | if (PageBuddy(page)) { | ||
| 243 | pfn += (1UL << page_order(page)) - 1; | ||
| 244 | continue; | ||
| 245 | } | ||
| 246 | |||
| 247 | if (PageReserved(page)) | ||
| 248 | continue; | ||
| 249 | |||
| 250 | page_ext = lookup_page_ext(page); | ||
| 251 | |||
| 252 | /* Maybe overlapping zone */ | ||
| 253 | if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
| 254 | continue; | ||
| 255 | |||
| 256 | /* Found early allocated page */ | ||
| 257 | set_page_owner(page, 0, 0); | ||
| 258 | count++; | ||
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", | ||
| 263 | pgdat->node_id, zone->name, count); | ||
| 264 | } | ||
| 265 | |||
| 266 | static void init_zones_in_node(pg_data_t *pgdat) | ||
| 267 | { | ||
| 268 | struct zone *zone; | ||
| 269 | struct zone *node_zones = pgdat->node_zones; | ||
| 270 | unsigned long flags; | ||
| 271 | |||
| 272 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 273 | if (!populated_zone(zone)) | ||
| 274 | continue; | ||
| 275 | |||
| 276 | spin_lock_irqsave(&zone->lock, flags); | ||
| 277 | init_pages_in_zone(pgdat, zone); | ||
| 278 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 279 | } | ||
| 280 | } | ||
| 281 | |||
| 282 | static void init_early_allocated_pages(void) | ||
| 283 | { | ||
| 284 | pg_data_t *pgdat; | ||
| 285 | |||
| 286 | drain_all_pages(NULL); | ||
| 287 | for_each_online_pgdat(pgdat) | ||
| 288 | init_zones_in_node(pgdat); | ||
| 289 | } | ||
| 290 | |||
| 291 | static const struct file_operations proc_page_owner_operations = { | ||
| 292 | .read = read_page_owner, | ||
| 293 | }; | ||
| 294 | |||
| 295 | static int __init pageowner_init(void) | ||
| 296 | { | ||
| 297 | struct dentry *dentry; | ||
| 298 | |||
| 299 | if (!page_owner_inited) { | ||
| 300 | pr_info("page_owner is disabled\n"); | ||
| 301 | return 0; | ||
| 302 | } | ||
| 303 | |||
| 304 | dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, | ||
| 305 | NULL, &proc_page_owner_operations); | ||
| 306 | if (IS_ERR(dentry)) | ||
| 307 | return PTR_ERR(dentry); | ||
| 308 | |||
| 309 | return 0; | ||
| 310 | } | ||
| 311 | module_init(pageowner_init) | ||
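
As a usage illustration (not part of the patch), a minimal userspace reader for the debugfs file created by pageowner_init() could look like the following. It assumes the kernel was booted with page_owner=on and that debugfs is mounted at the conventional /sys/kernel/debug; each read() returns one record in the format produced by print_page_owner() above (order and gfp mask, PFN/migratetype/flags line, then the allocation stack trace).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

	if (fd < 0) {
		perror("open page_owner");
		return 1;
	}

	/* Each successful read() yields one allocated page's record. */
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}

	close(fd);
	return 0;
}
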
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
| 24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
| 25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
| 26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_rwsem |
| 27 | * anon_vma->rwsem | 27 | * anon_vma->rwsem |
| 28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
| 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
| @@ -1260,7 +1260,7 @@ out_mlock: | |||
| 1260 | /* | 1260 | /* |
| 1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
| 1262 | * unstable result and race. Plus, We can't wait here because | 1262 | * unstable result and race. Plus, We can't wait here because |
| 1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. | 1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. |
| 1264 | * if trylock failed, the page remain in evictable lru and later | 1264 | * if trylock failed, the page remain in evictable lru and later |
| 1265 | * vmscan could retry to move the page to unevictable lru if the | 1265 | * vmscan could retry to move the page to unevictable lru if the |
| 1266 | * page is actually mlocked. | 1266 | * page is actually mlocked. |
| @@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, | |||
| 1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | 1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) |
| 1636 | { | 1636 | { |
| 1637 | struct anon_vma *anon_vma; | 1637 | struct anon_vma *anon_vma; |
| 1638 | pgoff_t pgoff = page_to_pgoff(page); | 1638 | pgoff_t pgoff; |
| 1639 | struct anon_vma_chain *avc; | 1639 | struct anon_vma_chain *avc; |
| 1640 | int ret = SWAP_AGAIN; | 1640 | int ret = SWAP_AGAIN; |
| 1641 | 1641 | ||
| @@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
| 1643 | if (!anon_vma) | 1643 | if (!anon_vma) |
| 1644 | return ret; | 1644 | return ret; |
| 1645 | 1645 | ||
| 1646 | pgoff = page_to_pgoff(page); | ||
| 1646 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1647 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
| 1647 | struct vm_area_struct *vma = avc->vma; | 1648 | struct vm_area_struct *vma = avc->vma; |
| 1648 | unsigned long address = vma_address(page, vma); | 1649 | unsigned long address = vma_address(page, vma); |
| @@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
| 1676 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | 1677 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) |
| 1677 | { | 1678 | { |
| 1678 | struct address_space *mapping = page->mapping; | 1679 | struct address_space *mapping = page->mapping; |
| 1679 | pgoff_t pgoff = page_to_pgoff(page); | 1680 | pgoff_t pgoff; |
| 1680 | struct vm_area_struct *vma; | 1681 | struct vm_area_struct *vma; |
| 1681 | int ret = SWAP_AGAIN; | 1682 | int ret = SWAP_AGAIN; |
| 1682 | 1683 | ||
| @@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
| 1684 | * The page lock not only makes sure that page->mapping cannot | 1685 | * The page lock not only makes sure that page->mapping cannot |
| 1685 | * suddenly be NULLified by truncation, it makes sure that the | 1686 | * suddenly be NULLified by truncation, it makes sure that the |
| 1686 | * structure at mapping cannot be freed and reused yet, | 1687 | * structure at mapping cannot be freed and reused yet, |
| 1687 | * so we can safely take mapping->i_mmap_mutex. | 1688 | * so we can safely take mapping->i_mmap_rwsem. |
| 1688 | */ | 1689 | */ |
| 1689 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1690 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 1690 | 1691 | ||
| 1691 | if (!mapping) | 1692 | if (!mapping) |
| 1692 | return ret; | 1693 | return ret; |
| 1693 | mutex_lock(&mapping->i_mmap_mutex); | 1694 | |
| 1695 | pgoff = page_to_pgoff(page); | ||
| 1696 | i_mmap_lock_read(mapping); | ||
| 1694 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1697 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 1695 | unsigned long address = vma_address(page, vma); | 1698 | unsigned long address = vma_address(page, vma); |
| 1696 | 1699 | ||
| @@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
| 1711 | goto done; | 1714 | goto done; |
| 1712 | 1715 | ||
| 1713 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | 1716 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); |
| 1714 | |||
| 1715 | done: | 1717 | done: |
| 1716 | mutex_unlock(&mapping->i_mmap_mutex); | 1718 | i_mmap_unlock_read(mapping); |
| 1717 | return ret; | 1719 | return ret; |
| 1718 | } | 1720 | } |
| 1719 | 1721 | ||
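The rmap hunks above complete the switch from the exclusive i_mmap_mutex to the i_mmap_rwsem: rmap walkers only need to exclude modifications of the interval tree, so they can take the lock shared and run concurrently. A plausible sketch of the read-side helpers used here, assuming they are thin wrappers around the new rwsem (the wrappers themselves are not part of this hunk):

    /* Assumed definitions, for illustration only. */
    static inline void i_mmap_lock_read(struct address_space *mapping)
    {
            down_read(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_read(struct address_space *mapping)
    {
            up_read(&mapping->i_mmap_rwsem);
    }

With that, several rmap_walk_file() callers can walk mapping->i_mmap at the same time, while writers that change the tree still take the rwsem exclusively.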
| @@ -3015,7 +3015,7 @@ retry: | |||
| 3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
| 3016 | nid = zone_to_nid(zone); | 3016 | nid = zone_to_nid(zone); |
| 3017 | 3017 | ||
| 3018 | if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && | 3018 | if (cpuset_zone_allowed(zone, flags) && |
| 3019 | get_node(cache, nid) && | 3019 | get_node(cache, nid) && |
| 3020 | get_node(cache, nid)->free_objects) { | 3020 | get_node(cache, nid)->free_objects) { |
| 3021 | obj = ____cache_alloc_node(cache, | 3021 | obj = ____cache_alloc_node(cache, |
| @@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
| 3182 | memset(ptr, 0, cachep->object_size); | 3182 | memset(ptr, 0, cachep->object_size); |
| 3183 | } | 3183 | } |
| 3184 | 3184 | ||
| 3185 | memcg_kmem_put_cache(cachep); | ||
| 3185 | return ptr; | 3186 | return ptr; |
| 3186 | } | 3187 | } |
| 3187 | 3188 | ||
| @@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
| 3247 | memset(objp, 0, cachep->object_size); | 3248 | memset(objp, 0, cachep->object_size); |
| 3248 | } | 3249 | } |
| 3249 | 3250 | ||
| 3251 | memcg_kmem_put_cache(cachep); | ||
| 3250 | return objp; | 3252 | return objp; |
| 3251 | } | 3253 | } |
| 3252 | 3254 | ||
| @@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) | |||
| 1233 | kmemleak_free(x); | 1233 | kmemleak_free(x); |
| 1234 | } | 1234 | } |
| 1235 | 1235 | ||
| 1236 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | 1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
| 1237 | gfp_t flags) | ||
| 1237 | { | 1238 | { |
| 1238 | flags &= gfp_allowed_mask; | 1239 | flags &= gfp_allowed_mask; |
| 1239 | lockdep_trace_alloc(flags); | 1240 | lockdep_trace_alloc(flags); |
| 1240 | might_sleep_if(flags & __GFP_WAIT); | 1241 | might_sleep_if(flags & __GFP_WAIT); |
| 1241 | 1242 | ||
| 1242 | return should_failslab(s->object_size, flags, s->flags); | 1243 | if (should_failslab(s->object_size, flags, s->flags)) |
| 1244 | return NULL; | ||
| 1245 | |||
| 1246 | return memcg_kmem_get_cache(s, flags); | ||
| 1243 | } | 1247 | } |
| 1244 | 1248 | ||
| 1245 | static inline void slab_post_alloc_hook(struct kmem_cache *s, | 1249 | static inline void slab_post_alloc_hook(struct kmem_cache *s, |
| @@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
| 1248 | flags &= gfp_allowed_mask; | 1252 | flags &= gfp_allowed_mask; |
| 1249 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
| 1250 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
| 1255 | memcg_kmem_put_cache(s); | ||
| 1251 | } | 1256 | } |
| 1252 | 1257 | ||
| 1253 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
| @@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
| 1665 | 1670 | ||
| 1666 | n = get_node(s, zone_to_nid(zone)); | 1671 | n = get_node(s, zone_to_nid(zone)); |
| 1667 | 1672 | ||
| 1668 | if (n && cpuset_zone_allowed(zone, | 1673 | if (n && cpuset_zone_allowed(zone, flags) && |
| 1669 | flags | __GFP_HARDWALL) && | ||
| 1670 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
| 1671 | object = get_partial_node(s, n, c, flags); | 1675 | object = get_partial_node(s, n, c, flags); |
| 1672 | if (object) { | 1676 | if (object) { |
| @@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
| 2384 | struct page *page; | 2388 | struct page *page; |
| 2385 | unsigned long tid; | 2389 | unsigned long tid; |
| 2386 | 2390 | ||
| 2387 | if (slab_pre_alloc_hook(s, gfpflags)) | 2391 | s = slab_pre_alloc_hook(s, gfpflags); |
| 2392 | if (!s) | ||
| 2388 | return NULL; | 2393 | return NULL; |
| 2389 | |||
| 2390 | s = memcg_kmem_get_cache(s, gfpflags); | ||
| 2391 | redo: | 2394 | redo: |
| 2392 | /* | 2395 | /* |
| 2393 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is | 2396 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is |
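The slub change above folds cache selection into the allocation hooks: slab_pre_alloc_hook() now returns either the original cache, a per-memcg replacement from memcg_kmem_get_cache(), or NULL when should_failslab() triggers, and slab_post_alloc_hook() drops the reference again. A rough sketch of the resulting bracket (allocate_from() is a placeholder for the real fast/slow path, not a kernel function):

    /* Illustrative control flow only -- not the real slab_alloc_node(). */
    static void *alloc_sketch(struct kmem_cache *s, gfp_t gfpflags)
    {
            void *object;

            s = slab_pre_alloc_hook(s, gfpflags);   /* may swap in a memcg cache */
            if (!s)
                    return NULL;

            object = allocate_from(s, gfpflags);    /* placeholder */

            slab_post_alloc_hook(s, gfpflags, object); /* calls memcg_kmem_put_cache(s) */
            return object;
    }

The memcg_kmem_put_cache() calls added to mm/slab.c earlier in this diff serve the same purpose: every successful get is balanced by a put once the object is handed back.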
diff --git a/mm/vmacache.c b/mm/vmacache.c index 9f25af825dec..b6e3662fe339 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
| @@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
| 17 | { | 17 | { |
| 18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
| 19 | 19 | ||
| 20 | count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); | ||
| 21 | |||
| 20 | /* | 22 | /* |
| 21 | * Single threaded tasks need not iterate the entire | 23 | * Single threaded tasks need not iterate the entire |
| 22 | * list of process. We can avoid the flushing as well | 24 | * list of process. We can avoid the flushing as well |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8a18196fcdff..39c338896416 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
| 2574 | if (!counters) | 2574 | if (!counters) |
| 2575 | return; | 2575 | return; |
| 2576 | 2576 | ||
| 2577 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
| 2578 | smp_rmb(); | ||
| 2579 | if (v->flags & VM_UNINITIALIZED) | 2577 | if (v->flags & VM_UNINITIALIZED) |
| 2580 | return; | 2578 | return; |
| 2579 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
| 2580 | smp_rmb(); | ||
| 2581 | 2581 | ||
| 2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
| 2583 | 2583 | ||
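The vmalloc.c hunk only reorders the read barrier: smp_rmb() is there to order the load of v->flags against the later reads of the area's per-node data, so it belongs after the VM_UNINITIALIZED check rather than before it; the early-return path needs no ordering at all. A generic publish/consume sketch of the pairing (struct names and fields here are illustrative, not taken from vmalloc.c):

    /* Writer: fill in the data, then publish by clearing the flag. */
    void publish(struct obj *o)
    {
            o->data = compute_data();       /* illustrative field */
            smp_wmb();                      /* data visible before the flag clears */
            o->flags &= ~OBJ_UNINITIALIZED;
    }

    /* Reader: bail out if unpublished; otherwise order the flag load
     * before the data loads. */
    int consume(struct obj *o)
    {
            if (o->flags & OBJ_UNINITIALIZED)
                    return -EAGAIN;
            smp_rmb();                      /* pairs with smp_wmb() above */
            return use_data(o->data);
    }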
diff --git a/mm/vmscan.c b/mm/vmscan.c index a384339bf718..bd9a72bc4a1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
| 229 | 229 | ||
| 230 | #define SHRINK_BATCH 128 | 230 | #define SHRINK_BATCH 128 |
| 231 | 231 | ||
| 232 | static unsigned long | 232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, |
| 233 | shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | 233 | struct shrinker *shrinker, |
| 234 | unsigned long nr_pages_scanned, unsigned long lru_pages) | 234 | unsigned long nr_scanned, |
| 235 | unsigned long nr_eligible) | ||
| 235 | { | 236 | { |
| 236 | unsigned long freed = 0; | 237 | unsigned long freed = 0; |
| 237 | unsigned long long delta; | 238 | unsigned long long delta; |
| @@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
| 255 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); | 256 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); |
| 256 | 257 | ||
| 257 | total_scan = nr; | 258 | total_scan = nr; |
| 258 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 259 | delta = (4 * nr_scanned) / shrinker->seeks; |
| 259 | delta *= freeable; | 260 | delta *= freeable; |
| 260 | do_div(delta, lru_pages + 1); | 261 | do_div(delta, nr_eligible + 1); |
| 261 | total_scan += delta; | 262 | total_scan += delta; |
| 262 | if (total_scan < 0) { | 263 | if (total_scan < 0) { |
| 263 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", | 264 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |
| @@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
| 289 | total_scan = freeable * 2; | 290 | total_scan = freeable * 2; |
| 290 | 291 | ||
| 291 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, | 292 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, |
| 292 | nr_pages_scanned, lru_pages, | 293 | nr_scanned, nr_eligible, |
| 293 | freeable, delta, total_scan); | 294 | freeable, delta, total_scan); |
| 294 | 295 | ||
| 295 | /* | 296 | /* |
| 296 | * Normally, we should not scan less than batch_size objects in one | 297 | * Normally, we should not scan less than batch_size objects in one |
| @@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
| 339 | return freed; | 340 | return freed; |
| 340 | } | 341 | } |
| 341 | 342 | ||
| 342 | /* | 343 | /** |
| 343 | * Call the shrink functions to age shrinkable caches | 344 | * shrink_node_slabs - shrink slab caches of a given node |
| 344 | * | 345 | * @gfp_mask: allocation context |
| 345 | * Here we assume it costs one seek to replace a lru page and that it also | 346 | * @nid: node whose slab caches to target |
| 346 | * takes a seek to recreate a cache object. With this in mind we age equal | 347 | * @nr_scanned: pressure numerator |
| 347 | * percentages of the lru and ageable caches. This should balance the seeks | 348 | * @nr_eligible: pressure denominator |
| 348 | * generated by these structures. | ||
| 349 | * | 349 | * |
| 350 | * If the vm encountered mapped pages on the LRU it increase the pressure on | 350 | * Call the shrink functions to age shrinkable caches. |
| 351 | * slab to avoid swapping. | ||
| 352 | * | 351 | * |
| 353 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | 352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
| 353 | * unaware shrinkers will receive a node id of 0 instead. | ||
| 354 | * | 354 | * |
| 355 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 355 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of |
| 356 | * are eligible for the caller's allocation attempt. It is used for balancing | 356 | * the available objects should be scanned. Page reclaim for example |
| 357 | * slab reclaim versus page reclaim. | 357 | * passes the number of pages scanned and the number of pages on the |
| 358 | * LRU lists that it considered on @nid, plus a bias in @nr_scanned | ||
| 359 | * when it encountered mapped pages. The ratio is further biased by | ||
| 360 | * the ->seeks setting of the shrink function, which indicates the | ||
| 361 | * cost to recreate an object relative to that of an LRU page. | ||
| 358 | * | 362 | * |
| 359 | * Returns the number of slab objects which we shrunk. | 363 | * Returns the number of reclaimed slab objects. |
| 360 | */ | 364 | */ |
| 361 | unsigned long shrink_slab(struct shrink_control *shrinkctl, | 365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, |
| 362 | unsigned long nr_pages_scanned, | 366 | unsigned long nr_scanned, |
| 363 | unsigned long lru_pages) | 367 | unsigned long nr_eligible) |
| 364 | { | 368 | { |
| 365 | struct shrinker *shrinker; | 369 | struct shrinker *shrinker; |
| 366 | unsigned long freed = 0; | 370 | unsigned long freed = 0; |
| 367 | 371 | ||
| 368 | if (nr_pages_scanned == 0) | 372 | if (nr_scanned == 0) |
| 369 | nr_pages_scanned = SWAP_CLUSTER_MAX; | 373 | nr_scanned = SWAP_CLUSTER_MAX; |
| 370 | 374 | ||
| 371 | if (!down_read_trylock(&shrinker_rwsem)) { | 375 | if (!down_read_trylock(&shrinker_rwsem)) { |
| 372 | /* | 376 | /* |
| @@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
| 380 | } | 384 | } |
| 381 | 385 | ||
| 382 | list_for_each_entry(shrinker, &shrinker_list, list) { | 386 | list_for_each_entry(shrinker, &shrinker_list, list) { |
| 383 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { | 387 | struct shrink_control sc = { |
| 384 | shrinkctl->nid = 0; | 388 | .gfp_mask = gfp_mask, |
| 385 | freed += shrink_slab_node(shrinkctl, shrinker, | 389 | .nid = nid, |
| 386 | nr_pages_scanned, lru_pages); | 390 | }; |
| 387 | continue; | ||
| 388 | } | ||
| 389 | 391 | ||
| 390 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
| 391 | if (node_online(shrinkctl->nid)) | 393 | sc.nid = 0; |
| 392 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
| 393 | nr_pages_scanned, lru_pages); | ||
| 394 | 394 | ||
| 395 | } | 395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); |
| 396 | } | 396 | } |
| 397 | |||
| 397 | up_read(&shrinker_rwsem); | 398 | up_read(&shrinker_rwsem); |
| 398 | out: | 399 | out: |
| 399 | cond_resched(); | 400 | cond_resched(); |
| @@ -1876,7 +1877,8 @@ enum scan_balance { | |||
| 1876 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | 1877 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan |
| 1877 | */ | 1878 | */ |
| 1878 | static void get_scan_count(struct lruvec *lruvec, int swappiness, | 1879 | static void get_scan_count(struct lruvec *lruvec, int swappiness, |
| 1879 | struct scan_control *sc, unsigned long *nr) | 1880 | struct scan_control *sc, unsigned long *nr, |
| 1881 | unsigned long *lru_pages) | ||
| 1880 | { | 1882 | { |
| 1881 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1883 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
| 1882 | u64 fraction[2]; | 1884 | u64 fraction[2]; |
| @@ -2022,6 +2024,7 @@ out: | |||
| 2022 | some_scanned = false; | 2024 | some_scanned = false; |
| 2023 | /* Only use force_scan on second pass. */ | 2025 | /* Only use force_scan on second pass. */ |
| 2024 | for (pass = 0; !some_scanned && pass < 2; pass++) { | 2026 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
| 2027 | *lru_pages = 0; | ||
| 2025 | for_each_evictable_lru(lru) { | 2028 | for_each_evictable_lru(lru) { |
| 2026 | int file = is_file_lru(lru); | 2029 | int file = is_file_lru(lru); |
| 2027 | unsigned long size; | 2030 | unsigned long size; |
| @@ -2048,14 +2051,19 @@ out: | |||
| 2048 | case SCAN_FILE: | 2051 | case SCAN_FILE: |
| 2049 | case SCAN_ANON: | 2052 | case SCAN_ANON: |
| 2050 | /* Scan one type exclusively */ | 2053 | /* Scan one type exclusively */ |
| 2051 | if ((scan_balance == SCAN_FILE) != file) | 2054 | if ((scan_balance == SCAN_FILE) != file) { |
| 2055 | size = 0; | ||
| 2052 | scan = 0; | 2056 | scan = 0; |
| 2057 | } | ||
| 2053 | break; | 2058 | break; |
| 2054 | default: | 2059 | default: |
| 2055 | /* Look ma, no brain */ | 2060 | /* Look ma, no brain */ |
| 2056 | BUG(); | 2061 | BUG(); |
| 2057 | } | 2062 | } |
| 2063 | |||
| 2064 | *lru_pages += size; | ||
| 2058 | nr[lru] = scan; | 2065 | nr[lru] = scan; |
| 2066 | |||
| 2059 | /* | 2067 | /* |
| 2060 | * Skip the second pass and don't force_scan, | 2068 | * Skip the second pass and don't force_scan, |
| 2061 | * if we found something to scan. | 2069 | * if we found something to scan. |
| @@ -2069,7 +2077,7 @@ out: | |||
| 2069 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2077 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 2070 | */ | 2078 | */ |
| 2071 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | 2079 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, |
| 2072 | struct scan_control *sc) | 2080 | struct scan_control *sc, unsigned long *lru_pages) |
| 2073 | { | 2081 | { |
| 2074 | unsigned long nr[NR_LRU_LISTS]; | 2082 | unsigned long nr[NR_LRU_LISTS]; |
| 2075 | unsigned long targets[NR_LRU_LISTS]; | 2083 | unsigned long targets[NR_LRU_LISTS]; |
| @@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | |||
| 2080 | struct blk_plug plug; | 2088 | struct blk_plug plug; |
| 2081 | bool scan_adjusted; | 2089 | bool scan_adjusted; |
| 2082 | 2090 | ||
| 2083 | get_scan_count(lruvec, swappiness, sc, nr); | 2091 | get_scan_count(lruvec, swappiness, sc, nr, lru_pages); |
| 2084 | 2092 | ||
| 2085 | /* Record the original scan target for proportional adjustments later */ | 2093 | /* Record the original scan target for proportional adjustments later */ |
| 2086 | memcpy(targets, nr, sizeof(nr)); | 2094 | memcpy(targets, nr, sizeof(nr)); |
| @@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
| 2258 | } | 2266 | } |
| 2259 | } | 2267 | } |
| 2260 | 2268 | ||
| 2261 | static bool shrink_zone(struct zone *zone, struct scan_control *sc) | 2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
| 2270 | bool is_classzone) | ||
| 2262 | { | 2271 | { |
| 2263 | unsigned long nr_reclaimed, nr_scanned; | 2272 | unsigned long nr_reclaimed, nr_scanned; |
| 2264 | bool reclaimable = false; | 2273 | bool reclaimable = false; |
| @@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2269 | .zone = zone, | 2278 | .zone = zone, |
| 2270 | .priority = sc->priority, | 2279 | .priority = sc->priority, |
| 2271 | }; | 2280 | }; |
| 2281 | unsigned long zone_lru_pages = 0; | ||
| 2272 | struct mem_cgroup *memcg; | 2282 | struct mem_cgroup *memcg; |
| 2273 | 2283 | ||
| 2274 | nr_reclaimed = sc->nr_reclaimed; | 2284 | nr_reclaimed = sc->nr_reclaimed; |
| @@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2276 | 2286 | ||
| 2277 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
| 2278 | do { | 2288 | do { |
| 2289 | unsigned long lru_pages; | ||
| 2279 | struct lruvec *lruvec; | 2290 | struct lruvec *lruvec; |
| 2280 | int swappiness; | 2291 | int swappiness; |
| 2281 | 2292 | ||
| 2282 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2283 | swappiness = mem_cgroup_swappiness(memcg); | 2294 | swappiness = mem_cgroup_swappiness(memcg); |
| 2284 | 2295 | ||
| 2285 | shrink_lruvec(lruvec, swappiness, sc); | 2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
| 2297 | zone_lru_pages += lru_pages; | ||
| 2286 | 2298 | ||
| 2287 | /* | 2299 | /* |
| 2288 | * Direct reclaim and kswapd have to scan all memory | 2300 | * Direct reclaim and kswapd have to scan all memory |
| @@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2302 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
| 2303 | } while (memcg); | 2315 | } while (memcg); |
| 2304 | 2316 | ||
| 2317 | /* | ||
| 2318 | * Shrink the slab caches in the same proportion that | ||
| 2319 | * the eligible LRU pages were scanned. | ||
| 2320 | */ | ||
| 2321 | if (global_reclaim(sc) && is_classzone) { | ||
| 2322 | struct reclaim_state *reclaim_state; | ||
| 2323 | |||
| 2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | ||
| 2325 | sc->nr_scanned - nr_scanned, | ||
| 2326 | zone_lru_pages); | ||
| 2327 | |||
| 2328 | reclaim_state = current->reclaim_state; | ||
| 2329 | if (reclaim_state) { | ||
| 2330 | sc->nr_reclaimed += | ||
| 2331 | reclaim_state->reclaimed_slab; | ||
| 2332 | reclaim_state->reclaimed_slab = 0; | ||
| 2333 | } | ||
| 2334 | } | ||
| 2335 | |||
| 2305 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
| 2306 | sc->nr_scanned - nr_scanned, | 2337 | sc->nr_scanned - nr_scanned, |
| 2307 | sc->nr_reclaimed - nr_reclaimed); | 2338 | sc->nr_reclaimed - nr_reclaimed); |
| @@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2376 | struct zone *zone; | 2407 | struct zone *zone; |
| 2377 | unsigned long nr_soft_reclaimed; | 2408 | unsigned long nr_soft_reclaimed; |
| 2378 | unsigned long nr_soft_scanned; | 2409 | unsigned long nr_soft_scanned; |
| 2379 | unsigned long lru_pages = 0; | ||
| 2380 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 2381 | gfp_t orig_mask; | 2410 | gfp_t orig_mask; |
| 2382 | struct shrink_control shrink = { | ||
| 2383 | .gfp_mask = sc->gfp_mask, | ||
| 2384 | }; | ||
| 2385 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | 2411 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); |
| 2386 | bool reclaimable = false; | 2412 | bool reclaimable = false; |
| 2387 | 2413 | ||
| @@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2394 | if (buffer_heads_over_limit) | 2420 | if (buffer_heads_over_limit) |
| 2395 | sc->gfp_mask |= __GFP_HIGHMEM; | 2421 | sc->gfp_mask |= __GFP_HIGHMEM; |
| 2396 | 2422 | ||
| 2397 | nodes_clear(shrink.nodes_to_scan); | ||
| 2398 | |||
| 2399 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2423 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 2400 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2424 | requested_highidx, sc->nodemask) { |
| 2425 | enum zone_type classzone_idx; | ||
| 2426 | |||
| 2401 | if (!populated_zone(zone)) | 2427 | if (!populated_zone(zone)) |
| 2402 | continue; | 2428 | continue; |
| 2429 | |||
| 2430 | classzone_idx = requested_highidx; | ||
| 2431 | while (!populated_zone(zone->zone_pgdat->node_zones + | ||
| 2432 | classzone_idx)) | ||
| 2433 | classzone_idx--; | ||
| 2434 | |||
| 2403 | /* | 2435 | /* |
| 2404 | * Take care memory controller reclaiming has small influence | 2436 | * Take care memory controller reclaiming has small influence |
| 2405 | * to global LRU. | 2437 | * to global LRU. |
| @@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2409 | GFP_KERNEL | __GFP_HARDWALL)) | 2441 | GFP_KERNEL | __GFP_HARDWALL)) |
| 2410 | continue; | 2442 | continue; |
| 2411 | 2443 | ||
| 2412 | lru_pages += zone_reclaimable_pages(zone); | ||
| 2413 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
| 2414 | |||
| 2415 | if (sc->priority != DEF_PRIORITY && | 2444 | if (sc->priority != DEF_PRIORITY && |
| 2416 | !zone_reclaimable(zone)) | 2445 | !zone_reclaimable(zone)) |
| 2417 | continue; /* Let kswapd poll it */ | 2446 | continue; /* Let kswapd poll it */ |
| @@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2450 | /* need some check for avoid more shrink_zone() */ | 2479 | /* need some check for avoid more shrink_zone() */ |
| 2451 | } | 2480 | } |
| 2452 | 2481 | ||
| 2453 | if (shrink_zone(zone, sc)) | 2482 | if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) |
| 2454 | reclaimable = true; | 2483 | reclaimable = true; |
| 2455 | 2484 | ||
| 2456 | if (global_reclaim(sc) && | 2485 | if (global_reclaim(sc) && |
| @@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2459 | } | 2488 | } |
| 2460 | 2489 | ||
| 2461 | /* | 2490 | /* |
| 2462 | * Don't shrink slabs when reclaiming memory from over limit cgroups | ||
| 2463 | * but do shrink slab at least once when aborting reclaim for | ||
| 2464 | * compaction to avoid unevenly scanning file/anon LRU pages over slab | ||
| 2465 | * pages. | ||
| 2466 | */ | ||
| 2467 | if (global_reclaim(sc)) { | ||
| 2468 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
| 2469 | if (reclaim_state) { | ||
| 2470 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
| 2471 | reclaim_state->reclaimed_slab = 0; | ||
| 2472 | } | ||
| 2473 | } | ||
| 2474 | |||
| 2475 | /* | ||
| 2476 | * Restore to original mask to avoid the impact on the caller if we | 2491 | * Restore to original mask to avoid the impact on the caller if we |
| 2477 | * promoted it to __GFP_HIGHMEM. | 2492 | * promoted it to __GFP_HIGHMEM. |
| 2478 | */ | 2493 | */ |
| @@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
| 2736 | }; | 2751 | }; |
| 2737 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2752 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2738 | int swappiness = mem_cgroup_swappiness(memcg); | 2753 | int swappiness = mem_cgroup_swappiness(memcg); |
| 2754 | unsigned long lru_pages; | ||
| 2739 | 2755 | ||
| 2740 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2756 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
| 2741 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2757 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
| @@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
| 2751 | * will pick up pages from other mem cgroup's as well. We hack | 2767 | * will pick up pages from other mem cgroup's as well. We hack |
| 2752 | * the priority and make it zero. | 2768 | * the priority and make it zero. |
| 2753 | */ | 2769 | */ |
| 2754 | shrink_lruvec(lruvec, swappiness, &sc); | 2770 | shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); |
| 2755 | 2771 | ||
| 2756 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2772 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
| 2757 | 2773 | ||
| @@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
| 2932 | static bool kswapd_shrink_zone(struct zone *zone, | 2948 | static bool kswapd_shrink_zone(struct zone *zone, |
| 2933 | int classzone_idx, | 2949 | int classzone_idx, |
| 2934 | struct scan_control *sc, | 2950 | struct scan_control *sc, |
| 2935 | unsigned long lru_pages, | ||
| 2936 | unsigned long *nr_attempted) | 2951 | unsigned long *nr_attempted) |
| 2937 | { | 2952 | { |
| 2938 | int testorder = sc->order; | 2953 | int testorder = sc->order; |
| 2939 | unsigned long balance_gap; | 2954 | unsigned long balance_gap; |
| 2940 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 2941 | struct shrink_control shrink = { | ||
| 2942 | .gfp_mask = sc->gfp_mask, | ||
| 2943 | }; | ||
| 2944 | bool lowmem_pressure; | 2955 | bool lowmem_pressure; |
| 2945 | 2956 | ||
| 2946 | /* Reclaim above the high watermark. */ | 2957 | /* Reclaim above the high watermark. */ |
| @@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
| 2975 | balance_gap, classzone_idx)) | 2986 | balance_gap, classzone_idx)) |
| 2976 | return true; | 2987 | return true; |
| 2977 | 2988 | ||
| 2978 | shrink_zone(zone, sc); | 2989 | shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); |
| 2979 | nodes_clear(shrink.nodes_to_scan); | ||
| 2980 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
| 2981 | |||
| 2982 | reclaim_state->reclaimed_slab = 0; | ||
| 2983 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
| 2984 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
| 2985 | 2990 | ||
| 2986 | /* Account for the number of pages attempted to reclaim */ | 2991 | /* Account for the number of pages attempted to reclaim */ |
| 2987 | *nr_attempted += sc->nr_to_reclaim; | 2992 | *nr_attempted += sc->nr_to_reclaim; |
| @@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3042 | count_vm_event(PAGEOUTRUN); | 3047 | count_vm_event(PAGEOUTRUN); |
| 3043 | 3048 | ||
| 3044 | do { | 3049 | do { |
| 3045 | unsigned long lru_pages = 0; | ||
| 3046 | unsigned long nr_attempted = 0; | 3050 | unsigned long nr_attempted = 0; |
| 3047 | bool raise_priority = true; | 3051 | bool raise_priority = true; |
| 3048 | bool pgdat_needs_compaction = (order > 0); | 3052 | bool pgdat_needs_compaction = (order > 0); |
| @@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3102 | if (!populated_zone(zone)) | 3106 | if (!populated_zone(zone)) |
| 3103 | continue; | 3107 | continue; |
| 3104 | 3108 | ||
| 3105 | lru_pages += zone_reclaimable_pages(zone); | ||
| 3106 | |||
| 3107 | /* | 3109 | /* |
| 3108 | * If any zone is currently balanced then kswapd will | 3110 | * If any zone is currently balanced then kswapd will |
| 3109 | * not call compaction as it is expected that the | 3111 | * not call compaction as it is expected that the |
| @@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3159 | * that that high watermark would be met at 100% | 3161 | * that that high watermark would be met at 100% |
| 3160 | * efficiency. | 3162 | * efficiency. |
| 3161 | */ | 3163 | */ |
| 3162 | if (kswapd_shrink_zone(zone, end_zone, &sc, | 3164 | if (kswapd_shrink_zone(zone, end_zone, |
| 3163 | lru_pages, &nr_attempted)) | 3165 | &sc, &nr_attempted)) |
| 3164 | raise_priority = false; | 3166 | raise_priority = false; |
| 3165 | } | 3167 | } |
| 3166 | 3168 | ||
| @@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 3612 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3614 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 3613 | .may_swap = 1, | 3615 | .may_swap = 1, |
| 3614 | }; | 3616 | }; |
| 3615 | struct shrink_control shrink = { | ||
| 3616 | .gfp_mask = sc.gfp_mask, | ||
| 3617 | }; | ||
| 3618 | unsigned long nr_slab_pages0, nr_slab_pages1; | ||
| 3619 | 3617 | ||
| 3620 | cond_resched(); | 3618 | cond_resched(); |
| 3621 | /* | 3619 | /* |
| @@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 3634 | * priorities until we have enough memory freed. | 3632 | * priorities until we have enough memory freed. |
| 3635 | */ | 3633 | */ |
| 3636 | do { | 3634 | do { |
| 3637 | shrink_zone(zone, &sc); | 3635 | shrink_zone(zone, &sc, true); |
| 3638 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); | 3636 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
| 3639 | } | 3637 | } |
| 3640 | 3638 | ||
| 3641 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
| 3642 | if (nr_slab_pages0 > zone->min_slab_pages) { | ||
| 3643 | /* | ||
| 3644 | * shrink_slab() does not currently allow us to determine how | ||
| 3645 | * many pages were freed in this zone. So we take the current | ||
| 3646 | * number of slab pages and shake the slab until it is reduced | ||
| 3647 | * by the same nr_pages that we used for reclaiming unmapped | ||
| 3648 | * pages. | ||
| 3649 | */ | ||
| 3650 | nodes_clear(shrink.nodes_to_scan); | ||
| 3651 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
| 3652 | for (;;) { | ||
| 3653 | unsigned long lru_pages = zone_reclaimable_pages(zone); | ||
| 3654 | |||
| 3655 | /* No reclaimable slab or very low memory pressure */ | ||
| 3656 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) | ||
| 3657 | break; | ||
| 3658 | |||
| 3659 | /* Freed enough memory */ | ||
| 3660 | nr_slab_pages1 = zone_page_state(zone, | ||
| 3661 | NR_SLAB_RECLAIMABLE); | ||
| 3662 | if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) | ||
| 3663 | break; | ||
| 3664 | } | ||
| 3665 | |||
| 3666 | /* | ||
| 3667 | * Update nr_reclaimed by the number of slab pages we | ||
| 3668 | * reclaimed from this zone. | ||
| 3669 | */ | ||
| 3670 | nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
| 3671 | if (nr_slab_pages1 < nr_slab_pages0) | ||
| 3672 | sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; | ||
| 3673 | } | ||
| 3674 | |||
| 3675 | p->reclaim_state = NULL; | 3639 | p->reclaim_state = NULL; |
| 3676 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3640 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
| 3677 | lockdep_clear_current_reclaim_state(); | 3641 | lockdep_clear_current_reclaim_state(); |
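Most of the vmscan.c churn above is plumbing for one idea: slab caches are now shrunk from shrink_zone() in proportion to the LRU scanning that just happened, using the nr_scanned/nr_eligible ratio documented in the new shrink_node_slabs() kerneldoc. The per-shrinker scan target falls out of the arithmetic visible in the shrink_slabs() hunk; a standalone sketch of it (variable names mirror the diff, the function itself is only illustrative):

    #include <stdio.h>

    /*
     * delta = (4 * nr_scanned / seeks) * freeable / (nr_eligible + 1)
     * i.e. scan the shrinker's freeable objects in the same proportion
     * that eligible LRU pages were scanned, biased by the seek cost.
     */
    static unsigned long long scan_delta(unsigned long nr_scanned,
                                         unsigned long nr_eligible,
                                         unsigned long freeable,
                                         unsigned int seeks)
    {
            unsigned long long delta = (4ULL * nr_scanned) / seeks;

            delta *= freeable;
            delta /= nr_eligible + 1;
            return delta;
    }

    int main(void)
    {
            /* 1024 of 65536 eligible pages scanned, 10000 freeable objects,
             * seeks == 2 (DEFAULT_SEEKS): roughly 1/32 of the cache, ~312. */
            printf("%llu\n", scan_delta(1024, 65536, 10000, 2));
            return 0;
    }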
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc68..1284f89fca08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
| 23 | #include <linux/compaction.h> | 23 | #include <linux/compaction.h> |
| 24 | #include <linux/mm_inline.h> | 24 | #include <linux/mm_inline.h> |
| 25 | #include <linux/page_ext.h> | ||
| 26 | #include <linux/page_owner.h> | ||
| 25 | 27 | ||
| 26 | #include "internal.h" | 28 | #include "internal.h" |
| 27 | 29 | ||
| @@ -898,6 +900,7 @@ const char * const vmstat_text[] = { | |||
| 898 | #ifdef CONFIG_DEBUG_VM_VMACACHE | 900 | #ifdef CONFIG_DEBUG_VM_VMACACHE |
| 899 | "vmacache_find_calls", | 901 | "vmacache_find_calls", |
| 900 | "vmacache_find_hits", | 902 | "vmacache_find_hits", |
| 903 | "vmacache_full_flushes", | ||
| 901 | #endif | 904 | #endif |
| 902 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 905 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
| 903 | }; | 906 | }; |
| @@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | |||
| 1017 | return 0; | 1020 | return 0; |
| 1018 | } | 1021 | } |
| 1019 | 1022 | ||
| 1023 | #ifdef CONFIG_PAGE_OWNER | ||
| 1024 | static void pagetypeinfo_showmixedcount_print(struct seq_file *m, | ||
| 1025 | pg_data_t *pgdat, | ||
| 1026 | struct zone *zone) | ||
| 1027 | { | ||
| 1028 | struct page *page; | ||
| 1029 | struct page_ext *page_ext; | ||
| 1030 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
| 1031 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
| 1032 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
| 1033 | int pageblock_mt, page_mt; | ||
| 1034 | int i; | ||
| 1035 | |||
| 1036 | /* Scan block by block. First and last block may be incomplete */ | ||
| 1037 | pfn = zone->zone_start_pfn; | ||
| 1038 | |||
| 1039 | /* | ||
| 1040 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
| 1041 | * a zone boundary, it will be double counted between zones. This does | ||
| 1042 | * not matter as the mixed block count will still be correct | ||
| 1043 | */ | ||
| 1044 | for (; pfn < end_pfn; ) { | ||
| 1045 | if (!pfn_valid(pfn)) { | ||
| 1046 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
| 1047 | continue; | ||
| 1048 | } | ||
| 1049 | |||
| 1050 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
| 1051 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
| 1052 | |||
| 1053 | page = pfn_to_page(pfn); | ||
| 1054 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
| 1055 | |||
| 1056 | for (; pfn < block_end_pfn; pfn++) { | ||
| 1057 | if (!pfn_valid_within(pfn)) | ||
| 1058 | continue; | ||
| 1059 | |||
| 1060 | page = pfn_to_page(pfn); | ||
| 1061 | if (PageBuddy(page)) { | ||
| 1062 | pfn += (1UL << page_order(page)) - 1; | ||
| 1063 | continue; | ||
| 1064 | } | ||
| 1065 | |||
| 1066 | if (PageReserved(page)) | ||
| 1067 | continue; | ||
| 1068 | |||
| 1069 | page_ext = lookup_page_ext(page); | ||
| 1070 | |||
| 1071 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
| 1072 | continue; | ||
| 1073 | |||
| 1074 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
| 1075 | if (pageblock_mt != page_mt) { | ||
| 1076 | if (is_migrate_cma(pageblock_mt)) | ||
| 1077 | count[MIGRATE_MOVABLE]++; | ||
| 1078 | else | ||
| 1079 | count[pageblock_mt]++; | ||
| 1080 | |||
| 1081 | pfn = block_end_pfn; | ||
| 1082 | break; | ||
| 1083 | } | ||
| 1084 | pfn += (1UL << page_ext->order) - 1; | ||
| 1085 | } | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | /* Print counts */ | ||
| 1089 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 1090 | for (i = 0; i < MIGRATE_TYPES; i++) | ||
| 1091 | seq_printf(m, "%12lu ", count[i]); | ||
| 1092 | seq_putc(m, '\n'); | ||
| 1093 | } | ||
| 1094 | #endif /* CONFIG_PAGE_OWNER */ | ||
| 1095 | |||
| 1096 | /* | ||
| 1097 | * Print out the number of pageblocks for each migratetype that contain pages | ||
| 1098 | * of other types. This gives an indication of how well fallbacks are being | ||
| 1099 | * contained by rmqueue_fallback(). It requires information from PAGE_OWNER | ||
| 1100 | * to determine what is going on | ||
| 1101 | */ | ||
| 1102 | static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) | ||
| 1103 | { | ||
| 1104 | #ifdef CONFIG_PAGE_OWNER | ||
| 1105 | int mtype; | ||
| 1106 | |||
| 1107 | if (!page_owner_inited) | ||
| 1108 | return; | ||
| 1109 | |||
| 1110 | drain_all_pages(NULL); | ||
| 1111 | |||
| 1112 | seq_printf(m, "\n%-23s", "Number of mixed blocks "); | ||
| 1113 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
| 1114 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
| 1115 | seq_putc(m, '\n'); | ||
| 1116 | |||
| 1117 | walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); | ||
| 1118 | #endif /* CONFIG_PAGE_OWNER */ | ||
| 1119 | } | ||
| 1120 | |||
| 1020 | /* | 1121 | /* |
| 1021 | * This prints out statistics in relation to grouping pages by mobility. | 1122 | * This prints out statistics in relation to grouping pages by mobility. |
| 1022 | * It is expensive to collect so do not constantly read the file. | 1123 | * It is expensive to collect so do not constantly read the file. |
| @@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
| 1034 | seq_putc(m, '\n'); | 1135 | seq_putc(m, '\n'); |
| 1035 | pagetypeinfo_showfree(m, pgdat); | 1136 | pagetypeinfo_showfree(m, pgdat); |
| 1036 | pagetypeinfo_showblockcount(m, pgdat); | 1137 | pagetypeinfo_showblockcount(m, pgdat); |
| 1138 | pagetypeinfo_showmixedcount(m, pgdat); | ||
| 1037 | 1139 | ||
| 1038 | return 0; | 1140 | return 0; |
| 1039 | } | 1141 | } |
| @@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = { | |||
| 132 | 132 | ||
| 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) |
| 134 | { | 134 | { |
| 135 | return zbud_create_pool(gfp, &zbud_zpool_ops); | 135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | static void zbud_zpool_destroy(void *pool) | 138 | static void zbud_zpool_destroy(void *pool) |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 839a48c3ca27..4d0a063145ec 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -155,8 +155,6 @@ | |||
| 155 | * (reason above) | 155 | * (reason above) |
| 156 | */ | 156 | */ |
| 157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | 157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) |
| 158 | #define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ | ||
| 159 | ZS_SIZE_CLASS_DELTA + 1) | ||
| 160 | 158 | ||
| 161 | /* | 159 | /* |
| 162 | * We do not maintain any list for completely empty or full pages | 160 | * We do not maintain any list for completely empty or full pages |
| @@ -171,6 +169,11 @@ enum fullness_group { | |||
| 171 | }; | 169 | }; |
| 172 | 170 | ||
| 173 | /* | 171 | /* |
| 172 | * number of size_classes | ||
| 173 | */ | ||
| 174 | static int zs_size_classes; | ||
| 175 | |||
| 176 | /* | ||
| 174 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: | 177 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: |
| 175 | * n <= N / f, where | 178 | * n <= N / f, where |
| 176 | * n = number of allocated objects | 179 | * n = number of allocated objects |
| @@ -214,7 +217,7 @@ struct link_free { | |||
| 214 | }; | 217 | }; |
| 215 | 218 | ||
| 216 | struct zs_pool { | 219 | struct zs_pool { |
| 217 | struct size_class size_class[ZS_SIZE_CLASSES]; | 220 | struct size_class **size_class; |
| 218 | 221 | ||
| 219 | gfp_t flags; /* allocation flags used when growing pool */ | 222 | gfp_t flags; /* allocation flags used when growing pool */ |
| 220 | atomic_long_t pages_allocated; | 223 | atomic_long_t pages_allocated; |
| @@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
| 468 | if (newfg == currfg) | 471 | if (newfg == currfg) |
| 469 | goto out; | 472 | goto out; |
| 470 | 473 | ||
| 471 | class = &pool->size_class[class_idx]; | 474 | class = pool->size_class[class_idx]; |
| 472 | remove_zspage(page, class, currfg); | 475 | remove_zspage(page, class, currfg); |
| 473 | insert_zspage(page, class, newfg); | 476 | insert_zspage(page, class, newfg); |
| 474 | set_zspage_mapping(page, class_idx, newfg); | 477 | set_zspage_mapping(page, class_idx, newfg); |
| @@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 629 | struct page *next_page; | 632 | struct page *next_page; |
| 630 | struct link_free *link; | 633 | struct link_free *link; |
| 631 | unsigned int i = 1; | 634 | unsigned int i = 1; |
| 635 | void *vaddr; | ||
| 632 | 636 | ||
| 633 | /* | 637 | /* |
| 634 | * page->index stores offset of first object starting | 638 | * page->index stores offset of first object starting |
| @@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 639 | if (page != first_page) | 643 | if (page != first_page) |
| 640 | page->index = off; | 644 | page->index = off; |
| 641 | 645 | ||
| 642 | link = (struct link_free *)kmap_atomic(page) + | 646 | vaddr = kmap_atomic(page); |
| 643 | off / sizeof(*link); | 647 | link = (struct link_free *)vaddr + off / sizeof(*link); |
| 644 | 648 | ||
| 645 | while ((off += class->size) < PAGE_SIZE) { | 649 | while ((off += class->size) < PAGE_SIZE) { |
| 646 | link->next = obj_location_to_handle(page, i++); | 650 | link->next = obj_location_to_handle(page, i++); |
| @@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 654 | */ | 658 | */ |
| 655 | next_page = get_next_page(page); | 659 | next_page = get_next_page(page); |
| 656 | link->next = obj_location_to_handle(next_page, 0); | 660 | link->next = obj_location_to_handle(next_page, 0); |
| 657 | kunmap_atomic(link); | 661 | kunmap_atomic(vaddr); |
| 658 | page = next_page; | 662 | page = next_page; |
| 659 | off %= PAGE_SIZE; | 663 | off %= PAGE_SIZE; |
| 660 | } | 664 | } |
| @@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
| 784 | */ | 788 | */ |
| 785 | if (area->vm_buf) | 789 | if (area->vm_buf) |
| 786 | return 0; | 790 | return 0; |
| 787 | area->vm_buf = (char *)__get_free_page(GFP_KERNEL); | 791 | area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); |
| 788 | if (!area->vm_buf) | 792 | if (!area->vm_buf) |
| 789 | return -ENOMEM; | 793 | return -ENOMEM; |
| 790 | return 0; | 794 | return 0; |
| @@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
| 792 | 796 | ||
| 793 | static inline void __zs_cpu_down(struct mapping_area *area) | 797 | static inline void __zs_cpu_down(struct mapping_area *area) |
| 794 | { | 798 | { |
| 795 | if (area->vm_buf) | 799 | kfree(area->vm_buf); |
| 796 | free_page((unsigned long)area->vm_buf); | ||
| 797 | area->vm_buf = NULL; | 800 | area->vm_buf = NULL; |
| 798 | } | 801 | } |
| 799 | 802 | ||
| @@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = { | |||
| 881 | .notifier_call = zs_cpu_notifier | 884 | .notifier_call = zs_cpu_notifier |
| 882 | }; | 885 | }; |
| 883 | 886 | ||
| 884 | static void zs_exit(void) | 887 | static void zs_unregister_cpu_notifier(void) |
| 885 | { | 888 | { |
| 886 | int cpu; | 889 | int cpu; |
| 887 | 890 | ||
| 888 | #ifdef CONFIG_ZPOOL | ||
| 889 | zpool_unregister_driver(&zs_zpool_driver); | ||
| 890 | #endif | ||
| 891 | |||
| 892 | cpu_notifier_register_begin(); | 891 | cpu_notifier_register_begin(); |
| 893 | 892 | ||
| 894 | for_each_online_cpu(cpu) | 893 | for_each_online_cpu(cpu) |
| @@ -898,31 +897,74 @@ static void zs_exit(void) | |||
| 898 | cpu_notifier_register_done(); | 897 | cpu_notifier_register_done(); |
| 899 | } | 898 | } |
| 900 | 899 | ||
| 901 | static int zs_init(void) | 900 | static int zs_register_cpu_notifier(void) |
| 902 | { | 901 | { |
| 903 | int cpu, ret; | 902 | int cpu, uninitialized_var(ret); |
| 904 | 903 | ||
| 905 | cpu_notifier_register_begin(); | 904 | cpu_notifier_register_begin(); |
| 906 | 905 | ||
| 907 | __register_cpu_notifier(&zs_cpu_nb); | 906 | __register_cpu_notifier(&zs_cpu_nb); |
| 908 | for_each_online_cpu(cpu) { | 907 | for_each_online_cpu(cpu) { |
| 909 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 908 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 910 | if (notifier_to_errno(ret)) { | 909 | if (notifier_to_errno(ret)) |
| 911 | cpu_notifier_register_done(); | 910 | break; |
| 912 | goto fail; | ||
| 913 | } | ||
| 914 | } | 911 | } |
| 915 | 912 | ||
| 916 | cpu_notifier_register_done(); | 913 | cpu_notifier_register_done(); |
| 914 | return notifier_to_errno(ret); | ||
| 915 | } | ||
| 916 | |||
| 917 | static void init_zs_size_classes(void) | ||
| 918 | { | ||
| 919 | int nr; | ||
| 917 | 920 | ||
| 921 | nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; | ||
| 922 | if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) | ||
| 923 | nr += 1; | ||
| 924 | |||
| 925 | zs_size_classes = nr; | ||
| 926 | } | ||
| 927 | |||
| 928 | static void __exit zs_exit(void) | ||
| 929 | { | ||
| 918 | #ifdef CONFIG_ZPOOL | 930 | #ifdef CONFIG_ZPOOL |
| 919 | zpool_register_driver(&zs_zpool_driver); | 931 | zpool_unregister_driver(&zs_zpool_driver); |
| 920 | #endif | 932 | #endif |
| 933 | zs_unregister_cpu_notifier(); | ||
| 934 | } | ||
| 921 | 935 | ||
| 936 | static int __init zs_init(void) | ||
| 937 | { | ||
| 938 | int ret = zs_register_cpu_notifier(); | ||
| 939 | |||
| 940 | if (ret) { | ||
| 941 | zs_unregister_cpu_notifier(); | ||
| 942 | return ret; | ||
| 943 | } | ||
| 944 | |||
| 945 | init_zs_size_classes(); | ||
| 946 | |||
| 947 | #ifdef CONFIG_ZPOOL | ||
| 948 | zpool_register_driver(&zs_zpool_driver); | ||
| 949 | #endif | ||
| 922 | return 0; | 950 | return 0; |
| 923 | fail: | 951 | } |
| 924 | zs_exit(); | 952 | |
| 925 | return notifier_to_errno(ret); | 953 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) |
| 954 | { | ||
| 955 | return pages_per_zspage * PAGE_SIZE / size; | ||
| 956 | } | ||
| 957 | |||
| 958 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | ||
| 959 | { | ||
| 960 | if (prev->pages_per_zspage != pages_per_zspage) | ||
| 961 | return false; | ||
| 962 | |||
| 963 | if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) | ||
| 964 | != get_maxobj_per_zspage(size, pages_per_zspage)) | ||
| 965 | return false; | ||
| 966 | |||
| 967 | return true; | ||
| 926 | } | 968 | } |
| 927 | 969 | ||
| 928 | /** | 970 | /** |
| @@ -937,33 +979,71 @@ fail: | |||
| 937 | */ | 979 | */ |
| 938 | struct zs_pool *zs_create_pool(gfp_t flags) | 980 | struct zs_pool *zs_create_pool(gfp_t flags) |
| 939 | { | 981 | { |
| 940 | int i, ovhd_size; | 982 | int i; |
| 941 | struct zs_pool *pool; | 983 | struct zs_pool *pool; |
| 984 | struct size_class *prev_class = NULL; | ||
| 942 | 985 | ||
| 943 | ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); | 986 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); |
| 944 | pool = kzalloc(ovhd_size, GFP_KERNEL); | ||
| 945 | if (!pool) | 987 | if (!pool) |
| 946 | return NULL; | 988 | return NULL; |
| 947 | 989 | ||
| 948 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 990 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
| 991 | GFP_KERNEL); | ||
| 992 | if (!pool->size_class) { | ||
| 993 | kfree(pool); | ||
| 994 | return NULL; | ||
| 995 | } | ||
| 996 | |||
| 997 | /* | ||
| 998 | * Iterate in reverse, because the size of the size_class that we want | ||
| 999 | * to use for merging should be larger than or equal to the current size. | ||
| 1000 | */ | ||
| 1001 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
| 949 | int size; | 1002 | int size; |
| 1003 | int pages_per_zspage; | ||
| 950 | struct size_class *class; | 1004 | struct size_class *class; |
| 951 | 1005 | ||
| 952 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | 1006 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; |
| 953 | if (size > ZS_MAX_ALLOC_SIZE) | 1007 | if (size > ZS_MAX_ALLOC_SIZE) |
| 954 | size = ZS_MAX_ALLOC_SIZE; | 1008 | size = ZS_MAX_ALLOC_SIZE; |
| 1009 | pages_per_zspage = get_pages_per_zspage(size); | ||
| 1010 | |||
| 1011 | /* | ||
| 1012 | * size_class is used for normal zsmalloc operation such | ||
| 1013 | * as alloc/free for that size. Although it is natural that we | ||
| 1014 | * have one size_class for each size, there is a chance that we | ||
| 1015 | * can get more memory utilization if we use one size_class for | ||
| 1016 | * many different sizes whose size_classes have the same | ||
| 1017 | * characteristics. So, we make size_class point to the | ||
| 1018 | * previous size_class if possible. | ||
| 1019 | */ | ||
| 1020 | if (prev_class) { | ||
| 1021 | if (can_merge(prev_class, size, pages_per_zspage)) { | ||
| 1022 | pool->size_class[i] = prev_class; | ||
| 1023 | continue; | ||
| 1024 | } | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | class = kzalloc(sizeof(struct size_class), GFP_KERNEL); | ||
| 1028 | if (!class) | ||
| 1029 | goto err; | ||
| 955 | 1030 | ||
| 956 | class = &pool->size_class[i]; | ||
| 957 | class->size = size; | 1031 | class->size = size; |
| 958 | class->index = i; | 1032 | class->index = i; |
| 1033 | class->pages_per_zspage = pages_per_zspage; | ||
| 959 | spin_lock_init(&class->lock); | 1034 | spin_lock_init(&class->lock); |
| 960 | class->pages_per_zspage = get_pages_per_zspage(size); | 1035 | pool->size_class[i] = class; |
| 961 | 1036 | ||
| 1037 | prev_class = class; | ||
| 962 | } | 1038 | } |
| 963 | 1039 | ||
| 964 | pool->flags = flags; | 1040 | pool->flags = flags; |
| 965 | 1041 | ||
| 966 | return pool; | 1042 | return pool; |
| 1043 | |||
| 1044 | err: | ||
| 1045 | zs_destroy_pool(pool); | ||
| 1046 | return NULL; | ||
| 967 | } | 1047 | } |
| 968 | EXPORT_SYMBOL_GPL(zs_create_pool); | 1048 | EXPORT_SYMBOL_GPL(zs_create_pool); |
| 969 | 1049 | ||
| @@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 971 | { | 1051 | { |
| 972 | int i; | 1052 | int i; |
| 973 | 1053 | ||
| 974 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 1054 | for (i = 0; i < zs_size_classes; i++) { |
| 975 | int fg; | 1055 | int fg; |
| 976 | struct size_class *class = &pool->size_class[i]; | 1056 | struct size_class *class = pool->size_class[i]; |
| 1057 | |||
| 1058 | if (!class) | ||
| 1059 | continue; | ||
| 1060 | |||
| 1061 | if (class->index != i) | ||
| 1062 | continue; | ||
| 977 | 1063 | ||
| 978 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | 1064 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { |
| 979 | if (class->fullness_list[fg]) { | 1065 | if (class->fullness_list[fg]) { |
| @@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 981 | class->size, fg); | 1067 | class->size, fg); |
| 982 | } | 1068 | } |
| 983 | } | 1069 | } |
| 1070 | kfree(class); | ||
| 984 | } | 1071 | } |
| 1072 | |||
| 1073 | kfree(pool->size_class); | ||
| 985 | kfree(pool); | 1074 | kfree(pool); |
| 986 | } | 1075 | } |
| 987 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1076 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
| @@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 999 | { | 1088 | { |
| 1000 | unsigned long obj; | 1089 | unsigned long obj; |
| 1001 | struct link_free *link; | 1090 | struct link_free *link; |
| 1002 | int class_idx; | ||
| 1003 | struct size_class *class; | 1091 | struct size_class *class; |
| 1092 | void *vaddr; | ||
| 1004 | 1093 | ||
| 1005 | struct page *first_page, *m_page; | 1094 | struct page *first_page, *m_page; |
| 1006 | unsigned long m_objidx, m_offset; | 1095 | unsigned long m_objidx, m_offset; |
| @@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1008 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1097 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
| 1009 | return 0; | 1098 | return 0; |
| 1010 | 1099 | ||
| 1011 | class_idx = get_size_class_index(size); | 1100 | class = pool->size_class[get_size_class_index(size)]; |
| 1012 | class = &pool->size_class[class_idx]; | ||
| 1013 | BUG_ON(class_idx != class->index); | ||
| 1014 | 1101 | ||
| 1015 | spin_lock(&class->lock); | 1102 | spin_lock(&class->lock); |
| 1016 | first_page = find_get_zspage(class); | 1103 | first_page = find_get_zspage(class); |
| @@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1031 | obj_handle_to_location(obj, &m_page, &m_objidx); | 1118 | obj_handle_to_location(obj, &m_page, &m_objidx); |
| 1032 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | 1119 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); |
| 1033 | 1120 | ||
| 1034 | link = (struct link_free *)kmap_atomic(m_page) + | 1121 | vaddr = kmap_atomic(m_page); |
| 1035 | m_offset / sizeof(*link); | 1122 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); |
| 1036 | first_page->freelist = link->next; | 1123 | first_page->freelist = link->next; |
| 1037 | memset(link, POISON_INUSE, sizeof(*link)); | 1124 | memset(link, POISON_INUSE, sizeof(*link)); |
| 1038 | kunmap_atomic(link); | 1125 | kunmap_atomic(vaddr); |
| 1039 | 1126 | ||
| 1040 | first_page->inuse++; | 1127 | first_page->inuse++; |
| 1041 | /* Now move the zspage to another fullness group, if required */ | 1128 | /* Now move the zspage to another fullness group, if required */ |
| @@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
| 1051 | struct link_free *link; | 1138 | struct link_free *link; |
| 1052 | struct page *first_page, *f_page; | 1139 | struct page *first_page, *f_page; |
| 1053 | unsigned long f_objidx, f_offset; | 1140 | unsigned long f_objidx, f_offset; |
| 1141 | void *vaddr; | ||
| 1054 | 1142 | ||
| 1055 | int class_idx; | 1143 | int class_idx; |
| 1056 | struct size_class *class; | 1144 | struct size_class *class; |
| @@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
| 1063 | first_page = get_first_page(f_page); | 1151 | first_page = get_first_page(f_page); |
| 1064 | 1152 | ||
| 1065 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1153 | get_zspage_mapping(first_page, &class_idx, &fullness); |
| 1066 | class = &pool->size_class[class_idx]; | 1154 | class = pool->size_class[class_idx]; |
| 1067 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1155 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
| 1068 | 1156 | ||
| 1069 | spin_lock(&class->lock); | 1157 | spin_lock(&class->lock); |
| 1070 | 1158 | ||
| 1071 | /* Insert this object in containing zspage's freelist */ | 1159 | /* Insert this object in containing zspage's freelist */ |
| 1072 | link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) | 1160 | vaddr = kmap_atomic(f_page); |
| 1073 | + f_offset); | 1161 | link = (struct link_free *)(vaddr + f_offset); |
| 1074 | link->next = first_page->freelist; | 1162 | link->next = first_page->freelist; |
| 1075 | kunmap_atomic(link); | 1163 | kunmap_atomic(vaddr); |
| 1076 | first_page->freelist = (void *)obj; | 1164 | first_page->freelist = (void *)obj; |
| 1077 | 1165 | ||
| 1078 | first_page->inuse--; | 1166 | first_page->inuse--; |
| @@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1124 | 1212 | ||
| 1125 | obj_handle_to_location(handle, &page, &obj_idx); | 1213 | obj_handle_to_location(handle, &page, &obj_idx); |
| 1126 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1214 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1127 | class = &pool->size_class[class_idx]; | 1215 | class = pool->size_class[class_idx]; |
| 1128 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1216 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| 1129 | 1217 | ||
| 1130 | area = &get_cpu_var(zs_map_area); | 1218 | area = &get_cpu_var(zs_map_area); |
| @@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
| 1158 | 1246 | ||
| 1159 | obj_handle_to_location(handle, &page, &obj_idx); | 1247 | obj_handle_to_location(handle, &page, &obj_idx); |
| 1160 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1248 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1161 | class = &pool->size_class[class_idx]; | 1249 | class = pool->size_class[class_idx]; |
| 1162 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1250 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| 1163 | 1251 | ||
| 1164 | area = this_cpu_ptr(&zs_map_area); | 1252 | area = this_cpu_ptr(&zs_map_area); |
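The zsmalloc hunks above also change how atomic kmaps are released: the address returned by kmap_atomic() is kept in a local (vaddr) and that same address is handed back to kunmap_atomic(), rather than unmapping through a pointer that has already been offset into the page. A minimal sketch of the pattern, with placeholder names (page, offset; struct link_free mirrors the code above) rather than the full zsmalloc context:

    void *vaddr;
    struct link_free *link;

    vaddr = kmap_atomic(page);                        /* map the page */
    link = (struct link_free *)vaddr + offset / sizeof(*link);
    /* ... read or update the freelist entry ... */
    kunmap_atomic(vaddr);                             /* unmap with the address kmap_atomic() returned */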
diff --git a/mm/zswap.c b/mm/zswap.c index c1543061a192..0cfce9bc51e4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -149,11 +149,10 @@ static int __init zswap_comp_init(void) | |||
| 149 | return 0; | 149 | return 0; |
| 150 | } | 150 | } |
| 151 | 151 | ||
| 152 | static void zswap_comp_exit(void) | 152 | static void __init zswap_comp_exit(void) |
| 153 | { | 153 | { |
| 154 | /* free percpu transforms */ | 154 | /* free percpu transforms */ |
| 155 | if (zswap_comp_pcpu_tfms) | 155 | free_percpu(zswap_comp_pcpu_tfms); |
| 156 | free_percpu(zswap_comp_pcpu_tfms); | ||
| 157 | } | 156 | } |
| 158 | 157 | ||
| 159 | /********************************* | 158 | /********************************* |
| @@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | |||
| 206 | **********************************/ | 205 | **********************************/ |
| 207 | static struct kmem_cache *zswap_entry_cache; | 206 | static struct kmem_cache *zswap_entry_cache; |
| 208 | 207 | ||
| 209 | static int zswap_entry_cache_create(void) | 208 | static int __init zswap_entry_cache_create(void) |
| 210 | { | 209 | { |
| 211 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | 210 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); |
| 212 | return zswap_entry_cache == NULL; | 211 | return zswap_entry_cache == NULL; |
| @@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = { | |||
| 389 | .notifier_call = zswap_cpu_notifier | 388 | .notifier_call = zswap_cpu_notifier |
| 390 | }; | 389 | }; |
| 391 | 390 | ||
| 392 | static int zswap_cpu_init(void) | 391 | static int __init zswap_cpu_init(void) |
| 393 | { | 392 | { |
| 394 | unsigned long cpu; | 393 | unsigned long cpu; |
| 395 | 394 | ||
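The zswap hunks mark functions that are reached only from the driver's boot-time setup path with __init, so their code can be discarded once initialisation completes. A minimal sketch of the convention, using made-up names (my_setup_helper and my_driver_init are illustrative, not part of zswap):

    #include <linux/init.h>

    static int __init my_setup_helper(void)      /* called only from the init path */
    {
            return 0;
    }

    static int __init my_driver_init(void)
    {
            return my_setup_helper();             /* both can live in the discarded .init.text section */
    }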
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 45f145c6f843..c14893b501a9 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile | |||
| @@ -15,6 +15,7 @@ TARGETS += user | |||
| 15 | TARGETS += sysctl | 15 | TARGETS += sysctl |
| 16 | TARGETS += firmware | 16 | TARGETS += firmware |
| 17 | TARGETS += ftrace | 17 | TARGETS += ftrace |
| 18 | TARGETS += exec | ||
| 18 | 19 | ||
| 19 | TARGETS_HOTPLUG = cpu-hotplug | 20 | TARGETS_HOTPLUG = cpu-hotplug |
| 20 | TARGETS_HOTPLUG += memory-hotplug | 21 | TARGETS_HOTPLUG += memory-hotplug |
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore new file mode 100644 index 000000000000..64073e050c6a --- /dev/null +++ b/tools/testing/selftests/exec/.gitignore | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | subdir* | ||
| 2 | script* | ||
| 3 | execveat | ||
| 4 | execveat.symlink | ||
| 5 | execveat.moved | ||
| 6 | execveat.path.ephemeral | ||
| 7 | execveat.ephemeral | ||
| 8 | execveat.denatured | ||
| 9 | xxxxxxxx* \ No newline at end of file | ||
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile new file mode 100644 index 000000000000..66dfc2ce1788 --- /dev/null +++ b/tools/testing/selftests/exec/Makefile | |||
| @@ -0,0 +1,25 @@ | |||
| 1 | CC = $(CROSS_COMPILE)gcc | ||
| 2 | CFLAGS = -Wall | ||
| 3 | BINARIES = execveat | ||
| 4 | DEPS = execveat.symlink execveat.denatured script subdir | ||
| 5 | all: $(BINARIES) $(DEPS) | ||
| 6 | |||
| 7 | subdir: | ||
| 8 | mkdir -p $@ | ||
| 9 | script: | ||
| 10 | echo '#!/bin/sh' > $@ | ||
| 11 | echo 'exit $$*' >> $@ | ||
| 12 | chmod +x $@ | ||
| 13 | execveat.symlink: execveat | ||
| 14 | ln -s -f $< $@ | ||
| 15 | execveat.denatured: execveat | ||
| 16 | cp $< $@ | ||
| 17 | chmod -x $@ | ||
| 18 | %: %.c | ||
| 19 | $(CC) $(CFLAGS) -o $@ $^ | ||
| 20 | |||
| 21 | run_tests: all | ||
| 22 | ./execveat | ||
| 23 | |||
| 24 | clean: | ||
| 25 | rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx* | ||
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c new file mode 100644 index 000000000000..33a5c06d95ca --- /dev/null +++ b/tools/testing/selftests/exec/execveat.c | |||
| @@ -0,0 +1,397 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2014 Google, Inc. | ||
| 3 | * | ||
| 4 | * Licensed under the terms of the GNU GPL License version 2 | ||
| 5 | * | ||
| 6 | * Selftests for execveat(2). | ||
| 7 | */ | ||
| 8 | |||
| 9 | #define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ | ||
| 10 | #include <sys/sendfile.h> | ||
| 11 | #include <sys/stat.h> | ||
| 12 | #include <sys/syscall.h> | ||
| 13 | #include <sys/types.h> | ||
| 14 | #include <sys/wait.h> | ||
| 15 | #include <errno.h> | ||
| 16 | #include <fcntl.h> | ||
| 17 | #include <limits.h> | ||
| 18 | #include <stdio.h> | ||
| 19 | #include <stdlib.h> | ||
| 20 | #include <string.h> | ||
| 21 | #include <unistd.h> | ||
| 22 | |||
| 23 | static char longpath[2 * PATH_MAX] = ""; | ||
| 24 | static char *envp[] = { "IN_TEST=yes", NULL, NULL }; | ||
| 25 | static char *argv[] = { "execveat", "99", NULL }; | ||
| 26 | |||
| 27 | static int execveat_(int fd, const char *path, char **argv, char **envp, | ||
| 28 | int flags) | ||
| 29 | { | ||
| 30 | #ifdef __NR_execveat | ||
| 31 | return syscall(__NR_execveat, fd, path, argv, envp, flags); | ||
| 32 | #else | ||
| 33 | errno = ENOSYS; | ||
| 34 | return -1; | ||
| 35 | #endif | ||
| 36 | } | ||
| 37 | |||
| 38 | #define check_execveat_fail(fd, path, flags, errno) \ | ||
| 39 | _check_execveat_fail(fd, path, flags, errno, #errno) | ||
| 40 | static int _check_execveat_fail(int fd, const char *path, int flags, | ||
| 41 | int expected_errno, const char *errno_str) | ||
| 42 | { | ||
| 43 | int rc; | ||
| 44 | |||
| 45 | errno = 0; | ||
| 46 | printf("Check failure of execveat(%d, '%s', %d) with %s... ", | ||
| 47 | fd, path?:"(null)", flags, errno_str); | ||
| 48 | rc = execveat_(fd, path, argv, envp, flags); | ||
| 49 | |||
| 50 | if (rc > 0) { | ||
| 51 | printf("[FAIL] (unexpected success from execveat(2))\n"); | ||
| 52 | return 1; | ||
| 53 | } | ||
| 54 | if (errno != expected_errno) { | ||
| 55 | printf("[FAIL] (expected errno %d (%s) not %d (%s)\n", | ||
| 56 | expected_errno, strerror(expected_errno), | ||
| 57 | errno, strerror(errno)); | ||
| 58 | return 1; | ||
| 59 | } | ||
| 60 | printf("[OK]\n"); | ||
| 61 | return 0; | ||
| 62 | } | ||
| 63 | |||
| 64 | static int check_execveat_invoked_rc(int fd, const char *path, int flags, | ||
| 65 | int expected_rc) | ||
| 66 | { | ||
| 67 | int status; | ||
| 68 | int rc; | ||
| 69 | pid_t child; | ||
| 70 | int pathlen = path ? strlen(path) : 0; | ||
| 71 | |||
| 72 | if (pathlen > 40) | ||
| 73 | printf("Check success of execveat(%d, '%.20s...%s', %d)... ", | ||
| 74 | fd, path, (path + pathlen - 20), flags); | ||
| 75 | else | ||
| 76 | printf("Check success of execveat(%d, '%s', %d)... ", | ||
| 77 | fd, path?:"(null)", flags); | ||
| 78 | child = fork(); | ||
| 79 | if (child < 0) { | ||
| 80 | printf("[FAIL] (fork() failed)\n"); | ||
| 81 | return 1; | ||
| 82 | } | ||
| 83 | if (child == 0) { | ||
| 84 | /* Child: do execveat(). */ | ||
| 85 | rc = execveat_(fd, path, argv, envp, flags); | ||
| 86 | printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n", | ||
| 87 | rc, errno, strerror(errno)); | ||
| 88 | exit(1); /* should not reach here */ | ||
| 89 | } | ||
| 90 | /* Parent: wait for & check child's exit status. */ | ||
| 91 | rc = waitpid(child, &status, 0); | ||
| 92 | if (rc != child) { | ||
| 93 | printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc); | ||
| 94 | return 1; | ||
| 95 | } | ||
| 96 | if (!WIFEXITED(status)) { | ||
| 97 | printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n", | ||
| 98 | child, status); | ||
| 99 | return 1; | ||
| 100 | } | ||
| 101 | if (WEXITSTATUS(status) != expected_rc) { | ||
| 102 | printf("[FAIL] (child %d exited with %d not %d)\n", | ||
| 103 | child, WEXITSTATUS(status), expected_rc); | ||
| 104 | return 1; | ||
| 105 | } | ||
| 106 | printf("[OK]\n"); | ||
| 107 | return 0; | ||
| 108 | } | ||
| 109 | |||
| 110 | static int check_execveat(int fd, const char *path, int flags) | ||
| 111 | { | ||
| 112 | return check_execveat_invoked_rc(fd, path, flags, 99); | ||
| 113 | } | ||
| 114 | |||
| 115 | static char *concat(const char *left, const char *right) | ||
| 116 | { | ||
| 117 | char *result = malloc(strlen(left) + strlen(right) + 1); | ||
| 118 | |||
| 119 | strcpy(result, left); | ||
| 120 | strcat(result, right); | ||
| 121 | return result; | ||
| 122 | } | ||
| 123 | |||
| 124 | static int open_or_die(const char *filename, int flags) | ||
| 125 | { | ||
| 126 | int fd = open(filename, flags); | ||
| 127 | |||
| 128 | if (fd < 0) { | ||
| 129 | printf("Failed to open '%s'; " | ||
| 130 | "check prerequisites are available\n", filename); | ||
| 131 | exit(1); | ||
| 132 | } | ||
| 133 | return fd; | ||
| 134 | } | ||
| 135 | |||
| 136 | static void exe_cp(const char *src, const char *dest) | ||
| 137 | { | ||
| 138 | int in_fd = open_or_die(src, O_RDONLY); | ||
| 139 | int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755); | ||
| 140 | struct stat info; | ||
| 141 | |||
| 142 | fstat(in_fd, &info); | ||
| 143 | sendfile(out_fd, in_fd, NULL, info.st_size); | ||
| 144 | close(in_fd); | ||
| 145 | close(out_fd); | ||
| 146 | } | ||
| 147 | |||
| 148 | #define XX_DIR_LEN 200 | ||
| 149 | static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script) | ||
| 150 | { | ||
| 151 | int fail = 0; | ||
| 152 | int ii, count, len; | ||
| 153 | char longname[XX_DIR_LEN + 1]; | ||
| 154 | int fd; | ||
| 155 | |||
| 156 | if (*longpath == '\0') { | ||
| 157 | /* Create a filename close to PATH_MAX in length */ | ||
| 158 | memset(longname, 'x', XX_DIR_LEN - 1); | ||
| 159 | longname[XX_DIR_LEN - 1] = '/'; | ||
| 160 | longname[XX_DIR_LEN] = '\0'; | ||
| 161 | count = (PATH_MAX - 3) / XX_DIR_LEN; | ||
| 162 | for (ii = 0; ii < count; ii++) { | ||
| 163 | strcat(longpath, longname); | ||
| 164 | mkdir(longpath, 0755); | ||
| 165 | } | ||
| 166 | len = (PATH_MAX - 3) - (count * XX_DIR_LEN); | ||
| 167 | if (len <= 0) | ||
| 168 | len = 1; | ||
| 169 | memset(longname, 'y', len); | ||
| 170 | longname[len] = '\0'; | ||
| 171 | strcat(longpath, longname); | ||
| 172 | } | ||
| 173 | exe_cp(src, longpath); | ||
| 174 | |||
| 175 | /* | ||
| 176 | * Execute as a pre-opened file descriptor, which works whether this is | ||
| 177 | * a script or not (because the interpreter sees a filename like | ||
| 178 | * "/dev/fd/20"). | ||
| 179 | */ | ||
| 180 | fd = open(longpath, O_RDONLY); | ||
| 181 | if (fd > 0) { | ||
| 182 | printf("Invoke copy of '%s' via filename of length %lu:\n", | ||
| 183 | src, strlen(longpath)); | ||
| 184 | fail += check_execveat(fd, "", AT_EMPTY_PATH); | ||
| 185 | } else { | ||
| 186 | printf("Failed to open length %lu filename, errno=%d (%s)\n", | ||
| 187 | strlen(longpath), errno, strerror(errno)); | ||
| 188 | fail++; | ||
| 189 | } | ||
| 190 | |||
| 191 | /* | ||
| 192 | * Execute as a long pathname relative to ".". If this is a script, | ||
| 193 | * the interpreter will launch but fail to open the script because its | ||
| 194 | * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX. | ||
| 195 | */ | ||
| 196 | if (is_script) | ||
| 197 | fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127); | ||
| 198 | else | ||
| 199 | fail += check_execveat(dot_dfd, longpath, 0); | ||
| 200 | |||
| 201 | return fail; | ||
| 202 | } | ||
| 203 | |||
| 204 | static int run_tests(void) | ||
| 205 | { | ||
| 206 | int fail = 0; | ||
| 207 | char *fullname = realpath("execveat", NULL); | ||
| 208 | char *fullname_script = realpath("script", NULL); | ||
| 209 | char *fullname_symlink = concat(fullname, ".symlink"); | ||
| 210 | int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY); | ||
| 211 | int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral", | ||
| 212 | O_DIRECTORY|O_RDONLY); | ||
| 213 | int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY); | ||
| 214 | int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH); | ||
| 215 | int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC); | ||
| 216 | int fd = open_or_die("execveat", O_RDONLY); | ||
| 217 | int fd_path = open_or_die("execveat", O_RDONLY|O_PATH); | ||
| 218 | int fd_symlink = open_or_die("execveat.symlink", O_RDONLY); | ||
| 219 | int fd_denatured = open_or_die("execveat.denatured", O_RDONLY); | ||
| 220 | int fd_denatured_path = open_or_die("execveat.denatured", | ||
| 221 | O_RDONLY|O_PATH); | ||
| 222 | int fd_script = open_or_die("script", O_RDONLY); | ||
| 223 | int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY); | ||
| 224 | int fd_ephemeral_path = open_or_die("execveat.path.ephemeral", | ||
| 225 | O_RDONLY|O_PATH); | ||
| 226 | int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY); | ||
| 227 | int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC); | ||
| 228 | int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC); | ||
| 229 | |||
| 230 | /* Change file position to confirm it doesn't affect anything */ | ||
| 231 | lseek(fd, 10, SEEK_SET); | ||
| 232 | |||
| 233 | /* Normal executable file: */ | ||
| 234 | /* dfd + path */ | ||
| 235 | fail += check_execveat(subdir_dfd, "../execveat", 0); | ||
| 236 | fail += check_execveat(dot_dfd, "execveat", 0); | ||
| 237 | fail += check_execveat(dot_dfd_path, "execveat", 0); | ||
| 238 | /* absolute path */ | ||
| 239 | fail += check_execveat(AT_FDCWD, fullname, 0); | ||
| 240 | /* absolute path with nonsense dfd */ | ||
| 241 | fail += check_execveat(99, fullname, 0); | ||
| 242 | /* fd + no path */ | ||
| 243 | fail += check_execveat(fd, "", AT_EMPTY_PATH); | ||
| 244 | /* O_CLOEXEC fd + no path */ | ||
| 245 | fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH); | ||
| 246 | /* O_PATH fd */ | ||
| 247 | fail += check_execveat(fd_path, "", AT_EMPTY_PATH); | ||
| 248 | |||
| 249 | /* Mess with executable file that's already open: */ | ||
| 250 | /* fd + no path to a file that's been renamed */ | ||
| 251 | rename("execveat.ephemeral", "execveat.moved"); | ||
| 252 | fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); | ||
| 253 | /* fd + no path to a file that's been deleted */ | ||
| 254 | unlink("execveat.moved"); /* remove the file now fd open */ | ||
| 255 | fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); | ||
| 256 | |||
| 257 | /* Mess with executable file that's already open with O_PATH */ | ||
| 258 | /* fd + no path to a file that's been deleted */ | ||
| 259 | unlink("execveat.path.ephemeral"); | ||
| 260 | fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH); | ||
| 261 | |||
| 262 | /* Invalid argument failures */ | ||
| 263 | fail += check_execveat_fail(fd, "", 0, ENOENT); | ||
| 264 | fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT); | ||
| 265 | |||
| 266 | /* Symlink to executable file: */ | ||
| 267 | /* dfd + path */ | ||
| 268 | fail += check_execveat(dot_dfd, "execveat.symlink", 0); | ||
| 269 | fail += check_execveat(dot_dfd_path, "execveat.symlink", 0); | ||
| 270 | /* absolute path */ | ||
| 271 | fail += check_execveat(AT_FDCWD, fullname_symlink, 0); | ||
| 272 | /* fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */ | ||
| 273 | fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH); | ||
| 274 | fail += check_execveat(fd_symlink, "", | ||
| 275 | AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); | ||
| 276 | |||
| 277 | /* Symlink fails when AT_SYMLINK_NOFOLLOW set: */ | ||
| 278 | /* dfd + path */ | ||
| 279 | fail += check_execveat_fail(dot_dfd, "execveat.symlink", | ||
| 280 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
| 281 | fail += check_execveat_fail(dot_dfd_path, "execveat.symlink", | ||
| 282 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
| 283 | /* absolute path */ | ||
| 284 | fail += check_execveat_fail(AT_FDCWD, fullname_symlink, | ||
| 285 | AT_SYMLINK_NOFOLLOW, ELOOP); | ||
| 286 | |||
| 287 | /* Shell script wrapping executable file: */ | ||
| 288 | /* dfd + path */ | ||
| 289 | fail += check_execveat(subdir_dfd, "../script", 0); | ||
| 290 | fail += check_execveat(dot_dfd, "script", 0); | ||
| 291 | fail += check_execveat(dot_dfd_path, "script", 0); | ||
| 292 | /* absolute path */ | ||
| 293 | fail += check_execveat(AT_FDCWD, fullname_script, 0); | ||
| 294 | /* fd + no path */ | ||
| 295 | fail += check_execveat(fd_script, "", AT_EMPTY_PATH); | ||
| 296 | fail += check_execveat(fd_script, "", | ||
| 297 | AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); | ||
| 298 | /* O_CLOEXEC fd fails for a script (as script file inaccessible) */ | ||
| 299 | fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH, | ||
| 300 | ENOENT); | ||
| 301 | fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT); | ||
| 302 | |||
| 303 | /* Mess with script file that's already open: */ | ||
| 304 | /* fd + no path to a file that's been renamed */ | ||
| 305 | rename("script.ephemeral", "script.moved"); | ||
| 306 | fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); | ||
| 307 | /* fd + no path to a file that's been deleted */ | ||
| 308 | unlink("script.moved"); /* remove the file while fd open */ | ||
| 309 | fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); | ||
| 310 | |||
| 311 | /* Rename a subdirectory in the path: */ | ||
| 312 | rename("subdir.ephemeral", "subdir.moved"); | ||
| 313 | fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); | ||
| 314 | fail += check_execveat(subdir_dfd_ephemeral, "script", 0); | ||
| 315 | /* Remove the subdir and its contents */ | ||
| 316 | unlink("subdir.moved/script"); | ||
| 317 | unlink("subdir.moved"); | ||
| 318 | /* Shell loads via deleted subdir OK because name starts with .. */ | ||
| 319 | fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); | ||
| 320 | fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT); | ||
| 321 | |||
| 322 | /* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */ | ||
| 323 | fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL); | ||
| 324 | /* Invalid path => ENOENT */ | ||
| 325 | fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT); | ||
| 326 | fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT); | ||
| 327 | fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT); | ||
| 328 | /* Attempt to execute directory => EACCES */ | ||
| 329 | fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES); | ||
| 330 | /* Attempt to execute non-executable => EACCES */ | ||
| 331 | fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES); | ||
| 332 | fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES); | ||
| 333 | fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH, | ||
| 334 | EACCES); | ||
| 335 | /* Attempt to execute nonsense FD => EBADF */ | ||
| 336 | fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF); | ||
| 337 | fail += check_execveat_fail(99, "execveat", 0, EBADF); | ||
| 338 | /* Attempt to execute relative to non-directory => ENOTDIR */ | ||
| 339 | fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR); | ||
| 340 | |||
| 341 | fail += check_execveat_pathmax(dot_dfd, "execveat", 0); | ||
| 342 | fail += check_execveat_pathmax(dot_dfd, "script", 1); | ||
| 343 | return fail; | ||
| 344 | } | ||
| 345 | |||
| 346 | static void prerequisites(void) | ||
| 347 | { | ||
| 348 | int fd; | ||
| 349 | const char *script = "#!/bin/sh\nexit $*\n"; | ||
| 350 | |||
| 351 | /* Create ephemeral copies of files */ | ||
| 352 | exe_cp("execveat", "execveat.ephemeral"); | ||
| 353 | exe_cp("execveat", "execveat.path.ephemeral"); | ||
| 354 | exe_cp("script", "script.ephemeral"); | ||
| 355 | mkdir("subdir.ephemeral", 0755); | ||
| 356 | |||
| 357 | fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); | ||
| 358 | write(fd, script, strlen(script)); | ||
| 359 | close(fd); | ||
| 360 | } | ||
| 361 | |||
| 362 | int main(int argc, char **argv) | ||
| 363 | { | ||
| 364 | int ii; | ||
| 365 | int rc; | ||
| 366 | const char *verbose = getenv("VERBOSE"); | ||
| 367 | |||
| 368 | if (argc >= 2) { | ||
| 369 | /* If we are invoked with an argument, don't run tests. */ | ||
| 370 | const char *in_test = getenv("IN_TEST"); | ||
| 371 | |||
| 372 | if (verbose) { | ||
| 373 | printf(" invoked with:"); | ||
| 374 | for (ii = 0; ii < argc; ii++) | ||
| 375 | printf(" [%d]='%s'", ii, argv[ii]); | ||
| 376 | printf("\n"); | ||
| 377 | } | ||
| 378 | |||
| 379 | /* Check expected environment transferred. */ | ||
| 380 | if (!in_test || strcmp(in_test, "yes") != 0) { | ||
| 381 | printf("[FAIL] (no IN_TEST=yes in env)\n"); | ||
| 382 | return 1; | ||
| 383 | } | ||
| 384 | |||
| 385 | /* Use the final argument as an exit code. */ | ||
| 386 | rc = atoi(argv[argc - 1]); | ||
| 387 | fflush(stdout); | ||
| 388 | } else { | ||
| 389 | prerequisites(); | ||
| 390 | if (verbose) | ||
| 391 | envp[1] = "VERBOSE=1"; | ||
| 392 | rc = run_tests(); | ||
| 393 | if (rc > 0) | ||
| 394 | printf("%d tests failed\n", rc); | ||
| 395 | } | ||
| 396 | return rc; | ||
| 397 | } | ||
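The new selftest drives execveat(2) through a raw syscall() wrapper (execveat_() above), since a libc wrapper for the then-new system call is not assumed to exist. A standalone sketch of the same invocation, assuming the kernel headers define __NR_execveat and that ./some-binary is an executable file (both names here are illustrative):

    #define _GNU_SOURCE            /* for AT_EMPTY_PATH */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            char *argv[] = { "some-binary", NULL };
            char *envp[] = { NULL };
            int fd = open("./some-binary", O_RDONLY | O_CLOEXEC);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* "" plus AT_EMPTY_PATH means: execute the file referred to by fd itself. */
            syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
            perror("execveat");    /* only reached if the call failed */
            return 1;
    }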
diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 3d907dacf2ac..ac884b65a072 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | # Makefile for vm tools | 1 | # Makefile for vm tools |
| 2 | # | 2 | # |
| 3 | TARGETS=page-types slabinfo | 3 | TARGETS=page-types slabinfo page_owner_sort |
| 4 | 4 | ||
| 5 | LIB_DIR = ../lib/api | 5 | LIB_DIR = ../lib/api |
| 6 | LIBS = $(LIB_DIR)/libapikfs.a | 6 | LIBS = $(LIB_DIR)/libapikfs.a |
| @@ -18,5 +18,5 @@ $(LIBS): | |||
| 18 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) | 18 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) |
| 19 | 19 | ||
| 20 | clean: | 20 | clean: |
| 21 | $(RM) page-types slabinfo | 21 | $(RM) page-types slabinfo page_owner_sort |
| 22 | make -C $(LIB_DIR) clean | 22 | make -C $(LIB_DIR) clean |
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c new file mode 100644 index 000000000000..77147b42d598 --- /dev/null +++ b/tools/vm/page_owner_sort.c | |||
| @@ -0,0 +1,144 @@ | |||
| 1 | /* | ||
| 2 | * User-space helper to sort the output of /sys/kernel/debug/page_owner | ||
| 3 | * | ||
| 4 | * Example use: | ||
| 5 | * cat /sys/kernel/debug/page_owner > page_owner_full.txt | ||
| 6 | * grep -v ^PFN page_owner_full.txt > page_owner.txt | ||
| 7 | * ./page_owner_sort page_owner.txt sorted_page_owner.txt | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <stdio.h> | ||
| 11 | #include <stdlib.h> | ||
| 12 | #include <sys/types.h> | ||
| 13 | #include <sys/stat.h> | ||
| 14 | #include <fcntl.h> | ||
| 15 | #include <unistd.h> | ||
| 16 | #include <string.h> | ||
| 17 | |||
| 18 | struct block_list { | ||
| 19 | char *txt; | ||
| 20 | int len; | ||
| 21 | int num; | ||
| 22 | }; | ||
| 23 | |||
| 24 | |||
| 25 | static struct block_list *list; | ||
| 26 | static int list_size; | ||
| 27 | static int max_size; | ||
| 28 | |||
| 29 | struct block_list *block_head; | ||
| 30 | |||
| 31 | int read_block(char *buf, int buf_size, FILE *fin) | ||
| 32 | { | ||
| 33 | char *curr = buf, *const buf_end = buf + buf_size; | ||
| 34 | |||
| 35 | while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { | ||
| 36 | if (*curr == '\n') /* empty line */ | ||
| 37 | return curr - buf; | ||
| 38 | curr += strlen(curr); | ||
| 39 | } | ||
| 40 | |||
| 41 | return -1; /* EOF or no space left in buf. */ | ||
| 42 | } | ||
| 43 | |||
| 44 | static int compare_txt(const void *p1, const void *p2) | ||
| 45 | { | ||
| 46 | const struct block_list *l1 = p1, *l2 = p2; | ||
| 47 | |||
| 48 | return strcmp(l1->txt, l2->txt); | ||
| 49 | } | ||
| 50 | |||
| 51 | static int compare_num(const void *p1, const void *p2) | ||
| 52 | { | ||
| 53 | const struct block_list *l1 = p1, *l2 = p2; | ||
| 54 | |||
| 55 | return l2->num - l1->num; | ||
| 56 | } | ||
| 57 | |||
| 58 | static void add_list(char *buf, int len) | ||
| 59 | { | ||
| 60 | if (list_size != 0 && | ||
| 61 | len == list[list_size-1].len && | ||
| 62 | memcmp(buf, list[list_size-1].txt, len) == 0) { | ||
| 63 | list[list_size-1].num++; | ||
| 64 | return; | ||
| 65 | } | ||
| 66 | if (list_size == max_size) { | ||
| 67 | printf("max_size too small??\n"); | ||
| 68 | exit(1); | ||
| 69 | } | ||
| 70 | list[list_size].txt = malloc(len+1); | ||
| 71 | list[list_size].len = len; | ||
| 72 | list[list_size].num = 1; | ||
| 73 | memcpy(list[list_size].txt, buf, len); | ||
| 74 | list[list_size].txt[len] = 0; | ||
| 75 | list_size++; | ||
| 76 | if (list_size % 1000 == 0) { | ||
| 77 | printf("loaded %d\r", list_size); | ||
| 78 | fflush(stdout); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | |||
| 82 | #define BUF_SIZE 1024 | ||
| 83 | |||
| 84 | int main(int argc, char **argv) | ||
| 85 | { | ||
| 86 | FILE *fin, *fout; | ||
| 87 | char buf[BUF_SIZE]; | ||
| 88 | int ret, i, count; | ||
| 89 | struct block_list *list2; | ||
| 90 | struct stat st; | ||
| 91 | |||
| 92 | if (argc < 3) { | ||
| 93 | printf("Usage: ./program <input> <output>\n"); | ||
| 94 | perror("open: "); | ||
| 95 | exit(1); | ||
| 96 | } | ||
| 97 | |||
| 98 | fin = fopen(argv[1], "r"); | ||
| 99 | fout = fopen(argv[2], "w"); | ||
| 100 | if (!fin || !fout) { | ||
| 101 | printf("Usage: ./program <input> <output>\n"); | ||
| 102 | perror("open: "); | ||
| 103 | exit(1); | ||
| 104 | } | ||
| 105 | |||
| 106 | fstat(fileno(fin), &st); | ||
| 107 | max_size = st.st_size / 100; /* hack ... */ | ||
| 108 | |||
| 109 | list = malloc(max_size * sizeof(*list)); | ||
| 110 | |||
| 111 | for ( ; ; ) { | ||
| 112 | ret = read_block(buf, BUF_SIZE, fin); | ||
| 113 | if (ret < 0) | ||
| 114 | break; | ||
| 115 | |||
| 116 | add_list(buf, ret); | ||
| 117 | } | ||
| 118 | |||
| 119 | printf("loaded %d\n", list_size); | ||
| 120 | |||
| 121 | printf("sorting ....\n"); | ||
| 122 | |||
| 123 | qsort(list, list_size, sizeof(list[0]), compare_txt); | ||
| 124 | |||
| 125 | list2 = malloc(sizeof(*list) * list_size); | ||
| 126 | |||
| 127 | printf("culling\n"); | ||
| 128 | |||
| 129 | for (i = count = 0; i < list_size; i++) { | ||
| 130 | if (count == 0 || | ||
| 131 | strcmp(list2[count-1].txt, list[i].txt) != 0) { | ||
| 132 | list2[count++] = list[i]; | ||
| 133 | } else { | ||
| 134 | list2[count-1].num += list[i].num; | ||
| 135 | } | ||
| 136 | } | ||
| 137 | |||
| 138 | qsort(list2, count, sizeof(list[0]), compare_num); | ||
| 139 | |||
| 140 | for (i = 0; i < count; i++) | ||
| 141 | fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt); | ||
| 142 | |||
| 143 | return 0; | ||
| 144 | } | ||
diff --git a/usr/Kconfig b/usr/Kconfig index 2d4c77eecf2e..572dcf7b6a44 100644 --- a/usr/Kconfig +++ b/usr/Kconfig | |||
| @@ -46,17 +46,17 @@ config INITRAMFS_ROOT_GID | |||
| 46 | If you are not sure, leave it set to "0". | 46 | If you are not sure, leave it set to "0". |
| 47 | 47 | ||
| 48 | config RD_GZIP | 48 | config RD_GZIP |
| 49 | bool "Support initial ramdisks compressed using gzip" if EXPERT | 49 | bool "Support initial ramdisks compressed using gzip" |
| 50 | default y | ||
| 51 | depends on BLK_DEV_INITRD | 50 | depends on BLK_DEV_INITRD |
| 51 | default y | ||
| 52 | select DECOMPRESS_GZIP | 52 | select DECOMPRESS_GZIP |
| 53 | help | 53 | help |
| 54 | Support loading of a gzip encoded initial ramdisk or cpio buffer. | 54 | Support loading of a gzip encoded initial ramdisk or cpio buffer. |
| 55 | If unsure, say Y. | 55 | If unsure, say Y. |
| 56 | 56 | ||
| 57 | config RD_BZIP2 | 57 | config RD_BZIP2 |
| 58 | bool "Support initial ramdisks compressed using bzip2" if EXPERT | 58 | bool "Support initial ramdisks compressed using bzip2" |
| 59 | default !EXPERT | 59 | default y |
| 60 | depends on BLK_DEV_INITRD | 60 | depends on BLK_DEV_INITRD |
| 61 | select DECOMPRESS_BZIP2 | 61 | select DECOMPRESS_BZIP2 |
| 62 | help | 62 | help |
| @@ -64,8 +64,8 @@ config RD_BZIP2 | |||
| 64 | If unsure, say N. | 64 | If unsure, say N. |
| 65 | 65 | ||
| 66 | config RD_LZMA | 66 | config RD_LZMA |
| 67 | bool "Support initial ramdisks compressed using LZMA" if EXPERT | 67 | bool "Support initial ramdisks compressed using LZMA" |
| 68 | default !EXPERT | 68 | default y |
| 69 | depends on BLK_DEV_INITRD | 69 | depends on BLK_DEV_INITRD |
| 70 | select DECOMPRESS_LZMA | 70 | select DECOMPRESS_LZMA |
| 71 | help | 71 | help |
| @@ -73,17 +73,17 @@ config RD_LZMA | |||
| 73 | If unsure, say N. | 73 | If unsure, say N. |
| 74 | 74 | ||
| 75 | config RD_XZ | 75 | config RD_XZ |
| 76 | bool "Support initial ramdisks compressed using XZ" if EXPERT | 76 | bool "Support initial ramdisks compressed using XZ" |
| 77 | default !EXPERT | ||
| 78 | depends on BLK_DEV_INITRD | 77 | depends on BLK_DEV_INITRD |
| 78 | default y | ||
| 79 | select DECOMPRESS_XZ | 79 | select DECOMPRESS_XZ |
| 80 | help | 80 | help |
| 81 | Support loading of a XZ encoded initial ramdisk or cpio buffer. | 81 | Support loading of a XZ encoded initial ramdisk or cpio buffer. |
| 82 | If unsure, say N. | 82 | If unsure, say N. |
| 83 | 83 | ||
| 84 | config RD_LZO | 84 | config RD_LZO |
| 85 | bool "Support initial ramdisks compressed using LZO" if EXPERT | 85 | bool "Support initial ramdisks compressed using LZO" |
| 86 | default !EXPERT | 86 | default y |
| 87 | depends on BLK_DEV_INITRD | 87 | depends on BLK_DEV_INITRD |
| 88 | select DECOMPRESS_LZO | 88 | select DECOMPRESS_LZO |
| 89 | help | 89 | help |
| @@ -91,8 +91,8 @@ config RD_LZO | |||
| 91 | If unsure, say N. | 91 | If unsure, say N. |
| 92 | 92 | ||
| 93 | config RD_LZ4 | 93 | config RD_LZ4 |
| 94 | bool "Support initial ramdisks compressed using LZ4" if EXPERT | 94 | bool "Support initial ramdisks compressed using LZ4" |
| 95 | default !EXPERT | 95 | default y |
| 96 | depends on BLK_DEV_INITRD | 96 | depends on BLK_DEV_INITRD |
| 97 | select DECOMPRESS_LZ4 | 97 | select DECOMPRESS_LZ4 |
| 98 | help | 98 | help |
