author     Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 16:00:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 16:00:36 -0500
commit     78a45c6f067824cf5d0a9fedea7339ac2e28603c (patch)
tree       b4f78c8b6b9059ddace0a18c11629b8d2045f793
parent     f96fe225677b3efb74346ebd56fafe3997b02afa (diff)
parent     29d293b6007b91a4463f05bc8d0b26e0e65c5816 (diff)
Merge branch 'akpm' (second patch-bomb from Andrew)
Merge second patchbomb from Andrew Morton:

 - the rest of MM
 - misc fs fixes
 - add execveat() syscall
 - new ratelimit feature for fault-injection
 - decompressor updates
 - ipc/ updates
 - fallocate feature creep
 - fsnotify cleanups
 - a few other misc things

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (99 commits)
  cgroups: Documentation: fix trivial typos and wrong paragraph numberings
  parisc: percpu: update comments referring to __get_cpu_var
  percpu: update local_ops.txt to reflect this_cpu operations
  percpu: remove __get_cpu_var and __raw_get_cpu_var macros
  fsnotify: remove destroy_list from fsnotify_mark
  fsnotify: unify inode and mount marks handling
  fallocate: create FAN_MODIFY and IN_MODIFY events
  mm/cma: make kmemleak ignore CMA regions
  slub: fix cpuset check in get_any_partial
  slab: fix cpuset check in fallback_alloc
  shmdt: use i_size_read() instead of ->i_size
  ipc/shm.c: fix overly aggressive shmdt() when calls span multiple segments
  ipc/msg: increase MSGMNI, remove scaling
  ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM
  ipc/sem.c: change memory barrier in sem_lock() to smp_rmb()
  lib/decompress.c: consistency of compress formats for kernel image
  decompress_bunzip2: off by one in get_next_block()
  usr/Kconfig: make initrd compression algorithm selection not expert
  fault-inject: add ratelimit option
  ratelimit: add initialization macro
  ...
-rw-r--r--  Documentation/cgroups/cpusets.txt | 6
-rw-r--r--  Documentation/cgroups/memory.txt | 8
-rw-r--r--  Documentation/kernel-parameters.txt | 19
-rw-r--r--  Documentation/local_ops.txt | 13
-rw-r--r--  Documentation/sysctl/kernel.txt | 10
-rw-r--r--  Documentation/vm/page_owner.txt | 81
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  arch/arm/Kconfig | 1
-rw-r--r--  arch/arm64/Kconfig | 1
-rw-r--r--  arch/microblaze/Kconfig | 1
-rw-r--r--  arch/parisc/lib/fixup.S | 4
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c | 2
-rw-r--r--  arch/powerpc/mm/pgtable_32.c | 2
-rw-r--r--  arch/s390/Kconfig | 1
-rw-r--r--  arch/s390/mm/pageattr.c | 2
-rw-r--r--  arch/sh/Kconfig | 1
-rw-r--r--  arch/sparc/include/uapi/asm/unistd.h | 3
-rw-r--r--  arch/sparc/kernel/syscalls.S | 10
-rw-r--r--  arch/sparc/kernel/systbls_32.S | 1
-rw-r--r--  arch/sparc/kernel/systbls_64.S | 2
-rw-r--r--  arch/sparc/mm/init_64.c | 2
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/ia32/audit.c | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 1
-rw-r--r--  arch/x86/kernel/audit_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_64.S | 28
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/um/sys_call_table_64.c | 1
-rw-r--r--  drivers/base/memory.c | 4
-rw-r--r--  drivers/block/zram/zram_drv.c | 104
-rw-r--r--  drivers/block/zram/zram_drv.h | 4
-rw-r--r--  drivers/iommu/amd_iommu_v2.c | 84
-rw-r--r--  drivers/rtc/rtc-snvs.c | 11
-rw-r--r--  drivers/staging/android/ashmem.c | 3
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/amigaffs.c | 28
-rw-r--r--  fs/affs/file.c | 76
-rw-r--r--  fs/befs/linuxvfs.c | 4
-rw-r--r--  fs/binfmt_em86.c | 4
-rw-r--r--  fs/binfmt_misc.c | 4
-rw-r--r--  fs/binfmt_script.c | 10
-rw-r--r--  fs/drop_caches.c | 11
-rw-r--r--  fs/exec.c | 113
-rw-r--r--  fs/fat/fat.h | 1
-rw-r--r--  fs/fat/file.c | 3
-rw-r--r--  fs/fat/inode.c | 12
-rw-r--r--  fs/hugetlbfs/inode.c | 14
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/namei.c | 2
-rw-r--r--  fs/notify/dnotify/dnotify.c | 4
-rw-r--r--  fs/notify/fdinfo.c | 6
-rw-r--r--  fs/notify/fsnotify.c | 4
-rw-r--r--  fs/notify/fsnotify.h | 12
-rw-r--r--  fs/notify/inode_mark.c | 113
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 10
-rw-r--r--  fs/notify/mark.c | 97
-rw-r--r--  fs/notify/vfsmount_mark.c | 109
-rw-r--r--  fs/open.c | 11
-rw-r--r--  fs/seq_file.c | 6
-rw-r--r--  include/linux/binfmts.h | 4
-rw-r--r--  include/linux/bitmap.h | 36
-rw-r--r--  include/linux/compat.h | 3
-rw-r--r--  include/linux/fault-inject.h | 17
-rw-r--r--  include/linux/fs.h | 24
-rw-r--r--  include/linux/fsnotify_backend.h | 31
-rw-r--r--  include/linux/gfp.h | 7
-rw-r--r--  include/linux/ipc_namespace.h | 20
-rw-r--r--  include/linux/kmemleak.h | 2
-rw-r--r--  include/linux/memcontrol.h | 16
-rw-r--r--  include/linux/mm.h | 42
-rw-r--r--  include/linux/mm_types.h | 12
-rw-r--r--  include/linux/mmu_notifier.h | 2
-rw-r--r--  include/linux/mmzone.h | 12
-rw-r--r--  include/linux/oom.h | 11
-rw-r--r--  include/linux/page-debug-flags.h | 32
-rw-r--r--  include/linux/page_ext.h | 84
-rw-r--r--  include/linux/page_owner.h | 38
-rw-r--r--  include/linux/percpu-defs.h | 2
-rw-r--r--  include/linux/ratelimit.h | 12
-rw-r--r--  include/linux/sched.h | 11
-rw-r--r--  include/linux/shrinker.h | 2
-rw-r--r--  include/linux/slab.h | 2
-rw-r--r--  include/linux/stacktrace.h | 5
-rw-r--r--  include/linux/swap.h | 8
-rw-r--r--  include/linux/syscalls.h | 5
-rw-r--r--  include/linux/vm_event_item.h | 1
-rw-r--r--  include/uapi/asm-generic/unistd.h | 4
-rw-r--r--  include/uapi/linux/msg.h | 28
-rw-r--r--  include/uapi/linux/sem.h | 18
-rw-r--r--  init/main.c | 7
-rw-r--r--  ipc/Makefile | 2
-rw-r--r--  ipc/ipc_sysctl.c | 93
-rw-r--r--  ipc/ipcns_notifier.c | 92
-rw-r--r--  ipc/msg.c | 36
-rw-r--r--  ipc/namespace.c | 22
-rw-r--r--  ipc/sem.c | 13
-rw-r--r--  ipc/shm.c | 21
-rw-r--r--  ipc/util.c | 40
-rw-r--r--  kernel/audit_tree.c | 16
-rw-r--r--  kernel/events/uprobes.c | 6
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/stacktrace.c | 32
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  lib/Kconfig.debug | 16
-rw-r--r--  lib/audit.c | 3
-rw-r--r--  lib/bitmap.c | 24
-rw-r--r--  lib/decompress.c | 4
-rw-r--r--  lib/decompress_bunzip2.c | 2
-rw-r--r--  lib/fault-inject.c | 21
-rw-r--r--  mm/Kconfig.debug | 10
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/cma.c | 25
-rw-r--r--  mm/debug-pagealloc.c | 45
-rw-r--r--  mm/fadvise.c | 6
-rw-r--r--  mm/filemap.c | 10
-rw-r--r--  mm/filemap_xip.c | 23
-rw-r--r--  mm/fremap.c | 4
-rw-r--r--  mm/hugetlb.c | 26
-rw-r--r--  mm/memblock.c | 43
-rw-r--r--  mm/memcontrol.c | 180
-rw-r--r--  mm/memory-failure.c | 15
-rw-r--r--  mm/memory.c | 9
-rw-r--r--  mm/migrate.c | 28
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mmap.c | 24
-rw-r--r--  mm/mremap.c | 6
-rw-r--r--  mm/nommu.c | 50
-rw-r--r--  mm/oom_kill.c | 15
-rw-r--r--  mm/page_alloc.c | 137
-rw-r--r--  mm/page_ext.c | 403
-rw-r--r--  mm/page_owner.c | 311
-rw-r--r--  mm/rmap.c | 18
-rw-r--r--  mm/slab.c | 4
-rw-r--r--  mm/slub.c | 17
-rw-r--r--  mm/vmacache.c | 2
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 216
-rw-r--r--  mm/vmstat.c | 102
-rw-r--r--  mm/zbud.c | 2
-rw-r--r--  mm/zsmalloc.c | 180
-rw-r--r--  mm/zswap.c | 9
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/exec/.gitignore | 9
-rw-r--r--  tools/testing/selftests/exec/Makefile | 25
-rw-r--r--  tools/testing/selftests/exec/execveat.c | 397
-rw-r--r--  tools/vm/Makefile | 4
-rw-r--r--  tools/vm/page_owner_sort.c | 144
-rw-r--r--  usr/Kconfig | 24
154 files changed, 3209 insertions, 1364 deletions
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 3c94ff3f9693..f2235a162529 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -445,7 +445,7 @@ across partially overlapping sets of CPUs would risk unstable dynamics
445that would be beyond our understanding. So if each of two partially 445that would be beyond our understanding. So if each of two partially
446overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we 446overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
447form a single sched domain that is a superset of both. We won't move 447form a single sched domain that is a superset of both. We won't move
448a task to a CPU outside it cpuset, but the scheduler load balancing 448a task to a CPU outside its cpuset, but the scheduler load balancing
449code might waste some compute cycles considering that possibility. 449code might waste some compute cycles considering that possibility.
450 450
451This mismatch is why there is not a simple one-to-one relation 451This mismatch is why there is not a simple one-to-one relation
@@ -552,8 +552,8 @@ otherwise initial value -1 that indicates the cpuset has no request.
552 1 : search siblings (hyperthreads in a core). 552 1 : search siblings (hyperthreads in a core).
553 2 : search cores in a package. 553 2 : search cores in a package.
554 3 : search cpus in a node [= system wide on non-NUMA system] 554 3 : search cpus in a node [= system wide on non-NUMA system]
555 ( 4 : search nodes in a chunk of node [on NUMA system] ) 555 4 : search nodes in a chunk of node [on NUMA system]
556 ( 5 : search system wide [on NUMA system] ) 556 5 : search system wide [on NUMA system]
557 557
558The system default is architecture dependent. The system default 558The system default is architecture dependent. The system default
559can be changed using the relax_domain_level= boot parameter. 559can be changed using the relax_domain_level= boot parameter.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 46b2b5080317..a22df3ad35ff 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -326,7 +326,7 @@ per cgroup, instead of globally.
326 326
327* tcp memory pressure: sockets memory pressure for the tcp protocol. 327* tcp memory pressure: sockets memory pressure for the tcp protocol.
328 328
3292.7.3 Common use cases 3292.7.2 Common use cases
330 330
331Because the "kmem" counter is fed to the main user counter, kernel memory can 331Because the "kmem" counter is fed to the main user counter, kernel memory can
332never be limited completely independently of user memory. Say "U" is the user 332never be limited completely independently of user memory. Say "U" is the user
@@ -354,19 +354,19 @@ set:
354 354
3553. User Interface 3553. User Interface
356 356
3570. Configuration 3573.0. Configuration
358 358
359a. Enable CONFIG_CGROUPS 359a. Enable CONFIG_CGROUPS
360b. Enable CONFIG_MEMCG 360b. Enable CONFIG_MEMCG
361c. Enable CONFIG_MEMCG_SWAP (to use swap extension) 361c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
362d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 362d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
363 363
3641. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 3643.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
365# mount -t tmpfs none /sys/fs/cgroup 365# mount -t tmpfs none /sys/fs/cgroup
366# mkdir /sys/fs/cgroup/memory 366# mkdir /sys/fs/cgroup/memory
367# mount -t cgroup none /sys/fs/cgroup/memory -o memory 367# mount -t cgroup none /sys/fs/cgroup/memory -o memory
368 368
3692. Make the new group and move bash into it 3693.2. Make the new group and move bash into it
370# mkdir /sys/fs/cgroup/memory/0 370# mkdir /sys/fs/cgroup/memory/0
371# echo $$ > /sys/fs/cgroup/memory/0/tasks 371# echo $$ > /sys/fs/cgroup/memory/0/tasks
372 372
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 43ecdcd39df2..4a337daf0c09 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -829,6 +829,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
829 CONFIG_DEBUG_PAGEALLOC, hence this option will not help 829 CONFIG_DEBUG_PAGEALLOC, hence this option will not help
830 tracking down these problems. 830 tracking down these problems.
831 831
832 debug_pagealloc=
833 [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
834 parameter enables the feature at boot time. In
835 default, it is disabled. We can avoid allocating huge
836 chunk of memory for debug pagealloc if we don't enable
837 it at boot time and the system will work mostly same
838 with the kernel built without CONFIG_DEBUG_PAGEALLOC.
839 on: enable the feature
840
832 debugpat [X86] Enable PAT debugging 841 debugpat [X86] Enable PAT debugging
833 842
834 decnet.addr= [HW,NET] 843 decnet.addr= [HW,NET]
@@ -1228,9 +1237,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1228 multiple times interleaved with hugepages= to reserve 1237 multiple times interleaved with hugepages= to reserve
1229 huge pages of different sizes. Valid pages sizes on 1238 huge pages of different sizes. Valid pages sizes on
1230 x86-64 are 2M (when the CPU supports "pse") and 1G 1239 x86-64 are 2M (when the CPU supports "pse") and 1G
1231 (when the CPU supports the "pdpe1gb" cpuinfo flag) 1240 (when the CPU supports the "pdpe1gb" cpuinfo flag).
1232 Note that 1GB pages can only be allocated at boot time
1233 using hugepages= and not freed afterwards.
1234 1241
1235 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) 1242 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
1236 terminal devices. Valid values: 0..8 1243 terminal devices. Valid values: 0..8
@@ -2506,6 +2513,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2506 OSS [HW,OSS] 2513 OSS [HW,OSS]
2507 See Documentation/sound/oss/oss-parameters.txt 2514 See Documentation/sound/oss/oss-parameters.txt
2508 2515
2516 page_owner= [KNL] Boot-time page_owner enabling option.
2517 Storage of the information about who allocated
2518 each page is disabled in default. With this switch,
2519 we can turn it on.
2520 on: enable the feature
2521
2509 panic= [KNL] Kernel behaviour on panic: delay <timeout> 2522 panic= [KNL] Kernel behaviour on panic: delay <timeout>
2510 timeout > 0: seconds before rebooting 2523 timeout > 0: seconds before rebooting
2511 timeout = 0: wait forever 2524 timeout = 0: wait forever
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
index 300da4bdfdbd..407576a23317 100644
--- a/Documentation/local_ops.txt
+++ b/Documentation/local_ops.txt
@@ -8,6 +8,11 @@ to implement them for any given architecture and shows how they can be used
8properly. It also stresses on the precautions that must be taken when reading 8properly. It also stresses on the precautions that must be taken when reading
9those local variables across CPUs when the order of memory writes matters. 9those local variables across CPUs when the order of memory writes matters.
10 10
11Note that local_t based operations are not recommended for general kernel use.
12Please use the this_cpu operations instead unless there is really a special purpose.
13Most uses of local_t in the kernel have been replaced by this_cpu operations.
14this_cpu operations combine the relocation with the local_t like semantics in
15a single instruction and yield more compact and faster executing code.
11 16
12 17
13* Purpose of local atomic operations 18* Purpose of local atomic operations
@@ -87,10 +92,10 @@ the per cpu variable. For instance :
87 local_inc(&get_cpu_var(counters)); 92 local_inc(&get_cpu_var(counters));
88 put_cpu_var(counters); 93 put_cpu_var(counters);
89 94
90If you are already in a preemption-safe context, you can directly use 95If you are already in a preemption-safe context, you can use
91__get_cpu_var() instead. 96this_cpu_ptr() instead.
92 97
93 local_inc(&__get_cpu_var(counters)); 98 local_inc(this_cpu_ptr(&counters));
94 99
95 100
96 101
@@ -134,7 +139,7 @@ static void test_each(void *info)
134{ 139{
135 /* Increment the counter from a non preemptible context */ 140 /* Increment the counter from a non preemptible context */
136 printk("Increment on cpu %d\n", smp_processor_id()); 141 printk("Increment on cpu %d\n", smp_processor_id());
137 local_inc(&__get_cpu_var(counters)); 142 local_inc(this_cpu_ptr(&counters));
138 143
139 /* This is what incrementing the variable would look like within a 144 /* This is what incrementing the variable would look like within a
140 * preemptible context (it disables preemption) : 145 * preemptible context (it disables preemption) :
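
For readers unfamiliar with the this_cpu interface that local_ops.txt now points to, a minimal sketch of a per-CPU counter follows. This is illustrative module-style code, not part of the patch; the demo_* names are made up.

  #include <linux/cpumask.h>
  #include <linux/percpu.h>

  /* Hypothetical per-CPU event counter, shown only to illustrate the
   * this_cpu_* interface recommended above over local_t. */
  static DEFINE_PER_CPU(long, demo_events);

  static void demo_count_event(void)
  {
          /* Safe without an explicit preempt_disable()/enable() pair;
           * on x86 this compiles down to a single instruction. */
          this_cpu_inc(demo_events);
  }

  static long demo_total_events(void)
  {
          long sum = 0;
          int cpu;

          for_each_online_cpu(cpu)
                  sum += per_cpu(demo_events, cpu);
          return sum;
  }
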
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index b5d0c8501a18..75511efefc64 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -116,10 +116,12 @@ set during run time.
116 116
117auto_msgmni: 117auto_msgmni:
118 118
119Enables/Disables automatic recomputing of msgmni upon memory add/remove 119This variable has no effect and may be removed in future kernel
120or upon ipc namespace creation/removal (see the msgmni description 120releases. Reading it always returns 0.
121above). Echoing "1" into this file enables msgmni automatic recomputing. 121Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni
122Echoing "0" turns it off. auto_msgmni default value is 1. 122upon memory add/remove or upon ipc namespace creation/removal.
123Echoing "1" into this file enabled msgmni automatic recomputing.
124Echoing "0" turned it off. auto_msgmni default value was 1.
123 125
124 126
125============================================================== 127==============================================================
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt
new file mode 100644
index 000000000000..8f3ce9b3aa11
--- /dev/null
+++ b/Documentation/vm/page_owner.txt
@@ -0,0 +1,81 @@
1page owner: Tracking about who allocated each page
2-----------------------------------------------------------
3
4* Introduction
5
6page owner is for the tracking about who allocated each page.
7It can be used to debug memory leak or to find a memory hogger.
8When allocation happens, information about allocation such as call stack
9and order of pages is stored into certain storage for each page.
10When we need to know about status of all pages, we can get and analyze
11this information.
12
13Although we already have tracepoint for tracing page allocation/free,
14using it for analyzing who allocate each page is rather complex. We need
15to enlarge the trace buffer for preventing overlapping until userspace
16program launched. And, launched program continually dump out the trace
17buffer for later analysis and it would change system behviour with more
18possibility rather than just keeping it in memory, so bad for debugging.
19
20page owner can also be used for various purposes. For example, accurate
21fragmentation statistics can be obtained through gfp flag information of
22each page. It is already implemented and activated if page owner is
23enabled. Other usages are more than welcome.
24
25page owner is disabled in default. So, if you'd like to use it, you need
26to add "page_owner=on" into your boot cmdline. If the kernel is built
27with page owner and page owner is disabled in runtime due to no enabling
28boot option, runtime overhead is marginal. If disabled in runtime, it
29doesn't require memory to store owner information, so there is no runtime
30memory overhead. And, page owner inserts just two unlikely branches into
31the page allocator hotpath and if it returns false then allocation is
32done like as the kernel without page owner. These two unlikely branches
33would not affect to allocation performance. Following is the kernel's
34code size change due to this facility.
35
36- Without page owner
37 text data bss dec hex filename
38 40662 1493 644 42799 a72f mm/page_alloc.o
39
40- With page owner
41 text data bss dec hex filename
42 40892 1493 644 43029 a815 mm/page_alloc.o
43 1427 24 8 1459 5b3 mm/page_ext.o
44 2722 50 0 2772 ad4 mm/page_owner.o
45
46Although, roughly, 4 KB code is added in total, page_alloc.o increase by
47230 bytes and only half of it is in hotpath. Building the kernel with
48page owner and turning it on if needed would be great option to debug
49kernel memory problem.
50
51There is one notice that is caused by implementation detail. page owner
52stores information into the memory from struct page extension. This memory
53is initialized some time later than that page allocator starts in sparse
54memory system, so, until initialization, many pages can be allocated and
55they would have no owner information. To fix it up, these early allocated
56pages are investigated and marked as allocated in initialization phase.
57Although it doesn't mean that they have the right owner information,
58at least, we can tell whether the page is allocated or not,
59more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
60are catched and marked, although they are mostly allocated from struct
61page extension feature. Anyway, after that, no page is left in
62un-tracking state.
63
64* Usage
65
661) Build user-space helper
67 cd tools/vm
68 make page_owner_sort
69
702) Enable page owner
71 Add "page_owner=on" to boot cmdline.
72
733) Do the job what you want to debug
74
754) Analyze information from page owner
76 cat /sys/kernel/debug/page_owner > page_owner_full.txt
77 grep -v ^PFN page_owner_full.txt > page_owner.txt
78 ./page_owner_sort page_owner.txt sorted_page_owner.txt
79
80 See the result about who allocated each page
81 in the sorted_page_owner.txt.
diff --git a/MAINTAINERS b/MAINTAINERS
index 326dc2d1652d..1f0ef48830f9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4045,7 +4045,7 @@ F: drivers/tty/serial/ucc_uart.c
4045FREESCALE SOC SOUND DRIVERS 4045FREESCALE SOC SOUND DRIVERS
4046M: Timur Tabi <timur@tabi.org> 4046M: Timur Tabi <timur@tabi.org>
4047M: Nicolin Chen <nicoleotsuka@gmail.com> 4047M: Nicolin Chen <nicoleotsuka@gmail.com>
4048M: Xiubo Li <Li.Xiubo@freescale.com> 4048M: Xiubo Li <Xiubo.Lee@gmail.com>
4049L: alsa-devel@alsa-project.org (moderated for non-subscribers) 4049L: alsa-devel@alsa-project.org (moderated for non-subscribers)
4050L: linuxppc-dev@lists.ozlabs.org 4050L: linuxppc-dev@lists.ozlabs.org
4051S: Maintained 4051S: Maintained
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0bee1fe209b1..97d07ed60a0b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -5,6 +5,7 @@ config ARM
5 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 5 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
7 select ARCH_HAVE_CUSTOM_GPIO_H 7 select ARCH_HAVE_CUSTOM_GPIO_H
8 select ARCH_HAS_GCOV_PROFILE_ALL
8 select ARCH_MIGHT_HAVE_PC_PARPORT 9 select ARCH_MIGHT_HAVE_PC_PARPORT
9 select ARCH_SUPPORTS_ATOMIC_RMW 10 select ARCH_SUPPORTS_ATOMIC_RMW
10 select ARCH_USE_BUILTIN_BSWAP 11 select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6b1ebd964c10..688db03ef5b8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
2 def_bool y 2 def_bool y
3 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 3 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
4 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 4 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
5 select ARCH_HAS_GCOV_PROFILE_ALL
5 select ARCH_HAS_SG_CHAIN 6 select ARCH_HAS_SG_CHAIN
6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 7 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
7 select ARCH_USE_CMPXCHG_LOCKREF 8 select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index a7736fa0580c..0bce820428fc 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -1,5 +1,6 @@
1config MICROBLAZE 1config MICROBLAZE
2 def_bool y 2 def_bool y
3 select ARCH_HAS_GCOV_PROFILE_ALL
3 select ARCH_MIGHT_HAVE_PC_PARPORT 4 select ARCH_MIGHT_HAVE_PC_PARPORT
4 select ARCH_WANT_IPC_PARSE_VERSION 5 select ARCH_WANT_IPC_PARSE_VERSION
5 select ARCH_WANT_OPTIONAL_GPIOLIB 6 select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S
index f8c45cc2947d..536ef66bb94b 100644
--- a/arch/parisc/lib/fixup.S
+++ b/arch/parisc/lib/fixup.S
@@ -38,14 +38,14 @@
38 LDREGX \t2(\t1),\t2 38 LDREGX \t2(\t1),\t2
39 addil LT%exception_data,%r27 39 addil LT%exception_data,%r27
40 LDREG RT%exception_data(%r1),\t1 40 LDREG RT%exception_data(%r1),\t1
41 /* t1 = &__get_cpu_var(exception_data) */ 41 /* t1 = this_cpu_ptr(&exception_data) */
42 add,l \t1,\t2,\t1 42 add,l \t1,\t2,\t1
43 /* t1 = t1->fault_ip */ 43 /* t1 = t1->fault_ip */
44 LDREG EXCDATA_IP(\t1), \t1 44 LDREG EXCDATA_IP(\t1), \t1
45 .endm 45 .endm
46#else 46#else
47 .macro get_fault_ip t1 t2 47 .macro get_fault_ip t1 t2
48 /* t1 = &__get_cpu_var(exception_data) */ 48 /* t1 = this_cpu_ptr(&exception_data) */
49 addil LT%exception_data,%r27 49 addil LT%exception_data,%r27
50 LDREG RT%exception_data(%r1),\t2 50 LDREG RT%exception_data(%r1),\t2
51 /* t1 = t2->fault_ip */ 51 /* t1 = t2->fault_ip */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index af696874248b..a2a168e2dfe7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,7 @@ config PPC
129 select HAVE_BPF_JIT if PPC64 129 select HAVE_BPF_JIT if PPC64
130 select HAVE_ARCH_JUMP_LABEL 130 select HAVE_ARCH_JUMP_LABEL
131 select ARCH_HAVE_NMI_SAFE_CMPXCHG 131 select ARCH_HAVE_NMI_SAFE_CMPXCHG
132 select ARCH_HAS_GCOV_PROFILE_ALL
132 select GENERIC_SMP_IDLE_THREAD 133 select GENERIC_SMP_IDLE_THREAD
133 select GENERIC_CMOS_UPDATE 134 select GENERIC_CMOS_UPDATE
134 select GENERIC_TIME_VSYSCALL_OLD 135 select GENERIC_TIME_VSYSCALL_OLD
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e56a307bc676..2c2022d16059 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1514,7 +1514,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
1514 mmu_kernel_ssize, 0); 1514 mmu_kernel_ssize, 0);
1515} 1515}
1516 1516
1517void kernel_map_pages(struct page *page, int numpages, int enable) 1517void __kernel_map_pages(struct page *page, int numpages, int enable)
1518{ 1518{
1519 unsigned long flags, vaddr, lmi; 1519 unsigned long flags, vaddr, lmi;
1520 int i; 1520 int i;
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index d545b1231594..50fad3801f30 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -429,7 +429,7 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
429} 429}
430 430
431 431
432void kernel_map_pages(struct page *page, int numpages, int enable) 432void __kernel_map_pages(struct page *page, int numpages, int enable)
433{ 433{
434 if (PageHighMem(page)) 434 if (PageHighMem(page))
435 return; 435 return;
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f2cf1f90295b..68b68d755fdf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -65,6 +65,7 @@ config S390
65 def_bool y 65 def_bool y
66 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 66 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
67 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS 67 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
68 select ARCH_HAS_GCOV_PROFILE_ALL
68 select ARCH_HAVE_NMI_SAFE_CMPXCHG 69 select ARCH_HAVE_NMI_SAFE_CMPXCHG
69 select ARCH_INLINE_READ_LOCK 70 select ARCH_INLINE_READ_LOCK
70 select ARCH_INLINE_READ_LOCK_BH 71 select ARCH_INLINE_READ_LOCK_BH
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 3fef3b299665..426c9d462d1c 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -120,7 +120,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr)
120 } 120 }
121} 121}
122 122
123void kernel_map_pages(struct page *page, int numpages, int enable) 123void __kernel_map_pages(struct page *page, int numpages, int enable)
124{ 124{
125 unsigned long address; 125 unsigned long address;
126 int nr, i, j; 126 int nr, i, j;
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a1403470f80e..c6b6ee5f38b2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
16 select HAVE_DEBUG_BUGVERBOSE 16 select HAVE_DEBUG_BUGVERBOSE
17 select ARCH_HAVE_CUSTOM_GPIO_H 17 select ARCH_HAVE_CUSTOM_GPIO_H
18 select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) 18 select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
19 select ARCH_HAS_GCOV_PROFILE_ALL
19 select PERF_USE_VMALLOC 20 select PERF_USE_VMALLOC
20 select HAVE_DEBUG_KMEMLEAK 21 select HAVE_DEBUG_KMEMLEAK
21 select HAVE_KERNEL_GZIP 22 select HAVE_KERNEL_GZIP
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 46d83842eddc..6f35f4df17f2 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -415,8 +415,9 @@
415#define __NR_getrandom 347 415#define __NR_getrandom 347
416#define __NR_memfd_create 348 416#define __NR_memfd_create 348
417#define __NR_bpf 349 417#define __NR_bpf 349
418#define __NR_execveat 350
418 419
419#define NR_syscalls 350 420#define NR_syscalls 351
420 421
421/* Bitmask values returned from kern_features system call. */ 422/* Bitmask values returned from kern_features system call. */
422#define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 423#define KERN_FEATURE_MIXED_MODE_STACK 0x00000001
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S
index 33a17e7b3ccd..bb0008927598 100644
--- a/arch/sparc/kernel/syscalls.S
+++ b/arch/sparc/kernel/syscalls.S
@@ -6,6 +6,11 @@ sys64_execve:
6 jmpl %g1, %g0 6 jmpl %g1, %g0
7 flushw 7 flushw
8 8
9sys64_execveat:
10 set sys_execveat, %g1
11 jmpl %g1, %g0
12 flushw
13
9#ifdef CONFIG_COMPAT 14#ifdef CONFIG_COMPAT
10sunos_execv: 15sunos_execv:
11 mov %g0, %o2 16 mov %g0, %o2
@@ -13,6 +18,11 @@ sys32_execve:
13 set compat_sys_execve, %g1 18 set compat_sys_execve, %g1
14 jmpl %g1, %g0 19 jmpl %g1, %g0
15 flushw 20 flushw
21
22sys32_execveat:
23 set compat_sys_execveat, %g1
24 jmpl %g1, %g0
25 flushw
16#endif 26#endif
17 27
18 .align 32 28 .align 32
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index ad0cdf497b78..e31a9056a303 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -87,3 +87,4 @@ sys_call_table:
87/*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev 87/*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
88/*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr 88/*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
89/*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf 89/*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
90/*350*/ .long sys_execveat
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 580cde9370c9..d72f76ae70eb 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -88,6 +88,7 @@ sys_call_table32:
88 .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev 88 .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
89/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr 89/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
90 .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf 90 .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
91/*350*/ .word sys32_execveat
91 92
92#endif /* CONFIG_COMPAT */ 93#endif /* CONFIG_COMPAT */
93 94
@@ -167,3 +168,4 @@ sys_call_table:
167 .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev 168 .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
168/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr 169/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
169 .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf 170 .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
171/*350*/ .word sys64_execveat
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 2d91c62f7f5f..3ea267c53320 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1621,7 +1621,7 @@ static void __init kernel_physical_mapping_init(void)
1621} 1621}
1622 1622
1623#ifdef CONFIG_DEBUG_PAGEALLOC 1623#ifdef CONFIG_DEBUG_PAGEALLOC
1624void kernel_map_pages(struct page *page, int numpages, int enable) 1624void __kernel_map_pages(struct page *page, int numpages, int enable)
1625{ 1625{
1626 unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; 1626 unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
1627 unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); 1627 unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bea3a0159496..d69f1cd87fd9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
24 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI 24 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
25 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS 25 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
26 select ARCH_HAS_FAST_MULTIPLIER 26 select ARCH_HAS_FAST_MULTIPLIER
27 select ARCH_HAS_GCOV_PROFILE_ALL
27 select ARCH_MIGHT_HAVE_PC_PARPORT 28 select ARCH_MIGHT_HAVE_PC_PARPORT
28 select ARCH_MIGHT_HAVE_PC_SERIO 29 select ARCH_MIGHT_HAVE_PC_SERIO
29 select HAVE_AOUT if X86_32 30 select HAVE_AOUT if X86_32
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
35 case __NR_socketcall: 35 case __NR_socketcall:
36 return 4; 36 return 4;
37 case __NR_execve: 37 case __NR_execve:
38 case __NR_execveat:
38 return 5; 39 return 5;
39 default: 40 default:
40 return 1; 41 return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffe71228fc10..82e8a1d44658 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -480,6 +480,7 @@ GLOBAL(\label)
480 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn 480 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
481 PTREGSCALL stub32_sigreturn, sys32_sigreturn 481 PTREGSCALL stub32_sigreturn, sys32_sigreturn
482 PTREGSCALL stub32_execve, compat_sys_execve 482 PTREGSCALL stub32_execve, compat_sys_execve
483 PTREGSCALL stub32_execveat, compat_sys_execveat
483 PTREGSCALL stub32_fork, sys_fork 484 PTREGSCALL stub32_fork, sys_fork
484 PTREGSCALL stub32_vfork, sys_vfork 485 PTREGSCALL stub32_vfork, sys_vfork
485 486
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
50 case __NR_openat: 50 case __NR_openat:
51 return 3; 51 return 3;
52 case __NR_execve: 52 case __NR_execve:
53 case __NR_execveat:
53 return 5; 54 return 5;
54 default: 55 default:
55 return 0; 56 return 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c0226ab54106..90878aa38dbd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,6 +652,20 @@ ENTRY(stub_execve)
652 CFI_ENDPROC 652 CFI_ENDPROC
653END(stub_execve) 653END(stub_execve)
654 654
655ENTRY(stub_execveat)
656 CFI_STARTPROC
657 addq $8, %rsp
658 PARTIAL_FRAME 0
659 SAVE_REST
660 FIXUP_TOP_OF_STACK %r11
661 call sys_execveat
662 RESTORE_TOP_OF_STACK %r11
663 movq %rax,RAX(%rsp)
664 RESTORE_REST
665 jmp int_ret_from_sys_call
666 CFI_ENDPROC
667END(stub_execveat)
668
655/* 669/*
656 * sigreturn is special because it needs to restore all registers on return. 670 * sigreturn is special because it needs to restore all registers on return.
657 * This cannot be done with SYSRET, so use the IRET return path instead. 671 * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve)
697 CFI_ENDPROC 711 CFI_ENDPROC
698END(stub_x32_execve) 712END(stub_x32_execve)
699 713
714ENTRY(stub_x32_execveat)
715 CFI_STARTPROC
716 addq $8, %rsp
717 PARTIAL_FRAME 0
718 SAVE_REST
719 FIXUP_TOP_OF_STACK %r11
720 call compat_sys_execveat
721 RESTORE_TOP_OF_STACK %r11
722 movq %rax,RAX(%rsp)
723 RESTORE_REST
724 jmp int_ret_from_sys_call
725 CFI_ENDPROC
726END(stub_x32_execveat)
727
700#endif 728#endif
701 729
702/* 730/*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3a5d46605d2..dfaf2e0f5f8f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1817,7 +1817,7 @@ static int __set_pages_np(struct page *page, int numpages)
1817 return __change_page_attr_set_clr(&cpa, 0); 1817 return __change_page_attr_set_clr(&cpa, 0);
1818} 1818}
1819 1819
1820void kernel_map_pages(struct page *page, int numpages, int enable) 1820void __kernel_map_pages(struct page *page, int numpages, int enable)
1821{ 1821{
1822 if (PageHighMem(page)) 1822 if (PageHighMem(page))
1823 return; 1823 return;
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 9fe1b5d002f0..b3560ece1c9f 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -364,3 +364,4 @@
364355 i386 getrandom sys_getrandom 364355 i386 getrandom sys_getrandom
365356 i386 memfd_create sys_memfd_create 365356 i386 memfd_create sys_memfd_create
366357 i386 bpf sys_bpf 366357 i386 bpf sys_bpf
367358 i386 execveat sys_execveat stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 281150b539a2..8d656fbb57aa 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -328,6 +328,7 @@
328319 common memfd_create sys_memfd_create 328319 common memfd_create sys_memfd_create
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat
331 332
332# 333#
333# x32-specific system call numbers start at 512 to avoid cache impact 334# x32-specific system call numbers start at 512 to avoid cache impact
@@ -366,3 +367,4 @@
366542 x32 getsockopt compat_sys_getsockopt 367542 x32 getsockopt compat_sys_getsockopt
367543 x32 io_setup compat_sys_io_setup 368543 x32 io_setup compat_sys_io_setup
368544 x32 io_submit compat_sys_io_submit 369544 x32 io_submit compat_sys_io_submit
370545 x32 execveat stub_x32_execveat
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
31#define stub_fork sys_fork 31#define stub_fork sys_fork
32#define stub_vfork sys_vfork 32#define stub_vfork sys_vfork
33#define stub_execve sys_execve 33#define stub_execve sys_execve
34#define stub_execveat sys_execveat
34#define stub_rt_sigreturn sys_rt_sigreturn 35#define stub_rt_sigreturn sys_rt_sigreturn
35 36
36#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 37#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
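
With the tables above wired up, userspace can reach the new syscall directly. The selftest added in tools/testing/selftests/exec/execveat.c is the authoritative exerciser; a rough sketch of a raw invocation might look like the following (it assumes headers from a kernel carrying this series, since glibc provided no execveat() wrapper at the time).

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Illustrative only: execute /bin/echo through an O_PATH descriptor. */
  int main(void)
  {
          char *argv[] = { "echo", "hello from execveat", NULL };
          char *envp[] = { NULL };
          int fd = open("/bin/echo", O_PATH | O_CLOEXEC);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* AT_EMPTY_PATH: execute the file the descriptor refers to. */
          syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
          perror("execveat");        /* reached only on failure */
          return 1;
  }
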
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7c5d87191b28..85be040a21c8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -228,8 +228,8 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
228 struct page *first_page; 228 struct page *first_page;
229 int ret; 229 int ret;
230 230
231 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 231 start_pfn = phys_index << PFN_SECTION_SHIFT;
232 start_pfn = page_to_pfn(first_page); 232 first_page = pfn_to_page(start_pfn);
233 233
234 switch (action) { 234 switch (action) {
235 case MEM_ONLINE: 235 case MEM_ONLINE:
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3920ee45aa59..bd8bda386e02 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -44,15 +44,14 @@ static const char *default_compressor = "lzo";
44static unsigned int num_devices = 1; 44static unsigned int num_devices = 1;
45 45
46#define ZRAM_ATTR_RO(name) \ 46#define ZRAM_ATTR_RO(name) \
47static ssize_t zram_attr_##name##_show(struct device *d, \ 47static ssize_t name##_show(struct device *d, \
48 struct device_attribute *attr, char *b) \ 48 struct device_attribute *attr, char *b) \
49{ \ 49{ \
50 struct zram *zram = dev_to_zram(d); \ 50 struct zram *zram = dev_to_zram(d); \
51 return scnprintf(b, PAGE_SIZE, "%llu\n", \ 51 return scnprintf(b, PAGE_SIZE, "%llu\n", \
52 (u64)atomic64_read(&zram->stats.name)); \ 52 (u64)atomic64_read(&zram->stats.name)); \
53} \ 53} \
54static struct device_attribute dev_attr_##name = \ 54static DEVICE_ATTR_RO(name);
55 __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL);
56 55
57static inline int init_done(struct zram *zram) 56static inline int init_done(struct zram *zram)
58{ 57{
@@ -287,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec)
287/* 286/*
288 * Check if request is within bounds and aligned on zram logical blocks. 287 * Check if request is within bounds and aligned on zram logical blocks.
289 */ 288 */
290static inline int valid_io_request(struct zram *zram, struct bio *bio) 289static inline int valid_io_request(struct zram *zram,
290 sector_t start, unsigned int size)
291{ 291{
292 u64 start, end, bound; 292 u64 end, bound;
293 293
294 /* unaligned request */ 294 /* unaligned request */
295 if (unlikely(bio->bi_iter.bi_sector & 295 if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
296 (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
297 return 0; 296 return 0;
298 if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) 297 if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
299 return 0; 298 return 0;
300 299
301 start = bio->bi_iter.bi_sector; 300 end = start + (size >> SECTOR_SHIFT);
302 end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
303 bound = zram->disksize >> SECTOR_SHIFT; 301 bound = zram->disksize >> SECTOR_SHIFT;
304 /* out of range range */ 302 /* out of range range */
305 if (unlikely(start >= bound || end > bound || start > end)) 303 if (unlikely(start >= bound || end > bound || start > end))
@@ -453,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
453} 451}
454 452
455static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 453static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
456 u32 index, int offset, struct bio *bio) 454 u32 index, int offset)
457{ 455{
458 int ret; 456 int ret;
459 struct page *page; 457 struct page *page;
@@ -645,14 +643,13 @@ out:
645} 643}
646 644
647static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, 645static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
648 int offset, struct bio *bio) 646 int offset, int rw)
649{ 647{
650 int ret; 648 int ret;
651 int rw = bio_data_dir(bio);
652 649
653 if (rw == READ) { 650 if (rw == READ) {
654 atomic64_inc(&zram->stats.num_reads); 651 atomic64_inc(&zram->stats.num_reads);
655 ret = zram_bvec_read(zram, bvec, index, offset, bio); 652 ret = zram_bvec_read(zram, bvec, index, offset);
656 } else { 653 } else {
657 atomic64_inc(&zram->stats.num_writes); 654 atomic64_inc(&zram->stats.num_writes);
658 ret = zram_bvec_write(zram, bvec, index, offset); 655 ret = zram_bvec_write(zram, bvec, index, offset);
@@ -853,7 +850,7 @@ out:
853 850
854static void __zram_make_request(struct zram *zram, struct bio *bio) 851static void __zram_make_request(struct zram *zram, struct bio *bio)
855{ 852{
856 int offset; 853 int offset, rw;
857 u32 index; 854 u32 index;
858 struct bio_vec bvec; 855 struct bio_vec bvec;
859 struct bvec_iter iter; 856 struct bvec_iter iter;
@@ -868,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
868 return; 865 return;
869 } 866 }
870 867
868 rw = bio_data_dir(bio);
871 bio_for_each_segment(bvec, bio, iter) { 869 bio_for_each_segment(bvec, bio, iter) {
872 int max_transfer_size = PAGE_SIZE - offset; 870 int max_transfer_size = PAGE_SIZE - offset;
873 871
@@ -882,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
882 bv.bv_len = max_transfer_size; 880 bv.bv_len = max_transfer_size;
883 bv.bv_offset = bvec.bv_offset; 881 bv.bv_offset = bvec.bv_offset;
884 882
885 if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) 883 if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
886 goto out; 884 goto out;
887 885
888 bv.bv_len = bvec.bv_len - max_transfer_size; 886 bv.bv_len = bvec.bv_len - max_transfer_size;
889 bv.bv_offset += max_transfer_size; 887 bv.bv_offset += max_transfer_size;
890 if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) 888 if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
891 goto out; 889 goto out;
892 } else 890 } else
893 if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) 891 if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
894 goto out; 892 goto out;
895 893
896 update_position(&index, &offset, &bvec); 894 update_position(&index, &offset, &bvec);
@@ -915,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
915 if (unlikely(!init_done(zram))) 913 if (unlikely(!init_done(zram)))
916 goto error; 914 goto error;
917 915
918 if (!valid_io_request(zram, bio)) { 916 if (!valid_io_request(zram, bio->bi_iter.bi_sector,
917 bio->bi_iter.bi_size)) {
919 atomic64_inc(&zram->stats.invalid_io); 918 atomic64_inc(&zram->stats.invalid_io);
920 goto error; 919 goto error;
921 } 920 }
@@ -945,25 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev,
945 atomic64_inc(&zram->stats.notify_free); 944 atomic64_inc(&zram->stats.notify_free);
946} 945}
947 946
947static int zram_rw_page(struct block_device *bdev, sector_t sector,
948 struct page *page, int rw)
949{
950 int offset, err;
951 u32 index;
952 struct zram *zram;
953 struct bio_vec bv;
954
955 zram = bdev->bd_disk->private_data;
956 if (!valid_io_request(zram, sector, PAGE_SIZE)) {
957 atomic64_inc(&zram->stats.invalid_io);
958 return -EINVAL;
959 }
960
961 down_read(&zram->init_lock);
962 if (unlikely(!init_done(zram))) {
963 err = -EIO;
964 goto out_unlock;
965 }
966
967 index = sector >> SECTORS_PER_PAGE_SHIFT;
968 offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
969
970 bv.bv_page = page;
971 bv.bv_len = PAGE_SIZE;
972 bv.bv_offset = 0;
973
974 err = zram_bvec_rw(zram, &bv, index, offset, rw);
975out_unlock:
976 up_read(&zram->init_lock);
977 /*
978 * If I/O fails, just return error(ie, non-zero) without
979 * calling page_endio.
980 * It causes resubmit the I/O with bio request by upper functions
981 * of rw_page(e.g., swap_readpage, __swap_writepage) and
982 * bio->bi_end_io does things to handle the error
983 * (e.g., SetPageError, set_page_dirty and extra works).
984 */
985 if (err == 0)
986 page_endio(page, rw, 0);
987 return err;
988}
989
948static const struct block_device_operations zram_devops = { 990static const struct block_device_operations zram_devops = {
949 .swap_slot_free_notify = zram_slot_free_notify, 991 .swap_slot_free_notify = zram_slot_free_notify,
992 .rw_page = zram_rw_page,
950 .owner = THIS_MODULE 993 .owner = THIS_MODULE
951}; 994};
952 995
953static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, 996static DEVICE_ATTR_RW(disksize);
954 disksize_show, disksize_store); 997static DEVICE_ATTR_RO(initstate);
955static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); 998static DEVICE_ATTR_WO(reset);
956static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); 999static DEVICE_ATTR_RO(orig_data_size);
957static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); 1000static DEVICE_ATTR_RO(mem_used_total);
958static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); 1001static DEVICE_ATTR_RW(mem_limit);
959static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, 1002static DEVICE_ATTR_RW(mem_used_max);
960 mem_limit_store); 1003static DEVICE_ATTR_RW(max_comp_streams);
961static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, 1004static DEVICE_ATTR_RW(comp_algorithm);
962 mem_used_max_store);
963static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
964 max_comp_streams_show, max_comp_streams_store);
965static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
966 comp_algorithm_show, comp_algorithm_store);
967 1005
968ZRAM_ATTR_RO(num_reads); 1006ZRAM_ATTR_RO(num_reads);
969ZRAM_ATTR_RO(num_writes); 1007ZRAM_ATTR_RO(num_writes);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c6ee271317f5..b05a816b09ac 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
66/* Flags for zram pages (table[page_no].value) */ 66/* Flags for zram pages (table[page_no].value) */
67enum zram_pageflags { 67enum zram_pageflags {
68 /* Page consists entirely of zeros */ 68 /* Page consists entirely of zeros */
69 ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, 69 ZRAM_ZERO = ZRAM_FLAG_SHIFT,
70 ZRAM_ACCESS, /* page in now accessed */ 70 ZRAM_ACCESS, /* page is now accessed */
71 71
72 __NR_ZRAM_PAGEFLAGS, 72 __NR_ZRAM_PAGEFLAGS,
73}; 73};
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index a2d87a60c27f..bea878f8e7d3 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -509,45 +509,67 @@ static void finish_pri_tag(struct device_state *dev_state,
509 spin_unlock_irqrestore(&pasid_state->lock, flags); 509 spin_unlock_irqrestore(&pasid_state->lock, flags);
510} 510}
511 511
512static void handle_fault_error(struct fault *fault)
513{
514 int status;
515
516 if (!fault->dev_state->inv_ppr_cb) {
517 set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
518 return;
519 }
520
521 status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
522 fault->pasid,
523 fault->address,
524 fault->flags);
525 switch (status) {
526 case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
527 set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
528 break;
529 case AMD_IOMMU_INV_PRI_RSP_INVALID:
530 set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
531 break;
532 case AMD_IOMMU_INV_PRI_RSP_FAIL:
533 set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
534 break;
535 default:
536 BUG();
537 }
538}
539
512static void do_fault(struct work_struct *work) 540static void do_fault(struct work_struct *work)
513{ 541{
514 struct fault *fault = container_of(work, struct fault, work); 542 struct fault *fault = container_of(work, struct fault, work);
515 int npages, write; 543 struct mm_struct *mm;
516 struct page *page; 544 struct vm_area_struct *vma;
545 u64 address;
546 int ret, write;
517 547
518 write = !!(fault->flags & PPR_FAULT_WRITE); 548 write = !!(fault->flags & PPR_FAULT_WRITE);
519 549
520 down_read(&fault->state->mm->mmap_sem); 550 mm = fault->state->mm;
521 npages = get_user_pages(NULL, fault->state->mm, 551 address = fault->address;
522 fault->address, 1, write, 0, &page, NULL); 552
523 up_read(&fault->state->mm->mmap_sem); 553 down_read(&mm->mmap_sem);
524 554 vma = find_extend_vma(mm, address);
525 if (npages == 1) { 555 if (!vma || address < vma->vm_start) {
526 put_page(page); 556 /* failed to get a vma in the right range */
527 } else if (fault->dev_state->inv_ppr_cb) { 557 up_read(&mm->mmap_sem);
528 int status; 558 handle_fault_error(fault);
529 559 goto out;
530 status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, 560 }
531 fault->pasid, 561
532 fault->address, 562 ret = handle_mm_fault(mm, vma, address, write);
533 fault->flags); 563 if (ret & VM_FAULT_ERROR) {
534 switch (status) { 564 /* failed to service fault */
535 case AMD_IOMMU_INV_PRI_RSP_SUCCESS: 565 up_read(&mm->mmap_sem);
536 set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); 566 handle_fault_error(fault);
537 break; 567 goto out;
538 case AMD_IOMMU_INV_PRI_RSP_INVALID:
539 set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
540 break;
541 case AMD_IOMMU_INV_PRI_RSP_FAIL:
542 set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
543 break;
544 default:
545 BUG();
546 }
547 } else {
548 set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
549 } 568 }
550 569
570 up_read(&mm->mmap_sem);
571
572out:
551 finish_pri_tag(fault->dev_state, fault->state, fault->tag); 573 finish_pri_tag(fault->dev_state, fault->state, fault->tag);
552 574
553 put_pasid_state(fault->state); 575 put_pasid_state(fault->state);
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c
index 2cd8ffe5c698..942b267c6271 100644
--- a/drivers/rtc/rtc-snvs.c
+++ b/drivers/rtc/rtc-snvs.c
@@ -344,13 +344,20 @@ static int snvs_rtc_resume(struct device *dev)
344 344
345 return 0; 345 return 0;
346} 346}
347#endif
348 347
349static const struct dev_pm_ops snvs_rtc_pm_ops = { 348static const struct dev_pm_ops snvs_rtc_pm_ops = {
350 .suspend_noirq = snvs_rtc_suspend, 349 .suspend_noirq = snvs_rtc_suspend,
351 .resume_noirq = snvs_rtc_resume, 350 .resume_noirq = snvs_rtc_resume,
352}; 351};
353 352
353#define SNVS_RTC_PM_OPS (&snvs_rtc_pm_ops)
354
355#else
356
357#define SNVS_RTC_PM_OPS NULL
358
359#endif
360
354static const struct of_device_id snvs_dt_ids[] = { 361static const struct of_device_id snvs_dt_ids[] = {
355 { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, 362 { .compatible = "fsl,sec-v4.0-mon-rtc-lp", },
356 { /* sentinel */ } 363 { /* sentinel */ }
@@ -361,7 +368,7 @@ static struct platform_driver snvs_rtc_driver = {
361 .driver = { 368 .driver = {
362 .name = "snvs_rtc", 369 .name = "snvs_rtc",
363 .owner = THIS_MODULE, 370 .owner = THIS_MODULE,
364 .pm = &snvs_rtc_pm_ops, 371 .pm = SNVS_RTC_PM_OPS,
365 .of_match_table = snvs_dt_ids, 372 .of_match_table = snvs_dt_ids,
366 }, 373 },
367 .probe = snvs_rtc_probe, 374 .probe = snvs_rtc_probe,
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index ad4f5790a76f..46f8ef42559e 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -418,7 +418,7 @@ out:
418} 418}
419 419
420/* 420/*
421 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab 421 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c
422 * 422 *
423 * 'nr_to_scan' is the number of objects to scan for freeing. 423 * 'nr_to_scan' is the number of objects to scan for freeing.
424 * 424 *
@@ -785,7 +785,6 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
785 .nr_to_scan = LONG_MAX, 785 .nr_to_scan = LONG_MAX,
786 }; 786 };
787 ret = ashmem_shrink_count(&ashmem_shrinker, &sc); 787 ret = ashmem_shrink_count(&ashmem_shrinker, &sc);
788 nodes_setall(sc.nodes_to_scan);
789 ashmem_shrink_scan(&ashmem_shrinker, &sc); 788 ashmem_shrink_scan(&ashmem_shrinker, &sc);
790 } 789 }
791 break; 790 break;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 9bca88159725..ff44ff3ff015 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -135,8 +135,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
135extern void secs_to_datestamp(time_t secs, struct affs_date *ds); 135extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
136extern umode_t prot_to_mode(u32 prot); 136extern umode_t prot_to_mode(u32 prot);
137extern void mode_to_prot(struct inode *inode); 137extern void mode_to_prot(struct inode *inode);
138__printf(3, 4)
138extern void affs_error(struct super_block *sb, const char *function, 139extern void affs_error(struct super_block *sb, const char *function,
139 const char *fmt, ...); 140 const char *fmt, ...);
141__printf(3, 4)
140extern void affs_warning(struct super_block *sb, const char *function, 142extern void affs_warning(struct super_block *sb, const char *function,
141 const char *fmt, ...); 143 const char *fmt, ...);
142extern bool affs_nofilenametruncate(const struct dentry *dentry); 144extern bool affs_nofilenametruncate(const struct dentry *dentry);
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 937ce8754b24..c852f2fa1710 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,8 +10,6 @@
10 10
11#include "affs.h" 11#include "affs.h"
12 12
13static char ErrorBuffer[256];
14
15/* 13/*
16 * Functions for accessing Amiga-FFS structures. 14 * Functions for accessing Amiga-FFS structures.
17 */ 15 */
@@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode)
444void 442void
445affs_error(struct super_block *sb, const char *function, const char *fmt, ...) 443affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
446{ 444{
447 va_list args; 445 struct va_format vaf;
448 446 va_list args;
449 va_start(args,fmt);
450 vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
451 va_end(args);
452 447
453 pr_crit("error (device %s): %s(): %s\n", sb->s_id, 448 va_start(args, fmt);
454 function,ErrorBuffer); 449 vaf.fmt = fmt;
450 vaf.va = &args;
451 pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf);
455 if (!(sb->s_flags & MS_RDONLY)) 452 if (!(sb->s_flags & MS_RDONLY))
456 pr_warn("Remounting filesystem read-only\n"); 453 pr_warn("Remounting filesystem read-only\n");
457 sb->s_flags |= MS_RDONLY; 454 sb->s_flags |= MS_RDONLY;
455 va_end(args);
458} 456}
459 457
460void 458void
461affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) 459affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
462{ 460{
463 va_list args; 461 struct va_format vaf;
462 va_list args;
464 463
465 va_start(args,fmt); 464 va_start(args, fmt);
466 vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); 465 vaf.fmt = fmt;
466 vaf.va = &args;
467 pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf);
467 va_end(args); 468 va_end(args);
468
469 pr_warn("(device %s): %s(): %s\n", sb->s_id,
470 function,ErrorBuffer);
471} 469}
472 470
473bool 471bool
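
The affs_error()/affs_warning() conversion above drops the shared static ErrorBuffer in favour of printk's %pV extension: the caller's format string is expanded in one pass inside printk, so the fixed-size, unlocked intermediate buffer goes away. A minimal sketch of the struct va_format / %pV idiom, with a hypothetical log_prefixed() helper standing in for the affs wrappers:

#include <linux/kernel.h>
#include <linux/printk.h>

static __printf(2, 3) void log_prefixed(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* printk expands the caller's format directly, no fixed-size buffer */
	pr_info("%s: %pV\n", prefix, &vaf);
	va_end(args);
}
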
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1ed590aafecf..8faa6593ca6d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,35 +12,10 @@
12 * affs regular file handling primitives 12 * affs regular file handling primitives
13 */ 13 */
14 14
15#include <linux/aio.h>
15#include "affs.h" 16#include "affs.h"
16 17
17#if PAGE_SIZE < 4096
18#error PAGE_SIZE must be at least 4096
19#endif
20
21static int affs_grow_extcache(struct inode *inode, u32 lc_idx);
22static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext);
23static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext);
24static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); 18static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
25static int affs_file_open(struct inode *inode, struct file *filp);
26static int affs_file_release(struct inode *inode, struct file *filp);
27
28const struct file_operations affs_file_operations = {
29 .llseek = generic_file_llseek,
30 .read = new_sync_read,
31 .read_iter = generic_file_read_iter,
32 .write = new_sync_write,
33 .write_iter = generic_file_write_iter,
34 .mmap = generic_file_mmap,
35 .open = affs_file_open,
36 .release = affs_file_release,
37 .fsync = affs_file_fsync,
38 .splice_read = generic_file_splice_read,
39};
40
41const struct inode_operations affs_file_inode_operations = {
42 .setattr = affs_notify_change,
43};
44 19
45static int 20static int
46affs_file_open(struct inode *inode, struct file *filp) 21affs_file_open(struct inode *inode, struct file *filp)
@@ -355,7 +330,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
355 330
356 /* store new block */ 331 /* store new block */
357 if (bh_result->b_blocknr) 332 if (bh_result->b_blocknr)
358 affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); 333 affs_warning(sb, "get_block", "block already set (%lx)",
334 (unsigned long)bh_result->b_blocknr);
359 AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); 335 AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
360 AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); 336 AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
361 affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); 337 affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -377,7 +353,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
377 return 0; 353 return 0;
378 354
379err_big: 355err_big:
380 affs_error(inode->i_sb,"get_block","strange block request %d", block); 356 affs_error(inode->i_sb, "get_block", "strange block request %d",
357 (int)block);
381 return -EIO; 358 return -EIO;
382err_ext: 359err_ext:
383 // unlock cache 360 // unlock cache
@@ -412,6 +389,22 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
412 } 389 }
413} 390}
414 391
392static ssize_t
393affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
394 loff_t offset)
395{
396 struct file *file = iocb->ki_filp;
397 struct address_space *mapping = file->f_mapping;
398 struct inode *inode = mapping->host;
399 size_t count = iov_iter_count(iter);
400 ssize_t ret;
401
402 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
403 if (ret < 0 && (rw & WRITE))
404 affs_write_failed(mapping, offset + count);
405 return ret;
406}
407
415static int affs_write_begin(struct file *file, struct address_space *mapping, 408static int affs_write_begin(struct file *file, struct address_space *mapping,
416 loff_t pos, unsigned len, unsigned flags, 409 loff_t pos, unsigned len, unsigned flags,
417 struct page **pagep, void **fsdata) 410 struct page **pagep, void **fsdata)
@@ -438,6 +431,7 @@ const struct address_space_operations affs_aops = {
438 .writepage = affs_writepage, 431 .writepage = affs_writepage,
439 .write_begin = affs_write_begin, 432 .write_begin = affs_write_begin,
440 .write_end = generic_write_end, 433 .write_end = generic_write_end,
434 .direct_IO = affs_direct_IO,
441 .bmap = _affs_bmap 435 .bmap = _affs_bmap
442}; 436};
443 437
@@ -867,8 +861,9 @@ affs_truncate(struct inode *inode)
867 // lock cache 861 // lock cache
868 ext_bh = affs_get_extblock(inode, ext); 862 ext_bh = affs_get_extblock(inode, ext);
869 if (IS_ERR(ext_bh)) { 863 if (IS_ERR(ext_bh)) {
870 affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", 864 affs_warning(sb, "truncate",
871 ext, PTR_ERR(ext_bh)); 865 "unexpected read error for ext block %u (%ld)",
866 (unsigned int)ext, PTR_ERR(ext_bh));
872 return; 867 return;
873 } 868 }
874 if (AFFS_I(inode)->i_lc) { 869 if (AFFS_I(inode)->i_lc) {
@@ -914,8 +909,9 @@ affs_truncate(struct inode *inode)
914 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); 909 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
915 u32 tmp; 910 u32 tmp;
916 if (IS_ERR(bh)) { 911 if (IS_ERR(bh)) {
917 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", 912 affs_warning(sb, "truncate",
918 ext, PTR_ERR(bh)); 913 "unexpected read error for last block %u (%ld)",
914 (unsigned int)ext, PTR_ERR(bh));
919 return; 915 return;
920 } 916 }
921 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); 917 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
@@ -961,3 +957,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
961 mutex_unlock(&inode->i_mutex); 957 mutex_unlock(&inode->i_mutex);
962 return ret; 958 return ret;
963} 959}
960const struct file_operations affs_file_operations = {
961 .llseek = generic_file_llseek,
962 .read = new_sync_read,
963 .read_iter = generic_file_read_iter,
964 .write = new_sync_write,
965 .write_iter = generic_file_write_iter,
966 .mmap = generic_file_mmap,
967 .open = affs_file_open,
968 .release = affs_file_release,
969 .fsync = affs_file_fsync,
970 .splice_read = generic_file_splice_read,
971};
972
973const struct inode_operations affs_file_inode_operations = {
974 .setattr = affs_notify_change,
975};
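
With .direct_IO wired to blockdev_direct_IO(), AFFS files can now be opened with O_DIRECT and bypass the page cache. A rough userspace illustration only; the 4096-byte alignment is an assumption, since the real requirement depends on the underlying block device:

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("testfile", O_RDONLY | O_DIRECT);

	if (fd < 0)
		return 1;
	/* O_DIRECT generally wants block-aligned buffers, offsets and sizes */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	read(fd, buf, 4096);
	free(buf);
	close(fd);
	return 0;
}
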
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b94d1cc9cd30..edf47774b03d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -269,10 +269,6 @@ more:
269 } 269 }
270 ctx->pos++; 270 ctx->pos++;
271 goto more; 271 goto more;
272
273 befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
274
275 return 0;
276} 272}
277 273
278static struct inode * 274static struct inode *
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f37b08cea1f7..490538536cb4 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm)
42 return -ENOEXEC; 42 return -ENOEXEC;
43 } 43 }
44 44
45 /* Need to be able to load the file after exec */
46 if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
47 return -ENOENT;
48
45 allow_write_access(bprm->file); 49 allow_write_access(bprm->file);
46 fput(bprm->file); 50 fput(bprm->file);
47 bprm->file = NULL; 51 bprm->file = NULL;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 70789e198dea..c04ef1d4f18a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -144,6 +144,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
144 if (!fmt) 144 if (!fmt)
145 goto ret; 145 goto ret;
146 146
147 /* Need to be able to load the file after exec */
148 if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
149 return -ENOENT;
150
147 if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { 151 if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
148 retval = remove_arg_zero(bprm); 152 retval = remove_arg_zero(bprm);
149 if (retval) 153 if (retval)
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 5027a3e14922..afdf4e3cafc2 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm)
24 24
25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) 25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
26 return -ENOEXEC; 26 return -ENOEXEC;
27
28 /*
29 * If the script filename will be inaccessible after exec, typically
30 * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
31 * up now (on the assumption that the interpreter will want to load
32 * this file).
33 */
34 if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
35 return -ENOENT;
36
27 /* 37 /*
28 * This section does the #! interpretation. 38 * This section does the #! interpretation.
29 * Sorta complicated, but hopefully it will work. -TYT 39 * Sorta complicated, but hopefully it will work. -TYT
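
This check matters mainly for the new execveat() syscall: if a script was opened with O_CLOEXEC and executed by fd, the kernel could only hand the interpreter a /dev/fd/<n>/... path that no longer exists once the exec closes the descriptor, so failing early with -ENOENT is the honest answer. A hedged userspace sketch of the failing case (execveat() has no libc wrapper at this point, so a raw syscall() stands in, assuming the headers define __NR_execveat):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "script.sh", NULL };
	char *envp[] = { NULL };
	int fd = open("script.sh", O_RDONLY | O_CLOEXEC);	/* a "#!/bin/sh" script */

	/* Expected to fail with ENOENT: the #! interpreter could never
	 * reopen /dev/fd/<fd>/... after the close-on-exec fd is gone. */
	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	return 1;
}
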
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 1de7294aad20..2bc2c87f35e7 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
40static void drop_slab(void) 40static void drop_slab(void)
41{ 41{
42 int nr_objects; 42 int nr_objects;
43 struct shrink_control shrink = {
44 .gfp_mask = GFP_KERNEL,
45 };
46 43
47 nodes_setall(shrink.nodes_to_scan);
48 do { 44 do {
49 nr_objects = shrink_slab(&shrink, 1000, 1000); 45 int nid;
46
47 nr_objects = 0;
48 for_each_online_node(nid)
49 nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
50 1000, 1000);
50 } while (nr_objects > 10); 51 } while (nr_objects > 10);
51} 52}
52 53
diff --git a/fs/exec.c b/fs/exec.c
index 01aebe300200..ad8798e26be9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -748,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages);
748 748
749#endif /* CONFIG_MMU */ 749#endif /* CONFIG_MMU */
750 750
751static struct file *do_open_exec(struct filename *name) 751static struct file *do_open_execat(int fd, struct filename *name, int flags)
752{ 752{
753 struct file *file; 753 struct file *file;
754 int err; 754 int err;
755 static const struct open_flags open_exec_flags = { 755 struct open_flags open_exec_flags = {
756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
757 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
758 .intent = LOOKUP_OPEN, 758 .intent = LOOKUP_OPEN,
759 .lookup_flags = LOOKUP_FOLLOW, 759 .lookup_flags = LOOKUP_FOLLOW,
760 }; 760 };
761 761
762 file = do_filp_open(AT_FDCWD, name, &open_exec_flags); 762 if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
763 return ERR_PTR(-EINVAL);
764 if (flags & AT_SYMLINK_NOFOLLOW)
765 open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
766 if (flags & AT_EMPTY_PATH)
767 open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
768
769 file = do_filp_open(fd, name, &open_exec_flags);
763 if (IS_ERR(file)) 770 if (IS_ERR(file))
764 goto out; 771 goto out;
765 772
@@ -770,12 +777,13 @@ static struct file *do_open_exec(struct filename *name)
770 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 777 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
771 goto exit; 778 goto exit;
772 779
773 fsnotify_open(file);
774
775 err = deny_write_access(file); 780 err = deny_write_access(file);
776 if (err) 781 if (err)
777 goto exit; 782 goto exit;
778 783
784 if (name->name[0] != '\0')
785 fsnotify_open(file);
786
779out: 787out:
780 return file; 788 return file;
781 789
@@ -787,7 +795,7 @@ exit:
787struct file *open_exec(const char *name) 795struct file *open_exec(const char *name)
788{ 796{
789 struct filename tmp = { .name = name }; 797 struct filename tmp = { .name = name };
790 return do_open_exec(&tmp); 798 return do_open_execat(AT_FDCWD, &tmp, 0);
791} 799}
792EXPORT_SYMBOL(open_exec); 800EXPORT_SYMBOL(open_exec);
793 801
@@ -1428,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm)
1428/* 1436/*
1429 * sys_execve() executes a new program. 1437 * sys_execve() executes a new program.
1430 */ 1438 */
1431static int do_execve_common(struct filename *filename, 1439static int do_execveat_common(int fd, struct filename *filename,
1432 struct user_arg_ptr argv, 1440 struct user_arg_ptr argv,
1433 struct user_arg_ptr envp) 1441 struct user_arg_ptr envp,
1442 int flags)
1434{ 1443{
1444 char *pathbuf = NULL;
1435 struct linux_binprm *bprm; 1445 struct linux_binprm *bprm;
1436 struct file *file; 1446 struct file *file;
1437 struct files_struct *displaced; 1447 struct files_struct *displaced;
@@ -1472,7 +1482,7 @@ static int do_execve_common(struct filename *filename,
1472 check_unsafe_exec(bprm); 1482 check_unsafe_exec(bprm);
1473 current->in_execve = 1; 1483 current->in_execve = 1;
1474 1484
1475 file = do_open_exec(filename); 1485 file = do_open_execat(fd, filename, flags);
1476 retval = PTR_ERR(file); 1486 retval = PTR_ERR(file);
1477 if (IS_ERR(file)) 1487 if (IS_ERR(file))
1478 goto out_unmark; 1488 goto out_unmark;
@@ -1480,7 +1490,28 @@ static int do_execve_common(struct filename *filename,
1480 sched_exec(); 1490 sched_exec();
1481 1491
1482 bprm->file = file; 1492 bprm->file = file;
1483 bprm->filename = bprm->interp = filename->name; 1493 if (fd == AT_FDCWD || filename->name[0] == '/') {
1494 bprm->filename = filename->name;
1495 } else {
1496 if (filename->name[0] == '\0')
1497 pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
1498 else
1499 pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
1500 fd, filename->name);
1501 if (!pathbuf) {
1502 retval = -ENOMEM;
1503 goto out_unmark;
1504 }
1505 /*
1506 * Record that a name derived from an O_CLOEXEC fd will be
1507 * inaccessible after exec. Relies on having exclusive access to
1508 * current->files (due to unshare_files above).
1509 */
1510 if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1511 bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1512 bprm->filename = pathbuf;
1513 }
1514 bprm->interp = bprm->filename;
1484 1515
1485 retval = bprm_mm_init(bprm); 1516 retval = bprm_mm_init(bprm);
1486 if (retval) 1517 if (retval)
@@ -1521,6 +1552,7 @@ static int do_execve_common(struct filename *filename,
1521 acct_update_integrals(current); 1552 acct_update_integrals(current);
1522 task_numa_free(current); 1553 task_numa_free(current);
1523 free_bprm(bprm); 1554 free_bprm(bprm);
1555 kfree(pathbuf);
1524 putname(filename); 1556 putname(filename);
1525 if (displaced) 1557 if (displaced)
1526 put_files_struct(displaced); 1558 put_files_struct(displaced);
@@ -1538,6 +1570,7 @@ out_unmark:
1538 1570
1539out_free: 1571out_free:
1540 free_bprm(bprm); 1572 free_bprm(bprm);
1573 kfree(pathbuf);
1541 1574
1542out_files: 1575out_files:
1543 if (displaced) 1576 if (displaced)
@@ -1553,7 +1586,18 @@ int do_execve(struct filename *filename,
1553{ 1586{
1554 struct user_arg_ptr argv = { .ptr.native = __argv }; 1587 struct user_arg_ptr argv = { .ptr.native = __argv };
1555 struct user_arg_ptr envp = { .ptr.native = __envp }; 1588 struct user_arg_ptr envp = { .ptr.native = __envp };
1556 return do_execve_common(filename, argv, envp); 1589 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1590}
1591
1592int do_execveat(int fd, struct filename *filename,
1593 const char __user *const __user *__argv,
1594 const char __user *const __user *__envp,
1595 int flags)
1596{
1597 struct user_arg_ptr argv = { .ptr.native = __argv };
1598 struct user_arg_ptr envp = { .ptr.native = __envp };
1599
1600 return do_execveat_common(fd, filename, argv, envp, flags);
1557} 1601}
1558 1602
1559#ifdef CONFIG_COMPAT 1603#ifdef CONFIG_COMPAT
@@ -1569,7 +1613,23 @@ static int compat_do_execve(struct filename *filename,
1569 .is_compat = true, 1613 .is_compat = true,
1570 .ptr.compat = __envp, 1614 .ptr.compat = __envp,
1571 }; 1615 };
1572 return do_execve_common(filename, argv, envp); 1616 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1617}
1618
1619static int compat_do_execveat(int fd, struct filename *filename,
1620 const compat_uptr_t __user *__argv,
1621 const compat_uptr_t __user *__envp,
1622 int flags)
1623{
1624 struct user_arg_ptr argv = {
1625 .is_compat = true,
1626 .ptr.compat = __argv,
1627 };
1628 struct user_arg_ptr envp = {
1629 .is_compat = true,
1630 .ptr.compat = __envp,
1631 };
1632 return do_execveat_common(fd, filename, argv, envp, flags);
1573} 1633}
1574#endif 1634#endif
1575 1635
@@ -1609,6 +1669,20 @@ SYSCALL_DEFINE3(execve,
1609{ 1669{
1610 return do_execve(getname(filename), argv, envp); 1670 return do_execve(getname(filename), argv, envp);
1611} 1671}
1672
1673SYSCALL_DEFINE5(execveat,
1674 int, fd, const char __user *, filename,
1675 const char __user *const __user *, argv,
1676 const char __user *const __user *, envp,
1677 int, flags)
1678{
1679 int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1680
1681 return do_execveat(fd,
1682 getname_flags(filename, lookup_flags, NULL),
1683 argv, envp, flags);
1684}
1685
1612#ifdef CONFIG_COMPAT 1686#ifdef CONFIG_COMPAT
1613COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, 1687COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1614 const compat_uptr_t __user *, argv, 1688 const compat_uptr_t __user *, argv,
@@ -1616,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1616{ 1690{
1617 return compat_do_execve(getname(filename), argv, envp); 1691 return compat_do_execve(getname(filename), argv, envp);
1618} 1692}
1693
1694COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
1695 const char __user *, filename,
1696 const compat_uptr_t __user *, argv,
1697 const compat_uptr_t __user *, envp,
1698 int, flags)
1699{
1700 int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1701
1702 return compat_do_execveat(fd,
1703 getname_flags(filename, lookup_flags, NULL),
1704 argv, envp, flags);
1705}
1619#endif 1706#endif
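
For reference, the new syscall is used roughly like this from userspace (no glibc wrapper exists yet, so a raw syscall() is shown, assuming the headers define __NR_execveat). Executing an already-open binary via an empty path plus AT_EMPTY_PATH is the case a proper fexecve() wants:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "true", NULL };
	char *envp[] = { NULL };
	int fd = open("/bin/true", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 1;
	/* Empty path + AT_EMPTY_PATH: the fd itself is the program.
	 * A plain ELF binary never needs to reopen its own path, so the
	 * close-on-exec fd is fine here. */
	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	return 1;		/* only reached if the exec failed */
}
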
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e0c4ba39a377..64e295e8ff38 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
370 int datasync); 370 int datasync);
371 371
372/* fat/inode.c */ 372/* fat/inode.c */
373extern int fat_block_truncate_page(struct inode *inode, loff_t from);
373extern void fat_attach(struct inode *inode, loff_t i_pos); 374extern void fat_attach(struct inode *inode, loff_t i_pos);
374extern void fat_detach(struct inode *inode); 375extern void fat_detach(struct inode *inode);
375extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); 376extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 85f79a89e747..8429c68e3057 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
443 } 443 }
444 444
445 if (attr->ia_valid & ATTR_SIZE) { 445 if (attr->ia_valid & ATTR_SIZE) {
446 error = fat_block_truncate_page(inode, attr->ia_size);
447 if (error)
448 goto out;
446 down_write(&MSDOS_I(inode)->truncate_lock); 449 down_write(&MSDOS_I(inode)->truncate_lock);
447 truncate_setsize(inode, attr->ia_size); 450 truncate_setsize(inode, attr->ia_size);
448 fat_truncate_blocks(inode, attr->ia_size); 451 fat_truncate_blocks(inode, attr->ia_size);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 756aead10d96..7b41a2dcdd76 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
294 return blocknr; 294 return blocknr;
295} 295}
296 296
297/*
298 * fat_block_truncate_page() zeroes out a mapping from file offset `from'
299 * up to the end of the block which corresponds to `from'.
300 * This is required during truncate to physically zeroout the tail end
301 * of that block so it doesn't yield old data if the file is later grown.
302 * Also, avoid causing failure from fsx for cases of "data past EOF"
303 */
304int fat_block_truncate_page(struct inode *inode, loff_t from)
305{
306 return block_truncate_page(inode->i_mapping, from, fat_get_block);
307}
308
297static const struct address_space_operations fat_aops = { 309static const struct address_space_operations fat_aops = {
298 .readpage = fat_readpage, 310 .readpage = fat_readpage,
299 .readpages = fat_readpages, 311 .readpages = fat_readpages,
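
The scenario fat_block_truncate_page() guards against, roughly: shrink a FAT file to a size that lands mid-block, then grow it again; without zeroing the tail of the final block at truncate time, the bytes between the two sizes could read back as stale data instead of zeros (the "data past EOF" fsx failures the comment mentions). A trivial userspace illustration of that sequence:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("f.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	write(fd, "ABCDEFGH", 8);
	ftruncate(fd, 3);	/* new EOF in the middle of a block */
	ftruncate(fd, 8);	/* grow again: bytes 3..7 must now read as zero */
	close(fd);
	return 0;
}
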
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1e2872b25343..5eba47f593f8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
412 pgoff = offset >> PAGE_SHIFT; 412 pgoff = offset >> PAGE_SHIFT;
413 413
414 i_size_write(inode, offset); 414 i_size_write(inode, offset);
415 mutex_lock(&mapping->i_mmap_mutex); 415 i_mmap_lock_write(mapping);
416 if (!RB_EMPTY_ROOT(&mapping->i_mmap)) 416 if (!RB_EMPTY_ROOT(&mapping->i_mmap))
417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
418 mutex_unlock(&mapping->i_mmap_mutex); 418 i_mmap_unlock_write(mapping);
419 truncate_hugepages(inode, offset); 419 truncate_hugepages(inode, offset);
420 return 0; 420 return 0;
421} 421}
@@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
472} 472}
473 473
474/* 474/*
475 * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never 475 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
476 * be taken from reclaim -- unlike regular filesystems. This needs an 476 * be taken from reclaim -- unlike regular filesystems. This needs an
477 * annotation because huge_pmd_share() does an allocation under 477 * annotation because huge_pmd_share() does an allocation under
478 * i_mmap_mutex. 478 * i_mmap_rwsem.
479 */ 479 */
480static struct lock_class_key hugetlbfs_i_mmap_mutex_key; 480static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
481 481
482static struct inode *hugetlbfs_get_inode(struct super_block *sb, 482static struct inode *hugetlbfs_get_inode(struct super_block *sb,
483 struct inode *dir, 483 struct inode *dir,
@@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
495 struct hugetlbfs_inode_info *info; 495 struct hugetlbfs_inode_info *info;
496 inode->i_ino = get_next_ino(); 496 inode->i_ino = get_next_ino();
497 inode_init_owner(inode, dir, mode); 497 inode_init_owner(inode, dir, mode);
498 lockdep_set_class(&inode->i_mapping->i_mmap_mutex, 498 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
499 &hugetlbfs_i_mmap_mutex_key); 499 &hugetlbfs_i_mmap_rwsem_key);
500 inode->i_mapping->a_ops = &hugetlbfs_aops; 500 inode->i_mapping->a_ops = &hugetlbfs_aops;
501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 2ed95f7caa4f..ad60555b4768 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -346,7 +346,7 @@ void address_space_init_once(struct address_space *mapping)
346 memset(mapping, 0, sizeof(*mapping)); 346 memset(mapping, 0, sizeof(*mapping));
347 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 347 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
348 spin_lock_init(&mapping->tree_lock); 348 spin_lock_init(&mapping->tree_lock);
349 mutex_init(&mapping->i_mmap_mutex); 349 init_rwsem(&mapping->i_mmap_rwsem);
350 INIT_LIST_HEAD(&mapping->private_list); 350 INIT_LIST_HEAD(&mapping->private_list);
351 spin_lock_init(&mapping->private_lock); 351 spin_lock_init(&mapping->private_lock);
352 mapping->i_mmap = RB_ROOT; 352 mapping->i_mmap = RB_ROOT;
diff --git a/fs/namei.c b/fs/namei.c
index db5fe86319e6..ca814165d84c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -130,7 +130,7 @@ void final_putname(struct filename *name)
130 130
131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 132
133static struct filename * 133struct filename *
134getname_flags(const char __user *filename, int flags, int *empty) 134getname_flags(const char __user *filename, int flags, int *empty)
135{ 135{
136 struct filename *result, *err; 136 struct filename *result, *err;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index caaaf9dfe353..44523f4a6084 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
69 if (old_mask == new_mask) 69 if (old_mask == new_mask)
70 return; 70 return;
71 71
72 if (fsn_mark->i.inode) 72 if (fsn_mark->inode)
73 fsnotify_recalc_inode_mask(fsn_mark->i.inode); 73 fsnotify_recalc_inode_mask(fsn_mark->inode);
74} 74}
75 75
76/* 76/*
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 6ffd220eb14d..58b7cdb63da9 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -80,7 +80,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
80 return; 80 return;
81 81
82 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); 82 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
83 inode = igrab(mark->i.inode); 83 inode = igrab(mark->inode);
84 if (inode) { 84 if (inode) {
85 seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", 85 seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
86 inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, 86 inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
@@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
112 mflags |= FAN_MARK_IGNORED_SURV_MODIFY; 112 mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
113 113
114 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 114 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
115 inode = igrab(mark->i.inode); 115 inode = igrab(mark->inode);
116 if (!inode) 116 if (!inode)
117 return; 117 return;
118 seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", 118 seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
@@ -122,7 +122,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
122 seq_putc(m, '\n'); 122 seq_putc(m, '\n');
123 iput(inode); 123 iput(inode);
124 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { 124 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
125 struct mount *mnt = real_mount(mark->m.mnt); 125 struct mount *mnt = real_mount(mark->mnt);
126 126
127 seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", 127 seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
128 mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); 128 mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 41e39102743a..dd3fb0b17be7 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -242,13 +242,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
242 242
243 if (inode_node) { 243 if (inode_node) {
244 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), 244 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
245 struct fsnotify_mark, i.i_list); 245 struct fsnotify_mark, obj_list);
246 inode_group = inode_mark->group; 246 inode_group = inode_mark->group;
247 } 247 }
248 248
249 if (vfsmount_node) { 249 if (vfsmount_node) {
250 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), 250 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
251 struct fsnotify_mark, m.m_list); 251 struct fsnotify_mark, obj_list);
252 vfsmount_group = vfsmount_mark->group; 252 vfsmount_group = vfsmount_mark->group;
253 } 253 }
254 254
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 3b68b0ae0a97..13a00be516d2 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,12 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
12/* protects reads of inode and vfsmount marks list */ 12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu; 13extern struct srcu_struct fsnotify_mark_srcu;
14 14
15/* Calculate mask of events for a list of marks */
16extern u32 fsnotify_recalc_mask(struct hlist_head *head);
17
15/* compare two groups for sorting of marks lists */ 18/* compare two groups for sorting of marks lists */
16extern int fsnotify_compare_groups(struct fsnotify_group *a, 19extern int fsnotify_compare_groups(struct fsnotify_group *a,
17 struct fsnotify_group *b); 20 struct fsnotify_group *b);
18 21
19extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, 22extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
20 __u32 mask); 23 __u32 mask);
24/* Add mark to a proper place in mark list */
25extern int fsnotify_add_mark_list(struct hlist_head *head,
26 struct fsnotify_mark *mark,
27 int allow_dups);
21/* add a mark to an inode */ 28/* add a mark to an inode */
22extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, 29extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct inode *inode, 30 struct fsnotify_group *group, struct inode *inode,
@@ -31,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
31extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); 38extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
32/* inode specific destruction of a mark */ 39/* inode specific destruction of a mark */
33extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); 40extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
41/* Destroy all marks in the given list */
42extern void fsnotify_destroy_marks(struct list_head *to_free);
43/* Find mark belonging to given group in the list of marks */
44extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
45 struct fsnotify_group *group);
34/* run the list of all marks associated with inode and flag them to be freed */ 46/* run the list of all marks associated with inode and flag them to be freed */
35extern void fsnotify_clear_marks_by_inode(struct inode *inode); 47extern void fsnotify_clear_marks_by_inode(struct inode *inode);
36/* run the list of all marks associated with vfsmount and flag them to be freed */ 48/* run the list of all marks associated with vfsmount and flag them to be freed */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index dfbf5447eea4..3daf513ee99e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -31,28 +31,13 @@
31#include "../internal.h" 31#include "../internal.h"
32 32
33/* 33/*
34 * Recalculate the mask of events relevant to a given inode locked.
35 */
36static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
37{
38 struct fsnotify_mark *mark;
39 __u32 new_mask = 0;
40
41 assert_spin_locked(&inode->i_lock);
42
43 hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list)
44 new_mask |= mark->mask;
45 inode->i_fsnotify_mask = new_mask;
46}
47
48/*
49 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types 34 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
50 * any notifier is interested in hearing for this inode. 35 * any notifier is interested in hearing for this inode.
51 */ 36 */
52void fsnotify_recalc_inode_mask(struct inode *inode) 37void fsnotify_recalc_inode_mask(struct inode *inode)
53{ 38{
54 spin_lock(&inode->i_lock); 39 spin_lock(&inode->i_lock);
55 fsnotify_recalc_inode_mask_locked(inode); 40 inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
56 spin_unlock(&inode->i_lock); 41 spin_unlock(&inode->i_lock);
57 42
58 __fsnotify_update_child_dentry_flags(inode); 43 __fsnotify_update_child_dentry_flags(inode);
@@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
60 45
61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) 46void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
62{ 47{
63 struct inode *inode = mark->i.inode; 48 struct inode *inode = mark->inode;
64 49
65 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); 50 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
66 assert_spin_locked(&mark->lock); 51 assert_spin_locked(&mark->lock);
67 52
68 spin_lock(&inode->i_lock); 53 spin_lock(&inode->i_lock);
69 54
70 hlist_del_init_rcu(&mark->i.i_list); 55 hlist_del_init_rcu(&mark->obj_list);
71 mark->i.inode = NULL; 56 mark->inode = NULL;
72 57
73 /* 58 /*
74 * this mark is now off the inode->i_fsnotify_marks list and we 59 * this mark is now off the inode->i_fsnotify_marks list and we
75 * hold the inode->i_lock, so this is the perfect time to update the 60 * hold the inode->i_lock, so this is the perfect time to update the
76 * inode->i_fsnotify_mask 61 * inode->i_fsnotify_mask
77 */ 62 */
78 fsnotify_recalc_inode_mask_locked(inode); 63 inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
79
80 spin_unlock(&inode->i_lock); 64 spin_unlock(&inode->i_lock);
81} 65}
82 66
@@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
85 */ 69 */
86void fsnotify_clear_marks_by_inode(struct inode *inode) 70void fsnotify_clear_marks_by_inode(struct inode *inode)
87{ 71{
88 struct fsnotify_mark *mark, *lmark; 72 struct fsnotify_mark *mark;
89 struct hlist_node *n; 73 struct hlist_node *n;
90 LIST_HEAD(free_list); 74 LIST_HEAD(free_list);
91 75
92 spin_lock(&inode->i_lock); 76 spin_lock(&inode->i_lock);
93 hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { 77 hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
94 list_add(&mark->i.free_i_list, &free_list); 78 list_add(&mark->free_list, &free_list);
95 hlist_del_init_rcu(&mark->i.i_list); 79 hlist_del_init_rcu(&mark->obj_list);
96 fsnotify_get_mark(mark); 80 fsnotify_get_mark(mark);
97 } 81 }
98 spin_unlock(&inode->i_lock); 82 spin_unlock(&inode->i_lock);
99 83
100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { 84 fsnotify_destroy_marks(&free_list);
101 struct fsnotify_group *group;
102
103 spin_lock(&mark->lock);
104 fsnotify_get_group(mark->group);
105 group = mark->group;
106 spin_unlock(&mark->lock);
107
108 fsnotify_destroy_mark(mark, group);
109 fsnotify_put_mark(mark);
110 fsnotify_put_group(group);
111 }
112} 85}
113 86
114/* 87/*
@@ -123,34 +96,13 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
123 * given a group and inode, find the mark associated with that combination. 96 * given a group and inode, find the mark associated with that combination.
124 * if found take a reference to that mark and return it, else return NULL 97 * if found take a reference to that mark and return it, else return NULL
125 */ 98 */
126static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
127 struct fsnotify_group *group,
128 struct inode *inode)
129{
130 struct fsnotify_mark *mark;
131
132 assert_spin_locked(&inode->i_lock);
133
134 hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) {
135 if (mark->group == group) {
136 fsnotify_get_mark(mark);
137 return mark;
138 }
139 }
140 return NULL;
141}
142
143/*
144 * given a group and inode, find the mark associated with that combination.
145 * if found take a reference to that mark and return it, else return NULL
146 */
147struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, 99struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
148 struct inode *inode) 100 struct inode *inode)
149{ 101{
150 struct fsnotify_mark *mark; 102 struct fsnotify_mark *mark;
151 103
152 spin_lock(&inode->i_lock); 104 spin_lock(&inode->i_lock);
153 mark = fsnotify_find_inode_mark_locked(group, inode); 105 mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
154 spin_unlock(&inode->i_lock); 106 spin_unlock(&inode->i_lock);
155 107
156 return mark; 108 return mark;
@@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
168 assert_spin_locked(&mark->lock); 120 assert_spin_locked(&mark->lock);
169 121
170 if (mask && 122 if (mask &&
171 mark->i.inode && 123 mark->inode &&
172 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { 124 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
173 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; 125 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
174 inode = igrab(mark->i.inode); 126 inode = igrab(mark->inode);
175 /* 127 /*
176 * we shouldn't be able to get here if the inode wasn't 128 * we shouldn't be able to get here if the inode wasn't
177 * already safely held in memory. But bug in case it 129 * already safely held in memory. But bug in case it
@@ -192,9 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
192 struct fsnotify_group *group, struct inode *inode, 144 struct fsnotify_group *group, struct inode *inode,
193 int allow_dups) 145 int allow_dups)
194{ 146{
195 struct fsnotify_mark *lmark, *last = NULL; 147 int ret;
196 int ret = 0;
197 int cmp;
198 148
199 mark->flags |= FSNOTIFY_MARK_FLAG_INODE; 149 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
200 150
@@ -202,37 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
202 assert_spin_locked(&mark->lock); 152 assert_spin_locked(&mark->lock);
203 153
204 spin_lock(&inode->i_lock); 154 spin_lock(&inode->i_lock);
205 155 mark->inode = inode;
206 mark->i.inode = inode; 156 ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
207 157 allow_dups);
208 /* is mark the first mark? */ 158 inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
209 if (hlist_empty(&inode->i_fsnotify_marks)) {
210 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
211 goto out;
212 }
213
214 /* should mark be in the middle of the current list? */
215 hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) {
216 last = lmark;
217
218 if ((lmark->group == group) && !allow_dups) {
219 ret = -EEXIST;
220 goto out;
221 }
222
223 cmp = fsnotify_compare_groups(lmark->group, mark->group);
224 if (cmp < 0)
225 continue;
226
227 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
228 goto out;
229 }
230
231 BUG_ON(last == NULL);
232 /* mark should be the last entry. last is the current last entry */
233 hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list);
234out:
235 fsnotify_recalc_inode_mask_locked(inode);
236 spin_unlock(&inode->i_lock); 159 spin_unlock(&inode->i_lock);
237 160
238 return ret; 161 return ret;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7d888d77d59a..2cd900c2c737 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data)
156 */ 156 */
157 if (fsn_mark) 157 if (fsn_mark)
158 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", 158 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
159 fsn_mark->group, fsn_mark->i.inode, i_mark->wd); 159 fsn_mark->group, fsn_mark->inode, i_mark->wd);
160 return 0; 160 return 0;
161} 161}
162 162
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 283aa312d745..450648697433 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -433,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
433 if (wd == -1) { 433 if (wd == -1) {
434 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" 434 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
435 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, 435 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
436 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); 436 i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
437 goto out; 437 goto out;
438 } 438 }
439 439
@@ -442,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
442 if (unlikely(!found_i_mark)) { 442 if (unlikely(!found_i_mark)) {
443 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" 443 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
444 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, 444 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
445 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); 445 i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
446 goto out; 446 goto out;
447 } 447 }
448 448
@@ -456,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
456 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " 456 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
457 "found_i_mark->group=%p found_i_mark->inode=%p\n", 457 "found_i_mark->group=%p found_i_mark->inode=%p\n",
458 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, 458 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
459 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, 459 i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
460 found_i_mark->fsn_mark.group, 460 found_i_mark->fsn_mark.group,
461 found_i_mark->fsn_mark.i.inode); 461 found_i_mark->fsn_mark.inode);
462 goto out; 462 goto out;
463 } 463 }
464 464
@@ -470,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
470 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { 470 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
471 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" 471 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
472 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, 472 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
473 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); 473 i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
474 /* we can't really recover with bad ref cnting.. */ 474 /* we can't really recover with bad ref cnting.. */
475 BUG(); 475 BUG();
476 } 476 }
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 34c38fabf514..92e48c70f0f0 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
110 } 110 }
111} 111}
112 112
113/* Calculate mask of events for a list of marks */
114u32 fsnotify_recalc_mask(struct hlist_head *head)
115{
116 u32 new_mask = 0;
117 struct fsnotify_mark *mark;
118
119 hlist_for_each_entry(mark, head, obj_list)
120 new_mask |= mark->mask;
121 return new_mask;
122}
123
113/* 124/*
114 * Any time a mark is getting freed we end up here. 125 * Any time a mark is getting freed we end up here.
115 * The caller had better be holding a reference to this mark so we don't actually 126 * The caller had better be holding a reference to this mark so we don't actually
@@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
133 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 144 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
134 145
135 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 146 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
136 inode = mark->i.inode; 147 inode = mark->inode;
137 fsnotify_destroy_inode_mark(mark); 148 fsnotify_destroy_inode_mark(mark);
138 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) 149 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
139 fsnotify_destroy_vfsmount_mark(mark); 150 fsnotify_destroy_vfsmount_mark(mark);
@@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
150 mutex_unlock(&group->mark_mutex); 161 mutex_unlock(&group->mark_mutex);
151 162
152 spin_lock(&destroy_lock); 163 spin_lock(&destroy_lock);
153 list_add(&mark->destroy_list, &destroy_list); 164 list_add(&mark->g_list, &destroy_list);
154 spin_unlock(&destroy_lock); 165 spin_unlock(&destroy_lock);
155 wake_up(&destroy_waitq); 166 wake_up(&destroy_waitq);
156 /* 167 /*
@@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
192 mutex_unlock(&group->mark_mutex); 203 mutex_unlock(&group->mark_mutex);
193} 204}
194 205
206/*
207 * Destroy all marks in the given list. The marks must be already detached from
208 * the original inode / vfsmount.
209 */
210void fsnotify_destroy_marks(struct list_head *to_free)
211{
212 struct fsnotify_mark *mark, *lmark;
213 struct fsnotify_group *group;
214
215 list_for_each_entry_safe(mark, lmark, to_free, free_list) {
216 spin_lock(&mark->lock);
217 fsnotify_get_group(mark->group);
218 group = mark->group;
219 spin_unlock(&mark->lock);
220
221 fsnotify_destroy_mark(mark, group);
222 fsnotify_put_mark(mark);
223 fsnotify_put_group(group);
224 }
225}
226
195void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) 227void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
196{ 228{
197 assert_spin_locked(&mark->lock); 229 assert_spin_locked(&mark->lock);
@@ -245,6 +277,39 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
245 return -1; 277 return -1;
246} 278}
247 279
280/* Add mark into proper place in given list of marks */
281int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
282 int allow_dups)
283{
284 struct fsnotify_mark *lmark, *last = NULL;
285 int cmp;
286
287 /* is mark the first mark? */
288 if (hlist_empty(head)) {
289 hlist_add_head_rcu(&mark->obj_list, head);
290 return 0;
291 }
292
293 /* should mark be in the middle of the current list? */
294 hlist_for_each_entry(lmark, head, obj_list) {
295 last = lmark;
296
297 if ((lmark->group == mark->group) && !allow_dups)
298 return -EEXIST;
299
300 cmp = fsnotify_compare_groups(lmark->group, mark->group);
301 if (cmp >= 0) {
302 hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
303 return 0;
304 }
305 }
306
307 BUG_ON(last == NULL);
308 /* mark should be the last entry. last is the current last entry */
309 hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
310 return 0;
311}
312
248/* 313/*
249 * Attach an initialized mark to a given group and fs object. 314 * Attach an initialized mark to a given group and fs object.
250 * These marks may be used for the fsnotify backend to determine which 315 * These marks may be used for the fsnotify backend to determine which
@@ -305,7 +370,7 @@ err:
305 spin_unlock(&mark->lock); 370 spin_unlock(&mark->lock);
306 371
307 spin_lock(&destroy_lock); 372 spin_lock(&destroy_lock);
308 list_add(&mark->destroy_list, &destroy_list); 373 list_add(&mark->g_list, &destroy_list);
309 spin_unlock(&destroy_lock); 374 spin_unlock(&destroy_lock);
310 wake_up(&destroy_waitq); 375 wake_up(&destroy_waitq);
311 376
@@ -323,6 +388,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
323} 388}
324 389
325/* 390/*
391 * Given a list of marks, find the mark associated with given group. If found
392 * take a reference to that mark and return it, else return NULL.
393 */
394struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
395 struct fsnotify_group *group)
396{
397 struct fsnotify_mark *mark;
398
399 hlist_for_each_entry(mark, head, obj_list) {
400 if (mark->group == group) {
401 fsnotify_get_mark(mark);
402 return mark;
403 }
404 }
405 return NULL;
406}
407
408/*
326 * clear any marks in a group in which mark->flags & flags is true 409 * clear any marks in a group in which mark->flags & flags is true
327 */ 410 */
328void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, 411void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
@@ -352,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
352void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) 435void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
353{ 436{
354 assert_spin_locked(&old->lock); 437 assert_spin_locked(&old->lock);
355 new->i.inode = old->i.inode; 438 new->inode = old->inode;
356 new->m.mnt = old->m.mnt; 439 new->mnt = old->mnt;
357 if (old->group) 440 if (old->group)
358 fsnotify_get_group(old->group); 441 fsnotify_get_group(old->group);
359 new->group = old->group; 442 new->group = old->group;
@@ -386,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored)
386 469
387 synchronize_srcu(&fsnotify_mark_srcu); 470 synchronize_srcu(&fsnotify_mark_srcu);
388 471
389 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { 472 list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
390 list_del_init(&mark->destroy_list); 473 list_del_init(&mark->g_list);
391 fsnotify_put_mark(mark); 474 fsnotify_put_mark(mark);
392 } 475 }
393 476
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index faefa72a11eb..326b148e623c 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -32,31 +32,20 @@
32 32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) 33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{ 34{
35 struct fsnotify_mark *mark, *lmark; 35 struct fsnotify_mark *mark;
36 struct hlist_node *n; 36 struct hlist_node *n;
37 struct mount *m = real_mount(mnt); 37 struct mount *m = real_mount(mnt);
38 LIST_HEAD(free_list); 38 LIST_HEAD(free_list);
39 39
40 spin_lock(&mnt->mnt_root->d_lock); 40 spin_lock(&mnt->mnt_root->d_lock);
41 hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { 41 hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
42 list_add(&mark->m.free_m_list, &free_list); 42 list_add(&mark->free_list, &free_list);
43 hlist_del_init_rcu(&mark->m.m_list); 43 hlist_del_init_rcu(&mark->obj_list);
44 fsnotify_get_mark(mark); 44 fsnotify_get_mark(mark);
45 } 45 }
46 spin_unlock(&mnt->mnt_root->d_lock); 46 spin_unlock(&mnt->mnt_root->d_lock);
47 47
48 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { 48 fsnotify_destroy_marks(&free_list);
49 struct fsnotify_group *group;
50
51 spin_lock(&mark->lock);
52 fsnotify_get_group(mark->group);
53 group = mark->group;
54 spin_unlock(&mark->lock);
55
56 fsnotify_destroy_mark(mark, group);
57 fsnotify_put_mark(mark);
58 fsnotify_put_group(group);
59 }
60} 49}
61 50
62void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) 51void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
@@ -65,66 +54,35 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
65} 54}
66 55
67/* 56/*
68 * Recalculate the mask of events relevant to a given vfsmount locked.
69 */
70static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
71{
72 struct mount *m = real_mount(mnt);
73 struct fsnotify_mark *mark;
74 __u32 new_mask = 0;
75
76 assert_spin_locked(&mnt->mnt_root->d_lock);
77
78 hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list)
79 new_mask |= mark->mask;
80 m->mnt_fsnotify_mask = new_mask;
81}
82
83/*
84 * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types 57 * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
85 * any notifier is interested in hearing for this mount point 58 * any notifier is interested in hearing for this mount point
86 */ 59 */
87void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) 60void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
88{ 61{
62 struct mount *m = real_mount(mnt);
63
89 spin_lock(&mnt->mnt_root->d_lock); 64 spin_lock(&mnt->mnt_root->d_lock);
90 fsnotify_recalc_vfsmount_mask_locked(mnt); 65 m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
91 spin_unlock(&mnt->mnt_root->d_lock); 66 spin_unlock(&mnt->mnt_root->d_lock);
92} 67}
93 68
94void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) 69void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
95{ 70{
96 struct vfsmount *mnt = mark->m.mnt; 71 struct vfsmount *mnt = mark->mnt;
72 struct mount *m = real_mount(mnt);
97 73
98 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); 74 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
99 assert_spin_locked(&mark->lock); 75 assert_spin_locked(&mark->lock);
100 76
101 spin_lock(&mnt->mnt_root->d_lock); 77 spin_lock(&mnt->mnt_root->d_lock);
102 78
103 hlist_del_init_rcu(&mark->m.m_list); 79 hlist_del_init_rcu(&mark->obj_list);
104 mark->m.mnt = NULL; 80 mark->mnt = NULL;
105
106 fsnotify_recalc_vfsmount_mask_locked(mnt);
107 81
82 m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
108 spin_unlock(&mnt->mnt_root->d_lock); 83 spin_unlock(&mnt->mnt_root->d_lock);
109} 84}
110 85
111static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
112 struct vfsmount *mnt)
113{
114 struct mount *m = real_mount(mnt);
115 struct fsnotify_mark *mark;
116
117 assert_spin_locked(&mnt->mnt_root->d_lock);
118
119 hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) {
120 if (mark->group == group) {
121 fsnotify_get_mark(mark);
122 return mark;
123 }
124 }
125 return NULL;
126}
127
128/* 86/*
129 * given a group and vfsmount, find the mark associated with that combination. 87 * given a group and vfsmount, find the mark associated with that combination.
130 * if found take a reference to that mark and return it, else return NULL 88 * if found take a reference to that mark and return it, else return NULL
@@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_
132struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, 90struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
133 struct vfsmount *mnt) 91 struct vfsmount *mnt)
134{ 92{
93 struct mount *m = real_mount(mnt);
135 struct fsnotify_mark *mark; 94 struct fsnotify_mark *mark;
136 95
137 spin_lock(&mnt->mnt_root->d_lock); 96 spin_lock(&mnt->mnt_root->d_lock);
138 mark = fsnotify_find_vfsmount_mark_locked(group, mnt); 97 mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
139 spin_unlock(&mnt->mnt_root->d_lock); 98 spin_unlock(&mnt->mnt_root->d_lock);
140 99
141 return mark; 100 return mark;
@@ -151,9 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
151 int allow_dups) 110 int allow_dups)
152{ 111{
153 struct mount *m = real_mount(mnt); 112 struct mount *m = real_mount(mnt);
154 struct fsnotify_mark *lmark, *last = NULL; 113 int ret;
155 int ret = 0;
156 int cmp;
157 114
158 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; 115 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
159 116
@@ -161,37 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
161 assert_spin_locked(&mark->lock); 118 assert_spin_locked(&mark->lock);
162 119
163 spin_lock(&mnt->mnt_root->d_lock); 120 spin_lock(&mnt->mnt_root->d_lock);
164 121 mark->mnt = mnt;
165 mark->m.mnt = mnt; 122 ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
166 123 m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
167 /* is mark the first mark? */
168 if (hlist_empty(&m->mnt_fsnotify_marks)) {
169 hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks);
170 goto out;
171 }
172
173 /* should mark be in the middle of the current list? */
174 hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) {
175 last = lmark;
176
177 if ((lmark->group == group) && !allow_dups) {
178 ret = -EEXIST;
179 goto out;
180 }
181
182 cmp = fsnotify_compare_groups(lmark->group, mark->group);
183 if (cmp < 0)
184 continue;
185
186 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
187 goto out;
188 }
189
190 BUG_ON(last == NULL);
191 /* mark should be the last entry. last is the current last entry */
192 hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);
193out:
194 fsnotify_recalc_vfsmount_mask_locked(mnt);
195 spin_unlock(&mnt->mnt_root->d_lock); 124 spin_unlock(&mnt->mnt_root->d_lock);
196 125
197 return ret; 126 return ret;
diff --git a/fs/open.c b/fs/open.c
index b1bf3d542d5d..d45bd905d418 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -295,6 +295,17 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
295 295
296 sb_start_write(inode->i_sb); 296 sb_start_write(inode->i_sb);
297 ret = file->f_op->fallocate(file, mode, offset, len); 297 ret = file->f_op->fallocate(file, mode, offset, len);
298
299 /*
300 * Create inotify and fanotify events.
301 *
302 * To keep the logic simple always create events if fallocate succeeds.
303 * This implies that events are even created if the file size remains
304 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
305 */
306 if (ret == 0)
307 fsnotify_modify(file);
308
298 sb_end_write(inode->i_sb); 309 sb_end_write(inode->i_sb);
299 return ret; 310 return ret;
300} 311}
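
In practice this means an inotify watcher now sees IN_MODIFY (and a fanotify listener FAN_MODIFY) whenever another process fallocates the watched file, even with FALLOC_FL_KEEP_SIZE. A minimal observer, as a sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int ifd = inotify_init1(0);

	inotify_add_watch(ifd, "f.dat", IN_MODIFY);
	/* elsewhere: fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20); */
	if (read(ifd, buf, sizeof(buf)) > 0)
		printf("IN_MODIFY delivered for f.dat\n");
	close(ifd);
	return 0;
}
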
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 353948ba1c5b..dbf3a59c86bb 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size)
25{ 25{
26 void *buf; 26 void *buf;
27 27
28 buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); 28 /*
29 * __GFP_NORETRY to avoid oom-killings with high-order allocations -
30 * it's better to fall back to vmalloc() than to kill things.
31 */
32 buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
29 if (!buf && size > PAGE_SIZE) 33 if (!buf && size > PAGE_SIZE)
30 buf = vmalloc(size); 34 buf = vmalloc(size);
31 return buf; 35 return buf;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 61f29e5ea840..576e4639ca60 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -53,6 +53,10 @@ struct linux_binprm {
53#define BINPRM_FLAGS_EXECFD_BIT 1 53#define BINPRM_FLAGS_EXECFD_BIT 1
54#define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) 54#define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)
55 55
56/* filename of the binary will be inaccessible after exec */
57#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
58#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
59
56/* Function parameter for binfmt->coredump */ 60/* Function parameter for binfmt->coredump */
57struct coredump_params { 61struct coredump_params {
58 const siginfo_t *siginfo; 62 const siginfo_t *siginfo;
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index e1c8d080c427..34e020c23644 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@
45 * bitmap_set(dst, pos, nbits) Set specified bit area 45 * bitmap_set(dst, pos, nbits) Set specified bit area
46 * bitmap_clear(dst, pos, nbits) Clear specified bit area 46 * bitmap_clear(dst, pos, nbits) Clear specified bit area
47 * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area 47 * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
48 * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above
48 * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n 49 * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
49 * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n 50 * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
50 * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) 51 * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
@@ -114,11 +115,36 @@ extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
114 115
115extern void bitmap_set(unsigned long *map, unsigned int start, int len); 116extern void bitmap_set(unsigned long *map, unsigned int start, int len);
116extern void bitmap_clear(unsigned long *map, unsigned int start, int len); 117extern void bitmap_clear(unsigned long *map, unsigned int start, int len);
117extern unsigned long bitmap_find_next_zero_area(unsigned long *map, 118
118 unsigned long size, 119extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
119 unsigned long start, 120 unsigned long size,
120 unsigned int nr, 121 unsigned long start,
121 unsigned long align_mask); 122 unsigned int nr,
123 unsigned long align_mask,
124 unsigned long align_offset);
125
126/**
127 * bitmap_find_next_zero_area - find a contiguous aligned zero area
128 * @map: The address to base the search on
129 * @size: The bitmap size in bits
130 * @start: The bitnumber to start searching at
131 * @nr: The number of zeroed bits we're looking for
132 * @align_mask: Alignment mask for zero area
133 *
134 * The @align_mask should be one less than a power of 2; the effect is that
135 * the bit offset of all zero areas this function finds is a multiple of that
136 * power of 2. An @align_mask of 0 means no alignment is required.
137 */
138static inline unsigned long
139bitmap_find_next_zero_area(unsigned long *map,
140 unsigned long size,
141 unsigned long start,
142 unsigned int nr,
143 unsigned long align_mask)
144{
145 return bitmap_find_next_zero_area_off(map, size, start, nr,
146 align_mask, 0);
147}
122 148
123extern int bitmap_scnprintf(char *buf, unsigned int len, 149extern int bitmap_scnprintf(char *buf, unsigned int len,
124 const unsigned long *src, int nbits); 150 const unsigned long *src, int nbits);
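
The new bitmap_find_next_zero_area_off() applies the alignment constraint to (index + align_offset) instead of the index itself, and the old entry point becomes an inline wrapper with offset 0. Here is a naive userspace model of the documented contract, scanning bit by bit rather than word by word; it is illustrative only and not the kernel implementation.

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static int test_bit(const unsigned long *map, unsigned long i)
{
    return (map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1UL;
}

/* Return the first index >= start such that (index + align_offset) is a
 * multiple of (align_mask + 1) and bits [index, index + nr) are all zero;
 * return size if no such area exists. align_mask must be 2^k - 1. */
static unsigned long find_zero_area_off(const unsigned long *map,
                                        unsigned long size,
                                        unsigned long start,
                                        unsigned int nr,
                                        unsigned long align_mask,
                                        unsigned long align_offset)
{
    for (unsigned long i = start; i + nr <= size; i++) {
        if ((i + align_offset) & align_mask)
            continue;
        unsigned int j;
        for (j = 0; j < nr && !test_bit(map, i + j); j++)
            ;
        if (j == nr)
            return i;
    }
    return size;
}

int main(void)
{
    unsigned long map[1] = { 0x0000000fUL };    /* bits 0..3 busy */

    /* 4 free bits, 4-aligned: first fit is bit 4. */
    printf("%lu\n", find_zero_area_off(map, 32, 0, 4, 3, 0));
    /* Same, but the alignment applies to (index + 2): first fit is bit 6. */
    printf("%lu\n", find_zero_area_off(map, 32, 0, 4, 3, 2));
    return 0;
}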
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e6494261eaff..7450ca2ac1fc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
357 357
358asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, 358asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
359 const compat_uptr_t __user *envp); 359 const compat_uptr_t __user *envp);
360asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
361 const compat_uptr_t __user *argv,
362 const compat_uptr_t __user *envp, int flags);
360 363
361asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, 364asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
362 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 365 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index c6f996f2abb6..798fad9e420d 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -5,6 +5,7 @@
5 5
6#include <linux/types.h> 6#include <linux/types.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/ratelimit.h>
8#include <linux/atomic.h> 9#include <linux/atomic.h>
9 10
10/* 11/*
@@ -25,14 +26,18 @@ struct fault_attr {
25 unsigned long reject_end; 26 unsigned long reject_end;
26 27
27 unsigned long count; 28 unsigned long count;
29 struct ratelimit_state ratelimit_state;
30 struct dentry *dname;
28}; 31};
29 32
30#define FAULT_ATTR_INITIALIZER { \ 33#define FAULT_ATTR_INITIALIZER { \
31 .interval = 1, \ 34 .interval = 1, \
32 .times = ATOMIC_INIT(1), \ 35 .times = ATOMIC_INIT(1), \
33 .require_end = ULONG_MAX, \ 36 .require_end = ULONG_MAX, \
34 .stacktrace_depth = 32, \ 37 .stacktrace_depth = 32, \
35 .verbose = 2, \ 38 .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \
39 .verbose = 2, \
40 .dname = NULL, \
36 } 41 }
37 42
38#define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER 43#define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bb29b02d9bb6..4193a0bd99b0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -18,6 +18,7 @@
18#include <linux/pid.h> 18#include <linux/pid.h>
19#include <linux/bug.h> 19#include <linux/bug.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/rwsem.h>
21#include <linux/capability.h> 22#include <linux/capability.h>
22#include <linux/semaphore.h> 23#include <linux/semaphore.h>
23#include <linux/fiemap.h> 24#include <linux/fiemap.h>
@@ -401,7 +402,7 @@ struct address_space {
401 atomic_t i_mmap_writable;/* count VM_SHARED mappings */ 402 atomic_t i_mmap_writable;/* count VM_SHARED mappings */
402 struct rb_root i_mmap; /* tree of private and shared mappings */ 403 struct rb_root i_mmap; /* tree of private and shared mappings */
403 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ 404 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
404 struct mutex i_mmap_mutex; /* protect tree, count, list */ 405 struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
405 /* Protected by tree_lock together with the radix tree */ 406 /* Protected by tree_lock together with the radix tree */
406 unsigned long nrpages; /* number of total pages */ 407 unsigned long nrpages; /* number of total pages */
407 unsigned long nrshadows; /* number of shadow entries */ 408 unsigned long nrshadows; /* number of shadow entries */
@@ -467,6 +468,26 @@ struct block_device {
467 468
468int mapping_tagged(struct address_space *mapping, int tag); 469int mapping_tagged(struct address_space *mapping, int tag);
469 470
471static inline void i_mmap_lock_write(struct address_space *mapping)
472{
473 down_write(&mapping->i_mmap_rwsem);
474}
475
476static inline void i_mmap_unlock_write(struct address_space *mapping)
477{
478 up_write(&mapping->i_mmap_rwsem);
479}
480
481static inline void i_mmap_lock_read(struct address_space *mapping)
482{
483 down_read(&mapping->i_mmap_rwsem);
484}
485
486static inline void i_mmap_unlock_read(struct address_space *mapping)
487{
488 up_read(&mapping->i_mmap_rwsem);
489}
490
470/* 491/*
471 * Might pages of this file be mapped into userspace? 492 * Might pages of this file be mapped into userspace?
472 */ 493 */
@@ -2075,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
2075extern struct file * dentry_open(const struct path *, int, const struct cred *); 2096extern struct file * dentry_open(const struct path *, int, const struct cred *);
2076extern int filp_close(struct file *, fl_owner_t id); 2097extern int filp_close(struct file *, fl_owner_t id);
2077 2098
2099extern struct filename *getname_flags(const char __user *, int, int *);
2078extern struct filename *getname(const char __user *); 2100extern struct filename *getname(const char __user *);
2079extern struct filename *getname_kernel(const char *); 2101extern struct filename *getname_kernel(const char *);
2080 2102
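
Converting i_mmap_mutex into i_mmap_rwsem means paths that only read the i_mmap interval tree can take the lock shared and run concurrently; the helpers added above are thin wrappers around down_read()/down_write(). The same wrapper style in userspace with a pthread rwlock, purely as an analogue (nothing here is kernel API):

#include <pthread.h>
#include <stdio.h>

/* Stand-in for address_space: one rwlock guarding a shared structure. */
struct mapping {
    pthread_rwlock_t rwsem;
    long tree_nodes;            /* pretend this is the i_mmap tree */
};

static void mapping_lock_read(struct mapping *m)    { pthread_rwlock_rdlock(&m->rwsem); }
static void mapping_unlock_read(struct mapping *m)  { pthread_rwlock_unlock(&m->rwsem); }
static void mapping_lock_write(struct mapping *m)   { pthread_rwlock_wrlock(&m->rwsem); }
static void mapping_unlock_write(struct mapping *m) { pthread_rwlock_unlock(&m->rwsem); }

static void *reader(void *arg)
{
    struct mapping *m = arg;

    mapping_lock_read(m);               /* many readers may hold this at once */
    printf("reader sees %ld nodes\n", m->tree_nodes);
    mapping_unlock_read(m);
    return NULL;
}

int main(void)
{
    struct mapping m = { .rwsem = PTHREAD_RWLOCK_INITIALIZER, .tree_nodes = 0 };
    pthread_t t[4];

    mapping_lock_write(&m);             /* exclusive, like i_mmap_lock_write() */
    m.tree_nodes = 42;
    mapping_unlock_write(&m);

    for (int i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, reader, &m);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    return 0;                           /* build with -lpthread */
}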
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index ca060d7c4fa6..0f313f93c586 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -197,24 +197,6 @@ struct fsnotify_group {
197#define FSNOTIFY_EVENT_INODE 2 197#define FSNOTIFY_EVENT_INODE 2
198 198
199/* 199/*
200 * Inode specific fields in an fsnotify_mark
201 */
202struct fsnotify_inode_mark {
203 struct inode *inode; /* inode this mark is associated with */
204 struct hlist_node i_list; /* list of marks by inode->i_fsnotify_marks */
205 struct list_head free_i_list; /* tmp list used when freeing this mark */
206};
207
208/*
209 * Mount point specific fields in an fsnotify_mark
210 */
211struct fsnotify_vfsmount_mark {
212 struct vfsmount *mnt; /* vfsmount this mark is associated with */
213 struct hlist_node m_list; /* list of marks by inode->i_fsnotify_marks */
214 struct list_head free_m_list; /* tmp list used when freeing this mark */
215};
216
217/*
218 * a mark is simply an object attached to an in core inode which allows an 200 * a mark is simply an object attached to an in core inode which allows an
219 * fsnotify listener to indicate they are either no longer interested in events 201 * fsnotify listener to indicate they are either no longer interested in events
220 * of a type matching mask or only interested in those events. 202 * of a type matching mask or only interested in those events.
@@ -230,11 +212,17 @@ struct fsnotify_mark {
230 * in kernel that found and may be using this mark. */ 212 * in kernel that found and may be using this mark. */
231 atomic_t refcnt; /* active things looking at this mark */ 213 atomic_t refcnt; /* active things looking at this mark */
232 struct fsnotify_group *group; /* group this mark is for */ 214 struct fsnotify_group *group; /* group this mark is for */
233 struct list_head g_list; /* list of marks by group->i_fsnotify_marks */ 215 struct list_head g_list; /* list of marks by group->i_fsnotify_marks
216 * Also reused for queueing mark into
217 * destroy_list when it's waiting for
218 * the end of SRCU period before it can
219 * be freed */
234 spinlock_t lock; /* protect group and inode */ 220 spinlock_t lock; /* protect group and inode */
221 struct hlist_node obj_list; /* list of marks for inode / vfsmount */
222 struct list_head free_list; /* tmp list used when freeing this mark */
235 union { 223 union {
236 struct fsnotify_inode_mark i; 224 struct inode *inode; /* inode this mark is associated with */
237 struct fsnotify_vfsmount_mark m; 225 struct vfsmount *mnt; /* vfsmount this mark is associated with */
238 }; 226 };
239 __u32 ignored_mask; /* events types to ignore */ 227 __u32 ignored_mask; /* events types to ignore */
240#define FSNOTIFY_MARK_FLAG_INODE 0x01 228#define FSNOTIFY_MARK_FLAG_INODE 0x01
@@ -243,7 +231,6 @@ struct fsnotify_mark {
243#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 231#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
244#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 232#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
245 unsigned int flags; /* vfsmount or inode mark? */ 233 unsigned int flags; /* vfsmount or inode mark? */
246 struct list_head destroy_list;
247 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ 234 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
248}; 235};
249 236
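
After the unification a mark keeps a single obj_list node and a union holding either the inode or the vfsmount pointer, with the existing FSNOTIFY_MARK_FLAG_INODE / FSNOTIFY_MARK_FLAG_VFSMOUNT flags saying which member is live. A compact userspace sketch of that tagged-union shape (types simplified, not the kernel structs):

#include <stdio.h>

#define MARK_FLAG_INODE    0x01
#define MARK_FLAG_VFSMOUNT 0x02

struct inode;       /* opaque stand-ins */
struct vfsmount;

struct mark {
    unsigned int flags;
    union {
        struct inode *inode;        /* valid when MARK_FLAG_INODE is set */
        struct vfsmount *mnt;       /* valid when MARK_FLAG_VFSMOUNT is set */
    };
};

static const char *mark_target(const struct mark *m)
{
    if (m->flags & MARK_FLAG_INODE)
        return "inode";
    if (m->flags & MARK_FLAG_VFSMOUNT)
        return "vfsmount";
    return "unattached";
}

int main(void)
{
    struct mark m = { .flags = MARK_FLAG_VFSMOUNT, .mnt = NULL };

    printf("mark is attached to a %s\n", mark_target(&m));
    return 0;
}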
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 07d2699cdb51..b840e3b2770d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -110,11 +110,8 @@ struct vm_area_struct;
110#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ 110#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
111 __GFP_RECLAIMABLE) 111 __GFP_RECLAIMABLE)
112#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) 112#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
113#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ 113#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
114 __GFP_HIGHMEM) 114#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
115#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
116 __GFP_HARDWALL | __GFP_HIGHMEM | \
117 __GFP_MOVABLE)
118#define GFP_IOFS (__GFP_IO | __GFP_FS) 115#define GFP_IOFS (__GFP_IO | __GFP_FS)
119#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ 116#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
120 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ 117 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 35e7eca4e33b..e365d5ec69cb 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -7,15 +7,6 @@
7#include <linux/notifier.h> 7#include <linux/notifier.h>
8#include <linux/nsproxy.h> 8#include <linux/nsproxy.h>
9 9
10/*
11 * ipc namespace events
12 */
13#define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */
14#define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */
15#define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */
16
17#define IPCNS_CALLBACK_PRI 0
18
19struct user_namespace; 10struct user_namespace;
20 11
21struct ipc_ids { 12struct ipc_ids {
@@ -38,7 +29,6 @@ struct ipc_namespace {
38 unsigned int msg_ctlmni; 29 unsigned int msg_ctlmni;
39 atomic_t msg_bytes; 30 atomic_t msg_bytes;
40 atomic_t msg_hdrs; 31 atomic_t msg_hdrs;
41 int auto_msgmni;
42 32
43 size_t shm_ctlmax; 33 size_t shm_ctlmax;
44 size_t shm_ctlall; 34 size_t shm_ctlall;
@@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns;
77extern spinlock_t mq_lock; 67extern spinlock_t mq_lock;
78 68
79#ifdef CONFIG_SYSVIPC 69#ifdef CONFIG_SYSVIPC
80extern int register_ipcns_notifier(struct ipc_namespace *);
81extern int cond_register_ipcns_notifier(struct ipc_namespace *);
82extern void unregister_ipcns_notifier(struct ipc_namespace *);
83extern int ipcns_notify(unsigned long);
84extern void shm_destroy_orphaned(struct ipc_namespace *ns); 70extern void shm_destroy_orphaned(struct ipc_namespace *ns);
85#else /* CONFIG_SYSVIPC */ 71#else /* CONFIG_SYSVIPC */
86static inline int register_ipcns_notifier(struct ipc_namespace *ns)
87{ return 0; }
88static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
89{ return 0; }
90static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
91static inline int ipcns_notify(unsigned long l) { return 0; }
92static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} 72static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
93#endif /* CONFIG_SYSVIPC */ 73#endif /* CONFIG_SYSVIPC */
94 74
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 057e95971014..e705467ddb47 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -21,6 +21,8 @@
21#ifndef __KMEMLEAK_H 21#ifndef __KMEMLEAK_H
22#define __KMEMLEAK_H 22#define __KMEMLEAK_H
23 23
24#include <linux/slab.h>
25
24#ifdef CONFIG_DEBUG_KMEMLEAK 26#ifdef CONFIG_DEBUG_KMEMLEAK
25 27
26extern void kmemleak_init(void) __ref; 28extern void kmemleak_init(void) __ref;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6ea9f919e888..7c95af8d552c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
400 400
401void memcg_update_array_size(int num_groups); 401void memcg_update_array_size(int num_groups);
402 402
403struct kmem_cache * 403struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
404__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); 404void __memcg_kmem_put_cache(struct kmem_cache *cachep);
405 405
406int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); 406int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
407void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); 407void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
@@ -492,7 +492,13 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
492 if (unlikely(fatal_signal_pending(current))) 492 if (unlikely(fatal_signal_pending(current)))
493 return cachep; 493 return cachep;
494 494
495 return __memcg_kmem_get_cache(cachep, gfp); 495 return __memcg_kmem_get_cache(cachep);
496}
497
498static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
499{
500 if (memcg_kmem_enabled())
501 __memcg_kmem_put_cache(cachep);
496} 502}
497#else 503#else
498#define for_each_memcg_cache_index(_idx) \ 504#define for_each_memcg_cache_index(_idx) \
@@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
528{ 534{
529 return cachep; 535 return cachep;
530} 536}
537
538static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
539{
540}
531#endif /* CONFIG_MEMCG_KMEM */ 541#endif /* CONFIG_MEMCG_KMEM */
532#endif /* _LINUX_MEMCONTROL_H */ 542#endif /* _LINUX_MEMCONTROL_H */
533 543
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b337efbe533..c0a67b894c4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,6 +19,7 @@
19#include <linux/bit_spinlock.h> 19#include <linux/bit_spinlock.h>
20#include <linux/shrinker.h> 20#include <linux/shrinker.h>
21#include <linux/resource.h> 21#include <linux/resource.h>
22#include <linux/page_ext.h>
22 23
23struct mempolicy; 24struct mempolicy;
24struct anon_vma; 25struct anon_vma;
@@ -2060,7 +2061,22 @@ static inline void vm_stat_account(struct mm_struct *mm,
2060#endif /* CONFIG_PROC_FS */ 2061#endif /* CONFIG_PROC_FS */
2061 2062
2062#ifdef CONFIG_DEBUG_PAGEALLOC 2063#ifdef CONFIG_DEBUG_PAGEALLOC
2063extern void kernel_map_pages(struct page *page, int numpages, int enable); 2064extern bool _debug_pagealloc_enabled;
2065extern void __kernel_map_pages(struct page *page, int numpages, int enable);
2066
2067static inline bool debug_pagealloc_enabled(void)
2068{
2069 return _debug_pagealloc_enabled;
2070}
2071
2072static inline void
2073kernel_map_pages(struct page *page, int numpages, int enable)
2074{
2075 if (!debug_pagealloc_enabled())
2076 return;
2077
2078 __kernel_map_pages(page, numpages, enable);
2079}
2064#ifdef CONFIG_HIBERNATION 2080#ifdef CONFIG_HIBERNATION
2065extern bool kernel_page_present(struct page *page); 2081extern bool kernel_page_present(struct page *page);
2066#endif /* CONFIG_HIBERNATION */ 2082#endif /* CONFIG_HIBERNATION */
@@ -2094,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
2094 void __user *, size_t *, loff_t *); 2110 void __user *, size_t *, loff_t *);
2095#endif 2111#endif
2096 2112
2097unsigned long shrink_slab(struct shrink_control *shrink, 2113unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
2098 unsigned long nr_pages_scanned, 2114 unsigned long nr_scanned,
2099 unsigned long lru_pages); 2115 unsigned long nr_eligible);
2100 2116
2101#ifndef CONFIG_MMU 2117#ifndef CONFIG_MMU
2102#define randomize_va_space 0 2118#define randomize_va_space 0
@@ -2155,20 +2171,36 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
2155 unsigned int pages_per_huge_page); 2171 unsigned int pages_per_huge_page);
2156#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 2172#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
2157 2173
2174extern struct page_ext_operations debug_guardpage_ops;
2175extern struct page_ext_operations page_poisoning_ops;
2176
2158#ifdef CONFIG_DEBUG_PAGEALLOC 2177#ifdef CONFIG_DEBUG_PAGEALLOC
2159extern unsigned int _debug_guardpage_minorder; 2178extern unsigned int _debug_guardpage_minorder;
2179extern bool _debug_guardpage_enabled;
2160 2180
2161static inline unsigned int debug_guardpage_minorder(void) 2181static inline unsigned int debug_guardpage_minorder(void)
2162{ 2182{
2163 return _debug_guardpage_minorder; 2183 return _debug_guardpage_minorder;
2164} 2184}
2165 2185
2186static inline bool debug_guardpage_enabled(void)
2187{
2188 return _debug_guardpage_enabled;
2189}
2190
2166static inline bool page_is_guard(struct page *page) 2191static inline bool page_is_guard(struct page *page)
2167{ 2192{
2168 return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 2193 struct page_ext *page_ext;
2194
2195 if (!debug_guardpage_enabled())
2196 return false;
2197
2198 page_ext = lookup_page_ext(page);
2199 return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
2169} 2200}
2170#else 2201#else
2171static inline unsigned int debug_guardpage_minorder(void) { return 0; } 2202static inline unsigned int debug_guardpage_minorder(void) { return 0; }
2203static inline bool debug_guardpage_enabled(void) { return false; }
2172static inline bool page_is_guard(struct page *page) { return false; } 2204static inline bool page_is_guard(struct page *page) { return false; }
2173#endif /* CONFIG_DEBUG_PAGEALLOC */ 2205#endif /* CONFIG_DEBUG_PAGEALLOC */
2174 2206
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bf9f57529dcf..6d34aa266a8c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,7 +10,6 @@
10#include <linux/rwsem.h> 10#include <linux/rwsem.h>
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/page-debug-flags.h>
14#include <linux/uprobes.h> 13#include <linux/uprobes.h>
15#include <linux/page-flags-layout.h> 14#include <linux/page-flags-layout.h>
16#include <asm/page.h> 15#include <asm/page.h>
@@ -186,9 +185,6 @@ struct page {
186 void *virtual; /* Kernel virtual address (NULL if 185 void *virtual; /* Kernel virtual address (NULL if
187 not kmapped, ie. highmem) */ 186 not kmapped, ie. highmem) */
188#endif /* WANT_PAGE_VIRTUAL */ 187#endif /* WANT_PAGE_VIRTUAL */
189#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
190 unsigned long debug_flags; /* Use atomic bitops on this */
191#endif
192 188
193#ifdef CONFIG_KMEMCHECK 189#ifdef CONFIG_KMEMCHECK
194 /* 190 /*
@@ -534,4 +530,12 @@ enum tlb_flush_reason {
534 NR_TLB_FLUSH_REASONS, 530 NR_TLB_FLUSH_REASONS,
535}; 531};
536 532
533 /*
534 * A swap entry has to fit into a "unsigned long", as the entry is hidden
535 * in the "index" field of the swapper address space.
536 */
537typedef struct {
538 unsigned long val;
539} swp_entry_t;
540
537#endif /* _LINUX_MM_TYPES_H */ 541#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 88787bb4b3b9..ab8564b03468 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -154,7 +154,7 @@ struct mmu_notifier_ops {
154 * Therefore notifier chains can only be traversed when either 154 * Therefore notifier chains can only be traversed when either
155 * 155 *
156 * 1. mmap_sem is held. 156 * 1. mmap_sem is held.
157 * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem). 157 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
158 * 3. No other concurrent thread can access the list (release) 158 * 3. No other concurrent thread can access the list (release)
159 */ 159 */
160struct mmu_notifier { 160struct mmu_notifier {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3879d7664dfc..2f0856d14b21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -722,6 +722,9 @@ typedef struct pglist_data {
722 int nr_zones; 722 int nr_zones;
723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ 723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
724 struct page *node_mem_map; 724 struct page *node_mem_map;
725#ifdef CONFIG_PAGE_EXTENSION
726 struct page_ext *node_page_ext;
727#endif
725#endif 728#endif
726#ifndef CONFIG_NO_BOOTMEM 729#ifndef CONFIG_NO_BOOTMEM
727 struct bootmem_data *bdata; 730 struct bootmem_data *bdata;
@@ -1075,6 +1078,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1075#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1078#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1076 1079
1077struct page; 1080struct page;
1081struct page_ext;
1078struct mem_section { 1082struct mem_section {
1079 /* 1083 /*
1080 * This is, logically, a pointer to an array of struct 1084 * This is, logically, a pointer to an array of struct
@@ -1092,6 +1096,14 @@ struct mem_section {
1092 1096
1093 /* See declaration of similar field in struct zone */ 1097 /* See declaration of similar field in struct zone */
1094 unsigned long *pageblock_flags; 1098 unsigned long *pageblock_flags;
1099#ifdef CONFIG_PAGE_EXTENSION
1100 /*
1101 * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use
1102 * section. (see page_ext.h about this.)
1103 */
1104 struct page_ext *page_ext;
1105 unsigned long pad;
1106#endif
1095 /* 1107 /*
1096 * WARNING: mem_section must be a power-of-2 in size for the 1108 * WARNING: mem_section must be a power-of-2 in size for the
1097 * calculation and use of SECTION_ROOT_MASK to make sense. 1109 * calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/oom.h b/include/linux/oom.h
index e8d6e1058723..853698c721f7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -92,6 +92,17 @@ static inline bool oom_gfp_allowed(gfp_t gfp_mask)
92 92
93extern struct task_struct *find_lock_task_mm(struct task_struct *p); 93extern struct task_struct *find_lock_task_mm(struct task_struct *p);
94 94
95static inline bool task_will_free_mem(struct task_struct *task)
96{
97 /*
98 * A coredumping process may sleep for an extended period in exit_mm(),
99 * so the oom killer cannot assume that the process will promptly exit
100 * and release memory.
101 */
102 return (task->flags & PF_EXITING) &&
103 !(task->signal->flags & SIGNAL_GROUP_COREDUMP);
104}
105
95/* sysctls */ 106/* sysctls */
96extern int sysctl_oom_dump_tasks; 107extern int sysctl_oom_dump_tasks;
97extern int sysctl_oom_kill_allocating_task; 108extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h
deleted file mode 100644
index 22691f614043..000000000000
--- a/include/linux/page-debug-flags.h
+++ /dev/null
@@ -1,32 +0,0 @@
1#ifndef LINUX_PAGE_DEBUG_FLAGS_H
2#define LINUX_PAGE_DEBUG_FLAGS_H
3
4/*
5 * page->debug_flags bits:
6 *
7 * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to
8 * implement generic debug pagealloc feature. The pages are filled with
9 * poison patterns and set this flag after free_pages(). The poisoned
10 * pages are verified whether the patterns are not corrupted and clear
11 * the flag before alloc_pages().
12 */
13
14enum page_debug_flags {
15 PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */
16 PAGE_DEBUG_FLAG_GUARD,
17};
18
19/*
20 * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably
21 * gets turned off when no debug features are enabling it!
22 */
23
24#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
25#if !defined(CONFIG_PAGE_POISONING) && \
26 !defined(CONFIG_PAGE_GUARD) \
27/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */
28#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features!
29#endif
30#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */
31
32#endif /* LINUX_PAGE_DEBUG_FLAGS_H */
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
new file mode 100644
index 000000000000..d2a2c84c72d0
--- /dev/null
+++ b/include/linux/page_ext.h
@@ -0,0 +1,84 @@
1#ifndef __LINUX_PAGE_EXT_H
2#define __LINUX_PAGE_EXT_H
3
4#include <linux/types.h>
5#include <linux/stacktrace.h>
6
7struct pglist_data;
8struct page_ext_operations {
9 bool (*need)(void);
10 void (*init)(void);
11};
12
13#ifdef CONFIG_PAGE_EXTENSION
14
15/*
16 * page_ext->flags bits:
17 *
18 * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to
19 * implement generic debug pagealloc feature. The pages are filled with
20 * poison patterns and set this flag after free_pages(). The poisoned
21 * pages are verified whether the patterns are not corrupted and clear
22 * the flag before alloc_pages().
23 */
24
25enum page_ext_flags {
26 PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
27 PAGE_EXT_DEBUG_GUARD,
28 PAGE_EXT_OWNER,
29};
30
31/*
32 * Page Extension can be considered as an extended mem_map.
33 * A page_ext page is associated with every page descriptor. The
34 * page_ext helps us add more information about the page.
35 * All page_ext structures are allocated at boot or on memory hotplug,
36 * so the page_ext for a pfn always exists.
37 */
38struct page_ext {
39 unsigned long flags;
40#ifdef CONFIG_PAGE_OWNER
41 unsigned int order;
42 gfp_t gfp_mask;
43 struct stack_trace trace;
44 unsigned long trace_entries[8];
45#endif
46};
47
48extern void pgdat_page_ext_init(struct pglist_data *pgdat);
49
50#ifdef CONFIG_SPARSEMEM
51static inline void page_ext_init_flatmem(void)
52{
53}
54extern void page_ext_init(void);
55#else
56extern void page_ext_init_flatmem(void);
57static inline void page_ext_init(void)
58{
59}
60#endif
61
62struct page_ext *lookup_page_ext(struct page *page);
63
64#else /* !CONFIG_PAGE_EXTENSION */
65struct page_ext;
66
67static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
68{
69}
70
71static inline struct page_ext *lookup_page_ext(struct page *page)
72{
73 return NULL;
74}
75
76static inline void page_ext_init(void)
77{
78}
79
80static inline void page_ext_init_flatmem(void)
81{
82}
83#endif /* CONFIG_PAGE_EXTENSION */
84#endif /* __LINUX_PAGE_EXT_H */
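
page_ext gives every struct page a parallel extension record, allocated at boot or hotplug time, so debug features (guard pages, poisoning, page_owner) do not have to grow struct page itself. A userspace model of the "parallel metadata array keyed by the object's offset" idea, assuming a flat range of frames; none of this is the kernel implementation.

#include <stdio.h>
#include <stdlib.h>

struct page { unsigned long flags; };           /* stripped-down stand-in */

struct page_ext_model {
    unsigned long flags;                        /* PAGE_EXT_* style bits */
    unsigned int order;                         /* page_owner-style extra data */
};

static struct page *mem_map;                    /* the "real" object array */
static struct page_ext_model *ext_map;          /* parallel extension array */

static int page_ext_init_model(unsigned long pages)
{
    mem_map = calloc(pages, sizeof(*mem_map));
    ext_map = calloc(pages, sizeof(*ext_map));
    return (mem_map && ext_map) ? 0 : -1;
}

/* Analogous to lookup_page_ext(): the extension record sits at the same
 * offset in the parallel array as the page does in mem_map. */
static struct page_ext_model *lookup_ext(struct page *page)
{
    return &ext_map[page - mem_map];
}

int main(void)
{
    if (page_ext_init_model(1024))
        return 1;

    struct page *p = &mem_map[17];
    lookup_ext(p)->order = 3;                   /* record something about page 17 */
    printf("page 17 order from extension: %u\n", lookup_ext(p)->order);

    free(mem_map);
    free(ext_map);
    return 0;
}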
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
new file mode 100644
index 000000000000..b48c3471c254
--- /dev/null
+++ b/include/linux/page_owner.h
@@ -0,0 +1,38 @@
1#ifndef __LINUX_PAGE_OWNER_H
2#define __LINUX_PAGE_OWNER_H
3
4#ifdef CONFIG_PAGE_OWNER
5extern bool page_owner_inited;
6extern struct page_ext_operations page_owner_ops;
7
8extern void __reset_page_owner(struct page *page, unsigned int order);
9extern void __set_page_owner(struct page *page,
10 unsigned int order, gfp_t gfp_mask);
11
12static inline void reset_page_owner(struct page *page, unsigned int order)
13{
14 if (likely(!page_owner_inited))
15 return;
16
17 __reset_page_owner(page, order);
18}
19
20static inline void set_page_owner(struct page *page,
21 unsigned int order, gfp_t gfp_mask)
22{
23 if (likely(!page_owner_inited))
24 return;
25
26 __set_page_owner(page, order, gfp_mask);
27}
28#else
29static inline void reset_page_owner(struct page *page, unsigned int order)
30{
31}
32static inline void set_page_owner(struct page *page,
33 unsigned int order, gfp_t gfp_mask)
34{
35}
36
37#endif /* CONFIG_PAGE_OWNER */
38#endif /* __LINUX_PAGE_OWNER_H */
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 420032d41d27..57f3a1c550dc 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -254,8 +254,6 @@ do { \
254#endif /* CONFIG_SMP */ 254#endif /* CONFIG_SMP */
255 255
256#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) 256#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu))
257#define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var)))
258#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
259 257
260/* 258/*
261 * Must be an lvalue. Since @var must be a simple identifier, 259 * Must be an lvalue. Since @var must be a simple identifier,
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 0a260d8a18bf..18102529254e 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -17,14 +17,20 @@ struct ratelimit_state {
17 unsigned long begin; 17 unsigned long begin;
18}; 18};
19 19
20#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ 20#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \
21 \
22 struct ratelimit_state name = { \
23 .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ 21 .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
24 .interval = interval_init, \ 22 .interval = interval_init, \
25 .burst = burst_init, \ 23 .burst = burst_init, \
26 } 24 }
27 25
26#define RATELIMIT_STATE_INIT_DISABLED \
27 RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST)
28
29#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \
30 \
31 struct ratelimit_state name = \
32 RATELIMIT_STATE_INIT(name, interval_init, burst_init) \
33
28static inline void ratelimit_state_init(struct ratelimit_state *rs, 34static inline void ratelimit_state_init(struct ratelimit_state *rs,
29 int interval, int burst) 35 int interval, int burst)
30{ 36{
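
RATELIMIT_STATE_INIT factors the initializer out of DEFINE_RATELIMIT_STATE so embedded users, such as the fault_attr.ratelimit_state added earlier in this series, can initialize the field in place; RATELIMIT_STATE_INIT_DISABLED uses an interval of 0, which the kernel's __ratelimit() treats as "always allow". A userspace approximation of the interval/burst behaviour, using wall-clock seconds instead of jiffies (not the kernel code):

#include <stdio.h>
#include <time.h>

struct rl_state {
    int interval;        /* window length in seconds; 0 = unlimited */
    int burst;           /* allowed events per window */
    int printed;
    time_t begin;
};

#define RL_INIT(interval_init, burst_init) \
    { .interval = (interval_init), .burst = (burst_init) }

/* Return 1 if the caller may proceed, 0 if it should be suppressed. */
static int rl_allow(struct rl_state *rs)
{
    time_t now = time(NULL);

    if (!rs->interval)
        return 1;                        /* disabled state: never throttle */
    if (!rs->begin || now - rs->begin >= rs->interval) {
        rs->begin = now;                 /* start a new window */
        rs->printed = 0;
    }
    if (rs->printed < rs->burst) {
        rs->printed++;
        return 1;
    }
    return 0;
}

int main(void)
{
    struct rl_state rs = RL_INIT(1, 3);  /* at most 3 events per second */

    for (int i = 0; i < 10; i++)
        if (rl_allow(&rs))
            printf("event %d allowed\n", i);
    return 0;
}

Having the initializer as a standalone macro is what allows the fault-injection code to keep the ratelimit disabled by default and only arm it when the debugfs knob is written.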
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55f5ee7cc3d3..8db31ef98d2f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1364,6 +1364,10 @@ struct task_struct {
1364 unsigned sched_reset_on_fork:1; 1364 unsigned sched_reset_on_fork:1;
1365 unsigned sched_contributes_to_load:1; 1365 unsigned sched_contributes_to_load:1;
1366 1366
1367#ifdef CONFIG_MEMCG_KMEM
1368 unsigned memcg_kmem_skip_account:1;
1369#endif
1370
1367 unsigned long atomic_flags; /* Flags needing atomic access. */ 1371 unsigned long atomic_flags; /* Flags needing atomic access. */
1368 1372
1369 pid_t pid; 1373 pid_t pid;
@@ -1679,8 +1683,7 @@ struct task_struct {
1679 /* bitmask and counter of trace recursion */ 1683 /* bitmask and counter of trace recursion */
1680 unsigned long trace_recursion; 1684 unsigned long trace_recursion;
1681#endif /* CONFIG_TRACING */ 1685#endif /* CONFIG_TRACING */
1682#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1686#ifdef CONFIG_MEMCG
1683 unsigned int memcg_kmem_skip_account;
1684 struct memcg_oom_info { 1687 struct memcg_oom_info {
1685 struct mem_cgroup *memcg; 1688 struct mem_cgroup *memcg;
1686 gfp_t gfp_mask; 1689 gfp_t gfp_mask;
@@ -2482,6 +2485,10 @@ extern void do_group_exit(int);
2482extern int do_execve(struct filename *, 2485extern int do_execve(struct filename *,
2483 const char __user * const __user *, 2486 const char __user * const __user *,
2484 const char __user * const __user *); 2487 const char __user * const __user *);
2488extern int do_execveat(int, struct filename *,
2489 const char __user * const __user *,
2490 const char __user * const __user *,
2491 int);
2485extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); 2492extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
2486struct task_struct *fork_idle(int); 2493struct task_struct *fork_idle(int);
2487extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); 2494extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 68c097077ef0..f4aee75f00b1 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -18,8 +18,6 @@ struct shrink_control {
18 */ 18 */
19 unsigned long nr_to_scan; 19 unsigned long nr_to_scan;
20 20
21 /* shrink from these nodes */
22 nodemask_t nodes_to_scan;
23 /* current node being shrunk (for NUMA aware shrinkers) */ 21 /* current node being shrunk (for NUMA aware shrinkers) */
24 int nid; 22 int nid;
25}; 23};
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8a2457d42fc8..9a139b637069 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
493 * @memcg: pointer to the memcg this cache belongs to 493 * @memcg: pointer to the memcg this cache belongs to
494 * @list: list_head for the list of all caches in this memcg 494 * @list: list_head for the list of all caches in this memcg
495 * @root_cache: pointer to the global, root cache, this cache was derived from 495 * @root_cache: pointer to the global, root cache, this cache was derived from
496 * @nr_pages: number of pages that belongs to this cache.
497 */ 496 */
498struct memcg_cache_params { 497struct memcg_cache_params {
499 bool is_root_cache; 498 bool is_root_cache;
@@ -506,7 +505,6 @@ struct memcg_cache_params {
506 struct mem_cgroup *memcg; 505 struct mem_cgroup *memcg;
507 struct list_head list; 506 struct list_head list;
508 struct kmem_cache *root_cache; 507 struct kmem_cache *root_cache;
509 atomic_t nr_pages;
510 }; 508 };
511 }; 509 };
512}; 510};
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 115b570e3bff..669045ab73f3 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -1,6 +1,8 @@
1#ifndef __LINUX_STACKTRACE_H 1#ifndef __LINUX_STACKTRACE_H
2#define __LINUX_STACKTRACE_H 2#define __LINUX_STACKTRACE_H
3 3
4#include <linux/types.h>
5
4struct task_struct; 6struct task_struct;
5struct pt_regs; 7struct pt_regs;
6 8
@@ -20,6 +22,8 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
20 struct stack_trace *trace); 22 struct stack_trace *trace);
21 23
22extern void print_stack_trace(struct stack_trace *trace, int spaces); 24extern void print_stack_trace(struct stack_trace *trace, int spaces);
25extern int snprint_stack_trace(char *buf, size_t size,
26 struct stack_trace *trace, int spaces);
23 27
24#ifdef CONFIG_USER_STACKTRACE_SUPPORT 28#ifdef CONFIG_USER_STACKTRACE_SUPPORT
25extern void save_stack_trace_user(struct stack_trace *trace); 29extern void save_stack_trace_user(struct stack_trace *trace);
@@ -32,6 +36,7 @@ extern void save_stack_trace_user(struct stack_trace *trace);
32# define save_stack_trace_tsk(tsk, trace) do { } while (0) 36# define save_stack_trace_tsk(tsk, trace) do { } while (0)
33# define save_stack_trace_user(trace) do { } while (0) 37# define save_stack_trace_user(trace) do { } while (0)
34# define print_stack_trace(trace, spaces) do { } while (0) 38# define print_stack_trace(trace, spaces) do { } while (0)
39# define snprint_stack_trace(buf, size, trace, spaces) do { } while (0)
35#endif 40#endif
36 41
37#endif 42#endif
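
snprint_stack_trace() is the buffer-formatting counterpart of print_stack_trace(), added so callers such as the new page_owner code can render a saved trace into a bounded buffer instead of printing it. A rough userspace analogue with glibc's backtrace facilities; the API differs, but the "format a captured trace into a size-limited buffer" shape is the same.

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

/* Format up to depth return addresses into buf, one per line, never writing
 * more than size bytes; stop on truncation. Returns the bytes used. */
static int snprint_backtrace(char *buf, size_t size, void **addrs, int depth)
{
    char **symbols = backtrace_symbols(addrs, depth);
    size_t used = 0;

    if (size)
        buf[0] = '\0';
    if (!symbols)
        return 0;
    for (int i = 0; i < depth; i++) {
        int n = snprintf(buf + used, size - used, " %s\n", symbols[i]);
        if (n < 0 || (size_t)n >= size - used)
            break;                      /* buffer full; buf stays terminated */
        used += n;
    }
    free(symbols);
    return (int)used;
}

static void leaf(void)
{
    void *addrs[16];
    char buf[2048];
    int depth = backtrace(addrs, 16);

    snprint_backtrace(buf, sizeof(buf), addrs, depth);
    fputs(buf, stdout);                 /* build with -rdynamic for symbol names */
}

int main(void)
{
    leaf();
    return 0;
}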
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 37a585beef5c..34e8b60ab973 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -102,14 +102,6 @@ union swap_header {
102 } info; 102 } info;
103}; 103};
104 104
105 /* A swap entry has to fit into a "unsigned long", as
106 * the entry is hidden in the "index" field of the
107 * swapper address space.
108 */
109typedef struct {
110 unsigned long val;
111} swp_entry_t;
112
113/* 105/*
114 * current->reclaim_state points to one of these when a task is running 106 * current->reclaim_state points to one of these when a task is running
115 * memory reclaim 107 * memory reclaim
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c9afdc7a7f84..85893d744901 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
877asmlinkage long sys_getrandom(char __user *buf, size_t count, 877asmlinkage long sys_getrandom(char __user *buf, size_t count,
878 unsigned int flags); 878 unsigned int flags);
879asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); 879asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
880
881asmlinkage long sys_execveat(int dfd, const char __user *filename,
882 const char __user *const __user *argv,
883 const char __user *const __user *envp, int flags);
884
880#endif 885#endif
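
sys_execveat() executes a file relative to a directory fd, or the fd itself when given an empty path plus AT_EMPTY_PATH; one of its stated goals is letting fexecve() work without /proc mounted. A minimal raw-syscall demo, assuming kernel headers new enough to define __NR_execveat (3.19+); the target binary path is arbitrary.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/bin/echo", O_PATH | O_CLOEXEC);
    char *argv[] = { "echo", "hello from execveat", NULL };
    char *envp[] = { NULL };

    if (fd < 0) {
        perror("open");
        return 1;
    }
#ifdef __NR_execveat
    /* Empty path + AT_EMPTY_PATH means "execute the fd itself", the
     * fexecve()-style use case. */
    syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
    perror("execveat");                 /* only reached on failure */
#else
    (void)argv;
    (void)envp;
    fprintf(stderr, "__NR_execveat not defined in these headers\n");
#endif
    return 1;
}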
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 730334cdf037..9246d32dc973 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -90,6 +90,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
90#ifdef CONFIG_DEBUG_VM_VMACACHE 90#ifdef CONFIG_DEBUG_VM_VMACACHE
91 VMACACHE_FIND_CALLS, 91 VMACACHE_FIND_CALLS,
92 VMACACHE_FIND_HITS, 92 VMACACHE_FIND_HITS,
93 VMACACHE_FULL_FLUSHES,
93#endif 94#endif
94 NR_VM_EVENT_ITEMS 95 NR_VM_EVENT_ITEMS
95}; 96};
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 22749c134117..e016bd9b1a04 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom)
707__SYSCALL(__NR_memfd_create, sys_memfd_create) 707__SYSCALL(__NR_memfd_create, sys_memfd_create)
708#define __NR_bpf 280 708#define __NR_bpf 280
709__SYSCALL(__NR_bpf, sys_bpf) 709__SYSCALL(__NR_bpf, sys_bpf)
710#define __NR_execveat 281
711__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
710 712
711#undef __NR_syscalls 713#undef __NR_syscalls
712#define __NR_syscalls 281 714#define __NR_syscalls 282
713 715
714/* 716/*
715 * All syscalls below here should go away really, 717 * All syscalls below here should go away really,
diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h
index a70375526578..f51c8001dbe5 100644
--- a/include/uapi/linux/msg.h
+++ b/include/uapi/linux/msg.h
@@ -51,16 +51,28 @@ struct msginfo {
51}; 51};
52 52
53/* 53/*
54 * Scaling factor to compute msgmni: 54 * MSGMNI, MSGMAX and MSGMNB are default values which can be
55 * the memory dedicated to msg queues (msgmni * msgmnb) should occupy 55 * modified by sysctl.
56 * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): 56 *
57 * up to 8MB : msgmni = 16 (MSGMNI) 57 * MSGMNI is the upper limit for the number of messages queues per
58 * 4 GB : msgmni = 8K 58 * namespace.
59 * more than 16 GB : msgmni = 32K (IPCMNI) 59 * It has been chosen to be as large possible without facilitating
60 * scenarios where userspace causes overflows when adjusting the limits via
61 * operations of the form retrieve current limit; add X; update limit".
62 *
63 * MSGMNB is the default size of a new message queue. Non-root tasks can
64 * decrease the size with msgctl(IPC_SET), root tasks
65 * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue
66 * size. The optimal value is application dependent.
67 * 16384 is used because it was always used (since 0.99.10)
68 *
69 * MSGMAX is the maximum size of an individual message; it's a global
70 * (per-namespace) limit that applies for all message queues.
71 * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into
72 * the queue. This is also an arbitrary choice (since 2.6.0).
60 */ 73 */
61#define MSG_MEM_SCALE 32
62 74
63#define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ 75#define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */
64#define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ 76#define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */
65#define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ 77#define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */
66 78
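
With the lowmem scaling gone, msgmni simply defaults to 32000 and only changes via sysctl. A quick userspace check of the effective limits through msgctl(IPC_INFO); the struct msginfo fields used are the standard SysV ones documented in msgctl(2).

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
    struct msginfo info;

    /* IPC_INFO reports the namespace-wide limits, not a specific queue. */
    if (msgctl(0, IPC_INFO, (struct msqid_ds *)&info) < 0) {
        perror("msgctl(IPC_INFO)");
        return 1;
    }
    printf("msgmni = %d (max queues)\n", info.msgmni);
    printf("msgmnb = %d (default queue size)\n", info.msgmnb);
    printf("msgmax = %d (max message size)\n", info.msgmax);
    return 0;
}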
diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h
index 541fce03b50c..dd73b908b2f3 100644
--- a/include/uapi/linux/sem.h
+++ b/include/uapi/linux/sem.h
@@ -63,10 +63,22 @@ struct seminfo {
63 int semaem; 63 int semaem;
64}; 64};
65 65
66#define SEMMNI 128 /* <= IPCMNI max # of semaphore identifiers */ 66/*
67#define SEMMSL 250 /* <= 8 000 max num of semaphores per id */ 67 * SEMMNI, SEMMSL and SEMMNS are default values which can be
68 * modified by sysctl.
69 * The values have been chosen to be larger than necessary for any
70 * known configuration.
71 *
72 * SEMOPM should not be increased beyond 1000, otherwise there is the
73 * risk that semop()/semtimedop() fails due to kernel memory fragmentation when
74 * allocating the sop array.
75 */
76
77
78#define SEMMNI 32000 /* <= IPCMNI max # of semaphore identifiers */
79#define SEMMSL 32000 /* <= INT_MAX max num of semaphores per id */
68#define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ 80#define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */
69#define SEMOPM 32 /* <= 1 000 max num of ops per semop call */ 81#define SEMOPM 500 /* <= 1 000 max num of ops per semop call */
70#define SEMVMX 32767 /* <= 32767 semaphore maximum value */ 82#define SEMVMX 32767 /* <= 32767 semaphore maximum value */
71#define SEMAEM SEMVMX /* adjust on exit max value */ 83#define SEMAEM SEMVMX /* adjust on exit max value */
72 84
diff --git a/init/main.c b/init/main.c
index ca380ec685de..ed7e7ad5fee0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,6 +51,7 @@
51#include <linux/mempolicy.h> 51#include <linux/mempolicy.h>
52#include <linux/key.h> 52#include <linux/key.h>
53#include <linux/buffer_head.h> 53#include <linux/buffer_head.h>
54#include <linux/page_ext.h>
54#include <linux/debug_locks.h> 55#include <linux/debug_locks.h>
55#include <linux/debugobjects.h> 56#include <linux/debugobjects.h>
56#include <linux/lockdep.h> 57#include <linux/lockdep.h>
@@ -484,6 +485,11 @@ void __init __weak thread_info_cache_init(void)
484 */ 485 */
485static void __init mm_init(void) 486static void __init mm_init(void)
486{ 487{
488 /*
489 * page_ext requires contiguous pages,
490 * bigger than MAX_ORDER unless SPARSEMEM.
491 */
492 page_ext_init_flatmem();
487 mem_init(); 493 mem_init();
488 kmem_cache_init(); 494 kmem_cache_init();
489 percpu_init_late(); 495 percpu_init_late();
@@ -621,6 +627,7 @@ asmlinkage __visible void __init start_kernel(void)
621 initrd_start = 0; 627 initrd_start = 0;
622 } 628 }
623#endif 629#endif
630 page_ext_init();
624 debug_objects_mem_init(); 631 debug_objects_mem_init();
625 kmemleak_init(); 632 kmemleak_init();
626 setup_per_cpu_pageset(); 633 setup_per_cpu_pageset();
diff --git a/ipc/Makefile b/ipc/Makefile
index 9075e172e52c..86c7300ecdf5 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o 5obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
6obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o 6obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o
7obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o 7obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
8obj_mq-$(CONFIG_COMPAT) += compat_mq.o 8obj_mq-$(CONFIG_COMPAT) += compat_mq.o
9obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) 9obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index e8075b247497..8ad93c29f511 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -62,29 +62,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
62 return err; 62 return err;
63} 63}
64 64
65static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write,
66 void __user *buffer, size_t *lenp, loff_t *ppos)
67{
68 struct ctl_table ipc_table;
69 size_t lenp_bef = *lenp;
70 int rc;
71
72 memcpy(&ipc_table, table, sizeof(ipc_table));
73 ipc_table.data = get_ipc(table);
74
75 rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
76
77 if (write && !rc && lenp_bef == *lenp)
78 /*
79 * Tunable has successfully been changed by hand. Disable its
80 * automatic adjustment. This simply requires unregistering
81 * the notifiers that trigger recalculation.
82 */
83 unregister_ipcns_notifier(current->nsproxy->ipc_ns);
84
85 return rc;
86}
87
88static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, 65static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
89 void __user *buffer, size_t *lenp, loff_t *ppos) 66 void __user *buffer, size_t *lenp, loff_t *ppos)
90{ 67{
@@ -96,54 +73,19 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
96 lenp, ppos); 73 lenp, ppos);
97} 74}
98 75
99/* 76static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
100 * Routine that is called when the file "auto_msgmni" has successfully been
101 * written.
102 * Two values are allowed:
103 * 0: unregister msgmni's callback routine from the ipc namespace notifier
104 * chain. This means that msgmni won't be recomputed anymore upon memory
105 * add/remove or ipc namespace creation/removal.
106 * 1: register back the callback routine.
107 */
108static void ipc_auto_callback(int val)
109{
110 if (!val)
111 unregister_ipcns_notifier(current->nsproxy->ipc_ns);
112 else {
113 /*
114 * Re-enable automatic recomputing only if not already
115 * enabled.
116 */
117 recompute_msgmni(current->nsproxy->ipc_ns);
118 cond_register_ipcns_notifier(current->nsproxy->ipc_ns);
119 }
120}
121
122static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write,
123 void __user *buffer, size_t *lenp, loff_t *ppos) 77 void __user *buffer, size_t *lenp, loff_t *ppos)
124{ 78{
125 struct ctl_table ipc_table; 79 struct ctl_table ipc_table;
126 int oldval; 80 int dummy = 0;
127 int rc;
128 81
129 memcpy(&ipc_table, table, sizeof(ipc_table)); 82 memcpy(&ipc_table, table, sizeof(ipc_table));
130 ipc_table.data = get_ipc(table); 83 ipc_table.data = &dummy;
131 oldval = *((int *)(ipc_table.data));
132 84
133 rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); 85 if (write)
86 pr_info_once("writing to auto_msgmni has no effect");
134 87
135 if (write && !rc) { 88 return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
136 int newval = *((int *)(ipc_table.data));
137 /*
138 * The file "auto_msgmni" has correctly been set.
139 * React by (un)registering the corresponding tunable, if the
140 * value has changed.
141 */
142 if (newval != oldval)
143 ipc_auto_callback(newval);
144 }
145
146 return rc;
147} 89}
148 90
149#else 91#else
@@ -151,8 +93,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write,
151#define proc_ipc_dointvec NULL 93#define proc_ipc_dointvec NULL
152#define proc_ipc_dointvec_minmax NULL 94#define proc_ipc_dointvec_minmax NULL
153#define proc_ipc_dointvec_minmax_orphans NULL 95#define proc_ipc_dointvec_minmax_orphans NULL
154#define proc_ipc_callback_dointvec_minmax NULL 96#define proc_ipc_auto_msgmni NULL
155#define proc_ipcauto_dointvec_minmax NULL
156#endif 97#endif
157 98
158static int zero; 99static int zero;
@@ -204,11 +145,20 @@ static struct ctl_table ipc_kern_table[] = {
204 .data = &init_ipc_ns.msg_ctlmni, 145 .data = &init_ipc_ns.msg_ctlmni,
205 .maxlen = sizeof(init_ipc_ns.msg_ctlmni), 146 .maxlen = sizeof(init_ipc_ns.msg_ctlmni),
206 .mode = 0644, 147 .mode = 0644,
207 .proc_handler = proc_ipc_callback_dointvec_minmax, 148 .proc_handler = proc_ipc_dointvec_minmax,
208 .extra1 = &zero, 149 .extra1 = &zero,
209 .extra2 = &int_max, 150 .extra2 = &int_max,
210 }, 151 },
211 { 152 {
153 .procname = "auto_msgmni",
154 .data = NULL,
155 .maxlen = sizeof(int),
156 .mode = 0644,
157 .proc_handler = proc_ipc_auto_msgmni,
158 .extra1 = &zero,
159 .extra2 = &one,
160 },
161 {
212 .procname = "msgmnb", 162 .procname = "msgmnb",
213 .data = &init_ipc_ns.msg_ctlmnb, 163 .data = &init_ipc_ns.msg_ctlmnb,
214 .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), 164 .maxlen = sizeof(init_ipc_ns.msg_ctlmnb),
@@ -224,15 +174,6 @@ static struct ctl_table ipc_kern_table[] = {
224 .mode = 0644, 174 .mode = 0644,
225 .proc_handler = proc_ipc_dointvec, 175 .proc_handler = proc_ipc_dointvec,
226 }, 176 },
227 {
228 .procname = "auto_msgmni",
229 .data = &init_ipc_ns.auto_msgmni,
230 .maxlen = sizeof(int),
231 .mode = 0644,
232 .proc_handler = proc_ipcauto_dointvec_minmax,
233 .extra1 = &zero,
234 .extra2 = &one,
235 },
236#ifdef CONFIG_CHECKPOINT_RESTORE 177#ifdef CONFIG_CHECKPOINT_RESTORE
237 { 178 {
238 .procname = "sem_next_id", 179 .procname = "sem_next_id",
diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c
deleted file mode 100644
index b9b31a4f77e1..000000000000
--- a/ipc/ipcns_notifier.c
+++ /dev/null
@@ -1,92 +0,0 @@
1/*
2 * linux/ipc/ipcns_notifier.c
3 * Copyright (C) 2007 BULL SA. Nadia Derbey
4 *
5 * Notification mechanism for ipc namespaces:
6 * The callback routine registered in the memory chain invokes the ipcns
7 * notifier chain with the IPCNS_MEMCHANGED event.
8 * Each callback routine registered in the ipcns namespace recomputes msgmni
9 * for the owning namespace.
10 */
11
12#include <linux/msg.h>
13#include <linux/rcupdate.h>
14#include <linux/notifier.h>
15#include <linux/nsproxy.h>
16#include <linux/ipc_namespace.h>
17
18#include "util.h"
19
20
21
22static BLOCKING_NOTIFIER_HEAD(ipcns_chain);
23
24
25static int ipcns_callback(struct notifier_block *self,
26 unsigned long action, void *arg)
27{
28 struct ipc_namespace *ns;
29
30 switch (action) {
31 case IPCNS_MEMCHANGED: /* amount of lowmem has changed */
32 case IPCNS_CREATED:
33 case IPCNS_REMOVED:
34 /*
35 * It's time to recompute msgmni
36 */
37 ns = container_of(self, struct ipc_namespace, ipcns_nb);
38 /*
39 * No need to get a reference on the ns: the 1st job of
40 * free_ipc_ns() is to unregister the callback routine.
41 * blocking_notifier_chain_unregister takes the wr lock to do
42 * it.
43 * When this callback routine is called the rd lock is held by
44 * blocking_notifier_call_chain.
45 * So the ipc ns cannot be freed while we are here.
46 */
47 recompute_msgmni(ns);
48 break;
49 default:
50 break;
51 }
52
53 return NOTIFY_OK;
54}
55
56int register_ipcns_notifier(struct ipc_namespace *ns)
57{
58 int rc;
59
60 memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb));
61 ns->ipcns_nb.notifier_call = ipcns_callback;
62 ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI;
63 rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb);
64 if (!rc)
65 ns->auto_msgmni = 1;
66 return rc;
67}
68
69int cond_register_ipcns_notifier(struct ipc_namespace *ns)
70{
71 int rc;
72
73 memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb));
74 ns->ipcns_nb.notifier_call = ipcns_callback;
75 ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI;
76 rc = blocking_notifier_chain_cond_register(&ipcns_chain,
77 &ns->ipcns_nb);
78 if (!rc)
79 ns->auto_msgmni = 1;
80 return rc;
81}
82
83void unregister_ipcns_notifier(struct ipc_namespace *ns)
84{
85 blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb);
86 ns->auto_msgmni = 0;
87}
88
89int ipcns_notify(unsigned long val)
90{
91 return blocking_notifier_call_chain(&ipcns_chain, val, NULL);
92}
diff --git a/ipc/msg.c b/ipc/msg.c
index c5d8e3749985..a7261d5cbc89 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -989,43 +989,12 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
989 return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); 989 return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill);
990} 990}
991 991
992/*
993 * Scale msgmni with the available lowmem size: the memory dedicated to msg
994 * queues should occupy at most 1/MSG_MEM_SCALE of lowmem.
995 * Also take into account the number of nsproxies created so far.
996 * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range.
997 */
998void recompute_msgmni(struct ipc_namespace *ns)
999{
1000 struct sysinfo i;
1001 unsigned long allowed;
1002 int nb_ns;
1003
1004 si_meminfo(&i);
1005 allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit)
1006 / MSGMNB;
1007 nb_ns = atomic_read(&nr_ipc_ns);
1008 allowed /= nb_ns;
1009
1010 if (allowed < MSGMNI) {
1011 ns->msg_ctlmni = MSGMNI;
1012 return;
1013 }
1014
1015 if (allowed > IPCMNI / nb_ns) {
1016 ns->msg_ctlmni = IPCMNI / nb_ns;
1017 return;
1018 }
1019
1020 ns->msg_ctlmni = allowed;
1021}
1022 992
1023void msg_init_ns(struct ipc_namespace *ns) 993void msg_init_ns(struct ipc_namespace *ns)
1024{ 994{
1025 ns->msg_ctlmax = MSGMAX; 995 ns->msg_ctlmax = MSGMAX;
1026 ns->msg_ctlmnb = MSGMNB; 996 ns->msg_ctlmnb = MSGMNB;
1027 997 ns->msg_ctlmni = MSGMNI;
1028 recompute_msgmni(ns);
1029 998
1030 atomic_set(&ns->msg_bytes, 0); 999 atomic_set(&ns->msg_bytes, 0);
1031 atomic_set(&ns->msg_hdrs, 0); 1000 atomic_set(&ns->msg_hdrs, 0);
@@ -1069,9 +1038,6 @@ void __init msg_init(void)
1069{ 1038{
1070 msg_init_ns(&init_ipc_ns); 1039 msg_init_ns(&init_ipc_ns);
1071 1040
1072 printk(KERN_INFO "msgmni has been set to %d\n",
1073 init_ipc_ns.msg_ctlmni);
1074
1075 ipc_init_proc_interface("sysvipc/msg", 1041 ipc_init_proc_interface("sysvipc/msg",
1076 " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", 1042 " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
1077 IPC_MSG_IDS, sysvipc_msg_proc_show); 1043 IPC_MSG_IDS, sysvipc_msg_proc_show);
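
Note on the hunks above: they remove the automatic msgmni scaling. The deleted recompute_msgmni() sized the number of message queue identifiers as (lowmem / MSG_MEM_SCALE) / MSGMNB, divided by the number of IPC namespaces and clamped to [MSGMNI, IPCMNI / nr_ipc_ns]. As a rough worked example with the pre-patch constants (MSG_MEM_SCALE = 32, MSGMNB = 16384, MSGMNI = 16, IPCMNI = 32768), a box with 4 GiB of lowmem and a single IPC namespace ended up with msgmni = (4 GiB / 32) / 16384 = 8192. After this change every namespace simply starts at the static MSGMNI default, which is raised elsewhere in this series, and anyone who wants a different limit sets kernel.msgmni explicitly via sysctl.
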
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b54468e48e32..1a3ffd40356e 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -45,14 +45,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
45 msg_init_ns(ns); 45 msg_init_ns(ns);
46 shm_init_ns(ns); 46 shm_init_ns(ns);
47 47
48 /*
49 * msgmni has already been computed for the new ipc ns.
50 * Thus, do the ipcns creation notification before registering that
51 * new ipcns in the chain.
52 */
53 ipcns_notify(IPCNS_CREATED);
54 register_ipcns_notifier(ns);
55
56 ns->user_ns = get_user_ns(user_ns); 48 ns->user_ns = get_user_ns(user_ns);
57 49
58 return ns; 50 return ns;
@@ -99,25 +91,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
99 91
100static void free_ipc_ns(struct ipc_namespace *ns) 92static void free_ipc_ns(struct ipc_namespace *ns)
101{ 93{
102 /*
103 * Unregistering the hotplug notifier at the beginning guarantees
104 * that the ipc namespace won't be freed while we are inside the
105 * callback routine. Since the blocking_notifier_chain_XXX routines
106 * hold a rw lock on the notifier list, unregister_ipcns_notifier()
107 * won't take the rw lock before blocking_notifier_call_chain() has
108 * released the rd lock.
109 */
110 unregister_ipcns_notifier(ns);
111 sem_exit_ns(ns); 94 sem_exit_ns(ns);
112 msg_exit_ns(ns); 95 msg_exit_ns(ns);
113 shm_exit_ns(ns); 96 shm_exit_ns(ns);
114 atomic_dec(&nr_ipc_ns); 97 atomic_dec(&nr_ipc_ns);
115 98
116 /*
117 * Do the ipcns removal notification after decrementing nr_ipc_ns in
118 * order to have a correct value when recomputing msgmni.
119 */
120 ipcns_notify(IPCNS_REMOVED);
121 put_user_ns(ns->user_ns); 99 put_user_ns(ns->user_ns);
122 proc_free_inum(ns->proc_inum); 100 proc_free_inum(ns->proc_inum);
123 kfree(ns); 101 kfree(ns);
diff --git a/ipc/sem.c b/ipc/sem.c
index 53c3310f41c6..6115146563f9 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -326,10 +326,17 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
326 326
327 /* Then check that the global lock is free */ 327 /* Then check that the global lock is free */
328 if (!spin_is_locked(&sma->sem_perm.lock)) { 328 if (!spin_is_locked(&sma->sem_perm.lock)) {
329 /* spin_is_locked() is not a memory barrier */ 329 /*
330 smp_mb(); 330 * The ipc object lock check must be visible on all
331 * cores before rechecking the complex count. Otherwise
332 * we can race with another thread that does:
333 * complex_count++;
334 * spin_unlock(sem_perm.lock);
335 */
336 smp_rmb();
331 337
332 /* Now repeat the test of complex_count: 338 /*
339 * Now repeat the test of complex_count:
333 * It can't change anymore until we drop sem->lock. 340 * It can't change anymore until we drop sem->lock.
334 * Thus: if is now 0, then it will stay 0. 341 * Thus: if is now 0, then it will stay 0.
335 */ 342 */
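
For context on the smp_mb() -> smp_rmb() change: the fast path in sem_lock() only has two reads to order, the read of sem_perm.lock through spin_is_locked() and the subsequent re-read of complex_count. Roughly, the interleaving being guarded against is:

    CPU 0 (simple semop)                    CPU 1 (complex semop)
    spin_lock(&sem->lock);                  sma->complex_count++;
    spin_is_locked(&sem_perm.lock) == 0     spin_unlock(&sma->sem_perm.lock);
    smp_rmb();
    reads sma->complex_count

If CPU 0 sees the lock as free, the read barrier paired with the release semantics of the unlock on CPU 1 ensures it also sees the incremented complex_count. A full smp_mb() is correct as well, but no store on this path needs ordering, so the cheaper read barrier suffices.
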
diff --git a/ipc/shm.c b/ipc/shm.c
index 01454796ba3c..19633b4a2350 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -219,7 +219,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
219 if (!is_file_hugepages(shm_file)) 219 if (!is_file_hugepages(shm_file))
220 shmem_lock(shm_file, 0, shp->mlock_user); 220 shmem_lock(shm_file, 0, shp->mlock_user);
221 else if (shp->mlock_user) 221 else if (shp->mlock_user)
222 user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); 222 user_shm_unlock(i_size_read(file_inode(shm_file)),
223 shp->mlock_user);
223 fput(shm_file); 224 fput(shm_file);
224 ipc_rcu_putref(shp, shm_rcu_free); 225 ipc_rcu_putref(shp, shm_rcu_free);
225} 226}
@@ -1229,6 +1230,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1229 int retval = -EINVAL; 1230 int retval = -EINVAL;
1230#ifdef CONFIG_MMU 1231#ifdef CONFIG_MMU
1231 loff_t size = 0; 1232 loff_t size = 0;
1233 struct file *file;
1232 struct vm_area_struct *next; 1234 struct vm_area_struct *next;
1233#endif 1235#endif
1234 1236
@@ -1245,7 +1247,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1245 * started at address shmaddr. It records its size and then unmaps 1247 * started at address shmaddr. It records its size and then unmaps
1246 * it. 1248 * it.
1247 * - Then it unmaps all shm vmas that started at shmaddr and that 1249 * - Then it unmaps all shm vmas that started at shmaddr and that
1248 * are within the initially determined size. 1250 * are within the initially determined size and that are from the
1251 * same shm segment from which we determined the size.
1249 * Errors from do_munmap are ignored: the function only fails if 1252 * Errors from do_munmap are ignored: the function only fails if
1250 * it's called with invalid parameters or if it's called to unmap 1253 * it's called with invalid parameters or if it's called to unmap
1251 * a part of a vma. Both calls in this function are for full vmas, 1254 * a part of a vma. Both calls in this function are for full vmas,
@@ -1271,8 +1274,14 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1271 if ((vma->vm_ops == &shm_vm_ops) && 1274 if ((vma->vm_ops == &shm_vm_ops) &&
1272 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { 1275 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1273 1276
1274 1277 /*
1275 size = file_inode(vma->vm_file)->i_size; 1278 * Record the file of the shm segment being
1279 * unmapped. With mremap(), someone could place
1280 * page from another segment but with equal offsets
1281 * in the range we are unmapping.
1282 */
1283 file = vma->vm_file;
1284 size = i_size_read(file_inode(vma->vm_file));
1276 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1285 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1277 /* 1286 /*
1278 * We discovered the size of the shm segment, so 1287 * We discovered the size of the shm segment, so
@@ -1298,8 +1307,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1298 1307
1299 /* finding a matching vma now does not alter retval */ 1308 /* finding a matching vma now does not alter retval */
1300 if ((vma->vm_ops == &shm_vm_ops) && 1309 if ((vma->vm_ops == &shm_vm_ops) &&
1301 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) 1310 ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
1302 1311 (vma->vm_file == file))
1303 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1312 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1304 vma = next; 1313 vma = next;
1305 } 1314 }
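
The extra vm_file check matters because mremap() lets a caller splice pages of a second segment into the address range of the first, with a vm_pgoff that still satisfies the start/offset test the old code relied on. A hypothetical userspace sequence that the old shmdt() handled too aggressively (illustrative only, error handling omitted):

    #define _GNU_SOURCE
    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long pg = sysconf(_SC_PAGESIZE);
            int a = shmget(IPC_PRIVATE, 2 * pg, IPC_CREAT | 0600);
            int b = shmget(IPC_PRIVATE, 2 * pg, IPC_CREAT | 0600);
            char *pa = shmat(a, NULL, 0);
            char *pb = shmat(b, NULL, 0);

            /* Move the second page of segment b so that it now starts at
             * pa + pg: its page offset (1) matches its distance from pa,
             * which is all the old shmdt() loop checked. */
            mremap(pb + pg, pg, pg, MREMAP_MAYMOVE | MREMAP_FIXED, pa + pg);

            /* Previously this also unmapped the relocated page of b; with
             * the vm_file comparison only segment a's pages go away. */
            shmdt(pa);
            return 0;
    }
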
diff --git a/ipc/util.c b/ipc/util.c
index 88adc329888c..106bed0378ab 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -71,44 +71,6 @@ struct ipc_proc_iface {
71 int (*show)(struct seq_file *, void *); 71 int (*show)(struct seq_file *, void *);
72}; 72};
73 73
74static void ipc_memory_notifier(struct work_struct *work)
75{
76 ipcns_notify(IPCNS_MEMCHANGED);
77}
78
79static int ipc_memory_callback(struct notifier_block *self,
80 unsigned long action, void *arg)
81{
82 static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier);
83
84 switch (action) {
85 case MEM_ONLINE: /* memory successfully brought online */
86 case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */
87 /*
88 * This is done by invoking the ipcns notifier chain with the
89 * IPC_MEMCHANGED event.
90 * In order not to keep the lock on the hotplug memory chain
91 * for too long, queue a work item that will, when waken up,
92 * activate the ipcns notification chain.
93 */
94 schedule_work(&ipc_memory_wq);
95 break;
96 case MEM_GOING_ONLINE:
97 case MEM_GOING_OFFLINE:
98 case MEM_CANCEL_ONLINE:
99 case MEM_CANCEL_OFFLINE:
100 default:
101 break;
102 }
103
104 return NOTIFY_OK;
105}
106
107static struct notifier_block ipc_memory_nb = {
108 .notifier_call = ipc_memory_callback,
109 .priority = IPC_CALLBACK_PRI,
110};
111
112/** 74/**
113 * ipc_init - initialise ipc subsystem 75 * ipc_init - initialise ipc subsystem
114 * 76 *
@@ -124,8 +86,6 @@ static int __init ipc_init(void)
124 sem_init(); 86 sem_init();
125 msg_init(); 87 msg_init();
126 shm_init(); 88 shm_init();
127 register_hotmemory_notifier(&ipc_memory_nb);
128 register_ipcns_notifier(&init_ipc_ns);
129 return 0; 89 return 0;
130} 90}
131device_initcall(ipc_init); 91device_initcall(ipc_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
174 struct fsnotify_mark *entry = &chunk->mark; 174 struct fsnotify_mark *entry = &chunk->mark;
175 struct list_head *list; 175 struct list_head *list;
176 176
177 if (!entry->i.inode) 177 if (!entry->inode)
178 return; 178 return;
179 list = chunk_hash(entry->i.inode); 179 list = chunk_hash(entry->inode);
180 list_add_rcu(&chunk->hash, list); 180 list_add_rcu(&chunk->hash, list);
181} 181}
182 182
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
188 188
189 list_for_each_entry_rcu(p, list, hash) { 189 list_for_each_entry_rcu(p, list, hash) {
190 /* mark.inode may have gone NULL, but who cares? */ 190 /* mark.inode may have gone NULL, but who cares? */
191 if (p->mark.i.inode == inode) { 191 if (p->mark.inode == inode) {
192 atomic_long_inc(&p->refs); 192 atomic_long_inc(&p->refs);
193 return p; 193 return p;
194 } 194 }
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
231 new = alloc_chunk(size); 231 new = alloc_chunk(size);
232 232
233 spin_lock(&entry->lock); 233 spin_lock(&entry->lock);
234 if (chunk->dead || !entry->i.inode) { 234 if (chunk->dead || !entry->inode) {
235 spin_unlock(&entry->lock); 235 spin_unlock(&entry->lock);
236 if (new) 236 if (new)
237 free_chunk(new); 237 free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
258 goto Fallback; 258 goto Fallback;
259 259
260 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
262 fsnotify_put_mark(&new->mark); 262 fsnotify_put_mark(&new->mark);
263 goto Fallback; 263 goto Fallback;
264 } 264 }
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
386 chunk_entry = &chunk->mark; 386 chunk_entry = &chunk->mark;
387 387
388 spin_lock(&old_entry->lock); 388 spin_lock(&old_entry->lock);
389 if (!old_entry->i.inode) { 389 if (!old_entry->inode) {
390 /* old_entry is being shot, lets just lie */ 390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock); 391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry); 392 fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
395 } 395 }
396 396
397 fsnotify_duplicate_mark(chunk_entry, old_entry); 397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock); 399 spin_unlock(&old_entry->lock);
400 fsnotify_put_mark(chunk_entry); 400 fsnotify_put_mark(chunk_entry);
401 fsnotify_put_mark(old_entry); 401 fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
611 list_for_each_entry(node, &tree->chunks, list) { 611 list_for_each_entry(node, &tree->chunks, list) {
612 struct audit_chunk *chunk = find_chunk(node); 612 struct audit_chunk *chunk = find_chunk(node);
613 /* this could be NULL if the watch is dying elsewhere... */ 613 /* this could be NULL if the watch is dying elsewhere... */
614 struct inode *inode = chunk->mark.i.inode; 614 struct inode *inode = chunk->mark.inode;
615 node->index |= 1U<<31; 615 node->index |= 1U<<31;
616 if (iterate_mounts(compare_root, inode, root_mnt)) 616 if (iterate_mounts(compare_root, inode, root_mnt))
617 node->index &= ~(1U<<31); 617 node->index &= ~(1U<<31);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cde34c5..995a95f61a19 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
724 int more = 0; 724 int more = 0;
725 725
726 again: 726 again:
727 mutex_lock(&mapping->i_mmap_mutex); 727 i_mmap_lock_read(mapping);
728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
729 if (!valid_vma(vma, is_register)) 729 if (!valid_vma(vma, is_register))
730 continue; 730 continue;
731 731
732 if (!prev && !more) { 732 if (!prev && !more) {
733 /* 733 /*
734 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through 734 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
735 * reclaim. This is optimistic, no harm done if it fails. 735 * reclaim. This is optimistic, no harm done if it fails.
736 */ 736 */
737 prev = kmalloc(sizeof(struct map_info), 737 prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
755 info->mm = vma->vm_mm; 755 info->mm = vma->vm_mm;
756 info->vaddr = offset_to_vaddr(vma, offset); 756 info->vaddr = offset_to_vaddr(vma, offset);
757 } 757 }
758 mutex_unlock(&mapping->i_mmap_mutex); 758 i_mmap_unlock_read(mapping);
759 759
760 if (!more) 760 if (!more)
761 goto out; 761 goto out;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ca84189cfc2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
433 get_file(file); 433 get_file(file);
434 if (tmp->vm_flags & VM_DENYWRITE) 434 if (tmp->vm_flags & VM_DENYWRITE)
435 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
436 mutex_lock(&mapping->i_mmap_mutex); 436 i_mmap_lock_write(mapping);
437 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
445 vma_interval_tree_insert_after(tmp, mpnt, 445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap); 446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 447 flush_dcache_mmap_unlock(mapping);
448 mutex_unlock(&mapping->i_mmap_mutex); 448 i_mmap_unlock_write(mapping);
449 } 449 }
450 450
451 /* 451 /*
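
The i_mmap_mutex -> i_mmap_rwsem conversions here (and in the mm/ hunks further down) go through small helpers introduced earlier in this series; they are presumably just thin wrappers over the rwsem, along these lines (sketch, not part of this diff):

    static inline void i_mmap_lock_write(struct address_space *mapping)
    {
            down_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_write(struct address_space *mapping)
    {
            up_write(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_lock_read(struct address_space *mapping)
    {
            down_read(&mapping->i_mmap_rwsem);
    }

    static inline void i_mmap_unlock_read(struct address_space *mapping)
    {
            up_read(&mapping->i_mmap_rwsem);
    }

Sites that only walk the interval tree, such as build_map_info() above, can now take the lock shared and run in parallel, while writers like dup_mmap() keep exclusive access.
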
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b7408759bdf..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
32 Note that the debugfs filesystem has to be mounted to access 32 Note that the debugfs filesystem has to be mounted to access
33 profiling data. 33 profiling data.
34 34
35config ARCH_HAS_GCOV_PROFILE_ALL
36 def_bool n
37
35config GCOV_PROFILE_ALL 38config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 39 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 40 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 41 depends on ARCH_HAS_GCOV_PROFILE_ALL
39 default n 42 default n
40 ---help--- 43 ---help---
41 This option activates profiling for the entire kernel. 44 This option activates profiling for the entire kernel.
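
Instead of hard-coding the list of architectures that can enable GCOV_PROFILE_ALL, the dependency is now the ARCH_HAS_GCOV_PROFILE_ALL symbol; each supported architecture opts in with a one-line "select ARCH_HAS_GCOV_PROFILE_ALL" in its Kconfig (the arch/ Kconfig changes elsewhere in this merge do exactly that), so new architectures no longer have to touch kernel/gcov/Kconfig.
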
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6e9a61..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
600 if (!kexec_on_panic) { 600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0); 601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) { 602 if (!image->swap_page) {
603 pr_err(KERN_ERR "Could not allocate swap buffer\n"); 603 pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages; 604 goto out_free_control_pages;
605 } 605 }
606 } 606 }
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
25} 25}
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28int snprint_stack_trace(char *buf, size_t size,
29 struct stack_trace *trace, int spaces)
30{
31 int i;
32 unsigned long ip;
33 int generated;
34 int total = 0;
35
36 if (WARN_ON(!trace->entries))
37 return 0;
38
39 for (i = 0; i < trace->nr_entries; i++) {
40 ip = trace->entries[i];
41 generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
42 1 + spaces, ' ', (void *) ip, (void *) ip);
43
44 total += generated;
45
46 /* Assume that generated isn't a negative number */
47 if (generated >= size) {
48 buf += size;
49 size = 0;
50 } else {
51 buf += generated;
52 size -= generated;
53 }
54 }
55
56 return total;
57}
58EXPORT_SYMBOL_GPL(snprint_stack_trace);
59
28/* 60/*
29 * Architectures that do not implement save_stack_trace_tsk or 61 * Architectures that do not implement save_stack_trace_tsk or
30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 62 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
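
snprint_stack_trace() is the buffer-filling counterpart of print_stack_trace() and is used by the page owner tracking added elsewhere in this series. A minimal kernel-side usage sketch (not from this patch):

    unsigned long entries[16];
    struct stack_trace trace = {
            .entries     = entries,
            .max_entries = ARRAY_SIZE(entries),
            .skip        = 0,
    };
    char buf[512];

    save_stack_trace(&trace);                       /* fill entries[] */
    snprint_stack_trace(buf, sizeof(buf), &trace, 0);
    pr_info("%s", buf);

Like snprintf(), the return value is the length the full output would have had, so a caller can detect truncation by comparing it against the buffer size.
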
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 61eea02b53f5..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -226,3 +226,6 @@ cond_syscall(sys_seccomp);
226 226
227/* access BPF programs and maps */ 227/* access BPF programs and maps */
228cond_syscall(sys_bpf); 228cond_syscall(sys_bpf);
229
230/* execveat */
231cond_syscall(sys_execveat);
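
The cond_syscall() entry gives architectures that have not wired up execveat() a stub that returns -ENOSYS. Where it is wired up, userspace can reach it through syscall(2) until libc grows a wrapper; a hypothetical fexecve-style use, assuming headers that define __NR_execveat (error handling omitted):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            char *argv[] = { "true", NULL };
            char *envp[] = { NULL };
            int fd = open("/bin/true", O_RDONLY | O_CLOEXEC);

            /* With an empty path and AT_EMPTY_PATH, the file referred to
             * by fd itself is executed. */
            syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
            return 1;       /* only reached if execveat() failed */
    }
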
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d780351835e9..5f2ce616c046 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -227,6 +227,22 @@ config UNUSED_SYMBOLS
227 you really need it, and what the merge plan to the mainline kernel for 227 you really need it, and what the merge plan to the mainline kernel for
228 your module is. 228 your module is.
229 229
230config PAGE_OWNER
231 bool "Track page owner"
232 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
233 select DEBUG_FS
234 select STACKTRACE
235 select PAGE_EXTENSION
236 help
237 This keeps track of which call chain is the owner of a page; it may
238 help to find bare alloc_page(s) leaks. Even if you include this
239 feature in your build, it is disabled by default. You should pass
240 "page_owner=on" as a boot parameter in order to enable it. It eats
241 a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
242 for the user-space helper.
243
244 If unsure, say N.
245
230config DEBUG_FS 246config DEBUG_FS
231 bool "Debug Filesystem" 247 bool "Debug Filesystem"
232 help 248 help
diff --git a/lib/audit.c b/lib/audit.c
index 1d726a22565b..b8fb5ee81e26 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
54 case __NR_socketcall: 54 case __NR_socketcall:
55 return 4; 55 return 4;
56#endif 56#endif
57#ifdef __NR_execveat
58 case __NR_execveat:
59#endif
57 case __NR_execve: 60 case __NR_execve:
58 return 5; 61 return 5;
59 default: 62 default:
diff --git a/lib/bitmap.c b/lib/bitmap.c
index b499ab6ada29..969ae8fbc85b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -326,30 +326,32 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len)
326} 326}
327EXPORT_SYMBOL(bitmap_clear); 327EXPORT_SYMBOL(bitmap_clear);
328 328
329/* 329/**
330 * bitmap_find_next_zero_area - find a contiguous aligned zero area 330 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
331 * @map: The address to base the search on 331 * @map: The address to base the search on
332 * @size: The bitmap size in bits 332 * @size: The bitmap size in bits
333 * @start: The bitnumber to start searching at 333 * @start: The bitnumber to start searching at
334 * @nr: The number of zeroed bits we're looking for 334 * @nr: The number of zeroed bits we're looking for
335 * @align_mask: Alignment mask for zero area 335 * @align_mask: Alignment mask for zero area
336 * @align_offset: Alignment offset for zero area.
336 * 337 *
337 * The @align_mask should be one less than a power of 2; the effect is that 338 * The @align_mask should be one less than a power of 2; the effect is that
338 * the bit offset of all zero areas this function finds is multiples of that 339 * the bit offset of all zero areas this function finds plus @align_offset
339 * power of 2. A @align_mask of 0 means no alignment is required. 340 * is multiple of that power of 2.
340 */ 341 */
341unsigned long bitmap_find_next_zero_area(unsigned long *map, 342unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
342 unsigned long size, 343 unsigned long size,
343 unsigned long start, 344 unsigned long start,
344 unsigned int nr, 345 unsigned int nr,
345 unsigned long align_mask) 346 unsigned long align_mask,
347 unsigned long align_offset)
346{ 348{
347 unsigned long index, end, i; 349 unsigned long index, end, i;
348again: 350again:
349 index = find_next_zero_bit(map, size, start); 351 index = find_next_zero_bit(map, size, start);
350 352
351 /* Align allocation */ 353 /* Align allocation */
352 index = __ALIGN_MASK(index, align_mask); 354 index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
353 355
354 end = index + nr; 356 end = index + nr;
355 if (end > size) 357 if (end > size)
@@ -361,7 +363,7 @@ again:
361 } 363 }
362 return index; 364 return index;
363} 365}
364EXPORT_SYMBOL(bitmap_find_next_zero_area); 366EXPORT_SYMBOL(bitmap_find_next_zero_area_off);
365 367
366/* 368/*
367 * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, 369 * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers,
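
Existing callers of bitmap_find_next_zero_area() are unaffected: the header presumably keeps the old name as an inline wrapper that passes an offset of zero, roughly:

    static inline unsigned long
    bitmap_find_next_zero_area(unsigned long *map,
                               unsigned long size,
                               unsigned long start,
                               unsigned int nr,
                               unsigned long align_mask)
    {
            return bitmap_find_next_zero_area_off(map, size, start, nr,
                                                  align_mask, 0);
    }
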
diff --git a/lib/decompress.c b/lib/decompress.c
index 37f3c786348f..528ff932d8e4 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -44,8 +44,8 @@ struct compress_format {
44}; 44};
45 45
46static const struct compress_format compressed_formats[] __initconst = { 46static const struct compress_format compressed_formats[] __initconst = {
47 { {037, 0213}, "gzip", gunzip }, 47 { {0x1f, 0x8b}, "gzip", gunzip },
48 { {037, 0236}, "gzip", gunzip }, 48 { {0x1f, 0x9e}, "gzip", gunzip },
49 { {0x42, 0x5a}, "bzip2", bunzip2 }, 49 { {0x42, 0x5a}, "bzip2", bunzip2 },
50 { {0x5d, 0x00}, "lzma", unlzma }, 50 { {0x5d, 0x00}, "lzma", unlzma },
51 { {0xfd, 0x37}, "xz", unxz }, 51 { {0xfd, 0x37}, "xz", unxz },
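
The gzip magic change is purely cosmetic: octal 037 is 31 = 0x1f and octal 0213 is 139 = 0x8b (likewise 0236 is 158 = 0x9e), so the table now spells the same bytes in the hexadecimal form already used by the other entries.
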
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index 8290e0bef7ea..6dd0335ea61b 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -184,7 +184,7 @@ static int INIT get_next_block(struct bunzip_data *bd)
184 if (get_bits(bd, 1)) 184 if (get_bits(bd, 1))
185 return RETVAL_OBSOLETE_INPUT; 185 return RETVAL_OBSOLETE_INPUT;
186 origPtr = get_bits(bd, 24); 186 origPtr = get_bits(bd, 24);
187 if (origPtr > dbufSize) 187 if (origPtr >= dbufSize)
188 return RETVAL_DATA_ERROR; 188 return RETVAL_DATA_ERROR;
189 /* mapping table: if some byte values are never used (encoding things 189 /* mapping table: if some byte values are never used (encoding things
190 like ascii text), the compression code removes the gaps to have fewer 190 like ascii text), the compression code removes the gaps to have fewer
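
origPtr is later used as an index into the dbuf[] array of dbufSize elements, so the largest legal value is dbufSize - 1; accepting origPtr == dbufSize was an off-by-one that allowed a crafted stream to read one element past the end of the buffer, hence the change from > to >=.
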
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index d7d501ea856d..f1cdeb024d17 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -40,10 +40,16 @@ EXPORT_SYMBOL_GPL(setup_fault_attr);
40 40
41static void fail_dump(struct fault_attr *attr) 41static void fail_dump(struct fault_attr *attr)
42{ 42{
43 if (attr->verbose > 0) 43 if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) {
44 printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure\n"); 44 printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n"
45 if (attr->verbose > 1) 45 "name %pd, interval %lu, probability %lu, "
46 dump_stack(); 46 "space %d, times %d\n", attr->dname,
47 attr->probability, attr->interval,
48 atomic_read(&attr->space),
49 atomic_read(&attr->times));
50 if (attr->verbose > 1)
51 dump_stack();
52 }
47} 53}
48 54
49#define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) 55#define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0)
@@ -202,6 +208,12 @@ struct dentry *fault_create_debugfs_attr(const char *name,
202 goto fail; 208 goto fail;
203 if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) 209 if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose))
204 goto fail; 210 goto fail;
211 if (!debugfs_create_u32("verbose_ratelimit_interval_ms", mode, dir,
212 &attr->ratelimit_state.interval))
213 goto fail;
214 if (!debugfs_create_u32("verbose_ratelimit_burst", mode, dir,
215 &attr->ratelimit_state.burst))
216 goto fail;
205 if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) 217 if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter))
206 goto fail; 218 goto fail;
207 219
@@ -222,6 +234,7 @@ struct dentry *fault_create_debugfs_attr(const char *name,
222 234
223#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ 235#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
224 236
237 attr->dname = dget(dir);
225 return dir; 238 return dir;
226fail: 239fail:
227 debugfs_remove_recursive(dir); 240 debugfs_remove_recursive(dir);
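
The fault attribute's ratelimit_state starts out with a zero interval (the initializer added elsewhere in this series uses a disabled ratelimit), and __ratelimit() allows everything while the interval is zero, so the verbose dump behaves exactly as before until non-zero values are written to the new verbose_ratelimit_interval_ms and verbose_ratelimit_burst files in the attribute's debugfs directory. The dget() keeps the directory dentry pinned so the %pd in the expanded message can print the attribute's name.
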
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
1config PAGE_EXTENSION
2 bool "Extend memmap on extra space for more information on page"
3 ---help---
4 Extend memmap on extra space for more information on each page. This
5 could be used for debugging features that need to insert an extra
6 field for every page. This extension enables us to save memory
7 by not allocating this extra memory according to boottime
8 configuration.
9
1config DEBUG_PAGEALLOC 10config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 11 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL 12 depends on DEBUG_KERNEL
4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC 13 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 14 depends on !KMEMCHECK
15 select PAGE_EXTENSION
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
7 select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC 17 select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
8 ---help--- 18 ---help---
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
63obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 63obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
64obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 64obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
65obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 65obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
66obj-$(CONFIG_PAGE_OWNER) += page_owner.o
66obj-$(CONFIG_CLEANCACHE) += cleancache.o 67obj-$(CONFIG_CLEANCACHE) += cleancache.o
67obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 68obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
68obj-$(CONFIG_ZPOOL) += zpool.o 69obj-$(CONFIG_ZPOOL) += zpool.o
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
71obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o 72obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
72obj-$(CONFIG_CMA) += cma.o 73obj-$(CONFIG_CMA) += cma.o
73obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o 74obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
75obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/cma.c b/mm/cma.c
index 8e9ec13d31db..f8917629cbdd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
33#include <linux/log2.h> 33#include <linux/log2.h>
34#include <linux/cma.h> 34#include <linux/cma.h>
35#include <linux/highmem.h> 35#include <linux/highmem.h>
36#include <linux/io.h>
36 37
37struct cma { 38struct cma {
38 unsigned long base_pfn; 39 unsigned long base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
63 return (1UL << (align_order - cma->order_per_bit)) - 1; 64 return (1UL << (align_order - cma->order_per_bit)) - 1;
64} 65}
65 66
67static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
68{
69 unsigned int alignment;
70
71 if (align_order <= cma->order_per_bit)
72 return 0;
73 alignment = 1UL << (align_order - cma->order_per_bit);
74 return ALIGN(cma->base_pfn, alignment) -
75 (cma->base_pfn >> cma->order_per_bit);
76}
77
66static unsigned long cma_bitmap_maxno(struct cma *cma) 78static unsigned long cma_bitmap_maxno(struct cma *cma)
67{ 79{
68 return cma->count >> cma->order_per_bit; 80 return cma->count >> cma->order_per_bit;
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base,
313 } 325 }
314 } 326 }
315 327
328 /*
329 * kmemleak scans/reads tracked objects for pointers to other
330 * objects but this address isn't mapped and accessible
331 */
332 kmemleak_ignore(phys_to_virt(addr));
316 base = addr; 333 base = addr;
317 } 334 }
318 335
@@ -340,7 +357,7 @@ err:
340 */ 357 */
341struct page *cma_alloc(struct cma *cma, int count, unsigned int align) 358struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
342{ 359{
343 unsigned long mask, pfn, start = 0; 360 unsigned long mask, offset, pfn, start = 0;
344 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 361 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
345 struct page *page = NULL; 362 struct page *page = NULL;
346 int ret; 363 int ret;
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
355 return NULL; 372 return NULL;
356 373
357 mask = cma_bitmap_aligned_mask(cma, align); 374 mask = cma_bitmap_aligned_mask(cma, align);
375 offset = cma_bitmap_aligned_offset(cma, align);
358 bitmap_maxno = cma_bitmap_maxno(cma); 376 bitmap_maxno = cma_bitmap_maxno(cma);
359 bitmap_count = cma_bitmap_pages_to_bits(cma, count); 377 bitmap_count = cma_bitmap_pages_to_bits(cma, count);
360 378
361 for (;;) { 379 for (;;) {
362 mutex_lock(&cma->lock); 380 mutex_lock(&cma->lock);
363 bitmap_no = bitmap_find_next_zero_area(cma->bitmap, 381 bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
364 bitmap_maxno, start, bitmap_count, mask); 382 bitmap_maxno, start, bitmap_count, mask,
383 offset);
365 if (bitmap_no >= bitmap_maxno) { 384 if (bitmap_no >= bitmap_maxno) {
366 mutex_unlock(&cma->lock); 385 mutex_unlock(&cma->lock);
367 break; 386 break;
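
The new offset term makes the alignment requested by cma_alloc() apply to the physical address rather than to the position within the CMA bitmap. For example, if a CMA area happens to start at pfn 1000 and a caller asks for a 512-page (2 MiB with 4 KiB pages) aligned allocation, masking the bit index alone could hand back bit 0, i.e. pfn 1000; biasing the search by the region's misalignment steers it to the first bit that maps to a properly aligned pfn such as 1024. The kmemleak_ignore() call, as its comment says, just keeps kmemleak from scanning reserved memory that is not mapped at this point.
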
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
2#include <linux/string.h> 2#include <linux/string.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/highmem.h> 4#include <linux/highmem.h>
5#include <linux/page-debug-flags.h> 5#include <linux/page_ext.h>
6#include <linux/poison.h> 6#include <linux/poison.h>
7#include <linux/ratelimit.h> 7#include <linux/ratelimit.h>
8 8
9static bool page_poisoning_enabled __read_mostly;
10
11static bool need_page_poisoning(void)
12{
13 if (!debug_pagealloc_enabled())
14 return false;
15
16 return true;
17}
18
19static void init_page_poisoning(void)
20{
21 if (!debug_pagealloc_enabled())
22 return;
23
24 page_poisoning_enabled = true;
25}
26
27struct page_ext_operations page_poisoning_ops = {
28 .need = need_page_poisoning,
29 .init = init_page_poisoning,
30};
31
9static inline void set_page_poison(struct page *page) 32static inline void set_page_poison(struct page *page)
10{ 33{
11 __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); 34 struct page_ext *page_ext;
35
36 page_ext = lookup_page_ext(page);
37 __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
12} 38}
13 39
14static inline void clear_page_poison(struct page *page) 40static inline void clear_page_poison(struct page *page)
15{ 41{
16 __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); 42 struct page_ext *page_ext;
43
44 page_ext = lookup_page_ext(page);
45 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
17} 46}
18 47
19static inline bool page_poison(struct page *page) 48static inline bool page_poison(struct page *page)
20{ 49{
21 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); 50 struct page_ext *page_ext;
51
52 page_ext = lookup_page_ext(page);
53 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
22} 54}
23 55
24static void poison_page(struct page *page) 56static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
93 unpoison_page(page + i); 125 unpoison_page(page + i);
94} 126}
95 127
96void kernel_map_pages(struct page *page, int numpages, int enable) 128void __kernel_map_pages(struct page *page, int numpages, int enable)
97{ 129{
130 if (!page_poisoning_enabled)
131 return;
132
98 if (enable) 133 if (enable)
99 unpoison_pages(page, numpages); 134 unpoison_pages(page, numpages);
100 else 135 else
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
117 __filemap_fdatawrite_range(mapping, offset, endbyte, 117 __filemap_fdatawrite_range(mapping, offset, endbyte,
118 WB_SYNC_NONE); 118 WB_SYNC_NONE);
119 119
120 /* First and last FULL page! */ 120 /*
121 * First and last FULL page! Partial pages are deliberately
122 * preserved on the expectation that it is better to preserve
123 * needed memory than to discard unneeded memory.
124 */
121 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 125 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
122 end_index = (endbyte >> PAGE_CACHE_SHIFT); 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
123 127
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
62/* 62/*
63 * Lock ordering: 63 * Lock ordering:
64 * 64 *
65 * ->i_mmap_mutex (truncate_pagecache) 65 * ->i_mmap_rwsem (truncate_pagecache)
66 * ->private_lock (__free_pte->__set_page_dirty_buffers) 66 * ->private_lock (__free_pte->__set_page_dirty_buffers)
67 * ->swap_lock (exclusive_swap_page, others) 67 * ->swap_lock (exclusive_swap_page, others)
68 * ->mapping->tree_lock 68 * ->mapping->tree_lock
69 * 69 *
70 * ->i_mutex 70 * ->i_mutex
71 * ->i_mmap_mutex (truncate->unmap_mapping_range) 71 * ->i_mmap_rwsem (truncate->unmap_mapping_range)
72 * 72 *
73 * ->mmap_sem 73 * ->mmap_sem
74 * ->i_mmap_mutex 74 * ->i_mmap_rwsem
75 * ->page_table_lock or pte_lock (various, mainly in memory.c) 75 * ->page_table_lock or pte_lock (various, mainly in memory.c)
76 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 76 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
77 * 77 *
@@ -85,7 +85,7 @@
85 * sb_lock (fs/fs-writeback.c) 85 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 86 * ->mapping->tree_lock (__sync_single_inode)
87 * 87 *
88 * ->i_mmap_mutex 88 * ->i_mmap_rwsem
89 * ->anon_vma.lock (vma_adjust) 89 * ->anon_vma.lock (vma_adjust)
90 * 90 *
91 * ->anon_vma.lock 91 * ->anon_vma.lock
@@ -105,7 +105,7 @@
105 * ->inode->i_lock (zap_pte_range->set_page_dirty) 105 * ->inode->i_lock (zap_pte_range->set_page_dirty)
106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
107 * 107 *
108 * ->i_mmap_mutex 108 * ->i_mmap_rwsem
109 * ->tasklist_lock (memory_failure, collect_procs_ao) 109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 */ 110 */
111 111
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
155EXPORT_SYMBOL_GPL(xip_file_read); 155EXPORT_SYMBOL_GPL(xip_file_read);
156 156
157/* 157/*
158 * __xip_unmap is invoked from xip_unmap and 158 * __xip_unmap is invoked from xip_unmap and xip_write
159 * xip_write
160 * 159 *
161 * This function walks all vmas of the address_space and unmaps the 160 * This function walks all vmas of the address_space and unmaps the
162 * __xip_sparse_page when found at pgoff. 161 * __xip_sparse_page when found at pgoff.
163 */ 162 */
164static void 163static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
165__xip_unmap (struct address_space * mapping,
166 unsigned long pgoff)
167{ 164{
168 struct vm_area_struct *vma; 165 struct vm_area_struct *vma;
169 struct mm_struct *mm;
170 unsigned long address;
171 pte_t *pte;
172 pte_t pteval;
173 spinlock_t *ptl;
174 struct page *page; 166 struct page *page;
175 unsigned count; 167 unsigned count;
176 int locked = 0; 168 int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
182 return; 174 return;
183 175
184retry: 176retry:
185 mutex_lock(&mapping->i_mmap_mutex); 177 i_mmap_lock_read(mapping);
186 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 178 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
187 mm = vma->vm_mm; 179 pte_t *pte, pteval;
188 address = vma->vm_start + 180 spinlock_t *ptl;
181 struct mm_struct *mm = vma->vm_mm;
182 unsigned long address = vma->vm_start +
189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 183 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
184
190 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 185 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
191 pte = page_check_address(page, mm, address, &ptl, 1); 186 pte = page_check_address(page, mm, address, &ptl, 1);
192 if (pte) { 187 if (pte) {
@@ -202,7 +197,7 @@ retry:
202 page_cache_release(page); 197 page_cache_release(page);
203 } 198 }
204 } 199 }
205 mutex_unlock(&mapping->i_mmap_mutex); 200 i_mmap_unlock_read(mapping);
206 201
207 if (locked) { 202 if (locked) {
208 mutex_unlock(&xip_sparse_mutex); 203 mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..11ef7ec40d13 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
238 } 238 }
239 goto out_freed; 239 goto out_freed;
240 } 240 }
241 mutex_lock(&mapping->i_mmap_mutex); 241 i_mmap_lock_write(mapping);
242 flush_dcache_mmap_lock(mapping); 242 flush_dcache_mmap_lock(mapping);
243 vma->vm_flags |= VM_NONLINEAR; 243 vma->vm_flags |= VM_NONLINEAR;
244 vma_interval_tree_remove(vma, &mapping->i_mmap); 244 vma_interval_tree_remove(vma, &mapping->i_mmap);
245 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 245 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
246 flush_dcache_mmap_unlock(mapping); 246 flush_dcache_mmap_unlock(mapping);
247 mutex_unlock(&mapping->i_mmap_mutex); 247 i_mmap_unlock_write(mapping);
248 } 248 }
249 249
250 if (vma->vm_flags & VM_LOCKED) { 250 if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 919b86a2164d..47f6070d7c46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1457 return 0; 1457 return 0;
1458 1458
1459found: 1459found:
1460 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); 1460 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
1461 /* Put them into a private list first because mem_map is not up yet */ 1461 /* Put them into a private list first because mem_map is not up yet */
1462 list_add(&m->list, &huge_boot_pages); 1462 list_add(&m->list, &huge_boot_pages);
1463 m->hstate = h; 1463 m->hstate = h;
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
2083 * devices of nodes that have memory. All on-line nodes should have 2083 * devices of nodes that have memory. All on-line nodes should have
2084 * registered their associated device by this time. 2084 * registered their associated device by this time.
2085 */ 2085 */
2086static void hugetlb_register_all_nodes(void) 2086static void __init hugetlb_register_all_nodes(void)
2087{ 2087{
2088 int nid; 2088 int nid;
2089 2089
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2726 * on its way out. We're lucky that the flag has such an appropriate 2726 * on its way out. We're lucky that the flag has such an appropriate
2727 * name, and can in fact be safely cleared here. We could clear it 2727 * name, and can in fact be safely cleared here. We could clear it
2728 * before the __unmap_hugepage_range above, but all that's necessary 2728 * before the __unmap_hugepage_range above, but all that's necessary
2729 * is to clear it before releasing the i_mmap_mutex. This works 2729 * is to clear it before releasing the i_mmap_rwsem. This works
2730 * because in the context this is called, the VMA is about to be 2730 * because in the context this is called, the VMA is about to be
2731 * destroyed and the i_mmap_mutex is held. 2731 * destroyed and the i_mmap_rwsem is held.
2732 */ 2732 */
2733 vma->vm_flags &= ~VM_MAYSHARE; 2733 vma->vm_flags &= ~VM_MAYSHARE;
2734} 2734}
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2774 * this mapping should be shared between all the VMAs, 2774 * this mapping should be shared between all the VMAs,
2775 * __unmap_hugepage_range() is called as the lock is already held 2775 * __unmap_hugepage_range() is called as the lock is already held
2776 */ 2776 */
2777 mutex_lock(&mapping->i_mmap_mutex); 2777 i_mmap_lock_write(mapping);
2778 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 2778 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2779 /* Do not unmap the current VMA */ 2779 /* Do not unmap the current VMA */
2780 if (iter_vma == vma) 2780 if (iter_vma == vma)
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2791 unmap_hugepage_range(iter_vma, address, 2791 unmap_hugepage_range(iter_vma, address,
2792 address + huge_page_size(h), page); 2792 address + huge_page_size(h), page);
2793 } 2793 }
2794 mutex_unlock(&mapping->i_mmap_mutex); 2794 i_mmap_unlock_write(mapping);
2795} 2795}
2796 2796
2797/* 2797/*
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3348 flush_cache_range(vma, address, end); 3348 flush_cache_range(vma, address, end);
3349 3349
3350 mmu_notifier_invalidate_range_start(mm, start, end); 3350 mmu_notifier_invalidate_range_start(mm, start, end);
3351 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 3351 i_mmap_lock_write(vma->vm_file->f_mapping);
3352 for (; address < end; address += huge_page_size(h)) { 3352 for (; address < end; address += huge_page_size(h)) {
3353 spinlock_t *ptl; 3353 spinlock_t *ptl;
3354 ptep = huge_pte_offset(mm, address); 3354 ptep = huge_pte_offset(mm, address);
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3370 spin_unlock(ptl); 3370 spin_unlock(ptl);
3371 } 3371 }
3372 /* 3372 /*
3373 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare 3373 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
3374 * may have cleared our pud entry and done put_page on the page table: 3374 * may have cleared our pud entry and done put_page on the page table:
3375 * once we release i_mmap_mutex, another task can do the final put_page 3375 * once we release i_mmap_rwsem, another task can do the final put_page
3376 * and that page table be reused and filled with junk. 3376 * and that page table be reused and filled with junk.
3377 */ 3377 */
3378 flush_tlb_range(vma, start, end); 3378 flush_tlb_range(vma, start, end);
3379 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3379 i_mmap_unlock_write(vma->vm_file->f_mapping);
3380 mmu_notifier_invalidate_range_end(mm, start, end); 3380 mmu_notifier_invalidate_range_end(mm, start, end);
3381 3381
3382 return pages << h->order; 3382 return pages << h->order;
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3525 * and returns the corresponding pte. While this is not necessary for the 3525 * and returns the corresponding pte. While this is not necessary for the
3526 * !shared pmd case because we can allocate the pmd later as well, it makes the 3526 * !shared pmd case because we can allocate the pmd later as well, it makes the
3527 * code much cleaner. pmd allocation is essential for the shared case because 3527 * code much cleaner. pmd allocation is essential for the shared case because
3528 * pud has to be populated inside the same i_mmap_mutex section - otherwise 3528 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
3529 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 3529 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
3530 * bad pmd for sharing. 3530 * bad pmd for sharing.
3531 */ 3531 */
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3544 if (!vma_shareable(vma, addr)) 3544 if (!vma_shareable(vma, addr))
3545 return (pte_t *)pmd_alloc(mm, pud, addr); 3545 return (pte_t *)pmd_alloc(mm, pud, addr);
3546 3546
3547 mutex_lock(&mapping->i_mmap_mutex); 3547 i_mmap_lock_write(mapping);
3548 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 3548 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
3549 if (svma == vma) 3549 if (svma == vma)
3550 continue; 3550 continue;
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3572 spin_unlock(ptl); 3572 spin_unlock(ptl);
3573out: 3573out:
3574 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3574 pte = (pte_t *)pmd_alloc(mm, pud, addr);
3575 mutex_unlock(&mapping->i_mmap_mutex); 3575 i_mmap_unlock_write(mapping);
3576 return pte; 3576 return pte;
3577} 3577}
3578 3578
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
715} 715}
716 716
717/** 717/**
718 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
719 * @base: the base phys addr of the region
720 * @size: the size of the region
721 * 718 *
722 * This function isolates region [@base, @base + @size), and mark it with flag 719 * This function isolates region [@base, @base + @size), and sets/clears flag
723 * MEMBLOCK_HOTPLUG.
724 * 720 *
725 * Return 0 on success, -errno on failure. 721 * Return 0 on success, -errno on failure.
726 */ 722 */
727int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) 723static int __init_memblock memblock_setclr_flag(phys_addr_t base,
724 phys_addr_t size, int set, int flag)
728{ 725{
729 struct memblock_type *type = &memblock.memory; 726 struct memblock_type *type = &memblock.memory;
730 int i, ret, start_rgn, end_rgn; 727 int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
734 return ret; 731 return ret;
735 732
736 for (i = start_rgn; i < end_rgn; i++) 733 for (i = start_rgn; i < end_rgn; i++)
737 memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); 734 if (set)
735 memblock_set_region_flags(&type->regions[i], flag);
736 else
737 memblock_clear_region_flags(&type->regions[i], flag);
738 738
739 memblock_merge_regions(type); 739 memblock_merge_regions(type);
740 return 0; 740 return 0;
741} 741}
742 742
743/** 743/**
744 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. 744 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
745 * @base: the base phys addr of the region 745 * @base: the base phys addr of the region
746 * @size: the size of the region 746 * @size: the size of the region
747 * 747 *
748 * This function isolates region [@base, @base + @size), and clear flag 748 * Return 0 on success, -errno on failure.
749 * MEMBLOCK_HOTPLUG for the isolated regions. 749 */
750int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
751{
752 return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
753}
754
755/**
756 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
757 * @base: the base phys addr of the region
758 * @size: the size of the region
750 * 759 *
751 * Return 0 on success, -errno on failure. 760 * Return 0 on success, -errno on failure.
752 */ 761 */
753int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) 762int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
754{ 763{
755 struct memblock_type *type = &memblock.memory; 764 return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
756 int i, ret, start_rgn, end_rgn;
757
758 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
759 if (ret)
760 return ret;
761
762 for (i = start_rgn; i < end_rgn; i++)
763 memblock_clear_region_flags(&type->regions[i],
764 MEMBLOCK_HOTPLUG);
765
766 memblock_merge_regions(type);
767 return 0;
768} 765}
769 766
770/** 767/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85df503ec023..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
296 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
297 */ 297 */
298 bool use_hierarchy; 298 bool use_hierarchy;
299 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
300 299
301 bool oom_lock; 300 bool oom_lock;
302 atomic_t under_oom; 301 atomic_t under_oom;
@@ -366,22 +365,11 @@ struct mem_cgroup {
366 /* WARNING: nodeinfo must be the last member here */ 365 /* WARNING: nodeinfo must be the last member here */
367}; 366};
368 367
369/* internal only representation about the status of kmem accounting. */
370enum {
371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
372};
373
374#ifdef CONFIG_MEMCG_KMEM 368#ifdef CONFIG_MEMCG_KMEM
375static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
376{
377 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
378}
379
380static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 369static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
381{ 370{
382 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 371 return memcg->kmemcg_id >= 0;
383} 372}
384
385#endif 373#endif
386 374
387/* Stuffs for move charges at task migration. */ 375/* Stuffs for move charges at task migration. */
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1571 * select it. The goal is to allow it to allocate so that it may 1559 * select it. The goal is to allow it to allocate so that it may
1572 * quickly exit and free its memory. 1560 * quickly exit and free its memory.
1573 */ 1561 */
1574 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1575 set_thread_flag(TIF_MEMDIE); 1563 set_thread_flag(TIF_MEMDIE);
1576 return; 1564 return;
1577 } 1565 }
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1628 NULL, "Memory cgroup out of memory"); 1616 NULL, "Memory cgroup out of memory");
1629} 1617}
1630 1618
1619#if MAX_NUMNODES > 1
1620
1631/** 1621/**
1632 * test_mem_cgroup_node_reclaimable 1622 * test_mem_cgroup_node_reclaimable
1633 * @memcg: the target memcg 1623 * @memcg: the target memcg
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1650 return false; 1640 return false;
1651 1641
1652} 1642}
1653#if MAX_NUMNODES > 1
1654 1643
1655/* 1644/*
1656 * Always updating the nodemask is not very good - even if we have an empty 1645 * Always updating the nodemask is not very good - even if we have an empty
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
2646 if (!cachep) 2635 if (!cachep)
2647 return; 2636 return;
2648 2637
2649 css_get(&memcg->css);
2650 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2651 2639
2652 /* 2640 /*
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
2680 list_del(&cachep->memcg_params->list); 2668 list_del(&cachep->memcg_params->list);
2681 2669
2682 kmem_cache_destroy(cachep); 2670 kmem_cache_destroy(cachep);
2683
2684 /* drop the reference taken in memcg_register_cache */
2685 css_put(&memcg->css);
2686}
2687
2688/*
2689 * During the creation a new cache, we need to disable our accounting mechanism
2690 * altogether. This is true even if we are not creating, but rather just
2691 * enqueing new caches to be created.
2692 *
2693 * This is because that process will trigger allocations; some visible, like
2694 * explicit kmallocs to auxiliary data structures, name strings and internal
2695 * cache structures; some well concealed, like INIT_WORK() that can allocate
2696 * objects during debug.
2697 *
2698 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
2699 * to it. This may not be a bounded recursion: since the first cache creation
2700 * failed to complete (waiting on the allocation), we'll just try to create the
2701 * cache again, failing at the same point.
2702 *
2703 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
2704 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
2705 * inside the following two functions.
2706 */
2707static inline void memcg_stop_kmem_account(void)
2708{
2709 VM_BUG_ON(!current->mm);
2710 current->memcg_kmem_skip_account++;
2711}
2712
2713static inline void memcg_resume_kmem_account(void)
2714{
2715 VM_BUG_ON(!current->mm);
2716 current->memcg_kmem_skip_account--;
2717} 2671}
2718 2672
2719int __memcg_cleanup_cache_params(struct kmem_cache *s) 2673int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2747 mutex_lock(&memcg_slab_mutex); 2701 mutex_lock(&memcg_slab_mutex);
2748 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2749 cachep = memcg_params_to_cache(params); 2703 cachep = memcg_params_to_cache(params);
2750 kmem_cache_shrink(cachep); 2704 memcg_unregister_cache(cachep);
2751 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
2752 memcg_unregister_cache(cachep);
2753 } 2705 }
2754 mutex_unlock(&memcg_slab_mutex); 2706 mutex_unlock(&memcg_slab_mutex);
2755} 2707}
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2784 struct memcg_register_cache_work *cw; 2736 struct memcg_register_cache_work *cw;
2785 2737
2786 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2787 if (cw == NULL) { 2739 if (!cw)
2788 css_put(&memcg->css);
2789 return; 2740 return;
2790 } 2741
2742 css_get(&memcg->css);
2791 2743
2792 cw->memcg = memcg; 2744 cw->memcg = memcg;
2793 cw->cachep = cachep; 2745 cw->cachep = cachep;
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
2810 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2762 * this point we can't allow ourselves back into memcg_kmem_get_cache,
2811 * the safest choice is to do it like this, wrapping the whole function. 2763 * the safest choice is to do it like this, wrapping the whole function.
2812 */ 2764 */
2813 memcg_stop_kmem_account(); 2765 current->memcg_kmem_skip_account = 1;
2814 __memcg_schedule_register_cache(memcg, cachep); 2766 __memcg_schedule_register_cache(memcg, cachep);
2815 memcg_resume_kmem_account(); 2767 current->memcg_kmem_skip_account = 0;
2816} 2768}
2817 2769
2818int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2770int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
2819{ 2771{
2820 unsigned int nr_pages = 1 << order; 2772 unsigned int nr_pages = 1 << order;
2821 int res;
2822 2773
2823 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
2824 if (!res)
2825 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
2826 return res;
2827} 2775}
2828 2776
2829void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2777void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
2831 unsigned int nr_pages = 1 << order; 2779 unsigned int nr_pages = 1 << order;
2832 2780
2833 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2834 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
2835} 2782}
2836 2783
2837/* 2784/*
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
2847 * Can't be called in interrupt context or from kernel threads. 2794 * Can't be called in interrupt context or from kernel threads.
2848 * This function needs to be called with rcu_read_lock() held. 2795 * This function needs to be called with rcu_read_lock() held.
2849 */ 2796 */
2850struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 2797struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2851 gfp_t gfp)
2852{ 2798{
2853 struct mem_cgroup *memcg; 2799 struct mem_cgroup *memcg;
2854 struct kmem_cache *memcg_cachep; 2800 struct kmem_cache *memcg_cachep;
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
2856 VM_BUG_ON(!cachep->memcg_params); 2802 VM_BUG_ON(!cachep->memcg_params);
2857 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2858 2804
2859 if (!current->mm || current->memcg_kmem_skip_account) 2805 if (current->memcg_kmem_skip_account)
2860 return cachep; 2806 return cachep;
2861 2807
2862 rcu_read_lock(); 2808 memcg = get_mem_cgroup_from_mm(current->mm);
2863 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
2864
2865 if (!memcg_kmem_is_active(memcg)) 2809 if (!memcg_kmem_is_active(memcg))
2866 goto out; 2810 goto out;
2867 2811
2868 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
2869 if (likely(memcg_cachep)) { 2813 if (likely(memcg_cachep))
2870 cachep = memcg_cachep; 2814 return memcg_cachep;
2871 goto out;
2872 }
2873
2874 /* The corresponding put will be done in the workqueue. */
2875 if (!css_tryget_online(&memcg->css))
2876 goto out;
2877 rcu_read_unlock();
2878 2815
2879 /* 2816 /*
2880 * If we are in a safe context (can wait, and not in interrupt 2817 * If we are in a safe context (can wait, and not in interrupt
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
2889 * defer everything. 2826 * defer everything.
2890 */ 2827 */
2891 memcg_schedule_register_cache(memcg, cachep); 2828 memcg_schedule_register_cache(memcg, cachep);
2892 return cachep;
2893out: 2829out:
2894 rcu_read_unlock(); 2830 css_put(&memcg->css);
2895 return cachep; 2831 return cachep;
2896} 2832}
2897 2833
2834void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2835{
2836 if (!is_root_cache(cachep))
2837 css_put(&cachep->memcg_params->memcg->css);
2838}
2839
2898/* 2840/*
2899 * We need to verify if the allocation against current->mm->owner's memcg is 2841 * We need to verify if the allocation against current->mm->owner's memcg is
2900 * possible for the given order. But the page is not allocated yet, so we'll 2842 * possible for the given order. But the page is not allocated yet, so we'll
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
2917 2859
2918 *_memcg = NULL; 2860 *_memcg = NULL;
2919 2861
2920 /*
2921 * Disabling accounting is only relevant for some specific memcg
2922 * internal allocations. Therefore we would initially not have such
2923 * check here, since direct calls to the page allocator that are
2924 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
2925 * outside memcg core. We are mostly concerned with cache allocations,
2926 * and by having this test at memcg_kmem_get_cache, we are already able
2927 * to relay the allocation to the root cache and bypass the memcg cache
2928 * altogether.
2929 *
2930 * There is one exception, though: the SLUB allocator does not create
2931 * large order caches, but rather service large kmallocs directly from
2932 * the page allocator. Therefore, the following sequence when backed by
2933 * the SLUB allocator:
2934 *
2935 * memcg_stop_kmem_account();
2936 * kmalloc(<large_number>)
2937 * memcg_resume_kmem_account();
2938 *
2939 * would effectively ignore the fact that we should skip accounting,
2940 * since it will drive us directly to this function without passing
2941 * through the cache selector memcg_kmem_get_cache. Such large
2942 * allocations are extremely rare but can happen, for instance, for the
2943 * cache arrays. We bring this test here.
2944 */
2945 if (!current->mm || current->memcg_kmem_skip_account)
2946 return true;
2947
2948 memcg = get_mem_cgroup_from_mm(current->mm); 2862 memcg = get_mem_cgroup_from_mm(current->mm);
2949 2863
2950 if (!memcg_kmem_is_active(memcg)) { 2864 if (!memcg_kmem_is_active(memcg)) {
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2985 memcg_uncharge_kmem(memcg, 1 << order); 2899 memcg_uncharge_kmem(memcg, 1 << order);
2986 page->mem_cgroup = NULL; 2900 page->mem_cgroup = NULL;
2987} 2901}
2988#else
2989static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2990{
2991}
2992#endif /* CONFIG_MEMCG_KMEM */ 2902#endif /* CONFIG_MEMCG_KMEM */
2993 2903
2994#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2904#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3539 return 0; 3449 return 0;
3540 3450
3541 /* 3451 /*
3542 * We are going to allocate memory for data shared by all memory
3543 * cgroups so let's stop accounting here.
3544 */
3545 memcg_stop_kmem_account();
3546
3547 /*
3548 * For simplicity, we won't allow this to be disabled. It also can't 3452 * For simplicity, we won't allow this to be disabled. It also can't
3549 * be changed if the cgroup has children already, or if tasks had 3453 * be changed if the cgroup has children already, or if tasks had
3550 * already joined. 3454 * already joined.
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3570 goto out; 3474 goto out;
3571 } 3475 }
3572 3476
3573 memcg->kmemcg_id = memcg_id;
3574 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3575
3576 /* 3477 /*
3577 * We couldn't have accounted to this cgroup, because it hasn't got the 3478 * We couldn't have accounted to this cgroup, because it hasn't got
3578 * active bit set yet, so this should succeed. 3479 * activated yet, so this should succeed.
3579 */ 3480 */
3580 err = page_counter_limit(&memcg->kmem, nr_pages); 3481 err = page_counter_limit(&memcg->kmem, nr_pages);
3581 VM_BUG_ON(err); 3482 VM_BUG_ON(err);
3582 3483
3583 static_key_slow_inc(&memcg_kmem_enabled_key); 3484 static_key_slow_inc(&memcg_kmem_enabled_key);
3584 /* 3485 /*
3585 * Setting the active bit after enabling static branching will 3486 * A memory cgroup is considered kmem-active as soon as it gets
3487 * kmemcg_id. Setting the id after enabling static branching will
3586 * guarantee no one starts accounting before all call sites are 3488 * guarantee no one starts accounting before all call sites are
3587 * patched. 3489 * patched.
3588 */ 3490 */
3589 memcg_kmem_set_active(memcg); 3491 memcg->kmemcg_id = memcg_id;
3590out: 3492out:
3591 memcg_resume_kmem_account();
3592 return err; 3493 return err;
3593} 3494}
3594 3495
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
3791} 3692}
3792#endif /* CONFIG_NUMA */ 3693#endif /* CONFIG_NUMA */
3793 3694
3794static inline void mem_cgroup_lru_names_not_uptodate(void)
3795{
3796 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3797}
3798
3799static int memcg_stat_show(struct seq_file *m, void *v) 3695static int memcg_stat_show(struct seq_file *m, void *v)
3800{ 3696{
3801 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3697 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3803 struct mem_cgroup *mi; 3699 struct mem_cgroup *mi;
3804 unsigned int i; 3700 unsigned int i;
3805 3701
3702 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3703
3806 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3704 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3807 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3705 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
3808 continue; 3706 continue;
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4259{ 4157{
4260 int ret; 4158 int ret;
4261 4159
4262 memcg->kmemcg_id = -1;
4263 ret = memcg_propagate_kmem(memcg); 4160 ret = memcg_propagate_kmem(memcg);
4264 if (ret) 4161 if (ret)
4265 return ret; 4162 return ret;
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4269 4166
4270static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4167static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4271{ 4168{
4169 memcg_unregister_all_caches(memcg);
4272 mem_cgroup_sockets_destroy(memcg); 4170 mem_cgroup_sockets_destroy(memcg);
4273} 4171}
4274#else 4172#else
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4724 4622
4725 free_percpu(memcg->stat); 4623 free_percpu(memcg->stat);
4726 4624
4727 /*
4728 * We need to make sure that (at least for now), the jump label
4729 * destruction code runs outside of the cgroup lock. This is because
4730 * get_online_cpus(), which is called from the static_branch update,
4731 * can't be called inside the cgroup_lock. cpusets are the ones
4732 * enforcing this dependency, so if they ever change, we might as well.
4733 *
4734 * schedule_work() will guarantee this happens. Be careful if you need
4735 * to move this code around, and make sure it is outside
4736 * the cgroup_lock.
4737 */
4738 disarm_static_keys(memcg); 4625 disarm_static_keys(memcg);
4739 kfree(memcg); 4626 kfree(memcg);
4740} 4627}
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4804 vmpressure_init(&memcg->vmpressure); 4691 vmpressure_init(&memcg->vmpressure);
4805 INIT_LIST_HEAD(&memcg->event_list); 4692 INIT_LIST_HEAD(&memcg->event_list);
4806 spin_lock_init(&memcg->event_list_lock); 4693 spin_lock_init(&memcg->event_list_lock);
4694#ifdef CONFIG_MEMCG_KMEM
4695 memcg->kmemcg_id = -1;
4696 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4697#endif
4807 4698
4808 return &memcg->css; 4699 return &memcg->css;
4809 4700
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4885 } 4776 }
4886 spin_unlock(&memcg->event_list_lock); 4777 spin_unlock(&memcg->event_list_lock);
4887 4778
4888 memcg_unregister_all_caches(memcg);
4889 vmpressure_cleanup(&memcg->vmpressure); 4779 vmpressure_cleanup(&memcg->vmpressure);
4890} 4780}
4891 4781
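
The net effect of the memcontrol.c changes above is that a per-memcg cache lookup now pins the memcg's css and a matching put releases it once the allocation is done. Below is a minimal illustrative sketch of that pairing, using only the signatures visible in the hunks; the helper name example_memcg_alloc is invented here, and real callers presumably go through inline wrappers in the memcontrol headers rather than the __ variants directly.

/*
 * Illustrative only: how the new get/put pair is expected to bracket an
 * allocation from a per-memcg kmem cache.
 */
static void *example_memcg_alloc(struct kmem_cache *cachep, gfp_t gfp)
{
	struct kmem_cache *s;
	void *obj;

	/* May return a per-memcg child cache; holds a css reference then. */
	s = __memcg_kmem_get_cache(cachep);

	obj = kmem_cache_alloc(s, gfp);

	/* Drops the css reference; a no-op when s is the root cache. */
	__memcg_kmem_put_cache(s);

	return obj;
}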
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5ee0ca7ae85..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access)
239 } 239 }
240 240
241 /* 241 /*
242 * Only call shrink_slab here (which would also shrink other caches) if 242 * Only call shrink_node_slabs here (which would also shrink
243 * access is not potentially fatal. 243 * other caches) if access is not potentially fatal.
244 */ 244 */
245 if (access) { 245 if (access) {
246 int nr; 246 int nr;
247 int nid = page_to_nid(p); 247 int nid = page_to_nid(p);
248 do { 248 do {
249 struct shrink_control shrink = { 249 nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
250 .gfp_mask = GFP_KERNEL,
251 };
252 node_set(nid, shrink.nodes_to_scan);
253
254 nr = shrink_slab(&shrink, 1000, 1000);
255 if (page_count(p) == 1) 250 if (page_count(p) == 1)
256 break; 251 break;
257 } while (nr > 10); 252 } while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
466 struct task_struct *tsk; 461 struct task_struct *tsk;
467 struct address_space *mapping = page->mapping; 462 struct address_space *mapping = page->mapping;
468 463
469 mutex_lock(&mapping->i_mmap_mutex); 464 i_mmap_lock_read(mapping);
470 read_lock(&tasklist_lock); 465 read_lock(&tasklist_lock);
471 for_each_process(tsk) { 466 for_each_process(tsk) {
472 pgoff_t pgoff = page_to_pgoff(page); 467 pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
488 } 483 }
489 } 484 }
490 read_unlock(&tasklist_lock); 485 read_unlock(&tasklist_lock);
491 mutex_unlock(&mapping->i_mmap_mutex); 486 i_mmap_unlock_read(mapping);
492} 487}
493 488
494/* 489/*
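
shake_page() no longer builds a struct shrink_control; it simply names the node to drain. A condensed sketch of the new loop follows, assuming the argument order shown in the hunk above (gfp mask, node id, nr scanned, nr eligible); example_drain_node_slabs is an invented name.

/* Illustrative: shrink slab caches on one node until progress stalls. */
static void example_drain_node_slabs(int nid)
{
	int nr;

	do {
		/* Same magic constants as shake_page(). */
		nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
	} while (nr > 10);
}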
diff --git a/mm/memory.c b/mm/memory.c
index 4b5a282e1107..fbf74112de5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1326 * safe to do nothing in this case. 1326 * safe to do nothing in this case.
1327 */ 1327 */
1328 if (vma->vm_file) { 1328 if (vma->vm_file) {
1329 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 1329 i_mmap_lock_write(vma->vm_file->f_mapping);
1330 __unmap_hugepage_range_final(tlb, vma, start, end, NULL); 1330 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1331 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 1331 i_mmap_unlock_write(vma->vm_file->f_mapping);
1332 } 1332 }
1333 } else 1333 } else
1334 unmap_page_range(tlb, vma, start, end, details); 1334 unmap_page_range(tlb, vma, start, end, details);
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping,
2377 details.last_index = ULONG_MAX; 2377 details.last_index = ULONG_MAX;
2378 2378
2379 2379
2380 mutex_lock(&mapping->i_mmap_mutex); 2380 i_mmap_lock_read(mapping);
2381 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2381 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2382 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2382 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2383 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2383 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2384 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2384 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2385 mutex_unlock(&mapping->i_mmap_mutex); 2385 i_mmap_unlock_read(mapping);
2386} 2386}
2387EXPORT_SYMBOL(unmap_mapping_range); 2387EXPORT_SYMBOL(unmap_mapping_range);
2388 2388
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3365 3365
3366 return ret; 3366 return ret;
3367} 3367}
3368EXPORT_SYMBOL_GPL(handle_mm_fault);
3368 3369
3369#ifndef __PAGETABLE_PUD_FOLDED 3370#ifndef __PAGETABLE_PUD_FOLDED
3370/* 3371/*
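
i_mmap_mutex is replaced by i_mmap_rwsem throughout this series: paths that only walk the interval tree (unmap_mapping_range(), collect_procs_file()) take it for read, while paths that modify the tree take it for write. A minimal read-side sketch, using only helpers that appear in these hunks (example_walk_file_rmap is an invented name):

/*
 * Illustrative only: read-side walkers of a file's rmap tree can now run
 * in parallel under the rwsem.
 */
static void example_walk_file_rmap(struct address_space *mapping,
				   pgoff_t first, pgoff_t last)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
		/* read-only inspection of each mapping VMA goes here */
	}
	i_mmap_unlock_read(mapping);
}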
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..253474c22239 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
746 * MIGRATEPAGE_SUCCESS - success 746 * MIGRATEPAGE_SUCCESS - success
747 */ 747 */
748static int move_to_new_page(struct page *newpage, struct page *page, 748static int move_to_new_page(struct page *newpage, struct page *page,
749 int remap_swapcache, enum migrate_mode mode) 749 int page_was_mapped, enum migrate_mode mode)
750{ 750{
751 struct address_space *mapping; 751 struct address_space *mapping;
752 int rc; 752 int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
784 newpage->mapping = NULL; 784 newpage->mapping = NULL;
785 } else { 785 } else {
786 mem_cgroup_migrate(page, newpage, false); 786 mem_cgroup_migrate(page, newpage, false);
787 if (remap_swapcache) 787 if (page_was_mapped)
788 remove_migration_ptes(page, newpage); 788 remove_migration_ptes(page, newpage);
789 page->mapping = NULL; 789 page->mapping = NULL;
790 } 790 }
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
798 int force, enum migrate_mode mode) 798 int force, enum migrate_mode mode)
799{ 799{
800 int rc = -EAGAIN; 800 int rc = -EAGAIN;
801 int remap_swapcache = 1; 801 int page_was_mapped = 0;
802 struct anon_vma *anon_vma = NULL; 802 struct anon_vma *anon_vma = NULL;
803 803
804 if (!trylock_page(page)) { 804 if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
870 * migrated but are not remapped when migration 870 * migrated but are not remapped when migration
871 * completes 871 * completes
872 */ 872 */
873 remap_swapcache = 0;
874 } else { 873 } else {
875 goto out_unlock; 874 goto out_unlock;
876 } 875 }
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
910 } 909 }
911 910
912 /* Establish migration ptes or remove ptes */ 911 /* Establish migration ptes or remove ptes */
913 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 912 if (page_mapped(page)) {
913 try_to_unmap(page,
914 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
915 page_was_mapped = 1;
916 }
914 917
915skip_unmap: 918skip_unmap:
916 if (!page_mapped(page)) 919 if (!page_mapped(page))
917 rc = move_to_new_page(newpage, page, remap_swapcache, mode); 920 rc = move_to_new_page(newpage, page, page_was_mapped, mode);
918 921
919 if (rc && remap_swapcache) 922 if (rc && page_was_mapped)
920 remove_migration_ptes(page, page); 923 remove_migration_ptes(page, page);
921 924
922 /* Drop an anon_vma reference if we took one */ 925 /* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1017{ 1020{
1018 int rc = 0; 1021 int rc = 0;
1019 int *result = NULL; 1022 int *result = NULL;
1023 int page_was_mapped = 0;
1020 struct page *new_hpage; 1024 struct page *new_hpage;
1021 struct anon_vma *anon_vma = NULL; 1025 struct anon_vma *anon_vma = NULL;
1022 1026
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1047 if (PageAnon(hpage)) 1051 if (PageAnon(hpage))
1048 anon_vma = page_get_anon_vma(hpage); 1052 anon_vma = page_get_anon_vma(hpage);
1049 1053
1050 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1054 if (page_mapped(hpage)) {
1055 try_to_unmap(hpage,
1056 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1057 page_was_mapped = 1;
1058 }
1051 1059
1052 if (!page_mapped(hpage)) 1060 if (!page_mapped(hpage))
1053 rc = move_to_new_page(new_hpage, hpage, 1, mode); 1061 rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
1054 1062
1055 if (rc != MIGRATEPAGE_SUCCESS) 1063 if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
1056 remove_migration_ptes(hpage, hpage); 1064 remove_migration_ptes(hpage, hpage);
1057 1065
1058 if (anon_vma) 1066 if (anon_vma)
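
The migrate.c hunks replace the remap_swapcache flag with page_was_mapped and only call try_to_unmap() when the page is actually mapped. Written out flat, the resulting core of __unmap_and_move() looks roughly like this (a condensation of the hunks above, not new code; example_migrate_core is an invented name):

static int example_migrate_core(struct page *page, struct page *newpage,
				enum migrate_mode mode)
{
	int page_was_mapped = 0;
	int rc = -EAGAIN;

	/* Establish migration ptes only if there is something to unmap. */
	if (page_mapped(page)) {
		try_to_unmap(page,
			TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
		page_was_mapped = 1;
	}

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, page_was_mapped, mode);

	/* Undo the migration ptes only if we actually installed them. */
	if (rc && page_was_mapped)
		remove_migration_ptes(page, page);

	return rc;
}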
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..c8c528b36641 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
137 } else { /* pte is a swap entry */ 137 } else { /* pte is a swap entry */
138 swp_entry_t entry = pte_to_swp_entry(pte); 138 swp_entry_t entry = pte_to_swp_entry(pte);
139 139
140 if (is_migration_entry(entry)) { 140 if (non_swap_entry(entry)) {
141 /* migration entries are always uptodate */ 141 /*
142 * migration or hwpoison entries are always
143 * uptodate
144 */
142 *vec = 1; 145 *vec = 1;
143 } else { 146 } else {
144#ifdef CONFIG_SWAP 147#ifdef CONFIG_SWAP
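
The mincore change widens the is_migration_entry() test to non_swap_entry(), so hwpoison entries are reported the same way as migration entries. A tiny sketch of the classification, reusing only calls from the hunk (example_swap_pte_uptodate is an invented name):

/* Illustrative: a non-swap entry (migration or hwpoison) is reported as
 * uptodate/resident by mincore, without probing the swap cache. */
static int example_swap_pte_uptodate(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	return non_swap_entry(entry);
}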
diff --git a/mm/mmap.c b/mm/mmap.c
index b6c0a77fc1c8..7b36aa7cc89a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
232} 232}
233 233
234/* 234/*
235 * Requires inode->i_mapping->i_mmap_mutex 235 * Requires inode->i_mapping->i_mmap_rwsem
236 */ 236 */
237static void __remove_shared_vm_struct(struct vm_area_struct *vma, 237static void __remove_shared_vm_struct(struct vm_area_struct *vma,
238 struct file *file, struct address_space *mapping) 238 struct file *file, struct address_space *mapping)
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
260 260
261 if (file) { 261 if (file) {
262 struct address_space *mapping = file->f_mapping; 262 struct address_space *mapping = file->f_mapping;
263 mutex_lock(&mapping->i_mmap_mutex); 263 i_mmap_lock_write(mapping);
264 __remove_shared_vm_struct(vma, file, mapping); 264 __remove_shared_vm_struct(vma, file, mapping);
265 mutex_unlock(&mapping->i_mmap_mutex); 265 i_mmap_unlock_write(mapping);
266 } 266 }
267} 267}
268 268
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
674 674
675 if (vma->vm_file) { 675 if (vma->vm_file) {
676 mapping = vma->vm_file->f_mapping; 676 mapping = vma->vm_file->f_mapping;
677 mutex_lock(&mapping->i_mmap_mutex); 677 i_mmap_lock_write(mapping);
678 } 678 }
679 679
680 __vma_link(mm, vma, prev, rb_link, rb_parent); 680 __vma_link(mm, vma, prev, rb_link, rb_parent);
681 __vma_link_file(vma); 681 __vma_link_file(vma);
682 682
683 if (mapping) 683 if (mapping)
684 mutex_unlock(&mapping->i_mmap_mutex); 684 i_mmap_unlock_write(mapping);
685 685
686 mm->map_count++; 686 mm->map_count++;
687 validate_mm(mm); 687 validate_mm(mm);
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end);
796 next->vm_end); 796 next->vm_end);
797 } 797 }
798 798
799 mutex_lock(&mapping->i_mmap_mutex); 799 i_mmap_lock_write(mapping);
800 if (insert) { 800 if (insert) {
801 /* 801 /*
802 * Put into interval tree now, so instantiated pages 802 * Put into interval tree now, so instantiated pages
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end);
883 anon_vma_unlock_write(anon_vma); 883 anon_vma_unlock_write(anon_vma);
884 } 884 }
885 if (mapping) 885 if (mapping)
886 mutex_unlock(&mapping->i_mmap_mutex); 886 i_mmap_unlock_write(mapping);
887 887
888 if (root) { 888 if (root) {
889 uprobe_mmap(vma); 889 uprobe_mmap(vma);
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2362} 2362}
2363#endif 2363#endif
2364 2364
2365EXPORT_SYMBOL_GPL(find_extend_vma);
2366
2365/* 2367/*
2366 * Ok - we have the memory areas we should free on the vma list, 2368 * Ok - we have the memory areas we should free on the vma list,
2367 * so release them, and do the vma updates. 2369 * so release them, and do the vma updates.
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm)
2791 2793
2792/* Insert vm structure into process list sorted by address 2794/* Insert vm structure into process list sorted by address
2793 * and into the inode's i_mmap tree. If vm_file is non-NULL 2795 * and into the inode's i_mmap tree. If vm_file is non-NULL
2794 * then i_mmap_mutex is taken here. 2796 * then i_mmap_rwsem is taken here.
2795 */ 2797 */
2796int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2798int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2797{ 2799{
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3086 */ 3088 */
3087 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3089 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3088 BUG(); 3090 BUG();
3089 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3091 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3090 } 3092 }
3091} 3093}
3092 3094
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3113 * vma in this mm is backed by the same anon_vma or address_space. 3115 * vma in this mm is backed by the same anon_vma or address_space.
3114 * 3116 *
3115 * We can take all the locks in random order because the VM code 3117 * We can take all the locks in random order because the VM code
3116 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3118 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
3117 * takes more than one of them in a row. Secondly we're protected 3119 * takes more than one of them in a row. Secondly we're protected
3118 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3120 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3119 * 3121 *
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
3182 * AS_MM_ALL_LOCKS can't change to 0 from under us 3184 * AS_MM_ALL_LOCKS can't change to 0 from under us
3183 * because we hold the mm_all_locks_mutex. 3185 * because we hold the mm_all_locks_mutex.
3184 */ 3186 */
3185 mutex_unlock(&mapping->i_mmap_mutex); 3187 i_mmap_unlock_write(mapping);
3186 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3188 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3187 &mapping->flags)) 3189 &mapping->flags))
3188 BUG(); 3190 BUG();
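
On the write side of the same conversion, every place that used to take mapping->i_mmap_mutex now takes i_mmap_rwsem for write, with down_write_nest_lock() in the mm_take_all_locks() path to keep lockdep happy. A minimal write-side sketch, again using only helpers visible in this diff (example_link_vma_to_file is an invented simplification; mmap.c itself goes through __vma_link_file()):

static void example_link_vma_to_file(struct vm_area_struct *vma)
{
	struct address_space *mapping;

	if (!vma->vm_file)
		return;

	mapping = vma->vm_file->f_mapping;

	/* Mutating the rmap tree requires the rwsem in write mode. */
	i_mmap_lock_write(mapping);
	vma_interval_tree_insert(vma, &mapping->i_mmap);
	i_mmap_unlock_write(mapping);
}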
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..84aa36f9f308 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
99 spinlock_t *old_ptl, *new_ptl; 99 spinlock_t *old_ptl, *new_ptl;
100 100
101 /* 101 /*
102 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma 102 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
103 * locks to ensure that rmap will always observe either the old or the 103 * locks to ensure that rmap will always observe either the old or the
104 * new ptes. This is the easiest way to avoid races with 104 * new ptes. This is the easiest way to avoid races with
105 * truncate_pagecache(), page migration, etc... 105 * truncate_pagecache(), page migration, etc...
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 if (need_rmap_locks) { 119 if (need_rmap_locks) {
120 if (vma->vm_file) { 120 if (vma->vm_file) {
121 mapping = vma->vm_file->f_mapping; 121 mapping = vma->vm_file->f_mapping;
122 mutex_lock(&mapping->i_mmap_mutex); 122 i_mmap_lock_write(mapping);
123 } 123 }
124 if (vma->anon_vma) { 124 if (vma->anon_vma) {
125 anon_vma = vma->anon_vma; 125 anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
156 if (anon_vma) 156 if (anon_vma)
157 anon_vma_unlock_write(anon_vma); 157 anon_vma_unlock_write(anon_vma);
158 if (mapping) 158 if (mapping)
159 mutex_unlock(&mapping->i_mmap_mutex); 159 i_mmap_unlock_write(mapping);
160} 160}
161 161
162#define LATENCY_LIMIT (64 * PAGE_SIZE) 162#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e194a7..b51eadf6d952 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
722 if (vma->vm_file) { 722 if (vma->vm_file) {
723 mapping = vma->vm_file->f_mapping; 723 mapping = vma->vm_file->f_mapping;
724 724
725 mutex_lock(&mapping->i_mmap_mutex); 725 i_mmap_lock_write(mapping);
726 flush_dcache_mmap_lock(mapping); 726 flush_dcache_mmap_lock(mapping);
727 vma_interval_tree_insert(vma, &mapping->i_mmap); 727 vma_interval_tree_insert(vma, &mapping->i_mmap);
728 flush_dcache_mmap_unlock(mapping); 728 flush_dcache_mmap_unlock(mapping);
729 mutex_unlock(&mapping->i_mmap_mutex); 729 i_mmap_unlock_write(mapping);
730 } 730 }
731 731
732 /* add the VMA to the tree */ 732 /* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
795 if (vma->vm_file) { 795 if (vma->vm_file) {
796 mapping = vma->vm_file->f_mapping; 796 mapping = vma->vm_file->f_mapping;
797 797
798 mutex_lock(&mapping->i_mmap_mutex); 798 i_mmap_lock_write(mapping);
799 flush_dcache_mmap_lock(mapping); 799 flush_dcache_mmap_lock(mapping);
800 vma_interval_tree_remove(vma, &mapping->i_mmap); 800 vma_interval_tree_remove(vma, &mapping->i_mmap);
801 flush_dcache_mmap_unlock(mapping); 801 flush_dcache_mmap_unlock(mapping);
802 mutex_unlock(&mapping->i_mmap_mutex); 802 i_mmap_unlock_write(mapping);
803 } 803 }
804 804
805 /* remove from the MM's tree and list */ 805 /* remove from the MM's tree and list */
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1149 unsigned long len, 1149 unsigned long len,
1150 unsigned long capabilities) 1150 unsigned long capabilities)
1151{ 1151{
1152 struct page *pages; 1152 unsigned long total, point;
1153 unsigned long total, point, n;
1154 void *base; 1153 void *base;
1155 int ret, order; 1154 int ret, order;
1156 1155
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma,
1182 order = get_order(len); 1181 order = get_order(len);
1183 kdebug("alloc order %d for %lx", order, len); 1182 kdebug("alloc order %d for %lx", order, len);
1184 1183
1185 pages = alloc_pages(GFP_KERNEL, order);
1186 if (!pages)
1187 goto enomem;
1188
1189 total = 1 << order; 1184 total = 1 << order;
1190 atomic_long_add(total, &mmap_pages_allocated);
1191
1192 point = len >> PAGE_SHIFT; 1185 point = len >> PAGE_SHIFT;
1193 1186
1194 /* we allocated a power-of-2 sized page set, so we may want to trim off 1187 /* we don't want to allocate a power-of-2 sized page set */
1195 * the excess */
1196 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1188 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1197 while (total > point) { 1189 total = point;
1198 order = ilog2(total - point); 1190 kdebug("try to alloc exact %lu pages", total);
1199 n = 1 << order; 1191 base = alloc_pages_exact(len, GFP_KERNEL);
1200 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1192 } else {
1201 atomic_long_sub(n, &mmap_pages_allocated); 1193 base = (void *)__get_free_pages(GFP_KERNEL, order);
1202 total -= n;
1203 set_page_refcounted(pages + total);
1204 __free_pages(pages + total, order);
1205 }
1206 } 1194 }
1207 1195
1208 for (point = 1; point < total; point++) 1196 if (!base)
1209 set_page_refcounted(&pages[point]); 1197 goto enomem;
1198
1199 atomic_long_add(total, &mmap_pages_allocated);
1210 1200
1211 base = page_address(pages);
1212 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1201 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1213 region->vm_start = (unsigned long) base; 1202 region->vm_start = (unsigned long) base;
1214 region->vm_end = region->vm_start + len; 1203 region->vm_end = region->vm_start + len;
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2094 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2083 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2095 2084
2096 down_write(&nommu_region_sem); 2085 down_write(&nommu_region_sem);
2097 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2086 i_mmap_lock_read(inode->i_mapping);
2098 2087
2099 /* search for VMAs that fall within the dead zone */ 2088 /* search for VMAs that fall within the dead zone */
2100 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { 2089 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2101 /* found one - only interested if it's shared out of the page 2090 /* found one - only interested if it's shared out of the page
2102 * cache */ 2091 * cache */
2103 if (vma->vm_flags & VM_SHARED) { 2092 if (vma->vm_flags & VM_SHARED) {
2104 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2093 i_mmap_unlock_read(inode->i_mapping);
2105 up_write(&nommu_region_sem); 2094 up_write(&nommu_region_sem);
2106 return -ETXTBSY; /* not quite true, but near enough */ 2095 return -ETXTBSY; /* not quite true, but near enough */
2107 } 2096 }
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2113 * we don't check for any regions that start beyond the EOF as there 2102 * we don't check for any regions that start beyond the EOF as there
2114 * shouldn't be any 2103 * shouldn't be any
2115 */ 2104 */
2116 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 2105 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
2117 0, ULONG_MAX) {
2118 if (!(vma->vm_flags & VM_SHARED)) 2106 if (!(vma->vm_flags & VM_SHARED))
2119 continue; 2107 continue;
2120 2108
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2129 } 2117 }
2130 } 2118 }
2131 2119
2132 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2120 i_mmap_unlock_read(inode->i_mapping);
2133 up_write(&nommu_region_sem); 2121 up_write(&nommu_region_sem);
2134 return 0; 2122 return 0;
2135} 2123}
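
do_mmap_private() no longer allocates a power-of-two block and trims the excess by hand; it either asks for the exact length or takes the whole order. The allocation choice from the hunk above, condensed for readability (example_alloc_backing is an invented name; the accounting and region setup are omitted):

static void *example_alloc_backing(unsigned long len, int order)
{
	unsigned long total = 1UL << order;
	unsigned long point = len >> PAGE_SHIFT;
	void *base;

	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
		/* Trimming is worthwhile: allocate exactly len bytes. */
		base = alloc_pages_exact(len, GFP_KERNEL);
	} else {
		/* Otherwise take the full power-of-two block. */
		base = (void *)__get_free_pages(GFP_KERNEL, order);
	}

	return base;	/* NULL on failure, as in the hunk */
}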
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 864bba992735..d503e9ce1c7b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
281 if (oom_task_origin(task)) 281 if (oom_task_origin(task))
282 return OOM_SCAN_SELECT; 282 return OOM_SCAN_SELECT;
283 283
284 if (task->flags & PF_EXITING && !force_kill) { 284 if (task_will_free_mem(task) && !force_kill)
285 /* 285 return OOM_SCAN_ABORT;
286 * If this task is not being ptraced on exit, then wait for it 286
287 * to finish before killing some other task unnecessarily.
288 */
289 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
290 return OOM_SCAN_ABORT;
291 }
292 return OOM_SCAN_OK; 287 return OOM_SCAN_OK;
293} 288}
294 289
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
443 * If the task is already exiting, don't alarm the sysadmin or kill 438 * If the task is already exiting, don't alarm the sysadmin or kill
444 * its children or threads, just set TIF_MEMDIE so it can die quickly 439 * its children or threads, just set TIF_MEMDIE so it can die quickly
445 */ 440 */
446 if (p->flags & PF_EXITING) { 441 if (task_will_free_mem(p)) {
447 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
448 put_task_struct(p); 443 put_task_struct(p);
449 return; 444 return;
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
649 * select it. The goal is to allow it to allocate so that it may 644 * select it. The goal is to allow it to allocate so that it may
650 * quickly exit and free its memory. 645 * quickly exit and free its memory.
651 */ 646 */
652 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 647 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
653 set_thread_flag(TIF_MEMDIE); 648 set_thread_flag(TIF_MEMDIE);
654 return; 649 return;
655 } 650 }
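
The three open-coded PF_EXITING checks in oom_kill.c collapse into task_will_free_mem(), a helper whose definition is not part of these hunks. A sketch of the resulting fast path (example_oom_fast_path is an invented name; out_of_memory() still checks fatal_signal_pending() separately, as shown above):

static bool example_oom_fast_path(struct task_struct *p)
{
	/*
	 * A task already releasing its address space will free memory on
	 * its own; mark it and skip picking another victim.
	 */
	if (task_will_free_mem(p)) {
		set_tsk_thread_flag(p, TIF_MEMDIE);
		return true;
	}

	return false;
}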
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df542feaac3b..fa974d87f60d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_ext.h>
51#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
52#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
53#include <linux/compaction.h> 54#include <linux/compaction.h>
@@ -55,9 +56,10 @@
55#include <linux/prefetch.h> 56#include <linux/prefetch.h>
56#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
57#include <linux/migrate.h> 58#include <linux/migrate.h>
58#include <linux/page-debug-flags.h> 59#include <linux/page_ext.h>
59#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
60#include <linux/sched/rt.h> 61#include <linux/sched/rt.h>
62#include <linux/page_owner.h>
61 63
62#include <asm/sections.h> 64#include <asm/sections.h>
63#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
424 426
425#ifdef CONFIG_DEBUG_PAGEALLOC 427#ifdef CONFIG_DEBUG_PAGEALLOC
426unsigned int _debug_guardpage_minorder; 428unsigned int _debug_guardpage_minorder;
429bool _debug_pagealloc_enabled __read_mostly;
430bool _debug_guardpage_enabled __read_mostly;
431
432static int __init early_debug_pagealloc(char *buf)
433{
434 if (!buf)
435 return -EINVAL;
436
437 if (strcmp(buf, "on") == 0)
438 _debug_pagealloc_enabled = true;
439
440 return 0;
441}
442early_param("debug_pagealloc", early_debug_pagealloc);
443
444static bool need_debug_guardpage(void)
445{
446 /* If we don't use debug_pagealloc, we don't need guard page */
447 if (!debug_pagealloc_enabled())
448 return false;
449
450 return true;
451}
452
453static void init_debug_guardpage(void)
454{
455 if (!debug_pagealloc_enabled())
456 return;
457
458 _debug_guardpage_enabled = true;
459}
460
461struct page_ext_operations debug_guardpage_ops = {
462 .need = need_debug_guardpage,
463 .init = init_debug_guardpage,
464};
427 465
428static int __init debug_guardpage_minorder_setup(char *buf) 466static int __init debug_guardpage_minorder_setup(char *buf)
429{ 467{
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
439} 477}
440__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 478__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
441 479
442static inline void set_page_guard_flag(struct page *page) 480static inline void set_page_guard(struct zone *zone, struct page *page,
481 unsigned int order, int migratetype)
443{ 482{
444 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 483 struct page_ext *page_ext;
484
485 if (!debug_guardpage_enabled())
486 return;
487
488 page_ext = lookup_page_ext(page);
489 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
490
491 INIT_LIST_HEAD(&page->lru);
492 set_page_private(page, order);
493 /* Guard pages are not available for any usage */
494 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
445} 495}
446 496
447static inline void clear_page_guard_flag(struct page *page) 497static inline void clear_page_guard(struct zone *zone, struct page *page,
498 unsigned int order, int migratetype)
448{ 499{
449 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 500 struct page_ext *page_ext;
501
502 if (!debug_guardpage_enabled())
503 return;
504
505 page_ext = lookup_page_ext(page);
506 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
507
508 set_page_private(page, 0);
509 if (!is_migrate_isolate(migratetype))
510 __mod_zone_freepage_state(zone, (1 << order), migratetype);
450} 511}
451#else 512#else
452static inline void set_page_guard_flag(struct page *page) { } 513struct page_ext_operations debug_guardpage_ops = { NULL, };
453static inline void clear_page_guard_flag(struct page *page) { } 514static inline void set_page_guard(struct zone *zone, struct page *page,
515 unsigned int order, int migratetype) {}
516static inline void clear_page_guard(struct zone *zone, struct page *page,
517 unsigned int order, int migratetype) {}
454#endif 518#endif
455 519
456static inline void set_page_order(struct page *page, unsigned int order) 520static inline void set_page_order(struct page *page, unsigned int order)
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page,
581 * merge with it and move up one order. 645 * merge with it and move up one order.
582 */ 646 */
583 if (page_is_guard(buddy)) { 647 if (page_is_guard(buddy)) {
584 clear_page_guard_flag(buddy); 648 clear_page_guard(zone, buddy, order, migratetype);
585 set_page_private(buddy, 0);
586 if (!is_migrate_isolate(migratetype)) {
587 __mod_zone_freepage_state(zone, 1 << order,
588 migratetype);
589 }
590 } else { 649 } else {
591 list_del(&buddy->lru); 650 list_del(&buddy->lru);
592 zone->free_area[order].nr_free--; 651 zone->free_area[order].nr_free--;
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
755 if (bad) 814 if (bad)
756 return false; 815 return false;
757 816
817 reset_page_owner(page, order);
818
758 if (!PageHighMem(page)) { 819 if (!PageHighMem(page)) {
759 debug_check_no_locks_freed(page_address(page), 820 debug_check_no_locks_freed(page_address(page),
760 PAGE_SIZE << order); 821 PAGE_SIZE << order);
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page,
861 size >>= 1; 922 size >>= 1;
862 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 923 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
863 924
864#ifdef CONFIG_DEBUG_PAGEALLOC 925 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
865 if (high < debug_guardpage_minorder()) { 926 debug_guardpage_enabled() &&
927 high < debug_guardpage_minorder()) {
866 /* 928 /*
867 * Mark as guard pages (or page), that will allow to 929 * Mark as guard pages (or page), that will allow to
868 * merge back to allocator when buddy will be freed. 930 * merge back to allocator when buddy will be freed.
869 * Corresponding page table entries will not be touched, 931 * Corresponding page table entries will not be touched,
870 * pages will stay not present in virtual address space 932 * pages will stay not present in virtual address space
871 */ 933 */
872 INIT_LIST_HEAD(&page[size].lru); 934 set_page_guard(zone, &page[size], high, migratetype);
873 set_page_guard_flag(&page[size]);
874 set_page_private(&page[size], high);
875 /* Guard pages are not available for any usage */
876 __mod_zone_freepage_state(zone, -(1 << high),
877 migratetype);
878 continue; 935 continue;
879 } 936 }
880#endif
881 list_add(&page[size].lru, &area->free_list[migratetype]); 937 list_add(&page[size].lru, &area->free_list[migratetype]);
882 area->nr_free++; 938 area->nr_free++;
883 set_page_order(&page[size], high); 939 set_page_order(&page[size], high);
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
935 if (order && (gfp_flags & __GFP_COMP)) 991 if (order && (gfp_flags & __GFP_COMP))
936 prep_compound_page(page, order); 992 prep_compound_page(page, order);
937 993
994 set_page_owner(page, order, gfp_flags);
995
938 return 0; 996 return 0;
939} 997}
940 998
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order)
1507 split_page(virt_to_page(page[0].shadow), order); 1565 split_page(virt_to_page(page[0].shadow), order);
1508#endif 1566#endif
1509 1567
1510 for (i = 1; i < (1 << order); i++) 1568 set_page_owner(page, 0, 0);
1569 for (i = 1; i < (1 << order); i++) {
1511 set_page_refcounted(page + i); 1570 set_page_refcounted(page + i);
1571 set_page_owner(page + i, 0, 0);
1572 }
1512} 1573}
1513EXPORT_SYMBOL_GPL(split_page); 1574EXPORT_SYMBOL_GPL(split_page);
1514 1575
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
1548 } 1609 }
1549 } 1610 }
1550 1611
1612 set_page_owner(page, order, 0);
1551 return 1UL << order; 1613 return 1UL << order;
1552} 1614}
1553 1615
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4856#endif 4918#endif
4857 init_waitqueue_head(&pgdat->kswapd_wait); 4919 init_waitqueue_head(&pgdat->kswapd_wait);
4858 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4920 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4921 pgdat_page_ext_init(pgdat);
4859 4922
4860 for (j = 0; j < MAX_NR_ZONES; j++) { 4923 for (j = 0; j < MAX_NR_ZONES; j++) {
4861 struct zone *zone = pgdat->node_zones + j; 4924 struct zone *zone = pgdat->node_zones + j;
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4874 * and per-cpu initialisations 4937 * and per-cpu initialisations
4875 */ 4938 */
4876 memmap_pages = calc_memmap_size(size, realsize); 4939 memmap_pages = calc_memmap_size(size, realsize);
4877 if (freesize >= memmap_pages) { 4940 if (!is_highmem_idx(j)) {
4878 freesize -= memmap_pages; 4941 if (freesize >= memmap_pages) {
4879 if (memmap_pages) 4942 freesize -= memmap_pages;
4880 printk(KERN_DEBUG 4943 if (memmap_pages)
4881 " %s zone: %lu pages used for memmap\n", 4944 printk(KERN_DEBUG
4882 zone_names[j], memmap_pages); 4945 " %s zone: %lu pages used for memmap\n",
4883 } else 4946 zone_names[j], memmap_pages);
4884 printk(KERN_WARNING 4947 } else
4885 " %s zone: %lu pages exceeds freesize %lu\n", 4948 printk(KERN_WARNING
4886 zone_names[j], memmap_pages, freesize); 4949 " %s zone: %lu pages exceeds freesize %lu\n",
4950 zone_names[j], memmap_pages, freesize);
4951 }
4887 4952
4888 /* Account for reserved pages */ 4953 /* Account for reserved pages */
4889 if (j == 0 && freesize > dma_reserve) { 4954 if (j == 0 && freesize > dma_reserve) {
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6221 if (!PageLRU(page)) 6286 if (!PageLRU(page))
6222 found++; 6287 found++;
6223 /* 6288 /*
6224 * If there are RECLAIMABLE pages, we need to check it. 6289 * If there are RECLAIMABLE pages, we need to check
6225 * But now, memory offline itself doesn't call shrink_slab() 6290 * it. But now, memory offline itself doesn't call
6226 * and it still to be fixed. 6291 * shrink_node_slabs() and it still to be fixed.
6227 */ 6292 */
6228 /* 6293 /*
6229 * If the page is not RAM, page_count()should be 0. 6294 * If the page is not RAM, page_count()should be 0.
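
Guard-page state moves out of struct page (the old PAGE_DEBUG_FLAG_GUARD bit in page->debug_flags) into the new per-page extension, and it is only active when the kernel is booted with debug_pagealloc=on. Because the side-by-side rendering interleaves old and new code, here is the new set_page_guard() logic from the hunk above written out flat (an illustrative restatement, not additional code):

static void set_page_guard_flat(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	/* Nothing to do unless debug_pagealloc=on enabled guard pages. */
	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage. */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
}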
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..d86fd2f5353f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,403 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/page_ext.h>
5#include <linux/memory.h>
6#include <linux/vmalloc.h>
7#include <linux/kmemleak.h>
8#include <linux/page_owner.h>
9
10/*
11 * struct page extension
12 *
13 * This is the feature to manage memory for extended data per page.
14 *
 15 * Until now, we had to modify struct page itself to store extra data per
 16 * page. This requires rebuilding the kernel, which is a really time-consuming
 17 * process, and sometimes a rebuild is impossible due to third-party module
 18 * dependencies. Worse, enlarging struct page could cause unwanted changes in
 19 * system behaviour.
 20 *
 21 * This feature is intended to overcome the problems mentioned above. It
 22 * allocates memory for extended per-page data somewhere other than struct
 23 * page itself; that memory can be accessed through the accessor functions
 24 * provided by this code. During boot it checks whether allocating a huge
 25 * chunk of memory is needed at all; if not, it allocates nothing. This lets
 26 * the feature be built in by default, avoiding rebuilds and related problems.
27 *
 28 * To make this work well, there are two callbacks for clients. One is the
 29 * need callback, which is mandatory if a client wants to avoid useless
 30 * memory allocation at boot time. The other, the init callback, is optional
 31 * and is used to do proper initialization after memory is allocated.
 32 *
 33 * The need callback decides whether the extended memory allocation is
 34 * needed at all. Sometimes users deactivate certain features for a given
 35 * boot, making the extra memory unnecessary. In that case, to avoid
 36 * allocating a huge chunk of memory, each client reports its need for
 37 * extra memory through the need callback. If at least one need callback
 38 * returns true, someone needs the extra memory and the page extension core
 39 * allocates memory for page extension. If none of the need callbacks
 40 * returns true, the memory isn't needed at all for this boot and the page
 41 * extension core skips the allocation entirely. As a result, no memory is
 42 * wasted.
43 *
 44 * The init callback is used to do proper initialization once page extension
 45 * is completely set up. On sparse memory systems the extra memory is
 46 * allocated some time later than the memmap; in other words, the lifetime
 47 * of the page extension memory is not the same as that of the memmap for
 48 * struct page. Clients therefore can't store extra data until page extension
 49 * is initialized, even though pages are already allocated and freely used.
 50 * This could leave the per-page extra data in an inadequate state, so, to
 51 * prevent that, clients can use this callback to initialize it correctly.
52 */
53
54static struct page_ext_operations *page_ext_ops[] = {
55 &debug_guardpage_ops,
56#ifdef CONFIG_PAGE_POISONING
57 &page_poisoning_ops,
58#endif
59#ifdef CONFIG_PAGE_OWNER
60 &page_owner_ops,
61#endif
62};
63
64static unsigned long total_usage;
65
66static bool __init invoke_need_callbacks(void)
67{
68 int i;
69 int entries = ARRAY_SIZE(page_ext_ops);
70
71 for (i = 0; i < entries; i++) {
72 if (page_ext_ops[i]->need && page_ext_ops[i]->need())
73 return true;
74 }
75
76 return false;
77}
78
79static void __init invoke_init_callbacks(void)
80{
81 int i;
82 int entries = ARRAY_SIZE(page_ext_ops);
83
84 for (i = 0; i < entries; i++) {
85 if (page_ext_ops[i]->init)
86 page_ext_ops[i]->init();
87 }
88}
89
90#if !defined(CONFIG_SPARSEMEM)
91
92
93void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
94{
95 pgdat->node_page_ext = NULL;
96}
97
98struct page_ext *lookup_page_ext(struct page *page)
99{
100 unsigned long pfn = page_to_pfn(page);
101 unsigned long offset;
102 struct page_ext *base;
103
104 base = NODE_DATA(page_to_nid(page))->node_page_ext;
105#ifdef CONFIG_DEBUG_VM
106 /*
107 * The sanity checks the page allocator does upon freeing a
108 * page can reach here before the page_ext arrays are
109 * allocated when feeding a range of pages to the allocator
110 * for the first time during bootup or memory hotplug.
111 */
112 if (unlikely(!base))
113 return NULL;
114#endif
115 offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
116 MAX_ORDER_NR_PAGES);
117 return base + offset;
118}
119
120static int __init alloc_node_page_ext(int nid)
121{
122 struct page_ext *base;
123 unsigned long table_size;
124 unsigned long nr_pages;
125
126 nr_pages = NODE_DATA(nid)->node_spanned_pages;
127 if (!nr_pages)
128 return 0;
129
130 /*
131 * Need extra space if node range is not aligned with
132 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
133 * checks buddy's status, range could be out of exact node range.
134 */
135 if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
136 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
137 nr_pages += MAX_ORDER_NR_PAGES;
138
139 table_size = sizeof(struct page_ext) * nr_pages;
140
141 base = memblock_virt_alloc_try_nid_nopanic(
142 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
143 BOOTMEM_ALLOC_ACCESSIBLE, nid);
144 if (!base)
145 return -ENOMEM;
146 NODE_DATA(nid)->node_page_ext = base;
147 total_usage += table_size;
148 return 0;
149}
150
151void __init page_ext_init_flatmem(void)
152{
153
154 int nid, fail;
155
156 if (!invoke_need_callbacks())
157 return;
158
159 for_each_online_node(nid) {
160 fail = alloc_node_page_ext(nid);
161 if (fail)
162 goto fail;
163 }
164 pr_info("allocated %ld bytes of page_ext\n", total_usage);
165 invoke_init_callbacks();
166 return;
167
168fail:
169 pr_crit("allocation of page_ext failed.\n");
170 panic("Out of memory");
171}
172
173#else /* CONFIG_FLAT_NODE_MEM_MAP */
174
175struct page_ext *lookup_page_ext(struct page *page)
176{
177 unsigned long pfn = page_to_pfn(page);
178 struct mem_section *section = __pfn_to_section(pfn);
179#ifdef CONFIG_DEBUG_VM
180 /*
181 * The sanity checks the page allocator does upon freeing a
182 * page can reach here before the page_ext arrays are
183 * allocated when feeding a range of pages to the allocator
184 * for the first time during bootup or memory hotplug.
185 */
186 if (!section->page_ext)
187 return NULL;
188#endif
189 return section->page_ext + pfn;
190}
191
192static void *__meminit alloc_page_ext(size_t size, int nid)
193{
194 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
195 void *addr = NULL;
196
197 addr = alloc_pages_exact_nid(nid, size, flags);
198 if (addr) {
199 kmemleak_alloc(addr, size, 1, flags);
200 return addr;
201 }
202
203 if (node_state(nid, N_HIGH_MEMORY))
204 addr = vzalloc_node(size, nid);
205 else
206 addr = vzalloc(size);
207
208 return addr;
209}
210
211static int __meminit init_section_page_ext(unsigned long pfn, int nid)
212{
213 struct mem_section *section;
214 struct page_ext *base;
215 unsigned long table_size;
216
217 section = __pfn_to_section(pfn);
218
219 if (section->page_ext)
220 return 0;
221
222 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
223 base = alloc_page_ext(table_size, nid);
224
225 /*
226 * The value stored in section->page_ext is (base - pfn)
227 * and it does not point to the memory block allocated above,
228 * causing kmemleak false positives.
229 */
230 kmemleak_not_leak(base);
231
232 if (!base) {
233 pr_err("page ext allocation failure\n");
234 return -ENOMEM;
235 }
236
237 /*
238 * The passed "pfn" may not be aligned to SECTION. For the calculation
239 * we need to apply a mask.
240 */
241 pfn &= PAGE_SECTION_MASK;
242 section->page_ext = base - pfn;
243 total_usage += table_size;
244 return 0;
245}
246#ifdef CONFIG_MEMORY_HOTPLUG
247static void free_page_ext(void *addr)
248{
249 if (is_vmalloc_addr(addr)) {
250 vfree(addr);
251 } else {
252 struct page *page = virt_to_page(addr);
253 size_t table_size;
254
255 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
256
257 BUG_ON(PageReserved(page));
258 free_pages_exact(addr, table_size);
259 }
260}
261
262static void __free_page_ext(unsigned long pfn)
263{
264 struct mem_section *ms;
265 struct page_ext *base;
266
267 ms = __pfn_to_section(pfn);
268 if (!ms || !ms->page_ext)
269 return;
270 base = ms->page_ext + pfn;
271 free_page_ext(base);
272 ms->page_ext = NULL;
273}
274
275static int __meminit online_page_ext(unsigned long start_pfn,
276 unsigned long nr_pages,
277 int nid)
278{
279 unsigned long start, end, pfn;
280 int fail = 0;
281
282 start = SECTION_ALIGN_DOWN(start_pfn);
283 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
284
285 if (nid == -1) {
286 /*
287 * In this case, "nid" already exists and contains valid memory.
288 * "start_pfn" passed to us is a pfn which is an arg for
289 * online__pages(), and start_pfn should exist.
290 */
291 nid = pfn_to_nid(start_pfn);
292 VM_BUG_ON(!node_state(nid, N_ONLINE));
293 }
294
295 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
296 if (!pfn_present(pfn))
297 continue;
298 fail = init_section_page_ext(pfn, nid);
299 }
300 if (!fail)
301 return 0;
302
303 /* rollback */
304 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
305 __free_page_ext(pfn);
306
307 return -ENOMEM;
308}
309
310static int __meminit offline_page_ext(unsigned long start_pfn,
311 unsigned long nr_pages, int nid)
312{
313 unsigned long start, end, pfn;
314
315 start = SECTION_ALIGN_DOWN(start_pfn);
316 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
317
318 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
319 __free_page_ext(pfn);
320 return 0;
321
322}
323
324static int __meminit page_ext_callback(struct notifier_block *self,
325 unsigned long action, void *arg)
326{
327 struct memory_notify *mn = arg;
328 int ret = 0;
329
330 switch (action) {
331 case MEM_GOING_ONLINE:
332 ret = online_page_ext(mn->start_pfn,
333 mn->nr_pages, mn->status_change_nid);
334 break;
335 case MEM_OFFLINE:
336 offline_page_ext(mn->start_pfn,
337 mn->nr_pages, mn->status_change_nid);
338 break;
339 case MEM_CANCEL_ONLINE:
340 offline_page_ext(mn->start_pfn,
341 mn->nr_pages, mn->status_change_nid);
342 break;
343 case MEM_GOING_OFFLINE:
344 break;
345 case MEM_ONLINE:
346 case MEM_CANCEL_OFFLINE:
347 break;
348 }
349
350 return notifier_from_errno(ret);
351}
352
353#endif
354
355void __init page_ext_init(void)
356{
357 unsigned long pfn;
358 int nid;
359
360 if (!invoke_need_callbacks())
361 return;
362
363 for_each_node_state(nid, N_MEMORY) {
364 unsigned long start_pfn, end_pfn;
365
366 start_pfn = node_start_pfn(nid);
367 end_pfn = node_end_pfn(nid);
368 /*
369 * start_pfn and end_pfn may not be aligned to SECTION and the
370 * page->flags of out of node pages are not initialized. So we
371 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
372 */
373 for (pfn = start_pfn; pfn < end_pfn;
374 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
375
376 if (!pfn_valid(pfn))
377 continue;
378 /*
379			 * Node pfn ranges can overlap.
380			 * We know some architectures can have a node layout such as
381 * -------------pfn-------------->
382 * N0 | N1 | N2 | N0 | N1 | N2|....
383 */
384 if (pfn_to_nid(pfn) != nid)
385 continue;
386 if (init_section_page_ext(pfn, nid))
387 goto oom;
388 }
389 }
390 hotplug_memory_notifier(page_ext_callback, 0);
391 pr_info("allocated %ld bytes of page_ext\n", total_usage);
392 invoke_init_callbacks();
393 return;
394
395oom:
396 panic("Out of memory");
397}
398
399void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
400{
401}
402
403#endif
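
From a client's point of view, the page_ext machinery boils down to registering a struct page_ext_operations with an optional need/init pair and indexing per-page data through lookup_page_ext(). A hypothetical client sketch follows; every my_feature_* name is invented, and a real client would also have to be added to the static page_ext_ops[] table above and have its state stored in struct page_ext.

static bool my_feature_enabled;	/* imagined to be set by an early_param() */

static bool need_my_feature(void)
{
	/* Returning false lets page_ext skip the boot-time allocation. */
	return my_feature_enabled;
}

static void init_my_feature(void)
{
	/* Runs once the per-page extension storage is fully populated. */
}

struct page_ext_operations my_feature_ops = {
	.need = need_my_feature,
	.init = init_my_feature,
};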
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000000..9ab4a9b5bc09
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,311 @@
1#include <linux/debugfs.h>
2#include <linux/mm.h>
3#include <linux/slab.h>
4#include <linux/uaccess.h>
5#include <linux/bootmem.h>
6#include <linux/stacktrace.h>
7#include <linux/page_owner.h>
8#include "internal.h"
9
10static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly;
12
13static void init_early_allocated_pages(void);
14
15static int early_page_owner_param(char *buf)
16{
17 if (!buf)
18 return -EINVAL;
19
20 if (strcmp(buf, "on") == 0)
21 page_owner_disabled = false;
22
23 return 0;
24}
25early_param("page_owner", early_page_owner_param);
26
27static bool need_page_owner(void)
28{
29 if (page_owner_disabled)
30 return false;
31
32 return true;
33}
34
35static void init_page_owner(void)
36{
37 if (page_owner_disabled)
38 return;
39
40 page_owner_inited = true;
41 init_early_allocated_pages();
42}
43
44struct page_ext_operations page_owner_ops = {
45 .need = need_page_owner,
46 .init = init_page_owner,
47};
48
49void __reset_page_owner(struct page *page, unsigned int order)
50{
51 int i;
52 struct page_ext *page_ext;
53
54 for (i = 0; i < (1 << order); i++) {
55 page_ext = lookup_page_ext(page + i);
56 __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
57 }
58}
59
60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
61{
62 struct page_ext *page_ext;
63 struct stack_trace *trace;
64
65 page_ext = lookup_page_ext(page);
66
67 trace = &page_ext->trace;
68 trace->nr_entries = 0;
69 trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
70 trace->entries = &page_ext->trace_entries[0];
71 trace->skip = 3;
72 save_stack_trace(&page_ext->trace);
73
74 page_ext->order = order;
75 page_ext->gfp_mask = gfp_mask;
76
77 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
78}
79
80static ssize_t
81print_page_owner(char __user *buf, size_t count, unsigned long pfn,
82 struct page *page, struct page_ext *page_ext)
83{
84 int ret;
85 int pageblock_mt, page_mt;
86 char *kbuf;
87
88 kbuf = kmalloc(count, GFP_KERNEL);
89 if (!kbuf)
90 return -ENOMEM;
91
92 ret = snprintf(kbuf, count,
93 "Page allocated via order %u, mask 0x%x\n",
94 page_ext->order, page_ext->gfp_mask);
95
96 if (ret >= count)
97 goto err;
98
99 /* Print information relevant to grouping pages by mobility */
100 pageblock_mt = get_pfnblock_migratetype(page, pfn);
101 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
102 ret += snprintf(kbuf + ret, count - ret,
103 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
104 pfn,
105 pfn >> pageblock_order,
106 pageblock_mt,
107 pageblock_mt != page_mt ? "Fallback" : " ",
108 PageLocked(page) ? "K" : " ",
109 PageError(page) ? "E" : " ",
110 PageReferenced(page) ? "R" : " ",
111 PageUptodate(page) ? "U" : " ",
112 PageDirty(page) ? "D" : " ",
113 PageLRU(page) ? "L" : " ",
114 PageActive(page) ? "A" : " ",
115 PageSlab(page) ? "S" : " ",
116 PageWriteback(page) ? "W" : " ",
117 PageCompound(page) ? "C" : " ",
118 PageSwapCache(page) ? "B" : " ",
119 PageMappedToDisk(page) ? "M" : " ");
120
121 if (ret >= count)
122 goto err;
123
124 ret += snprint_stack_trace(kbuf + ret, count - ret,
125 &page_ext->trace, 0);
126 if (ret >= count)
127 goto err;
128
129 ret += snprintf(kbuf + ret, count - ret, "\n");
130 if (ret >= count)
131 goto err;
132
133 if (copy_to_user(buf, kbuf, ret))
134 ret = -EFAULT;
135
136 kfree(kbuf);
137 return ret;
138
139err:
140 kfree(kbuf);
141 return -ENOMEM;
142}
143
144static ssize_t
145read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
146{
147 unsigned long pfn;
148 struct page *page;
149 struct page_ext *page_ext;
150
151 if (!page_owner_inited)
152 return -EINVAL;
153
154 page = NULL;
155 pfn = min_low_pfn + *ppos;
156
157 /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
158 while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
159 pfn++;
160
161 drain_all_pages(NULL);
162
163 /* Find an allocated page */
164 for (; pfn < max_pfn; pfn++) {
165 /*
166 * If the new page is in a new MAX_ORDER_NR_PAGES area,
167 * validate the area as existing, skip it if not
168 */
169 if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
170 pfn += MAX_ORDER_NR_PAGES - 1;
171 continue;
172 }
173
174 /* Check for holes within a MAX_ORDER area */
175 if (!pfn_valid_within(pfn))
176 continue;
177
178 page = pfn_to_page(pfn);
179 if (PageBuddy(page)) {
180 unsigned long freepage_order = page_order_unsafe(page);
181
182 if (freepage_order < MAX_ORDER)
183 pfn += (1UL << freepage_order) - 1;
184 continue;
185 }
186
187 page_ext = lookup_page_ext(page);
188
189 /*
190 * Some pages could be missed by concurrent allocation or free,
191 * because we don't hold the zone lock.
192 */
193 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
194 continue;
195
196 /* Record the next PFN to read in the file offset */
197 *ppos = (pfn - min_low_pfn) + 1;
198
199 return print_page_owner(buf, count, pfn, page, page_ext);
200 }
201
202 return 0;
203}
204
205static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
206{
207 struct page *page;
208 struct page_ext *page_ext;
209 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
210 unsigned long end_pfn = pfn + zone->spanned_pages;
211 unsigned long count = 0;
212
213 /* Scan block by block. First and last block may be incomplete */
214 pfn = zone->zone_start_pfn;
215
216 /*
217 * Walk the zone in pageblock_nr_pages steps. If a page block spans
218 * a zone boundary, it will be double counted between zones. This does
219 * not matter as the mixed block count will still be correct
220 */
221 for (; pfn < end_pfn; ) {
222 if (!pfn_valid(pfn)) {
223 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
224 continue;
225 }
226
227 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
228 block_end_pfn = min(block_end_pfn, end_pfn);
229
230 page = pfn_to_page(pfn);
231
232 for (; pfn < block_end_pfn; pfn++) {
233 if (!pfn_valid_within(pfn))
234 continue;
235
236 page = pfn_to_page(pfn);
237
238 /*
239			 * It is safe to check the buddy flag and order here,
240			 * because this is the init stage and only a single thread runs.
241 */
242 if (PageBuddy(page)) {
243 pfn += (1UL << page_order(page)) - 1;
244 continue;
245 }
246
247 if (PageReserved(page))
248 continue;
249
250 page_ext = lookup_page_ext(page);
251
252			/* Maybe overlapping zone */
253 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
254 continue;
255
256 /* Found early allocated page */
257 set_page_owner(page, 0, 0);
258 count++;
259 }
260 }
261
262 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
263 pgdat->node_id, zone->name, count);
264}
265
266static void init_zones_in_node(pg_data_t *pgdat)
267{
268 struct zone *zone;
269 struct zone *node_zones = pgdat->node_zones;
270 unsigned long flags;
271
272 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
273 if (!populated_zone(zone))
274 continue;
275
276 spin_lock_irqsave(&zone->lock, flags);
277 init_pages_in_zone(pgdat, zone);
278 spin_unlock_irqrestore(&zone->lock, flags);
279 }
280}
281
282static void init_early_allocated_pages(void)
283{
284 pg_data_t *pgdat;
285
286 drain_all_pages(NULL);
287 for_each_online_pgdat(pgdat)
288 init_zones_in_node(pgdat);
289}
290
291static const struct file_operations proc_page_owner_operations = {
292 .read = read_page_owner,
293};
294
295static int __init pageowner_init(void)
296{
297 struct dentry *dentry;
298
299 if (!page_owner_inited) {
300 pr_info("page_owner is disabled\n");
301 return 0;
302 }
303
304 dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
305 NULL, &proc_page_owner_operations);
306 if (IS_ERR(dentry))
307 return PTR_ERR(dentry);
308
309 return 0;
310}
311module_init(pageowner_init)
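
Once booted with page_owner=on, the debugfs file registered above hands back one allocation record per read() and advances the file position to the next PFN by itself. A minimal userspace reader might look like the sketch below; it assumes debugfs is mounted at /sys/kernel/debug (read() fails with EINVAL if page_owner was never enabled, per read_page_owner()).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

	if (fd < 0) {
		perror("open page_owner");
		return 1;
	}
	/* Each successful read() returns one "Page allocated via order ..."
	 * record; a return of 0 means the PFN walk reached max_pfn. */
	while ((len = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, len, stdout);

	close(fd);
	return 0;
}
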
diff --git a/mm/rmap.c b/mm/rmap.c
index 45eba36fd673..c52f43a69eea 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_rwsem
27 * anon_vma->rwsem 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1260,7 +1260,7 @@ out_mlock:
1260 /* 1260 /*
1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1262 * unstable result and race. Plus, We can't wait here because 1262 * unstable result and race. Plus, We can't wait here because
1263 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. 1263 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
1264 * if trylock failed, the page remain in evictable lru and later 1264 * if trylock failed, the page remain in evictable lru and later
1265 * vmscan could retry to move the page to unevictable lru if the 1265 * vmscan could retry to move the page to unevictable lru if the
1266 * page is actually mlocked. 1266 * page is actually mlocked.
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) 1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1636{ 1636{
1637 struct anon_vma *anon_vma; 1637 struct anon_vma *anon_vma;
1638 pgoff_t pgoff = page_to_pgoff(page); 1638 pgoff_t pgoff;
1639 struct anon_vma_chain *avc; 1639 struct anon_vma_chain *avc;
1640 int ret = SWAP_AGAIN; 1640 int ret = SWAP_AGAIN;
1641 1641
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1643 if (!anon_vma) 1643 if (!anon_vma)
1644 return ret; 1644 return ret;
1645 1645
1646 pgoff = page_to_pgoff(page);
1646 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1647 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1647 struct vm_area_struct *vma = avc->vma; 1648 struct vm_area_struct *vma = avc->vma;
1648 unsigned long address = vma_address(page, vma); 1649 unsigned long address = vma_address(page, vma);
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1676static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) 1677static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1677{ 1678{
1678 struct address_space *mapping = page->mapping; 1679 struct address_space *mapping = page->mapping;
1679 pgoff_t pgoff = page_to_pgoff(page); 1680 pgoff_t pgoff;
1680 struct vm_area_struct *vma; 1681 struct vm_area_struct *vma;
1681 int ret = SWAP_AGAIN; 1682 int ret = SWAP_AGAIN;
1682 1683
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1684 * The page lock not only makes sure that page->mapping cannot 1685 * The page lock not only makes sure that page->mapping cannot
1685 * suddenly be NULLified by truncation, it makes sure that the 1686 * suddenly be NULLified by truncation, it makes sure that the
1686 * structure at mapping cannot be freed and reused yet, 1687 * structure at mapping cannot be freed and reused yet,
1687 * so we can safely take mapping->i_mmap_mutex. 1688 * so we can safely take mapping->i_mmap_rwsem.
1688 */ 1689 */
1689 VM_BUG_ON_PAGE(!PageLocked(page), page); 1690 VM_BUG_ON_PAGE(!PageLocked(page), page);
1690 1691
1691 if (!mapping) 1692 if (!mapping)
1692 return ret; 1693 return ret;
1693 mutex_lock(&mapping->i_mmap_mutex); 1694
1695 pgoff = page_to_pgoff(page);
1696 i_mmap_lock_read(mapping);
1694 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1697 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1695 unsigned long address = vma_address(page, vma); 1698 unsigned long address = vma_address(page, vma);
1696 1699
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1711 goto done; 1714 goto done;
1712 1715
1713 ret = rwc->file_nonlinear(page, mapping, rwc->arg); 1716 ret = rwc->file_nonlinear(page, mapping, rwc->arg);
1714
1715done: 1717done:
1716 mutex_unlock(&mapping->i_mmap_mutex); 1718 i_mmap_unlock_read(mapping);
1717 return ret; 1719 return ret;
1718} 1720}
1719 1721
diff --git a/mm/slab.c b/mm/slab.c
index fee275b5b6b7..65b5dcb6f671 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3015,7 +3015,7 @@ retry:
3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3016 nid = zone_to_nid(zone); 3016 nid = zone_to_nid(zone);
3017 3017
3018 if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && 3018 if (cpuset_zone_allowed(zone, flags) &&
3019 get_node(cache, nid) && 3019 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3020 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3021 obj = ____cache_alloc_node(cache,
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3182 memset(ptr, 0, cachep->object_size); 3182 memset(ptr, 0, cachep->object_size);
3183 } 3183 }
3184 3184
3185 memcg_kmem_put_cache(cachep);
3185 return ptr; 3186 return ptr;
3186} 3187}
3187 3188
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3247 memset(objp, 0, cachep->object_size); 3248 memset(objp, 0, cachep->object_size);
3248 } 3249 }
3249 3250
3251 memcg_kmem_put_cache(cachep);
3250 return objp; 3252 return objp;
3251} 3253}
3252 3254
diff --git a/mm/slub.c b/mm/slub.c
index 765c5884d03d..fe376fe1f4fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
1233 kmemleak_free(x); 1233 kmemleak_free(x);
1234} 1234}
1235 1235
1236static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1236static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1237 gfp_t flags)
1237{ 1238{
1238 flags &= gfp_allowed_mask; 1239 flags &= gfp_allowed_mask;
1239 lockdep_trace_alloc(flags); 1240 lockdep_trace_alloc(flags);
1240 might_sleep_if(flags & __GFP_WAIT); 1241 might_sleep_if(flags & __GFP_WAIT);
1241 1242
1242 return should_failslab(s->object_size, flags, s->flags); 1243 if (should_failslab(s->object_size, flags, s->flags))
1244 return NULL;
1245
1246 return memcg_kmem_get_cache(s, flags);
1243} 1247}
1244 1248
1245static inline void slab_post_alloc_hook(struct kmem_cache *s, 1249static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
1248 flags &= gfp_allowed_mask; 1252 flags &= gfp_allowed_mask;
1249 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1253 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1250 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 1254 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1255 memcg_kmem_put_cache(s);
1251} 1256}
1252 1257
1253static inline void slab_free_hook(struct kmem_cache *s, void *x) 1258static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1665 1670
1666 n = get_node(s, zone_to_nid(zone)); 1671 n = get_node(s, zone_to_nid(zone));
1667 1672
1668 if (n && cpuset_zone_allowed(zone, 1673 if (n && cpuset_zone_allowed(zone, flags) &&
1669 flags | __GFP_HARDWALL) &&
1670 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1671 object = get_partial_node(s, n, c, flags); 1675 object = get_partial_node(s, n, c, flags);
1672 if (object) { 1676 if (object) {
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2384 struct page *page; 2388 struct page *page;
2385 unsigned long tid; 2389 unsigned long tid;
2386 2390
2387 if (slab_pre_alloc_hook(s, gfpflags)) 2391 s = slab_pre_alloc_hook(s, gfpflags);
2392 if (!s)
2388 return NULL; 2393 return NULL;
2389
2390 s = memcg_kmem_get_cache(s, gfpflags);
2391redo: 2394redo:
2392 /* 2395 /*
2393 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2396 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 9f25af825dec..b6e3662fe339 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm)
17{ 17{
18 struct task_struct *g, *p; 18 struct task_struct *g, *p;
19 19
20 count_vm_vmacache_event(VMACACHE_FULL_FLUSHES);
21
20 /* 22 /*
21 * Single threaded tasks need not iterate the entire 23 * Single threaded tasks need not iterate the entire
22 * list of process. We can avoid the flushing as well 24 * list of process. We can avoid the flushing as well
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a18196fcdff..39c338896416 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2574 if (!counters) 2574 if (!counters)
2575 return; 2575 return;
2576 2576
2577 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2578 smp_rmb();
2579 if (v->flags & VM_UNINITIALIZED) 2577 if (v->flags & VM_UNINITIALIZED)
2580 return; 2578 return;
2579 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2580 smp_rmb();
2581 2581
2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2583 2583
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a384339bf718..bd9a72bc4a1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker);
229 229
230#define SHRINK_BATCH 128 230#define SHRINK_BATCH 128
231 231
232static unsigned long 232static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
233shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, 233 struct shrinker *shrinker,
234 unsigned long nr_pages_scanned, unsigned long lru_pages) 234 unsigned long nr_scanned,
235 unsigned long nr_eligible)
235{ 236{
236 unsigned long freed = 0; 237 unsigned long freed = 0;
237 unsigned long long delta; 238 unsigned long long delta;
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
255 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 256 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
256 257
257 total_scan = nr; 258 total_scan = nr;
258 delta = (4 * nr_pages_scanned) / shrinker->seeks; 259 delta = (4 * nr_scanned) / shrinker->seeks;
259 delta *= freeable; 260 delta *= freeable;
260 do_div(delta, lru_pages + 1); 261 do_div(delta, nr_eligible + 1);
261 total_scan += delta; 262 total_scan += delta;
262 if (total_scan < 0) { 263 if (total_scan < 0) {
263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 264 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
289 total_scan = freeable * 2; 290 total_scan = freeable * 2;
290 291
291 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 292 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
292 nr_pages_scanned, lru_pages, 293 nr_scanned, nr_eligible,
293 freeable, delta, total_scan); 294 freeable, delta, total_scan);
294 295
295 /* 296 /*
296 * Normally, we should not scan less than batch_size objects in one 297 * Normally, we should not scan less than batch_size objects in one
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
339 return freed; 340 return freed;
340} 341}
341 342
342/* 343/**
343 * Call the shrink functions to age shrinkable caches 344 * shrink_node_slabs - shrink slab caches of a given node
344 * 345 * @gfp_mask: allocation context
345 * Here we assume it costs one seek to replace a lru page and that it also 346 * @nid: node whose slab caches to target
346 * takes a seek to recreate a cache object. With this in mind we age equal 347 * @nr_scanned: pressure numerator
347 * percentages of the lru and ageable caches. This should balance the seeks 348 * @nr_eligible: pressure denominator
348 * generated by these structures.
349 * 349 *
350 * If the vm encountered mapped pages on the LRU it increase the pressure on 350 * Call the shrink functions to age shrinkable caches.
351 * slab to avoid swapping.
352 * 351 *
353 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 352 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
353 * unaware shrinkers will receive a node id of 0 instead.
354 * 354 *
355 * `lru_pages' represents the number of on-LRU pages in all the zones which 355 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
356 * are eligible for the caller's allocation attempt. It is used for balancing 356 * the available objects should be scanned. Page reclaim for example
357 * slab reclaim versus page reclaim. 357 * passes the number of pages scanned and the number of pages on the
358 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
359 * when it encountered mapped pages. The ratio is further biased by
360 * the ->seeks setting of the shrink function, which indicates the
361 * cost to recreate an object relative to that of an LRU page.
358 * 362 *
359 * Returns the number of slab objects which we shrunk. 363 * Returns the number of reclaimed slab objects.
360 */ 364 */
361unsigned long shrink_slab(struct shrink_control *shrinkctl, 365unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
362 unsigned long nr_pages_scanned, 366 unsigned long nr_scanned,
363 unsigned long lru_pages) 367 unsigned long nr_eligible)
364{ 368{
365 struct shrinker *shrinker; 369 struct shrinker *shrinker;
366 unsigned long freed = 0; 370 unsigned long freed = 0;
367 371
368 if (nr_pages_scanned == 0) 372 if (nr_scanned == 0)
369 nr_pages_scanned = SWAP_CLUSTER_MAX; 373 nr_scanned = SWAP_CLUSTER_MAX;
370 374
371 if (!down_read_trylock(&shrinker_rwsem)) { 375 if (!down_read_trylock(&shrinker_rwsem)) {
372 /* 376 /*
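
To put rough numbers on the nr_scanned/nr_eligible ratio described above, here is the per-shrinker arithmetic from shrink_slabs() worked through with made-up values; everything below is an assumption for illustration, and plain 64-bit division stands in for the kernel's do_div().

#include <stdio.h>

int main(void)
{
	/* Assumed inputs, not taken from this patch. */
	unsigned long long nr_scanned = 100;	/* LRU pages scanned by reclaim */
	unsigned long long nr_eligible = 10000;	/* eligible LRU pages on the node */
	unsigned long long freeable = 5000;	/* objects the shrinker reports as freeable */
	unsigned long long seeks = 2;		/* shrinker->seeks (DEFAULT_SEEKS) */

	unsigned long long delta = (4 * nr_scanned) / seeks;	/* 200 */
	delta *= freeable;					/* 1000000 */
	delta /= nr_eligible + 1;				/* 99 */

	printf("objects proposed for scanning: %llu\n", delta);
	return 0;
}
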
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
380 } 384 }
381 385
382 list_for_each_entry(shrinker, &shrinker_list, list) { 386 list_for_each_entry(shrinker, &shrinker_list, list) {
383 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { 387 struct shrink_control sc = {
384 shrinkctl->nid = 0; 388 .gfp_mask = gfp_mask,
385 freed += shrink_slab_node(shrinkctl, shrinker, 389 .nid = nid,
386 nr_pages_scanned, lru_pages); 390 };
387 continue;
388 }
389 391
390 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 392 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
391 if (node_online(shrinkctl->nid)) 393 sc.nid = 0;
392 freed += shrink_slab_node(shrinkctl, shrinker,
393 nr_pages_scanned, lru_pages);
394 394
395 } 395 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
396 } 396 }
397
397 up_read(&shrinker_rwsem); 398 up_read(&shrinker_rwsem);
398out: 399out:
399 cond_resched(); 400 cond_resched();
@@ -1876,7 +1877,8 @@ enum scan_balance {
1876 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1877 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1877 */ 1878 */
1878static void get_scan_count(struct lruvec *lruvec, int swappiness, 1879static void get_scan_count(struct lruvec *lruvec, int swappiness,
1879 struct scan_control *sc, unsigned long *nr) 1880 struct scan_control *sc, unsigned long *nr,
1881 unsigned long *lru_pages)
1880{ 1882{
1881 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1883 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1882 u64 fraction[2]; 1884 u64 fraction[2];
@@ -2022,6 +2024,7 @@ out:
2022 some_scanned = false; 2024 some_scanned = false;
2023 /* Only use force_scan on second pass. */ 2025 /* Only use force_scan on second pass. */
2024 for (pass = 0; !some_scanned && pass < 2; pass++) { 2026 for (pass = 0; !some_scanned && pass < 2; pass++) {
2027 *lru_pages = 0;
2025 for_each_evictable_lru(lru) { 2028 for_each_evictable_lru(lru) {
2026 int file = is_file_lru(lru); 2029 int file = is_file_lru(lru);
2027 unsigned long size; 2030 unsigned long size;
@@ -2048,14 +2051,19 @@ out:
2048 case SCAN_FILE: 2051 case SCAN_FILE:
2049 case SCAN_ANON: 2052 case SCAN_ANON:
2050 /* Scan one type exclusively */ 2053 /* Scan one type exclusively */
2051 if ((scan_balance == SCAN_FILE) != file) 2054 if ((scan_balance == SCAN_FILE) != file) {
2055 size = 0;
2052 scan = 0; 2056 scan = 0;
2057 }
2053 break; 2058 break;
2054 default: 2059 default:
2055 /* Look ma, no brain */ 2060 /* Look ma, no brain */
2056 BUG(); 2061 BUG();
2057 } 2062 }
2063
2064 *lru_pages += size;
2058 nr[lru] = scan; 2065 nr[lru] = scan;
2066
2059 /* 2067 /*
2060 * Skip the second pass and don't force_scan, 2068 * Skip the second pass and don't force_scan,
2061 * if we found something to scan. 2069 * if we found something to scan.
@@ -2069,7 +2077,7 @@ out:
2069 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2077 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2070 */ 2078 */
2071static void shrink_lruvec(struct lruvec *lruvec, int swappiness, 2079static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2072 struct scan_control *sc) 2080 struct scan_control *sc, unsigned long *lru_pages)
2073{ 2081{
2074 unsigned long nr[NR_LRU_LISTS]; 2082 unsigned long nr[NR_LRU_LISTS];
2075 unsigned long targets[NR_LRU_LISTS]; 2083 unsigned long targets[NR_LRU_LISTS];
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2080 struct blk_plug plug; 2088 struct blk_plug plug;
2081 bool scan_adjusted; 2089 bool scan_adjusted;
2082 2090
2083 get_scan_count(lruvec, swappiness, sc, nr); 2091 get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
2084 2092
2085 /* Record the original scan target for proportional adjustments later */ 2093 /* Record the original scan target for proportional adjustments later */
2086 memcpy(targets, nr, sizeof(nr)); 2094 memcpy(targets, nr, sizeof(nr));
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
2258 } 2266 }
2259} 2267}
2260 2268
2261static bool shrink_zone(struct zone *zone, struct scan_control *sc) 2269static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2270 bool is_classzone)
2262{ 2271{
2263 unsigned long nr_reclaimed, nr_scanned; 2272 unsigned long nr_reclaimed, nr_scanned;
2264 bool reclaimable = false; 2273 bool reclaimable = false;
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2269 .zone = zone, 2278 .zone = zone,
2270 .priority = sc->priority, 2279 .priority = sc->priority,
2271 }; 2280 };
2281 unsigned long zone_lru_pages = 0;
2272 struct mem_cgroup *memcg; 2282 struct mem_cgroup *memcg;
2273 2283
2274 nr_reclaimed = sc->nr_reclaimed; 2284 nr_reclaimed = sc->nr_reclaimed;
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2276 2286
2277 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2287 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2278 do { 2288 do {
2289 unsigned long lru_pages;
2279 struct lruvec *lruvec; 2290 struct lruvec *lruvec;
2280 int swappiness; 2291 int swappiness;
2281 2292
2282 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2293 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2283 swappiness = mem_cgroup_swappiness(memcg); 2294 swappiness = mem_cgroup_swappiness(memcg);
2284 2295
2285 shrink_lruvec(lruvec, swappiness, sc); 2296 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2297 zone_lru_pages += lru_pages;
2286 2298
2287 /* 2299 /*
2288 * Direct reclaim and kswapd have to scan all memory 2300 * Direct reclaim and kswapd have to scan all memory
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2302 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2314 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2303 } while (memcg); 2315 } while (memcg);
2304 2316
2317 /*
2318 * Shrink the slab caches in the same proportion that
2319 * the eligible LRU pages were scanned.
2320 */
2321 if (global_reclaim(sc) && is_classzone) {
2322 struct reclaim_state *reclaim_state;
2323
2324 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
2325 sc->nr_scanned - nr_scanned,
2326 zone_lru_pages);
2327
2328 reclaim_state = current->reclaim_state;
2329 if (reclaim_state) {
2330 sc->nr_reclaimed +=
2331 reclaim_state->reclaimed_slab;
2332 reclaim_state->reclaimed_slab = 0;
2333 }
2334 }
2335
2305 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2336 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2306 sc->nr_scanned - nr_scanned, 2337 sc->nr_scanned - nr_scanned,
2307 sc->nr_reclaimed - nr_reclaimed); 2338 sc->nr_reclaimed - nr_reclaimed);
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2376 struct zone *zone; 2407 struct zone *zone;
2377 unsigned long nr_soft_reclaimed; 2408 unsigned long nr_soft_reclaimed;
2378 unsigned long nr_soft_scanned; 2409 unsigned long nr_soft_scanned;
2379 unsigned long lru_pages = 0;
2380 struct reclaim_state *reclaim_state = current->reclaim_state;
2381 gfp_t orig_mask; 2410 gfp_t orig_mask;
2382 struct shrink_control shrink = {
2383 .gfp_mask = sc->gfp_mask,
2384 };
2385 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2411 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2386 bool reclaimable = false; 2412 bool reclaimable = false;
2387 2413
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2394 if (buffer_heads_over_limit) 2420 if (buffer_heads_over_limit)
2395 sc->gfp_mask |= __GFP_HIGHMEM; 2421 sc->gfp_mask |= __GFP_HIGHMEM;
2396 2422
2397 nodes_clear(shrink.nodes_to_scan);
2398
2399 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2423 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2400 gfp_zone(sc->gfp_mask), sc->nodemask) { 2424 requested_highidx, sc->nodemask) {
2425 enum zone_type classzone_idx;
2426
2401 if (!populated_zone(zone)) 2427 if (!populated_zone(zone))
2402 continue; 2428 continue;
2429
2430 classzone_idx = requested_highidx;
2431 while (!populated_zone(zone->zone_pgdat->node_zones +
2432 classzone_idx))
2433 classzone_idx--;
2434
2403 /* 2435 /*
2404 * Take care memory controller reclaiming has small influence 2436 * Take care memory controller reclaiming has small influence
2405 * to global LRU. 2437 * to global LRU.
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2409 GFP_KERNEL | __GFP_HARDWALL)) 2441 GFP_KERNEL | __GFP_HARDWALL))
2410 continue; 2442 continue;
2411 2443
2412 lru_pages += zone_reclaimable_pages(zone);
2413 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2414
2415 if (sc->priority != DEF_PRIORITY && 2444 if (sc->priority != DEF_PRIORITY &&
2416 !zone_reclaimable(zone)) 2445 !zone_reclaimable(zone))
2417 continue; /* Let kswapd poll it */ 2446 continue; /* Let kswapd poll it */
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2450 /* need some check for avoid more shrink_zone() */ 2479 /* need some check for avoid more shrink_zone() */
2451 } 2480 }
2452 2481
2453 if (shrink_zone(zone, sc)) 2482 if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
2454 reclaimable = true; 2483 reclaimable = true;
2455 2484
2456 if (global_reclaim(sc) && 2485 if (global_reclaim(sc) &&
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2459 } 2488 }
2460 2489
2461 /* 2490 /*
2462 * Don't shrink slabs when reclaiming memory from over limit cgroups
2463 * but do shrink slab at least once when aborting reclaim for
2464 * compaction to avoid unevenly scanning file/anon LRU pages over slab
2465 * pages.
2466 */
2467 if (global_reclaim(sc)) {
2468 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2469 if (reclaim_state) {
2470 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2471 reclaim_state->reclaimed_slab = 0;
2472 }
2473 }
2474
2475 /*
2476 * Restore to original mask to avoid the impact on the caller if we 2491 * Restore to original mask to avoid the impact on the caller if we
2477 * promoted it to __GFP_HIGHMEM. 2492 * promoted it to __GFP_HIGHMEM.
2478 */ 2493 */
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2736 }; 2751 };
2737 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2752 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2738 int swappiness = mem_cgroup_swappiness(memcg); 2753 int swappiness = mem_cgroup_swappiness(memcg);
2754 unsigned long lru_pages;
2739 2755
2740 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2756 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2741 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2757 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2751 * will pick up pages from other mem cgroup's as well. We hack 2767 * will pick up pages from other mem cgroup's as well. We hack
2752 * the priority and make it zero. 2768 * the priority and make it zero.
2753 */ 2769 */
2754 shrink_lruvec(lruvec, swappiness, &sc); 2770 shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
2755 2771
2756 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2772 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2757 2773
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2932static bool kswapd_shrink_zone(struct zone *zone, 2948static bool kswapd_shrink_zone(struct zone *zone,
2933 int classzone_idx, 2949 int classzone_idx,
2934 struct scan_control *sc, 2950 struct scan_control *sc,
2935 unsigned long lru_pages,
2936 unsigned long *nr_attempted) 2951 unsigned long *nr_attempted)
2937{ 2952{
2938 int testorder = sc->order; 2953 int testorder = sc->order;
2939 unsigned long balance_gap; 2954 unsigned long balance_gap;
2940 struct reclaim_state *reclaim_state = current->reclaim_state;
2941 struct shrink_control shrink = {
2942 .gfp_mask = sc->gfp_mask,
2943 };
2944 bool lowmem_pressure; 2955 bool lowmem_pressure;
2945 2956
2946 /* Reclaim above the high watermark. */ 2957 /* Reclaim above the high watermark. */
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2975 balance_gap, classzone_idx)) 2986 balance_gap, classzone_idx))
2976 return true; 2987 return true;
2977 2988
2978 shrink_zone(zone, sc); 2989 shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
2979 nodes_clear(shrink.nodes_to_scan);
2980 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2981
2982 reclaim_state->reclaimed_slab = 0;
2983 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2984 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2985 2990
2986 /* Account for the number of pages attempted to reclaim */ 2991 /* Account for the number of pages attempted to reclaim */
2987 *nr_attempted += sc->nr_to_reclaim; 2992 *nr_attempted += sc->nr_to_reclaim;
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3042 count_vm_event(PAGEOUTRUN); 3047 count_vm_event(PAGEOUTRUN);
3043 3048
3044 do { 3049 do {
3045 unsigned long lru_pages = 0;
3046 unsigned long nr_attempted = 0; 3050 unsigned long nr_attempted = 0;
3047 bool raise_priority = true; 3051 bool raise_priority = true;
3048 bool pgdat_needs_compaction = (order > 0); 3052 bool pgdat_needs_compaction = (order > 0);
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3102 if (!populated_zone(zone)) 3106 if (!populated_zone(zone))
3103 continue; 3107 continue;
3104 3108
3105 lru_pages += zone_reclaimable_pages(zone);
3106
3107 /* 3109 /*
3108 * If any zone is currently balanced then kswapd will 3110 * If any zone is currently balanced then kswapd will
3109 * not call compaction as it is expected that the 3111 * not call compaction as it is expected that the
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3159 * that that high watermark would be met at 100% 3161 * that that high watermark would be met at 100%
3160 * efficiency. 3162 * efficiency.
3161 */ 3163 */
3162 if (kswapd_shrink_zone(zone, end_zone, &sc, 3164 if (kswapd_shrink_zone(zone, end_zone,
3163 lru_pages, &nr_attempted)) 3165 &sc, &nr_attempted))
3164 raise_priority = false; 3166 raise_priority = false;
3165 } 3167 }
3166 3168
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3612 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3614 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3613 .may_swap = 1, 3615 .may_swap = 1,
3614 }; 3616 };
3615 struct shrink_control shrink = {
3616 .gfp_mask = sc.gfp_mask,
3617 };
3618 unsigned long nr_slab_pages0, nr_slab_pages1;
3619 3617
3620 cond_resched(); 3618 cond_resched();
3621 /* 3619 /*
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3634 * priorities until we have enough memory freed. 3632 * priorities until we have enough memory freed.
3635 */ 3633 */
3636 do { 3634 do {
3637 shrink_zone(zone, &sc); 3635 shrink_zone(zone, &sc, true);
3638 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3636 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3639 } 3637 }
3640 3638
3641 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3642 if (nr_slab_pages0 > zone->min_slab_pages) {
3643 /*
3644 * shrink_slab() does not currently allow us to determine how
3645 * many pages were freed in this zone. So we take the current
3646 * number of slab pages and shake the slab until it is reduced
3647 * by the same nr_pages that we used for reclaiming unmapped
3648 * pages.
3649 */
3650 nodes_clear(shrink.nodes_to_scan);
3651 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3652 for (;;) {
3653 unsigned long lru_pages = zone_reclaimable_pages(zone);
3654
3655 /* No reclaimable slab or very low memory pressure */
3656 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3657 break;
3658
3659 /* Freed enough memory */
3660 nr_slab_pages1 = zone_page_state(zone,
3661 NR_SLAB_RECLAIMABLE);
3662 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3663 break;
3664 }
3665
3666 /*
3667 * Update nr_reclaimed by the number of slab pages we
3668 * reclaimed from this zone.
3669 */
3670 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3671 if (nr_slab_pages1 < nr_slab_pages0)
3672 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3673 }
3674
3675 p->reclaim_state = NULL; 3639 p->reclaim_state = NULL;
3676 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3640 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3677 lockdep_clear_current_reclaim_state(); 3641 lockdep_clear_current_reclaim_state();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1b12d390dc68..1284f89fca08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,6 +22,8 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/compaction.h> 23#include <linux/compaction.h>
24#include <linux/mm_inline.h> 24#include <linux/mm_inline.h>
25#include <linux/page_ext.h>
26#include <linux/page_owner.h>
25 27
26#include "internal.h" 28#include "internal.h"
27 29
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = {
898#ifdef CONFIG_DEBUG_VM_VMACACHE 900#ifdef CONFIG_DEBUG_VM_VMACACHE
899 "vmacache_find_calls", 901 "vmacache_find_calls",
900 "vmacache_find_hits", 902 "vmacache_find_hits",
903 "vmacache_full_flushes",
901#endif 904#endif
902#endif /* CONFIG_VM_EVENTS_COUNTERS */ 905#endif /* CONFIG_VM_EVENTS_COUNTERS */
903}; 906};
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1017 return 0; 1020 return 0;
1018} 1021}
1019 1022
1023#ifdef CONFIG_PAGE_OWNER
1024static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
1025 pg_data_t *pgdat,
1026 struct zone *zone)
1027{
1028 struct page *page;
1029 struct page_ext *page_ext;
1030 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
1031 unsigned long end_pfn = pfn + zone->spanned_pages;
1032 unsigned long count[MIGRATE_TYPES] = { 0, };
1033 int pageblock_mt, page_mt;
1034 int i;
1035
1036 /* Scan block by block. First and last block may be incomplete */
1037 pfn = zone->zone_start_pfn;
1038
1039 /*
1040 * Walk the zone in pageblock_nr_pages steps. If a page block spans
1041 * a zone boundary, it will be double counted between zones. This does
1042 * not matter as the mixed block count will still be correct
1043 */
1044 for (; pfn < end_pfn; ) {
1045 if (!pfn_valid(pfn)) {
1046 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
1047 continue;
1048 }
1049
1050 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
1051 block_end_pfn = min(block_end_pfn, end_pfn);
1052
1053 page = pfn_to_page(pfn);
1054 pageblock_mt = get_pfnblock_migratetype(page, pfn);
1055
1056 for (; pfn < block_end_pfn; pfn++) {
1057 if (!pfn_valid_within(pfn))
1058 continue;
1059
1060 page = pfn_to_page(pfn);
1061 if (PageBuddy(page)) {
1062 pfn += (1UL << page_order(page)) - 1;
1063 continue;
1064 }
1065
1066 if (PageReserved(page))
1067 continue;
1068
1069 page_ext = lookup_page_ext(page);
1070
1071 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
1072 continue;
1073
1074 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
1075 if (pageblock_mt != page_mt) {
1076 if (is_migrate_cma(pageblock_mt))
1077 count[MIGRATE_MOVABLE]++;
1078 else
1079 count[pageblock_mt]++;
1080
1081 pfn = block_end_pfn;
1082 break;
1083 }
1084 pfn += (1UL << page_ext->order) - 1;
1085 }
1086 }
1087
1088 /* Print counts */
1089 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1090 for (i = 0; i < MIGRATE_TYPES; i++)
1091 seq_printf(m, "%12lu ", count[i]);
1092 seq_putc(m, '\n');
1093}
1094#endif /* CONFIG_PAGE_OWNER */
1095
1096/*
1097 * Print out the number of pageblocks for each migratetype that contain pages
1098 * of other types. This gives an indication of how well fallbacks are being
1099 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1100 * to determine what is going on
1101 */
1102static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1103{
1104#ifdef CONFIG_PAGE_OWNER
1105 int mtype;
1106
1107 if (!page_owner_inited)
1108 return;
1109
1110 drain_all_pages(NULL);
1111
1112 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1113 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1114 seq_printf(m, "%12s ", migratetype_names[mtype]);
1115 seq_putc(m, '\n');
1116
1117 walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
1118#endif /* CONFIG_PAGE_OWNER */
1119}
1120
1020/* 1121/*
1021 * This prints out statistics in relation to grouping pages by mobility. 1122 * This prints out statistics in relation to grouping pages by mobility.
1022 * It is expensive to collect so do not constantly read the file. 1123 * It is expensive to collect so do not constantly read the file.
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
1034 seq_putc(m, '\n'); 1135 seq_putc(m, '\n');
1035 pagetypeinfo_showfree(m, pgdat); 1136 pagetypeinfo_showfree(m, pgdat);
1036 pagetypeinfo_showblockcount(m, pgdat); 1137 pagetypeinfo_showblockcount(m, pgdat);
1138 pagetypeinfo_showmixedcount(m, pgdat);
1037 1139
1038 return 0; 1140 return 0;
1039} 1141}
diff --git a/mm/zbud.c b/mm/zbud.c
index ec71b37fb06c..4e387bea702e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = {
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
134{ 134{
135 return zbud_create_pool(gfp, &zbud_zpool_ops); 135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 136}
137 137
138static void zbud_zpool_destroy(void *pool) 138static void zbud_zpool_destroy(void *pool)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..4d0a063145ec 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -155,8 +155,6 @@
155 * (reason above) 155 * (reason above)
156 */ 156 */
157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) 157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
158#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
159 ZS_SIZE_CLASS_DELTA + 1)
160 158
161/* 159/*
162 * We do not maintain any list for completely empty or full pages 160 * We do not maintain any list for completely empty or full pages
@@ -171,6 +169,11 @@ enum fullness_group {
171}; 169};
172 170
173/* 171/*
172 * number of size_classes
173 */
174static int zs_size_classes;
175
176/*
174 * We assign a page to ZS_ALMOST_EMPTY fullness group when: 177 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
175 * n <= N / f, where 178 * n <= N / f, where
176 * n = number of allocated objects 179 * n = number of allocated objects
@@ -214,7 +217,7 @@ struct link_free {
214}; 217};
215 218
216struct zs_pool { 219struct zs_pool {
217 struct size_class size_class[ZS_SIZE_CLASSES]; 220 struct size_class **size_class;
218 221
219 gfp_t flags; /* allocation flags used when growing pool */ 222 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated; 223 atomic_long_t pages_allocated;
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
468 if (newfg == currfg) 471 if (newfg == currfg)
469 goto out; 472 goto out;
470 473
471 class = &pool->size_class[class_idx]; 474 class = pool->size_class[class_idx];
472 remove_zspage(page, class, currfg); 475 remove_zspage(page, class, currfg);
473 insert_zspage(page, class, newfg); 476 insert_zspage(page, class, newfg);
474 set_zspage_mapping(page, class_idx, newfg); 477 set_zspage_mapping(page, class_idx, newfg);
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
629 struct page *next_page; 632 struct page *next_page;
630 struct link_free *link; 633 struct link_free *link;
631 unsigned int i = 1; 634 unsigned int i = 1;
635 void *vaddr;
632 636
633 /* 637 /*
634 * page->index stores offset of first object starting 638 * page->index stores offset of first object starting
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
639 if (page != first_page) 643 if (page != first_page)
640 page->index = off; 644 page->index = off;
641 645
642 link = (struct link_free *)kmap_atomic(page) + 646 vaddr = kmap_atomic(page);
643 off / sizeof(*link); 647 link = (struct link_free *)vaddr + off / sizeof(*link);
644 648
645 while ((off += class->size) < PAGE_SIZE) { 649 while ((off += class->size) < PAGE_SIZE) {
646 link->next = obj_location_to_handle(page, i++); 650 link->next = obj_location_to_handle(page, i++);
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
654 */ 658 */
655 next_page = get_next_page(page); 659 next_page = get_next_page(page);
656 link->next = obj_location_to_handle(next_page, 0); 660 link->next = obj_location_to_handle(next_page, 0);
657 kunmap_atomic(link); 661 kunmap_atomic(vaddr);
658 page = next_page; 662 page = next_page;
659 off %= PAGE_SIZE; 663 off %= PAGE_SIZE;
660 } 664 }
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
784 */ 788 */
785 if (area->vm_buf) 789 if (area->vm_buf)
786 return 0; 790 return 0;
787 area->vm_buf = (char *)__get_free_page(GFP_KERNEL); 791 area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
788 if (!area->vm_buf) 792 if (!area->vm_buf)
789 return -ENOMEM; 793 return -ENOMEM;
790 return 0; 794 return 0;
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
792 796
793static inline void __zs_cpu_down(struct mapping_area *area) 797static inline void __zs_cpu_down(struct mapping_area *area)
794{ 798{
795 if (area->vm_buf) 799 kfree(area->vm_buf);
796 free_page((unsigned long)area->vm_buf);
797 area->vm_buf = NULL; 800 area->vm_buf = NULL;
798} 801}
799 802
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = {
881 .notifier_call = zs_cpu_notifier 884 .notifier_call = zs_cpu_notifier
882}; 885};
883 886
884static void zs_exit(void) 887static void zs_unregister_cpu_notifier(void)
885{ 888{
886 int cpu; 889 int cpu;
887 890
888#ifdef CONFIG_ZPOOL
889 zpool_unregister_driver(&zs_zpool_driver);
890#endif
891
892 cpu_notifier_register_begin(); 891 cpu_notifier_register_begin();
893 892
894 for_each_online_cpu(cpu) 893 for_each_online_cpu(cpu)
@@ -898,31 +897,74 @@ static void zs_exit(void)
898 cpu_notifier_register_done(); 897 cpu_notifier_register_done();
899} 898}
900 899
901static int zs_init(void) 900static int zs_register_cpu_notifier(void)
902{ 901{
903 int cpu, ret; 902 int cpu, uninitialized_var(ret);
904 903
905 cpu_notifier_register_begin(); 904 cpu_notifier_register_begin();
906 905
907 __register_cpu_notifier(&zs_cpu_nb); 906 __register_cpu_notifier(&zs_cpu_nb);
908 for_each_online_cpu(cpu) { 907 for_each_online_cpu(cpu) {
909 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 908 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
910 if (notifier_to_errno(ret)) { 909 if (notifier_to_errno(ret))
911 cpu_notifier_register_done(); 910 break;
912 goto fail;
913 }
914 } 911 }
915 912
916 cpu_notifier_register_done(); 913 cpu_notifier_register_done();
914 return notifier_to_errno(ret);
915}
916
917static void init_zs_size_classes(void)
918{
919 int nr;
917 920
921 nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
922 if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
923 nr += 1;
924
925 zs_size_classes = nr;
926}
927
928static void __exit zs_exit(void)
929{
918#ifdef CONFIG_ZPOOL 930#ifdef CONFIG_ZPOOL
919 zpool_register_driver(&zs_zpool_driver); 931 zpool_unregister_driver(&zs_zpool_driver);
920#endif 932#endif
933 zs_unregister_cpu_notifier();
934}
921 935
936static int __init zs_init(void)
937{
938 int ret = zs_register_cpu_notifier();
939
940 if (ret) {
941 zs_unregister_cpu_notifier();
942 return ret;
943 }
944
945 init_zs_size_classes();
946
947#ifdef CONFIG_ZPOOL
948 zpool_register_driver(&zs_zpool_driver);
949#endif
922 return 0; 950 return 0;
923fail: 951}
924 zs_exit(); 952
925 return notifier_to_errno(ret); 953static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
954{
955 return pages_per_zspage * PAGE_SIZE / size;
956}
957
958static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
959{
960 if (prev->pages_per_zspage != pages_per_zspage)
961 return false;
962
963 if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
964 != get_maxobj_per_zspage(size, pages_per_zspage))
965 return false;
966
967 return true;
926} 968}
927 969
928/** 970/**
@@ -937,33 +979,71 @@ fail:
937 */ 979 */
938struct zs_pool *zs_create_pool(gfp_t flags) 980struct zs_pool *zs_create_pool(gfp_t flags)
939{ 981{
940 int i, ovhd_size; 982 int i;
941 struct zs_pool *pool; 983 struct zs_pool *pool;
984 struct size_class *prev_class = NULL;
942 985
943 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); 986 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
944 pool = kzalloc(ovhd_size, GFP_KERNEL);
945 if (!pool) 987 if (!pool)
946 return NULL; 988 return NULL;
947 989
948 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 990 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
991 GFP_KERNEL);
992 if (!pool->size_class) {
993 kfree(pool);
994 return NULL;
995 }
996
997 /*
998	 * Iterate in reverse, because the size of the size_class that we want
999	 * to use for merging should be larger than or equal to the current size.
1000 */
1001 for (i = zs_size_classes - 1; i >= 0; i--) {
949 int size; 1002 int size;
1003 int pages_per_zspage;
950 struct size_class *class; 1004 struct size_class *class;
951 1005
952 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; 1006 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
953 if (size > ZS_MAX_ALLOC_SIZE) 1007 if (size > ZS_MAX_ALLOC_SIZE)
954 size = ZS_MAX_ALLOC_SIZE; 1008 size = ZS_MAX_ALLOC_SIZE;
1009 pages_per_zspage = get_pages_per_zspage(size);
1010
1011 /*
1012		 * size_class is used for normal zsmalloc operations such
1013		 * as alloc/free for that size. Although it is natural that we
1014		 * have one size_class for each size, there is a chance that we
1015		 * can get better memory utilization if we use one size_class for
1016		 * many different sizes whose size_classes have the same
1017		 * characteristics. So, we make size_class point to the
1018		 * previous size_class if possible.
1019 */
1020 if (prev_class) {
1021 if (can_merge(prev_class, size, pages_per_zspage)) {
1022 pool->size_class[i] = prev_class;
1023 continue;
1024 }
1025 }
1026
1027 class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
1028 if (!class)
1029 goto err;
955 1030
956 class = &pool->size_class[i];
957 class->size = size; 1031 class->size = size;
958 class->index = i; 1032 class->index = i;
1033 class->pages_per_zspage = pages_per_zspage;
959 spin_lock_init(&class->lock); 1034 spin_lock_init(&class->lock);
960 class->pages_per_zspage = get_pages_per_zspage(size); 1035 pool->size_class[i] = class;
961 1036
1037 prev_class = class;
962 } 1038 }
963 1039
964 pool->flags = flags; 1040 pool->flags = flags;
965 1041
966 return pool; 1042 return pool;
1043
1044err:
1045 zs_destroy_pool(pool);
1046 return NULL;
967} 1047}
968EXPORT_SYMBOL_GPL(zs_create_pool); 1048EXPORT_SYMBOL_GPL(zs_create_pool);
969 1049
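
A quick numeric check of the merging rule above, using the get_maxobj_per_zspage() formula introduced in this hunk; PAGE_SIZE and the shared pages_per_zspage value of 4 are assumptions chosen only to make the arithmetic visible.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed */

static unsigned int maxobj(unsigned long size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;	/* same formula as the patch */
}

int main(void)
{
	/* Adjacent classes that fit the same number of objects can merge. */
	printf("3296: %u, 3312: %u -> mergeable\n",
	       maxobj(3296, 4), maxobj(3312, 4));	/* 4, 4 */
	printf("3264: %u, 3280: %u -> kept separate\n",
	       maxobj(3264, 4), maxobj(3280, 4));	/* 5, 4 */
	return 0;
}
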
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool)
971{ 1051{
972 int i; 1052 int i;
973 1053
974 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 1054 for (i = 0; i < zs_size_classes; i++) {
975 int fg; 1055 int fg;
976 struct size_class *class = &pool->size_class[i]; 1056 struct size_class *class = pool->size_class[i];
1057
1058 if (!class)
1059 continue;
1060
1061 if (class->index != i)
1062 continue;
977 1063
978 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { 1064 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
979 if (class->fullness_list[fg]) { 1065 if (class->fullness_list[fg]) {
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool)
981 class->size, fg); 1067 class->size, fg);
982 } 1068 }
983 } 1069 }
1070 kfree(class);
984 } 1071 }
1072
1073 kfree(pool->size_class);
985 kfree(pool); 1074 kfree(pool);
986} 1075}
987EXPORT_SYMBOL_GPL(zs_destroy_pool); 1076EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
999{ 1088{
1000 unsigned long obj; 1089 unsigned long obj;
1001 struct link_free *link; 1090 struct link_free *link;
1002 int class_idx;
1003 struct size_class *class; 1091 struct size_class *class;
1092 void *vaddr;
1004 1093
1005 struct page *first_page, *m_page; 1094 struct page *first_page, *m_page;
1006 unsigned long m_objidx, m_offset; 1095 unsigned long m_objidx, m_offset;
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1008 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1097 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1009 return 0; 1098 return 0;
1010 1099
1011 class_idx = get_size_class_index(size); 1100 class = pool->size_class[get_size_class_index(size)];
1012 class = &pool->size_class[class_idx];
1013 BUG_ON(class_idx != class->index);
1014 1101
1015 spin_lock(&class->lock); 1102 spin_lock(&class->lock);
1016 first_page = find_get_zspage(class); 1103 first_page = find_get_zspage(class);
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1031 obj_handle_to_location(obj, &m_page, &m_objidx); 1118 obj_handle_to_location(obj, &m_page, &m_objidx);
1032 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); 1119 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1033 1120
1034 link = (struct link_free *)kmap_atomic(m_page) + 1121 vaddr = kmap_atomic(m_page);
1035 m_offset / sizeof(*link); 1122 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1036 first_page->freelist = link->next; 1123 first_page->freelist = link->next;
1037 memset(link, POISON_INUSE, sizeof(*link)); 1124 memset(link, POISON_INUSE, sizeof(*link));
1038 kunmap_atomic(link); 1125 kunmap_atomic(vaddr);
1039 1126
1040 first_page->inuse++; 1127 first_page->inuse++;
1041 /* Now move the zspage to another fullness group, if required */ 1128 /* Now move the zspage to another fullness group, if required */
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1051 struct link_free *link; 1138 struct link_free *link;
1052 struct page *first_page, *f_page; 1139 struct page *first_page, *f_page;
1053 unsigned long f_objidx, f_offset; 1140 unsigned long f_objidx, f_offset;
1141 void *vaddr;
1054 1142
1055 int class_idx; 1143 int class_idx;
1056 struct size_class *class; 1144 struct size_class *class;
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1063 first_page = get_first_page(f_page); 1151 first_page = get_first_page(f_page);
1064 1152
1065 get_zspage_mapping(first_page, &class_idx, &fullness); 1153 get_zspage_mapping(first_page, &class_idx, &fullness);
1066 class = &pool->size_class[class_idx]; 1154 class = pool->size_class[class_idx];
1067 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1155 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1068 1156
1069 spin_lock(&class->lock); 1157 spin_lock(&class->lock);
1070 1158
1071 /* Insert this object in containing zspage's freelist */ 1159 /* Insert this object in containing zspage's freelist */
1072 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) 1160 vaddr = kmap_atomic(f_page);
1073 + f_offset); 1161 link = (struct link_free *)(vaddr + f_offset);
1074 link->next = first_page->freelist; 1162 link->next = first_page->freelist;
1075 kunmap_atomic(link); 1163 kunmap_atomic(vaddr);
1076 first_page->freelist = (void *)obj; 1164 first_page->freelist = (void *)obj;
1077 1165
1078 first_page->inuse--; 1166 first_page->inuse--;
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1124 1212
1125 obj_handle_to_location(handle, &page, &obj_idx); 1213 obj_handle_to_location(handle, &page, &obj_idx);
1126 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1214 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1127 class = &pool->size_class[class_idx]; 1215 class = pool->size_class[class_idx];
1128 off = obj_idx_to_offset(page, obj_idx, class->size); 1216 off = obj_idx_to_offset(page, obj_idx, class->size);
1129 1217
1130 area = &get_cpu_var(zs_map_area); 1218 area = &get_cpu_var(zs_map_area);
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1158 1246
1159 obj_handle_to_location(handle, &page, &obj_idx); 1247 obj_handle_to_location(handle, &page, &obj_idx);
1160 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1248 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1161 class = &pool->size_class[class_idx]; 1249 class = pool->size_class[class_idx];
1162 off = obj_idx_to_offset(page, obj_idx, class->size); 1250 off = obj_idx_to_offset(page, obj_idx, class->size);
1163 1251
1164 area = this_cpu_ptr(&zs_map_area); 1252 area = this_cpu_ptr(&zs_map_area);
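
The zsmalloc hunks above make two related changes: pool->size_class is now an array of pointers, so zs_destroy_pool() skips NULL entries and entries whose class->index does not match the slot (apparently several indices can share one class) and kfree()s each class and then the array itself; and the zs_malloc()/zs_free() paths keep the pointer returned by kmap_atomic() in a local vaddr so that exactly that pointer, not an offset-adjusted one, is handed back to kunmap_atomic(). A minimal sketch of the latter discipline, outside this patch and with a hypothetical poke_object() helper:

/* Sketch only: keep the mapping base, apply the object offset to a copy. */
#include <linux/highmem.h>
#include <linux/string.h>

static void poke_object(struct page *page, unsigned long offset, int byte)
{
	void *vaddr = kmap_atomic(page);	/* base of the temporary mapping */
	char *obj = (char *)vaddr + offset;	/* object sits at an offset inside the page */

	memset(obj, byte, 1);
	kunmap_atomic(vaddr);			/* unmap with the original base address */
}
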
diff --git a/mm/zswap.c b/mm/zswap.c
index c1543061a192..0cfce9bc51e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void)
149 return 0; 149 return 0;
150} 150}
151 151
152static void zswap_comp_exit(void) 152static void __init zswap_comp_exit(void)
153{ 153{
154 /* free percpu transforms */ 154 /* free percpu transforms */
155 if (zswap_comp_pcpu_tfms) 155 free_percpu(zswap_comp_pcpu_tfms);
156 free_percpu(zswap_comp_pcpu_tfms);
157} 156}
158 157
159/********************************* 158/*********************************
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
206**********************************/ 205**********************************/
207static struct kmem_cache *zswap_entry_cache; 206static struct kmem_cache *zswap_entry_cache;
208 207
209static int zswap_entry_cache_create(void) 208static int __init zswap_entry_cache_create(void)
210{ 209{
211 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 210 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
212 return zswap_entry_cache == NULL; 211 return zswap_entry_cache == NULL;
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = {
389 .notifier_call = zswap_cpu_notifier 388 .notifier_call = zswap_cpu_notifier
390}; 389};
391 390
392static int zswap_cpu_init(void) 391static int __init zswap_cpu_init(void)
393{ 392{
394 unsigned long cpu; 393 unsigned long cpu;
395 394
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 45f145c6f843..c14893b501a9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@ TARGETS += user
15TARGETS += sysctl 15TARGETS += sysctl
16TARGETS += firmware 16TARGETS += firmware
17TARGETS += ftrace 17TARGETS += ftrace
18TARGETS += exec
18 19
19TARGETS_HOTPLUG = cpu-hotplug 20TARGETS_HOTPLUG = cpu-hotplug
20TARGETS_HOTPLUG += memory-hotplug 21TARGETS_HOTPLUG += memory-hotplug
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
new file mode 100644
index 000000000000..64073e050c6a
--- /dev/null
+++ b/tools/testing/selftests/exec/.gitignore
@@ -0,0 +1,9 @@
1subdir*
2script*
3execveat
4execveat.symlink
5execveat.moved
6execveat.path.ephemeral
7execveat.ephemeral
8execveat.denatured
9xxxxxxxx* \ No newline at end of file
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
new file mode 100644
index 000000000000..66dfc2ce1788
--- /dev/null
+++ b/tools/testing/selftests/exec/Makefile
@@ -0,0 +1,25 @@
1CC = $(CROSS_COMPILE)gcc
2CFLAGS = -Wall
3BINARIES = execveat
4DEPS = execveat.symlink execveat.denatured script subdir
5all: $(BINARIES) $(DEPS)
6
7subdir:
8 mkdir -p $@
9script:
10 echo '#!/bin/sh' > $@
11 echo 'exit $$*' >> $@
12 chmod +x $@
13execveat.symlink: execveat
14 ln -s -f $< $@
15execveat.denatured: execveat
16 cp $< $@
17 chmod -x $@
18%: %.c
19 $(CC) $(CFLAGS) -o $@ $^
20
21run_tests: all
22 ./execveat
23
24clean:
25 rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx*
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c
new file mode 100644
index 000000000000..33a5c06d95ca
--- /dev/null
+++ b/tools/testing/selftests/exec/execveat.c
@@ -0,0 +1,397 @@
1/*
2 * Copyright (c) 2014 Google, Inc.
3 *
4 * Licensed under the terms of the GNU GPL License version 2
5 *
6 * Selftests for execveat(2).
7 */
8
9#define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */
10#include <sys/sendfile.h>
11#include <sys/stat.h>
12#include <sys/syscall.h>
13#include <sys/types.h>
14#include <sys/wait.h>
15#include <errno.h>
16#include <fcntl.h>
17#include <limits.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include <unistd.h>
22
23static char longpath[2 * PATH_MAX] = "";
24static char *envp[] = { "IN_TEST=yes", NULL, NULL };
25static char *argv[] = { "execveat", "99", NULL };
26
27static int execveat_(int fd, const char *path, char **argv, char **envp,
28 int flags)
29{
30#ifdef __NR_execveat
31 return syscall(__NR_execveat, fd, path, argv, envp, flags);
32#else
33 errno = ENOSYS;
34 return -1;
35#endif
36}
37
38#define check_execveat_fail(fd, path, flags, errno) \
39 _check_execveat_fail(fd, path, flags, errno, #errno)
40static int _check_execveat_fail(int fd, const char *path, int flags,
41 int expected_errno, const char *errno_str)
42{
43 int rc;
44
45 errno = 0;
46 printf("Check failure of execveat(%d, '%s', %d) with %s... ",
47 fd, path?:"(null)", flags, errno_str);
48 rc = execveat_(fd, path, argv, envp, flags);
49
50 if (rc > 0) {
51 printf("[FAIL] (unexpected success from execveat(2))\n");
52 return 1;
53 }
54 if (errno != expected_errno) {
55 printf("[FAIL] (expected errno %d (%s) not %d (%s))\n",
56 expected_errno, strerror(expected_errno),
57 errno, strerror(errno));
58 return 1;
59 }
60 printf("[OK]\n");
61 return 0;
62}
63
64static int check_execveat_invoked_rc(int fd, const char *path, int flags,
65 int expected_rc)
66{
67 int status;
68 int rc;
69 pid_t child;
70 int pathlen = path ? strlen(path) : 0;
71
72 if (pathlen > 40)
73 printf("Check success of execveat(%d, '%.20s...%s', %d)... ",
74 fd, path, (path + pathlen - 20), flags);
75 else
76 printf("Check success of execveat(%d, '%s', %d)... ",
77 fd, path?:"(null)", flags);
78 child = fork();
79 if (child < 0) {
80 printf("[FAIL] (fork() failed)\n");
81 return 1;
82 }
83 if (child == 0) {
84 /* Child: do execveat(). */
85 rc = execveat_(fd, path, argv, envp, flags);
86 printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n",
87 rc, errno, strerror(errno));
88 exit(1); /* should not reach here */
89 }
90 /* Parent: wait for & check child's exit status. */
91 rc = waitpid(child, &status, 0);
92 if (rc != child) {
93 printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc);
94 return 1;
95 }
96 if (!WIFEXITED(status)) {
97 printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n",
98 child, status);
99 return 1;
100 }
101 if (WEXITSTATUS(status) != expected_rc) {
102 printf("[FAIL] (child %d exited with %d not %d)\n",
103 child, WEXITSTATUS(status), expected_rc);
104 return 1;
105 }
106 printf("[OK]\n");
107 return 0;
108}
109
110static int check_execveat(int fd, const char *path, int flags)
111{
112 return check_execveat_invoked_rc(fd, path, flags, 99);
113}
114
115static char *concat(const char *left, const char *right)
116{
117 char *result = malloc(strlen(left) + strlen(right) + 1);
118
119 strcpy(result, left);
120 strcat(result, right);
121 return result;
122}
123
124static int open_or_die(const char *filename, int flags)
125{
126 int fd = open(filename, flags);
127
128 if (fd < 0) {
129 printf("Failed to open '%s'; "
130 "check prerequisites are available\n", filename);
131 exit(1);
132 }
133 return fd;
134}
135
136static void exe_cp(const char *src, const char *dest)
137{
138 int in_fd = open_or_die(src, O_RDONLY);
139 int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755);
140 struct stat info;
141
142 fstat(in_fd, &info);
143 sendfile(out_fd, in_fd, NULL, info.st_size);
144 close(in_fd);
145 close(out_fd);
146}
147
148#define XX_DIR_LEN 200
149static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script)
150{
151 int fail = 0;
152 int ii, count, len;
153 char longname[XX_DIR_LEN + 1];
154 int fd;
155
156 if (*longpath == '\0') {
157 /* Create a filename close to PATH_MAX in length */
158 memset(longname, 'x', XX_DIR_LEN - 1);
159 longname[XX_DIR_LEN - 1] = '/';
160 longname[XX_DIR_LEN] = '\0';
161 count = (PATH_MAX - 3) / XX_DIR_LEN;
162 for (ii = 0; ii < count; ii++) {
163 strcat(longpath, longname);
164 mkdir(longpath, 0755);
165 }
166 len = (PATH_MAX - 3) - (count * XX_DIR_LEN);
167 if (len <= 0)
168 len = 1;
169 memset(longname, 'y', len);
170 longname[len] = '\0';
171 strcat(longpath, longname);
172 }
173 exe_cp(src, longpath);
174
175 /*
176 * Execute as a pre-opened file descriptor, which works whether this is
177 * a script or not (because the interpreter sees a filename like
178 * "/dev/fd/20").
179 */
180 fd = open(longpath, O_RDONLY);
181 if (fd > 0) {
182 printf("Invoke copy of '%s' via filename of length %lu:\n",
183 src, strlen(longpath));
184 fail += check_execveat(fd, "", AT_EMPTY_PATH);
185 } else {
186 printf("Failed to open length %lu filename, errno=%d (%s)\n",
187 strlen(longpath), errno, strerror(errno));
188 fail++;
189 }
190
191 /*
192 * Execute as a long pathname relative to ".". If this is a script,
193 * the interpreter will launch but fail to open the script because its
194 * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX.
195 */
196 if (is_script)
197 fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127);
198 else
199 fail += check_execveat(dot_dfd, longpath, 0);
200
201 return fail;
202}
203
204static int run_tests(void)
205{
206 int fail = 0;
207 char *fullname = realpath("execveat", NULL);
208 char *fullname_script = realpath("script", NULL);
209 char *fullname_symlink = concat(fullname, ".symlink");
210 int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY);
211 int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral",
212 O_DIRECTORY|O_RDONLY);
213 int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY);
214 int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH);
215 int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC);
216 int fd = open_or_die("execveat", O_RDONLY);
217 int fd_path = open_or_die("execveat", O_RDONLY|O_PATH);
218 int fd_symlink = open_or_die("execveat.symlink", O_RDONLY);
219 int fd_denatured = open_or_die("execveat.denatured", O_RDONLY);
220 int fd_denatured_path = open_or_die("execveat.denatured",
221 O_RDONLY|O_PATH);
222 int fd_script = open_or_die("script", O_RDONLY);
223 int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY);
224 int fd_ephemeral_path = open_or_die("execveat.path.ephemeral",
225 O_RDONLY|O_PATH);
226 int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY);
227 int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC);
228 int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC);
229
230 /* Change file position to confirm it doesn't affect anything */
231 lseek(fd, 10, SEEK_SET);
232
233 /* Normal executable file: */
234 /* dfd + path */
235 fail += check_execveat(subdir_dfd, "../execveat", 0);
236 fail += check_execveat(dot_dfd, "execveat", 0);
237 fail += check_execveat(dot_dfd_path, "execveat", 0);
238 /* absolute path */
239 fail += check_execveat(AT_FDCWD, fullname, 0);
240 /* absolute path with nonsense dfd */
241 fail += check_execveat(99, fullname, 0);
242 /* fd + no path */
243 fail += check_execveat(fd, "", AT_EMPTY_PATH);
244 /* O_CLOEXEC fd + no path */
245 fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH);
246 /* O_PATH fd */
247 fail += check_execveat(fd_path, "", AT_EMPTY_PATH);
248
249 /* Mess with executable file that's already open: */
250 /* fd + no path to a file that's been renamed */
251 rename("execveat.ephemeral", "execveat.moved");
252 fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
253 /* fd + no path to a file that's been deleted */
254 unlink("execveat.moved"); /* remove the file while fd open */
255 fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
256
257 /* Mess with executable file that's already open with O_PATH */
258 /* fd + no path to a file that's been deleted */
259 unlink("execveat.path.ephemeral");
260 fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH);
261
262 /* Invalid argument failures */
263 fail += check_execveat_fail(fd, "", 0, ENOENT);
264 fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT);
265
266 /* Symlink to executable file: */
267 /* dfd + path */
268 fail += check_execveat(dot_dfd, "execveat.symlink", 0);
269 fail += check_execveat(dot_dfd_path, "execveat.symlink", 0);
270 /* absolute path */
271 fail += check_execveat(AT_FDCWD, fullname_symlink, 0);
272 /* fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */
273 fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH);
274 fail += check_execveat(fd_symlink, "",
275 AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
276
277 /* Symlink fails when AT_SYMLINK_NOFOLLOW set: */
278 /* dfd + path */
279 fail += check_execveat_fail(dot_dfd, "execveat.symlink",
280 AT_SYMLINK_NOFOLLOW, ELOOP);
281 fail += check_execveat_fail(dot_dfd_path, "execveat.symlink",
282 AT_SYMLINK_NOFOLLOW, ELOOP);
283 /* absolute path */
284 fail += check_execveat_fail(AT_FDCWD, fullname_symlink,
285 AT_SYMLINK_NOFOLLOW, ELOOP);
286
287 /* Shell script wrapping executable file: */
288 /* dfd + path */
289 fail += check_execveat(subdir_dfd, "../script", 0);
290 fail += check_execveat(dot_dfd, "script", 0);
291 fail += check_execveat(dot_dfd_path, "script", 0);
292 /* absolute path */
293 fail += check_execveat(AT_FDCWD, fullname_script, 0);
294 /* fd + no path */
295 fail += check_execveat(fd_script, "", AT_EMPTY_PATH);
296 fail += check_execveat(fd_script, "",
297 AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
298 /* O_CLOEXEC fd fails for a script (as script file inaccessible) */
299 fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH,
300 ENOENT);
301 fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT);
302
303 /* Mess with script file that's already open: */
304 /* fd + no path to a file that's been renamed */
305 rename("script.ephemeral", "script.moved");
306 fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
307 /* fd + no path to a file that's been deleted */
308 unlink("script.moved"); /* remove the file while fd open */
309 fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
310
311 /* Rename a subdirectory in the path: */
312 rename("subdir.ephemeral", "subdir.moved");
313 fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
314 fail += check_execveat(subdir_dfd_ephemeral, "script", 0);
315 /* Remove the subdir and its contents */
316 unlink("subdir.moved/script");
317 unlink("subdir.moved");
318 /* Shell loads via deleted subdir OK because name starts with .. */
319 fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
320 fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT);
321
322 /* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */
323 fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL);
324 /* Invalid path => ENOENT */
325 fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT);
326 fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT);
327 fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT);
328 /* Attempt to execute directory => EACCES */
329 fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES);
330 /* Attempt to execute non-executable => EACCES */
331 fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES);
332 fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES);
333 fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH,
334 EACCES);
335 /* Attempt to execute nonsense FD => EBADF */
336 fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF);
337 fail += check_execveat_fail(99, "execveat", 0, EBADF);
338 /* Attempt to execute relative to non-directory => ENOTDIR */
339 fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR);
340
341 fail += check_execveat_pathmax(dot_dfd, "execveat", 0);
342 fail += check_execveat_pathmax(dot_dfd, "script", 1);
343 return fail;
344}
345
346static void prerequisites(void)
347{
348 int fd;
349 const char *script = "#!/bin/sh\nexit $*\n";
350
351 /* Create ephemeral copies of files */
352 exe_cp("execveat", "execveat.ephemeral");
353 exe_cp("execveat", "execveat.path.ephemeral");
354 exe_cp("script", "script.ephemeral");
355 mkdir("subdir.ephemeral", 0755);
356
357 fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755);
358 write(fd, script, strlen(script));
359 close(fd);
360}
361
362int main(int argc, char **argv)
363{
364 int ii;
365 int rc;
366 const char *verbose = getenv("VERBOSE");
367
368 if (argc >= 2) {
369 /* If we are invoked with an argument, don't run tests. */
370 const char *in_test = getenv("IN_TEST");
371
372 if (verbose) {
373 printf(" invoked with:");
374 for (ii = 0; ii < argc; ii++)
375 printf(" [%d]='%s'", ii, argv[ii]);
376 printf("\n");
377 }
378
379 /* Check expected environment transferred. */
380 if (!in_test || strcmp(in_test, "yes") != 0) {
381 printf("[FAIL] (no IN_TEST=yes in env)\n");
382 return 1;
383 }
384
385 /* Use the final argument as an exit code. */
386 rc = atoi(argv[argc - 1]);
387 fflush(stdout);
388 } else {
389 prerequisites();
390 if (verbose)
391 envp[1] = "VERBOSE=1";
392 rc = run_tests();
393 if (rc > 0)
394 printf("%d tests failed\n", rc);
395 }
396 return rc;
397}
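
The new selftest drives execveat(2) through dfd plus relative path, absolute path, fd with AT_EMPTY_PATH, O_PATH and O_CLOEXEC descriptors, symlinks with and without AT_SYMLINK_NOFOLLOW, scripts, renamed and deleted files, and near-PATH_MAX names, checking both successful invocation and the expected errno values. For orientation, a minimal standalone sketch of the central "fd plus empty path" case, assuming a kernel and headers that define __NR_execveat and using /bin/true purely as a stand-in binary:

#define _GNU_SOURCE			/* for AT_EMPTY_PATH */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "true", NULL };
	char *envp[] = { NULL };
	int fd = open("/bin/true", O_RDONLY);	/* any executable fd would do */

	if (fd < 0) {
		perror("open");
		return 1;
	}
#ifdef __NR_execveat
	/* Execute the file the fd refers to: empty path plus AT_EMPTY_PATH. */
	syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
#endif
	perror("execveat");	/* reached only if the call failed or is unavailable */
	return 1;
}
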
diff --git a/tools/vm/Makefile b/tools/vm/Makefile
index 3d907dacf2ac..ac884b65a072 100644
--- a/tools/vm/Makefile
+++ b/tools/vm/Makefile
@@ -1,6 +1,6 @@
1# Makefile for vm tools 1# Makefile for vm tools
2# 2#
3TARGETS=page-types slabinfo 3TARGETS=page-types slabinfo page_owner_sort
4 4
5LIB_DIR = ../lib/api 5LIB_DIR = ../lib/api
6LIBS = $(LIB_DIR)/libapikfs.a 6LIBS = $(LIB_DIR)/libapikfs.a
@@ -18,5 +18,5 @@ $(LIBS):
18 $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) 18 $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
19 19
20clean: 20clean:
21 $(RM) page-types slabinfo 21 $(RM) page-types slabinfo page_owner_sort
22 make -C $(LIB_DIR) clean 22 make -C $(LIB_DIR) clean
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
new file mode 100644
index 000000000000..77147b42d598
--- /dev/null
+++ b/tools/vm/page_owner_sort.c
@@ -0,0 +1,144 @@
1/*
2 * User-space helper to sort the output of /sys/kernel/debug/page_owner
3 *
4 * Example use:
5 * cat /sys/kernel/debug/page_owner > page_owner_full.txt
6 * grep -v ^PFN page_owner_full.txt > page_owner.txt
7 * ./sort page_owner.txt sorted_page_owner.txt
8*/
9
10#include <stdio.h>
11#include <stdlib.h>
12#include <sys/types.h>
13#include <sys/stat.h>
14#include <fcntl.h>
15#include <unistd.h>
16#include <string.h>
17
18struct block_list {
19 char *txt;
20 int len;
21 int num;
22};
23
24
25static struct block_list *list;
26static int list_size;
27static int max_size;
28
29struct block_list *block_head;
30
31int read_block(char *buf, int buf_size, FILE *fin)
32{
33 char *curr = buf, *const buf_end = buf + buf_size;
34
35 while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) {
36 if (*curr == '\n') /* empty line */
37 return curr - buf;
38 curr += strlen(curr);
39 }
40
41 return -1; /* EOF or no space left in buf. */
42}
43
44static int compare_txt(const void *p1, const void *p2)
45{
46 const struct block_list *l1 = p1, *l2 = p2;
47
48 return strcmp(l1->txt, l2->txt);
49}
50
51static int compare_num(const void *p1, const void *p2)
52{
53 const struct block_list *l1 = p1, *l2 = p2;
54
55 return l2->num - l1->num;
56}
57
58static void add_list(char *buf, int len)
59{
60 if (list_size != 0 &&
61 len == list[list_size-1].len &&
62 memcmp(buf, list[list_size-1].txt, len) == 0) {
63 list[list_size-1].num++;
64 return;
65 }
66 if (list_size == max_size) {
67 printf("max_size too small??\n");
68 exit(1);
69 }
70 list[list_size].txt = malloc(len+1);
71 list[list_size].len = len;
72 list[list_size].num = 1;
73 memcpy(list[list_size].txt, buf, len);
74 list[list_size].txt[len] = 0;
75 list_size++;
76 if (list_size % 1000 == 0) {
77 printf("loaded %d\r", list_size);
78 fflush(stdout);
79 }
80}
81
82#define BUF_SIZE 1024
83
84int main(int argc, char **argv)
85{
86 FILE *fin, *fout;
87 char buf[BUF_SIZE];
88 int ret, i, count;
89 struct block_list *list2;
90 struct stat st;
91
92 if (argc < 3) {
93 printf("Usage: ./program <input> <output>\n");
94 perror("open: ");
95 exit(1);
96 }
97
98 fin = fopen(argv[1], "r");
99 fout = fopen(argv[2], "w");
100 if (!fin || !fout) {
101 printf("Usage: ./program <input> <output>\n");
102 perror("open: ");
103 exit(1);
104 }
105
106 fstat(fileno(fin), &st);
107 max_size = st.st_size / 100; /* hack ... */
108
109 list = malloc(max_size * sizeof(*list));
110
111 for ( ; ; ) {
112 ret = read_block(buf, BUF_SIZE, fin);
113 if (ret < 0)
114 break;
115
116 add_list(buf, ret);
117 }
118
119 printf("loaded %d\n", list_size);
120
121 printf("sorting ....\n");
122
123 qsort(list, list_size, sizeof(list[0]), compare_txt);
124
125 list2 = malloc(sizeof(*list) * list_size);
126
127 printf("culling\n");
128
129 for (i = count = 0; i < list_size; i++) {
130 if (count == 0 ||
131 strcmp(list2[count-1].txt, list[i].txt) != 0) {
132 list2[count++] = list[i];
133 } else {
134 list2[count-1].num += list[i].num;
135 }
136 }
137
138 qsort(list2, count, sizeof(list[0]), compare_num);
139
140 for (i = 0; i < count; i++)
141 fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt);
142
143 return 0;
144}
diff --git a/usr/Kconfig b/usr/Kconfig
index 2d4c77eecf2e..572dcf7b6a44 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -46,17 +46,17 @@ config INITRAMFS_ROOT_GID
46 If you are not sure, leave it set to "0". 46 If you are not sure, leave it set to "0".
47 47
48config RD_GZIP 48config RD_GZIP
49 bool "Support initial ramdisks compressed using gzip" if EXPERT 49 bool "Support initial ramdisks compressed using gzip"
50 default y
51 depends on BLK_DEV_INITRD 50 depends on BLK_DEV_INITRD
51 default y
52 select DECOMPRESS_GZIP 52 select DECOMPRESS_GZIP
53 help 53 help
54 Support loading of a gzip encoded initial ramdisk or cpio buffer. 54 Support loading of a gzip encoded initial ramdisk or cpio buffer.
55 If unsure, say Y. 55 If unsure, say Y.
56 56
57config RD_BZIP2 57config RD_BZIP2
58 bool "Support initial ramdisks compressed using bzip2" if EXPERT 58 bool "Support initial ramdisks compressed using bzip2"
59 default !EXPERT 59 default y
60 depends on BLK_DEV_INITRD 60 depends on BLK_DEV_INITRD
61 select DECOMPRESS_BZIP2 61 select DECOMPRESS_BZIP2
62 help 62 help
@@ -64,8 +64,8 @@ config RD_BZIP2
64 If unsure, say N. 64 If unsure, say N.
65 65
66config RD_LZMA 66config RD_LZMA
67 bool "Support initial ramdisks compressed using LZMA" if EXPERT 67 bool "Support initial ramdisks compressed using LZMA"
68 default !EXPERT 68 default y
69 depends on BLK_DEV_INITRD 69 depends on BLK_DEV_INITRD
70 select DECOMPRESS_LZMA 70 select DECOMPRESS_LZMA
71 help 71 help
@@ -73,17 +73,17 @@ config RD_LZMA
73 If unsure, say N. 73 If unsure, say N.
74 74
75config RD_XZ 75config RD_XZ
76 bool "Support initial ramdisks compressed using XZ" if EXPERT 76 bool "Support initial ramdisks compressed using XZ"
77 default !EXPERT
78 depends on BLK_DEV_INITRD 77 depends on BLK_DEV_INITRD
78 default y
79 select DECOMPRESS_XZ 79 select DECOMPRESS_XZ
80 help 80 help
81 Support loading of an XZ encoded initial ramdisk or cpio buffer. 81 Support loading of an XZ encoded initial ramdisk or cpio buffer.
82 If unsure, say N. 82 If unsure, say N.
83 83
84config RD_LZO 84config RD_LZO
85 bool "Support initial ramdisks compressed using LZO" if EXPERT 85 bool "Support initial ramdisks compressed using LZO"
86 default !EXPERT 86 default y
87 depends on BLK_DEV_INITRD 87 depends on BLK_DEV_INITRD
88 select DECOMPRESS_LZO 88 select DECOMPRESS_LZO
89 help 89 help
@@ -91,8 +91,8 @@ config RD_LZO
91 If unsure, say N. 91 If unsure, say N.
92 92
93config RD_LZ4 93config RD_LZ4
94 bool "Support initial ramdisks compressed using LZ4" if EXPERT 94 bool "Support initial ramdisks compressed using LZ4"
95 default !EXPERT 95 default y
96 depends on BLK_DEV_INITRD 96 depends on BLK_DEV_INITRD
97 select DECOMPRESS_LZ4 97 select DECOMPRESS_LZ4
98 help 98 help