author    Linus Torvalds <torvalds@linux-foundation.org>    2015-11-06 02:10:54 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-11-06 02:10:54 -0500
commit    2e3078af2c67730c479f1d183af5b367f5d95337 (patch)
tree      b7881c6c9c479aadac345df7e18e3c0e10f0811e
parent    ea5c58e70c3a148ada0d3061a8f529589bb766ba (diff)
parent    b3b0d09c7a2330759ac293f5269bd932439ea0ff (diff)
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton:

 - inotify tweaks

 - some ocfs2 updates (many more are awaiting review)

 - various misc bits

 - kernel/watchdog.c updates

 - Some of mm.  I have a huge number of MM patches this time and quite a
   lot of it is quite difficult and much will be held over to next time.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits)
  selftests: vm: add tests for lock on fault
  mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage
  mm: introduce VM_LOCKONFAULT
  mm: mlock: add new mlock system call
  mm: mlock: refactor mlock, munlock, and munlockall code
  kasan: always taint kernel on report
  mm, slub, kasan: enable user tracking by default with KASAN=y
  kasan: use IS_ALIGNED in memory_is_poisoned_8()
  kasan: Fix a type conversion error
  lib: test_kasan: add some testcases
  kasan: update reference to kasan prototype repo
  kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile
  kasan: various fixes in documentation
  kasan: update log messages
  kasan: accurately determine the type of the bad access
  kasan: update reported bug types for kernel memory accesses
  kasan: update reported bug types for not user nor kernel memory accesses
  mm/kasan: prevent deadlock in kasan reporting
  mm/kasan: don't use kasan shadow pointer in generic functions
  mm/kasan: MODULE_VADDR is not available on all archs
  ...
-rw-r--r--  Documentation/filesystems/proc.txt | 22
-rw-r--r--  Documentation/kasan.txt | 46
-rw-r--r--  Documentation/kernel-parameters.txt | 5
-rw-r--r--  Documentation/lockup-watchdogs.txt | 5
-rw-r--r--  Documentation/sysctl/kernel.txt | 12
-rw-r--r--  Documentation/vm/page_migration | 27
-rw-r--r--  Documentation/vm/transhuge.txt | 10
-rw-r--r--  Documentation/vm/unevictable-lru.txt | 120
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/arm/mm/alignment.c | 2
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 6
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/powerpc/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/powerpc/mm/numa.c | 2
-rw-r--r--  arch/powerpc/sysdev/fsl_pci.c | 2
-rw-r--r--  arch/sparc/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/tile/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/x86/boot/Makefile | 4
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 2
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 6
-rw-r--r--  fs/9p/vfs_file.c | 3
-rw-r--r--  fs/fs-writeback.c | 7
-rw-r--r--  fs/logfs/dev_bdev.c | 4
-rw-r--r--  fs/notify/fdinfo.c | 9
-rw-r--r--  fs/notify/inotify/inotify_user.c | 14
-rw-r--r--  fs/ocfs2/aops.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 19
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 3
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/journal.c | 105
-rw-r--r--  fs/ocfs2/namei.c | 13
-rw-r--r--  fs/ocfs2/namei.h | 3
-rw-r--r--  fs/ocfs2/refcounttree.c | 5
-rw-r--r--  fs/ocfs2/suballoc.c | 5
-rw-r--r--  fs/proc/base.c | 10
-rw-r--r--  fs/proc/task_mmu.c | 60
-rw-r--r--  fs/sync.c | 7
-rw-r--r--  include/linux/compaction.h | 3
-rw-r--r--  include/linux/compiler-gcc.h | 17
-rw-r--r--  include/linux/compiler.h | 8
-rw-r--r--  include/linux/cpuset.h | 4
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/hugetlb.h | 19
-rw-r--r--  include/linux/memblock.h | 4
-rw-r--r--  include/linux/memcontrol.h | 147
-rw-r--r--  include/linux/mm.h | 11
-rw-r--r--  include/linux/mm_types.h | 3
-rw-r--r--  include/linux/mmzone.h | 3
-rw-r--r--  include/linux/nmi.h | 1
-rw-r--r--  include/linux/page-flags.h | 2
-rw-r--r--  include/linux/page_counter.h | 6
-rw-r--r--  include/linux/sched.h | 17
-rw-r--r--  include/linux/slab.h | 2
-rw-r--r--  include/linux/syscalls.h | 2
-rw-r--r--  include/linux/tracehook.h | 3
-rw-r--r--  include/linux/types.h | 16
-rw-r--r--  include/linux/uaccess.h | 40
-rw-r--r--  include/linux/vm_event_item.h | 4
-rw-r--r--  include/linux/vmstat.h | 25
-rw-r--r--  include/trace/events/compaction.h | 72
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 5
-rw-r--r--  include/uapi/asm-generic/mman.h | 1
-rw-r--r--  include/uapi/asm-generic/unistd.h | 4
-rw-r--r--  kernel/cpuset.c | 14
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 20
-rw-r--r--  kernel/watchdog.c | 121
-rw-r--r--  lib/Kconfig.kasan | 3
-rw-r--r--  lib/test_kasan.c | 69
-rw-r--r--  mm/balloon_compaction.c | 10
-rw-r--r--  mm/cma.c | 6
-rw-r--r--  mm/compaction.c | 46
-rw-r--r--  mm/debug.c | 1
-rw-r--r--  mm/early_ioremap.c | 6
-rw-r--r--  mm/filemap.c | 77
-rw-r--r--  mm/frame_vector.c | 2
-rw-r--r--  mm/gup.c | 10
-rw-r--r--  mm/huge_memory.c | 2
-rw-r--r--  mm/hugetlb.c | 139
-rw-r--r--  mm/hugetlb_cgroup.c | 3
-rw-r--r--  mm/internal.h | 9
-rw-r--r--  mm/kasan/kasan.c | 38
-rw-r--r--  mm/kasan/kasan.h | 5
-rw-r--r--  mm/kasan/report.c | 113
-rw-r--r--  mm/kmemleak.c | 2
-rw-r--r--  mm/ksm.c | 49
-rw-r--r--  mm/list_lru.c | 44
-rw-r--r--  mm/maccess.c | 7
-rw-r--r--  mm/memblock.c | 2
-rw-r--r--  mm/memcontrol.c | 295
-rw-r--r--  mm/memory-failure.c | 34
-rw-r--r--  mm/memory_hotplug.c | 4
-rw-r--r--  mm/migrate.c | 247
-rw-r--r--  mm/mincore.c | 2
-rw-r--r--  mm/mlock.c | 100
-rw-r--r--  mm/mmap.c | 61
-rw-r--r--  mm/mremap.c | 12
-rw-r--r--  mm/msync.c | 2
-rw-r--r--  mm/nommu.c | 18
-rw-r--r--  mm/oom_kill.c | 59
-rw-r--r--  mm/page_alloc.c | 41
-rw-r--r--  mm/page_counter.c | 14
-rw-r--r--  mm/percpu.c | 10
-rw-r--r--  mm/readahead.c | 14
-rw-r--r--  mm/rmap.c | 107
-rw-r--r--  mm/shmem.c | 24
-rw-r--r--  mm/slab.c | 17
-rw-r--r--  mm/slab.h | 30
-rw-r--r--  mm/slab_common.c | 142
-rw-r--r--  mm/slub.c | 25
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmacache.c | 2
-rw-r--r--  mm/vmalloc.c | 12
-rw-r--r--  mm/vmscan.c | 27
-rw-r--r--  mm/vmstat.c | 22
-rw-r--r--  tools/testing/selftests/vm/Makefile | 2
-rw-r--r--  tools/testing/selftests/vm/mlock2-tests.c | 736
-rw-r--r--  tools/testing/selftests/vm/on-fault-limit.c | 47
-rwxr-xr-x  tools/testing/selftests/vm/run_vmtests | 22
-rw-r--r--  tools/vm/slabinfo-gnuplot.sh | 275
-rw-r--r--  tools/vm/slabinfo.c | 255
127 files changed, 3093 insertions, 1351 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 3a9d65c912e7..1e4a6cc1b6ea 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -175,6 +175,7 @@ read the file /proc/PID/status:
  VmLib: 1412 kB
  VmPTE: 20 kb
  VmSwap: 0 kB
+ HugetlbPages: 0 kB
  Threads: 1
  SigQ: 0/28578
  SigPnd: 0000000000000000
@@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1)
  VmPTE size of page table entries
  VmPMD size of second level page tables
  VmSwap size of swap usage (the number of referred swapents)
+ HugetlbPages size of hugetlb memory portions
  Threads number of threads
  SigQ number of signals queued/max. number for queue
  SigPnd bitmap of pending signals for the thread
@@ -424,12 +426,15 @@ Private_Clean: 0 kB
 Private_Dirty: 0 kB
 Referenced: 892 kB
 Anonymous: 0 kB
+AnonHugePages: 0 kB
+Shared_Hugetlb: 0 kB
+Private_Hugetlb: 0 kB
 Swap: 0 kB
 SwapPss: 0 kB
 KernelPageSize: 4 kB
 MMUPageSize: 4 kB
-Locked: 374 kB
-VmFlags: rd ex mr mw me de
+Locked: 0 kB
+VmFlags: rd ex mr mw me dw
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -449,9 +454,14 @@ accessed.
 "Anonymous" shows the amount of memory that does not belong to any file. Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
-"Swap" shows how much would-be-anonymous memory is also used, but out on
-swap.
+"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
+hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
+reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
+"Swap" shows how much would-be-anonymous memory is also used, but out on swap.
 "SwapPss" shows proportional swap share of this mapping.
+"Locked" indicates whether the mapping is locked in memory or not.
+
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
 manner. The codes are the following:
@@ -475,7 +485,6 @@ manner. The codes are the following:
     ac  - area is accountable
     nr  - swap space is not reserved for the area
     ht  - area uses huge tlb pages
-    nl  - non-linear mapping
     ar  - architecture specific flag
     dd  - do not include area into core dump
     sd  - soft-dirty flag
@@ -815,9 +824,6 @@ varies by architecture and compile options. The following is from a
 
 > cat /proc/meminfo
 
-The "Locked" indicates whether the mapping is locked in memory or not.
-
-
 MemTotal: 16344972 kB
 MemFree: 13634064 kB
 MemAvailable: 14836172 kB
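The hunks above document the new HugetlbPages status field and the Shared_Hugetlb/Private_Hugetlb smaps fields. As a quick illustration (not part of the patch), a minimal userspace C sketch can total the new hugetlb fields by scanning /proc/self/smaps; the field names come from the documentation above, everything else is illustrative:

/* Sketch only: sum the Shared_Hugetlb/Private_Hugetlb fields documented
 * above by scanning /proc/self/smaps line by line. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/smaps", "r");
	char line[256];
	unsigned long kb, shared_hugetlb = 0, private_hugetlb = 0;

	if (!f) {
		perror("/proc/self/smaps");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Shared_Hugetlb: %lu", &kb) == 1)
			shared_hugetlb += kb;
		else if (sscanf(line, "Private_Hugetlb: %lu", &kb) == 1)
			private_hugetlb += kb;
	}
	fclose(f);
	printf("Shared_Hugetlb: %lu kB, Private_Hugetlb: %lu kB\n",
	       shared_hugetlb, private_hugetlb);
	return 0;
}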
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index 0d32355a4c34..aa1e0c91e368 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -1,36 +1,34 @@
-Kernel address sanitizer
-================
+KernelAddressSanitizer (KASAN)
+==============================
 
 0. Overview
 ===========
 
-Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
+KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
 a fast and comprehensive solution for finding use-after-free and out-of-bounds
 bugs.
 
-KASan uses compile-time instrumentation for checking every memory access,
-therefore you will need a gcc version of 4.9.2 or later. KASan could detect out
-of bounds accesses to stack or global variables, but only if gcc 5.0 or later was
-used to built the kernel.
+KASAN uses compile-time instrumentation for checking every memory access,
+therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
+required for detection of out-of-bounds accesses to stack or global variables.
 
-Currently KASan is supported only for x86_64 architecture and requires that the
-kernel be built with the SLUB allocator.
+Currently KASAN is supported only for x86_64 architecture and requires the
+kernel to be built with the SLUB allocator.
 
 1. Usage
-=========
+========
 
 To enable KASAN configure kernel with:
 
 	  CONFIG_KASAN = y
 
-and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline
-is compiler instrumentation types. The former produces smaller binary the
-latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version
-of 5.0 or later.
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
+inline are compiler instrumentation types. The former produces smaller binary
+the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
+version 5.0 or later.
 
 Currently KASAN works only with the SLUB memory allocator.
-For better bug detection and nicer report, enable CONFIG_STACKTRACE and put
-at least 'slub_debug=U' in the boot cmdline.
+For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
 
 To disable instrumentation for specific files or directories, add a line
 similar to the following to the respective kernel Makefile:
@@ -42,7 +40,7 @@ similar to the following to the respective kernel Makefile:
   KASAN_SANITIZE := n
 
 1.1 Error reports
-==========
+=================
 
 A typical out of bounds access report looks like this:
 
@@ -119,14 +117,16 @@ Memory state around the buggy address:
  ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ==================================================================
 
-First sections describe slub object where bad access happened.
-See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
+The header of the report discribe what kind of bug happened and what kind of
+access caused it. It's followed by the description of the accessed slub object
+(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+the description of the accessed memory page.
 
 In the last section the report shows memory state around the accessed address.
-Reading this part requires some more understanding of how KASAN works.
+Reading this part requires some understanding of how KASAN works.
 
-Each 8 bytes of memory are encoded in one shadow byte as accessible,
-partially accessible, freed or they can be part of a redzone.
+The state of each 8 aligned bytes of memory is encoded in one shadow byte.
+Those 8 bytes can be accessible, partially accessible, freed or be a redzone.
 We use the following encoding for each shadow byte: 0 means that all 8 bytes
 of the corresponding memory region are accessible; number N (1 <= N <= 7) means
 that the first N bytes are accessible, and other (8 - N) bytes are not;
@@ -139,7 +139,7 @@ the accessed address is partially accessible.
 
 
 2. Implementation details
-========================
+=========================
 
 From a high level, our approach to memory error detection is similar to that
 of kmemcheck: use shadow memory to record whether each byte of memory is safe
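The shadow encoding described in the kasan.txt hunk above (one shadow byte per 8 aligned bytes; 0 = fully accessible, 1..7 = first N bytes accessible, negative = poisoned) can be sketched in a few lines of C. This is illustrative only; the shadow-offset constant is an assumption about the x86_64 layout, not something defined by this patch:

/* Illustrative sketch of the shadow encoding described above: one shadow
 * byte covers 8 bytes of memory. The shadow_offset value is an assumption
 * for the example, not a constant introduced by this patch. */
#include <stdio.h>
#include <stdint.h>

#define KASAN_SHADOW_SCALE_SHIFT 3

static uintptr_t shadow_of(uintptr_t addr, uintptr_t shadow_offset)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + shadow_offset;
}

static const char *decode_shadow(signed char s)
{
	if (s == 0)
		return "all 8 bytes accessible";
	if (s >= 1 && s <= 7)
		return "only the first N bytes accessible";
	return "poisoned (redzone, freed, ...)";
}

int main(void)
{
	uintptr_t shadow_offset = 0xdffffc0000000000UL; /* assumed x86_64 value */
	uintptr_t addr = 0xffff8800693bc5d3UL;

	printf("shadow byte for %#lx lives at %#lx\n",
	       (unsigned long)addr,
	       (unsigned long)shadow_of(addr, shadow_offset));
	printf("shadow value 0x03 means: %s\n", decode_shadow(3));
	return 0;
}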
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 816bf2fe55f5..84c0214b64a7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1275,6 +1275,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
 			Default: 1024
 
+	hardlockup_all_cpu_backtrace=
+			[KNL] Should the hard-lockup detector generate
+			backtraces on all cpus.
+			Format: <integer>
+
 	hashdist=	[KNL,NUMA] Large hashes allocated during boot
 			are distributed across NUMA nodes. Defaults on
 			for 64-bit NUMA, off otherwise.
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index 22dd6af2e4bd..4a6e33e1af61 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -20,8 +20,9 @@ kernel mode for more than 10 seconds (see "Implementation" below for
 details), without letting other interrupts have a chance to run.
 Similarly to the softlockup case, the current stack trace is displayed
 upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a compile time knob,
-"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog"
+behavior is changed, which can be done through a sysctl,
+'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
+and a kernel parameter, "nmi_watchdog"
 (see "Documentation/kernel-parameters.txt" for details).
 
 The panic option can be used in combination with panic_timeout (this
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6fccb69c03e7..af70d1541d3a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
 - domainname
 - hostname
 - hotplug
+- hardlockup_all_cpu_backtrace
 - hung_task_panic
 - hung_task_check_count
 - hung_task_timeout_secs
@@ -293,6 +294,17 @@ domain names are in general different. For a detailed discussion
 see the hostname(1) man page.
 
 ==============================================================
+hardlockup_all_cpu_backtrace:
+
+This value controls the hard lockup detector behavior when a hard
+lockup condition is detected as to whether or not to gather further
+debug information. If enabled, arch-specific all-CPU stack dumping
+will be initiated.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+==============================================================
 
 hotplug:
 
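For illustration only (not part of the patch), the new sysctl documented above can be toggled from userspace by writing to its procfs file; the path and the 0/1 values are taken from the text above, the rest is boilerplate:

/* Sketch only: enable hardlockup_all_cpu_backtrace (requires root). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/hardlockup_all_cpu_backtrace", "w");

	if (!f) {
		perror("open sysctl");
		return 1;
	}
	fputs("1\n", f);	/* 0 = default (do nothing), 1 = all-CPU backtrace */
	fclose(f);
	return 0;
}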
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 6513fe2d90b8..fea5c0864170 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -92,29 +92,26 @@ Steps:
 
 2. Insure that writeback is complete.
 
-3. Prep the new page that we want to move to. It is locked
-   and set to not being uptodate so that all accesses to the new
-   page immediately lock while the move is in progress.
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet uptodate) page immediately lock while the move is in progress.
 
-4. The new page is prepped with some settings from the old page so that
-   accesses to the new page will discover a page with the correct settings.
-
-5. All the page table references to the page are converted
-   to migration entries or dropped (nonlinear vmas).
-   This decrease the mapcount of a page. If the resulting
-   mapcount is not zero then we do not migrate the page.
-   All user space processes that attempt to access the page
-   will now wait on the page lock.
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock.
 
-6. The radix tree lock is taken. This will cause all processes trying
+5. The radix tree lock is taken. This will cause all processes trying
    to access the page via the mapping to block on the radix tree spinlock.
 
-7. The refcount of the page is examined and we back out if references remain
+6. The refcount of the page is examined and we back out if references remain
   otherwise we know that we are the only one referencing this page.
 
-8. The radix tree is checked and if it does not contain the pointer to this
+7. The radix tree is checked and if it does not contain the pointer to this
   page then we back out because someone else modified the radix tree.
 
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
 9. The radix tree is changed to point to the new page.
 
 10. The reference count of the old page is dropped because the radix tree
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8143b9e8373d..8a282687ee06 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -170,6 +170,16 @@ A lower value leads to gain less thp performance. Value of
 max_ptes_none can waste cpu time very little, you can
 ignore it.
 
+max_ptes_swap specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page.
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting fewer pages being collapsed into
+THPs, and lower memory access performance.
+
 == Boot parameter ==
 
 You can change the sysfs boot time defaults of Transparent Hugepage
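As a small illustration (not part of the patch), the new khugepaged tunable documented above can be read from the sysfs path given in the text; everything else in the sketch is assumed boilerplate:

/* Sketch only: read the new max_ptes_swap tunable documented above. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap";
	FILE *f = fopen(path, "r");
	unsigned long val;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lu", &val) == 1)
		printf("max_ptes_swap = %lu\n", val);
	fclose(f);
	return 0;
}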
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 32ee3a67dba2..fa3b527086fa 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -531,83 +531,20 @@ map.
 
 try_to_unmap() is always called, by either vmscan for reclaim or for page
 migration, with the argument page locked and isolated from the LRU. Separate
-functions handle anonymous and mapped file pages, as these types of pages have
-different reverse map mechanisms.
-
- (*) try_to_unmap_anon()
+functions handle anonymous and mapped file and KSM pages, as these types of
+pages have different reverse map lookup mechanisms, with different locking.
+In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(),
+it will call try_to_unmap_one() for every VMA which might contain the page.
 
-     To unmap anonymous pages, each VMA in the list anchored in the anon_vma
-     must be visited - at least until a VM_LOCKED VMA is encountered. If the
-     page is being unmapped for migration, VM_LOCKED VMAs do not stop the
-     process because mlocked pages are migratable. However, for reclaim, if
-     the page is mapped into a VM_LOCKED VMA, the scan stops.
-
-     try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
-     the mm_struct to which the VMA belongs. If this is successful, it will
-     mlock the page via mlock_vma_page() - we wouldn't have gotten to
-     try_to_unmap_anon() if the page were already mlocked - and will return
-     SWAP_MLOCK, indicating that the page is unevictable.
-
-     If the mmap semaphore cannot be acquired, we are not sure whether the page
-     is really unevictable or not. In this case, try_to_unmap_anon() will
-     return SWAP_AGAIN.
-
- (*) try_to_unmap_file() - linear mappings
-
-     Unmapping of a mapped file page works the same as for anonymous mappings,
-     except that the scan visits all VMAs that map the page's index/page offset
-     in the page's mapping's reverse map priority search tree. It also visits
-     each VMA in the page's mapping's non-linear list, if the list is
-     non-empty.
-
-     As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
-     page, try_to_unmap_file() will attempt to acquire the associated
-     mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
-     is successful, and SWAP_AGAIN, if not.
-
- (*) try_to_unmap_file() - non-linear mappings
-
-     If a page's mapping contains a non-empty non-linear mapping VMA list, then
-     try_to_un{map|lock}() must also visit each VMA in that list to determine
-     whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
-     all VMAs in the non-linear list to ensure that the pages is not/should not
-     be mlocked.
-
-     If a VM_LOCKED VMA is found in the list, the scan could terminate.
-     However, there is no easy way to determine whether the page is actually
-     mapped in a given VMA - either for unmapping or testing whether the
-     VM_LOCKED VMA actually pins the page.
-
-     try_to_unmap_file() handles non-linear mappings by scanning a certain
-     number of pages - a "cluster" - in each non-linear VMA associated with the
-     page's mapping, for each file mapped page that vmscan tries to unmap. If
-     this happens to unmap the page we're trying to unmap, try_to_unmap() will
-     notice this on return (page_mapcount(page) will be 0) and return
-     SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
-     recirculate this page. We take advantage of the cluster scan in
-     try_to_unmap_cluster() as follows:
-
-     For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
-     mmap semaphore of the associated mm_struct for read without blocking.
-
-     If this attempt is successful and the VMA is VM_LOCKED,
-     try_to_unmap_cluster() will retain the mmap semaphore for the scan;
-     otherwise it drops it here.
-
-     Then, for each page in the cluster, if we're holding the mmap semaphore
-     for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
-     mlock the page. This call is a no-op if the page is already locked,
-     but will mlock any pages in the non-linear mapping that happen to be
-     unlocked.
-
-     If one of the pages so mlocked is the page passed in to try_to_unmap(),
-     try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
-     SWAP_AGAIN. This will allow vmscan to cull the page, rather than
-     recirculating it on the inactive list.
-
-     Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
-     returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
-     VMA, but couldn't be mlocked.
+When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
+VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
+and return SWAP_MLOCK to indicate that the page is unevictable: and the scan
+stops there.
+
+mlock_vma_page() is called while holding the page table's lock (in addition
+to the page lock, and the rmap lock): to serialize against concurrent mlock or
+munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
+holepunching, and truncation of file pages and their anonymous COWed pages.
 
 
 try_to_munlock() REVERSE MAP SCAN
@@ -623,29 +560,15 @@ all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
 introduced a variant of try_to_unmap() called try_to_munlock().
 
 try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
-mapped file pages with an additional argument specifying unlock versus unmap
+mapped file and KSM pages with a flag argument specifying unlock versus unmap
 processing. Again, these functions walk the respective reverse maps looking
-for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
-pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
-attempt to acquire the associated mmap semaphore, mlock the page via
-mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
-pre-clearing of the page's PG_mlocked done by munlock_vma_page.
-
-If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
-semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
-recycle the page on the inactive list and hope that it has better luck with the
-page next time.
-
-For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
-slightly differently. On encountering a VM_LOCKED non-linear VMA that might
-map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
-page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
-with it - the usual fallback position.
+for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
+the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
+undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
 
 Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
 reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
-However, the scan can terminate when it encounters a VM_LOCKED VMA and can
-successfully acquire the VMA's mmap semaphore for read and mlock the page.
+However, the scan can terminate when it encounters a VM_LOCKED VMA.
 Although try_to_munlock() might be called a great many times when munlocking a
 large region or tearing down a large address space that has been mlocked via
 mlockall(), overall this is a fairly rare event.
@@ -673,11 +596,6 @@ Some examples of these unevictable pages on the LRU lists are:
  (3) mlocked pages that could not be isolated from the LRU and moved to the
      unevictable list in mlock_vma_page().
 
- (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
-     acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
-     munlock_vma_page() was forced to let the page back on to the normal LRU
-     list for vmscan to handle.
-
 shrink_inactive_list() also diverts any unevictable pages that it finds on the
 inactive lists to the appropriate zone's unevictable list.
 
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 0086b472bc2b..f2f949671798 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -37,6 +37,9 @@
 
 #define MCL_CURRENT	 8192	/* lock all currently mapped pages */
 #define MCL_FUTURE	16384	/* lock all additions to address space */
+#define MCL_ONFAULT	32768	/* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT	0x01	/* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL	0	/* no further special treatment */
 #define MADV_RANDOM	1	/* expect random page references */
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 00b7f7de28a1..7d5f4c736a16 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		}
 		}
 	} else {
-		fault = probe_kernel_address(instrptr, instr);
+		fault = probe_kernel_address((void *)instrptr, instr);
 		instr = __mem_to_opcode_arm(instr);
 	}
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index cfcb876cae6b..97c03f468924 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -61,6 +61,12 @@
  */
 #define MCL_CURRENT	1	/* lock all current mappings */
 #define MCL_FUTURE	2	/* lock all future mappings */
+#define MCL_ONFAULT	4	/* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT	0x01	/* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL	0	/* no further special treatment */
 #define MADV_RANDOM	1	/* expect random page references */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 294d251ca7b2..ecc3ae1ca28e 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -31,6 +31,9 @@
 
 #define MCL_CURRENT	1	/* lock all current mappings */
 #define MCL_FUTURE	2	/* lock all future mappings */
+#define MCL_ONFAULT	4	/* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT	0x01	/* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL	0	/* no further special treatment */
 #define MADV_RANDOM	1	/* expect random page references */
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 6ea26df0a73c..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -22,6 +22,7 @@
 
 #define MCL_CURRENT	0x2000	/* lock all currently mapped pages */
 #define MCL_FUTURE	0x4000	/* lock all additions to address space */
+#define MCL_ONFAULT	0x8000	/* lock all pages that are faulted in */
 
 #define MAP_POPULATE	0x8000	/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000	/* do not block on IO */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8b9502adaf79..8d8a541211d0 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void)
 	setup_nr_node_ids();
 
 	/* allocate the map */
-	for (node = 0; node < nr_node_ids; node++)
+	for_each_node(node)
 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
 	/* cpumask_of_node() will now work */
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index ebc1f412cf49..13b9bcf5485e 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -999,7 +999,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
 		ret = get_user(regs->nip, &inst);
 		pagefault_enable();
 	} else {
-		ret = probe_kernel_address(regs->nip, inst);
+		ret = probe_kernel_address((void *)regs->nip, inst);
 	}
 
 	if (mcheck_handle_load(regs, inst)) {
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h
index 0b14df33cffa..9765896ecb2c 100644
--- a/arch/sparc/include/uapi/asm/mman.h
+++ b/arch/sparc/include/uapi/asm/mman.h
@@ -17,6 +17,7 @@
 
 #define MCL_CURRENT	0x2000	/* lock all currently mapped pages */
 #define MCL_FUTURE	0x4000	/* lock all additions to address space */
+#define MCL_ONFAULT	0x8000	/* lock all pages that are faulted in */
 
 #define MAP_POPULATE	0x8000	/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000	/* do not block on IO */
diff --git a/arch/tile/include/uapi/asm/mman.h b/arch/tile/include/uapi/asm/mman.h
index 81b8fc348d63..63ee13faf17d 100644
--- a/arch/tile/include/uapi/asm/mman.h
+++ b/arch/tile/include/uapi/asm/mman.h
@@ -36,6 +36,7 @@
  */
 #define MCL_CURRENT	1	/* lock all current mappings */
 #define MCL_FUTURE	2	/* lock all future mappings */
+#define MCL_ONFAULT	4	/* lock all pages that are faulted in */
 
 
 #endif /* _ASM_TILE_MMAN_H */
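The MCL_ONFAULT flag added in the mman.h hunks above extends mlockall(). A hedged userspace sketch follows; the fallback value 4 mirrors the generic definition shown for mips/parisc/tile/xtensa, and libc headers of this era may not define the flag at all:

/* Userspace sketch (not part of the patch): lock current and future
 * mappings, but only as they are faulted in, via the new MCL_ONFAULT. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MCL_ONFAULT
#define MCL_ONFAULT 4	/* asm-generic value; some arches use other values */
#endif

int main(void)
{
	if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) != 0) {
		perror("mlockall");
		return 1;
	}
	return 0;
}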
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0d553e54171b..2ee62dba0373 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,13 +9,13 @@
 # Changed by many, many contributors over the years.
 #
 
+KASAN_SANITIZE := n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
-KASAN_SANITIZE := n
-
 SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
 
 targets		:= vmlinux.bin setup.bin setup.elf bzImage
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index caa2c712d1e7..f17705e1332c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -382,3 +382,4 @@
 373	i386	shutdown		sys_shutdown
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
+376	i386	mlock2			sys_mlock2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842fdf1f6..314a90bfc09c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
 322	64	execveat		stub_execveat
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
+325	common	mlock2			sys_mlock2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
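The new mlock2() system call wired up above takes an MLOCK_ONFAULT flag (defined in the mman.h hunks earlier in this series). A hedged userspace sketch, not part of the patch: it invokes the raw syscall number 325 from the x86_64 table above because glibc had no wrapper at the time:

/* Userspace sketch (not part of the patch): exercise the new mlock2()
 * syscall with MLOCK_ONFAULT. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_mlock2
#define __NR_mlock2 325		/* x86_64, from syscall_64.tbl above */
#endif
#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01	/* lock pages as they are faulted in */
#endif

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	void *buf = malloc(len);

	if (!buf)
		return 1;
	/* Pages become resident and locked only once they are touched. */
	if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT) != 0) {
		perror("mlock2");
		return 1;
	}
	memset(buf, 0, len);	/* fault the pages in; they stay locked */
	return 0;
}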
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ce5da27b136..d470cf219a2d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -126,5 +126,5 @@ void __init kasan_init(void)
 	__flush_tlb_all();
 	init_task.kasan_depth = 0;
 
-	pr_info("Kernel address sanitizer initialized\n");
+	pr_info("KernelAddressSanitizer initialized\n");
 }
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 201aec0e0446..360944e1da52 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -74,6 +74,12 @@
  */
 #define MCL_CURRENT	1	/* lock all current mappings */
 #define MCL_FUTURE	2	/* lock all future mappings */
+#define MCL_ONFAULT	4	/* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT	0x01	/* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL	0	/* no further special treatment */
 #define MADV_RANDOM	1	/* expect random page references */
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index f23fd86697ea..7bf835f85bc8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -231,7 +231,8 @@ out_unlock:
 	if (res < 0 && fl->fl_type != F_UNLCK) {
 		fl_type = fl->fl_type;
 		fl->fl_type = F_UNLCK;
-		res = locks_lock_file_wait(filp, fl);
+		/* Even if this fails we want to return the remote error */
+		locks_lock_file_wait(filp, fl);
 		fl->fl_type = fl_type;
 	}
 out:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7378169e90be..206a68b1db1a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb)
 		iput(old_inode);
 		old_inode = inode;
 
-		filemap_fdatawait(mapping);
+		/*
+		 * We keep the error status of individual mapping so that
+		 * applications can catch the writeback error using fsync(2).
+		 * See filemap_fdatawait_keep_errors() for details.
+		 */
+		filemap_fdatawait_keep_errors(mapping);
 
 		cond_resched();
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a7fdbd868474..a709d80c8ebc 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, BIO_MAX_PAGES);
+	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, BIO_MAX_PAGES);
+	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 6b6f0d472ae8..fd98e5100cab 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
 	inode = igrab(mark->inode);
 	if (inode) {
+		/*
+		 * IN_ALL_EVENTS represents all of the mask bits
+		 * that we expose to userspace. There is at
+		 * least one bit (FS_EVENT_ON_CHILD) which is
+		 * used only internally to the kernel.
+		 */
+		u32 mask = mark->mask & IN_ALL_EVENTS;
 		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
 			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
-			   mark->mask, mark->ignored_mask);
+			   mask, mark->ignored_mask);
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
 		iput(inode);
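With the fdinfo change above, the mask reported for an inotify watch is filtered to user-visible IN_* bits. A small userspace sketch (illustrative only; the fdinfo line format is assumed from the code above) that prints the fdinfo of an inotify descriptor:

/* Sketch only: dump /proc/self/fdinfo/<fd> for an inotify descriptor. */
#include <stdio.h>
#include <sys/inotify.h>

int main(void)
{
	char path[64], line[256];
	int fd = inotify_init1(0);
	FILE *f;

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_ALL_EVENTS) < 0)
		return 1;
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* includes an "inotify wd:... mask:..." line */
	fclose(f);
	return 0;
}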
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5b1e2a497e51..b8d08d0d0a4d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	int ret;
 	unsigned flags = 0;
 
-	/* don't allow invalid bits: we don't want flags set */
+	/*
+	 * We share a lot of code with fs/dnotify. We also share
+	 * the bit layout between inotify's IN_* and the fsnotify
+	 * FS_*. This check ensures that only the inotify IN_*
+	 * bits get passed in and set in watches/events.
+	 */
+	if (unlikely(mask & ~ALL_INOTIFY_BITS))
+		return -EINVAL;
+	/*
+	 * Require at least one valid bit set in the mask.
+	 * Without _something_ set, we would have no events to
+	 * watch for.
+	 */
 	if (unlikely(!(mask & ALL_INOTIFY_BITS)))
 		return -EINVAL;
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 64b11d90eca6..7f604727f487 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 			ret = -EIO;
 			goto bail;
 		}
+		set_buffer_new(bh_result);
 		up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
@@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
 	is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
 	if (is_overwrite < 0) {
 		mlog_errno(is_overwrite);
+		ret = is_overwrite;
 		ocfs2_inode_unlock(inode, 1);
 		goto clean_orphan;
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index fa15debcc02b..ddddef0021a0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -219,7 +219,8 @@ struct o2hb_region {
 	unsigned		hr_unclean_stop:1,
 				hr_aborted_start:1,
 				hr_item_pinned:1,
-				hr_item_dropped:1;
+				hr_item_dropped:1,
+				hr_node_deleted:1;
 
 	/* protected by the hr_callback_sem */
 	struct task_struct	*hr_task;
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
 	set_user_nice(current, MIN_NICE);
 
 	/* Pin node */
-	o2nm_depend_this_node();
+	ret = o2nm_depend_this_node();
+	if (ret) {
+		mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+		reg->hr_node_deleted = 1;
+		wake_up(&o2hb_steady_queue);
+		return 0;
+	}
 
 	while (!kthread_should_stop() &&
 	       !reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	spin_unlock(&o2hb_live_lock);
 
 	ret = wait_event_interruptible(o2hb_steady_queue,
-				atomic_read(&reg->hr_steady_iterations) == 0);
+				atomic_read(&reg->hr_steady_iterations) == 0 ||
+				reg->hr_node_deleted);
 	if (ret) {
 		atomic_set(&reg->hr_steady_iterations, 0);
 		reg->hr_aborted_start = 1;
@@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out3;
 	}
 
+	if (reg->hr_node_deleted) {
+		ret = -EINVAL;
+		goto out3;
+	}
+
 	/* Ok, we were woken. Make sure it wasn't by drop_item() */
 	spin_lock(&o2hb_live_lock);
 	hb_task = reg->hr_task;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6918f30d02cd..2ee7fe747cea 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 	int status;
 	unsigned int backoff;
 	unsigned int total_backoff = 0;
+	char wq_name[O2NM_MAX_NAME_LEN];
 
 	BUG_ON(!dlm);
 
@@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
-	dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+	snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+	dlm->dlm_worker = create_singlethread_workqueue(wq_name);
 	if (!dlm->dlm_worker) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 58eaa5c0d387..9e4f862d20fe 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
 	mlog(0, "starting dlm recovery thread...\n");
 
 	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
-			"dlm_reco_thread");
+			"dlm_reco-%s", dlm->name);
 	if (IS_ERR(dlm->dlm_reco_thread_task)) {
 		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
 		dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2e5e6d5fffe8..c5f6c241ecd7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
 {
 	mlog(0, "Starting dlm_thread...\n");
 
-	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+			dlm->name);
 	if (IS_ERR(dlm->dlm_thread_task)) {
 		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
 		dlm->dlm_thread_task = NULL;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c91103c1333..20276e340339 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	}
 
 	/* launch downconvert thread */
-	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+			osb->uuid_str);
 	if (IS_ERR(osb->dc_task)) {
 		status = PTR_ERR(osb->dc_task);
 		osb->dc_task = NULL;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..aac8b86f312e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -112,6 +112,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_OPEN_DIRECT		0x00000020
 /* Tell the inode wipe code it's not in orphan dir */
 #define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff82b28462a6..13534f4fe5b5 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 	/* Launch the commit thread */
 	if (!local) {
 		osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
-				"ocfs2cmt");
+				"ocfs2cmt-%s", osb->uuid_str);
 		if (IS_ERR(osb->commit_task)) {
 			status = PTR_ERR(osb->commit_task);
 			osb->commit_task = NULL;
@@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 		goto out;
 
 	osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
-			"ocfs2rec");
+			"ocfs2rec-%s", osb->uuid_str);
 	if (IS_ERR(osb->recovery_thread_task)) {
 		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
 		osb->recovery_thread_task = NULL;
@@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv {
 	struct dir_context	ctx;
 	struct inode		*head;
 	struct ocfs2_super	*osb;
+	enum ocfs2_orphan_reco_type orphan_reco_type;
 };
 
 static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 	if (name_len == 2 && !strncmp("..", name, 2))
 		return 0;
 
+	/* do not include dio entry in case of orphan scan */
+	if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+			(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+			OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+		return 0;
+
 	/* Skip bad inodes so that recovery can continue */
 	iter = ocfs2_iget(p->osb, ino,
 			  OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
 	if (IS_ERR(iter))
 		return 0;
 
+	if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+			OCFS2_DIO_ORPHAN_PREFIX_LEN))
+		OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
 	/* Skip inodes which are already added to recover list, since dio may
 	 * happen concurrently with unlink/rename */
 	if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 
 static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 			       int slot,
-			       struct inode **head)
+			       struct inode **head,
+			       enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	int status;
 	struct inode *orphan_dir_inode = NULL;
 	struct ocfs2_orphan_filldir_priv priv = {
 		.ctx.actor = ocfs2_orphan_filldir,
 		.osb = osb,
-		.head = *head
+		.head = *head,
+		.orphan_reco_type = orphan_reco_type
 	};
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	trace_ocfs2_recover_orphans(slot);
 
 	ocfs2_mark_recovering_orphan_dir(osb, slot);
-	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
 	ocfs2_clear_recovering_orphan_dir(osb, slot);
 
 	/* Error here should be noted, but we want to continue with as
@@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 		oi->ip_next_orphan = NULL;
 
-		mutex_lock(&inode->i_mutex);
-		ret = ocfs2_rw_lock(inode, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto next;
-		}
-		/*
-		 * We need to take and drop the inode lock to
-		 * force read inode from disk.
-		 */
-		ret = ocfs2_inode_lock(inode, &di_bh, 1);
-		if (ret) {
-			mlog_errno(ret);
-			goto unlock_rw;
-		}
+		if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+			mutex_lock(&inode->i_mutex);
+			ret = ocfs2_rw_lock(inode, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto unlock_mutex;
+			}
+			/*
+			 * We need to take and drop the inode lock to
+			 * force read inode from disk.
+			 */
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock_rw;
+			}
 
-		di = (struct ocfs2_dinode *)di_bh->b_data;
+			di = (struct ocfs2_dinode *)di_bh->b_data;
 
-		if (inode->i_nlink == 0) {
+			if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
2222 ret = ocfs2_truncate_file(inode, di_bh,
2223 i_size_read(inode));
2224 if (ret < 0) {
2225 if (ret != -ENOSPC)
2226 mlog_errno(ret);
2227 goto unlock_inode;
2228 }
2229
2230 ret = ocfs2_del_inode_from_orphan(osb, inode,
2231 di_bh, 0, 0);
2232 if (ret)
2233 mlog_errno(ret);
2234 }
2235unlock_inode:
2236 ocfs2_inode_unlock(inode, 1);
2237 brelse(di_bh);
2238 di_bh = NULL;
2239unlock_rw:
2240 ocfs2_rw_unlock(inode, 1);
2241unlock_mutex:
2242 mutex_unlock(&inode->i_mutex);
2243
2244 /* clear dio flag in ocfs2_inode_info */
2245 oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
2246 } else {
2208 spin_lock(&oi->ip_lock); 2247 spin_lock(&oi->ip_lock);
2209 /* Set the proper information to get us going into 2248 /* Set the proper information to get us going into
2210 * ocfs2_delete_inode. */ 2249 * ocfs2_delete_inode. */
@@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2212 spin_unlock(&oi->ip_lock); 2251 spin_unlock(&oi->ip_lock);
2213 } 2252 }
2214 2253
2215 if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
2216 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2217 ret = ocfs2_truncate_file(inode, di_bh,
2218 i_size_read(inode));
2219 if (ret < 0) {
2220 if (ret != -ENOSPC)
2221 mlog_errno(ret);
2222 goto unlock_inode;
2223 }
2224
2225 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2226 if (ret)
2227 mlog_errno(ret);
2228 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2229unlock_inode:
2230 ocfs2_inode_unlock(inode, 1);
2231 brelse(di_bh);
2232 di_bh = NULL;
2233unlock_rw:
2234 ocfs2_rw_unlock(inode, 1);
2235next:
2236 mutex_unlock(&inode->i_mutex);
2237 iput(inode); 2254 iput(inode);
2238 inode = iter; 2255 inode = iter;
2239 } 2256 }
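
The recovery changes above hinge on recognising orphan-dir entries whose names carry the "dio-" prefix. A tiny user-space sketch of that prefix test; the helper name is invented for illustration, only the prefix and its length come from the patch:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define OCFS2_DIO_ORPHAN_PREFIX     "dio-"
#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4

/* Mirror of the strncmp() test used in ocfs2_orphan_filldir() above. */
static bool is_dio_orphan_name(const char *name)
{
	return strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
		       OCFS2_DIO_ORPHAN_PREFIX_LEN) == 0;
}

int main(void)
{
	const char *names[] = { "dio-0000000000000001", "0000000000000002" };

	for (unsigned i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-24s -> %s\n", names[i],
		       is_dio_orphan_name(names[i]) ? "dio orphan entry"
						    : "regular orphan entry");
	return 0;
}
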
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b7dfac226b1e..3b48ac25d8a7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
106static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); 106static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
107/* An orphan dir name is an 8 byte value, printed as a hex string */ 107/* An orphan dir name is an 8 byte value, printed as a hex string */
108#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 108#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
109#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
110#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
111 109
112static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, 110static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
113 unsigned int flags) 111 unsigned int flags)
@@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
657 return status; 655 return status;
658 } 656 }
659 657
660 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, 658 status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
661 parent_fe_bh, handle, inode_ac, 659 parent_fe_bh, handle, inode_ac,
662 fe_blkno, suballoc_loc, suballoc_bit); 660 fe_blkno, suballoc_loc, suballoc_bit);
661 if (status < 0) {
662 u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
663 int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
664 inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
665 if (tmp)
666 mlog_errno(tmp);
667 }
668
669 return status;
663} 670}
664 671
665static int ocfs2_mkdir(struct inode *dir, 672static int ocfs2_mkdir(struct inode *dir,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e173329eb830..1155918d6784 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,6 +26,9 @@
26#ifndef OCFS2_NAMEI_H 26#ifndef OCFS2_NAMEI_H
27#define OCFS2_NAMEI_H 27#define OCFS2_NAMEI_H
28 28
29#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
30#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
31
29extern const struct inode_operations ocfs2_dir_iops; 32extern const struct inode_operations ocfs2_dir_iops;
30 33
31struct dentry *ocfs2_get_parent(struct dentry *child); 34struct dentry *ocfs2_get_parent(struct dentry *child);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e5d57cd32505..252119860e6c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2920 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2920 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2921 struct page *page; 2921 struct page *page;
2922 pgoff_t page_index; 2922 pgoff_t page_index;
2923 unsigned int from, to, readahead_pages; 2923 unsigned int from, to;
2924 loff_t offset, end, map_end; 2924 loff_t offset, end, map_end;
2925 struct address_space *mapping = inode->i_mapping; 2925 struct address_space *mapping = inode->i_mapping;
2926 2926
2927 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2927 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2928 new_cluster, new_len); 2928 new_cluster, new_len);
2929 2929
2930 readahead_pages =
2931 (ocfs2_cow_contig_clusters(sb) <<
2932 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2933 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2930 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2934 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2931 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2935 /* 2932 /*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d83d2602cf2b..fc6d25f6d444 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1920 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1920 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1921 res, &bits_left); 1921 res, &bits_left);
1922 if (!status) { 1922 if (!status) {
1923 hint = ocfs2_group_from_res(res); 1923 if (ocfs2_is_cluster_bitmap(ac->ac_inode))
1924 hint = res->sr_bg_blkno;
1925 else
1926 hint = ocfs2_group_from_res(res);
1924 goto set_hint; 1927 goto set_hint;
1925 } 1928 }
1926 if (status < 0 && status != -ENOSPC) { 1929 if (status < 0 && status != -ENOSPC) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 29595af32866..bd3e9e68125b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1032 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1032 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1033} 1033}
1034 1034
1035/*
1036 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1037 * kernels. The effective policy is defined by oom_score_adj, which has a
1038 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1039 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1040 * Processes that become oom disabled via oom_adj will still be oom disabled
1041 * with this implementation.
1042 *
1043 * oom_adj cannot be removed since existing userspace binaries use it.
1044 */
1035static ssize_t oom_adj_write(struct file *file, const char __user *buf, 1045static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1036 size_t count, loff_t *ppos) 1046 size_t count, loff_t *ppos)
1037{ 1047{
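
As the new comment says, oom_adj writes are mapped linearly onto the oom_score_adj scale. An illustrative user-space approximation of that mapping; the constants are the uapi values of this era (OOM_ADJUST_MAX 15, OOM_DISABLE -17, OOM_SCORE_ADJ_MAX/MIN +/-1000) and the helper only mimics what oom_adj_write() is expected to do, it is not the kernel code:

#include <stdio.h>

#define OOM_DISABLE        (-17)
#define OOM_ADJUST_MAX       15
#define OOM_SCORE_ADJ_MIN (-1000)
#define OOM_SCORE_ADJ_MAX   1000

/* Approximate the legacy oom_adj -> oom_score_adj translation. */
static int oom_adj_to_score_adj(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	if (oom_adj == OOM_DISABLE)
		return OOM_SCORE_ADJ_MIN;	/* process stays oom disabled */
	return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
}

int main(void)
{
	for (int adj = OOM_DISABLE; adj <= OOM_ADJUST_MAX; adj += 8)
		printf("oom_adj %3d -> oom_score_adj %5d\n",
		       adj, oom_adj_to_score_adj(adj));
	return 0;
}
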
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b029d426c558..187b3b5f242e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
70 ptes >> 10, 70 ptes >> 10,
71 pmds >> 10, 71 pmds >> 10,
72 swap << (PAGE_SHIFT-10)); 72 swap << (PAGE_SHIFT-10));
73 hugetlb_report_usage(m, mm);
73} 74}
74 75
75unsigned long task_vsize(struct mm_struct *mm) 76unsigned long task_vsize(struct mm_struct *mm)
@@ -446,6 +447,8 @@ struct mem_size_stats {
446 unsigned long anonymous; 447 unsigned long anonymous;
447 unsigned long anonymous_thp; 448 unsigned long anonymous_thp;
448 unsigned long swap; 449 unsigned long swap;
450 unsigned long shared_hugetlb;
451 unsigned long private_hugetlb;
449 u64 pss; 452 u64 pss;
450 u64 swap_pss; 453 u64 swap_pss;
451}; 454};
@@ -625,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
625 seq_putc(m, '\n'); 628 seq_putc(m, '\n');
626} 629}
627 630
631#ifdef CONFIG_HUGETLB_PAGE
632static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
633 unsigned long addr, unsigned long end,
634 struct mm_walk *walk)
635{
636 struct mem_size_stats *mss = walk->private;
637 struct vm_area_struct *vma = walk->vma;
638 struct page *page = NULL;
639
640 if (pte_present(*pte)) {
641 page = vm_normal_page(vma, addr, *pte);
642 } else if (is_swap_pte(*pte)) {
643 swp_entry_t swpent = pte_to_swp_entry(*pte);
644
645 if (is_migration_entry(swpent))
646 page = migration_entry_to_page(swpent);
647 }
648 if (page) {
649 int mapcount = page_mapcount(page);
650
651 if (mapcount >= 2)
652 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
653 else
654 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
655 }
656 return 0;
657}
658#endif /* HUGETLB_PAGE */
659
628static int show_smap(struct seq_file *m, void *v, int is_pid) 660static int show_smap(struct seq_file *m, void *v, int is_pid)
629{ 661{
630 struct vm_area_struct *vma = v; 662 struct vm_area_struct *vma = v;
631 struct mem_size_stats mss; 663 struct mem_size_stats mss;
632 struct mm_walk smaps_walk = { 664 struct mm_walk smaps_walk = {
633 .pmd_entry = smaps_pte_range, 665 .pmd_entry = smaps_pte_range,
666#ifdef CONFIG_HUGETLB_PAGE
667 .hugetlb_entry = smaps_hugetlb_range,
668#endif
634 .mm = vma->vm_mm, 669 .mm = vma->vm_mm,
635 .private = &mss, 670 .private = &mss,
636 }; 671 };
@@ -652,6 +687,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
652 "Referenced: %8lu kB\n" 687 "Referenced: %8lu kB\n"
653 "Anonymous: %8lu kB\n" 688 "Anonymous: %8lu kB\n"
654 "AnonHugePages: %8lu kB\n" 689 "AnonHugePages: %8lu kB\n"
690 "Shared_Hugetlb: %8lu kB\n"
691 "Private_Hugetlb: %7lu kB\n"
655 "Swap: %8lu kB\n" 692 "Swap: %8lu kB\n"
656 "SwapPss: %8lu kB\n" 693 "SwapPss: %8lu kB\n"
657 "KernelPageSize: %8lu kB\n" 694 "KernelPageSize: %8lu kB\n"
@@ -667,6 +704,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
667 mss.referenced >> 10, 704 mss.referenced >> 10,
668 mss.anonymous >> 10, 705 mss.anonymous >> 10,
669 mss.anonymous_thp >> 10, 706 mss.anonymous_thp >> 10,
707 mss.shared_hugetlb >> 10,
708 mss.private_hugetlb >> 10,
670 mss.swap >> 10, 709 mss.swap >> 10,
671 (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), 710 (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
672 vma_kernel_pagesize(vma) >> 10, 711 vma_kernel_pagesize(vma) >> 10,
@@ -753,19 +792,27 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
753 pte_t ptent = *pte; 792 pte_t ptent = *pte;
754 793
755 if (pte_present(ptent)) { 794 if (pte_present(ptent)) {
795 ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
756 ptent = pte_wrprotect(ptent); 796 ptent = pte_wrprotect(ptent);
757 ptent = pte_clear_soft_dirty(ptent); 797 ptent = pte_clear_soft_dirty(ptent);
798 ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
758 } else if (is_swap_pte(ptent)) { 799 } else if (is_swap_pte(ptent)) {
759 ptent = pte_swp_clear_soft_dirty(ptent); 800 ptent = pte_swp_clear_soft_dirty(ptent);
801 set_pte_at(vma->vm_mm, addr, pte, ptent);
760 } 802 }
761
762 set_pte_at(vma->vm_mm, addr, pte, ptent);
763} 803}
804#else
805static inline void clear_soft_dirty(struct vm_area_struct *vma,
806 unsigned long addr, pte_t *pte)
807{
808}
809#endif
764 810
811#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 812static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp) 813 unsigned long addr, pmd_t *pmdp)
767{ 814{
768 pmd_t pmd = *pmdp; 815 pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
769 816
770 pmd = pmd_wrprotect(pmd); 817 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_soft_dirty(pmd); 818 pmd = pmd_clear_soft_dirty(pmd);
@@ -775,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
775 822
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 823 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777} 824}
778
779#else 825#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 826static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp) 827 unsigned long addr, pmd_t *pmdp)
788{ 828{
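
With smaps_hugetlb_range() wired into the page walker, hugetlb mappings are reported through the two new smaps fields. A small user-space reader that totals them for a process; the program is illustrative and assumes a kernel carrying this patch:

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	unsigned long kb, shared = 0, private_kb = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/smaps",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	/* Sum the new per-VMA fields across the whole address space. */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Shared_Hugetlb: %lu kB", &kb) == 1)
			shared += kb;
		else if (sscanf(line, "Private_Hugetlb: %lu kB", &kb) == 1)
			private_kb += kb;
	}
	fclose(f);

	printf("Shared_Hugetlb:  %lu kB\nPrivate_Hugetlb: %lu kB\n",
	       shared, private_kb);
	return 0;
}
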
diff --git a/fs/sync.c b/fs/sync.c
index fbc98ee62044..4ec430ae2b0d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
86 86
87static void fdatawait_one_bdev(struct block_device *bdev, void *arg) 87static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
88{ 88{
89 filemap_fdatawait(bdev->bd_inode->i_mapping); 89 /*
 90	 * We keep the error status of each individual mapping so that

91 * applications can catch the writeback error using fsync(2).
92 * See filemap_fdatawait_keep_errors() for details.
93 */
94 filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
90} 95}
91 96
92/* 97/*
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index aa8f61cf3a19..4cd4ddf64cc7 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -15,7 +15,8 @@
15/* For more detailed tracepoint output */ 15/* For more detailed tracepoint output */
16#define COMPACT_NO_SUITABLE_PAGE 5 16#define COMPACT_NO_SUITABLE_PAGE 5
17#define COMPACT_NOT_SUITABLE_ZONE 6 17#define COMPACT_NOT_SUITABLE_ZONE 6
18/* When adding new state, please change compaction_status_string, too */ 18#define COMPACT_CONTENDED 7
19/* When adding new states, please adjust include/trace/events/compaction.h */
19 20
20/* Used to signal whether compaction detected need_sched() or lock contention */ 21/* Used to signal whether compaction detected need_sched() or lock contention */
21/* No contention detected */ 22/* No contention detected */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 8efb40e61d6e..0e3110a0b771 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -210,6 +210,23 @@
210#define __visible __attribute__((externally_visible)) 210#define __visible __attribute__((externally_visible))
211#endif 211#endif
212 212
213
214#if GCC_VERSION >= 40900 && !defined(__CHECKER__)
215/*
216 * __assume_aligned(n, k): Tell the optimizer that the returned
217 * pointer can be assumed to be k modulo n. The second argument is
218 * optional (default 0), so we use a variadic macro to make the
219 * shorthand.
220 *
221 * Beware: Do not apply this to functions which may return
222 * ERR_PTRs. Also, it is probably unwise to apply it to functions
223 * returning extra information in the low bits (but in that case the
224 * compiler should see some alignment anyway, when the return value is
225 * massaged by 'flags = ptr & 3; ptr &= ~3;').
226 */
227#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
228#endif
229
213/* 230/*
214 * GCC 'asm goto' miscompiles certain code sequences: 231 * GCC 'asm goto' miscompiles certain code sequences:
215 * 232 *
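
A hedged user-space illustration of what the new __assume_aligned() annotation expresses; the allocator wrapper is invented, only the attribute mirrors the definition above:

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>

/* Coarser than the kernel's GCC >= 4.9 check, but safe for a demo. */
#if defined(__GNUC__) && __GNUC__ >= 5
#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
#else
#define __assume_aligned(a, ...)
#endif

/*
 * Promise the optimizer that the returned pointer is 64-byte aligned,
 * so accesses through it can be vectorized without runtime checks.
 */
static void *alloc_cacheline(size_t size) __assume_aligned(64);

static void *alloc_cacheline(size_t size)
{
	void *p = NULL;

	if (posix_memalign(&p, 64, size))
		return NULL;
	return p;
}

int main(void)
{
	double *v = alloc_cacheline(64 * sizeof(double));

	if (!v)
		return 1;
	for (int i = 0; i < 64; i++)
		v[i] = i;
	printf("%p -> %f\n", (void *)v, v[63]);
	free(v);
	return 0;
}
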
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 52a459ff75f4..4dac1036594f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -417,6 +417,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
417#define __visible 417#define __visible
418#endif 418#endif
419 419
420/*
421 * Assume alignment of return value.
422 */
423#ifndef __assume_aligned
424#define __assume_aligned(a, ...)
425#endif
426
427
420/* Are two types/vars the same type (ignoring qualifiers)? */ 428/* Are two types/vars the same type (ignoring qualifiers)? */
421#ifndef __same_type 429#ifndef __same_type
422# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) 430# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1b357997cac5..5a1311942358 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void);
93 93
94extern void rebuild_sched_domains(void); 94extern void rebuild_sched_domains(void);
95 95
96extern void cpuset_print_task_mems_allowed(struct task_struct *p); 96extern void cpuset_print_current_mems_allowed(void);
97 97
98/* 98/*
99 * read_mems_allowed_begin is required when making decisions involving 99 * read_mems_allowed_begin is required when making decisions involving
@@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void)
219 partition_sched_domains(1, NULL, NULL); 219 partition_sched_domains(1, NULL, NULL);
220} 220}
221 221
222static inline void cpuset_print_task_mems_allowed(struct task_struct *p) 222static inline void cpuset_print_current_mems_allowed(void)
223{ 223{
224} 224}
225 225
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 49749688156d..9a1cb8c605e0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2409,6 +2409,7 @@ extern int write_inode_now(struct inode *, int);
2409extern int filemap_fdatawrite(struct address_space *); 2409extern int filemap_fdatawrite(struct address_space *);
2410extern int filemap_flush(struct address_space *); 2410extern int filemap_flush(struct address_space *);
2411extern int filemap_fdatawait(struct address_space *); 2411extern int filemap_fdatawait(struct address_space *);
2412extern void filemap_fdatawait_keep_errors(struct address_space *);
2412extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, 2413extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
2413 loff_t lend); 2414 loff_t lend);
2414extern int filemap_write_and_wait(struct address_space *mapping); 2415extern int filemap_write_and_wait(struct address_space *mapping);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5e35379f58a5..685c262e0be8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
483#define hugepages_supported() (HPAGE_SHIFT != 0) 483#define hugepages_supported() (HPAGE_SHIFT != 0)
484#endif 484#endif
485 485
486void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
487
488static inline void hugetlb_count_add(long l, struct mm_struct *mm)
489{
490 atomic_long_add(l, &mm->hugetlb_usage);
491}
492
493static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
494{
495 atomic_long_sub(l, &mm->hugetlb_usage);
496}
486#else /* CONFIG_HUGETLB_PAGE */ 497#else /* CONFIG_HUGETLB_PAGE */
487struct hstate {}; 498struct hstate {};
488#define alloc_huge_page(v, a, r) NULL 499#define alloc_huge_page(v, a, r) NULL
@@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
519{ 530{
520 return &mm->page_table_lock; 531 return &mm->page_table_lock;
521} 532}
533
534static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
535{
536}
537
538static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
539{
540}
522#endif /* CONFIG_HUGETLB_PAGE */ 541#endif /* CONFIG_HUGETLB_PAGE */
523 542
524static inline spinlock_t *huge_pte_lock(struct hstate *h, 543static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c518eb589260..24daf8fc4d7c 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type,
89 phys_addr_t base, phys_addr_t size, 89 phys_addr_t base, phys_addr_t size,
90 int nid, unsigned long flags); 90 int nid, unsigned long flags);
91 91
92int memblock_remove_range(struct memblock_type *type,
93 phys_addr_t base,
94 phys_addr_t size);
95
96void __next_mem_range(u64 *idx, int nid, ulong flags, 92void __next_mem_range(u64 *idx, int nid, ulong flags,
97 struct memblock_type *type_a, 93 struct memblock_type *type_a,
98 struct memblock_type *type_b, phys_addr_t *out_start, 94 struct memblock_type *type_b, phys_addr_t *out_start,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 27251ed428f7..cd0e2413c358 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -301,8 +301,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
301void mem_cgroup_uncharge(struct page *page); 301void mem_cgroup_uncharge(struct page *page);
302void mem_cgroup_uncharge_list(struct list_head *page_list); 302void mem_cgroup_uncharge_list(struct list_head *page_list);
303 303
304void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 304void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
305 bool lrucare);
306 305
307struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 306struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
308struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 307struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -384,7 +383,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
384 return mz->lru_size[lru]; 383 return mz->lru_size[lru];
385} 384}
386 385
387static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 386static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
388{ 387{
389 unsigned long inactive_ratio; 388 unsigned long inactive_ratio;
390 unsigned long inactive; 389 unsigned long inactive;
@@ -403,24 +402,26 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
403 return inactive * inactive_ratio < active; 402 return inactive * inactive_ratio < active;
404} 403}
405 404
405void mem_cgroup_handle_over_high(void);
406
406void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 407void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
407 struct task_struct *p); 408 struct task_struct *p);
408 409
409static inline void mem_cgroup_oom_enable(void) 410static inline void mem_cgroup_oom_enable(void)
410{ 411{
411 WARN_ON(current->memcg_oom.may_oom); 412 WARN_ON(current->memcg_may_oom);
412 current->memcg_oom.may_oom = 1; 413 current->memcg_may_oom = 1;
413} 414}
414 415
415static inline void mem_cgroup_oom_disable(void) 416static inline void mem_cgroup_oom_disable(void)
416{ 417{
417 WARN_ON(!current->memcg_oom.may_oom); 418 WARN_ON(!current->memcg_may_oom);
418 current->memcg_oom.may_oom = 0; 419 current->memcg_may_oom = 0;
419} 420}
420 421
421static inline bool task_in_memcg_oom(struct task_struct *p) 422static inline bool task_in_memcg_oom(struct task_struct *p)
422{ 423{
423 return p->memcg_oom.memcg; 424 return p->memcg_in_oom;
424} 425}
425 426
426bool mem_cgroup_oom_synchronize(bool wait); 427bool mem_cgroup_oom_synchronize(bool wait);
@@ -537,9 +538,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
537{ 538{
538} 539}
539 540
540static inline void mem_cgroup_migrate(struct page *oldpage, 541static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
541 struct page *newpage,
542 bool lrucare)
543{ 542{
544} 543}
545 544
@@ -585,10 +584,10 @@ static inline bool mem_cgroup_disabled(void)
585 return true; 584 return true;
586} 585}
587 586
588static inline int 587static inline bool
589mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 588mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
590{ 589{
591 return 1; 590 return true;
592} 591}
593 592
594static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) 593static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
@@ -622,6 +621,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
622{ 621{
623} 622}
624 623
624static inline void mem_cgroup_handle_over_high(void)
625{
626}
627
625static inline void mem_cgroup_oom_enable(void) 628static inline void mem_cgroup_oom_enable(void)
626{ 629{
627} 630}
@@ -748,11 +751,10 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
748 * conditions, but because they are pretty simple, they are expected to be 751 * conditions, but because they are pretty simple, they are expected to be
749 * fast. 752 * fast.
750 */ 753 */
751bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, 754int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
752 int order); 755 struct mem_cgroup *memcg);
753void __memcg_kmem_commit_charge(struct page *page, 756int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
754 struct mem_cgroup *memcg, int order); 757void __memcg_kmem_uncharge(struct page *page, int order);
755void __memcg_kmem_uncharge_pages(struct page *page, int order);
756 758
757/* 759/*
758 * helper for acessing a memcg's index. It will be used as an index in the 760 * helper for acessing a memcg's index. It will be used as an index in the
@@ -767,77 +769,42 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
767struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); 769struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
768void __memcg_kmem_put_cache(struct kmem_cache *cachep); 770void __memcg_kmem_put_cache(struct kmem_cache *cachep);
769 771
770struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); 772static inline bool __memcg_kmem_bypass(gfp_t gfp)
771
772int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
773 unsigned long nr_pages);
774void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
775
776/**
777 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
778 * @gfp: the gfp allocation flags.
779 * @memcg: a pointer to the memcg this was charged against.
780 * @order: allocation order.
781 *
782 * returns true if the memcg where the current task belongs can hold this
783 * allocation.
784 *
785 * We return true automatically if this allocation is not to be accounted to
786 * any memcg.
787 */
788static inline bool
789memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
790{ 773{
791 if (!memcg_kmem_enabled()) 774 if (!memcg_kmem_enabled())
792 return true; 775 return true;
793
794 if (gfp & __GFP_NOACCOUNT) 776 if (gfp & __GFP_NOACCOUNT)
795 return true; 777 return true;
796 /*
797 * __GFP_NOFAIL allocations will move on even if charging is not
798 * possible. Therefore we don't even try, and have this allocation
799 * unaccounted. We could in theory charge it forcibly, but we hope
800 * those allocations are rare, and won't be worth the trouble.
801 */
802 if (gfp & __GFP_NOFAIL)
803 return true;
804 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 778 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
805 return true; 779 return true;
806 780 return false;
807 /* If the test is dying, just let it go. */
808 if (unlikely(fatal_signal_pending(current)))
809 return true;
810
811 return __memcg_kmem_newpage_charge(gfp, memcg, order);
812} 781}
813 782
814/** 783/**
815 * memcg_kmem_uncharge_pages: uncharge pages from memcg 784 * memcg_kmem_charge: charge a kmem page
816 * @page: pointer to struct page being freed 785 * @page: page to charge
817 * @order: allocation order. 786 * @gfp: reclaim mode
787 * @order: allocation order
788 *
789 * Returns 0 on success, an error code on failure.
818 */ 790 */
819static inline void 791static __always_inline int memcg_kmem_charge(struct page *page,
820memcg_kmem_uncharge_pages(struct page *page, int order) 792 gfp_t gfp, int order)
821{ 793{
822 if (memcg_kmem_enabled()) 794 if (__memcg_kmem_bypass(gfp))
823 __memcg_kmem_uncharge_pages(page, order); 795 return 0;
796 return __memcg_kmem_charge(page, gfp, order);
824} 797}
825 798
826/** 799/**
827 * memcg_kmem_commit_charge: embeds correct memcg in a page 800 * memcg_kmem_uncharge: uncharge a kmem page
828 * @page: pointer to struct page recently allocated 801 * @page: page to uncharge
829 * @memcg: the memcg structure we charged against 802 * @order: allocation order
830 * @order: allocation order.
831 *
832 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
833 * failure of the allocation. if @page is NULL, this function will revert the
834 * charges. Otherwise, it will commit @page to @memcg.
835 */ 803 */
836static inline void 804static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
837memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
838{ 805{
839 if (memcg_kmem_enabled() && memcg) 806 if (memcg_kmem_enabled())
840 __memcg_kmem_commit_charge(page, memcg, order); 807 __memcg_kmem_uncharge(page, order);
841} 808}
842 809
843/** 810/**
@@ -850,17 +817,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
850static __always_inline struct kmem_cache * 817static __always_inline struct kmem_cache *
851memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 818memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
852{ 819{
853 if (!memcg_kmem_enabled()) 820 if (__memcg_kmem_bypass(gfp))
854 return cachep;
855 if (gfp & __GFP_NOACCOUNT)
856 return cachep;
857 if (gfp & __GFP_NOFAIL)
858 return cachep;
859 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
860 return cachep; 821 return cachep;
861 if (unlikely(fatal_signal_pending(current)))
862 return cachep;
863
864 return __memcg_kmem_get_cache(cachep); 822 return __memcg_kmem_get_cache(cachep);
865} 823}
866 824
@@ -869,13 +827,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
869 if (memcg_kmem_enabled()) 827 if (memcg_kmem_enabled())
870 __memcg_kmem_put_cache(cachep); 828 __memcg_kmem_put_cache(cachep);
871} 829}
872
873static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
874{
875 if (!memcg_kmem_enabled())
876 return NULL;
877 return __mem_cgroup_from_kmem(ptr);
878}
879#else 830#else
880#define for_each_memcg_cache_index(_idx) \ 831#define for_each_memcg_cache_index(_idx) \
881 for (; NULL; ) 832 for (; NULL; )
@@ -890,18 +841,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
890 return false; 841 return false;
891} 842}
892 843
893static inline bool 844static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
894memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
895{
896 return true;
897}
898
899static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
900{ 845{
846 return 0;
901} 847}
902 848
903static inline void 849static inline void memcg_kmem_uncharge(struct page *page, int order)
904memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
905{ 850{
906} 851}
907 852
@@ -927,11 +872,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
927static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) 872static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
928{ 873{
929} 874}
930
931static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
932{
933 return NULL;
934}
935#endif /* CONFIG_MEMCG_KMEM */ 875#endif /* CONFIG_MEMCG_KMEM */
936#endif /* _LINUX_MEMCONTROL_H */ 876#endif /* _LINUX_MEMCONTROL_H */
937
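
The slimmed-down interface pairs one charge at allocation time with one uncharge at free time. A hedged sketch of a possible in-kernel caller after this patch -- the demo_* wrappers are invented, only memcg_kmem_charge()/memcg_kmem_uncharge() and their parameters come from the header:

#include <linux/gfp.h>
#include <linux/memcontrol.h>

/* Allocate an order-N block and charge it to the current task's memcg. */
static struct page *demo_alloc_charged(gfp_t gfp, int order)
{
	struct page *page = alloc_pages(gfp, order);

	if (!page)
		return NULL;

	/* memcg_kmem_charge() returns 0 on success (or when bypassed). */
	if (memcg_kmem_charge(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	return page;
}

static void demo_free_charged(struct page *page, int order)
{
	memcg_kmem_uncharge(page, order);
	__free_pages(page, order);
}
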
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80001de019ba..906c46a05707 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
139 139
140#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ 140#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
141#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ 141#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
142#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
142#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 143#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
143#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 144#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
144#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 145#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
202/* This mask defines which mm->def_flags a process can inherit its parent */ 203/* This mask defines which mm->def_flags a process can inherit its parent */
203#define VM_INIT_DEF_MASK VM_NOHUGEPAGE 204#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
204 205
206/* This mask is used to clear all the VMA flags used by mlock */
207#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
208
205/* 209/*
206 * mapping from the currently active vm_flags protection bits (the 210 * mapping from the currently active vm_flags protection bits (the
207 * low four bits) to a page protection mask.. 211 * low four bits) to a page protection mask..
@@ -1606,8 +1610,10 @@ static inline void pgtable_init(void)
1606 1610
1607static inline bool pgtable_page_ctor(struct page *page) 1611static inline bool pgtable_page_ctor(struct page *page)
1608{ 1612{
1613 if (!ptlock_init(page))
1614 return false;
1609 inc_zone_page_state(page, NR_PAGETABLE); 1615 inc_zone_page_state(page, NR_PAGETABLE);
1610 return ptlock_init(page); 1616 return true;
1611} 1617}
1612 1618
1613static inline void pgtable_page_dtor(struct page *page) 1619static inline void pgtable_page_dtor(struct page *page)
@@ -2036,8 +2042,6 @@ void page_cache_async_readahead(struct address_space *mapping,
2036 pgoff_t offset, 2042 pgoff_t offset,
2037 unsigned long size); 2043 unsigned long size);
2038 2044
2039unsigned long max_sane_readahead(unsigned long nr);
2040
2041/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ 2045/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
2042extern int expand_stack(struct vm_area_struct *vma, unsigned long address); 2046extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
2043 2047
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
2137#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ 2141#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
2138#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ 2142#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
2139#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ 2143#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
2144#define FOLL_MLOCK 0x1000 /* lock present pages */
2140 2145
2141typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 2146typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
2142 void *data); 2147 void *data);
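
A hedged sketch of how VM_LOCKED_CLEAR_MASK and VM_LOCKONFAULT are meant to combine when mlock state is (re)applied to a VMA's flag word; the helper is illustrative, not the mm code itself:

#include <linux/mm.h>

/*
 * Illustrative only: given the current vm_flags and an mlock2()-style
 * request, compute the new flag word.  VM_LOCKED_CLEAR_MASK strips any
 * previous VM_LOCKED/VM_LOCKONFAULT state before the new one is applied.
 */
static unsigned long demo_apply_mlock_flags(unsigned long vm_flags,
					    bool lock, bool onfault)
{
	vm_flags &= VM_LOCKED_CLEAR_MASK;
	if (lock)
		vm_flags |= onfault ? (VM_LOCKED | VM_LOCKONFAULT) : VM_LOCKED;
	return vm_flags;
}
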
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3d6baa7d4534..0a85da25a822 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -486,6 +486,9 @@ struct mm_struct {
486 /* address of the bounds directory */ 486 /* address of the bounds directory */
487 void __user *bd_addr; 487 void __user *bd_addr;
488#endif 488#endif
489#ifdef CONFIG_HUGETLB_PAGE
490 atomic_long_t hugetlb_usage;
491#endif
489}; 492};
490 493
491static inline void mm_init_cpumask(struct mm_struct *mm) 494static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d94347737292..2d7e660cdefe 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -823,8 +823,7 @@ enum memmap_context {
823 MEMMAP_HOTPLUG, 823 MEMMAP_HOTPLUG,
824}; 824};
825extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, 825extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
826 unsigned long size, 826 unsigned long size);
827 enum memmap_context context);
828 827
829extern void lruvec_init(struct lruvec *lruvec); 828extern void lruvec_init(struct lruvec *lruvec);
830 829
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 78488e099ce7..7ec5b86735f3 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
73extern int watchdog_thresh; 73extern int watchdog_thresh;
74extern unsigned long *watchdog_cpumask_bits; 74extern unsigned long *watchdog_cpumask_bits;
75extern int sysctl_softlockup_all_cpu_backtrace; 75extern int sysctl_softlockup_all_cpu_backtrace;
76extern int sysctl_hardlockup_all_cpu_backtrace;
76struct ctl_table; 77struct ctl_table;
77extern int proc_watchdog(struct ctl_table *, int , 78extern int proc_watchdog(struct ctl_table *, int ,
78 void __user *, size_t *, loff_t *); 79 void __user *, size_t *, loff_t *);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 416509e26d6d..a525e5067484 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -256,7 +256,7 @@ PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
256 * Must use a macro here due to header dependency issues. page_zone() is not 256 * Must use a macro here due to header dependency issues. page_zone() is not
257 * available at this point. 257 * available at this point.
258 */ 258 */
259#define PageHighMem(__p) is_highmem(page_zone(__p)) 259#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
260#else 260#else
261PAGEFLAG_FALSE(HighMem) 261PAGEFLAG_FALSE(HighMem)
262#endif 262#endif
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 17fa4f8de3a6..7e62920a3a94 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct page_counter *counter)
36 36
37void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); 37void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
38void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); 38void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
39int page_counter_try_charge(struct page_counter *counter, 39bool page_counter_try_charge(struct page_counter *counter,
40 unsigned long nr_pages, 40 unsigned long nr_pages,
41 struct page_counter **fail); 41 struct page_counter **fail);
42void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); 42void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
43int page_counter_limit(struct page_counter *counter, unsigned long limit); 43int page_counter_limit(struct page_counter *counter, unsigned long limit);
44int page_counter_memparse(const char *buf, const char *max, 44int page_counter_memparse(const char *buf, const char *max,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4effb1025fbb..eeb5066a44fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
384 void __user *buffer, 384 void __user *buffer,
385 size_t *lenp, loff_t *ppos); 385 size_t *lenp, loff_t *ppos);
386extern unsigned int softlockup_panic; 386extern unsigned int softlockup_panic;
387extern unsigned int hardlockup_panic;
387void lockup_detector_init(void); 388void lockup_detector_init(void);
388#else 389#else
389static inline void touch_softlockup_watchdog(void) 390static inline void touch_softlockup_watchdog(void)
@@ -1460,7 +1461,9 @@ struct task_struct {
1460 unsigned sched_reset_on_fork:1; 1461 unsigned sched_reset_on_fork:1;
1461 unsigned sched_contributes_to_load:1; 1462 unsigned sched_contributes_to_load:1;
1462 unsigned sched_migrated:1; 1463 unsigned sched_migrated:1;
1463 1464#ifdef CONFIG_MEMCG
1465 unsigned memcg_may_oom:1;
1466#endif
1464#ifdef CONFIG_MEMCG_KMEM 1467#ifdef CONFIG_MEMCG_KMEM
1465 unsigned memcg_kmem_skip_account:1; 1468 unsigned memcg_kmem_skip_account:1;
1466#endif 1469#endif
@@ -1791,12 +1794,12 @@ struct task_struct {
1791 unsigned long trace_recursion; 1794 unsigned long trace_recursion;
1792#endif /* CONFIG_TRACING */ 1795#endif /* CONFIG_TRACING */
1793#ifdef CONFIG_MEMCG 1796#ifdef CONFIG_MEMCG
1794 struct memcg_oom_info { 1797 struct mem_cgroup *memcg_in_oom;
1795 struct mem_cgroup *memcg; 1798 gfp_t memcg_oom_gfp_mask;
1796 gfp_t gfp_mask; 1799 int memcg_oom_order;
1797 int order; 1800
1798 unsigned int may_oom:1; 1801 /* number of pages to reclaim on returning to userland */
1799 } memcg_oom; 1802 unsigned int memcg_nr_pages_over_high;
1800#endif 1803#endif
1801#ifdef CONFIG_UPROBES 1804#ifdef CONFIG_UPROBES
1802 struct uprobe_task *utask; 1805 struct uprobe_task *utask;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7e37d448ed91..7c82e3b307a3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -111,7 +111,7 @@ struct mem_cgroup;
111 * struct kmem_cache related prototypes 111 * struct kmem_cache related prototypes
112 */ 112 */
113void __init kmem_cache_init(void); 113void __init kmem_cache_init(void);
114int slab_is_available(void); 114bool slab_is_available(void);
115 115
116struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 116struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
117 unsigned long, 117 unsigned long,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a460e2ef2843..a156b82dd14c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
887 887
888asmlinkage long sys_membarrier(int cmd, int flags); 888asmlinkage long sys_membarrier(int cmd, int flags);
889 889
890asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
891
890#endif 892#endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 84d497297c5f..26c152122a42 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
50#include <linux/ptrace.h> 50#include <linux/ptrace.h>
51#include <linux/security.h> 51#include <linux/security.h>
52#include <linux/task_work.h> 52#include <linux/task_work.h>
53#include <linux/memcontrol.h>
53struct linux_binprm; 54struct linux_binprm;
54 55
55/* 56/*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
188 smp_mb__after_atomic(); 189 smp_mb__after_atomic();
189 if (unlikely(current->task_works)) 190 if (unlikely(current->task_works))
190 task_work_run(); 191 task_work_run();
192
193 mem_cgroup_handle_over_high();
191} 194}
192 195
193#endif /* <linux/tracehook.h> */ 196#endif /* <linux/tracehook.h> */
diff --git a/include/linux/types.h b/include/linux/types.h
index c314989d9158..70d8500bddf1 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -205,11 +205,25 @@ struct ustat {
205 * struct callback_head - callback structure for use with RCU and task_work 205 * struct callback_head - callback structure for use with RCU and task_work
206 * @next: next update requests in a list 206 * @next: next update requests in a list
207 * @func: actual update function to call after the grace period. 207 * @func: actual update function to call after the grace period.
208 *
 209 * The struct is aligned to the size of a pointer. On most architectures this
 210 * happens naturally due to ABI requirements, but some architectures (like
 211 * CRIS) have a weird ABI and we need to ask for it explicitly.
 212 *
 213 * The alignment is required to guarantee that bits 0 and 1 of @next will be
 214 * clear under normal conditions -- as long as we use call_rcu(),
 215 * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue the callback.
 216 *
 217 * This guarantee is important for a few reasons:
 218 * - a future call_rcu_lazy() will make use of the lower bits in the pointer;
 219 * - the structure shares storage space in struct page with @compound_head,
 220 *   which encodes PageTail() in bit 0. The guarantee is needed to avoid a
 221 *   false-positive PageTail().
208 */ 222 */
209struct callback_head { 223struct callback_head {
210 struct callback_head *next; 224 struct callback_head *next;
211 void (*func)(struct callback_head *head); 225 void (*func)(struct callback_head *head);
212}; 226} __attribute__((aligned(sizeof(void *))));
213#define rcu_head callback_head 227#define rcu_head callback_head
214 228
215typedef void (*rcu_callback_t)(struct rcu_head *head); 229typedef void (*rcu_callback_t)(struct rcu_head *head);
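
The forced pointer-size alignment is what keeps bits 0 and 1 of a struct callback_head pointer clear. A small user-space check of that property; the struct layout is copied from the hunk, the assertion is only illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

int main(void)
{
	struct callback_head cb[2];

	/* Pointer-size alignment means bits 0 and 1 are always clear. */
	for (int i = 0; i < 2; i++)
		assert(((uintptr_t)&cb[i] & 3) == 0);

	printf("alignof(struct callback_head) = %zu\n",
	       _Alignof(struct callback_head));
	return 0;
}
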
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index d6f2c2c5b043..558129af828a 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to,
75 75
76#endif /* ARCH_HAS_NOCACHE_UACCESS */ 76#endif /* ARCH_HAS_NOCACHE_UACCESS */
77 77
78/**
79 * probe_kernel_address(): safely attempt to read from a location
80 * @addr: address to read from - its type is type typeof(retval)*
81 * @retval: read into this variable
82 *
83 * Safely read from address @addr into variable @revtal. If a kernel fault
84 * happens, handle that and return -EFAULT.
85 * We ensure that the __get_user() is executed in atomic context so that
86 * do_page_fault() doesn't attempt to take mmap_sem. This makes
87 * probe_kernel_address() suitable for use within regions where the caller
88 * already holds mmap_sem, or other locks which nest inside mmap_sem.
89 * This must be a macro because __get_user() needs to know the types of the
90 * args.
91 *
92 * We don't include enough header files to be able to do the set_fs(). We
93 * require that the probe_kernel_address() caller will do that.
94 */
95#define probe_kernel_address(addr, retval) \
96 ({ \
97 long ret; \
98 mm_segment_t old_fs = get_fs(); \
99 \
100 set_fs(KERNEL_DS); \
101 pagefault_disable(); \
102 ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
103 pagefault_enable(); \
104 set_fs(old_fs); \
105 ret; \
106 })
107
108/* 78/*
109 * probe_kernel_read(): safely attempt to read from a location 79 * probe_kernel_read(): safely attempt to read from a location
110 * @dst: pointer to the buffer that shall take the data 80 * @dst: pointer to the buffer that shall take the data
@@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size
131 101
132extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); 102extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
133 103
104/**
105 * probe_kernel_address(): safely attempt to read from a location
106 * @addr: address to read from
107 * @retval: read into this variable
108 *
109 * Returns 0 on success, or -EFAULT.
110 */
111#define probe_kernel_address(addr, retval) \
112 probe_kernel_read(&retval, addr, sizeof(retval))
113
134#endif /* __LINUX_UACCESS_H__ */ 114#endif /* __LINUX_UACCESS_H__ */
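
After this cleanup probe_kernel_address() is a typed wrapper around probe_kernel_read(), still returning 0 on success or -EFAULT. A hedged in-kernel usage sketch; the surrounding function is invented:

#include <linux/uaccess.h>
#include <linux/kernel.h>

/* Peek at a possibly bogus kernel pointer without risking an oops. */
static void demo_dump_word(const unsigned long *addr)
{
	unsigned long val;

	if (probe_kernel_address(addr, val))
		pr_info("%p: <unreadable>\n", addr);
	else
		pr_info("%p: %#lx\n", addr, val);
}
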
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9246d32dc973..e623d392db0c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -14,12 +14,12 @@
14#endif 14#endif
15 15
16#ifdef CONFIG_HIGHMEM 16#ifdef CONFIG_HIGHMEM
17#define HIGHMEM_ZONE(xx) , xx##_HIGH 17#define HIGHMEM_ZONE(xx) xx##_HIGH,
18#else 18#else
19#define HIGHMEM_ZONE(xx) 19#define HIGHMEM_ZONE(xx)
20#endif 20#endif
21 21
22#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE 22#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
23 23
24enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, 24enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
25 FOR_ALL_ZONES(PGALLOC), 25 FOR_ALL_ZONES(PGALLOC),
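
The comma shuffle matters because HIGHMEM_ZONE(xx) can expand to nothing. A hand expansion of the new style for illustration; the DMA_ZONE()/DMA32_ZONE() definitions are assumed to follow the same trailing-comma pattern (they sit outside the hunk):

/* Hand expansion of the new style, CONFIG_HIGHMEM case, illustration only. */
#define DMA_ZONE(xx)     xx##_DMA,
#define DMA32_ZONE(xx)   xx##_DMA32,
#define HIGHMEM_ZONE(xx) xx##_HIGH,
#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE

/* Expands to PGALLOC_DMA, PGALLOC_DMA32, PGALLOC_NORMAL, PGALLOC_HIGH, PGALLOC_MOVABLE */
enum demo_vm_event_item {
	FOR_ALL_ZONES(PGALLOC),
	DEMO_NR_ITEMS
};

int main(void)
{
	/* PGALLOC_MOVABLE is the fifth enumerator, i.e. 4. */
	return PGALLOC_MOVABLE == 4 ? 0 : 1;
}
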
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7f7100..5dbc8b0ee567 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
161} 161}
162 162
163#ifdef CONFIG_NUMA 163#ifdef CONFIG_NUMA
164/*
165 * Determine the per node value of a stat item. This function
166 * is called frequently in a NUMA machine, so try to be as
167 * frugal as possible.
168 */
169static inline unsigned long node_page_state(int node,
170 enum zone_stat_item item)
171{
172 struct zone *zones = NODE_DATA(node)->node_zones;
173
174 return
175#ifdef CONFIG_ZONE_DMA
176 zone_page_state(&zones[ZONE_DMA], item) +
177#endif
178#ifdef CONFIG_ZONE_DMA32
179 zone_page_state(&zones[ZONE_DMA32], item) +
180#endif
181#ifdef CONFIG_HIGHMEM
182 zone_page_state(&zones[ZONE_HIGHMEM], item) +
183#endif
184 zone_page_state(&zones[ZONE_NORMAL], item) +
185 zone_page_state(&zones[ZONE_MOVABLE], item);
186}
187 164
165extern unsigned long node_page_state(int node, enum zone_stat_item item);
188extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); 166extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
189 167
190#else 168#else
@@ -269,7 +247,6 @@ static inline void __dec_zone_page_state(struct page *page,
269 247
270#define set_pgdat_percpu_threshold(pgdat, callback) { } 248#define set_pgdat_percpu_threshold(pgdat, callback) { }
271 249
272static inline void refresh_cpu_vm_stats(int cpu) { }
273static inline void refresh_zone_stat_thresholds(void) { } 250static inline void refresh_zone_stat_thresholds(void) { }
274static inline void cpu_vm_stats_fold(int cpu) { } 251static inline void cpu_vm_stats_fold(int cpu) { }
275 252
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 9a6a3fe0fb51..c92d1e1cbad9 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -9,6 +9,62 @@
9#include <linux/tracepoint.h> 9#include <linux/tracepoint.h>
10#include <trace/events/gfpflags.h> 10#include <trace/events/gfpflags.h>
11 11
12#define COMPACTION_STATUS \
13 EM( COMPACT_DEFERRED, "deferred") \
14 EM( COMPACT_SKIPPED, "skipped") \
15 EM( COMPACT_CONTINUE, "continue") \
16 EM( COMPACT_PARTIAL, "partial") \
17 EM( COMPACT_COMPLETE, "complete") \
18 EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \
19 EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \
20 EMe(COMPACT_CONTENDED, "contended")
21
22#ifdef CONFIG_ZONE_DMA
23#define IFDEF_ZONE_DMA(X) X
24#else
25#define IFDEF_ZONE_DMA(X)
26#endif
27
28#ifdef CONFIG_ZONE_DMA32
29#define IFDEF_ZONE_DMA32(X) X
30#else
31#define IFDEF_ZONE_DMA32(X)
32#endif
33
34#ifdef CONFIG_HIGHMEM
35#define IFDEF_ZONE_HIGHMEM(X) X
36#else
37#define IFDEF_ZONE_HIGHMEM(X)
38#endif
39
40#define ZONE_TYPE \
41 IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \
42 IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \
43 EM (ZONE_NORMAL, "Normal") \
44 IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \
45 EMe(ZONE_MOVABLE,"Movable")
46
47/*
48 * First define the enums in the above macros to be exported to userspace
49 * via TRACE_DEFINE_ENUM().
50 */
51#undef EM
52#undef EMe
53#define EM(a, b) TRACE_DEFINE_ENUM(a);
54#define EMe(a, b) TRACE_DEFINE_ENUM(a);
55
56COMPACTION_STATUS
57ZONE_TYPE
58
59/*
60 * Now redefine the EM() and EMe() macros to map the enums to the strings
61 * that will be printed in the output.
62 */
63#undef EM
64#undef EMe
65#define EM(a, b) {a, b},
66#define EMe(a, b) {a, b}
67
12DECLARE_EVENT_CLASS(mm_compaction_isolate_template, 68DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
13 69
14 TP_PROTO( 70 TP_PROTO(
@@ -161,7 +217,7 @@ TRACE_EVENT(mm_compaction_end,
161 __entry->free_pfn, 217 __entry->free_pfn,
162 __entry->zone_end, 218 __entry->zone_end,
163 __entry->sync ? "sync" : "async", 219 __entry->sync ? "sync" : "async",
164 compaction_status_string[__entry->status]) 220 __print_symbolic(__entry->status, COMPACTION_STATUS))
165); 221);
166 222
167TRACE_EVENT(mm_compaction_try_to_compact_pages, 223TRACE_EVENT(mm_compaction_try_to_compact_pages,
@@ -201,23 +257,23 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
201 257
202 TP_STRUCT__entry( 258 TP_STRUCT__entry(
203 __field(int, nid) 259 __field(int, nid)
204 __field(char *, name) 260 __field(enum zone_type, idx)
205 __field(int, order) 261 __field(int, order)
206 __field(int, ret) 262 __field(int, ret)
207 ), 263 ),
208 264
209 TP_fast_assign( 265 TP_fast_assign(
210 __entry->nid = zone_to_nid(zone); 266 __entry->nid = zone_to_nid(zone);
211 __entry->name = (char *)zone->name; 267 __entry->idx = zone_idx(zone);
212 __entry->order = order; 268 __entry->order = order;
213 __entry->ret = ret; 269 __entry->ret = ret;
214 ), 270 ),
215 271
216 TP_printk("node=%d zone=%-8s order=%d ret=%s", 272 TP_printk("node=%d zone=%-8s order=%d ret=%s",
217 __entry->nid, 273 __entry->nid,
218 __entry->name, 274 __print_symbolic(__entry->idx, ZONE_TYPE),
219 __entry->order, 275 __entry->order,
220 compaction_status_string[__entry->ret]) 276 __print_symbolic(__entry->ret, COMPACTION_STATUS))
221); 277);
222 278
223DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, 279DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
@@ -247,7 +303,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
247 303
248 TP_STRUCT__entry( 304 TP_STRUCT__entry(
249 __field(int, nid) 305 __field(int, nid)
250 __field(char *, name) 306 __field(enum zone_type, idx)
251 __field(int, order) 307 __field(int, order)
252 __field(unsigned int, considered) 308 __field(unsigned int, considered)
253 __field(unsigned int, defer_shift) 309 __field(unsigned int, defer_shift)
@@ -256,7 +312,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
256 312
257 TP_fast_assign( 313 TP_fast_assign(
258 __entry->nid = zone_to_nid(zone); 314 __entry->nid = zone_to_nid(zone);
259 __entry->name = (char *)zone->name; 315 __entry->idx = zone_idx(zone);
260 __entry->order = order; 316 __entry->order = order;
261 __entry->considered = zone->compact_considered; 317 __entry->considered = zone->compact_considered;
262 __entry->defer_shift = zone->compact_defer_shift; 318 __entry->defer_shift = zone->compact_defer_shift;
@@ -265,7 +321,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
265 321
266 TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", 322 TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
267 __entry->nid, 323 __entry->nid,
268 __entry->name, 324 __print_symbolic(__entry->idx, ZONE_TYPE),
269 __entry->order, 325 __entry->order,
270 __entry->order_failed, 326 __entry->order_failed,
271 __entry->considered, 327 __entry->considered,
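
The hunk above replaces the open-coded compaction_status_string[] lookup with the EM()/EMe() two-pass macro idiom: one list is expanded first as TRACE_DEFINE_ENUM() (so the enum values are visible to userspace trace tooling) and then again as the {value, "name"} pairs that __print_symbolic() consumes. Below is a minimal, self-contained userspace sketch of the same idiom; the names are illustrative, and the enum is defined locally here rather than pulled from a header as the kernel does.

#include <stdio.h>

#define COMPACTION_STATUS_LIST \
	EM(COMPACT_DEFERRED, "deferred") \
	EM(COMPACT_SKIPPED, "skipped") \
	EMe(COMPACT_COMPLETE, "complete")

/* First expansion: declare the enum (the kernel uses TRACE_DEFINE_ENUM() here). */
#define EM(a, b)	a,
#define EMe(a, b)	a
enum compact_status { COMPACTION_STATUS_LIST };
#undef EM
#undef EMe

/* Second expansion: the value -> string table that __print_symbolic() would take. */
#define EM(a, b)	{ a, b },
#define EMe(a, b)	{ a, b }
static const struct { int val; const char *name; } status_names[] = {
	COMPACTION_STATUS_LIST
};
#undef EM
#undef EMe

int main(void)
{
	for (unsigned int i = 0; i < sizeof(status_names) / sizeof(status_names[0]); i++)
		printf("%d -> %s\n", status_names[i].val, status_names[i].name);
	return 0;
}
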
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046..a74dd84bbb6d 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -25,6 +25,11 @@
25# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ 25# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
26#endif 26#endif
27 27
28/*
29 * Flags for mlock
30 */
31#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
32
28#define MS_ASYNC 1 /* sync memory asynchronously */ 33#define MS_ASYNC 1 /* sync memory asynchronously */
29#define MS_INVALIDATE 2 /* invalidate the caches */ 34#define MS_INVALIDATE 2 /* invalidate the caches */
30#define MS_SYNC 4 /* synchronous memory sync */ 35#define MS_SYNC 4 /* synchronous memory sync */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index e9fe6fd2a074..7162cd4cca73 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -17,5 +17,6 @@
17 17
18#define MCL_CURRENT 1 /* lock all current mappings */ 18#define MCL_CURRENT 1 /* lock all current mappings */
19#define MCL_FUTURE 2 /* lock all future mappings */ 19#define MCL_FUTURE 2 /* lock all future mappings */
20#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
20 21
21#endif /* __ASM_GENERIC_MMAN_H */ 22#endif /* __ASM_GENERIC_MMAN_H */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index ee124009e12a..1324b0292ec2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
713__SYSCALL(__NR_userfaultfd, sys_userfaultfd) 713__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
714#define __NR_membarrier 283 714#define __NR_membarrier 283
715__SYSCALL(__NR_membarrier, sys_membarrier) 715__SYSCALL(__NR_membarrier, sys_membarrier)
716#define __NR_mlock2 284
717__SYSCALL(__NR_mlock2, sys_mlock2)
716 718
717#undef __NR_syscalls 719#undef __NR_syscalls
718#define __NR_syscalls 284 720#define __NR_syscalls 285
719 721
720/* 722/*
721 * All syscalls below here should go away really, 723 * All syscalls below here should go away really,
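
Taken together, the mman and unistd hunks above define the userspace-visible pieces of lock-on-fault: MLOCK_ONFAULT for mlock2(), MCL_ONFAULT for mlockall(), and the mlock2 syscall slot. A hedged usage sketch follows; it assumes installed kernel headers that already carry __NR_mlock2 (glibc had no mlock2() wrapper at this point), and the fallback flag values are copied from the hunks above.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT	0x01	/* from asm-generic/mman-common.h above */
#endif
#ifndef MCL_ONFAULT
#define MCL_ONFAULT	4	/* from asm-generic/mman.h above */
#endif

int main(void)
{
	size_t len = 16 * (size_t)sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Lock the range without prefaulting; pages are pinned as they are touched. */
	if (syscall(SYS_mlock2, buf, len, MLOCK_ONFAULT))
		perror("mlock2(MLOCK_ONFAULT)");

	buf[0] = 1;	/* faults in, and thereby locks, only the first page */

	/* The same policy applied process-wide to future mappings. */
	if (mlockall(MCL_FUTURE | MCL_ONFAULT))
		perror("mlockall(MCL_FUTURE | MCL_ONFAULT)");

	munmap(buf, len);
	return 0;
}
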
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d7ccb87a6714..10ae73611d80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2598} 2598}
2599 2599
2600/** 2600/**
2601 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2601 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
2602 * @tsk: pointer to task_struct of some task.
2603 * 2602 *
2604 * Description: Prints @task's name, cpuset name, and cached copy of its 2603 * Description: Prints current's name, cpuset name, and cached copy of its
2605 * mems_allowed to the kernel log. 2604 * mems_allowed to the kernel log.
2606 */ 2605 */
2607void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2606void cpuset_print_current_mems_allowed(void)
2608{ 2607{
2609 struct cgroup *cgrp; 2608 struct cgroup *cgrp;
2610 2609
2611 rcu_read_lock(); 2610 rcu_read_lock();
2612 2611
2613 cgrp = task_cs(tsk)->css.cgroup; 2612 cgrp = task_cs(current)->css.cgroup;
2614 pr_info("%s cpuset=", tsk->comm); 2613 pr_info("%s cpuset=", current->comm);
2615 pr_cont_cgroup_name(cgrp); 2614 pr_cont_cgroup_name(cgrp);
2616 pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); 2615 pr_cont(" mems_allowed=%*pbl\n",
2616 nodemask_pr_args(&current->mems_allowed));
2617 2617
2618 rcu_read_unlock(); 2618 rcu_read_unlock();
2619} 2619}
diff --git a/kernel/fork.c b/kernel/fork.c
index 825ecc32454d..f97f2c449f5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
455 tmp->vm_mm = mm; 455 tmp->vm_mm = mm;
456 if (anon_vma_fork(tmp, mpnt)) 456 if (anon_vma_fork(tmp, mpnt))
457 goto fail_nomem_anon_vma_fork; 457 goto fail_nomem_anon_vma_fork;
458 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); 458 tmp->vm_flags &=
459 ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
459 tmp->vm_next = tmp->vm_prev = NULL; 460 tmp->vm_next = tmp->vm_prev = NULL;
460 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 461 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
461 file = tmp->vm_file; 462 file = tmp->vm_file;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a02decf15583..0623787ec67a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
194cond_syscall(sys_munlock); 194cond_syscall(sys_munlock);
195cond_syscall(sys_mlockall); 195cond_syscall(sys_mlockall);
196cond_syscall(sys_munlockall); 196cond_syscall(sys_munlockall);
197cond_syscall(sys_mlock2);
197cond_syscall(sys_mincore); 198cond_syscall(sys_mincore);
198cond_syscall(sys_madvise); 199cond_syscall(sys_madvise);
199cond_syscall(sys_mremap); 200cond_syscall(sys_mremap);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96c856b04081..dc6858d6639e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = {
888 .extra1 = &zero, 888 .extra1 = &zero,
889 .extra2 = &one, 889 .extra2 = &one,
890 }, 890 },
891#ifdef CONFIG_HARDLOCKUP_DETECTOR
892 {
893 .procname = "hardlockup_panic",
894 .data = &hardlockup_panic,
895 .maxlen = sizeof(int),
896 .mode = 0644,
897 .proc_handler = proc_dointvec_minmax,
898 .extra1 = &zero,
899 .extra2 = &one,
900 },
901#endif
891#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
892 { 903 {
893 .procname = "softlockup_all_cpu_backtrace", 904 .procname = "softlockup_all_cpu_backtrace",
@@ -898,6 +909,15 @@ static struct ctl_table kern_table[] = {
898 .extra1 = &zero, 909 .extra1 = &zero,
899 .extra2 = &one, 910 .extra2 = &one,
900 }, 911 },
912 {
913 .procname = "hardlockup_all_cpu_backtrace",
914 .data = &sysctl_hardlockup_all_cpu_backtrace,
915 .maxlen = sizeof(int),
916 .mode = 0644,
917 .proc_handler = proc_dointvec_minmax,
918 .extra1 = &zero,
919 .extra2 = &one,
920 },
901#endif /* CONFIG_SMP */ 921#endif /* CONFIG_SMP */
902#endif 922#endif
903#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 923#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
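
The two sysctl entries above expose hardlockup_panic and hardlockup_all_cpu_backtrace at run time in addition to the boot parameters. A small sketch of toggling them from userspace (as root) follows; the paths assume a kernel built with CONFIG_HARDLOCKUP_DETECTOR, and the backtrace knob additionally requires CONFIG_SMP.

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Panic on a detected hard lockup ... */
	write_knob("/proc/sys/kernel/hardlockup_panic", "1");
	/* ... and dump backtraces of all CPUs before doing so. */
	write_knob("/proc/sys/kernel/hardlockup_all_cpu_backtrace", "1");
	return 0;
}
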
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 64ed1c37bd1f..18f34cf75f74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
57 57
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59int __read_mostly sysctl_softlockup_all_cpu_backtrace; 59int __read_mostly sysctl_softlockup_all_cpu_backtrace;
60int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
60#else 61#else
61#define sysctl_softlockup_all_cpu_backtrace 0 62#define sysctl_softlockup_all_cpu_backtrace 0
63#define sysctl_hardlockup_all_cpu_backtrace 0
62#endif 64#endif
63static struct cpumask watchdog_cpumask __read_mostly; 65static struct cpumask watchdog_cpumask __read_mostly;
64unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 66unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
110 * Should we panic when a soft-lockup or hard-lockup occurs: 112 * Should we panic when a soft-lockup or hard-lockup occurs:
111 */ 113 */
112#ifdef CONFIG_HARDLOCKUP_DETECTOR 114#ifdef CONFIG_HARDLOCKUP_DETECTOR
113static int hardlockup_panic = 115unsigned int __read_mostly hardlockup_panic =
114 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 116 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
117static unsigned long hardlockup_allcpu_dumped;
115/* 118/*
116 * We may not want to enable hard lockup detection by default in all cases, 119 * We may not want to enable hard lockup detection by default in all cases,
117 * for example when running the kernel as a guest on a hypervisor. In these 120 * for example when running the kernel as a guest on a hypervisor. In these
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
173 return 1; 176 return 1;
174} 177}
175__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 178__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
179static int __init hardlockup_all_cpu_backtrace_setup(char *str)
180{
181 sysctl_hardlockup_all_cpu_backtrace =
182 !!simple_strtol(str, NULL, 0);
183 return 1;
184}
185__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
176#endif 186#endif
177 187
178/* 188/*
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
263 273
264#ifdef CONFIG_HARDLOCKUP_DETECTOR 274#ifdef CONFIG_HARDLOCKUP_DETECTOR
265/* watchdog detector functions */ 275/* watchdog detector functions */
266static int is_hardlockup(void) 276static bool is_hardlockup(void)
267{ 277{
268 unsigned long hrint = __this_cpu_read(hrtimer_interrupts); 278 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
269 279
270 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) 280 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
271 return 1; 281 return true;
272 282
273 __this_cpu_write(hrtimer_interrupts_saved, hrint); 283 __this_cpu_write(hrtimer_interrupts_saved, hrint);
274 return 0; 284 return false;
275} 285}
276#endif 286#endif
277 287
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
279{ 289{
280 unsigned long now = get_timestamp(); 290 unsigned long now = get_timestamp();
281 291
282 if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { 292 if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
283 /* Warn about unreasonable delays. */ 293 /* Warn about unreasonable delays. */
284 if (time_after(now, touch_ts + get_softlockup_thresh())) 294 if (time_after(now, touch_ts + get_softlockup_thresh()))
285 return now - touch_ts; 295 return now - touch_ts;
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
318 */ 328 */
319 if (is_hardlockup()) { 329 if (is_hardlockup()) {
320 int this_cpu = smp_processor_id(); 330 int this_cpu = smp_processor_id();
331 struct pt_regs *regs = get_irq_regs();
321 332
322 /* only print hardlockups once */ 333 /* only print hardlockups once */
323 if (__this_cpu_read(hard_watchdog_warn) == true) 334 if (__this_cpu_read(hard_watchdog_warn) == true)
324 return; 335 return;
325 336
326 if (hardlockup_panic) 337 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
327 panic("Watchdog detected hard LOCKUP on cpu %d", 338 print_modules();
328 this_cpu); 339 print_irqtrace_events(current);
340 if (regs)
341 show_regs(regs);
329 else 342 else
330 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", 343 dump_stack();
331 this_cpu); 344
345 /*
346 * Perform all-CPU dump only once to avoid multiple hardlockups
347 * generating interleaving traces
348 */
349 if (sysctl_hardlockup_all_cpu_backtrace &&
350 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
351 trigger_allbutself_cpu_backtrace();
352
353 if (hardlockup_panic)
354 panic("Hard LOCKUP");
332 355
333 __this_cpu_write(hard_watchdog_warn, true); 356 __this_cpu_write(hard_watchdog_warn, true);
334 return; 357 return;
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
347static int watchdog_nmi_enable(unsigned int cpu); 370static int watchdog_nmi_enable(unsigned int cpu);
348static void watchdog_nmi_disable(unsigned int cpu); 371static void watchdog_nmi_disable(unsigned int cpu);
349 372
373static int watchdog_enable_all_cpus(void);
374static void watchdog_disable_all_cpus(void);
375
350/* watchdog kicker functions */ 376/* watchdog kicker functions */
351static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 377static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
352{ 378{
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = {
651 677
652/* 678/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask' 679 * park all watchdog threads that are specified in 'watchdog_cpumask'
680 *
681 * This function returns an error if kthread_park() of a watchdog thread
682 * fails. In this situation, the watchdog threads of some CPUs can already
683 * be parked and the watchdog threads of other CPUs can still be runnable.
684 * Callers are expected to handle this special condition as appropriate in
685 * their context.
686 *
687 * This function may only be called in a context that is protected against
688 * races with CPU hotplug - for example, via get_online_cpus().
654 */ 689 */
655static int watchdog_park_threads(void) 690static int watchdog_park_threads(void)
656{ 691{
657 int cpu, ret = 0; 692 int cpu, ret = 0;
658 693
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) { 694 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); 695 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret) 696 if (ret)
663 break; 697 break;
664 } 698 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670 699
671 return ret; 700 return ret;
672} 701}
673 702
674/* 703/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask' 704 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
705 *
706 * This function may only be called in a context that is protected against
707 * races with CPU hotplug - for example, via get_online_cpus().
676 */ 708 */
677static void watchdog_unpark_threads(void) 709static void watchdog_unpark_threads(void)
678{ 710{
679 int cpu; 711 int cpu;
680 712
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu) 713 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 714 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685} 715}
686 716
687/* 717/*
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void)
691{ 721{
692 int ret = 0; 722 int ret = 0;
693 723
724 get_online_cpus();
694 mutex_lock(&watchdog_proc_mutex); 725 mutex_lock(&watchdog_proc_mutex);
695 /* 726 /*
696 * Multiple suspend requests can be active in parallel (counted by 727 * Multiple suspend requests can be active in parallel (counted by
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void)
704 735
705 if (ret == 0) 736 if (ret == 0)
706 watchdog_suspended++; 737 watchdog_suspended++;
738 else {
739 watchdog_disable_all_cpus();
740 pr_err("Failed to suspend lockup detectors, disabled\n");
741 watchdog_enabled = 0;
742 }
707 743
708 mutex_unlock(&watchdog_proc_mutex); 744 mutex_unlock(&watchdog_proc_mutex);
709 745
@@ -726,12 +762,20 @@ void lockup_detector_resume(void)
726 watchdog_unpark_threads(); 762 watchdog_unpark_threads();
727 763
728 mutex_unlock(&watchdog_proc_mutex); 764 mutex_unlock(&watchdog_proc_mutex);
765 put_online_cpus();
729} 766}
730 767
731static void update_watchdog_all_cpus(void) 768static int update_watchdog_all_cpus(void)
732{ 769{
733 watchdog_park_threads(); 770 int ret;
771
772 ret = watchdog_park_threads();
773 if (ret)
774 return ret;
775
734 watchdog_unpark_threads(); 776 watchdog_unpark_threads();
777
778 return 0;
735} 779}
736 780
737static int watchdog_enable_all_cpus(void) 781static int watchdog_enable_all_cpus(void)
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void)
750 * Enable/disable the lockup detectors or 794 * Enable/disable the lockup detectors or
751 * change the sample period 'on the fly'. 795 * change the sample period 'on the fly'.
752 */ 796 */
753 update_watchdog_all_cpus(); 797 err = update_watchdog_all_cpus();
798
799 if (err) {
800 watchdog_disable_all_cpus();
801 pr_err("Failed to update lockup detectors, disabled\n");
802 }
754 } 803 }
755 804
805 if (err)
806 watchdog_enabled = 0;
807
756 return err; 808 return err;
757} 809}
758 810
759/* prepare/enable/disable routines */
760/* sysctl functions */
761#ifdef CONFIG_SYSCTL
762static void watchdog_disable_all_cpus(void) 811static void watchdog_disable_all_cpus(void)
763{ 812{
764 if (watchdog_running) { 813 if (watchdog_running) {
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
767 } 816 }
768} 817}
769 818
819#ifdef CONFIG_SYSCTL
820
770/* 821/*
771 * Update the run state of the lockup detectors. 822 * Update the run state of the lockup detectors.
772 */ 823 */
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
808 int err, old, new; 859 int err, old, new;
809 int *watchdog_param = (int *)table->data; 860 int *watchdog_param = (int *)table->data;
810 861
862 get_online_cpus();
811 mutex_lock(&watchdog_proc_mutex); 863 mutex_lock(&watchdog_proc_mutex);
812 864
813 if (watchdog_suspended) { 865 if (watchdog_suspended) {
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
849 } while (cmpxchg(&watchdog_enabled, old, new) != old); 901 } while (cmpxchg(&watchdog_enabled, old, new) != old);
850 902
851 /* 903 /*
852 * Update the run state of the lockup detectors. 904 * Update the run state of the lockup detectors. There is _no_
853 * Restore 'watchdog_enabled' on failure. 905 * need to check the value returned by proc_watchdog_update()
906 * and to restore the previous value of 'watchdog_enabled' as
907 * both lockup detectors are disabled if proc_watchdog_update()
908 * returns an error.
854 */ 909 */
855 err = proc_watchdog_update(); 910 err = proc_watchdog_update();
856 if (err)
857 watchdog_enabled = old;
858 } 911 }
859out: 912out:
860 mutex_unlock(&watchdog_proc_mutex); 913 mutex_unlock(&watchdog_proc_mutex);
914 put_online_cpus();
861 return err; 915 return err;
862} 916}
863 917
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
899{ 953{
900 int err, old; 954 int err, old;
901 955
956 get_online_cpus();
902 mutex_lock(&watchdog_proc_mutex); 957 mutex_lock(&watchdog_proc_mutex);
903 958
904 if (watchdog_suspended) { 959 if (watchdog_suspended) {
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
914 goto out; 969 goto out;
915 970
916 /* 971 /*
917 * Update the sample period. 972 * Update the sample period. Restore on failure.
918 * Restore 'watchdog_thresh' on failure.
919 */ 973 */
920 set_sample_period(); 974 set_sample_period();
921 err = proc_watchdog_update(); 975 err = proc_watchdog_update();
922 if (err) 976 if (err) {
923 watchdog_thresh = old; 977 watchdog_thresh = old;
978 set_sample_period();
979 }
924out: 980out:
925 mutex_unlock(&watchdog_proc_mutex); 981 mutex_unlock(&watchdog_proc_mutex);
982 put_online_cpus();
926 return err; 983 return err;
927} 984}
928 985
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
937{ 994{
938 int err; 995 int err;
939 996
997 get_online_cpus();
940 mutex_lock(&watchdog_proc_mutex); 998 mutex_lock(&watchdog_proc_mutex);
941 999
942 if (watchdog_suspended) { 1000 if (watchdog_suspended) {
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
964 } 1022 }
965out: 1023out:
966 mutex_unlock(&watchdog_proc_mutex); 1024 mutex_unlock(&watchdog_proc_mutex);
1025 put_online_cpus();
967 return err; 1026 return err;
968} 1027}
969 1028
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 39f24d6721e5..0fee5acd5aa0 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -15,8 +15,7 @@ config KASAN
15 global variables requires gcc 5.0 or later. 15 global variables requires gcc 5.0 or later.
16 This feature consumes about 1/8 of available memory and brings about 16 This feature consumes about 1/8 of available memory and brings about
17 ~x3 performance slowdown. 17 ~x3 performance slowdown.
18 For better error detection enable CONFIG_STACKTRACE, 18 For better error detection enable CONFIG_STACKTRACE.
19 and add slub_debug=U to boot cmdline.
20 19
21choice 20choice
22 prompt "Instrumentation type" 21 prompt "Instrumentation type"
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index c1efb1b61017..c32f3b0048dc 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -138,6 +138,71 @@ static noinline void __init kmalloc_oob_16(void)
138 kfree(ptr2); 138 kfree(ptr2);
139} 139}
140 140
141static noinline void __init kmalloc_oob_memset_2(void)
142{
143 char *ptr;
144 size_t size = 8;
145
146 pr_info("out-of-bounds in memset2\n");
147 ptr = kmalloc(size, GFP_KERNEL);
148 if (!ptr) {
149 pr_err("Allocation failed\n");
150 return;
151 }
152
153 memset(ptr+7, 0, 2);
154 kfree(ptr);
155}
156
157static noinline void __init kmalloc_oob_memset_4(void)
158{
159 char *ptr;
160 size_t size = 8;
161
162 pr_info("out-of-bounds in memset4\n");
163 ptr = kmalloc(size, GFP_KERNEL);
164 if (!ptr) {
165 pr_err("Allocation failed\n");
166 return;
167 }
168
169 memset(ptr+5, 0, 4);
170 kfree(ptr);
171}
172
173
174static noinline void __init kmalloc_oob_memset_8(void)
175{
176 char *ptr;
177 size_t size = 8;
178
179 pr_info("out-of-bounds in memset8\n");
180 ptr = kmalloc(size, GFP_KERNEL);
181 if (!ptr) {
182 pr_err("Allocation failed\n");
183 return;
184 }
185
186 memset(ptr+1, 0, 8);
187 kfree(ptr);
188}
189
190static noinline void __init kmalloc_oob_memset_16(void)
191{
192 char *ptr;
193 size_t size = 16;
194
195 pr_info("out-of-bounds in memset16\n");
196 ptr = kmalloc(size, GFP_KERNEL);
197 if (!ptr) {
198 pr_err("Allocation failed\n");
199 return;
200 }
201
202 memset(ptr+1, 0, 16);
203 kfree(ptr);
204}
205
141static noinline void __init kmalloc_oob_in_memset(void) 206static noinline void __init kmalloc_oob_in_memset(void)
142{ 207{
143 char *ptr; 208 char *ptr;
@@ -264,6 +329,10 @@ static int __init kmalloc_tests_init(void)
264 kmalloc_oob_krealloc_less(); 329 kmalloc_oob_krealloc_less();
265 kmalloc_oob_16(); 330 kmalloc_oob_16();
266 kmalloc_oob_in_memset(); 331 kmalloc_oob_in_memset();
332 kmalloc_oob_memset_2();
333 kmalloc_oob_memset_4();
334 kmalloc_oob_memset_8();
335 kmalloc_oob_memset_16();
267 kmalloc_uaf(); 336 kmalloc_uaf();
268 kmalloc_uaf_memset(); 337 kmalloc_uaf_memset();
269 kmalloc_uaf2(); 338 kmalloc_uaf2();
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index fcad8322ef36..d3116be5a00f 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage,
199 struct balloon_dev_info *balloon = balloon_page_device(page); 199 struct balloon_dev_info *balloon = balloon_page_device(page);
200 int rc = -EAGAIN; 200 int rc = -EAGAIN;
201 201
202 /* 202 VM_BUG_ON_PAGE(!PageLocked(page), page);
203 * Block others from accessing the 'newpage' when we get around to 203 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
204 * establishing additional references. We should be the only one
205 * holding a reference to the 'newpage' at this point.
206 */
207 BUG_ON(!trylock_page(newpage));
208 204
209 if (WARN_ON(!__is_movable_balloon_page(page))) { 205 if (WARN_ON(!__is_movable_balloon_page(page))) {
210 dump_page(page, "not movable balloon page"); 206 dump_page(page, "not movable balloon page");
211 unlock_page(newpage);
212 return rc; 207 return rc;
213 } 208 }
214 209
215 if (balloon && balloon->migratepage) 210 if (balloon && balloon->migratepage)
216 rc = balloon->migratepage(balloon, newpage, page, mode); 211 rc = balloon->migratepage(balloon, newpage, page, mode);
217 212
218 unlock_page(newpage);
219 return rc; 213 return rc;
220} 214}
221#endif /* CONFIG_BALLOON_COMPACTION */ 215#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cma.c b/mm/cma.c
index 4eb56badf37e..ea506eb18cd6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -363,7 +363,9 @@ err:
363 */ 363 */
364struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) 364struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
365{ 365{
366 unsigned long mask, offset, pfn, start = 0; 366 unsigned long mask, offset;
367 unsigned long pfn = -1;
368 unsigned long start = 0;
367 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 369 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
368 struct page *page = NULL; 370 struct page *page = NULL;
369 int ret; 371 int ret;
@@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
418 start = bitmap_no + mask + 1; 420 start = bitmap_no + mask + 1;
419 } 421 }
420 422
421 trace_cma_alloc(page ? pfn : -1UL, page, count, align); 423 trace_cma_alloc(pfn, page, count, align);
422 424
423 pr_debug("%s(): returned %p\n", __func__, page); 425 pr_debug("%s(): returned %p\n", __func__, page);
424 return page; 426 return page;
diff --git a/mm/compaction.c b/mm/compaction.c
index c5c627aae996..de3e1e71cd9f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
35#endif 35#endif
36 36
37#if defined CONFIG_COMPACTION || defined CONFIG_CMA 37#if defined CONFIG_COMPACTION || defined CONFIG_CMA
38#ifdef CONFIG_TRACEPOINTS
39static const char *const compaction_status_string[] = {
40 "deferred",
41 "skipped",
42 "continue",
43 "partial",
44 "complete",
45 "no_suitable_page",
46 "not_suitable_zone",
47};
48#endif
49 38
50#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
51#include <trace/events/compaction.h> 40#include <trace/events/compaction.h>
@@ -1197,6 +1186,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1197 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1186 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1198} 1187}
1199 1188
1189/*
1190 * order == -1 is expected when compacting via
1191 * /proc/sys/vm/compact_memory
1192 */
1193static inline bool is_via_compact_memory(int order)
1194{
1195 return order == -1;
1196}
1197
1200static int __compact_finished(struct zone *zone, struct compact_control *cc, 1198static int __compact_finished(struct zone *zone, struct compact_control *cc,
1201 const int migratetype) 1199 const int migratetype)
1202{ 1200{
@@ -1204,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1204 unsigned long watermark; 1202 unsigned long watermark;
1205 1203
1206 if (cc->contended || fatal_signal_pending(current)) 1204 if (cc->contended || fatal_signal_pending(current))
1207 return COMPACT_PARTIAL; 1205 return COMPACT_CONTENDED;
1208 1206
1209 /* Compaction run completes if the migrate and free scanner meet */ 1207 /* Compaction run completes if the migrate and free scanner meet */
1210 if (compact_scanners_met(cc)) { 1208 if (compact_scanners_met(cc)) {
@@ -1223,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1223 return COMPACT_COMPLETE; 1221 return COMPACT_COMPLETE;
1224 } 1222 }
1225 1223
1226 /* 1224 if (is_via_compact_memory(cc->order))
1227 * order == -1 is expected when compacting via
1228 * /proc/sys/vm/compact_memory
1229 */
1230 if (cc->order == -1)
1231 return COMPACT_CONTINUE; 1225 return COMPACT_CONTINUE;
1232 1226
1233 /* Compaction run is not finished if the watermark is not met */ 1227 /* Compaction run is not finished if the watermark is not met */
@@ -1290,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
1290 int fragindex; 1284 int fragindex;
1291 unsigned long watermark; 1285 unsigned long watermark;
1292 1286
1293 /* 1287 if (is_via_compact_memory(order))
1294 * order == -1 is expected when compacting via
1295 * /proc/sys/vm/compact_memory
1296 */
1297 if (order == -1)
1298 return COMPACT_CONTINUE; 1288 return COMPACT_CONTINUE;
1299 1289
1300 watermark = low_wmark_pages(zone); 1290 watermark = low_wmark_pages(zone);
@@ -1403,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1403 1393
1404 switch (isolate_migratepages(zone, cc)) { 1394 switch (isolate_migratepages(zone, cc)) {
1405 case ISOLATE_ABORT: 1395 case ISOLATE_ABORT:
1406 ret = COMPACT_PARTIAL; 1396 ret = COMPACT_CONTENDED;
1407 putback_movable_pages(&cc->migratepages); 1397 putback_movable_pages(&cc->migratepages);
1408 cc->nr_migratepages = 0; 1398 cc->nr_migratepages = 0;
1409 goto out; 1399 goto out;
@@ -1434,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1434 * and we want compact_finished() to detect it 1424 * and we want compact_finished() to detect it
1435 */ 1425 */
1436 if (err == -ENOMEM && !compact_scanners_met(cc)) { 1426 if (err == -ENOMEM && !compact_scanners_met(cc)) {
1437 ret = COMPACT_PARTIAL; 1427 ret = COMPACT_CONTENDED;
1438 goto out; 1428 goto out;
1439 } 1429 }
1440 } 1430 }
@@ -1487,6 +1477,9 @@ out:
1487 trace_mm_compaction_end(start_pfn, cc->migrate_pfn, 1477 trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1488 cc->free_pfn, end_pfn, sync, ret); 1478 cc->free_pfn, end_pfn, sync, ret);
1489 1479
1480 if (ret == COMPACT_CONTENDED)
1481 ret = COMPACT_PARTIAL;
1482
1490 return ret; 1483 return ret;
1491} 1484}
1492 1485
@@ -1658,10 +1651,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1658 * this makes sure we compact the whole zone regardless of 1651 * this makes sure we compact the whole zone regardless of
1659 * cached scanner positions. 1652 * cached scanner positions.
1660 */ 1653 */
1661 if (cc->order == -1) 1654 if (is_via_compact_memory(cc->order))
1662 __reset_isolation_suitable(zone); 1655 __reset_isolation_suitable(zone);
1663 1656
1664 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 1657 if (is_via_compact_memory(cc->order) ||
1658 !compaction_deferred(zone, cc->order))
1665 compact_zone(zone, cc); 1659 compact_zone(zone, cc);
1666 1660
1667 if (cc->order > 0) { 1661 if (cc->order > 0) {
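
is_via_compact_memory() above only documents the existing convention that order == -1 means whole-system compaction requested through procfs. For reference, a minimal sketch of triggering that path from userspace (as root):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

	if (!f) {
		perror("/proc/sys/vm/compact_memory");
		return 1;
	}
	fputs("1", f);	/* any write asks the kernel to compact all zones */
	fclose(f);
	return 0;
}
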
diff --git a/mm/debug.c b/mm/debug.c
index 6c1b3ea61bfd..e784110fb51d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
125 {VM_GROWSDOWN, "growsdown" }, 125 {VM_GROWSDOWN, "growsdown" },
126 {VM_PFNMAP, "pfnmap" }, 126 {VM_PFNMAP, "pfnmap" },
127 {VM_DENYWRITE, "denywrite" }, 127 {VM_DENYWRITE, "denywrite" },
128 {VM_LOCKONFAULT, "lockonfault" },
128 {VM_LOCKED, "locked" }, 129 {VM_LOCKED, "locked" },
129 {VM_IO, "io" }, 130 {VM_IO, "io" },
130 {VM_SEQ_READ, "seqread" }, 131 {VM_SEQ_READ, "seqread" },
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 17ae14b5aefa..6d5717bd7197 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
126 /* 126 /*
127 * Mappings have to be page-aligned 127 * Mappings have to be page-aligned
128 */ 128 */
129 offset = phys_addr & ~PAGE_MASK; 129 offset = offset_in_page(phys_addr);
130 phys_addr &= PAGE_MASK; 130 phys_addr &= PAGE_MASK;
131 size = PAGE_ALIGN(last_addr + 1) - phys_addr; 131 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
132 132
@@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
189 if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) 189 if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
190 return; 190 return;
191 191
192 offset = virt_addr & ~PAGE_MASK; 192 offset = offset_in_page(virt_addr);
193 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; 193 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
194 194
195 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; 195 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
@@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
234 char *p; 234 char *p;
235 235
236 while (size) { 236 while (size) {
237 slop = src & ~PAGE_MASK; 237 slop = offset_in_page(src);
238 clen = size; 238 clen = size;
239 if (clen > MAX_MAP_CHUNK - slop) 239 if (clen > MAX_MAP_CHUNK - slop)
240 clen = MAX_MAP_CHUNK - slop; 240 clen = MAX_MAP_CHUNK - slop;
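
The early_ioremap hunks above are a pure refactor: the open-coded addr & ~PAGE_MASK becomes offset_in_page(). A tiny userspace illustration of the arithmetic, with the page size hard-coded purely for the demo:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))
#define demo_offset_in_page(p)	((unsigned long)(p) & ~DEMO_PAGE_MASK)

int main(void)
{
	unsigned long phys_addr = 0x12345678;

	printf("offset  = %#lx\n", demo_offset_in_page(phys_addr));	/* 0x678 */
	printf("aligned = %#lx\n", phys_addr & DEMO_PAGE_MASK);		/* 0x12345000 */
	return 0;
}
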
diff --git a/mm/filemap.c b/mm/filemap.c
index 327910c2400c..58e04e26f996 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping)
331} 331}
332EXPORT_SYMBOL(filemap_flush); 332EXPORT_SYMBOL(filemap_flush);
333 333
334/** 334static int __filemap_fdatawait_range(struct address_space *mapping,
335 * filemap_fdatawait_range - wait for writeback to complete 335 loff_t start_byte, loff_t end_byte)
336 * @mapping: address space structure to wait for
337 * @start_byte: offset in bytes where the range starts
338 * @end_byte: offset in bytes where the range ends (inclusive)
339 *
340 * Walk the list of under-writeback pages of the given address space
341 * in the given range and wait for all of them.
342 */
343int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
344 loff_t end_byte)
345{ 336{
346 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; 337 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
347 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; 338 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
348 struct pagevec pvec; 339 struct pagevec pvec;
349 int nr_pages; 340 int nr_pages;
350 int ret2, ret = 0; 341 int ret = 0;
351 342
352 if (end_byte < start_byte) 343 if (end_byte < start_byte)
353 goto out; 344 goto out;
@@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
374 cond_resched(); 365 cond_resched();
375 } 366 }
376out: 367out:
368 return ret;
369}
370
371/**
372 * filemap_fdatawait_range - wait for writeback to complete
373 * @mapping: address space structure to wait for
374 * @start_byte: offset in bytes where the range starts
375 * @end_byte: offset in bytes where the range ends (inclusive)
376 *
377 * Walk the list of under-writeback pages of the given address space
378 * in the given range and wait for all of them. Check error status of
379 * the address space and return it.
380 *
381 * Since the error status of the address space is cleared by this function,
382 * callers are responsible for checking the return value and handling and/or
383 * reporting the error.
384 */
385int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
386 loff_t end_byte)
387{
388 int ret, ret2;
389
390 ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
377 ret2 = filemap_check_errors(mapping); 391 ret2 = filemap_check_errors(mapping);
378 if (!ret) 392 if (!ret)
379 ret = ret2; 393 ret = ret2;
@@ -383,11 +397,38 @@ out:
383EXPORT_SYMBOL(filemap_fdatawait_range); 397EXPORT_SYMBOL(filemap_fdatawait_range);
384 398
385/** 399/**
400 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
401 * @mapping: address space structure to wait for
402 *
403 * Walk the list of under-writeback pages of the given address space
404 * and wait for all of them. Unlike filemap_fdatawait(), this function
405 * does not clear error status of the address space.
406 *
407 * Use this function if callers don't handle errors themselves. Expected
408 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
409 * fsfreeze(8)
410 */
411void filemap_fdatawait_keep_errors(struct address_space *mapping)
412{
413 loff_t i_size = i_size_read(mapping->host);
414
415 if (i_size == 0)
416 return;
417
418 __filemap_fdatawait_range(mapping, 0, i_size - 1);
419}
420
421/**
386 * filemap_fdatawait - wait for all under-writeback pages to complete 422 * filemap_fdatawait - wait for all under-writeback pages to complete
387 * @mapping: address space structure to wait for 423 * @mapping: address space structure to wait for
388 * 424 *
389 * Walk the list of under-writeback pages of the given address space 425 * Walk the list of under-writeback pages of the given address space
390 * and wait for all of them. 426 * and wait for all of them. Check error status of the address space
427 * and return it.
428 *
429 * Since the error status of the address space is cleared by this function,
430 * callers are responsible for checking the return value and handling and/or
431 * reporting the error.
391 */ 432 */
392int filemap_fdatawait(struct address_space *mapping) 433int filemap_fdatawait(struct address_space *mapping)
393{ 434{
@@ -510,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
510 __inc_zone_page_state(new, NR_SHMEM); 551 __inc_zone_page_state(new, NR_SHMEM);
511 spin_unlock_irqrestore(&mapping->tree_lock, flags); 552 spin_unlock_irqrestore(&mapping->tree_lock, flags);
512 mem_cgroup_end_page_stat(memcg); 553 mem_cgroup_end_page_stat(memcg);
513 mem_cgroup_migrate(old, new, true); 554 mem_cgroup_replace_page(old, new);
514 radix_tree_preload_end(); 555 radix_tree_preload_end();
515 if (freepage) 556 if (freepage)
516 freepage(old); 557 freepage(old);
@@ -1807,7 +1848,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1807 struct file *file, 1848 struct file *file,
1808 pgoff_t offset) 1849 pgoff_t offset)
1809{ 1850{
1810 unsigned long ra_pages;
1811 struct address_space *mapping = file->f_mapping; 1851 struct address_space *mapping = file->f_mapping;
1812 1852
1813 /* If we don't want any read-ahead, don't bother */ 1853 /* If we don't want any read-ahead, don't bother */
@@ -1836,10 +1876,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1836 /* 1876 /*
1837 * mmap read-around 1877 * mmap read-around
1838 */ 1878 */
1839 ra_pages = max_sane_readahead(ra->ra_pages); 1879 ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
1840 ra->start = max_t(long, 0, offset - ra_pages / 2); 1880 ra->size = ra->ra_pages;
1841 ra->size = ra_pages; 1881 ra->async_size = ra->ra_pages / 4;
1842 ra->async_size = ra_pages / 4;
1843 ra_submit(ra, mapping, file); 1882 ra_submit(ra, mapping, file);
1844} 1883}
1845 1884
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index cdabcb93c6a6..7cf2b7163222 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -7,7 +7,7 @@
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9 9
10/* 10/**
11 * get_vaddr_frames() - map virtual addresses to pfns 11 * get_vaddr_frames() - map virtual addresses to pfns
12 * @start: starting user address 12 * @start: starting user address
13 * @nr_frames: number of pages / pfns from start to map 13 * @nr_frames: number of pages / pfns from start to map
diff --git a/mm/gup.c b/mm/gup.c
index a798293fc648..deafa2c91b36 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -129,7 +129,7 @@ retry:
129 */ 129 */
130 mark_page_accessed(page); 130 mark_page_accessed(page);
131 } 131 }
132 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { 132 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
133 /* 133 /*
134 * The preliminary mapping check is mainly to avoid the 134 * The preliminary mapping check is mainly to avoid the
135 * pointless overhead of lock_page on the ZERO_PAGE 135 * pointless overhead of lock_page on the ZERO_PAGE
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
299 unsigned int fault_flags = 0; 299 unsigned int fault_flags = 0;
300 int ret; 300 int ret;
301 301
302 /* mlock all present pages, but do not fault in new pages */
303 if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
304 return -ENOENT;
302 /* For mm_populate(), just skip the stack guard page. */ 305 /* For mm_populate(), just skip the stack guard page. */
303 if ((*flags & FOLL_POPULATE) && 306 if ((*flags & FOLL_POPULATE) &&
304 (stack_guard_page_start(vma, address) || 307 (stack_guard_page_start(vma, address) ||
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
890 VM_BUG_ON_VMA(end > vma->vm_end, vma); 893 VM_BUG_ON_VMA(end > vma->vm_end, vma);
891 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); 894 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
892 895
893 gup_flags = FOLL_TOUCH | FOLL_POPULATE; 896 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
897 if (vma->vm_flags & VM_LOCKONFAULT)
898 gup_flags &= ~FOLL_POPULATE;
899
894 /* 900 /*
895 * We want to touch writable mappings with a write fault in order 901 * We want to touch writable mappings with a write fault in order
896 * to break COW, except for shared mappings because these don't COW 902 * to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fd0311c3ba7..f5c08b46fef8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1307 pmd, _pmd, 1)) 1307 pmd, _pmd, 1))
1308 update_mmu_cache_pmd(vma, addr, pmd); 1308 update_mmu_cache_pmd(vma, addr, pmd);
1309 } 1309 }
1310 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { 1310 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1311 if (page->mapping && trylock_page(page)) { 1311 if (page->mapping && trylock_page(page)) {
1312 lru_add_drain(); 1312 lru_add_drain();
1313 if (page->mapping) 1313 if (page->mapping)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9cc773483624..74ef0c6a25dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1437,7 +1437,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1437 dissolve_free_huge_page(pfn_to_page(pfn)); 1437 dissolve_free_huge_page(pfn_to_page(pfn));
1438} 1438}
1439 1439
1440static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 1440/*
1441 * There are 3 ways this can get called:
1442 * 1. With vma+addr: we use the VMA's memory policy
1443 * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
1444 * page from any node, and let the buddy allocator itself figure
1445 * it out.
1446 * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
1447 * strictly from 'nid'
1448 */
1449static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
1450 struct vm_area_struct *vma, unsigned long addr, int nid)
1451{
1452 int order = huge_page_order(h);
1453 gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
1454 unsigned int cpuset_mems_cookie;
1455
1456 /*
1457 * We need a VMA to get a memory policy. If we do not
1458 * have one, we use the 'nid' argument.
1459 *
1460 * The mempolicy stuff below has some non-inlined bits
1461 * and calls ->vm_ops. That makes it hard to optimize at
1462 * compile-time, even when NUMA is off and it does
1463 * nothing. This helps the compiler optimize it out.
1464 */
1465 if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
1466 /*
1467 * If a specific node is requested, make sure to
1468 * get memory from there, but only when a node
1469 * is explicitly specified.
1470 */
1471 if (nid != NUMA_NO_NODE)
1472 gfp |= __GFP_THISNODE;
1473 /*
1474 * Make sure to call something that can handle
1475 * nid=NUMA_NO_NODE
1476 */
1477 return alloc_pages_node(nid, gfp, order);
1478 }
1479
1480 /*
1481 * OK, so we have a VMA. Fetch the mempolicy and try to
1482 * allocate a huge page with it. We will only reach this
1483 * when CONFIG_NUMA=y.
1484 */
1485 do {
1486 struct page *page;
1487 struct mempolicy *mpol;
1488 struct zonelist *zl;
1489 nodemask_t *nodemask;
1490
1491 cpuset_mems_cookie = read_mems_allowed_begin();
1492 zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
1493 mpol_cond_put(mpol);
1494 page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
1495 if (page)
1496 return page;
1497 } while (read_mems_allowed_retry(cpuset_mems_cookie));
1498
1499 return NULL;
1500}
1501
1502/*
1503 * There are two ways to allocate a huge page:
1504 * 1. When you have a VMA and an address (like a fault)
1505 * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
1506 *
1507 * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
1508 * this case which signifies that the allocation should be done with
1509 * respect for the VMA's memory policy.
1510 *
1511 * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
1512 * implies that memory policies will not be taken in to account.
1513 */
1514static struct page *__alloc_buddy_huge_page(struct hstate *h,
1515 struct vm_area_struct *vma, unsigned long addr, int nid)
1441{ 1516{
1442 struct page *page; 1517 struct page *page;
1443 unsigned int r_nid; 1518 unsigned int r_nid;
@@ -1446,6 +1521,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1446 return NULL; 1521 return NULL;
1447 1522
1448 /* 1523 /*
1524 * Make sure that anyone specifying 'nid' is not also specifying a VMA.
1525 * This makes sure the caller is picking _one_ of the modes with which
1526 * we can call this function, not both.
1527 */
1528 if (vma || (addr != -1)) {
1529 VM_WARN_ON_ONCE(addr == -1);
1530 VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
1531 }
1532 /*
1449 * Assume we will successfully allocate the surplus page to 1533 * Assume we will successfully allocate the surplus page to
1450 * prevent racing processes from causing the surplus to exceed 1534 * prevent racing processes from causing the surplus to exceed
1451 * overcommit 1535 * overcommit
@@ -1478,14 +1562,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1478 } 1562 }
1479 spin_unlock(&hugetlb_lock); 1563 spin_unlock(&hugetlb_lock);
1480 1564
1481 if (nid == NUMA_NO_NODE) 1565 page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
1482 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
1483 __GFP_REPEAT|__GFP_NOWARN,
1484 huge_page_order(h));
1485 else
1486 page = __alloc_pages_node(nid,
1487 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1488 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1489 1566
1490 spin_lock(&hugetlb_lock); 1567 spin_lock(&hugetlb_lock);
1491 if (page) { 1568 if (page) {
@@ -1510,6 +1587,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1510} 1587}
1511 1588
1512/* 1589/*
1590 * Allocate a huge page from 'nid'. Note, 'nid' may be
1591 * NUMA_NO_NODE, which means that it may be allocated
1592 * anywhere.
1593 */
1594static
1595struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
1596{
1597 unsigned long addr = -1;
1598
1599 return __alloc_buddy_huge_page(h, NULL, addr, nid);
1600}
1601
1602/*
1603 * Use the VMA's mpolicy to allocate a huge page from the buddy.
1604 */
1605static
1606struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
1607 struct vm_area_struct *vma, unsigned long addr)
1608{
1609 return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
1610}
1611
1612/*
1513 * This allocation function is useful in the context where vma is irrelevant. 1613 * This allocation function is useful in the context where vma is irrelevant.
1514 * E.g. soft-offlining uses this function because it only cares physical 1614 * E.g. soft-offlining uses this function because it only cares physical
1515 * address of error page. 1615 * address of error page.
@@ -1524,7 +1624,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
1524 spin_unlock(&hugetlb_lock); 1624 spin_unlock(&hugetlb_lock);
1525 1625
1526 if (!page) 1626 if (!page)
1527 page = alloc_buddy_huge_page(h, nid); 1627 page = __alloc_buddy_huge_page_no_mpol(h, nid);
1528 1628
1529 return page; 1629 return page;
1530} 1630}
@@ -1554,7 +1654,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
1554retry: 1654retry:
1555 spin_unlock(&hugetlb_lock); 1655 spin_unlock(&hugetlb_lock);
1556 for (i = 0; i < needed; i++) { 1656 for (i = 0; i < needed; i++) {
1557 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1657 page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
1558 if (!page) { 1658 if (!page) {
1559 alloc_ok = false; 1659 alloc_ok = false;
1560 break; 1660 break;
@@ -1787,7 +1887,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
1787 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 1887 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
1788 if (!page) { 1888 if (!page) {
1789 spin_unlock(&hugetlb_lock); 1889 spin_unlock(&hugetlb_lock);
1790 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1890 page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
1791 if (!page) 1891 if (!page)
1792 goto out_uncharge_cgroup; 1892 goto out_uncharge_cgroup;
1793 1893
@@ -2376,7 +2476,7 @@ struct node_hstate {
2376 struct kobject *hugepages_kobj; 2476 struct kobject *hugepages_kobj;
2377 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2477 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2378}; 2478};
2379struct node_hstate node_hstates[MAX_NUMNODES]; 2479static struct node_hstate node_hstates[MAX_NUMNODES];
2380 2480
2381/* 2481/*
2382 * A subset of global hstate attributes for node devices 2482 * A subset of global hstate attributes for node devices
@@ -2790,6 +2890,12 @@ void hugetlb_show_meminfo(void)
2790 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2890 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2791} 2891}
2792 2892
2893void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
2894{
2895 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
2896 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
2897}
2898
2793/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2899/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2794unsigned long hugetlb_total_pages(void) 2900unsigned long hugetlb_total_pages(void)
2795{ 2901{
@@ -3025,6 +3131,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3025 get_page(ptepage); 3131 get_page(ptepage);
3026 page_dup_rmap(ptepage); 3132 page_dup_rmap(ptepage);
3027 set_huge_pte_at(dst, addr, dst_pte, entry); 3133 set_huge_pte_at(dst, addr, dst_pte, entry);
3134 hugetlb_count_add(pages_per_huge_page(h), dst);
3028 } 3135 }
3029 spin_unlock(src_ptl); 3136 spin_unlock(src_ptl);
3030 spin_unlock(dst_ptl); 3137 spin_unlock(dst_ptl);
@@ -3105,6 +3212,7 @@ again:
3105 if (huge_pte_dirty(pte)) 3212 if (huge_pte_dirty(pte))
3106 set_page_dirty(page); 3213 set_page_dirty(page);
3107 3214
3215 hugetlb_count_sub(pages_per_huge_page(h), mm);
3108 page_remove_rmap(page); 3216 page_remove_rmap(page);
3109 force_flush = !__tlb_remove_page(tlb, page); 3217 force_flush = !__tlb_remove_page(tlb, page);
3110 if (force_flush) { 3218 if (force_flush) {
@@ -3509,6 +3617,7 @@ retry:
3509 && (vma->vm_flags & VM_SHARED))); 3617 && (vma->vm_flags & VM_SHARED)));
3510 set_huge_pte_at(mm, address, ptep, new_pte); 3618 set_huge_pte_at(mm, address, ptep, new_pte);
3511 3619
3620 hugetlb_count_add(pages_per_huge_page(h), mm);
3512 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3621 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3513 /* Optimization, do the COW without a second fault */ 3622 /* Optimization, do the COW without a second fault */
3514 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); 3623 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -4028,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
4028 unsigned long s_end = sbase + PUD_SIZE; 4137 unsigned long s_end = sbase + PUD_SIZE;
4029 4138
4030 /* Allow segments to share if only one is marked locked */ 4139 /* Allow segments to share if only one is marked locked */
4031 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; 4140 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
4032 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; 4141 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
4033 4142
4034 /* 4143 /*
4035 * match the virtual addresses, permission and the alignment of the 4144 * match the virtual addresses, permission and the alignment of the
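
The hugetlb_count_add()/hugetlb_count_sub() calls and hugetlb_report_usage() above maintain a per-mm counter of hugetlb pages; elsewhere in this series (fs/proc/task_mmu.c in the diffstat) it is surfaced as a HugetlbPages: line in /proc/<pid>/status. A hedged sketch of reading it from userspace, assuming that field name:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("/proc/self/status");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "HugetlbPages:", 13)) {
			fputs(line, stdout);	/* e.g. "HugetlbPages:  2048 kB" */
			break;
		}
	}
	fclose(f);
	return 0;
}
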
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 6e0057439a46..33d59abe91f1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -186,7 +186,8 @@ again:
186 } 186 }
187 rcu_read_unlock(); 187 rcu_read_unlock();
188 188
189 ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); 189 if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
190 ret = -ENOMEM;
190 css_put(&h_cg->css); 191 css_put(&h_cg->css);
191done: 192done:
192 *ptr = h_cg; 193 *ptr = h_cg;
diff --git a/mm/internal.h b/mm/internal.h
index bc0fa9a69e46..d4b807d6c963 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page);
271extern void clear_page_mlock(struct page *page); 271extern void clear_page_mlock(struct page *page);
272 272
273/* 273/*
274 * mlock_migrate_page - called only from migrate_page_copy() to 274 * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
275 * migrate the Mlocked page flag; update statistics. 275 * (because that does not go through the full procedure of migration ptes):
276 * to migrate the Mlocked page flag; update statistics.
276 */ 277 */
277static inline void mlock_migrate_page(struct page *newpage, struct page *page) 278static inline void mlock_migrate_page(struct page *newpage, struct page *page)
278{ 279{
279 if (TestClearPageMlocked(page)) { 280 if (TestClearPageMlocked(page)) {
280 unsigned long flags;
281 int nr_pages = hpage_nr_pages(page); 281 int nr_pages = hpage_nr_pages(page);
282 282
283 local_irq_save(flags); 283 /* Holding pmd lock, no change in irq context: __mod is safe */
284 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 284 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
285 SetPageMlocked(newpage); 285 SetPageMlocked(newpage);
286 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); 286 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
287 local_irq_restore(flags);
288 } 287 }
289} 288}
290 289
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 8da211411b57..d41b21bce6a0 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -4,7 +4,7 @@
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 4 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
6 * 6 *
7 * Some of code borrowed from https://github.com/xairy/linux by 7 * Some code borrowed from https://github.com/xairy/kasan-prototype by
8 * Andrey Konovalov <adech.fo@gmail.com> 8 * Andrey Konovalov <adech.fo@gmail.com>
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
@@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr)
86 if (memory_is_poisoned_1(addr + 1)) 86 if (memory_is_poisoned_1(addr + 1))
87 return true; 87 return true;
88 88
89 /*
90 * If single shadow byte covers 2-byte access, we don't
91 * need to do anything more. Otherwise, test the first
92 * shadow byte.
93 */
89 if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) 94 if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
90 return false; 95 return false;
91 96
@@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr)
103 if (memory_is_poisoned_1(addr + 3)) 108 if (memory_is_poisoned_1(addr + 3))
104 return true; 109 return true;
105 110
111 /*
112 * If single shadow byte covers 4-byte access, we don't
113 * need to do anything more. Otherwise, test the first
114 * shadow byte.
115 */
106 if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) 116 if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
107 return false; 117 return false;
108 118
@@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr)
120 if (memory_is_poisoned_1(addr + 7)) 130 if (memory_is_poisoned_1(addr + 7))
121 return true; 131 return true;
122 132
123 if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) 133 /*
134 * If single shadow byte covers 8-byte access, we don't
135 * need to do anything more. Otherwise, test the first
136 * shadow byte.
137 */
138 if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
124 return false; 139 return false;
125 140
126 return unlikely(*(u8 *)shadow_addr); 141 return unlikely(*(u8 *)shadow_addr);
@@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
139 if (unlikely(shadow_first_bytes)) 154 if (unlikely(shadow_first_bytes))
140 return true; 155 return true;
141 156
142 if (likely(IS_ALIGNED(addr, 8))) 157 /*
158 * If two shadow bytes covers 16-byte access, we don't
159 * need to do anything more. Otherwise, test the last
160 * shadow byte.
161 */
162 if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
143 return false; 163 return false;
144 164
145 return memory_is_poisoned_1(addr + 15); 165 return memory_is_poisoned_1(addr + 15);
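The memory_is_poisoned_{2,4,8,16}() changes above all encode the same idea: one shadow byte covers KASAN_SHADOW_SCALE_SIZE (8) bytes, so an aligned access is fully described by its first shadow byte, while an unaligned access can spill into the next shadow cell and needs an extra check. A standalone userspace approximation of the 8-byte case (the shadow array and the test values are made up; only the shadow encoding itself follows KASAN):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define SHADOW_SCALE_SHIFT 3
#define SHADOW_SCALE_SIZE  (1UL << SHADOW_SCALE_SHIFT)

static int8_t shadow[16];	/* covers a pretend 128-byte region */

static int8_t *mem_to_shadow(unsigned long addr)
{
	return &shadow[addr >> SHADOW_SCALE_SHIFT];
}

static bool byte_is_poisoned(unsigned long addr)
{
	int8_t val = *mem_to_shadow(addr);

	/* 0 = fully addressable, 1..7 = first N bytes valid, negative = redzone */
	if (val)
		return (int8_t)(addr & (SHADOW_SCALE_SIZE - 1)) >= val;
	return false;
}

static bool poisoned_8(unsigned long addr)
{
	if (byte_is_poisoned(addr) || byte_is_poisoned(addr + 7))
		return true;
	/*
	 * If addr is 8-byte aligned, a single shadow byte covered the whole
	 * access and the checks above were already sufficient.
	 */
	if (addr % SHADOW_SCALE_SIZE == 0)
		return false;
	/* unaligned: the first cell must be fully addressable as well */
	return *mem_to_shadow(addr) != 0;
}

int main(void)
{
	shadow[1] = 4;	/* bytes 8..11 valid, 12..15 poisoned */
	printf("access at 0 poisoned: %d\n", poisoned_8(0));
	printf("access at 6 poisoned: %d\n", poisoned_8(6));	/* spills into cell 1 */
	return 0;
}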
@@ -203,7 +223,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
203 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); 223 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
204 224
205 if (unlikely(ret != (unsigned long)last_shadow || 225 if (unlikely(ret != (unsigned long)last_shadow ||
206 ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) 226 ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
207 return true; 227 return true;
208 } 228 }
209 return false; 229 return false;
@@ -235,18 +255,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
235static __always_inline void check_memory_region(unsigned long addr, 255static __always_inline void check_memory_region(unsigned long addr,
236 size_t size, bool write) 256 size_t size, bool write)
237{ 257{
238 struct kasan_access_info info;
239
240 if (unlikely(size == 0)) 258 if (unlikely(size == 0))
241 return; 259 return;
242 260
243 if (unlikely((void *)addr < 261 if (unlikely((void *)addr <
244 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { 262 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
245 info.access_addr = (void *)addr; 263 kasan_report(addr, size, write, _RET_IP_);
246 info.access_size = size;
247 info.is_write = write;
248 info.ip = _RET_IP_;
249 kasan_report_user_access(&info);
250 return; 264 return;
251 } 265 }
252 266
@@ -524,7 +538,7 @@ static int kasan_mem_notifier(struct notifier_block *nb,
524 538
525static int __init kasan_memhotplug_init(void) 539static int __init kasan_memhotplug_init(void)
526{ 540{
527 pr_err("WARNING: KASan doesn't support memory hot-add\n"); 541 pr_err("WARNING: KASAN doesn't support memory hot-add\n");
528 pr_err("Memory hot-add will be disabled\n"); 542 pr_err("Memory hot-add will be disabled\n");
529 543
530 hotplug_memory_notifier(kasan_mem_notifier, 0); 544 hotplug_memory_notifier(kasan_mem_notifier, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c242adf6bc85..4f6c62e5c21e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -54,16 +54,13 @@ struct kasan_global {
54#endif 54#endif
55}; 55};
56 56
57void kasan_report_error(struct kasan_access_info *info);
58void kasan_report_user_access(struct kasan_access_info *info);
59
60static inline const void *kasan_shadow_to_mem(const void *shadow_addr) 57static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
61{ 58{
62 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) 59 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
63 << KASAN_SHADOW_SCALE_SHIFT); 60 << KASAN_SHADOW_SCALE_SHIFT);
64} 61}
65 62
66static inline bool kasan_enabled(void) 63static inline bool kasan_report_enabled(void)
67{ 64{
68 return !current->kasan_depth; 65 return !current->kasan_depth;
69} 66}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index e07c94fbd0ac..12f222d0224b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -4,7 +4,7 @@
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 4 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
6 * 6 *
7 * Some of code borrowed from https://github.com/xairy/linux by 7 * Some code borrowed from https://github.com/xairy/kasan-prototype by
8 * Andrey Konovalov <adech.fo@gmail.com> 8 * Andrey Konovalov <adech.fo@gmail.com>
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/kasan.h> 24#include <linux/kasan.h>
25#include <linux/module.h>
25 26
26#include <asm/sections.h> 27#include <asm/sections.h>
27 28
@@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
48 49
49static void print_error_description(struct kasan_access_info *info) 50static void print_error_description(struct kasan_access_info *info)
50{ 51{
51 const char *bug_type = "unknown crash"; 52 const char *bug_type = "unknown-crash";
52 u8 shadow_val; 53 u8 *shadow_addr;
53 54
54 info->first_bad_addr = find_first_bad_addr(info->access_addr, 55 info->first_bad_addr = find_first_bad_addr(info->access_addr,
55 info->access_size); 56 info->access_size);
56 57
57 shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); 58 shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
58 59
59 switch (shadow_val) { 60 /*
60 case KASAN_FREE_PAGE: 61 * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
61 case KASAN_KMALLOC_FREE: 62 * at the next shadow byte to determine the type of the bad access.
62 bug_type = "use after free"; 63 */
64 if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
65 shadow_addr++;
66
67 switch (*shadow_addr) {
68 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
69 /*
70 * In theory it's still possible to see these shadow values
71 * due to a data race in the kernel code.
72 */
73 bug_type = "out-of-bounds";
63 break; 74 break;
64 case KASAN_PAGE_REDZONE: 75 case KASAN_PAGE_REDZONE:
65 case KASAN_KMALLOC_REDZONE: 76 case KASAN_KMALLOC_REDZONE:
77 bug_type = "slab-out-of-bounds";
78 break;
66 case KASAN_GLOBAL_REDZONE: 79 case KASAN_GLOBAL_REDZONE:
67 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: 80 bug_type = "global-out-of-bounds";
68 bug_type = "out of bounds access";
69 break; 81 break;
70 case KASAN_STACK_LEFT: 82 case KASAN_STACK_LEFT:
71 case KASAN_STACK_MID: 83 case KASAN_STACK_MID:
72 case KASAN_STACK_RIGHT: 84 case KASAN_STACK_RIGHT:
73 case KASAN_STACK_PARTIAL: 85 case KASAN_STACK_PARTIAL:
74 bug_type = "out of bounds on stack"; 86 bug_type = "stack-out-of-bounds";
87 break;
88 case KASAN_FREE_PAGE:
89 case KASAN_KMALLOC_FREE:
90 bug_type = "use-after-free";
75 break; 91 break;
76 } 92 }
77 93
78 pr_err("BUG: KASan: %s in %pS at addr %p\n", 94 pr_err("BUG: KASAN: %s in %pS at addr %p\n",
79 bug_type, (void *)info->ip, 95 bug_type, (void *)info->ip,
80 info->access_addr); 96 info->access_addr);
81 pr_err("%s of size %zu by task %s/%d\n", 97 pr_err("%s of size %zu by task %s/%d\n",
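To make the new bug-type naming concrete, here is a self-contained version of the shadow-value dispatch above. The numeric marker values are illustrative stand-ins for the KASAN_* constants defined in mm/kasan/kasan.h; only the [0, KASAN_SHADOW_SCALE_SIZE) "partially addressable" range is inherent to the shadow encoding.

#include <stdio.h>
#include <stdint.h>

#define SHADOW_SCALE_SIZE   8
#define FREE_PAGE           0xFF	/* assumed marker values, for illustration */
#define PAGE_REDZONE        0xFE
#define KMALLOC_REDZONE     0xFC
#define KMALLOC_FREE        0xFB
#define GLOBAL_REDZONE      0xFA
#define STACK_LEFT          0xF1
#define STACK_MID           0xF2
#define STACK_RIGHT         0xF3
#define STACK_PARTIAL       0xF4

static const char *bug_type(const uint8_t *shadow)
{
	/*
	 * A value in [1, scale) means the first bad byte lives in the next
	 * shadow cell, so that cell tells us what kind of region was hit.
	 */
	if (*shadow > 0 && *shadow <= SHADOW_SCALE_SIZE - 1)
		shadow++;

	switch (*shadow) {
	case 0 ... SHADOW_SCALE_SIZE - 1:	/* GNU case-range, as in the kernel */
		return "out-of-bounds";
	case PAGE_REDZONE:
	case KMALLOC_REDZONE:
		return "slab-out-of-bounds";
	case GLOBAL_REDZONE:
		return "global-out-of-bounds";
	case STACK_LEFT:
	case STACK_MID:
	case STACK_RIGHT:
	case STACK_PARTIAL:
		return "stack-out-of-bounds";
	case FREE_PAGE:
	case KMALLOC_FREE:
		return "use-after-free";
	}
	return "unknown-crash";
}

int main(void)
{
	uint8_t row[2] = { 0x04, KMALLOC_REDZONE };

	printf("%s\n", bug_type(row));	/* slab-out-of-bounds */
	return 0;
}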
@@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info)
85 101
86static inline bool kernel_or_module_addr(const void *addr) 102static inline bool kernel_or_module_addr(const void *addr)
87{ 103{
88 return (addr >= (void *)_stext && addr < (void *)_end) 104 if (addr >= (void *)_stext && addr < (void *)_end)
89 || (addr >= (void *)MODULES_VADDR 105 return true;
90 && addr < (void *)MODULES_END); 106 if (is_module_address((unsigned long)addr))
107 return true;
108 return false;
91} 109}
92 110
93static inline bool init_task_stack_addr(const void *addr) 111static inline bool init_task_stack_addr(const void *addr)
@@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr)
161 for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { 179 for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
162 const void *kaddr = kasan_shadow_to_mem(shadow_row); 180 const void *kaddr = kasan_shadow_to_mem(shadow_row);
163 char buffer[4 + (BITS_PER_LONG/8)*2]; 181 char buffer[4 + (BITS_PER_LONG/8)*2];
182 char shadow_buf[SHADOW_BYTES_PER_ROW];
164 183
165 snprintf(buffer, sizeof(buffer), 184 snprintf(buffer, sizeof(buffer),
166 (i == 0) ? ">%p: " : " %p: ", kaddr); 185 (i == 0) ? ">%p: " : " %p: ", kaddr);
167 186 /*
168 kasan_disable_current(); 187 * We should not pass a shadow pointer to generic
188 * function, because generic functions may try to
189 * access kasan mapping for the passed address.
190 */
191 memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
169 print_hex_dump(KERN_ERR, buffer, 192 print_hex_dump(KERN_ERR, buffer,
170 DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, 193 DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
171 shadow_row, SHADOW_BYTES_PER_ROW, 0); 194 shadow_buf, SHADOW_BYTES_PER_ROW, 0);
172 kasan_enable_current();
173 195
174 if (row_is_guilty(shadow_row, shadow)) 196 if (row_is_guilty(shadow_row, shadow))
175 pr_err("%*c\n", 197 pr_err("%*c\n",
@@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr)
182 204
183static DEFINE_SPINLOCK(report_lock); 205static DEFINE_SPINLOCK(report_lock);
184 206
185void kasan_report_error(struct kasan_access_info *info) 207static void kasan_report_error(struct kasan_access_info *info)
186{
187 unsigned long flags;
188
189 spin_lock_irqsave(&report_lock, flags);
190 pr_err("================================="
191 "=================================\n");
192 print_error_description(info);
193 print_address_description(info);
194 print_shadow_for_address(info->first_bad_addr);
195 pr_err("================================="
196 "=================================\n");
197 spin_unlock_irqrestore(&report_lock, flags);
198}
199
200void kasan_report_user_access(struct kasan_access_info *info)
201{ 208{
202 unsigned long flags; 209 unsigned long flags;
210 const char *bug_type;
203 211
212 /*
213 * Make sure we don't end up in loop.
214 */
215 kasan_disable_current();
204 spin_lock_irqsave(&report_lock, flags); 216 spin_lock_irqsave(&report_lock, flags);
205 pr_err("=================================" 217 pr_err("================================="
206 "=================================\n"); 218 "=================================\n");
207 pr_err("BUG: KASan: user-memory-access on address %p\n", 219 if (info->access_addr <
208 info->access_addr); 220 kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
209 pr_err("%s of size %zu by task %s/%d\n", 221 if ((unsigned long)info->access_addr < PAGE_SIZE)
210 info->is_write ? "Write" : "Read", 222 bug_type = "null-ptr-deref";
211 info->access_size, current->comm, task_pid_nr(current)); 223 else if ((unsigned long)info->access_addr < TASK_SIZE)
212 dump_stack(); 224 bug_type = "user-memory-access";
225 else
226 bug_type = "wild-memory-access";
227 pr_err("BUG: KASAN: %s on address %p\n",
228 bug_type, info->access_addr);
229 pr_err("%s of size %zu by task %s/%d\n",
230 info->is_write ? "Write" : "Read",
231 info->access_size, current->comm,
232 task_pid_nr(current));
233 dump_stack();
234 } else {
235 print_error_description(info);
236 print_address_description(info);
237 print_shadow_for_address(info->first_bad_addr);
238 }
213 pr_err("=================================" 239 pr_err("================================="
214 "=================================\n"); 240 "=================================\n");
241 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
215 spin_unlock_irqrestore(&report_lock, flags); 242 spin_unlock_irqrestore(&report_lock, flags);
243 kasan_enable_current();
216} 244}
217 245
218void kasan_report(unsigned long addr, size_t size, 246void kasan_report(unsigned long addr, size_t size,
@@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size,
220{ 248{
221 struct kasan_access_info info; 249 struct kasan_access_info info;
222 250
223 if (likely(!kasan_enabled())) 251 if (likely(!kasan_report_enabled()))
224 return; 252 return;
225 253
226 info.access_addr = (void *)addr; 254 info.access_addr = (void *)addr;
227 info.access_size = size; 255 info.access_size = size;
228 info.is_write = is_write; 256 info.is_write = is_write;
229 info.ip = ip; 257 info.ip = ip;
258
230 kasan_report_error(&info); 259 kasan_report_error(&info);
231} 260}
232 261
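The consolidated kasan_report_error() above now classifies accesses that have no shadow at all by their raw address. A minimal sketch of that classification, with PAGE_SIZE, TASK_SIZE and the shadowed-region start chosen as illustrative x86-64-style values rather than taken from any real config:

#include <stdio.h>

#define PAGE_SIZE        4096UL
#define TASK_SIZE        0x00007ffffffff000UL
#define SHADOW_MEM_START 0xffff800000000000UL   /* lowest address with shadow (assumed) */

static const char *classify(unsigned long addr)
{
	if (addr >= SHADOW_MEM_START)
		return "tracked kernel address (full shadow report possible)";
	if (addr < PAGE_SIZE)
		return "null-ptr-deref";
	if (addr < TASK_SIZE)
		return "user-memory-access";
	return "wild-memory-access";
}

int main(void)
{
	printf("%s\n", classify(0x10));
	printf("%s\n", classify(0x00007f0000000000UL));
	printf("%s\n", classify(0xffff000000000000UL));
	return 0;
}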
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 77191eccdc6f..19423a45d7d7 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object)
479static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 479static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
480{ 480{
481 unsigned long flags; 481 unsigned long flags;
482 struct kmemleak_object *object = NULL; 482 struct kmemleak_object *object;
483 483
484 rcu_read_lock(); 484 rcu_read_lock();
485 read_lock_irqsave(&kmemleak_lock, flags); 485 read_lock_irqsave(&kmemleak_lock, flags);
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdf..b5cd647daa52 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
475 flush_dcache_page(page); 475 flush_dcache_page(page);
476 } else { 476 } else {
477 put_page(page); 477 put_page(page);
478out: page = NULL; 478out:
479 page = NULL;
479 } 480 }
480 up_read(&mm->mmap_sem); 481 up_read(&mm->mmap_sem);
481 return page; 482 return page;
@@ -625,7 +626,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
625 unlock_page(page); 626 unlock_page(page);
626 put_page(page); 627 put_page(page);
627 628
628 if (stable_node->hlist.first) 629 if (!hlist_empty(&stable_node->hlist))
629 ksm_pages_sharing--; 630 ksm_pages_sharing--;
630 else 631 else
631 ksm_pages_shared--; 632 ksm_pages_shared--;
@@ -1021,8 +1022,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1021 if (page == kpage) /* ksm page forked */ 1022 if (page == kpage) /* ksm page forked */
1022 return 0; 1023 return 0;
1023 1024
1024 if (!(vma->vm_flags & VM_MERGEABLE))
1025 goto out;
1026 if (PageTransCompound(page) && page_trans_compound_anon_split(page)) 1025 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
1027 goto out; 1026 goto out;
1028 BUG_ON(PageTransCompound(page)); 1027 BUG_ON(PageTransCompound(page));
@@ -1087,10 +1086,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1087 int err = -EFAULT; 1086 int err = -EFAULT;
1088 1087
1089 down_read(&mm->mmap_sem); 1088 down_read(&mm->mmap_sem);
1090 if (ksm_test_exit(mm)) 1089 vma = find_mergeable_vma(mm, rmap_item->address);
1091 goto out; 1090 if (!vma)
1092 vma = find_vma(mm, rmap_item->address);
1093 if (!vma || vma->vm_start > rmap_item->address)
1094 goto out; 1091 goto out;
1095 1092
1096 err = try_to_merge_one_page(vma, page, kpage); 1093 err = try_to_merge_one_page(vma, page, kpage);
@@ -1177,8 +1174,18 @@ again:
1177 cond_resched(); 1174 cond_resched();
1178 stable_node = rb_entry(*new, struct stable_node, node); 1175 stable_node = rb_entry(*new, struct stable_node, node);
1179 tree_page = get_ksm_page(stable_node, false); 1176 tree_page = get_ksm_page(stable_node, false);
1180 if (!tree_page) 1177 if (!tree_page) {
1181 return NULL; 1178 /*
1179 * If we walked over a stale stable_node,
1180 * get_ksm_page() will call rb_erase() and it
1181 * may rebalance the tree from under us. So
1182 * restart the search from scratch. Returning
1183 * NULL would be safe too, but we'd generate
1184 * false negative insertions just because some
1185 * stable_node was stale.
1186 */
1187 goto again;
1188 }
1182 1189
1183 ret = memcmp_pages(page, tree_page); 1190 ret = memcmp_pages(page, tree_page);
1184 put_page(tree_page); 1191 put_page(tree_page);
@@ -1254,12 +1261,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1254 unsigned long kpfn; 1261 unsigned long kpfn;
1255 struct rb_root *root; 1262 struct rb_root *root;
1256 struct rb_node **new; 1263 struct rb_node **new;
1257 struct rb_node *parent = NULL; 1264 struct rb_node *parent;
1258 struct stable_node *stable_node; 1265 struct stable_node *stable_node;
1259 1266
1260 kpfn = page_to_pfn(kpage); 1267 kpfn = page_to_pfn(kpage);
1261 nid = get_kpfn_nid(kpfn); 1268 nid = get_kpfn_nid(kpfn);
1262 root = root_stable_tree + nid; 1269 root = root_stable_tree + nid;
1270again:
1271 parent = NULL;
1263 new = &root->rb_node; 1272 new = &root->rb_node;
1264 1273
1265 while (*new) { 1274 while (*new) {
@@ -1269,8 +1278,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1269 cond_resched(); 1278 cond_resched();
1270 stable_node = rb_entry(*new, struct stable_node, node); 1279 stable_node = rb_entry(*new, struct stable_node, node);
1271 tree_page = get_ksm_page(stable_node, false); 1280 tree_page = get_ksm_page(stable_node, false);
1272 if (!tree_page) 1281 if (!tree_page) {
1273 return NULL; 1282 /*
1283 * If we walked over a stale stable_node,
1284 * get_ksm_page() will call rb_erase() and it
1285 * may rebalance the tree from under us. So
1286 * restart the search from scratch. Returning
1287 * NULL would be safe too, but we'd generate
1288 * false negative insertions just because some
1289 * stable_node was stale.
1290 */
1291 goto again;
1292 }
1274 1293
1275 ret = memcmp_pages(kpage, tree_page); 1294 ret = memcmp_pages(kpage, tree_page);
1276 put_page(tree_page); 1295 put_page(tree_page);
@@ -1340,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1340 cond_resched(); 1359 cond_resched();
1341 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1360 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1342 tree_page = get_mergeable_page(tree_rmap_item); 1361 tree_page = get_mergeable_page(tree_rmap_item);
1343 if (IS_ERR_OR_NULL(tree_page)) 1362 if (!tree_page)
1344 return NULL; 1363 return NULL;
1345 1364
1346 /* 1365 /*
@@ -1914,9 +1933,11 @@ again:
1914 struct anon_vma_chain *vmac; 1933 struct anon_vma_chain *vmac;
1915 struct vm_area_struct *vma; 1934 struct vm_area_struct *vma;
1916 1935
1936 cond_resched();
1917 anon_vma_lock_read(anon_vma); 1937 anon_vma_lock_read(anon_vma);
1918 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1938 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1919 0, ULONG_MAX) { 1939 0, ULONG_MAX) {
1940 cond_resched();
1920 vma = vmac->vma; 1941 vma = vmac->vma;
1921 if (rmap_item->address < vma->vm_start || 1942 if (rmap_item->address < vma->vm_start ||
1922 rmap_item->address >= vma->vm_end) 1943 rmap_item->address >= vma->vm_end)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index e1da19fac1b3..afc71ea9a381 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru)
42#ifdef CONFIG_MEMCG_KMEM 42#ifdef CONFIG_MEMCG_KMEM
43static inline bool list_lru_memcg_aware(struct list_lru *lru) 43static inline bool list_lru_memcg_aware(struct list_lru *lru)
44{ 44{
45 /*
46 * This needs node 0 to be always present, even
47 * in the systems supporting sparse numa ids.
48 */
45 return !!lru->node[0].memcg_lrus; 49 return !!lru->node[0].memcg_lrus;
46} 50}
47 51
@@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
59 return &nlru->lru; 63 return &nlru->lru;
60} 64}
61 65
66static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
67{
68 struct page *page;
69
70 if (!memcg_kmem_enabled())
71 return NULL;
72 page = virt_to_head_page(ptr);
73 return page->mem_cgroup;
74}
75
62static inline struct list_lru_one * 76static inline struct list_lru_one *
63list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) 77list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
64{ 78{
@@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
377{ 391{
378 int i; 392 int i;
379 393
380 for (i = 0; i < nr_node_ids; i++) { 394 if (!memcg_aware)
381 if (!memcg_aware) 395 return 0;
382 lru->node[i].memcg_lrus = NULL; 396
383 else if (memcg_init_list_lru_node(&lru->node[i])) 397 for_each_node(i) {
398 if (memcg_init_list_lru_node(&lru->node[i]))
384 goto fail; 399 goto fail;
385 } 400 }
386 return 0; 401 return 0;
387fail: 402fail:
388 for (i = i - 1; i >= 0; i--) 403 for (i = i - 1; i >= 0; i--) {
404 if (!lru->node[i].memcg_lrus)
405 continue;
389 memcg_destroy_list_lru_node(&lru->node[i]); 406 memcg_destroy_list_lru_node(&lru->node[i]);
407 }
390 return -ENOMEM; 408 return -ENOMEM;
391} 409}
392 410
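With for_each_node() the init loop may skip ids that are not present, so the unwind path has to tolerate holes, which is why the NULL check before memcg_destroy_list_lru_node() is added. A small userspace sketch of the same unwind shape (the id count, the "online" test and the per-slot allocation are made up):

#include <stdio.h>
#include <stdlib.h>

#define NR_IDS 8

struct slot {
	int *data;	/* NULL means "never initialized" */
};

static int id_is_online(int i)
{
	return i != 3 && i != 5;	/* pretend ids 3 and 5 are absent */
}

#define for_each_online_id(i) \
	for ((i) = 0; (i) < NR_IDS; (i)++) \
		if (id_is_online(i))

static int init_all(struct slot *slots)
{
	int i;

	for_each_online_id(i) {
		slots[i].data = calloc(16, sizeof(int));
		if (!slots[i].data)
			goto fail;
	}
	return 0;
fail:
	for (i = i - 1; i >= 0; i--) {
		if (!slots[i].data)	/* hole left by an offline id */
			continue;
		free(slots[i].data);
		slots[i].data = NULL;
	}
	return -1;
}

int main(void)
{
	struct slot slots[NR_IDS] = { 0 };

	printf("init: %d\n", init_all(slots));
	return 0;
}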
@@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
397 if (!list_lru_memcg_aware(lru)) 415 if (!list_lru_memcg_aware(lru))
398 return; 416 return;
399 417
400 for (i = 0; i < nr_node_ids; i++) 418 for_each_node(i)
401 memcg_destroy_list_lru_node(&lru->node[i]); 419 memcg_destroy_list_lru_node(&lru->node[i]);
402} 420}
403 421
@@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru,
409 if (!list_lru_memcg_aware(lru)) 427 if (!list_lru_memcg_aware(lru))
410 return 0; 428 return 0;
411 429
412 for (i = 0; i < nr_node_ids; i++) { 430 for_each_node(i) {
413 if (memcg_update_list_lru_node(&lru->node[i], 431 if (memcg_update_list_lru_node(&lru->node[i],
414 old_size, new_size)) 432 old_size, new_size))
415 goto fail; 433 goto fail;
416 } 434 }
417 return 0; 435 return 0;
418fail: 436fail:
419 for (i = i - 1; i >= 0; i--) 437 for (i = i - 1; i >= 0; i--) {
438 if (!lru->node[i].memcg_lrus)
439 continue;
440
420 memcg_cancel_update_list_lru_node(&lru->node[i], 441 memcg_cancel_update_list_lru_node(&lru->node[i],
421 old_size, new_size); 442 old_size, new_size);
443 }
422 return -ENOMEM; 444 return -ENOMEM;
423} 445}
424 446
@@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
430 if (!list_lru_memcg_aware(lru)) 452 if (!list_lru_memcg_aware(lru))
431 return; 453 return;
432 454
433 for (i = 0; i < nr_node_ids; i++) 455 for_each_node(i)
434 memcg_cancel_update_list_lru_node(&lru->node[i], 456 memcg_cancel_update_list_lru_node(&lru->node[i],
435 old_size, new_size); 457 old_size, new_size);
436} 458}
@@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru,
485 if (!list_lru_memcg_aware(lru)) 507 if (!list_lru_memcg_aware(lru))
486 return; 508 return;
487 509
488 for (i = 0; i < nr_node_ids; i++) 510 for_each_node(i)
489 memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); 511 memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
490} 512}
491 513
@@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
522 if (!lru->node) 544 if (!lru->node)
523 goto out; 545 goto out;
524 546
525 for (i = 0; i < nr_node_ids; i++) { 547 for_each_node(i) {
526 spin_lock_init(&lru->node[i].lock); 548 spin_lock_init(&lru->node[i].lock);
527 if (key) 549 if (key)
528 lockdep_set_class(&lru->node[i].lock, key); 550 lockdep_set_class(&lru->node[i].lock, key);
diff --git a/mm/maccess.c b/mm/maccess.c
index 34fe24759ed1..d159b1c96e48 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -13,6 +13,11 @@
13 * 13 *
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 *
17 * We ensure that the copy_from_user is executed in atomic context so that
18 * do_page_fault() doesn't attempt to take mmap_sem. This makes
19 * probe_kernel_read() suitable for use within regions where the caller
20 * already holds mmap_sem, or other locks which nest inside mmap_sem.
16 */ 21 */
17 22
18long __weak probe_kernel_read(void *dst, const void *src, size_t size) 23long __weak probe_kernel_read(void *dst, const void *src, size_t size)
@@ -99,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
99 pagefault_enable(); 104 pagefault_enable();
100 set_fs(old_fs); 105 set_fs(old_fs);
101 106
102 return ret < 0 ? ret : src - unsafe_addr; 107 return ret ? -EFAULT : src - unsafe_addr;
103} 108}
diff --git a/mm/memblock.c b/mm/memblock.c
index 1c7b647e5897..d300f1329814 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
706 return 0; 706 return 0;
707} 707}
708 708
709int __init_memblock memblock_remove_range(struct memblock_type *type, 709static int __init_memblock memblock_remove_range(struct memblock_type *type,
710 phys_addr_t base, phys_addr_t size) 710 phys_addr_t base, phys_addr_t size)
711{ 711{
712 int start_rgn, end_rgn; 712 int start_rgn, end_rgn;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b732edfddb76..bc502e590366 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
62#include <linux/oom.h> 62#include <linux/oom.h>
63#include <linux/lockdep.h> 63#include <linux/lockdep.h>
64#include <linux/file.h> 64#include <linux/file.h>
65#include <linux/tracehook.h>
65#include "internal.h" 66#include "internal.h"
66#include <net/sock.h> 67#include <net/sock.h>
67#include <net/ip.h> 68#include <net/ip.h>
@@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1661 1662
1662static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1663static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1663{ 1664{
1664 if (!current->memcg_oom.may_oom) 1665 if (!current->memcg_may_oom)
1665 return; 1666 return;
1666 /* 1667 /*
1667 * We are in the middle of the charge context here, so we 1668 * We are in the middle of the charge context here, so we
@@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1678 * and when we know whether the fault was overall successful. 1679 * and when we know whether the fault was overall successful.
1679 */ 1680 */
1680 css_get(&memcg->css); 1681 css_get(&memcg->css);
1681 current->memcg_oom.memcg = memcg; 1682 current->memcg_in_oom = memcg;
1682 current->memcg_oom.gfp_mask = mask; 1683 current->memcg_oom_gfp_mask = mask;
1683 current->memcg_oom.order = order; 1684 current->memcg_oom_order = order;
1684} 1685}
1685 1686
1686/** 1687/**
@@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1702 */ 1703 */
1703bool mem_cgroup_oom_synchronize(bool handle) 1704bool mem_cgroup_oom_synchronize(bool handle)
1704{ 1705{
1705 struct mem_cgroup *memcg = current->memcg_oom.memcg; 1706 struct mem_cgroup *memcg = current->memcg_in_oom;
1706 struct oom_wait_info owait; 1707 struct oom_wait_info owait;
1707 bool locked; 1708 bool locked;
1708 1709
@@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
1730 if (locked && !memcg->oom_kill_disable) { 1731 if (locked && !memcg->oom_kill_disable) {
1731 mem_cgroup_unmark_under_oom(memcg); 1732 mem_cgroup_unmark_under_oom(memcg);
1732 finish_wait(&memcg_oom_waitq, &owait.wait); 1733 finish_wait(&memcg_oom_waitq, &owait.wait);
1733 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 1734 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1734 current->memcg_oom.order); 1735 current->memcg_oom_order);
1735 } else { 1736 } else {
1736 schedule(); 1737 schedule();
1737 mem_cgroup_unmark_under_oom(memcg); 1738 mem_cgroup_unmark_under_oom(memcg);
@@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1748 memcg_oom_recover(memcg); 1749 memcg_oom_recover(memcg);
1749 } 1750 }
1750cleanup: 1751cleanup:
1751 current->memcg_oom.memcg = NULL; 1752 current->memcg_in_oom = NULL;
1752 css_put(&memcg->css); 1753 css_put(&memcg->css);
1753 return true; 1754 return true;
1754} 1755}
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
1972 return NOTIFY_OK; 1973 return NOTIFY_OK;
1973} 1974}
1974 1975
1976/*
1977 * Scheduled by try_charge() to be executed from the userland return path
1978 * and reclaims memory over the high limit.
1979 */
1980void mem_cgroup_handle_over_high(void)
1981{
1982 unsigned int nr_pages = current->memcg_nr_pages_over_high;
1983 struct mem_cgroup *memcg, *pos;
1984
1985 if (likely(!nr_pages))
1986 return;
1987
1988 pos = memcg = get_mem_cgroup_from_mm(current->mm);
1989
1990 do {
1991 if (page_counter_read(&pos->memory) <= pos->high)
1992 continue;
1993 mem_cgroup_events(pos, MEMCG_HIGH, 1);
1994 try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
1995 } while ((pos = parent_mem_cgroup(pos)));
1996
1997 css_put(&memcg->css);
1998 current->memcg_nr_pages_over_high = 0;
1999}
2000
1975static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2001static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1976 unsigned int nr_pages) 2002 unsigned int nr_pages)
1977{ 2003{
@@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1982 unsigned long nr_reclaimed; 2008 unsigned long nr_reclaimed;
1983 bool may_swap = true; 2009 bool may_swap = true;
1984 bool drained = false; 2010 bool drained = false;
1985 int ret = 0;
1986 2011
1987 if (mem_cgroup_is_root(memcg)) 2012 if (mem_cgroup_is_root(memcg))
1988 goto done; 2013 return 0;
1989retry: 2014retry:
1990 if (consume_stock(memcg, nr_pages)) 2015 if (consume_stock(memcg, nr_pages))
1991 goto done; 2016 return 0;
1992 2017
1993 if (!do_swap_account || 2018 if (!do_swap_account ||
1994 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2019 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
1995 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2020 if (page_counter_try_charge(&memcg->memory, batch, &counter))
1996 goto done_restock; 2021 goto done_restock;
1997 if (do_swap_account) 2022 if (do_swap_account)
1998 page_counter_uncharge(&memcg->memsw, batch); 2023 page_counter_uncharge(&memcg->memsw, batch);
@@ -2016,7 +2041,7 @@ retry:
2016 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2041 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2017 fatal_signal_pending(current) || 2042 fatal_signal_pending(current) ||
2018 current->flags & PF_EXITING)) 2043 current->flags & PF_EXITING))
2019 goto bypass; 2044 goto force;
2020 2045
2021 if (unlikely(task_in_memcg_oom(current))) 2046 if (unlikely(task_in_memcg_oom(current)))
2022 goto nomem; 2047 goto nomem;
@@ -2062,38 +2087,54 @@ retry:
2062 goto retry; 2087 goto retry;
2063 2088
2064 if (gfp_mask & __GFP_NOFAIL) 2089 if (gfp_mask & __GFP_NOFAIL)
2065 goto bypass; 2090 goto force;
2066 2091
2067 if (fatal_signal_pending(current)) 2092 if (fatal_signal_pending(current))
2068 goto bypass; 2093 goto force;
2069 2094
2070 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); 2095 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2071 2096
2072 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2097 mem_cgroup_oom(mem_over_limit, gfp_mask,
2098 get_order(nr_pages * PAGE_SIZE));
2073nomem: 2099nomem:
2074 if (!(gfp_mask & __GFP_NOFAIL)) 2100 if (!(gfp_mask & __GFP_NOFAIL))
2075 return -ENOMEM; 2101 return -ENOMEM;
2076bypass: 2102force:
2077 return -EINTR; 2103 /*
2104 * The allocation either can't fail or will lead to more memory
2105 * being freed very soon. Allow memory usage go over the limit
2106 * temporarily by force charging it.
2107 */
2108 page_counter_charge(&memcg->memory, nr_pages);
2109 if (do_swap_account)
2110 page_counter_charge(&memcg->memsw, nr_pages);
2111 css_get_many(&memcg->css, nr_pages);
2112
2113 return 0;
2078 2114
2079done_restock: 2115done_restock:
2080 css_get_many(&memcg->css, batch); 2116 css_get_many(&memcg->css, batch);
2081 if (batch > nr_pages) 2117 if (batch > nr_pages)
2082 refill_stock(memcg, batch - nr_pages); 2118 refill_stock(memcg, batch - nr_pages);
2083 if (!(gfp_mask & __GFP_WAIT)) 2119
2084 goto done;
2085 /* 2120 /*
2086 * If the hierarchy is above the normal consumption range, 2121 * If the hierarchy is above the normal consumption range, schedule
2087 * make the charging task trim their excess contribution. 2122 * reclaim on returning to userland. We can perform reclaim here
2123 * if __GFP_WAIT but let's always punt for simplicity and so that
2124 * GFP_KERNEL can consistently be used during reclaim. @memcg is
2125 * not recorded as it most likely matches current's and won't
2126 * change in the meantime. As high limit is checked again before
2127 * reclaim, the cost of mismatch is negligible.
2088 */ 2128 */
2089 do { 2129 do {
2090 if (page_counter_read(&memcg->memory) <= memcg->high) 2130 if (page_counter_read(&memcg->memory) > memcg->high) {
2091 continue; 2131 current->memcg_nr_pages_over_high += nr_pages;
2092 mem_cgroup_events(memcg, MEMCG_HIGH, 1); 2132 set_notify_resume(current);
2093 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); 2133 break;
2134 }
2094 } while ((memcg = parent_mem_cgroup(memcg))); 2135 } while ((memcg = parent_mem_cgroup(memcg)));
2095done: 2136
2096 return ret; 2137 return 0;
2097} 2138}
2098 2139
2099static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2140static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
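The rewritten try_charge() above no longer bypasses charges to root: it either force-charges (for __GFP_NOFAIL and dying tasks) or, when the high limit is breached, records the overage in memcg_nr_pages_over_high and defers reclaim to the return-to-userland path via set_notify_resume(). A toy, single-threaded sketch of that defer-to-a-safe-point scheme (all names, limits and the "safe point" below are invented):

#include <stdio.h>
#include <stdbool.h>

static unsigned long usage, high_mark = 100;
static unsigned long pages_over_high;	/* analogue of memcg_nr_pages_over_high */
static bool resume_work_pending;	/* analogue of set_notify_resume() */

static void charge(unsigned long nr_pages)
{
	usage += nr_pages;		/* the charge itself always succeeds here */
	if (usage > high_mark) {
		pages_over_high += nr_pages;
		resume_work_pending = true;	/* never reclaim under locks */
	}
}

/* runs at the "return to userland" point, outside any critical section */
static void handle_over_high(void)
{
	if (!pages_over_high)
		return;
	printf("reclaiming ~%lu pages (usage %lu, high %lu)\n",
	       pages_over_high, usage, high_mark);
	usage -= pages_over_high;	/* stand-in for try_to_free_mem_cgroup_pages() */
	pages_over_high = 0;
	resume_work_pending = false;
}

int main(void)
{
	charge(60);
	charge(60);		/* second charge goes over the high mark */
	/* ... critical section ends, back at a safe point ... */
	if (resume_work_pending)
		handle_over_high();
	printf("final usage %lu\n", usage);
	return 0;
}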
@@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2174} 2215}
2175 2216
2176#ifdef CONFIG_MEMCG_KMEM 2217#ifdef CONFIG_MEMCG_KMEM
2177int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2178 unsigned long nr_pages)
2179{
2180 struct page_counter *counter;
2181 int ret = 0;
2182
2183 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2184 if (ret < 0)
2185 return ret;
2186
2187 ret = try_charge(memcg, gfp, nr_pages);
2188 if (ret == -EINTR) {
2189 /*
2190 * try_charge() chose to bypass to root due to OOM kill or
2191 * fatal signal. Since our only options are to either fail
2192 * the allocation or charge it to this cgroup, do it as a
2193 * temporary condition. But we can't fail. From a kmem/slab
2194 * perspective, the cache has already been selected, by
2195 * mem_cgroup_kmem_get_cache(), so it is too late to change
2196 * our minds.
2197 *
2198 * This condition will only trigger if the task entered
2199 * memcg_charge_kmem in a sane state, but was OOM-killed
2200 * during try_charge() above. Tasks that were already dying
2201 * when the allocation triggers should have been already
2202 * directed to the root cgroup in memcontrol.h
2203 */
2204 page_counter_charge(&memcg->memory, nr_pages);
2205 if (do_swap_account)
2206 page_counter_charge(&memcg->memsw, nr_pages);
2207 css_get_many(&memcg->css, nr_pages);
2208 ret = 0;
2209 } else if (ret)
2210 page_counter_uncharge(&memcg->kmem, nr_pages);
2211
2212 return ret;
2213}
2214
2215void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2216{
2217 page_counter_uncharge(&memcg->memory, nr_pages);
2218 if (do_swap_account)
2219 page_counter_uncharge(&memcg->memsw, nr_pages);
2220
2221 page_counter_uncharge(&memcg->kmem, nr_pages);
2222
2223 css_put_many(&memcg->css, nr_pages);
2224}
2225
2226static int memcg_alloc_cache_id(void) 2218static int memcg_alloc_cache_id(void)
2227{ 2219{
2228 int id, size; 2220 int id, size;
@@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2384 css_put(&cachep->memcg_params.memcg->css); 2376 css_put(&cachep->memcg_params.memcg->css);
2385} 2377}
2386 2378
2387/* 2379int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2388 * We need to verify if the allocation against current->mm->owner's memcg is 2380 struct mem_cgroup *memcg)
2389 * possible for the given order. But the page is not allocated yet, so we'll
2390 * need a further commit step to do the final arrangements.
2391 *
2392 * It is possible for the task to switch cgroups in this mean time, so at
2393 * commit time, we can't rely on task conversion any longer. We'll then use
2394 * the handle argument to return to the caller which cgroup we should commit
2395 * against. We could also return the memcg directly and avoid the pointer
2396 * passing, but a boolean return value gives better semantics considering
2397 * the compiled-out case as well.
2398 *
2399 * Returning true means the allocation is possible.
2400 */
2401bool
2402__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
2403{ 2381{
2404 struct mem_cgroup *memcg; 2382 unsigned int nr_pages = 1 << order;
2383 struct page_counter *counter;
2405 int ret; 2384 int ret;
2406 2385
2407 *_memcg = NULL; 2386 if (!memcg_kmem_is_active(memcg))
2387 return 0;
2408 2388
2409 memcg = get_mem_cgroup_from_mm(current->mm); 2389 if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
2390 return -ENOMEM;
2410 2391
2411 if (!memcg_kmem_is_active(memcg)) { 2392 ret = try_charge(memcg, gfp, nr_pages);
2412 css_put(&memcg->css); 2393 if (ret) {
2413 return true; 2394 page_counter_uncharge(&memcg->kmem, nr_pages);
2395 return ret;
2414 } 2396 }
2415 2397
2416 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2398 page->mem_cgroup = memcg;
2417 if (!ret)
2418 *_memcg = memcg;
2419 2399
2420 css_put(&memcg->css); 2400 return 0;
2421 return (ret == 0);
2422} 2401}
2423 2402
2424void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2403int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2425 int order)
2426{ 2404{
2427 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2405 struct mem_cgroup *memcg;
2406 int ret;
2428 2407
2429 /* The page allocation failed. Revert */ 2408 memcg = get_mem_cgroup_from_mm(current->mm);
2430 if (!page) { 2409 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2431 memcg_uncharge_kmem(memcg, 1 << order); 2410 css_put(&memcg->css);
2432 return; 2411 return ret;
2433 }
2434 page->mem_cgroup = memcg;
2435} 2412}
2436 2413
2437void __memcg_kmem_uncharge_pages(struct page *page, int order) 2414void __memcg_kmem_uncharge(struct page *page, int order)
2438{ 2415{
2439 struct mem_cgroup *memcg = page->mem_cgroup; 2416 struct mem_cgroup *memcg = page->mem_cgroup;
2417 unsigned int nr_pages = 1 << order;
2440 2418
2441 if (!memcg) 2419 if (!memcg)
2442 return; 2420 return;
2443 2421
2444 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2422 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2445 2423
2446 memcg_uncharge_kmem(memcg, 1 << order); 2424 page_counter_uncharge(&memcg->kmem, nr_pages);
2447 page->mem_cgroup = NULL; 2425 page_counter_uncharge(&memcg->memory, nr_pages);
2448} 2426 if (do_swap_account)
2449 2427 page_counter_uncharge(&memcg->memsw, nr_pages);
2450struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2451{
2452 struct mem_cgroup *memcg = NULL;
2453 struct kmem_cache *cachep;
2454 struct page *page;
2455
2456 page = virt_to_head_page(ptr);
2457 if (PageSlab(page)) {
2458 cachep = page->slab_cache;
2459 if (!is_root_cache(cachep))
2460 memcg = cachep->memcg_params.memcg;
2461 } else
2462 /* page allocated by alloc_kmem_pages */
2463 memcg = page->mem_cgroup;
2464 2428
2465 return memcg; 2429 page->mem_cgroup = NULL;
2430 css_put_many(&memcg->css, nr_pages);
2466} 2431}
2467#endif /* CONFIG_MEMCG_KMEM */ 2432#endif /* CONFIG_MEMCG_KMEM */
2468 2433
@@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
2836 return val; 2801 return val;
2837} 2802}
2838 2803
2839static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 2804static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2840{ 2805{
2841 u64 val; 2806 unsigned long val;
2842 2807
2843 if (mem_cgroup_is_root(memcg)) { 2808 if (mem_cgroup_is_root(memcg)) {
2844 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 2809 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2851 else 2816 else
2852 val = page_counter_read(&memcg->memsw); 2817 val = page_counter_read(&memcg->memsw);
2853 } 2818 }
2854 return val << PAGE_SHIFT; 2819 return val;
2855} 2820}
2856 2821
2857enum { 2822enum {
@@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2885 switch (MEMFILE_ATTR(cft->private)) { 2850 switch (MEMFILE_ATTR(cft->private)) {
2886 case RES_USAGE: 2851 case RES_USAGE:
2887 if (counter == &memcg->memory) 2852 if (counter == &memcg->memory)
2888 return mem_cgroup_usage(memcg, false); 2853 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2889 if (counter == &memcg->memsw) 2854 if (counter == &memcg->memsw)
2890 return mem_cgroup_usage(memcg, true); 2855 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2891 return (u64)page_counter_read(counter) * PAGE_SIZE; 2856 return (u64)page_counter_read(counter) * PAGE_SIZE;
2892 case RES_LIMIT: 2857 case RES_LIMIT:
2893 return (u64)counter->limit * PAGE_SIZE; 2858 return (u64)counter->limit * PAGE_SIZE;
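mem_cgroup_usage() now stays in page units and the conversion to bytes happens once at the read_u64 site, with an explicit widening cast. Whatever the primary motivation, the cast is what keeps the product from wrapping where unsigned long is 32 bits; a tiny standalone demonstration (page size and counts are illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint32_t pages = 2 * 1024 * 1024;	/* 8 GiB worth of 4 KiB pages */

	/* 32-bit arithmetic: the shift wraps around */
	uint32_t wrong = pages << PAGE_SHIFT;
	/* widen first, as the RES_USAGE path above does */
	uint64_t right = (uint64_t)pages * PAGE_SIZE;

	printf("32-bit shift: %u bytes\n", wrong);
	printf("64-bit mul : %llu bytes\n", (unsigned long long)right);
	return 0;
}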
@@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3387 ret = page_counter_memparse(args, "-1", &threshold); 3352 ret = page_counter_memparse(args, "-1", &threshold);
3388 if (ret) 3353 if (ret)
3389 return ret; 3354 return ret;
3390 threshold <<= PAGE_SHIFT;
3391 3355
3392 mutex_lock(&memcg->thresholds_lock); 3356 mutex_lock(&memcg->thresholds_lock);
3393 3357
@@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count)
4406 mc.precharge += count; 4370 mc.precharge += count;
4407 return ret; 4371 return ret;
4408 } 4372 }
4409 if (ret == -EINTR) {
4410 cancel_charge(root_mem_cgroup, count);
4411 return ret;
4412 }
4413 4373
4414 /* Try charges one by one with reclaim */ 4374 /* Try charges one by one with reclaim */
4415 while (count--) { 4375 while (count--) {
4416 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 4376 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
4417 /*
4418 * In case of failure, any residual charges against
4419 * mc.to will be dropped by mem_cgroup_clear_mc()
4420 * later on. However, cancel any charges that are
4421 * bypassed to root right away or they'll be lost.
4422 */
4423 if (ret == -EINTR)
4424 cancel_charge(root_mem_cgroup, 1);
4425 if (ret) 4377 if (ret)
4426 return ret; 4378 return ret;
4427 mc.precharge++; 4379 mc.precharge++;
@@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page,
4576 goto out; 4528 goto out;
4577 4529
4578 /* 4530 /*
4579 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 4531 * Prevent mem_cgroup_replace_page() from looking at
4580 * of its source page while we change it: page migration takes 4532 * page->mem_cgroup of its source page while we change it.
4581 * both pages off the LRU, but page cache replacement doesn't.
4582 */ 4533 */
4583 if (!trylock_page(page)) 4534 if (!trylock_page(page))
4584 goto out; 4535 goto out;
@@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5085static u64 memory_current_read(struct cgroup_subsys_state *css, 5036static u64 memory_current_read(struct cgroup_subsys_state *css,
5086 struct cftype *cft) 5037 struct cftype *cft)
5087{ 5038{
5088 return mem_cgroup_usage(mem_cgroup_from_css(css), false); 5039 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5040
5041 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5089} 5042}
5090 5043
5091static int memory_low_show(struct seq_file *m, void *v) 5044static int memory_low_show(struct seq_file *m, void *v)
@@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v)
5197static struct cftype memory_files[] = { 5150static struct cftype memory_files[] = {
5198 { 5151 {
5199 .name = "current", 5152 .name = "current",
5153 .flags = CFTYPE_NOT_ON_ROOT,
5200 .read_u64 = memory_current_read, 5154 .read_u64 = memory_current_read,
5201 }, 5155 },
5202 { 5156 {
@@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5340 ret = try_charge(memcg, gfp_mask, nr_pages); 5294 ret = try_charge(memcg, gfp_mask, nr_pages);
5341 5295
5342 css_put(&memcg->css); 5296 css_put(&memcg->css);
5343
5344 if (ret == -EINTR) {
5345 memcg = root_mem_cgroup;
5346 ret = 0;
5347 }
5348out: 5297out:
5349 *memcgp = memcg; 5298 *memcgp = memcg;
5350 return ret; 5299 return ret;
@@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
5559} 5508}
5560 5509
5561/** 5510/**
5562 * mem_cgroup_migrate - migrate a charge to another page 5511 * mem_cgroup_replace_page - migrate a charge to another page
5563 * @oldpage: currently charged page 5512 * @oldpage: currently charged page
5564 * @newpage: page to transfer the charge to 5513 * @newpage: page to transfer the charge to
5565 * @lrucare: either or both pages might be on the LRU already 5514 * @lrucare: either or both pages might be on the LRU already
@@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
5568 * 5517 *
5569 * Both pages must be locked, @newpage->mapping must be set up. 5518 * Both pages must be locked, @newpage->mapping must be set up.
5570 */ 5519 */
5571void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5520void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
5572 bool lrucare)
5573{ 5521{
5574 struct mem_cgroup *memcg; 5522 struct mem_cgroup *memcg;
5575 int isolated; 5523 int isolated;
5576 5524
5577 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5525 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5578 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5526 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5579 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
5580 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
5581 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5527 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5582 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5528 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5583 newpage); 5529 newpage);
@@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5589 if (newpage->mem_cgroup) 5535 if (newpage->mem_cgroup)
5590 return; 5536 return;
5591 5537
5592 /* 5538 /* Swapcache readahead pages can get replaced before being charged */
5593 * Swapcache readahead pages can get migrated before being
5594 * charged, and migration from compaction can happen to an
5595 * uncharged page when the PFN walker finds a page that
5596 * reclaim just put back on the LRU but has not released yet.
5597 */
5598 memcg = oldpage->mem_cgroup; 5539 memcg = oldpage->mem_cgroup;
5599 if (!memcg) 5540 if (!memcg)
5600 return; 5541 return;
5601 5542
5602 if (lrucare) 5543 lock_page_lru(oldpage, &isolated);
5603 lock_page_lru(oldpage, &isolated);
5604
5605 oldpage->mem_cgroup = NULL; 5544 oldpage->mem_cgroup = NULL;
5545 unlock_page_lru(oldpage, isolated);
5606 5546
5607 if (lrucare) 5547 commit_charge(newpage, memcg, true);
5608 unlock_page_lru(oldpage, isolated);
5609
5610 commit_charge(newpage, memcg, lrucare);
5611} 5548}
5612 5549
5613/* 5550/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 95882692e747..16a0ec385320 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
56#include <linux/memory_hotplug.h> 56#include <linux/memory_hotplug.h>
57#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
58#include <linux/kfifo.h> 58#include <linux/kfifo.h>
59#include <linux/ratelimit.h>
59#include "internal.h" 60#include "internal.h"
60#include "ras/ras_event.h" 61#include "ras/ras_event.h"
61 62
@@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void)
1403} 1404}
1404core_initcall(memory_failure_init); 1405core_initcall(memory_failure_init);
1405 1406
1407#define unpoison_pr_info(fmt, pfn, rs) \
1408({ \
1409 if (__ratelimit(rs)) \
1410 pr_info(fmt, pfn); \
1411})
1412
1406/** 1413/**
1407 * unpoison_memory - Unpoison a previously poisoned page 1414 * unpoison_memory - Unpoison a previously poisoned page
1408 * @pfn: Page number of the to be unpoisoned page 1415 * @pfn: Page number of the to be unpoisoned page
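unpoison_pr_info() above simply gates pr_info() behind a per-call-site ratelimit state so a flood of identical unpoison events does not spam the log. An equivalent userspace sketch using a hand-rolled window counter in place of DEFINE_RATELIMIT_STATE()/__ratelimit() (the interval and burst values are arbitrary):

#include <stdio.h>
#include <time.h>

struct ratelimit {
	time_t window_start;
	int interval;	/* seconds per window  */
	int burst;	/* messages per window */
	int printed;
};

static int ratelimit_ok(struct ratelimit *rs)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= rs->interval) {
		rs->window_start = now;
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return 0;
	rs->printed++;
	return 1;
}

#define log_ratelimited(rs, fmt, ...)			\
	do {						\
		if (ratelimit_ok(rs))			\
			printf(fmt, __VA_ARGS__);	\
	} while (0)

int main(void)
{
	struct ratelimit rs = { .interval = 5, .burst = 3 };

	for (int pfn = 0; pfn < 10; pfn++)
		log_ratelimited(&rs, "page %#x already unpoisoned\n", pfn);
	return 0;
}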
@@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn)
1421 struct page *p; 1428 struct page *p;
1422 int freeit = 0; 1429 int freeit = 0;
1423 unsigned int nr_pages; 1430 unsigned int nr_pages;
1431 static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
1432 DEFAULT_RATELIMIT_BURST);
1424 1433
1425 if (!pfn_valid(pfn)) 1434 if (!pfn_valid(pfn))
1426 return -ENXIO; 1435 return -ENXIO;
@@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn)
1429 page = compound_head(p); 1438 page = compound_head(p);
1430 1439
1431 if (!PageHWPoison(p)) { 1440 if (!PageHWPoison(p)) {
1432 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); 1441 unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
1442 pfn, &unpoison_rs);
1433 return 0; 1443 return 0;
1434 } 1444 }
1435 1445
1436 if (page_count(page) > 1) { 1446 if (page_count(page) > 1) {
1437 pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); 1447 unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
1448 pfn, &unpoison_rs);
1438 return 0; 1449 return 0;
1439 } 1450 }
1440 1451
1441 if (page_mapped(page)) { 1452 if (page_mapped(page)) {
1442 pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); 1453 unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
1454 pfn, &unpoison_rs);
1443 return 0; 1455 return 0;
1444 } 1456 }
1445 1457
1446 if (page_mapping(page)) { 1458 if (page_mapping(page)) {
1447 pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", 1459 unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
1448 pfn); 1460 pfn, &unpoison_rs);
1449 return 0; 1461 return 0;
1450 } 1462 }
1451 1463
@@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn)
1455 * In such case, we yield to memory_failure() and make unpoison fail. 1467 * In such case, we yield to memory_failure() and make unpoison fail.
1456 */ 1468 */
1457 if (!PageHuge(page) && PageTransHuge(page)) { 1469 if (!PageHuge(page) && PageTransHuge(page)) {
1458 pr_info("MCE: Memory failure is now running on %#lx\n", pfn); 1470 unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
1471 pfn, &unpoison_rs);
1459 return 0; 1472 return 0;
1460 } 1473 }
1461 1474
@@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn)
1469 * to the end. 1482 * to the end.
1470 */ 1483 */
1471 if (PageHuge(page)) { 1484 if (PageHuge(page)) {
1472 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1485 unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
1486 pfn, &unpoison_rs);
1473 return 0; 1487 return 0;
1474 } 1488 }
1475 if (TestClearPageHWPoison(p)) 1489 if (TestClearPageHWPoison(p))
1476 num_poisoned_pages_dec(); 1490 num_poisoned_pages_dec();
1477 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1491 unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
1492 pfn, &unpoison_rs);
1478 return 0; 1493 return 0;
1479 } 1494 }
1480 1495
@@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn)
1486 * the free buddy page pool. 1501 * the free buddy page pool.
1487 */ 1502 */
1488 if (TestClearPageHWPoison(page)) { 1503 if (TestClearPageHWPoison(page)) {
1489 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1504 unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
1505 pfn, &unpoison_rs);
1490 num_poisoned_pages_sub(nr_pages); 1506 num_poisoned_pages_sub(nr_pages);
1491 freeit = 1; 1507 freeit = 1;
1492 if (PageHuge(page)) 1508 if (PageHuge(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0780d118d26e..67d488ab495e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone,
339 unsigned long start_pfn, unsigned long num_pages) 339 unsigned long start_pfn, unsigned long num_pages)
340{ 340{
341 if (!zone_is_initialized(zone)) 341 if (!zone_is_initialized(zone))
342 return init_currently_empty_zone(zone, start_pfn, num_pages, 342 return init_currently_empty_zone(zone, start_pfn, num_pages);
343 MEMMAP_HOTPLUG); 343
344 return 0; 344 return 0;
345} 345}
346 346
diff --git a/mm/migrate.c b/mm/migrate.c
index 842ecd7aaf7f..2834faba719a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Memory Migration functionality - linux/mm/migration.c 2 * Memory Migration functionality - linux/mm/migrate.c
3 * 3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 * 5 *
@@ -30,7 +30,7 @@
30#include <linux/mempolicy.h> 30#include <linux/mempolicy.h>
31#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/backing-dev.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
@@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
171 else 171 else
172 page_add_file_rmap(new); 172 page_add_file_rmap(new);
173 173
174 if (vma->vm_flags & VM_LOCKED)
175 mlock_vma_page(new);
176
174 /* No need to invalidate - it was non-present before */ 177 /* No need to invalidate - it was non-present before */
175 update_mmu_cache(vma, addr, ptep); 178 update_mmu_cache(vma, addr, ptep);
176unlock: 179unlock:
@@ -311,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping,
311 struct buffer_head *head, enum migrate_mode mode, 314 struct buffer_head *head, enum migrate_mode mode,
312 int extra_count) 315 int extra_count)
313{ 316{
317 struct zone *oldzone, *newzone;
318 int dirty;
314 int expected_count = 1 + extra_count; 319 int expected_count = 1 + extra_count;
315 void **pslot; 320 void **pslot;
316 321
@@ -318,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping,
318 /* Anonymous page without mapping */ 323 /* Anonymous page without mapping */
319 if (page_count(page) != expected_count) 324 if (page_count(page) != expected_count)
320 return -EAGAIN; 325 return -EAGAIN;
326
327 /* No turning back from here */
328 set_page_memcg(newpage, page_memcg(page));
329 newpage->index = page->index;
330 newpage->mapping = page->mapping;
331 if (PageSwapBacked(page))
332 SetPageSwapBacked(newpage);
333
321 return MIGRATEPAGE_SUCCESS; 334 return MIGRATEPAGE_SUCCESS;
322 } 335 }
323 336
337 oldzone = page_zone(page);
338 newzone = page_zone(newpage);
339
324 spin_lock_irq(&mapping->tree_lock); 340 spin_lock_irq(&mapping->tree_lock);
325 341
326 pslot = radix_tree_lookup_slot(&mapping->page_tree, 342 pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -353,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping,
353 } 369 }
354 370
355 /* 371 /*
356 * Now we know that no one else is looking at the page. 372 * Now we know that no one else is looking at the page:
373 * no turning back from here.
357 */ 374 */
375 set_page_memcg(newpage, page_memcg(page));
376 newpage->index = page->index;
377 newpage->mapping = page->mapping;
378 if (PageSwapBacked(page))
379 SetPageSwapBacked(newpage);
380
358 get_page(newpage); /* add cache reference */ 381 get_page(newpage); /* add cache reference */
359 if (PageSwapCache(page)) { 382 if (PageSwapCache(page)) {
360 SetPageSwapCache(newpage); 383 SetPageSwapCache(newpage);
361 set_page_private(newpage, page_private(page)); 384 set_page_private(newpage, page_private(page));
362 } 385 }
363 386
387 /* Move dirty while page refs frozen and newpage not yet exposed */
388 dirty = PageDirty(page);
389 if (dirty) {
390 ClearPageDirty(page);
391 SetPageDirty(newpage);
392 }
393
364 radix_tree_replace_slot(pslot, newpage); 394 radix_tree_replace_slot(pslot, newpage);
365 395
366 /* 396 /*
@@ -370,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
370 */ 400 */
371 page_unfreeze_refs(page, expected_count - 1); 401 page_unfreeze_refs(page, expected_count - 1);
372 402
403 spin_unlock(&mapping->tree_lock);
404 /* Leave irq disabled to prevent preemption while updating stats */
405
373 /* 406 /*
374 * If moved to a different zone then also account 407 * If moved to a different zone then also account
375 * the page for that zone. Other VM counters will be 408 * the page for that zone. Other VM counters will be
@@ -380,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping,
380 * via NR_FILE_PAGES and NR_ANON_PAGES if they 413 * via NR_FILE_PAGES and NR_ANON_PAGES if they
381 * are mapped to swap space. 414 * are mapped to swap space.
382 */ 415 */
383 __dec_zone_page_state(page, NR_FILE_PAGES); 416 if (newzone != oldzone) {
384 __inc_zone_page_state(newpage, NR_FILE_PAGES); 417 __dec_zone_state(oldzone, NR_FILE_PAGES);
385 if (!PageSwapCache(page) && PageSwapBacked(page)) { 418 __inc_zone_state(newzone, NR_FILE_PAGES);
386 __dec_zone_page_state(page, NR_SHMEM); 419 if (PageSwapBacked(page) && !PageSwapCache(page)) {
387 __inc_zone_page_state(newpage, NR_SHMEM); 420 __dec_zone_state(oldzone, NR_SHMEM);
421 __inc_zone_state(newzone, NR_SHMEM);
422 }
423 if (dirty && mapping_cap_account_dirty(mapping)) {
424 __dec_zone_state(oldzone, NR_FILE_DIRTY);
425 __inc_zone_state(newzone, NR_FILE_DIRTY);
426 }
388 } 427 }
389 spin_unlock_irq(&mapping->tree_lock); 428 local_irq_enable();
390 429
391 return MIGRATEPAGE_SUCCESS; 430 return MIGRATEPAGE_SUCCESS;
392} 431}
@@ -401,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
401 int expected_count; 440 int expected_count;
402 void **pslot; 441 void **pslot;
403 442
404 if (!mapping) {
405 if (page_count(page) != 1)
406 return -EAGAIN;
407 return MIGRATEPAGE_SUCCESS;
408 }
409
410 spin_lock_irq(&mapping->tree_lock); 443 spin_lock_irq(&mapping->tree_lock);
411 444
412 pslot = radix_tree_lookup_slot(&mapping->page_tree, 445 pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -424,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
424 return -EAGAIN; 457 return -EAGAIN;
425 } 458 }
426 459
460 set_page_memcg(newpage, page_memcg(page));
461 newpage->index = page->index;
462 newpage->mapping = page->mapping;
427 get_page(newpage); 463 get_page(newpage);
428 464
429 radix_tree_replace_slot(pslot, newpage); 465 radix_tree_replace_slot(pslot, newpage);
@@ -510,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
510 if (PageMappedToDisk(page)) 546 if (PageMappedToDisk(page))
511 SetPageMappedToDisk(newpage); 547 SetPageMappedToDisk(newpage);
512 548
513 if (PageDirty(page)) { 549 /* Move dirty on pages not done by migrate_page_move_mapping() */
514 clear_page_dirty_for_io(page); 550 if (PageDirty(page))
515 /* 551 SetPageDirty(newpage);
516 * Want to mark the page and the radix tree as dirty, and
517 * redo the accounting that clear_page_dirty_for_io undid,
518 * but we can't use set_page_dirty because that function
519 * is actually a signal that all of the page has become dirty.
520 * Whereas only part of our page may be dirty.
521 */
522 if (PageSwapBacked(page))
523 SetPageDirty(newpage);
524 else
525 __set_page_dirty_nobuffers(newpage);
526 }
527 552
528 if (page_is_young(page)) 553 if (page_is_young(page))
529 set_page_young(newpage); 554 set_page_young(newpage);
@@ -537,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
537 cpupid = page_cpupid_xchg_last(page, -1); 562 cpupid = page_cpupid_xchg_last(page, -1);
538 page_cpupid_xchg_last(newpage, cpupid); 563 page_cpupid_xchg_last(newpage, cpupid);
539 564
540 mlock_migrate_page(newpage, page);
541 ksm_migrate_page(newpage, page); 565 ksm_migrate_page(newpage, page);
542 /* 566 /*
543 * Please do not reorder this without considering how mm/ksm.c's 567 * Please do not reorder this without considering how mm/ksm.c's
@@ -721,33 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping,
721 * MIGRATEPAGE_SUCCESS - success 745 * MIGRATEPAGE_SUCCESS - success
722 */ 746 */
723static int move_to_new_page(struct page *newpage, struct page *page, 747static int move_to_new_page(struct page *newpage, struct page *page,
724 int page_was_mapped, enum migrate_mode mode) 748 enum migrate_mode mode)
725{ 749{
726 struct address_space *mapping; 750 struct address_space *mapping;
727 int rc; 751 int rc;
728 752
729 /* 753 VM_BUG_ON_PAGE(!PageLocked(page), page);
730 * Block others from accessing the page when we get around to 754 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
731 * establishing additional references. We are the only one
732 * holding a reference to the new page at this point.
733 */
734 if (!trylock_page(newpage))
735 BUG();
736
737 /* Prepare mapping for the new page.*/
738 newpage->index = page->index;
739 newpage->mapping = page->mapping;
740 if (PageSwapBacked(page))
741 SetPageSwapBacked(newpage);
742
743 /*
744 * Indirectly called below, migrate_page_copy() copies PG_dirty and thus
745 * needs newpage's memcg set to transfer memcg dirty page accounting.
746 * So perform memcg migration in two steps:
747 * 1. set newpage->mem_cgroup (here)
748 * 2. clear page->mem_cgroup (below)
749 */
750 set_page_memcg(newpage, page_memcg(page));
751 755
752 mapping = page_mapping(page); 756 mapping = page_mapping(page);
753 if (!mapping) 757 if (!mapping)
@@ -759,23 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page,
759 * space which also has its own migratepage callback. This 763 * space which also has its own migratepage callback. This
760 * is the most common path for page migration. 764 * is the most common path for page migration.
761 */ 765 */
762 rc = mapping->a_ops->migratepage(mapping, 766 rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
763 newpage, page, mode);
764 else 767 else
765 rc = fallback_migrate_page(mapping, newpage, page, mode); 768 rc = fallback_migrate_page(mapping, newpage, page, mode);
766 769
767 if (rc != MIGRATEPAGE_SUCCESS) { 770 /*
768 set_page_memcg(newpage, NULL); 771 * When successful, old pagecache page->mapping must be cleared before
769 newpage->mapping = NULL; 772 * page is freed; but stats require that PageAnon be left as PageAnon.
770 } else { 773 */
774 if (rc == MIGRATEPAGE_SUCCESS) {
771 set_page_memcg(page, NULL); 775 set_page_memcg(page, NULL);
772 if (page_was_mapped) 776 if (!PageAnon(page))
773 remove_migration_ptes(page, newpage); 777 page->mapping = NULL;
774 page->mapping = NULL;
775 } 778 }
776
777 unlock_page(newpage);
778
779 return rc; 779 return rc;
780} 780}
781 781
@@ -824,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
824 goto out_unlock; 824 goto out_unlock;
825 wait_on_page_writeback(page); 825 wait_on_page_writeback(page);
826 } 826 }
827
827 /* 828 /*
828 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 829 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
829 * we cannot notice that anon_vma is freed while we migrate a page. 830
@@ -831,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
831 * of migration. File cache pages are no problem because of page_lock() 832 * of migration. File cache pages are no problem because of page_lock()
832 * File Caches may use write_page() or lock_page() in migration, then, 833 * File Caches may use write_page() or lock_page() in migration, then,
833 * just care Anon page here. 834 * just care Anon page here.
835 *
836 * Only page_get_anon_vma() understands the subtleties of
837 * getting a hold on an anon_vma from outside one of its mms.
838 * But if we cannot get anon_vma, then we won't need it anyway,
839 * because that implies that the anon page is no longer mapped
840 * (and cannot be remapped so long as we hold the page lock).
834 */ 841 */
835 if (PageAnon(page) && !PageKsm(page)) { 842 if (PageAnon(page) && !PageKsm(page))
836 /*
837 * Only page_lock_anon_vma_read() understands the subtleties of
838 * getting a hold on an anon_vma from outside one of its mms.
839 */
840 anon_vma = page_get_anon_vma(page); 843 anon_vma = page_get_anon_vma(page);
841 if (anon_vma) { 844
842 /* 845 /*
843 * Anon page 846 * Block others from accessing the new page when we get around to
844 */ 847 * establishing additional references. We are usually the only one
845 } else if (PageSwapCache(page)) { 848 * holding a reference to newpage at this point. We used to have a BUG
846 /* 849 * here if trylock_page(newpage) fails, but would like to allow for
847 * We cannot be sure that the anon_vma of an unmapped 850 * cases where there might be a race with the previous use of newpage.
848 * swapcache page is safe to use because we don't 851 * This is much like races on refcount of oldpage: just don't BUG().
849 * know in advance if the VMA that this page belonged 852 */
850 * to still exists. If the VMA and others sharing the 853 if (unlikely(!trylock_page(newpage)))
851 * data have been freed, then the anon_vma could 854 goto out_unlock;
852 * already be invalid.
853 *
854 * To avoid this possibility, swapcache pages get
855 * migrated but are not remapped when migration
856 * completes
857 */
858 } else {
859 goto out_unlock;
860 }
861 }
862 855
863 if (unlikely(isolated_balloon_page(page))) { 856 if (unlikely(isolated_balloon_page(page))) {
864 /* 857 /*
@@ -869,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
869 * the page migration right away (protected by page lock). 862
870 */ 863 */
871 rc = balloon_page_migrate(newpage, page, mode); 864 rc = balloon_page_migrate(newpage, page, mode);
872 goto out_unlock; 865 goto out_unlock_both;
873 } 866 }
874 867
875 /* 868 /*
@@ -888,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
888 VM_BUG_ON_PAGE(PageAnon(page), page); 881 VM_BUG_ON_PAGE(PageAnon(page), page);
889 if (page_has_private(page)) { 882 if (page_has_private(page)) {
890 try_to_free_buffers(page); 883 try_to_free_buffers(page);
891 goto out_unlock; 884 goto out_unlock_both;
892 } 885 }
893 goto skip_unmap; 886 } else if (page_mapped(page)) {
894 } 887 /* Establish migration ptes */
895 888 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
896 /* Establish migration ptes or remove ptes */ 889 page);
897 if (page_mapped(page)) {
898 try_to_unmap(page, 890 try_to_unmap(page,
899 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 891 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
900 page_was_mapped = 1; 892 page_was_mapped = 1;
901 } 893 }
902 894
903skip_unmap:
904 if (!page_mapped(page)) 895 if (!page_mapped(page))
905 rc = move_to_new_page(newpage, page, page_was_mapped, mode); 896 rc = move_to_new_page(newpage, page, mode);
906 897
907 if (rc && page_was_mapped) 898 if (page_was_mapped)
908 remove_migration_ptes(page, page); 899 remove_migration_ptes(page,
900 rc == MIGRATEPAGE_SUCCESS ? newpage : page);
909 901
902out_unlock_both:
903 unlock_page(newpage);
904out_unlock:
910 /* Drop an anon_vma reference if we took one */ 905 /* Drop an anon_vma reference if we took one */
911 if (anon_vma) 906 if (anon_vma)
912 put_anon_vma(anon_vma); 907 put_anon_vma(anon_vma);
913
914out_unlock:
915 unlock_page(page); 908 unlock_page(page);
916out: 909out:
917 return rc; 910 return rc;
@@ -937,10 +930,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
937 int force, enum migrate_mode mode, 930 int force, enum migrate_mode mode,
938 enum migrate_reason reason) 931 enum migrate_reason reason)
939{ 932{
940 int rc = 0; 933 int rc = MIGRATEPAGE_SUCCESS;
941 int *result = NULL; 934 int *result = NULL;
942 struct page *newpage = get_new_page(page, private, &result); 935 struct page *newpage;
943 936
937 newpage = get_new_page(page, private, &result);
944 if (!newpage) 938 if (!newpage)
945 return -ENOMEM; 939 return -ENOMEM;
946 940
@@ -954,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
954 goto out; 948 goto out;
955 949
956 rc = __unmap_and_move(page, newpage, force, mode); 950 rc = __unmap_and_move(page, newpage, force, mode);
951 if (rc == MIGRATEPAGE_SUCCESS)
952 put_new_page = NULL;
957 953
958out: 954out:
959 if (rc != -EAGAIN) { 955 if (rc != -EAGAIN) {
@@ -980,10 +976,9 @@ out:
980 * it. Otherwise, putback_lru_page() will drop the reference grabbed 976 * it. Otherwise, putback_lru_page() will drop the reference grabbed
981 * during isolation. 977 * during isolation.
982 */ 978 */
983 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { 979 if (put_new_page)
984 ClearPageSwapBacked(newpage);
985 put_new_page(newpage, private); 980 put_new_page(newpage, private);
986 } else if (unlikely(__is_movable_balloon_page(newpage))) { 981 else if (unlikely(__is_movable_balloon_page(newpage))) {
987 /* drop our reference, page already in the balloon */ 982 /* drop our reference, page already in the balloon */
988 put_page(newpage); 983 put_page(newpage);
989 } else 984 } else
@@ -1021,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1021 struct page *hpage, int force, 1016 struct page *hpage, int force,
1022 enum migrate_mode mode) 1017 enum migrate_mode mode)
1023{ 1018{
1024 int rc = 0; 1019 int rc = -EAGAIN;
1025 int *result = NULL; 1020 int *result = NULL;
1026 int page_was_mapped = 0; 1021 int page_was_mapped = 0;
1027 struct page *new_hpage; 1022 struct page *new_hpage;
@@ -1043,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1043 if (!new_hpage) 1038 if (!new_hpage)
1044 return -ENOMEM; 1039 return -ENOMEM;
1045 1040
1046 rc = -EAGAIN;
1047
1048 if (!trylock_page(hpage)) { 1041 if (!trylock_page(hpage)) {
1049 if (!force || mode != MIGRATE_SYNC) 1042 if (!force || mode != MIGRATE_SYNC)
1050 goto out; 1043 goto out;
@@ -1054,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1054 if (PageAnon(hpage)) 1047 if (PageAnon(hpage))
1055 anon_vma = page_get_anon_vma(hpage); 1048 anon_vma = page_get_anon_vma(hpage);
1056 1049
1050 if (unlikely(!trylock_page(new_hpage)))
1051 goto put_anon;
1052
1057 if (page_mapped(hpage)) { 1053 if (page_mapped(hpage)) {
1058 try_to_unmap(hpage, 1054 try_to_unmap(hpage,
1059 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1055 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
@@ -1061,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1061 } 1057 }
1062 1058
1063 if (!page_mapped(hpage)) 1059 if (!page_mapped(hpage))
1064 rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); 1060 rc = move_to_new_page(new_hpage, hpage, mode);
1061
1062 if (page_was_mapped)
1063 remove_migration_ptes(hpage,
1064 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
1065 1065
1066 if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) 1066 unlock_page(new_hpage);
1067 remove_migration_ptes(hpage, hpage);
1068 1067
1068put_anon:
1069 if (anon_vma) 1069 if (anon_vma)
1070 put_anon_vma(anon_vma); 1070 put_anon_vma(anon_vma);
1071 1071
1072 if (rc == MIGRATEPAGE_SUCCESS) 1072 if (rc == MIGRATEPAGE_SUCCESS) {
1073 hugetlb_cgroup_migrate(hpage, new_hpage); 1073 hugetlb_cgroup_migrate(hpage, new_hpage);
1074 put_new_page = NULL;
1075 }
1074 1076
1075 unlock_page(hpage); 1077 unlock_page(hpage);
1076out: 1078out:
@@ -1082,7 +1084,7 @@ out:
1082 * it. Otherwise, put_page() will drop the reference grabbed during 1084 * it. Otherwise, put_page() will drop the reference grabbed during
1083 * isolation. 1085 * isolation.
1084 */ 1086 */
1085 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) 1087 if (put_new_page)
1086 put_new_page(new_hpage, private); 1088 put_new_page(new_hpage, private);
1087 else 1089 else
1088 putback_active_hugepage(new_hpage); 1090 putback_active_hugepage(new_hpage);
@@ -1112,7 +1114,7 @@ out:
1112 * 1114 *
1113 * The function returns after 10 attempts or if no pages are movable any more 1115 * The function returns after 10 attempts or if no pages are movable any more
1114 * because the list has become empty or no retryable pages exist any more. 1116 * because the list has become empty or no retryable pages exist any more.
1115 * The caller should call putback_lru_pages() to return pages to the LRU 1117 * The caller should call putback_movable_pages() to return pages to the LRU
1116 * or free list only if ret != 0. 1118 * or free list only if ret != 0.
1117 * 1119 *
1118 * Returns the number of pages that were not migrated, or an error code. 1120 * Returns the number of pages that were not migrated, or an error code.
@@ -1169,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1169 } 1171 }
1170 } 1172 }
1171 } 1173 }
1172 rc = nr_failed + retry; 1174 nr_failed += retry;
1175 rc = nr_failed;
1173out: 1176out:
1174 if (nr_succeeded) 1177 if (nr_succeeded)
1175 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); 1178 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
@@ -1786,7 +1789,6 @@ fail_putback:
1786 SetPageActive(page); 1789 SetPageActive(page);
1787 if (TestClearPageUnevictable(new_page)) 1790 if (TestClearPageUnevictable(new_page))
1788 SetPageUnevictable(page); 1791 SetPageUnevictable(page);
1789 mlock_migrate_page(page, new_page);
1790 1792
1791 unlock_page(new_page); 1793 unlock_page(new_page);
1792 put_page(new_page); /* Free it */ 1794 put_page(new_page); /* Free it */
@@ -1828,8 +1830,9 @@ fail_putback:
1828 goto fail_putback; 1830 goto fail_putback;
1829 } 1831 }
1830 1832
1831 mem_cgroup_migrate(page, new_page, false); 1833 mlock_migrate_page(new_page, page);
1832 1834 set_page_memcg(new_page, page_memcg(page));
1835 set_page_memcg(page, NULL);
1833 page_remove_rmap(page); 1836 page_remove_rmap(page);
1834 1837
1835 spin_unlock(ptl); 1838 spin_unlock(ptl);
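
A quick way to exercise the mm/migrate.c path reworked above is the move_pages(2) syscall, which drives migrate_pages() for the calling process. The sketch below is not part of this series; it assumes libnuma's <numaif.h> is installed, that node 0 is a valid target, and it must be built with -lnuma:

#include <numaif.h>          /* move_pages(), MPOL_MF_MOVE */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED)
		return 1;
	memset(page, 0xaa, page_size);   /* fault the page in and dirty it */

	void *pages[1] = { page };
	int nodes[1] = { 0 };            /* assumed target node */
	int status[1];

	/* Ask the kernel to migrate the page; this drives migrate_pages(). */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) != 0)
		perror("move_pages");
	else
		printf("page now on node %d\n", status[0]);

	munmap(page, page_size);
	return 0;
}

An anonymous page like this one takes the early "Anonymous page without mapping" branch and has its dirty bit carried over by migrate_page_copy(); a file-backed or shmem page instead goes through the new frozen-refcount dirty transfer and, when it changes zone, the NR_FILE_DIRTY accounting added above.
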
diff --git a/mm/mincore.c b/mm/mincore.c
index be25efde64a4..14bb9fb37f0c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
234 234
235 /* This also avoids any overflows on PAGE_CACHE_ALIGN */ 235 /* This also avoids any overflows on PAGE_CACHE_ALIGN */
236 pages = len >> PAGE_SHIFT; 236 pages = len >> PAGE_SHIFT;
237 pages += (len & ~PAGE_MASK) != 0; 237 pages += (offset_in_page(len)) != 0;
238 238
239 if (!access_ok(VERIFY_WRITE, vec, pages)) 239 if (!access_ok(VERIFY_WRITE, vec, pages))
240 return -EFAULT; 240 return -EFAULT;
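
The mincore() change is one of many mechanical conversions in this series from open-coded "len & ~PAGE_MASK" to the offset_in_page() helper; behaviour is unchanged, the helper only names the mask. A small user-space stand-in of the same round-up (the kernel macro takes a single argument and masks with PAGE_MASK; the extra page_size parameter here exists only because user space learns the page size at run time):

#include <assert.h>
#include <stddef.h>
#include <unistd.h>

/* Stand-in for the kernel's offset_in_page(): byte offset within a page. */
static unsigned long offset_in_page(unsigned long v, unsigned long page_size)
{
	return v & (page_size - 1);      /* same bits as "v & ~PAGE_MASK" */
}

int main(void)
{
	unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);
	size_t len = 3 * page_size + 123;

	/* The round-up mincore() performs when sizing its result vector. */
	unsigned long pages = len / page_size;
	pages += offset_in_page(len, page_size) != 0;

	assert(pages == 4);
	return 0;
}
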
diff --git a/mm/mlock.c b/mm/mlock.c
index 25936680064f..339d9e0949b6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
422void munlock_vma_pages_range(struct vm_area_struct *vma, 422void munlock_vma_pages_range(struct vm_area_struct *vma,
423 unsigned long start, unsigned long end) 423 unsigned long start, unsigned long end)
424{ 424{
425 vma->vm_flags &= ~VM_LOCKED; 425 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
426 426
427 while (start < end) { 427 while (start < end) {
428 struct page *page = NULL; 428 struct page *page = NULL;
@@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
506 506
507 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 507 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
508 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 508 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
509 goto out; /* don't set VM_LOCKED, don't count */ 509 /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
510 goto out;
510 511
511 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 512 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
512 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 513 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
@@ -554,13 +555,14 @@ out:
554 return ret; 555 return ret;
555} 556}
556 557
557static int do_mlock(unsigned long start, size_t len, int on) 558static int apply_vma_lock_flags(unsigned long start, size_t len,
559 vm_flags_t flags)
558{ 560{
559 unsigned long nstart, end, tmp; 561 unsigned long nstart, end, tmp;
560 struct vm_area_struct * vma, * prev; 562 struct vm_area_struct * vma, * prev;
561 int error; 563 int error;
562 564
563 VM_BUG_ON(start & ~PAGE_MASK); 565 VM_BUG_ON(offset_in_page(start));
564 VM_BUG_ON(len != PAGE_ALIGN(len)); 566 VM_BUG_ON(len != PAGE_ALIGN(len));
565 end = start + len; 567 end = start + len;
566 if (end < start) 568 if (end < start)
@@ -576,14 +578,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
576 prev = vma; 578 prev = vma;
577 579
578 for (nstart = start ; ; ) { 580 for (nstart = start ; ; ) {
579 vm_flags_t newflags; 581 vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
580 582
581 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 583 newflags |= flags;
582
583 newflags = vma->vm_flags & ~VM_LOCKED;
584 if (on)
585 newflags |= VM_LOCKED;
586 584
585 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
587 tmp = vma->vm_end; 586 tmp = vma->vm_end;
588 if (tmp > end) 587 if (tmp > end)
589 tmp = end; 588 tmp = end;
@@ -605,7 +604,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
605 return error; 604 return error;
606} 605}
607 606
608SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 607static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
609{ 608{
610 unsigned long locked; 609 unsigned long locked;
611 unsigned long lock_limit; 610 unsigned long lock_limit;
@@ -616,7 +615,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
616 615
617 lru_add_drain_all(); /* flush pagevec */ 616 lru_add_drain_all(); /* flush pagevec */
618 617
619 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 618 len = PAGE_ALIGN(len + (offset_in_page(start)));
620 start &= PAGE_MASK; 619 start &= PAGE_MASK;
621 620
622 lock_limit = rlimit(RLIMIT_MEMLOCK); 621 lock_limit = rlimit(RLIMIT_MEMLOCK);
@@ -629,7 +628,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
629 628
630 /* check against resource limits */ 629 /* check against resource limits */
631 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 630 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
632 error = do_mlock(start, len, 1); 631 error = apply_vma_lock_flags(start, len, flags);
633 632
634 up_write(&current->mm->mmap_sem); 633 up_write(&current->mm->mmap_sem);
635 if (error) 634 if (error)
@@ -641,37 +640,75 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
641 return 0; 640 return 0;
642} 641}
643 642
643SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
644{
645 return do_mlock(start, len, VM_LOCKED);
646}
647
648SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
649{
650 vm_flags_t vm_flags = VM_LOCKED;
651
652 if (flags & ~MLOCK_ONFAULT)
653 return -EINVAL;
654
655 if (flags & MLOCK_ONFAULT)
656 vm_flags |= VM_LOCKONFAULT;
657
658 return do_mlock(start, len, vm_flags);
659}
660
644SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) 661SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
645{ 662{
646 int ret; 663 int ret;
647 664
648 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 665 len = PAGE_ALIGN(len + (offset_in_page(start)));
649 start &= PAGE_MASK; 666 start &= PAGE_MASK;
650 667
651 down_write(&current->mm->mmap_sem); 668 down_write(&current->mm->mmap_sem);
652 ret = do_mlock(start, len, 0); 669 ret = apply_vma_lock_flags(start, len, 0);
653 up_write(&current->mm->mmap_sem); 670 up_write(&current->mm->mmap_sem);
654 671
655 return ret; 672 return ret;
656} 673}
657 674
658static int do_mlockall(int flags) 675/*
676 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
677 * and translate into the appropriate modifications to mm->def_flags and/or the
678 * flags for all current VMAs.
679 *
680 * There are a couple of subtleties with this. If mlockall() is called multiple
681 * times with different flags, the values do not necessarily stack. If mlockall
682 * is called once including the MCL_FUTURE flag and then a second time without
683 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
684 */
685static int apply_mlockall_flags(int flags)
659{ 686{
660 struct vm_area_struct * vma, * prev = NULL; 687 struct vm_area_struct * vma, * prev = NULL;
688 vm_flags_t to_add = 0;
661 689
662 if (flags & MCL_FUTURE) 690 current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
691 if (flags & MCL_FUTURE) {
663 current->mm->def_flags |= VM_LOCKED; 692 current->mm->def_flags |= VM_LOCKED;
664 else 693
665 current->mm->def_flags &= ~VM_LOCKED; 694 if (flags & MCL_ONFAULT)
666 if (flags == MCL_FUTURE) 695 current->mm->def_flags |= VM_LOCKONFAULT;
667 goto out; 696
697 if (!(flags & MCL_CURRENT))
698 goto out;
699 }
700
701 if (flags & MCL_CURRENT) {
702 to_add |= VM_LOCKED;
703 if (flags & MCL_ONFAULT)
704 to_add |= VM_LOCKONFAULT;
705 }
668 706
669 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 707 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
670 vm_flags_t newflags; 708 vm_flags_t newflags;
671 709
672 newflags = vma->vm_flags & ~VM_LOCKED; 710 newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
673 if (flags & MCL_CURRENT) 711 newflags |= to_add;
674 newflags |= VM_LOCKED;
675 712
676 /* Ignore errors */ 713 /* Ignore errors */
677 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 714 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -684,14 +721,13 @@ out:
684SYSCALL_DEFINE1(mlockall, int, flags) 721SYSCALL_DEFINE1(mlockall, int, flags)
685{ 722{
686 unsigned long lock_limit; 723 unsigned long lock_limit;
687 int ret = -EINVAL; 724 int ret;
688 725
689 if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) 726 if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
690 goto out; 727 return -EINVAL;
691 728
692 ret = -EPERM;
693 if (!can_do_mlock()) 729 if (!can_do_mlock())
694 goto out; 730 return -EPERM;
695 731
696 if (flags & MCL_CURRENT) 732 if (flags & MCL_CURRENT)
697 lru_add_drain_all(); /* flush pagevec */ 733 lru_add_drain_all(); /* flush pagevec */
@@ -704,11 +740,11 @@ SYSCALL_DEFINE1(mlockall, int, flags)
704 740
705 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 741 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
706 capable(CAP_IPC_LOCK)) 742 capable(CAP_IPC_LOCK))
707 ret = do_mlockall(flags); 743 ret = apply_mlockall_flags(flags);
708 up_write(&current->mm->mmap_sem); 744 up_write(&current->mm->mmap_sem);
709 if (!ret && (flags & MCL_CURRENT)) 745 if (!ret && (flags & MCL_CURRENT))
710 mm_populate(0, TASK_SIZE); 746 mm_populate(0, TASK_SIZE);
711out: 747
712 return ret; 748 return ret;
713} 749}
714 750
@@ -717,7 +753,7 @@ SYSCALL_DEFINE0(munlockall)
717 int ret; 753 int ret;
718 754
719 down_write(&current->mm->mmap_sem); 755 down_write(&current->mm->mmap_sem);
720 ret = do_mlockall(0); 756 ret = apply_mlockall_flags(0);
721 up_write(&current->mm->mmap_sem); 757 up_write(&current->mm->mmap_sem);
722 return ret; 758 return ret;
723} 759}
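
The mm/mlock.c changes add the mlock2() system call and the MCL_ONFAULT mode for mlockall(): with MLOCK_ONFAULT/MCL_ONFAULT the range is not populated up front, each page is locked only when it is first faulted in. A user-space sketch calling the raw syscall, since libc wrappers for mlock2() did not exist when this series went in; MLOCK_ONFAULT comes from the uapi headers added here, and __NR_mlock2 = 325 is the x86_64 number from this series' syscall table, an assumption on other architectures:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT	0x01	/* from the new uapi mman.h */
#endif
#ifndef __NR_mlock2
#define __NR_mlock2	325	/* x86_64; other architectures differ */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB reservation */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/*
	 * Lock on fault: nothing is populated here; each page is mlocked
	 * only when first touched, so locked RSS grows with actual use.
	 */
	if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT) != 0) {
		fprintf(stderr, "mlock2: %s\n", strerror(errno));
		return 1;
	}

	memset(buf, 0, 4096);		/* first page becomes resident and locked */

	munlock(buf, len);
	munmap(buf, len);
	return 0;
}

mlockall(MCL_CURRENT | MCL_ONFAULT) applies the same lazy behaviour to every existing mapping, and adding MCL_FUTURE extends it to later mappings through mm->def_flags, as the apply_mlockall_flags() rework above shows.
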
diff --git a/mm/mmap.c b/mm/mmap.c
index 79bcc9f92e48..2ce04a649f6b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1302 * that it represents a valid section of the address space. 1302 * that it represents a valid section of the address space.
1303 */ 1303 */
1304 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1304 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1305 if (addr & ~PAGE_MASK) 1305 if (offset_in_page(addr))
1306 return addr; 1306 return addr;
1307 1307
1308 /* Do simple checking here so the lower-level routines won't have 1308 /* Do simple checking here so the lower-level routines won't have
@@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1412 unsigned long, fd, unsigned long, pgoff) 1412 unsigned long, fd, unsigned long, pgoff)
1413{ 1413{
1414 struct file *file = NULL; 1414 struct file *file = NULL;
1415 unsigned long retval = -EBADF; 1415 unsigned long retval;
1416 1416
1417 if (!(flags & MAP_ANONYMOUS)) { 1417 if (!(flags & MAP_ANONYMOUS)) {
1418 audit_mmap_fd(fd, flags); 1418 audit_mmap_fd(fd, flags);
1419 file = fget(fd); 1419 file = fget(fd);
1420 if (!file) 1420 if (!file)
1421 goto out; 1421 return -EBADF;
1422 if (is_file_hugepages(file)) 1422 if (is_file_hugepages(file))
1423 len = ALIGN(len, huge_page_size(hstate_file(file))); 1423 len = ALIGN(len, huge_page_size(hstate_file(file)));
1424 retval = -EINVAL; 1424 retval = -EINVAL;
@@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1453out_fput: 1453out_fput:
1454 if (file) 1454 if (file)
1455 fput(file); 1455 fput(file);
1456out:
1457 return retval; 1456 return retval;
1458} 1457}
1459 1458
@@ -1473,7 +1472,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1473 1472
1474 if (copy_from_user(&a, arg, sizeof(a))) 1473 if (copy_from_user(&a, arg, sizeof(a)))
1475 return -EFAULT; 1474 return -EFAULT;
1476 if (a.offset & ~PAGE_MASK) 1475 if (offset_in_page(a.offset))
1477 return -EINVAL; 1476 return -EINVAL;
1478 1477
1479 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1478 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1562,7 +1561,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1562 } 1561 }
1563 1562
1564 /* Clear old maps */ 1563 /* Clear old maps */
1565 error = -ENOMEM;
1566 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 1564 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1567 &rb_parent)) { 1565 &rb_parent)) {
1568 if (do_munmap(mm, addr, len)) 1566 if (do_munmap(mm, addr, len))
@@ -1663,7 +1661,7 @@ out:
1663 vma == get_gate_vma(current->mm))) 1661 vma == get_gate_vma(current->mm)))
1664 mm->locked_vm += (len >> PAGE_SHIFT); 1662 mm->locked_vm += (len >> PAGE_SHIFT);
1665 else 1663 else
1666 vma->vm_flags &= ~VM_LOCKED; 1664 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1667 } 1665 }
1668 1666
1669 if (file) 1667 if (file)
@@ -1989,7 +1987,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1989 * can happen with large stack limits and large mmap() 1987 * can happen with large stack limits and large mmap()
1990 * allocations. 1988 * allocations.
1991 */ 1989 */
1992 if (addr & ~PAGE_MASK) { 1990 if (offset_in_page(addr)) {
1993 VM_BUG_ON(addr != -ENOMEM); 1991 VM_BUG_ON(addr != -ENOMEM);
1994 info.flags = 0; 1992 info.flags = 0;
1995 info.low_limit = TASK_UNMAPPED_BASE; 1993 info.low_limit = TASK_UNMAPPED_BASE;
@@ -2025,7 +2023,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2025 2023
2026 if (addr > TASK_SIZE - len) 2024 if (addr > TASK_SIZE - len)
2027 return -ENOMEM; 2025 return -ENOMEM;
2028 if (addr & ~PAGE_MASK) 2026 if (offset_in_page(addr))
2029 return -EINVAL; 2027 return -EINVAL;
2030 2028
2031 addr = arch_rebalance_pgtables(addr, len); 2029 addr = arch_rebalance_pgtables(addr, len);
@@ -2047,7 +2045,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2047 return vma; 2045 return vma;
2048 2046
2049 rb_node = mm->mm_rb.rb_node; 2047 rb_node = mm->mm_rb.rb_node;
2050 vma = NULL;
2051 2048
2052 while (rb_node) { 2049 while (rb_node) {
2053 struct vm_area_struct *tmp; 2050 struct vm_area_struct *tmp;
@@ -2139,10 +2136,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2139 if (security_vm_enough_memory_mm(mm, grow)) 2136 if (security_vm_enough_memory_mm(mm, grow))
2140 return -ENOMEM; 2137 return -ENOMEM;
2141 2138
2142 /* Ok, everything looks good - let it rip */
2143 if (vma->vm_flags & VM_LOCKED)
2144 mm->locked_vm += grow;
2145 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2146 return 0; 2139 return 0;
2147} 2140}
2148 2141
@@ -2153,6 +2146,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2153 */ 2146 */
2154int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2147int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2155{ 2148{
2149 struct mm_struct *mm = vma->vm_mm;
2156 int error; 2150 int error;
2157 2151
2158 if (!(vma->vm_flags & VM_GROWSUP)) 2152 if (!(vma->vm_flags & VM_GROWSUP))
@@ -2202,15 +2196,19 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2202 * So, we reuse mm->page_table_lock to guard 2196 * So, we reuse mm->page_table_lock to guard
2203 * against concurrent vma expansions. 2197 * against concurrent vma expansions.
2204 */ 2198 */
2205 spin_lock(&vma->vm_mm->page_table_lock); 2199 spin_lock(&mm->page_table_lock);
2200 if (vma->vm_flags & VM_LOCKED)
2201 mm->locked_vm += grow;
2202 vm_stat_account(mm, vma->vm_flags,
2203 vma->vm_file, grow);
2206 anon_vma_interval_tree_pre_update_vma(vma); 2204 anon_vma_interval_tree_pre_update_vma(vma);
2207 vma->vm_end = address; 2205 vma->vm_end = address;
2208 anon_vma_interval_tree_post_update_vma(vma); 2206 anon_vma_interval_tree_post_update_vma(vma);
2209 if (vma->vm_next) 2207 if (vma->vm_next)
2210 vma_gap_update(vma->vm_next); 2208 vma_gap_update(vma->vm_next);
2211 else 2209 else
2212 vma->vm_mm->highest_vm_end = address; 2210 mm->highest_vm_end = address;
2213 spin_unlock(&vma->vm_mm->page_table_lock); 2211 spin_unlock(&mm->page_table_lock);
2214 2212
2215 perf_event_mmap(vma); 2213 perf_event_mmap(vma);
2216 } 2214 }
@@ -2218,7 +2216,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2218 } 2216 }
2219 vma_unlock_anon_vma(vma); 2217 vma_unlock_anon_vma(vma);
2220 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2218 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2221 validate_mm(vma->vm_mm); 2219 validate_mm(mm);
2222 return error; 2220 return error;
2223} 2221}
2224#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2222#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2229,6 +2227,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2229int expand_downwards(struct vm_area_struct *vma, 2227int expand_downwards(struct vm_area_struct *vma,
2230 unsigned long address) 2228 unsigned long address)
2231{ 2229{
2230 struct mm_struct *mm = vma->vm_mm;
2232 int error; 2231 int error;
2233 2232
2234 /* 2233 /*
@@ -2273,13 +2272,17 @@ int expand_downwards(struct vm_area_struct *vma,
2273 * So, we reuse mm->page_table_lock to guard 2272 * So, we reuse mm->page_table_lock to guard
2274 * against concurrent vma expansions. 2273 * against concurrent vma expansions.
2275 */ 2274 */
2276 spin_lock(&vma->vm_mm->page_table_lock); 2275 spin_lock(&mm->page_table_lock);
2276 if (vma->vm_flags & VM_LOCKED)
2277 mm->locked_vm += grow;
2278 vm_stat_account(mm, vma->vm_flags,
2279 vma->vm_file, grow);
2277 anon_vma_interval_tree_pre_update_vma(vma); 2280 anon_vma_interval_tree_pre_update_vma(vma);
2278 vma->vm_start = address; 2281 vma->vm_start = address;
2279 vma->vm_pgoff -= grow; 2282 vma->vm_pgoff -= grow;
2280 anon_vma_interval_tree_post_update_vma(vma); 2283 anon_vma_interval_tree_post_update_vma(vma);
2281 vma_gap_update(vma); 2284 vma_gap_update(vma);
2282 spin_unlock(&vma->vm_mm->page_table_lock); 2285 spin_unlock(&mm->page_table_lock);
2283 2286
2284 perf_event_mmap(vma); 2287 perf_event_mmap(vma);
2285 } 2288 }
@@ -2287,7 +2290,7 @@ int expand_downwards(struct vm_area_struct *vma,
2287 } 2290 }
2288 vma_unlock_anon_vma(vma); 2291 vma_unlock_anon_vma(vma);
2289 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2292 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2290 validate_mm(vma->vm_mm); 2293 validate_mm(mm);
2291 return error; 2294 return error;
2292} 2295}
2293 2296
@@ -2536,7 +2539,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2536 unsigned long end; 2539 unsigned long end;
2537 struct vm_area_struct *vma, *prev, *last; 2540 struct vm_area_struct *vma, *prev, *last;
2538 2541
2539 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2542 if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
2540 return -EINVAL; 2543 return -EINVAL;
2541 2544
2542 len = PAGE_ALIGN(len); 2545 len = PAGE_ALIGN(len);
@@ -2734,7 +2737,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2734 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2737 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2735 2738
2736 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2739 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2737 if (error & ~PAGE_MASK) 2740 if (offset_in_page(error))
2738 return error; 2741 return error;
2739 2742
2740 error = mlock_future_check(mm, mm->def_flags, len); 2743 error = mlock_future_check(mm, mm->def_flags, len);
@@ -3049,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma,
3049static struct vm_area_struct *__install_special_mapping( 3052static struct vm_area_struct *__install_special_mapping(
3050 struct mm_struct *mm, 3053 struct mm_struct *mm,
3051 unsigned long addr, unsigned long len, 3054 unsigned long addr, unsigned long len,
3052 unsigned long vm_flags, const struct vm_operations_struct *ops, 3055 unsigned long vm_flags, void *priv,
3053 void *priv) 3056 const struct vm_operations_struct *ops)
3054{ 3057{
3055 int ret; 3058 int ret;
3056 struct vm_area_struct *vma; 3059 struct vm_area_struct *vma;
@@ -3099,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping(
3099 unsigned long addr, unsigned long len, 3102 unsigned long addr, unsigned long len,
3100 unsigned long vm_flags, const struct vm_special_mapping *spec) 3103 unsigned long vm_flags, const struct vm_special_mapping *spec)
3101{ 3104{
3102 return __install_special_mapping(mm, addr, len, vm_flags, 3105 return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
3103 &special_mapping_vmops, (void *)spec); 3106 &special_mapping_vmops);
3104} 3107}
3105 3108
3106int install_special_mapping(struct mm_struct *mm, 3109int install_special_mapping(struct mm_struct *mm,
@@ -3108,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm,
3108 unsigned long vm_flags, struct page **pages) 3111 unsigned long vm_flags, struct page **pages)
3109{ 3112{
3110 struct vm_area_struct *vma = __install_special_mapping( 3113 struct vm_area_struct *vma = __install_special_mapping(
3111 mm, addr, len, vm_flags, &legacy_special_mapping_vmops, 3114 mm, addr, len, vm_flags, (void *)pages,
3112 (void *)pages); 3115 &legacy_special_mapping_vmops);
3113 3116
3114 return PTR_ERR_OR_ZERO(vma); 3117 return PTR_ERR_OR_ZERO(vma);
3115} 3118}
diff --git a/mm/mremap.c b/mm/mremap.c
index 5a71cce8c6ea..c25bc6268e46 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
401 unsigned long charged = 0; 401 unsigned long charged = 0;
402 unsigned long map_flags; 402 unsigned long map_flags;
403 403
404 if (new_addr & ~PAGE_MASK) 404 if (offset_in_page(new_addr))
405 goto out; 405 goto out;
406 406
407 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) 407 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
@@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
435 ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + 435 ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
436 ((addr - vma->vm_start) >> PAGE_SHIFT), 436 ((addr - vma->vm_start) >> PAGE_SHIFT),
437 map_flags); 437 map_flags);
438 if (ret & ~PAGE_MASK) 438 if (offset_in_page(ret))
439 goto out1; 439 goto out1;
440 440
441 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); 441 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
442 if (!(ret & ~PAGE_MASK)) 442 if (!(offset_in_page(ret)))
443 goto out; 443 goto out;
444out1: 444out1:
445 vm_unacct_memory(charged); 445 vm_unacct_memory(charged);
@@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
484 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) 484 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
485 return ret; 485 return ret;
486 486
487 if (addr & ~PAGE_MASK) 487 if (offset_in_page(addr))
488 return ret; 488 return ret;
489 489
490 old_len = PAGE_ALIGN(old_len); 490 old_len = PAGE_ALIGN(old_len);
@@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
566 vma->vm_pgoff + 566 vma->vm_pgoff +
567 ((addr - vma->vm_start) >> PAGE_SHIFT), 567 ((addr - vma->vm_start) >> PAGE_SHIFT),
568 map_flags); 568 map_flags);
569 if (new_addr & ~PAGE_MASK) { 569 if (offset_in_page(new_addr)) {
570 ret = new_addr; 570 ret = new_addr;
571 goto out; 571 goto out;
572 } 572 }
@@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
574 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); 574 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
575 } 575 }
576out: 576out:
577 if (ret & ~PAGE_MASK) { 577 if (offset_in_page(ret)) {
578 vm_unacct_memory(charged); 578 vm_unacct_memory(charged);
579 locked = 0; 579 locked = 0;
580 } 580 }
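
The mremap.c conversions rely on the same property the old "& ~PAGE_MASK" tests did: get_unmapped_area() and move_vma() return either a page-aligned address or a negative errno value cast to unsigned long, and an errno encoding always has a non-zero offset within a page, so offset_in_page(ret) doubles as the error check. A tiny demonstration of the property, assuming 4 KiB pages for the sake of the example:

#include <assert.h>
#include <errno.h>

int main(void)
{
	unsigned long page_mask = ~0xfffUL;	/* 4 KiB pages assumed */
	unsigned long ok  = 0x7f0000000000UL;	/* a plausible mapping address */
	unsigned long err = (unsigned long)-ENOMEM;

	/* Successful results are page aligned; errno encodings never are. */
	assert((ok  & ~page_mask) == 0);
	assert((err & ~page_mask) != 0);
	return 0;
}
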
diff --git a/mm/msync.c b/mm/msync.c
index bb04d53ae852..24e612fefa04 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
38 38
39 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 39 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
40 goto out; 40 goto out;
41 if (start & ~PAGE_MASK) 41 if (offset_in_page(start))
42 goto out; 42 goto out;
43 if ((flags & MS_ASYNC) && (flags & MS_SYNC)) 43 if ((flags & MS_ASYNC) && (flags & MS_SYNC))
44 goto out; 44 goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index ab14a2014dea..92be862c859b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void)
578 return; 578 return;
579 579
580 last = rb_entry(lastp, struct vm_region, vm_rb); 580 last = rb_entry(lastp, struct vm_region, vm_rb);
581 BUG_ON(unlikely(last->vm_end <= last->vm_start)); 581 BUG_ON(last->vm_end <= last->vm_start);
582 BUG_ON(unlikely(last->vm_top < last->vm_end)); 582 BUG_ON(last->vm_top < last->vm_end);
583 583
584 while ((p = rb_next(lastp))) { 584 while ((p = rb_next(lastp))) {
585 region = rb_entry(p, struct vm_region, vm_rb); 585 region = rb_entry(p, struct vm_region, vm_rb);
586 last = rb_entry(lastp, struct vm_region, vm_rb); 586 last = rb_entry(lastp, struct vm_region, vm_rb);
587 587
588 BUG_ON(unlikely(region->vm_end <= region->vm_start)); 588 BUG_ON(region->vm_end <= region->vm_start);
589 BUG_ON(unlikely(region->vm_top < region->vm_end)); 589 BUG_ON(region->vm_top < region->vm_end);
590 BUG_ON(unlikely(region->vm_start < last->vm_top)); 590 BUG_ON(region->vm_start < last->vm_top);
591 591
592 lastp = p; 592 lastp = p;
593 } 593 }
@@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1497 1497
1498 if (copy_from_user(&a, arg, sizeof(a))) 1498 if (copy_from_user(&a, arg, sizeof(a)))
1499 return -EFAULT; 1499 return -EFAULT;
1500 if (a.offset & ~PAGE_MASK) 1500 if (offset_in_page(a.offset))
1501 return -EINVAL; 1501 return -EINVAL;
1502 1502
1503 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1503 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1653 goto erase_whole_vma; 1653 goto erase_whole_vma;
1654 if (start < vma->vm_start || end > vma->vm_end) 1654 if (start < vma->vm_start || end > vma->vm_end)
1655 return -EINVAL; 1655 return -EINVAL;
1656 if (start & ~PAGE_MASK) 1656 if (offset_in_page(start))
1657 return -EINVAL; 1657 return -EINVAL;
1658 if (end != vma->vm_end && end & ~PAGE_MASK) 1658 if (end != vma->vm_end && offset_in_page(end))
1659 return -EINVAL; 1659 return -EINVAL;
1660 if (start != vma->vm_start && end != vma->vm_end) { 1660 if (start != vma->vm_start && end != vma->vm_end) {
1661 ret = split_vma(mm, vma, start, 1); 1661 ret = split_vma(mm, vma, start, 1);
@@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr,
1736 if (old_len == 0 || new_len == 0) 1736 if (old_len == 0 || new_len == 0)
1737 return (unsigned long) -EINVAL; 1737 return (unsigned long) -EINVAL;
1738 1738
1739 if (addr & ~PAGE_MASK) 1739 if (offset_in_page(addr))
1740 return -EINVAL; 1740 return -EINVAL;
1741 1741
1742 if (flags & MREMAP_FIXED && new_addr != addr) 1742 if (flags & MREMAP_FIXED && new_addr != addr)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1ecc0bcaecc5..e4778285d8d1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
377static void dump_header(struct oom_control *oc, struct task_struct *p, 377static void dump_header(struct oom_control *oc, struct task_struct *p,
378 struct mem_cgroup *memcg) 378 struct mem_cgroup *memcg)
379{ 379{
380 task_lock(current);
381 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 380 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
382 "oom_score_adj=%hd\n", 381 "oom_score_adj=%hd\n",
383 current->comm, oc->gfp_mask, oc->order, 382 current->comm, oc->gfp_mask, oc->order,
384 current->signal->oom_score_adj); 383 current->signal->oom_score_adj);
385 cpuset_print_task_mems_allowed(current); 384 cpuset_print_current_mems_allowed();
386 task_unlock(current);
387 dump_stack(); 385 dump_stack();
388 if (memcg) 386 if (memcg)
389 mem_cgroup_print_oom_info(memcg, p); 387 mem_cgroup_print_oom_info(memcg, p);
@@ -476,6 +474,24 @@ void oom_killer_enable(void)
476 oom_killer_disabled = false; 474 oom_killer_disabled = false;
477} 475}
478 476
477/*
478 * task->mm can be NULL if the task is the exited group leader. So to
479 * determine whether the task is using a particular mm, we examine all the
480 * task's threads: if one of those is using this mm then this task was also
481 * using it.
482 */
483static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
484{
485 struct task_struct *t;
486
487 for_each_thread(p, t) {
488 struct mm_struct *t_mm = READ_ONCE(t->mm);
489 if (t_mm)
490 return t_mm == mm;
491 }
492 return false;
493}
494
479#define K(x) ((x) << (PAGE_SHIFT-10)) 495#define K(x) ((x) << (PAGE_SHIFT-10))
480/* 496/*
481 * Must be called while holding a reference to p, which will be released upon 497 * Must be called while holding a reference to p, which will be released upon
@@ -509,10 +525,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
509 if (__ratelimit(&oom_rs)) 525 if (__ratelimit(&oom_rs))
510 dump_header(oc, p, memcg); 526 dump_header(oc, p, memcg);
511 527
512 task_lock(p);
513 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", 528 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
514 message, task_pid_nr(p), p->comm, points); 529 message, task_pid_nr(p), p->comm, points);
515 task_unlock(p);
516 530
517 /* 531 /*
518 * If any of p's children has a different mm and is eligible for kill, 532 * If any of p's children has a different mm and is eligible for kill,
@@ -525,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
525 list_for_each_entry(child, &t->children, sibling) { 539 list_for_each_entry(child, &t->children, sibling) {
526 unsigned int child_points; 540 unsigned int child_points;
527 541
528 if (child->mm == p->mm) 542 if (process_shares_mm(child, p->mm))
529 continue; 543 continue;
530 /* 544 /*
531 * oom_badness() returns 0 if the thread is unkillable 545 * oom_badness() returns 0 if the thread is unkillable
@@ -552,8 +566,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
552 victim = p; 566 victim = p;
553 } 567 }
554 568
555 /* mm cannot safely be dereferenced after task_unlock(victim) */ 569 /* Get a reference to safely compare mm after task_unlock(victim) */
556 mm = victim->mm; 570 mm = victim->mm;
571 atomic_inc(&mm->mm_count);
572 /*
573 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
574 * the OOM victim from depleting the memory reserves from the user
575 * space under its control.
576 */
577 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
557 mark_oom_victim(victim); 578 mark_oom_victim(victim);
558 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 579 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
559 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 580 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
@@ -571,21 +592,21 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
571 * pending fatal signal. 592 * pending fatal signal.
572 */ 593 */
573 rcu_read_lock(); 594 rcu_read_lock();
574 for_each_process(p) 595 for_each_process(p) {
575 if (p->mm == mm && !same_thread_group(p, victim) && 596 if (!process_shares_mm(p, mm))
576 !(p->flags & PF_KTHREAD)) { 597 continue;
577 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) 598 if (same_thread_group(p, victim))
578 continue; 599 continue;
600 if (unlikely(p->flags & PF_KTHREAD))
601 continue;
602 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
603 continue;
579 604
580 task_lock(p); /* Protect ->comm from prctl() */ 605 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
581 pr_err("Kill process %d (%s) sharing same memory\n", 606 }
582 task_pid_nr(p), p->comm);
583 task_unlock(p);
584 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
585 }
586 rcu_read_unlock(); 607 rcu_read_unlock();
587 608
588 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 609 mmdrop(mm);
589 put_task_struct(victim); 610 put_task_struct(victim);
590} 611}
591#undef K 612#undef K
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 805bbad2e24e..446bb36ee59d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag);
3428struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) 3428struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
3429{ 3429{
3430 struct page *page; 3430 struct page *page;
3431 struct mem_cgroup *memcg = NULL;
3432 3431
3433 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
3434 return NULL;
3435 page = alloc_pages(gfp_mask, order); 3432 page = alloc_pages(gfp_mask, order);
3436 memcg_kmem_commit_charge(page, memcg, order); 3433 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
3434 __free_pages(page, order);
3435 page = NULL;
3436 }
3437 return page; 3437 return page;
3438} 3438}
3439 3439
3440struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) 3440struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
3441{ 3441{
3442 struct page *page; 3442 struct page *page;
3443 struct mem_cgroup *memcg = NULL;
3444 3443
3445 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
3446 return NULL;
3447 page = alloc_pages_node(nid, gfp_mask, order); 3444 page = alloc_pages_node(nid, gfp_mask, order);
3448 memcg_kmem_commit_charge(page, memcg, order); 3445 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
3446 __free_pages(page, order);
3447 page = NULL;
3448 }
3449 return page; 3449 return page;
3450} 3450}
3451 3451
@@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
3455 */ 3455 */
3456void __free_kmem_pages(struct page *page, unsigned int order) 3456void __free_kmem_pages(struct page *page, unsigned int order)
3457{ 3457{
3458 memcg_kmem_uncharge_pages(page, order); 3458 memcg_kmem_uncharge(page, order);
3459 __free_pages(page, order); 3459 __free_pages(page, order);
3460} 3460}
3461 3461
@@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
4900 4900
4901int __meminit init_currently_empty_zone(struct zone *zone, 4901int __meminit init_currently_empty_zone(struct zone *zone,
4902 unsigned long zone_start_pfn, 4902 unsigned long zone_start_pfn,
4903 unsigned long size, 4903 unsigned long size)
4904 enum memmap_context context)
4905{ 4904{
4906 struct pglist_data *pgdat = zone->zone_pgdat; 4905 struct pglist_data *pgdat = zone->zone_pgdat;
4907 int ret; 4906 int ret;
@@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5413 5412
5414 set_pageblock_order(); 5413 set_pageblock_order();
5415 setup_usemap(pgdat, zone, zone_start_pfn, size); 5414 setup_usemap(pgdat, zone, zone_start_pfn, size);
5416 ret = init_currently_empty_zone(zone, zone_start_pfn, 5415 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
5417 size, MEMMAP_EARLY);
5418 BUG_ON(ret); 5416 BUG_ON(ret);
5419 memmap_init(size, nid, j, zone_start_pfn); 5417 memmap_init(size, nid, j, zone_start_pfn);
5420 zone_start_pfn += size; 5418 zone_start_pfn += size;
@@ -5423,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5423 5421
5424static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 5422static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
5425{ 5423{
5424 unsigned long __maybe_unused offset = 0;
5425
5426 /* Skip empty nodes */ 5426 /* Skip empty nodes */
5427 if (!pgdat->node_spanned_pages) 5427 if (!pgdat->node_spanned_pages)
5428 return; 5428 return;
@@ -5439,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
5439 * for the buddy allocator to function correctly. 5439 * for the buddy allocator to function correctly.
5440 */ 5440 */
5441 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 5441 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
5442 offset = pgdat->node_start_pfn - start;
5442 end = pgdat_end_pfn(pgdat); 5443 end = pgdat_end_pfn(pgdat);
5443 end = ALIGN(end, MAX_ORDER_NR_PAGES); 5444 end = ALIGN(end, MAX_ORDER_NR_PAGES);
5444 size = (end - start) * sizeof(struct page); 5445 size = (end - start) * sizeof(struct page);
@@ -5446,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
5446 if (!map) 5447 if (!map)
5447 map = memblock_virt_alloc_node_nopanic(size, 5448 map = memblock_virt_alloc_node_nopanic(size,
5448 pgdat->node_id); 5449 pgdat->node_id);
5449 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 5450 pgdat->node_mem_map = map + offset;
5450 } 5451 }
5451#ifndef CONFIG_NEED_MULTIPLE_NODES 5452#ifndef CONFIG_NEED_MULTIPLE_NODES
5452 /* 5453 /*
@@ -5454,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
5454 */ 5455 */
5455 if (pgdat == NODE_DATA(0)) { 5456 if (pgdat == NODE_DATA(0)) {
5456 mem_map = NODE_DATA(0)->node_mem_map; 5457 mem_map = NODE_DATA(0)->node_mem_map;
5457#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5458#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
5458 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 5459 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
5459 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 5460 mem_map -= offset;
5460#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5461#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5461 } 5462 }
5462#endif 5463#endif
@@ -5668,13 +5669,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5668 */ 5669 */
5669 required_movablecore = 5670 required_movablecore =
5670 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5671 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5672 required_movablecore = min(totalpages, required_movablecore);
5671 corepages = totalpages - required_movablecore; 5673 corepages = totalpages - required_movablecore;
5672 5674
5673 required_kernelcore = max(required_kernelcore, corepages); 5675 required_kernelcore = max(required_kernelcore, corepages);
5674 } 5676 }
5675 5677
5676 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5678 /*
5677 if (!required_kernelcore) 5679 * If kernelcore was not specified or kernelcore size is larger
5680 * than totalpages, there is no ZONE_MOVABLE.
5681 */
5682 if (!required_kernelcore || required_kernelcore >= totalpages)
5678 goto out; 5683 goto out;
5679 5684
5680 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5685 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 11b4beda14ba..7c6a63d2c27f 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
56 * @nr_pages: number of pages to charge 56 * @nr_pages: number of pages to charge
57 * @fail: points first counter to hit its limit, if any 57 * @fail: points first counter to hit its limit, if any
58 * 58 *
59 * Returns 0 on success, or -ENOMEM and @fail if the counter or one of 59 * Returns %true on success, or %false and @fail if the counter or one
60 * its ancestors has hit its configured limit. 60 * of its ancestors has hit its configured limit.
61 */ 61 */
62int page_counter_try_charge(struct page_counter *counter, 62bool page_counter_try_charge(struct page_counter *counter,
63 unsigned long nr_pages, 63 unsigned long nr_pages,
64 struct page_counter **fail) 64 struct page_counter **fail)
65{ 65{
66 struct page_counter *c; 66 struct page_counter *c;
67 67
@@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter,
99 if (new > c->watermark) 99 if (new > c->watermark)
100 c->watermark = new; 100 c->watermark = new;
101 } 101 }
102 return 0; 102 return true;
103 103
104failed: 104failed:
105 for (c = counter; c != *fail; c = c->parent) 105 for (c = counter; c != *fail; c = c->parent)
106 page_counter_cancel(c, nr_pages); 106 page_counter_cancel(c, nr_pages);
107 107
108 return -ENOMEM; 108 return false;
109} 109}
110 110
111/** 111/**
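
page_counter_try_charge() switches from a 0/-ENOMEM return to bool, which reads more naturally at call sites. Below is a user-space analog of the new calling convention only, not the kernel implementation (which uses atomic counters); the struct layout and limits are made up for the illustration:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct page_counter {
	unsigned long count;
	unsigned long limit;
	struct page_counter *parent;
};

/* true on success; false with *fail set to the ancestor that hit its limit. */
static bool page_counter_try_charge(struct page_counter *counter,
				    unsigned long nr_pages,
				    struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		if (c->count + nr_pages > c->limit) {
			*fail = c;
			goto failed;
		}
		c->count += nr_pages;
	}
	return true;

failed:
	/* Undo the levels that were already charged. */
	for (struct page_counter *u = counter; u != c; u = u->parent)
		u->count -= nr_pages;
	return false;
}

int main(void)
{
	struct page_counter root  = { .count = 0, .limit = 8,   .parent = NULL };
	struct page_counter child = { .count = 0, .limit = 100, .parent = &root };
	struct page_counter *fail = NULL;

	assert(page_counter_try_charge(&child, 6, &fail));	/* fits everywhere */
	assert(!page_counter_try_charge(&child, 6, &fail));	/* root would exceed 8 */
	assert(fail == &root);
	return 0;
}
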
diff --git a/mm/percpu.c b/mm/percpu.c
index a63b4d82a141..8a943b97a053 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 #ifdef CONFIG_SMP
 	PCPU_SETUP_BUG_ON(!ai->static_size);
-	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
 #endif
 	PCPU_SETUP_BUG_ON(!base_addr);
-	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
-	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
@@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 
 	alloc_size = roundup(min_unit_size, atom_size);
 	upa = alloc_size / min_unit_size;
-	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
 		upa--;
 	max_upa = upa;
 
@@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	for (upa = max_upa; upa; upa--) {
 		int allocs = 0, wasted = 0;
 
-		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
 			continue;
 
 		for (group = 0; group < nr_groups; group++) {
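offset_in_page(addr) is simply the low bits of the address below PAGE_SIZE; the conversions above (and the similar ones in mm/util.c and mm/vmalloc.c later in this patch) replace the open-coded `& ~PAGE_MASK` tests. A standalone illustration of the identity, with the macros defined locally since this is not kernel code:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
	unsigned long addr = 0x12345;

	/* The two spellings are equivalent; the macro states the intent. */
	assert((addr & ~PAGE_MASK) == offset_in_page(addr));
	printf("offset of %#lx within its page: %#lx\n",
	       addr, offset_in_page(addr));
	return 0;
}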
diff --git a/mm/readahead.c b/mm/readahead.c
index 24682f6f4cfd..998ad592f408 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 		return -EINVAL;
 
-	nr_to_read = max_sane_readahead(nr_to_read);
+	nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
 	while (nr_to_read) {
 		int err;
 
@@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return 0;
 }
 
-#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
- */
-unsigned long max_sane_readahead(unsigned long nr)
-{
-	return min(nr, MAX_READAHEAD);
-}
-
 /*
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
@@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping,
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	unsigned long max = max_sane_readahead(ra->ra_pages);
+	unsigned long max = ra->ra_pages;
 	pgoff_t prev_offset;
 
 	/*
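With max_sane_readahead() gone, the request is clamped directly to the backing device's ra_pages instead of a fixed global cap. A toy sketch of the same clamping decision; the structure and helper names here are illustrative, not the kernel's:

#include <stdio.h>

/* pages the backing device is happy to read ahead */
struct backing_dev { unsigned long ra_pages; };

static unsigned long clamp_readahead(unsigned long nr_to_read,
				     const struct backing_dev *bdi)
{
	/* per-device limit replaces the old hard-coded upper bound */
	return nr_to_read < bdi->ra_pages ? nr_to_read : bdi->ra_pages;
}

int main(void)
{
	struct backing_dev bdi = { .ra_pages = 32 };

	printf("%lu\n", clamp_readahead(1024, &bdi));	/* -> 32 */
	printf("%lu\n", clamp_readahead(8, &bdi));	/* -> 8  */
	return 0;
}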
diff --git a/mm/rmap.c b/mm/rmap.c
index f5b5c1f3dcd7..b577fbb98d4b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	int ret = SWAP_AGAIN;
 	enum ttu_flags flags = (enum ttu_flags)arg;
 
+	/* munlock has nothing to gain from examining un-locked vmas */
+	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+		goto out;
+
 	pte = page_check_address(page, mm, address, &ptl, 0);
 	if (!pte)
 		goto out;
@@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * skipped over this mm) then we should reactivate it.
 	 */
 	if (!(flags & TTU_IGNORE_MLOCK)) {
-		if (vma->vm_flags & VM_LOCKED)
-			goto out_mlock;
-
+		if (vma->vm_flags & VM_LOCKED) {
+			/* Holding pte lock, we do *not* need mmap_sem here */
+			mlock_vma_page(page);
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
 		if (flags & TTU_MUNLOCK)
 			goto out_unmap;
 	}
@@ -1352,7 +1359,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	update_hiwater_rss(mm);
 
 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
-		if (!PageHuge(page)) {
+		if (PageHuge(page)) {
+			hugetlb_count_sub(1 << compound_order(page), mm);
+		} else {
 			if (PageAnon(page))
 				dec_mm_counter(mm, MM_ANONPAGES);
 			else
@@ -1370,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			dec_mm_counter(mm, MM_ANONPAGES);
 		else
 			dec_mm_counter(mm, MM_FILEPAGES);
+	} else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
+		swp_entry_t entry;
+		pte_t swp_pte;
+		/*
+		 * Store the pfn of the page in a special migration
+		 * pte. do_swap_page() will wait until the migration
+		 * pte is removed and then restart fault handling.
+		 */
+		entry = make_migration_entry(page, pte_write(pteval));
+		swp_pte = swp_entry_to_pte(entry);
+		if (pte_soft_dirty(pteval))
+			swp_pte = pte_swp_mksoft_dirty(swp_pte);
+		set_pte_at(mm, address, pte, swp_pte);
 	} else if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
 		pte_t swp_pte;
-
-		if (PageSwapCache(page)) {
-			/*
-			 * Store the swap location in the pte.
-			 * See handle_pte_fault() ...
-			 */
-			if (swap_duplicate(entry) < 0) {
-				set_pte_at(mm, address, pte, pteval);
-				ret = SWAP_FAIL;
-				goto out_unmap;
-			}
-			if (list_empty(&mm->mmlist)) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&mm->mmlist))
-					list_add(&mm->mmlist, &init_mm.mmlist);
-				spin_unlock(&mmlist_lock);
-			}
-			dec_mm_counter(mm, MM_ANONPAGES);
-			inc_mm_counter(mm, MM_SWAPENTS);
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
-			/*
-			 * Store the pfn of the page in a special migration
-			 * pte. do_swap_page() will wait until the migration
-			 * pte is removed and then restart fault handling.
-			 */
-			BUG_ON(!(flags & TTU_MIGRATION));
-			entry = make_migration_entry(page, pte_write(pteval));
+		/*
+		 * Store the swap location in the pte.
+		 * See handle_pte_fault() ...
+		 */
+		VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+		if (swap_duplicate(entry) < 0) {
+			set_pte_at(mm, address, pte, pteval);
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&mm->mmlist))
+				list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
 		}
+		dec_mm_counter(mm, MM_ANONPAGES);
+		inc_mm_counter(mm, MM_SWAPENTS);
 		swp_pte = swp_entry_to_pte(entry);
 		if (pte_soft_dirty(pteval))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
 		set_pte_at(mm, address, pte, swp_pte);
-	} else if (IS_ENABLED(CONFIG_MIGRATION) &&
-		   (flags & TTU_MIGRATION)) {
-		/* Establish migration entry for a file page */
-		swp_entry_t entry;
-		entry = make_migration_entry(page, pte_write(pteval));
-		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
 		dec_mm_counter(mm, MM_FILEPAGES);
 
@@ -1419,31 +1425,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+	if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
 		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
-
-out_mlock:
-	pte_unmap_unlock(pte, ptl);
-
-
-	/*
-	 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
-	 * unstable result and race. Plus, We can't wait here because
-	 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
-	 * if trylock failed, the page remain in evictable lru and later
-	 * vmscan could retry to move the page to unevictable lru if the
-	 * page is actually mlocked.
-	 */
-	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-		if (vma->vm_flags & VM_LOCKED) {
-			mlock_vma_page(page);
-			ret = SWAP_MLOCK;
-		}
-		up_read(&vma->vm_mm->mmap_sem);
-	}
-	return ret;
 }
 
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
@@ -1607,6 +1592,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 
+		cond_resched();
+
 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 			continue;
 
@@ -1656,6 +1643,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 
+		cond_resched();
+
 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 			continue;
 
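The rmap walkers above gain a cond_resched() per visited VMA so that a page mapped by very many VMAs cannot monopolize the CPU during the walk. The general shape of the pattern, sketched in userspace with sched_yield() as a stand-in for cond_resched():

#include <sched.h>
#include <stdio.h>

struct vma { struct vma *next; };

/* Long iteration that offers to yield on every element, like the rmap walk. */
static void walk(struct vma *head)
{
	unsigned long visited = 0;

	for (struct vma *v = head; v; v = v->next) {
		sched_yield();	/* stand-in for cond_resched() */
		visited++;	/* real code would process the mapping here */
	}
	printf("visited %lu vmas\n", visited);
}

int main(void)
{
	struct vma c = { 0 }, b = { &c }, a = { &b };

	walk(&a);
	return 0;
}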
diff --git a/mm/shmem.c b/mm/shmem.c
index 48ce82926d93..3b8b73928398 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct inode *inode = dentry->d_inode;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
-	spin_lock(&info->lock);
-	shmem_recalc_inode(inode);
-	spin_unlock(&info->lock);
-
+	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
+		spin_lock(&info->lock);
+		shmem_recalc_inode(inode);
+		spin_unlock(&info->lock);
+	}
 	generic_fillattr(inode, stat);
-
 	return 0;
 }
 
@@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 		if (newsize <= oldsize) {
 			loff_t holebegin = round_up(newsize, PAGE_SIZE);
-			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
-			shmem_truncate_range(inode, newsize, (loff_t)-1);
+			if (oldsize > holebegin)
+				unmap_mapping_range(inode->i_mapping,
+							holebegin, 0, 1);
+			if (info->alloced)
+				shmem_truncate_range(inode,
+							newsize, (loff_t)-1);
 			/* unmap again to remove racily COWed private pages */
-			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+			if (oldsize > holebegin)
+				unmap_mapping_range(inode->i_mapping,
+							holebegin, 0, 1);
 		}
 	}
 
@@ -1023,7 +1029,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 		 */
 		oldpage = newpage;
 	} else {
-		mem_cgroup_migrate(oldpage, newpage, true);
+		mem_cgroup_replace_page(oldpage, newpage);
 		lru_cache_add_anon(newpage);
 		*pagep = newpage;
 	}
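shmem_getattr() now takes info->lock only when alloced/swapped and nrpages disagree, i.e. only when there is actually something to reconcile. The same check-then-lock idea in a small standalone form; the structure and field names are made up for illustration:

#include <pthread.h>
#include <stdio.h>

struct stats {
	pthread_mutex_t lock;
	long alloced, swapped, cached;
};

static void recalc_if_needed(struct stats *s)
{
	/* Cheap unlocked test first; take the lock only when it can matter. */
	if (s->alloced - s->swapped == s->cached)
		return;

	pthread_mutex_lock(&s->lock);
	s->cached = s->alloced - s->swapped;	/* reconcile under the lock */
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct stats s = { PTHREAD_MUTEX_INITIALIZER, 10, 2, 5 };

	recalc_if_needed(&s);
	printf("cached = %ld\n", s.cached);	/* -> 8 */
	return 0;
}

As in the kernel change, the unlocked check tolerates a race because acting on a momentarily stale answer is harmless here; it only trades one extra recalculation for many avoided lock acquisitions.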
diff --git a/mm/slab.c b/mm/slab.c
index 4fcc5dd8d5a6..272e809404d5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
282 282
283#define CFLGS_OFF_SLAB (0x80000000UL) 283#define CFLGS_OFF_SLAB (0x80000000UL)
284#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 284#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
285#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
285 286
286#define BATCHREFILL_LIMIT 16 287#define BATCHREFILL_LIMIT 16
287/* 288/*
@@ -1592,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1592 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1593 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1593 flags |= __GFP_RECLAIMABLE; 1594 flags |= __GFP_RECLAIMABLE;
1594 1595
1595 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1596 return NULL;
1597
1598 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1596 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1599 if (!page) { 1597 if (!page) {
1600 memcg_uncharge_slab(cachep, cachep->gfporder);
1601 slab_out_of_memory(cachep, flags, nodeid); 1598 slab_out_of_memory(cachep, flags, nodeid);
1602 return NULL; 1599 return NULL;
1603 } 1600 }
1604 1601
1602 if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
1603 __free_pages(page, cachep->gfporder);
1604 return NULL;
1605 }
1606
1605 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1607 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1606 if (page_is_pfmemalloc(page)) 1608 if (page_is_pfmemalloc(page))
1607 pfmemalloc_active = true; 1609 pfmemalloc_active = true;
@@ -1653,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1653 1655
1654 if (current->reclaim_state) 1656 if (current->reclaim_state)
1655 current->reclaim_state->reclaimed_slab += nr_freed; 1657 current->reclaim_state->reclaimed_slab += nr_freed;
1656 __free_pages(page, cachep->gfporder); 1658 __free_kmem_pages(page, cachep->gfporder);
1657 memcg_uncharge_slab(cachep, cachep->gfporder);
1658} 1659}
1659 1660
1660static void kmem_rcu_free(struct rcu_head *head) 1661static void kmem_rcu_free(struct rcu_head *head)
@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2212 * it too early on. Always use on-slab management when 2213 * it too early on. Always use on-slab management when
2213 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2214 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2214 */ 2215 */
2215 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && 2216 if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
2216 !(flags & SLAB_NOLEAKTRACE)) 2217 !(flags & SLAB_NOLEAKTRACE))
2217 /* 2218 /*
2218 * Size is large, assume best to place the slab management obj 2219 * Size is large, assume best to place the slab management obj
@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2276 /* 2277 /*
2277 * This is a possibility for one of the kmalloc_{dma,}_caches. 2278 * This is a possibility for one of the kmalloc_{dma,}_caches.
2278 * But since we go off slab only for object size greater than 2279 * But since we go off slab only for object size greater than
2279 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created 2280 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
2280 * in ascending order,this should not happen at all. 2281 * in ascending order,this should not happen at all.
2281 * But leave a BUG_ON for some lucky dude. 2282 * But leave a BUG_ON for some lucky dude.
2282 */ 2283 */
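The kmem_getpages() change above reorders things so the page is allocated first and only charged to the memcg afterwards, freeing the page if the charge fails; the old order charged first and then had to uncharge whenever the allocation failed. A skeleton of that ordering with placeholder charge/uncharge helpers, not the kernel functions:

#include <stdbool.h>
#include <stdlib.h>

/* Pretend accounting: always succeeds here; the point is the ordering. */
static bool charge(size_t size) { (void)size; return true; }
static void uncharge(size_t size) { (void)size; }

static void *alloc_charged(size_t size)
{
	void *p = malloc(size);		/* 1. get the resource */

	if (!p)
		return NULL;		/* nothing was charged, nothing to undo */

	if (!charge(size)) {		/* 2. then account for it */
		free(p);		/* undo step 1 on accounting failure */
		return NULL;
	}
	return p;
}

int main(void)
{
	void *p = alloc_charged(64);

	if (p) {
		uncharge(64);
		free(p);
	}
	return 0;
}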
diff --git a/mm/slab.h b/mm/slab.h
index a3a967d7d7c2..27492eb678f7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
181 list_for_each_entry(iter, &(root)->memcg_params.list, \ 181 list_for_each_entry(iter, &(root)->memcg_params.list, \
182 memcg_params.list) 182 memcg_params.list)
183 183
184#define for_each_memcg_cache_safe(iter, tmp, root) \
185 list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
186 memcg_params.list)
187
188static inline bool is_root_cache(struct kmem_cache *s) 184static inline bool is_root_cache(struct kmem_cache *s)
189{ 185{
190 return s->memcg_params.is_root_cache; 186 return s->memcg_params.is_root_cache;
@@ -240,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
240 return s->memcg_params.root_cache; 236 return s->memcg_params.root_cache;
241} 237}
242 238
243static __always_inline int memcg_charge_slab(struct kmem_cache *s, 239static __always_inline int memcg_charge_slab(struct page *page,
244 gfp_t gfp, int order) 240 gfp_t gfp, int order,
241 struct kmem_cache *s)
245{ 242{
246 if (!memcg_kmem_enabled()) 243 if (!memcg_kmem_enabled())
247 return 0; 244 return 0;
248 if (is_root_cache(s)) 245 if (is_root_cache(s))
249 return 0; 246 return 0;
250 return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); 247 return __memcg_kmem_charge_memcg(page, gfp, order,
251} 248 s->memcg_params.memcg);
252
253static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
254{
255 if (!memcg_kmem_enabled())
256 return;
257 if (is_root_cache(s))
258 return;
259 memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
260} 249}
261 250
262extern void slab_init_memcg_params(struct kmem_cache *); 251extern void slab_init_memcg_params(struct kmem_cache *);
@@ -265,8 +254,6 @@ extern void slab_init_memcg_params(struct kmem_cache *);
265 254
266#define for_each_memcg_cache(iter, root) \ 255#define for_each_memcg_cache(iter, root) \
267 for ((void)(iter), (void)(root); 0; ) 256 for ((void)(iter), (void)(root); 0; )
268#define for_each_memcg_cache_safe(iter, tmp, root) \
269 for ((void)(iter), (void)(tmp), (void)(root); 0; )
270 257
271static inline bool is_root_cache(struct kmem_cache *s) 258static inline bool is_root_cache(struct kmem_cache *s)
272{ 259{
@@ -295,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
295 return s; 282 return s;
296} 283}
297 284
298static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) 285static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
286 struct kmem_cache *s)
299{ 287{
300 return 0; 288 return 0;
301} 289}
302 290
303static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
304{
305}
306
307static inline void slab_init_memcg_params(struct kmem_cache *s) 291static inline void slab_init_memcg_params(struct kmem_cache *s)
308{ 292{
309} 293}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5ce4faeb16fb..d88e97c10a2e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags,
316 return ALIGN(align, sizeof(void *)); 316 return ALIGN(align, sizeof(void *));
317} 317}
318 318
319static struct kmem_cache * 319static struct kmem_cache *create_cache(const char *name,
320do_kmem_cache_create(const char *name, size_t object_size, size_t size, 320 size_t object_size, size_t size, size_t align,
321 size_t align, unsigned long flags, void (*ctor)(void *), 321 unsigned long flags, void (*ctor)(void *),
322 struct mem_cgroup *memcg, struct kmem_cache *root_cache) 322 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
323{ 323{
324 struct kmem_cache *s; 324 struct kmem_cache *s;
325 int err; 325 int err;
@@ -384,7 +384,7 @@ struct kmem_cache *
384kmem_cache_create(const char *name, size_t size, size_t align, 384kmem_cache_create(const char *name, size_t size, size_t align,
385 unsigned long flags, void (*ctor)(void *)) 385 unsigned long flags, void (*ctor)(void *))
386{ 386{
387 struct kmem_cache *s; 387 struct kmem_cache *s = NULL;
388 const char *cache_name; 388 const char *cache_name;
389 int err; 389 int err;
390 390
@@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align,
396 396
397 err = kmem_cache_sanity_check(name, size); 397 err = kmem_cache_sanity_check(name, size);
398 if (err) { 398 if (err) {
399 s = NULL; /* suppress uninit var warning */
400 goto out_unlock; 399 goto out_unlock;
401 } 400 }
402 401
@@ -418,9 +417,9 @@ kmem_cache_create(const char *name, size_t size, size_t align,
418 goto out_unlock; 417 goto out_unlock;
419 } 418 }
420 419
421 s = do_kmem_cache_create(cache_name, size, size, 420 s = create_cache(cache_name, size, size,
422 calculate_alignment(flags, align, size), 421 calculate_alignment(flags, align, size),
423 flags, ctor, NULL, NULL); 422 flags, ctor, NULL, NULL);
424 if (IS_ERR(s)) { 423 if (IS_ERR(s)) {
425 err = PTR_ERR(s); 424 err = PTR_ERR(s);
426 kfree_const(cache_name); 425 kfree_const(cache_name);
@@ -448,29 +447,20 @@ out_unlock:
448} 447}
449EXPORT_SYMBOL(kmem_cache_create); 448EXPORT_SYMBOL(kmem_cache_create);
450 449
451static int do_kmem_cache_shutdown(struct kmem_cache *s, 450static int shutdown_cache(struct kmem_cache *s,
452 struct list_head *release, bool *need_rcu_barrier) 451 struct list_head *release, bool *need_rcu_barrier)
453{ 452{
454 if (__kmem_cache_shutdown(s) != 0) { 453 if (__kmem_cache_shutdown(s) != 0)
455 printk(KERN_ERR "kmem_cache_destroy %s: "
456 "Slab cache still has objects\n", s->name);
457 dump_stack();
458 return -EBUSY; 454 return -EBUSY;
459 }
460 455
461 if (s->flags & SLAB_DESTROY_BY_RCU) 456 if (s->flags & SLAB_DESTROY_BY_RCU)
462 *need_rcu_barrier = true; 457 *need_rcu_barrier = true;
463 458
464#ifdef CONFIG_MEMCG_KMEM
465 if (!is_root_cache(s))
466 list_del(&s->memcg_params.list);
467#endif
468 list_move(&s->list, release); 459 list_move(&s->list, release);
469 return 0; 460 return 0;
470} 461}
471 462
472static void do_kmem_cache_release(struct list_head *release, 463static void release_caches(struct list_head *release, bool need_rcu_barrier)
473 bool need_rcu_barrier)
474{ 464{
475 struct kmem_cache *s, *s2; 465 struct kmem_cache *s, *s2;
476 466
@@ -536,10 +526,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
536 if (!cache_name) 526 if (!cache_name)
537 goto out_unlock; 527 goto out_unlock;
538 528
539 s = do_kmem_cache_create(cache_name, root_cache->object_size, 529 s = create_cache(cache_name, root_cache->object_size,
540 root_cache->size, root_cache->align, 530 root_cache->size, root_cache->align,
541 root_cache->flags, root_cache->ctor, 531 root_cache->flags, root_cache->ctor,
542 memcg, root_cache); 532 memcg, root_cache);
543 /* 533 /*
544 * If we could not create a memcg cache, do not complain, because 534 * If we could not create a memcg cache, do not complain, because
545 * that's not critical at all as we can always proceed with the root 535 * that's not critical at all as we can always proceed with the root
@@ -598,6 +588,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
598 put_online_cpus(); 588 put_online_cpus();
599} 589}
600 590
591static int __shutdown_memcg_cache(struct kmem_cache *s,
592 struct list_head *release, bool *need_rcu_barrier)
593{
594 BUG_ON(is_root_cache(s));
595
596 if (shutdown_cache(s, release, need_rcu_barrier))
597 return -EBUSY;
598
599 list_del(&s->memcg_params.list);
600 return 0;
601}
602
601void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) 603void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
602{ 604{
603 LIST_HEAD(release); 605 LIST_HEAD(release);
@@ -615,14 +617,76 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
615 * The cgroup is about to be freed and therefore has no charges 617 * The cgroup is about to be freed and therefore has no charges
616 * left. Hence, all its caches must be empty by now. 618 * left. Hence, all its caches must be empty by now.
617 */ 619 */
618 BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); 620 BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
619 } 621 }
620 mutex_unlock(&slab_mutex); 622 mutex_unlock(&slab_mutex);
621 623
622 put_online_mems(); 624 put_online_mems();
623 put_online_cpus(); 625 put_online_cpus();
624 626
625 do_kmem_cache_release(&release, need_rcu_barrier); 627 release_caches(&release, need_rcu_barrier);
628}
629
630static int shutdown_memcg_caches(struct kmem_cache *s,
631 struct list_head *release, bool *need_rcu_barrier)
632{
633 struct memcg_cache_array *arr;
634 struct kmem_cache *c, *c2;
635 LIST_HEAD(busy);
636 int i;
637
638 BUG_ON(!is_root_cache(s));
639
640 /*
641 * First, shutdown active caches, i.e. caches that belong to online
642 * memory cgroups.
643 */
644 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
645 lockdep_is_held(&slab_mutex));
646 for_each_memcg_cache_index(i) {
647 c = arr->entries[i];
648 if (!c)
649 continue;
650 if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
651 /*
652 * The cache still has objects. Move it to a temporary
653 * list so as not to try to destroy it for a second
654 * time while iterating over inactive caches below.
655 */
656 list_move(&c->memcg_params.list, &busy);
657 else
658 /*
659 * The cache is empty and will be destroyed soon. Clear
660 * the pointer to it in the memcg_caches array so that
661 * it will never be accessed even if the root cache
662 * stays alive.
663 */
664 arr->entries[i] = NULL;
665 }
666
667 /*
668 * Second, shutdown all caches left from memory cgroups that are now
669 * offline.
670 */
671 list_for_each_entry_safe(c, c2, &s->memcg_params.list,
672 memcg_params.list)
673 __shutdown_memcg_cache(c, release, need_rcu_barrier);
674
675 list_splice(&busy, &s->memcg_params.list);
676
677 /*
678 * A cache being destroyed must be empty. In particular, this means
679 * that all per memcg caches attached to it must be empty too.
680 */
681 if (!list_empty(&s->memcg_params.list))
682 return -EBUSY;
683 return 0;
684}
685#else
686static inline int shutdown_memcg_caches(struct kmem_cache *s,
687 struct list_head *release, bool *need_rcu_barrier)
688{
689 return 0;
626} 690}
627#endif /* CONFIG_MEMCG_KMEM */ 691#endif /* CONFIG_MEMCG_KMEM */
628 692
@@ -635,16 +699,13 @@ void slab_kmem_cache_release(struct kmem_cache *s)
635 699
636void kmem_cache_destroy(struct kmem_cache *s) 700void kmem_cache_destroy(struct kmem_cache *s)
637{ 701{
638 struct kmem_cache *c, *c2;
639 LIST_HEAD(release); 702 LIST_HEAD(release);
640 bool need_rcu_barrier = false; 703 bool need_rcu_barrier = false;
641 bool busy = false; 704 int err;
642 705
643 if (unlikely(!s)) 706 if (unlikely(!s))
644 return; 707 return;
645 708
646 BUG_ON(!is_root_cache(s));
647
648 get_online_cpus(); 709 get_online_cpus();
649 get_online_mems(); 710 get_online_mems();
650 711
@@ -654,21 +715,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
654 if (s->refcount) 715 if (s->refcount)
655 goto out_unlock; 716 goto out_unlock;
656 717
657 for_each_memcg_cache_safe(c, c2, s) { 718 err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
658 if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) 719 if (!err)
659 busy = true; 720 err = shutdown_cache(s, &release, &need_rcu_barrier);
660 }
661
662 if (!busy)
663 do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
664 721
722 if (err) {
723 pr_err("kmem_cache_destroy %s: "
724 "Slab cache still has objects\n", s->name);
725 dump_stack();
726 }
665out_unlock: 727out_unlock:
666 mutex_unlock(&slab_mutex); 728 mutex_unlock(&slab_mutex);
667 729
668 put_online_mems(); 730 put_online_mems();
669 put_online_cpus(); 731 put_online_cpus();
670 732
671 do_kmem_cache_release(&release, need_rcu_barrier); 733 release_caches(&release, need_rcu_barrier);
672} 734}
673EXPORT_SYMBOL(kmem_cache_destroy); 735EXPORT_SYMBOL(kmem_cache_destroy);
674 736
@@ -692,7 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
692} 754}
693EXPORT_SYMBOL(kmem_cache_shrink); 755EXPORT_SYMBOL(kmem_cache_shrink);
694 756
695int slab_is_available(void) 757bool slab_is_available(void)
696{ 758{
697 return slab_state >= UP; 759 return slab_state >= UP;
698} 760}
diff --git a/mm/slub.c b/mm/slub.c
index f614b5dc396b..75a5fa92ac2a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
459/* 459/*
460 * Debug settings: 460 * Debug settings:
461 */ 461 */
462#ifdef CONFIG_SLUB_DEBUG_ON 462#if defined(CONFIG_SLUB_DEBUG_ON)
463static int slub_debug = DEBUG_DEFAULT_FLAGS; 463static int slub_debug = DEBUG_DEFAULT_FLAGS;
464#elif defined(CONFIG_KASAN)
465static int slub_debug = SLAB_STORE_USER;
464#else 466#else
465static int slub_debug; 467static int slub_debug;
466#endif 468#endif
@@ -1328,16 +1330,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1328 1330
1329 flags |= __GFP_NOTRACK; 1331 flags |= __GFP_NOTRACK;
1330 1332
1331 if (memcg_charge_slab(s, flags, order))
1332 return NULL;
1333
1334 if (node == NUMA_NO_NODE) 1333 if (node == NUMA_NO_NODE)
1335 page = alloc_pages(flags, order); 1334 page = alloc_pages(flags, order);
1336 else 1335 else
1337 page = __alloc_pages_node(node, flags, order); 1336 page = __alloc_pages_node(node, flags, order);
1338 1337
1339 if (!page) 1338 if (page && memcg_charge_slab(page, flags, order, s)) {
1340 memcg_uncharge_slab(s, order); 1339 __free_pages(page, order);
1340 page = NULL;
1341 }
1341 1342
1342 return page; 1343 return page;
1343} 1344}
@@ -1476,8 +1477,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1476 page_mapcount_reset(page); 1477 page_mapcount_reset(page);
1477 if (current->reclaim_state) 1478 if (current->reclaim_state)
1478 current->reclaim_state->reclaimed_slab += pages; 1479 current->reclaim_state->reclaimed_slab += pages;
1479 __free_pages(page, order); 1480 __free_kmem_pages(page, order);
1480 memcg_uncharge_slab(s, order);
1481} 1481}
1482 1482
1483#define need_reserve_slab_rcu \ 1483#define need_reserve_slab_rcu \
@@ -2912,20 +2912,15 @@ static inline int slab_order(int size, int min_objects,
2912 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 2912 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2913 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2913 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2914 2914
2915 for (order = max(min_order, 2915 for (order = max(min_order, get_order(min_objects * size + reserved));
2916 fls(min_objects * size - 1) - PAGE_SHIFT);
2917 order <= max_order; order++) { 2916 order <= max_order; order++) {
2918 2917
2919 unsigned long slab_size = PAGE_SIZE << order; 2918 unsigned long slab_size = PAGE_SIZE << order;
2920 2919
2921 if (slab_size < min_objects * size + reserved)
2922 continue;
2923
2924 rem = (slab_size - reserved) % size; 2920 rem = (slab_size - reserved) % size;
2925 2921
2926 if (rem <= slab_size / fract_leftover) 2922 if (rem <= slab_size / fract_leftover)
2927 break; 2923 break;
2928
2929 } 2924 }
2930 2925
2931 return order; 2926 return order;
@@ -2943,7 +2938,7 @@ static inline int calculate_order(int size, int reserved)
2943 * works by first attempting to generate a layout with 2938 * works by first attempting to generate a layout with
2944 * the best configuration and backing off gradually. 2939 * the best configuration and backing off gradually.
2945 * 2940 *
2946 * First we reduce the acceptable waste in a slab. Then 2941 * First we increase the acceptable waste in a slab. Then
2947 * we reduce the minimum objects required in a slab. 2942 * we reduce the minimum objects required in a slab.
2948 */ 2943 */
2949 min_objects = slub_min_objects; 2944 min_objects = slub_min_objects;
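In slab_order() the starting order is now computed directly with get_order(min_objects * size + reserved) rather than starting from an fls() estimate and skipping orders that turn out too small. A userspace rendition of what get_order() computes and how the starting point is derived; PAGE_SHIFT and the helper are defined locally for the example:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Smallest order such that (PAGE_SIZE << order) >= size. */
static int get_order(unsigned long size)
{
	int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = 700, min_objects = 8, reserved = 0;
	int order = get_order(min_objects * size + reserved);

	/* 8 * 700 = 5600 bytes -> needs two 4 KiB pages -> order 1 */
	printf("starting order = %d\n", order);
	return 0;
}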
diff --git a/mm/util.c b/mm/util.c
index 68ff8a5361e7..9af1c12b310c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 {
 	if (unlikely(offset + PAGE_ALIGN(len) < offset))
 		return -EINVAL;
-	if (unlikely(offset & ~PAGE_MASK))
+	if (unlikely(offset_in_page(offset)))
 		return -EINVAL;
 
 	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
diff --git a/mm/vmacache.c b/mm/vmacache.c
index b6e3662fe339..fd09dc9c6812 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm)
  * Also handle the case where a kernel thread has adopted this mm via use_mm().
  * That kernel thread's vmacache is not applicable to this mm.
  */
-static bool vmacache_valid_mm(struct mm_struct *mm)
+static inline bool vmacache_valid_mm(struct mm_struct *mm)
 {
 	return current->mm == mm && !(current->flags & PF_KTHREAD);
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af3a519e40c2..9db9ef5e8481 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	struct vmap_area *first;
 
 	BUG_ON(!size);
-	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(offset_in_page(size));
 	BUG_ON(!is_power_of_2(align));
 
 	va = kmalloc_node(sizeof(struct vmap_area),
@@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 	void *vaddr = NULL;
 	unsigned int order;
 
-	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(offset_in_page(size));
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 	if (WARN_ON(size == 0)) {
 		/*
@@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size)
 	unsigned int order;
 	struct vmap_block *vb;
 
-	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(offset_in_page(size));
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 
 	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
@@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
 	while (count) {
 		unsigned long offset, length;
 
-		offset = (unsigned long)addr & ~PAGE_MASK;
+		offset = offset_in_page(addr);
 		length = PAGE_SIZE - offset;
 		if (length > count)
 			length = count;
@@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
 	while (count) {
 		unsigned long offset, length;
 
-		offset = (unsigned long)addr & ~PAGE_MASK;
+		offset = offset_in_page(addr);
 		length = PAGE_SIZE - offset;
 		if (length > count)
 			length = count;
@@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	bool purged = false;
 
 	/* verify parameters and allocate data structures */
-	BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
 	for (last_area = 0, area = 0; area < nr_vms; area++) {
 		start = offsets[area];
 		end = start + sizes[area];
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7057af54b6e..55721b619aee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc)
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
 {
-	int nr;
+	unsigned long nr;
 
 	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 		zone_page_state(zone, NR_INACTIVE_FILE);
@@ -1859,17 +1859,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1859} 1859}
1860 1860
1861#ifdef CONFIG_SWAP 1861#ifdef CONFIG_SWAP
1862static int inactive_anon_is_low_global(struct zone *zone) 1862static bool inactive_anon_is_low_global(struct zone *zone)
1863{ 1863{
1864 unsigned long active, inactive; 1864 unsigned long active, inactive;
1865 1865
1866 active = zone_page_state(zone, NR_ACTIVE_ANON); 1866 active = zone_page_state(zone, NR_ACTIVE_ANON);
1867 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1867 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1868 1868
1869 if (inactive * zone->inactive_ratio < active) 1869 return inactive * zone->inactive_ratio < active;
1870 return 1;
1871
1872 return 0;
1873} 1870}
1874 1871
1875/** 1872/**
@@ -1879,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone)
1879 * Returns true if the zone does not have enough inactive anon pages, 1876 * Returns true if the zone does not have enough inactive anon pages,
1880 * meaning some active anon pages need to be deactivated. 1877 * meaning some active anon pages need to be deactivated.
1881 */ 1878 */
1882static int inactive_anon_is_low(struct lruvec *lruvec) 1879static bool inactive_anon_is_low(struct lruvec *lruvec)
1883{ 1880{
1884 /* 1881 /*
1885 * If we don't have swap space, anonymous page deactivation 1882 * If we don't have swap space, anonymous page deactivation
1886 * is pointless. 1883 * is pointless.
1887 */ 1884 */
1888 if (!total_swap_pages) 1885 if (!total_swap_pages)
1889 return 0; 1886 return false;
1890 1887
1891 if (!mem_cgroup_disabled()) 1888 if (!mem_cgroup_disabled())
1892 return mem_cgroup_inactive_anon_is_low(lruvec); 1889 return mem_cgroup_inactive_anon_is_low(lruvec);
@@ -1894,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec)
1894 return inactive_anon_is_low_global(lruvec_zone(lruvec)); 1891 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1895} 1892}
1896#else 1893#else
1897static inline int inactive_anon_is_low(struct lruvec *lruvec) 1894static inline bool inactive_anon_is_low(struct lruvec *lruvec)
1898{ 1895{
1899 return 0; 1896 return false;
1900} 1897}
1901#endif 1898#endif
1902 1899
@@ -1914,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
1914 * This uses a different ratio than the anonymous pages, because 1911 * This uses a different ratio than the anonymous pages, because
1915 * the page cache uses a use-once replacement algorithm. 1912 * the page cache uses a use-once replacement algorithm.
1916 */ 1913 */
1917static int inactive_file_is_low(struct lruvec *lruvec) 1914static bool inactive_file_is_low(struct lruvec *lruvec)
1918{ 1915{
1919 unsigned long inactive; 1916 unsigned long inactive;
1920 unsigned long active; 1917 unsigned long active;
@@ -1925,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec)
1925 return active > inactive; 1922 return active > inactive;
1926} 1923}
1927 1924
1928static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1925static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1929{ 1926{
1930 if (is_file_lru(lru)) 1927 if (is_file_lru(lru))
1931 return inactive_file_is_low(lruvec); 1928 return inactive_file_is_low(lruvec);
@@ -3696,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3696} 3693}
3697 3694
3698/* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3695/* Work out how many page cache pages we can reclaim in this reclaim_mode */
3699static long zone_pagecache_reclaimable(struct zone *zone) 3696static unsigned long zone_pagecache_reclaimable(struct zone *zone)
3700{ 3697{
3701 long nr_pagecache_reclaimable; 3698 unsigned long nr_pagecache_reclaimable;
3702 long delta = 0; 3699 unsigned long delta = 0;
3703 3700
3704 /* 3701 /*
3705 * If RECLAIM_UNMAP is set, then all file pages are considered 3702 * If RECLAIM_UNMAP is set, then all file pages are considered
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fbf14485a049..ffcb4f58bf3e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
591 else 591 else
592 __inc_zone_state(z, NUMA_OTHER); 592 __inc_zone_state(z, NUMA_OTHER);
593} 593}
594
595/*
596 * Determine the per node value of a stat item.
597 */
598unsigned long node_page_state(int node, enum zone_stat_item item)
599{
600 struct zone *zones = NODE_DATA(node)->node_zones;
601
602 return
603#ifdef CONFIG_ZONE_DMA
604 zone_page_state(&zones[ZONE_DMA], item) +
605#endif
606#ifdef CONFIG_ZONE_DMA32
607 zone_page_state(&zones[ZONE_DMA32], item) +
608#endif
609#ifdef CONFIG_HIGHMEM
610 zone_page_state(&zones[ZONE_HIGHMEM], item) +
611#endif
612 zone_page_state(&zones[ZONE_NORMAL], item) +
613 zone_page_state(&zones[ZONE_MOVABLE], item);
614}
615
594#endif 616#endif
595 617
596#ifdef CONFIG_COMPACTION 618#ifdef CONFIG_COMPACTION
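node_page_state() above is just the sum of one zone statistic over every zone backing a node, with the optional zones compiled in or out. A minimal model of that aggregation; the zone array and stat enum are invented for the example:

#include <stdio.h>

enum zone_stat_item { NR_FILE_PAGES, NR_ITEMS };

struct zone { unsigned long stat[NR_ITEMS]; };

#define MAX_ZONES 4

/* Sum one statistic across all zones of a node. */
static unsigned long node_page_state(const struct zone *zones, int nr_zones,
				     enum zone_stat_item item)
{
	unsigned long sum = 0;

	for (int i = 0; i < nr_zones; i++)
		sum += zones[i].stat[item];
	return sum;
}

int main(void)
{
	struct zone zones[MAX_ZONES] = {
		{ .stat = { 100 } }, { .stat = { 50 } }, { .stat = { 7 } },
	};

	printf("node NR_FILE_PAGES = %lu\n",
	       node_page_state(zones, MAX_ZONES, NR_FILE_PAGES));	/* 157 */
	return 0;
}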
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 3c53cac15de1..e4bb1de1d526 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -5,6 +5,8 @@ BINARIES = compaction_test
 BINARIES += hugepage-mmap
 BINARIES += hugepage-shm
 BINARIES += map_hugetlb
+BINARIES += mlock2-tests
+BINARIES += on-fault-limit
 BINARIES += thuge-gen
 BINARIES += transhuge-stress
 BINARIES += userfaultfd
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000000..4431994aade2
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,736 @@
1#include <sys/mman.h>
2#include <stdint.h>
3#include <stdio.h>
4#include <stdlib.h>
5#include <unistd.h>
6#include <string.h>
7#include <sys/time.h>
8#include <sys/resource.h>
9#include <syscall.h>
10#include <errno.h>
11#include <stdbool.h>
12
13#ifndef MLOCK_ONFAULT
14#define MLOCK_ONFAULT 1
15#endif
16
17#ifndef MCL_ONFAULT
18#define MCL_ONFAULT (MCL_FUTURE << 1)
19#endif
20
21static int mlock2_(void *start, size_t len, int flags)
22{
23#ifdef __NR_mlock2
24 return syscall(__NR_mlock2, start, len, flags);
25#else
26 errno = ENOSYS;
27 return -1;
28#endif
29}
30
31struct vm_boundaries {
32 unsigned long start;
33 unsigned long end;
34};
35
36static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
37{
38 FILE *file;
39 int ret = 1;
40 char line[1024] = {0};
41 char *end_addr;
42 char *stop;
43 unsigned long start;
44 unsigned long end;
45
46 if (!area)
47 return ret;
48
49 file = fopen("/proc/self/maps", "r");
50 if (!file) {
51 perror("fopen");
52 return ret;
53 }
54
55 memset(area, 0, sizeof(struct vm_boundaries));
56
57 while(fgets(line, 1024, file)) {
58 end_addr = strchr(line, '-');
59 if (!end_addr) {
60 printf("cannot parse /proc/self/maps\n");
61 goto out;
62 }
63 *end_addr = '\0';
64 end_addr++;
65 stop = strchr(end_addr, ' ');
66 if (!stop) {
67 printf("cannot parse /proc/self/maps\n");
68 goto out;
69 }
70 *stop = '\0';
71
72 sscanf(line, "%lx", &start);
73 sscanf(end_addr, "%lx", &end);
74
75 if (start <= addr && end > addr) {
76 area->start = start;
77 area->end = end;
78 ret = 0;
79 goto out;
80 }
81 }
82out:
83 fclose(file);
84 return ret;
85}
86
87static uint64_t get_pageflags(unsigned long addr)
88{
89 FILE *file;
90 uint64_t pfn;
91 unsigned long offset;
92
93 file = fopen("/proc/self/pagemap", "r");
94 if (!file) {
95 perror("fopen pagemap");
96 _exit(1);
97 }
98
99 offset = addr / getpagesize() * sizeof(pfn);
100
101 if (fseek(file, offset, SEEK_SET)) {
102 perror("fseek pagemap");
103 _exit(1);
104 }
105
106 if (fread(&pfn, sizeof(pfn), 1, file) != 1) {
107 perror("fread pagemap");
108 _exit(1);
109 }
110
111 fclose(file);
112 return pfn;
113}
114
115static uint64_t get_kpageflags(unsigned long pfn)
116{
117 uint64_t flags;
118 FILE *file;
119
120 file = fopen("/proc/kpageflags", "r");
121 if (!file) {
122 perror("fopen kpageflags");
123 _exit(1);
124 }
125
126 if (fseek(file, pfn * sizeof(flags), SEEK_SET)) {
127 perror("fseek kpageflags");
128 _exit(1);
129 }
130
131 if (fread(&flags, sizeof(flags), 1, file) != 1) {
132 perror("fread kpageflags");
133 _exit(1);
134 }
135
136 fclose(file);
137 return flags;
138}
139
140static FILE *seek_to_smaps_entry(unsigned long addr)
141{
142 FILE *file;
143 char *line = NULL;
144 size_t size = 0;
145 unsigned long start, end;
146 char perms[5];
147 unsigned long offset;
148 char dev[32];
149 unsigned long inode;
150 char path[BUFSIZ];
151
152 file = fopen("/proc/self/smaps", "r");
153 if (!file) {
154 perror("fopen smaps");
155 _exit(1);
156 }
157
158 while (getline(&line, &size, file) > 0) {
159 if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
160 &start, &end, perms, &offset, dev, &inode, path) < 6)
161 goto next;
162
163 if (start <= addr && addr < end)
164 goto out;
165
166next:
167 free(line);
168 line = NULL;
169 size = 0;
170 }
171
172 fclose(file);
173 file = NULL;
174
175out:
176 free(line);
177 return file;
178}
179
180#define VMFLAGS "VmFlags:"
181
182static bool is_vmflag_set(unsigned long addr, const char *vmflag)
183{
184 char *line = NULL;
185 char *flags;
186 size_t size = 0;
187 bool ret = false;
188 FILE *smaps;
189
190 smaps = seek_to_smaps_entry(addr);
191 if (!smaps) {
192 printf("Unable to parse /proc/self/smaps\n");
193 goto out;
194 }
195
196 while (getline(&line, &size, smaps) > 0) {
197 if (!strstr(line, VMFLAGS)) {
198 free(line);
199 line = NULL;
200 size = 0;
201 continue;
202 }
203
204 flags = line + strlen(VMFLAGS);
205 ret = (strstr(flags, vmflag) != NULL);
206 goto out;
207 }
208
209out:
210 free(line);
211 fclose(smaps);
212 return ret;
213}
214
215#define SIZE "Size:"
216#define RSS "Rss:"
217#define LOCKED "lo"
218
219static bool is_vma_lock_on_fault(unsigned long addr)
220{
221 bool ret = false;
222 bool locked;
223 FILE *smaps = NULL;
224 unsigned long vma_size, vma_rss;
225 char *line = NULL;
226 char *value;
227 size_t size = 0;
228
229 locked = is_vmflag_set(addr, LOCKED);
230 if (!locked)
231 goto out;
232
233 smaps = seek_to_smaps_entry(addr);
234 if (!smaps) {
235 printf("Unable to parse /proc/self/smaps\n");
236 goto out;
237 }
238
239 while (getline(&line, &size, smaps) > 0) {
240 if (!strstr(line, SIZE)) {
241 free(line);
242 line = NULL;
243 size = 0;
244 continue;
245 }
246
247 value = line + strlen(SIZE);
248 if (sscanf(value, "%lu kB", &vma_size) < 1) {
249 printf("Unable to parse smaps entry for Size\n");
250 goto out;
251 }
252 break;
253 }
254
255 while (getline(&line, &size, smaps) > 0) {
256 if (!strstr(line, RSS)) {
257 free(line);
258 line = NULL;
259 size = 0;
260 continue;
261 }
262
263 value = line + strlen(RSS);
264 if (sscanf(value, "%lu kB", &vma_rss) < 1) {
265 printf("Unable to parse smaps entry for Rss\n");
266 goto out;
267 }
268 break;
269 }
270
271 ret = locked && (vma_rss < vma_size);
272out:
273 free(line);
274 if (smaps)
275 fclose(smaps);
276 return ret;
277}
278
279#define PRESENT_BIT 0x8000000000000000
280#define PFN_MASK 0x007FFFFFFFFFFFFF
281#define UNEVICTABLE_BIT (1UL << 18)
282
283static int lock_check(char *map)
284{
285 unsigned long page_size = getpagesize();
286 uint64_t page1_flags, page2_flags;
287
288 page1_flags = get_pageflags((unsigned long)map);
289 page2_flags = get_pageflags((unsigned long)map + page_size);
290
291 /* Both pages should be present */
292 if (((page1_flags & PRESENT_BIT) == 0) ||
293 ((page2_flags & PRESENT_BIT) == 0)) {
294 printf("Failed to make both pages present\n");
295 return 1;
296 }
297
298 page1_flags = get_kpageflags(page1_flags & PFN_MASK);
299 page2_flags = get_kpageflags(page2_flags & PFN_MASK);
300
301 /* Both pages should be unevictable */
302 if (((page1_flags & UNEVICTABLE_BIT) == 0) ||
303 ((page2_flags & UNEVICTABLE_BIT) == 0)) {
304 printf("Failed to make both pages unevictable\n");
305 return 1;
306 }
307
308 if (!is_vmflag_set((unsigned long)map, LOCKED)) {
309 printf("VMA flag %s is missing on page 1\n", LOCKED);
310 return 1;
311 }
312
313 if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
314 printf("VMA flag %s is missing on page 2\n", LOCKED);
315 return 1;
316 }
317
318 return 0;
319}
320
321static int unlock_lock_check(char *map)
322{
323 unsigned long page_size = getpagesize();
324 uint64_t page1_flags, page2_flags;
325
326 page1_flags = get_pageflags((unsigned long)map);
327 page2_flags = get_pageflags((unsigned long)map + page_size);
328 page1_flags = get_kpageflags(page1_flags & PFN_MASK);
329 page2_flags = get_kpageflags(page2_flags & PFN_MASK);
330
331 if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) {
332 printf("A page is still marked unevictable after unlock\n");
333 return 1;
334 }
335
336 if (is_vmflag_set((unsigned long)map, LOCKED)) {
337 printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
338 return 1;
339 }
340
341 if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
342 printf("VMA flag %s is present on page 2 after unlock\n", LOCKED);
343 return 1;
344 }
345
346 return 0;
347}
348
349static int test_mlock_lock()
350{
351 char *map;
352 int ret = 1;
353 unsigned long page_size = getpagesize();
354
355 map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
356 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
357 if (map == MAP_FAILED) {
358 perror("test_mlock_locked mmap");
359 goto out;
360 }
361
362 if (mlock2_(map, 2 * page_size, 0)) {
363 if (errno == ENOSYS) {
364 printf("Cannot call new mlock family, skipping test\n");
365 _exit(0);
366 }
367 perror("mlock2(0)");
368 goto unmap;
369 }
370
371 if (lock_check(map))
372 goto unmap;
373
374 /* Now unlock and recheck attributes */
375 if (munlock(map, 2 * page_size)) {
376 perror("munlock()");
377 goto unmap;
378 }
379
380 ret = unlock_lock_check(map);
381
382unmap:
383 munmap(map, 2 * page_size);
384out:
385 return ret;
386}
387
388static int onfault_check(char *map)
389{
390 unsigned long page_size = getpagesize();
391 uint64_t page1_flags, page2_flags;
392
393 page1_flags = get_pageflags((unsigned long)map);
394 page2_flags = get_pageflags((unsigned long)map + page_size);
395
396 /* Neither page should be present */
397 if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) {
398 printf("Pages were made present by MLOCK_ONFAULT\n");
399 return 1;
400 }
401
402 *map = 'a';
403 page1_flags = get_pageflags((unsigned long)map);
404 page2_flags = get_pageflags((unsigned long)map + page_size);
405
406 /* Only page 1 should be present */
407 if ((page1_flags & PRESENT_BIT) == 0) {
408 printf("Page 1 is not present after fault\n");
409 return 1;
410 } else if (page2_flags & PRESENT_BIT) {
411 printf("Page 2 was made present\n");
412 return 1;
413 }
414
415 page1_flags = get_kpageflags(page1_flags & PFN_MASK);
416
417 /* Page 1 should be unevictable */
418 if ((page1_flags & UNEVICTABLE_BIT) == 0) {
419 printf("Failed to make faulted page unevictable\n");
420 return 1;
421 }
422
423 if (!is_vma_lock_on_fault((unsigned long)map)) {
424 printf("VMA is not marked for lock on fault\n");
425 return 1;
426 }
427
428 if (!is_vma_lock_on_fault((unsigned long)map + page_size)) {
429 printf("VMA is not marked for lock on fault\n");
430 return 1;
431 }
432
433 return 0;
434}
435
436static int unlock_onfault_check(char *map)
437{
438 unsigned long page_size = getpagesize();
439 uint64_t page1_flags;
440
441 page1_flags = get_pageflags((unsigned long)map);
442 page1_flags = get_kpageflags(page1_flags & PFN_MASK);
443
444 if (page1_flags & UNEVICTABLE_BIT) {
445 printf("Page 1 is still marked unevictable after unlock\n");
446 return 1;
447 }
448
449 if (is_vma_lock_on_fault((unsigned long)map) ||
450 is_vma_lock_on_fault((unsigned long)map + page_size)) {
451 printf("VMA is still lock on fault after unlock\n");
452 return 1;
453 }
454
455 return 0;
456}
457
458static int test_mlock_onfault()
459{
460 char *map;
461 int ret = 1;
462 unsigned long page_size = getpagesize();
463
464 map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
465 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
466 if (map == MAP_FAILED) {
467 perror("test_mlock_locked mmap");
468 goto out;
469 }
470
471 if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
472 if (errno == ENOSYS) {
473 printf("Cannot call new mlock family, skipping test\n");
474 _exit(0);
475 }
476 perror("mlock2(MLOCK_ONFAULT)");
477 goto unmap;
478 }
479
480 if (onfault_check(map))
481 goto unmap;
482
483 /* Now unlock and recheck attributes */
484 if (munlock(map, 2 * page_size)) {
485 if (errno == ENOSYS) {
486 printf("Cannot call new mlock family, skipping test\n");
487 _exit(0);
488 }
489 perror("munlock()");
490 goto unmap;
491 }
492
493 ret = unlock_onfault_check(map);
494unmap:
495 munmap(map, 2 * page_size);
496out:
497 return ret;
498}
499
500static int test_lock_onfault_of_present()
501{
502 char *map;
503 int ret = 1;
504 unsigned long page_size = getpagesize();
505 uint64_t page1_flags, page2_flags;
506
507 map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
508 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
509 if (map == MAP_FAILED) {
510 perror("test_mlock_locked mmap");
511 goto out;
512 }
513
514 *map = 'a';
515
516 if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
517 if (errno == ENOSYS) {
518 printf("Cannot call new mlock family, skipping test\n");
519 _exit(0);
520 }
521 perror("mlock2(MLOCK_ONFAULT)");
522 goto unmap;
523 }
524
525 page1_flags = get_pageflags((unsigned long)map);
526 page2_flags = get_pageflags((unsigned long)map + page_size);
527 page1_flags = get_kpageflags(page1_flags & PFN_MASK);
528 page2_flags = get_kpageflags(page2_flags & PFN_MASK);
529
530 /* Page 1 should be unevictable */
531 if ((page1_flags & UNEVICTABLE_BIT) == 0) {
532 printf("Failed to make present page unevictable\n");
533 goto unmap;
534 }
535
536 if (!is_vma_lock_on_fault((unsigned long)map) ||
537 !is_vma_lock_on_fault((unsigned long)map + page_size)) {
538 printf("VMA with present pages is not marked lock on fault\n");
539 goto unmap;
540 }
541 ret = 0;
542unmap:
543 munmap(map, 2 * page_size);
544out:
545 return ret;
546}
547
548static int test_munlockall()
549{
550 char *map;
551 int ret = 1;
552 unsigned long page_size = getpagesize();
553
554 map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
555 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
556
557 if (map == MAP_FAILED) {
558 perror("test_munlockall mmap");
559 goto out;
560 }
561
562 if (mlockall(MCL_CURRENT)) {
563 perror("mlockall(MCL_CURRENT)");
564 goto out;
565 }
566
567 if (lock_check(map))
568 goto unmap;
569
570 if (munlockall()) {
571 perror("munlockall()");
572 goto unmap;
573 }
574
575 if (unlock_lock_check(map))
576 goto unmap;
577
578 munmap(map, 2 * page_size);
579
580 map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
581 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
582
583 if (map == MAP_FAILED) {
584 perror("test_munlockall second mmap");
585 goto out;
586 }
587
588 if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
589 perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
590 goto unmap;
591 }
592
593 if (onfault_check(map))
594 goto unmap;
595
596 if (munlockall()) {
597 perror("munlockall()");
598 goto unmap;
599 }
600
601 if (unlock_onfault_check(map))
602 goto unmap;
603
604 if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
605 perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
606		goto unmap;
607 }
608
609 if (lock_check(map))
610 goto unmap;
611
612 if (munlockall()) {
613 perror("munlockall()");
614 goto unmap;
615 }
616
617 ret = unlock_lock_check(map);
618
619unmap:
620 munmap(map, 2 * page_size);
621out:
622 munlockall();
623 return ret;
624}
625
626static int test_vma_management(bool call_mlock)
627{
628 int ret = 1;
629 void *map;
630 unsigned long page_size = getpagesize();
631 struct vm_boundaries page1;
632 struct vm_boundaries page2;
633 struct vm_boundaries page3;
634
635 map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
636 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
637 if (map == MAP_FAILED) {
638 perror("mmap()");
639 return ret;
640 }
641
642 if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
643 if (errno == ENOSYS) {
644 printf("Cannot call new mlock family, skipping test\n");
645 _exit(0);
646 }
647		perror("mlock2(MLOCK_ONFAULT)");
648 goto out;
649 }
650
651 if (get_vm_area((unsigned long)map, &page1) ||
652 get_vm_area((unsigned long)map + page_size, &page2) ||
653 get_vm_area((unsigned long)map + page_size * 2, &page3)) {
654 printf("couldn't find mapping in /proc/self/maps\n");
655 goto out;
656 }
657
658 /*
659	 * Before we unlock a portion, we need to check that all three pages are in
660	 * the same VMA.  If they are not, we abort this test (note that this is
661	 * not a failure)
662 */
663 if (page1.start != page2.start || page2.start != page3.start) {
664 printf("VMAs are not merged to start, aborting test\n");
665 ret = 0;
666 goto out;
667 }
668
669 if (munlock(map + page_size, page_size)) {
670 perror("munlock()");
671 goto out;
672 }
673
674 if (get_vm_area((unsigned long)map, &page1) ||
675 get_vm_area((unsigned long)map + page_size, &page2) ||
676 get_vm_area((unsigned long)map + page_size * 2, &page3)) {
677 printf("couldn't find mapping in /proc/self/maps\n");
678 goto out;
679 }
680
681 /* All three VMAs should be different */
682 if (page1.start == page2.start || page2.start == page3.start) {
683 printf("failed to split VMA for munlock\n");
684 goto out;
685 }
686
687 /* Now unlock the first and third page and check the VMAs again */
688 if (munlock(map, page_size * 3)) {
689 perror("munlock()");
690 goto out;
691 }
692
693 if (get_vm_area((unsigned long)map, &page1) ||
694 get_vm_area((unsigned long)map + page_size, &page2) ||
695 get_vm_area((unsigned long)map + page_size * 2, &page3)) {
696 printf("couldn't find mapping in /proc/self/maps\n");
697 goto out;
698 }
699
700 /* Now all three VMAs should be the same */
701 if (page1.start != page2.start || page2.start != page3.start) {
702 printf("failed to merge VMAs after munlock\n");
703 goto out;
704 }
705
706 ret = 0;
707out:
708 munmap(map, 3 * page_size);
709 return ret;
710}
711
712static int test_mlockall(int (test_function)(bool call_mlock))
713{
714 int ret = 1;
715
716 if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
717 perror("mlockall");
718 return ret;
719 }
720
721 ret = test_function(false);
722 munlockall();
723 return ret;
724}
725
726int main(int argc, char **argv)
727{
728 int ret = 0;
729 ret += test_mlock_lock();
730 ret += test_mlock_onfault();
731 ret += test_munlockall();
732 ret += test_lock_onfault_of_present();
733 ret += test_vma_management(true);
734 ret += test_mlockall(test_vma_management);
735 return ret;
736}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000000..245acccce42d
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,47 @@
1#include <sys/mman.h>
2#include <stdio.h>
3#include <unistd.h>
4#include <string.h>
5#include <sys/time.h>
6#include <sys/resource.h>
7
8#ifndef MCL_ONFAULT
9#define MCL_ONFAULT (MCL_FUTURE << 1)
10#endif
11
12static int test_limit(void)
13{
14 int ret = 1;
15 struct rlimit lims;
16 void *map;
17
18 if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
19 perror("getrlimit");
20 return ret;
21 }
22
23 if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
24 perror("mlockall");
25 return ret;
26 }
27
28 map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
29 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0);
30	if (map != MAP_FAILED) {
31		printf("mmap should have failed, but didn't\n");
32		munmap(map, 2 * lims.rlim_max);
33	} else {
34		ret = 0;
35	}
36
37 munlockall();
38 return ret;
39}
40
41int main(int argc, char **argv)
42{
43 int ret = 0;
44
45 ret += test_limit();
46 return ret;
47}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 9179ce8df485..2df21b3bb26d 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -106,4 +106,26 @@ else
106 echo "[PASS]" 106 echo "[PASS]"
107fi 107fi
108 108
109echo "--------------------"
110echo "running on-fault-limit"
111echo "--------------------"
112sudo -u nobody ./on-fault-limit
113if [ $? -ne 0 ]; then
114 echo "[FAIL]"
115 exitcode=1
116else
117 echo "[PASS]"
118fi
119
120echo "--------------------"
121echo "running mlock2-tests"
122echo "--------------------"
123./mlock2-tests
124if [ $? -ne 0 ]; then
125 echo "[FAIL]"
126 exitcode=1
127else
128 echo "[PASS]"
129fi
130
109exit $exitcode 131exit $exitcode
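
The hunk above is the only in-tree hook that runs the two new binaries. For quick iteration they can also be run by hand from the selftests directory; the sketch below simply mirrors what run_vmtests does, and the bare `make` build step is an assumption about the usual selftests flow rather than part of this patch.

# Build and run the new VM selftests directly (sketch; paths are taken
# from this patch, the build step is assumed).
cd tools/testing/selftests/vm
make

./mlock2-tests            # needs enough RLIMIT_MEMLOCK (or CAP_IPC_LOCK) to lock a few pages
echo "mlock2-tests: $?"

# on-fault-limit must run *without* CAP_IPC_LOCK so RLIMIT_MEMLOCK is
# actually enforced, which is why run_vmtests invokes it as 'nobody'.
sudo -u nobody ./on-fault-limit
echo "on-fault-limit: $?"
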
diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh
new file mode 100644
index 000000000000..35b039864b77
--- /dev/null
+++ b/tools/vm/slabinfo-gnuplot.sh
@@ -0,0 +1,275 @@
1#!/bin/sh
2
3# Sergey Senozhatsky, 2015
4# sergey.senozhatsky.work@gmail.com
5#
6# This software is licensed under the terms of the GNU General Public
7# License version 2, as published by the Free Software Foundation, and
8# may be copied, distributed, and modified under those terms.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14
15
16# This program is intended to plot `slabinfo -X' stats, collected,
17# for example, using the following command:
18# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
19#
20# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
21# and generate graphs (totals, slabs sorted by size, slabs sorted
22# by loss).
23#
24# Graphs can be [individually] regenerated with different ranges and
25# size (-r %d,%d and -s %d,%d options).
26#
27# To visually compare N `totals' graphs, do
28# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
29#
30
31min_slab_name_size=11
32xmin=0
33xmax=0
34width=1500
35height=700
36mode=preprocess
37
38usage()
39{
40 echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
41 echo "FILEs must contain 'slabinfo -X' samples"
42 echo "-t - plot totals for FILE(s)"
43 echo "-l - plot slabs stats for FILE(s)"
44 echo "-s %d,%d - set image width and height"
45 echo "-r %d,%d - use data samples from a given range"
46}
47
48check_file_exist()
49{
50 if [ ! -f "$1" ]; then
51 echo "File '$1' does not exist"
52 exit 1
53 fi
54}
55
56do_slabs_plotting()
57{
58 local file=$1
59 local out_file
60 local range="every ::$xmin"
61 local xtic=""
62 local xtic_rotate="norotate"
63 local lines=2000000
64 local wc_lines
65
66 check_file_exist "$file"
67
68 out_file=`basename "$file"`
69 if [ $xmax -ne 0 ]; then
70 range="$range::$xmax"
71 lines=$((xmax-xmin))
72 fi
73
74 wc_lines=`cat "$file" | wc -l`
75 if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
76 wc_lines=$lines
77 fi
78
79 if [ "$wc_lines" -lt "$lines" ]; then
80 lines=$wc_lines
81 fi
82
83 if [ $((width / lines)) -gt $min_slab_name_size ]; then
84 xtic=":xtic(1)"
85 xtic_rotate=90
86 fi
87
88gnuplot -p << EOF
89#!/usr/bin/env gnuplot
90
91set terminal png enhanced size $width,$height large
92set output '$out_file.png'
93set autoscale xy
94set xlabel 'samples'
95set ylabel 'bytes'
96set style histogram columnstacked title textcolor lt -1
97set style fill solid 0.15
98set xtics rotate $xtic_rotate
99set key left above Left title reverse
100
101plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
102 '' $range u 3 title 'LOSS' with boxes
103EOF
104
105 if [ $? -eq 0 ]; then
106 echo "$out_file.png"
107 fi
108}
109
110do_totals_plotting()
111{
112 local gnuplot_cmd=""
113 local range="every ::$xmin"
114 local file=""
115
116 if [ $xmax -ne 0 ]; then
117 range="$range::$xmax"
118 fi
119
120 for i in "${t_files[@]}"; do
121 check_file_exist "$i"
122
123 file="$file"`basename "$i"`
124 gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
125 '$i Memory usage' with lines,"
126 gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
127 '$i Loss' with lines,"
128 done
129
130gnuplot -p << EOF
131#!/usr/bin/env gnuplot
132
133set terminal png enhanced size $width,$height large
134set autoscale xy
135set output '$file.png'
136set xlabel 'samples'
137set ylabel 'bytes'
138set key left above Left title reverse
139
140plot $gnuplot_cmd
141EOF
142
143 if [ $? -eq 0 ]; then
144 echo "$file.png"
145 fi
146}
147
148do_preprocess()
149{
150 local out
151 local lines
152 local in=$1
153
154 check_file_exist "$in"
155
156 # use only 'TOP' slab (biggest memory usage or loss)
157 let lines=3
158 out=`basename "$in"`"-slabs-by-loss"
159 `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
160 egrep -iv '\-\-|Name|Slabs'\
161 | awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
162 if [ $? -eq 0 ]; then
163 do_slabs_plotting "$out"
164 fi
165
166 let lines=3
167 out=`basename "$in"`"-slabs-by-size"
168 `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
169 egrep -iv '\-\-|Name|Slabs'\
170 | awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
171 if [ $? -eq 0 ]; then
172 do_slabs_plotting "$out"
173 fi
174
175 out=`basename "$in"`"-totals"
176 `cat "$in" | grep "Memory used" |\
177 awk '{print $3" "$7}' > "$out"`
178 if [ $? -eq 0 ]; then
179 t_files[0]=$out
180 do_totals_plotting
181 fi
182}
183
184parse_opts()
185{
186 local opt
187
188 while getopts "tlr::s::h" opt; do
189 case $opt in
190 t)
191 mode=totals
192 ;;
193 l)
194 mode=slabs
195 ;;
196 s)
197 array=(${OPTARG//,/ })
198 width=${array[0]}
199 height=${array[1]}
200 ;;
201 r)
202 array=(${OPTARG//,/ })
203 xmin=${array[0]}
204 xmax=${array[1]}
205 ;;
206 h)
207 usage
208 exit 0
209 ;;
210 \?)
211 echo "Invalid option: -$OPTARG" >&2
212 exit 1
213 ;;
214 :)
215 echo "-$OPTARG requires an argument." >&2
216 exit 1
217 ;;
218 esac
219 done
220
221 return $OPTIND
222}
223
224parse_args()
225{
226 local idx=0
227 local p
228
229 for p in "$@"; do
230 case $mode in
231 preprocess)
232 files[$idx]=$p
233 idx=$idx+1
234 ;;
235 totals)
236 t_files[$idx]=$p
237 idx=$idx+1
238 ;;
239 slabs)
240 files[$idx]=$p
241 idx=$idx+1
242 ;;
243 esac
244 done
245}
246
247parse_opts "$@"
248argstart=$?
249parse_args "${@:$argstart}"
250
251if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
252 usage
253 exit 1
254fi
255
256case $mode in
257 preprocess)
258 for i in "${files[@]}"; do
259 do_preprocess "$i"
260 done
261 ;;
262 totals)
263 do_totals_plotting
264 ;;
265 slabs)
266 for i in "${files[@]}"; do
267 do_slabs_plotting "$i"
268 done
269 ;;
270 *)
271 echo "Unknown mode $mode" >&2
272 usage
273 exit 1
274 ;;
275esac
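
Putting the script's header comments together, a typical session might look like the sketch below (file names are examples only; `slabinfo` is assumed to have been built in tools/vm alongside this script).

# 1. Sample extended slabinfo output once per second for a while.
while :; do ./slabinfo -X >> stats; sleep 1; done    # interrupt with ^C

# 2. Pre-process and plot: writes stats-totals, stats-slabs-by-size and
#    stats-slabs-by-loss, plus a .png graph for each of them.
./slabinfo-gnuplot.sh stats

# 3. Re-plot one pre-processed file with a custom image size and sample range.
./slabinfo-gnuplot.sh -l -s 2000,800 -r 100,400 stats-slabs-by-loss

# 4. Visually compare the totals of two recorded sessions.
./slabinfo-gnuplot.sh -t stats-totals other-run-totals
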
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 808d5a9d5dcf..86e698d07e20 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -53,39 +53,43 @@ struct aliasinfo {
53 struct slabinfo *slab; 53 struct slabinfo *slab;
54} aliasinfo[MAX_ALIASES]; 54} aliasinfo[MAX_ALIASES];
55 55
56int slabs = 0; 56int slabs;
57int actual_slabs = 0; 57int actual_slabs;
58int aliases = 0; 58int aliases;
59int alias_targets = 0; 59int alias_targets;
60int highest_node = 0; 60int highest_node;
61 61
62char buffer[4096]; 62char buffer[4096];
63 63
64int show_empty = 0; 64int show_empty;
65int show_report = 0; 65int show_report;
66int show_alias = 0; 66int show_alias;
67int show_slab = 0; 67int show_slab;
68int skip_zero = 1; 68int skip_zero = 1;
69int show_numa = 0; 69int show_numa;
70int show_track = 0; 70int show_track;
71int show_first_alias = 0; 71int show_first_alias;
72int validate = 0; 72int validate;
73int shrink = 0; 73int shrink;
74int show_inverted = 0; 74int show_inverted;
75int show_single_ref = 0; 75int show_single_ref;
76int show_totals = 0; 76int show_totals;
77int sort_size = 0; 77int sort_size;
78int sort_active = 0; 78int sort_active;
79int set_debug = 0; 79int set_debug;
80int show_ops = 0; 80int show_ops;
81int show_activity = 0; 81int show_activity;
82int output_lines = -1;
83int sort_loss;
84int extended_totals;
85int show_bytes;
82 86
83/* Debug options */ 87/* Debug options */
84int sanity = 0; 88int sanity;
85int redzone = 0; 89int redzone;
86int poison = 0; 90int poison;
87int tracking = 0; 91int tracking;
88int tracing = 0; 92int tracing;
89 93
90int page_size; 94int page_size;
91 95
@@ -124,6 +128,10 @@ static void usage(void)
124 "-v|--validate Validate slabs\n" 128 "-v|--validate Validate slabs\n"
125 "-z|--zero Include empty slabs\n" 129 "-z|--zero Include empty slabs\n"
126 "-1|--1ref Single reference\n" 130 "-1|--1ref Single reference\n"
131 "-N|--lines=K Show the first K slabs\n"
132 "-L|--Loss Sort by loss\n"
133 "-X|--Xtotals Show extended summary information\n"
134 "-B|--Bytes Show size in bytes\n"
127 "\nValid debug options (FZPUT may be combined)\n" 135 "\nValid debug options (FZPUT may be combined)\n"
128 "a / A Switch on all debug options (=FZUP)\n" 136 "a / A Switch on all debug options (=FZUP)\n"
129 "- Switch off all debug options\n" 137 "- Switch off all debug options\n"
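
Taken together with the option parsing further down, the new switches compose roughly as in the examples below (the invocations are illustrations, not from the patch; `slabinfo` is assumed to be run from tools/vm after building it there).

./slabinfo -L -N 10     # the ten slabs wasting the most memory, sorted by loss
./slabinfo -S -B        # sort by size and print sizes as raw bytes (no K/M/G suffix)
./slabinfo -X           # extended summary: totals, then top slabs by size and by loss
./slabinfo -X -N 5      # as above, but list five slabs in each sorted section
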
@@ -225,15 +233,17 @@ static int store_size(char *buffer, unsigned long value)
225 char trailer = 0; 233 char trailer = 0;
226 int n; 234 int n;
227 235
228 if (value > 1000000000UL) { 236 if (!show_bytes) {
229 divisor = 100000000UL; 237 if (value > 1000000000UL) {
230 trailer = 'G'; 238 divisor = 100000000UL;
231 } else if (value > 1000000UL) { 239 trailer = 'G';
232 divisor = 100000UL; 240 } else if (value > 1000000UL) {
233 trailer = 'M'; 241 divisor = 100000UL;
234 } else if (value > 1000UL) { 242 trailer = 'M';
235 divisor = 100; 243 } else if (value > 1000UL) {
236 trailer = 'K'; 244 divisor = 100;
245 trailer = 'K';
246 }
237 } 247 }
238 248
239 value /= divisor; 249 value /= divisor;
@@ -297,10 +307,12 @@ int line = 0;
297static void first_line(void) 307static void first_line(void)
298{ 308{
299 if (show_activity) 309 if (show_activity)
300 printf("Name Objects Alloc Free %%Fast Fallb O CmpX UL\n"); 310 printf("Name Objects Alloc Free"
311 " %%Fast Fallb O CmpX UL\n");
301 else 312 else
302 printf("Name Objects Objsize Space " 313 printf("Name Objects Objsize %s "
303 "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); 314 "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n",
315 sort_loss ? " Loss" : "Space");
304} 316}
305 317
306/* 318/*
@@ -333,6 +345,11 @@ static unsigned long slab_activity(struct slabinfo *s)
333 s->alloc_slowpath + s->free_slowpath; 345 s->alloc_slowpath + s->free_slowpath;
334} 346}
335 347
348static unsigned long slab_waste(struct slabinfo *s)
349{
350 return slab_size(s) - s->objects * s->object_size;
351}
352
336static void slab_numa(struct slabinfo *s, int mode) 353static void slab_numa(struct slabinfo *s, int mode)
337{ 354{
338 int node; 355 int node;
@@ -504,7 +521,7 @@ static void report(struct slabinfo *s)
504 if (strcmp(s->name, "*") == 0) 521 if (strcmp(s->name, "*") == 0)
505 return; 522 return;
506 523
507 printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", 524 printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n",
508 s->name, s->aliases, s->order, s->objects); 525 s->name, s->aliases, s->order, s->objects);
509 if (s->hwcache_align) 526 if (s->hwcache_align)
510 printf("** Hardware cacheline aligned\n"); 527 printf("** Hardware cacheline aligned\n");
@@ -561,7 +578,10 @@ static void slabcache(struct slabinfo *s)
561 if (show_empty && s->slabs) 578 if (show_empty && s->slabs)
562 return; 579 return;
563 580
564 store_size(size_str, slab_size(s)); 581 if (sort_loss == 0)
582 store_size(size_str, slab_size(s));
583 else
584 store_size(size_str, slab_waste(s));
565 snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, 585 snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
566 s->partial, s->cpu_slabs); 586 s->partial, s->cpu_slabs);
567 587
@@ -602,15 +622,15 @@ static void slabcache(struct slabinfo *s)
602 total_free ? (s->free_fastpath * 100 / total_free) : 0, 622 total_free ? (s->free_fastpath * 100 / total_free) : 0,
603 s->order_fallback, s->order, s->cmpxchg_double_fail, 623 s->order_fallback, s->order, s->cmpxchg_double_fail,
604 s->cmpxchg_double_cpu_fail); 624 s->cmpxchg_double_cpu_fail);
605 } 625 } else {
606 else 626 printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
607 printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
608 s->name, s->objects, s->object_size, size_str, dist_str, 627 s->name, s->objects, s->object_size, size_str, dist_str,
609 s->objs_per_slab, s->order, 628 s->objs_per_slab, s->order,
610 s->slabs ? (s->partial * 100) / s->slabs : 100, 629 s->slabs ? (s->partial * 100) / s->slabs : 100,
611 s->slabs ? (s->objects * s->object_size * 100) / 630 s->slabs ? (s->objects * s->object_size * 100) /
612 (s->slabs * (page_size << s->order)) : 100, 631 (s->slabs * (page_size << s->order)) : 100,
613 flags); 632 flags);
633 }
614} 634}
615 635
616/* 636/*
@@ -918,84 +938,88 @@ static void totals(void)
918 938
919 printf("Slabcache Totals\n"); 939 printf("Slabcache Totals\n");
920 printf("----------------\n"); 940 printf("----------------\n");
921 printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n", 941 printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n",
922 slabs, aliases, alias_targets, used_slabs); 942 slabs, aliases, alias_targets, used_slabs);
923 943
924 store_size(b1, total_size);store_size(b2, total_waste); 944 store_size(b1, total_size);store_size(b2, total_waste);
925 store_size(b3, total_waste * 100 / total_used); 945 store_size(b3, total_waste * 100 / total_used);
926 printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3); 946 printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3);
927 947
928 store_size(b1, total_objects);store_size(b2, total_partobj); 948 store_size(b1, total_objects);store_size(b2, total_partobj);
929 store_size(b3, total_partobj * 100 / total_objects); 949 store_size(b3, total_partobj * 100 / total_objects);
930 printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3); 950 printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3);
931 951
932 printf("\n"); 952 printf("\n");
933 printf("Per Cache Average Min Max Total\n"); 953 printf("Per Cache Average "
934 printf("---------------------------------------------------------\n"); 954 "Min Max Total\n");
955 printf("---------------------------------------"
956 "-------------------------------------\n");
935 957
936 store_size(b1, avg_objects);store_size(b2, min_objects); 958 store_size(b1, avg_objects);store_size(b2, min_objects);
937 store_size(b3, max_objects);store_size(b4, total_objects); 959 store_size(b3, max_objects);store_size(b4, total_objects);
938 printf("#Objects %10s %10s %10s %10s\n", 960 printf("#Objects %15s %15s %15s %15s\n",
939 b1, b2, b3, b4); 961 b1, b2, b3, b4);
940 962
941 store_size(b1, avg_slabs);store_size(b2, min_slabs); 963 store_size(b1, avg_slabs);store_size(b2, min_slabs);
942 store_size(b3, max_slabs);store_size(b4, total_slabs); 964 store_size(b3, max_slabs);store_size(b4, total_slabs);
943 printf("#Slabs %10s %10s %10s %10s\n", 965 printf("#Slabs %15s %15s %15s %15s\n",
944 b1, b2, b3, b4); 966 b1, b2, b3, b4);
945 967
946 store_size(b1, avg_partial);store_size(b2, min_partial); 968 store_size(b1, avg_partial);store_size(b2, min_partial);
947 store_size(b3, max_partial);store_size(b4, total_partial); 969 store_size(b3, max_partial);store_size(b4, total_partial);
948 printf("#PartSlab %10s %10s %10s %10s\n", 970 printf("#PartSlab %15s %15s %15s %15s\n",
949 b1, b2, b3, b4); 971 b1, b2, b3, b4);
950 store_size(b1, avg_ppart);store_size(b2, min_ppart); 972 store_size(b1, avg_ppart);store_size(b2, min_ppart);
951 store_size(b3, max_ppart); 973 store_size(b3, max_ppart);
952 store_size(b4, total_partial * 100 / total_slabs); 974 store_size(b4, total_partial * 100 / total_slabs);
953 printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n", 975 printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
954 b1, b2, b3, b4); 976 b1, b2, b3, b4);
955 977
956 store_size(b1, avg_partobj);store_size(b2, min_partobj); 978 store_size(b1, avg_partobj);store_size(b2, min_partobj);
957 store_size(b3, max_partobj); 979 store_size(b3, max_partobj);
958 store_size(b4, total_partobj); 980 store_size(b4, total_partobj);
959 printf("PartObjs %10s %10s %10s %10s\n", 981 printf("PartObjs %15s %15s %15s %15s\n",
960 b1, b2, b3, b4); 982 b1, b2, b3, b4);
961 983
962 store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); 984 store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
963 store_size(b3, max_ppartobj); 985 store_size(b3, max_ppartobj);
964 store_size(b4, total_partobj * 100 / total_objects); 986 store_size(b4, total_partobj * 100 / total_objects);
965 printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n", 987 printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
966 b1, b2, b3, b4); 988 b1, b2, b3, b4);
967 989
968 store_size(b1, avg_size);store_size(b2, min_size); 990 store_size(b1, avg_size);store_size(b2, min_size);
969 store_size(b3, max_size);store_size(b4, total_size); 991 store_size(b3, max_size);store_size(b4, total_size);
970 printf("Memory %10s %10s %10s %10s\n", 992 printf("Memory %15s %15s %15s %15s\n",
971 b1, b2, b3, b4); 993 b1, b2, b3, b4);
972 994
973 store_size(b1, avg_used);store_size(b2, min_used); 995 store_size(b1, avg_used);store_size(b2, min_used);
974 store_size(b3, max_used);store_size(b4, total_used); 996 store_size(b3, max_used);store_size(b4, total_used);
975 printf("Used %10s %10s %10s %10s\n", 997 printf("Used %15s %15s %15s %15s\n",
976 b1, b2, b3, b4); 998 b1, b2, b3, b4);
977 999
978 store_size(b1, avg_waste);store_size(b2, min_waste); 1000 store_size(b1, avg_waste);store_size(b2, min_waste);
979 store_size(b3, max_waste);store_size(b4, total_waste); 1001 store_size(b3, max_waste);store_size(b4, total_waste);
980 printf("Loss %10s %10s %10s %10s\n", 1002 printf("Loss %15s %15s %15s %15s\n",
981 b1, b2, b3, b4); 1003 b1, b2, b3, b4);
982 1004
983 printf("\n"); 1005 printf("\n");
984 printf("Per Object Average Min Max\n"); 1006 printf("Per Object Average "
985 printf("---------------------------------------------\n"); 1007 "Min Max\n");
1008 printf("---------------------------------------"
1009 "--------------------\n");
986 1010
987 store_size(b1, avg_memobj);store_size(b2, min_memobj); 1011 store_size(b1, avg_memobj);store_size(b2, min_memobj);
988 store_size(b3, max_memobj); 1012 store_size(b3, max_memobj);
989 printf("Memory %10s %10s %10s\n", 1013 printf("Memory %15s %15s %15s\n",
990 b1, b2, b3); 1014 b1, b2, b3);
991 store_size(b1, avg_objsize);store_size(b2, min_objsize); 1015 store_size(b1, avg_objsize);store_size(b2, min_objsize);
992 store_size(b3, max_objsize); 1016 store_size(b3, max_objsize);
993 printf("User %10s %10s %10s\n", 1017 printf("User %15s %15s %15s\n",
994 b1, b2, b3); 1018 b1, b2, b3);
995 1019
996 store_size(b1, avg_objwaste);store_size(b2, min_objwaste); 1020 store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
997 store_size(b3, max_objwaste); 1021 store_size(b3, max_objwaste);
998 printf("Loss %10s %10s %10s\n", 1022 printf("Loss %15s %15s %15s\n",
999 b1, b2, b3); 1023 b1, b2, b3);
1000} 1024}
1001 1025
@@ -1011,6 +1035,8 @@ static void sort_slabs(void)
1011 result = slab_size(s1) < slab_size(s2); 1035 result = slab_size(s1) < slab_size(s2);
1012 else if (sort_active) 1036 else if (sort_active)
1013 result = slab_activity(s1) < slab_activity(s2); 1037 result = slab_activity(s1) < slab_activity(s2);
1038 else if (sort_loss)
1039 result = slab_waste(s1) < slab_waste(s2);
1014 else 1040 else
1015 result = strcasecmp(s1->name, s2->name); 1041 result = strcasecmp(s1->name, s2->name);
1016 1042
@@ -1095,7 +1121,7 @@ static void alias(void)
1095 active = a->slab->name; 1121 active = a->slab->name;
1096 } 1122 }
1097 else 1123 else
1098 printf("%-20s -> %s\n", a->name, a->slab->name); 1124 printf("%-15s -> %s\n", a->name, a->slab->name);
1099 } 1125 }
1100 if (active) 1126 if (active)
1101 printf("\n"); 1127 printf("\n");
@@ -1241,12 +1267,16 @@ static void read_slab_dir(void)
1241static void output_slabs(void) 1267static void output_slabs(void)
1242{ 1268{
1243 struct slabinfo *slab; 1269 struct slabinfo *slab;
1270 int lines = output_lines;
1244 1271
1245 for (slab = slabinfo; slab < slabinfo + slabs; slab++) { 1272 for (slab = slabinfo; (slab < slabinfo + slabs) &&
1273 lines != 0; slab++) {
1246 1274
1247 if (slab->alias) 1275 if (slab->alias)
1248 continue; 1276 continue;
1249 1277
1278 if (lines != -1)
1279 lines--;
1250 1280
1251 if (show_numa) 1281 if (show_numa)
1252 slab_numa(slab, 0); 1282 slab_numa(slab, 0);
@@ -1267,24 +1297,54 @@ static void output_slabs(void)
1267 } 1297 }
1268} 1298}
1269 1299
1300static void xtotals(void)
1301{
1302 totals();
1303
1304 link_slabs();
1305 rename_slabs();
1306
1307 printf("\nSlabs sorted by size\n");
1308 printf("--------------------\n");
1309 sort_loss = 0;
1310 sort_size = 1;
1311 sort_slabs();
1312 output_slabs();
1313
1314 printf("\nSlabs sorted by loss\n");
1315 printf("--------------------\n");
1316 line = 0;
1317 sort_loss = 1;
1318 sort_size = 0;
1319 sort_slabs();
1320 output_slabs();
1321 printf("\n");
1322}
1323
1270struct option opts[] = { 1324struct option opts[] = {
1271 { "aliases", 0, NULL, 'a' }, 1325 { "aliases", no_argument, NULL, 'a' },
1272 { "activity", 0, NULL, 'A' }, 1326 { "activity", no_argument, NULL, 'A' },
1273 { "debug", 2, NULL, 'd' }, 1327 { "debug", optional_argument, NULL, 'd' },
1274 { "display-activity", 0, NULL, 'D' }, 1328 { "display-activity", no_argument, NULL, 'D' },
1275 { "empty", 0, NULL, 'e' }, 1329 { "empty", no_argument, NULL, 'e' },
1276 { "first-alias", 0, NULL, 'f' }, 1330 { "first-alias", no_argument, NULL, 'f' },
1277 { "help", 0, NULL, 'h' }, 1331 { "help", no_argument, NULL, 'h' },
1278 { "inverted", 0, NULL, 'i'}, 1332 { "inverted", no_argument, NULL, 'i'},
1279 { "numa", 0, NULL, 'n' }, 1333 { "slabs", no_argument, NULL, 'l' },
1280 { "ops", 0, NULL, 'o' }, 1334 { "numa", no_argument, NULL, 'n' },
1281 { "report", 0, NULL, 'r' }, 1335 { "ops", no_argument, NULL, 'o' },
1282 { "shrink", 0, NULL, 's' }, 1336 { "shrink", no_argument, NULL, 's' },
1283 { "slabs", 0, NULL, 'l' }, 1337 { "report", no_argument, NULL, 'r' },
1284 { "track", 0, NULL, 't'}, 1338 { "Size", no_argument, NULL, 'S'},
1285 { "validate", 0, NULL, 'v' }, 1339 { "tracking", no_argument, NULL, 't'},
1286 { "zero", 0, NULL, 'z' }, 1340 { "Totals", no_argument, NULL, 'T'},
1287 { "1ref", 0, NULL, '1'}, 1341 { "validate", no_argument, NULL, 'v' },
1342 { "zero", no_argument, NULL, 'z' },
1343 { "1ref", no_argument, NULL, '1'},
1344 { "lines", required_argument, NULL, 'N'},
1345 { "Loss", no_argument, NULL, 'L'},
1346 { "Xtotals", no_argument, NULL, 'X'},
1347 { "Bytes", no_argument, NULL, 'B'},
1288 { NULL, 0, NULL, 0 } 1348 { NULL, 0, NULL, 0 }
1289}; 1349};
1290 1350
@@ -1296,7 +1356,7 @@ int main(int argc, char *argv[])
1296 1356
1297 page_size = getpagesize(); 1357 page_size = getpagesize();
1298 1358
1299 while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS", 1359 while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB",
1300 opts, NULL)) != -1) 1360 opts, NULL)) != -1)
1301 switch (c) { 1361 switch (c) {
1302 case '1': 1362 case '1':
@@ -1358,7 +1418,25 @@ int main(int argc, char *argv[])
1358 case 'S': 1418 case 'S':
1359 sort_size = 1; 1419 sort_size = 1;
1360 break; 1420 break;
1361 1421 case 'N':
1422 if (optarg) {
1423 output_lines = atoi(optarg);
1424 if (output_lines < 1)
1425 output_lines = 1;
1426 }
1427 break;
1428 case 'L':
1429 sort_loss = 1;
1430 break;
1431 case 'X':
1432 if (output_lines == -1)
1433 output_lines = 1;
1434 extended_totals = 1;
1435 show_bytes = 1;
1436 break;
1437 case 'B':
1438 show_bytes = 1;
1439 break;
1362 default: 1440 default:
1363 fatal("%s: Invalid option '%c'\n", argv[0], optopt); 1441 fatal("%s: Invalid option '%c'\n", argv[0], optopt);
1364 1442
@@ -1378,12 +1456,13 @@ int main(int argc, char *argv[])
1378 fatal("%s: Invalid pattern '%s' code %d\n", 1456 fatal("%s: Invalid pattern '%s' code %d\n",
1379 argv[0], pattern_source, err); 1457 argv[0], pattern_source, err);
1380 read_slab_dir(); 1458 read_slab_dir();
1381 if (show_alias) 1459 if (show_alias) {
1382 alias(); 1460 alias();
1383 else 1461 } else if (extended_totals) {
1384 if (show_totals) 1462 xtotals();
1463 } else if (show_totals) {
1385 totals(); 1464 totals();
1386 else { 1465 } else {
1387 link_slabs(); 1466 link_slabs();
1388 rename_slabs(); 1467 rename_slabs();
1389 sort_slabs(); 1468 sort_slabs();