author     Linus Torvalds <torvalds@linux-foundation.org>   2015-11-06 02:10:54 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-11-06 02:10:54 -0500
commit     2e3078af2c67730c479f1d183af5b367f5d95337 (patch)
tree       b7881c6c9c479aadac345df7e18e3c0e10f0811e
parent     ea5c58e70c3a148ada0d3061a8f529589bb766ba (diff)
parent     b3b0d09c7a2330759ac293f5269bd932439ea0ff (diff)
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton:
- inotify tweaks
- some ocfs2 updates (many more are awaiting review)
- various misc bits
- kernel/watchdog.c updates
- Some of mm. I have a huge number of MM patches this time and quite a
lot of it is quite difficult and much will be held over to next time.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits)
selftests: vm: add tests for lock on fault
mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage
mm: introduce VM_LOCKONFAULT
mm: mlock: add new mlock system call
mm: mlock: refactor mlock, munlock, and munlockall code
kasan: always taint kernel on report
mm, slub, kasan: enable user tracking by default with KASAN=y
kasan: use IS_ALIGNED in memory_is_poisoned_8()
kasan: Fix a type conversion error
lib: test_kasan: add some testcases
kasan: update reference to kasan prototype repo
kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile
kasan: various fixes in documentation
kasan: update log messages
kasan: accurately determine the type of the bad access
kasan: update reported bug types for kernel memory accesses
kasan: update reported bug types for not user nor kernel memory accesses
mm/kasan: prevent deadlock in kasan reporting
mm/kasan: don't use kasan shadow pointer in generic functions
mm/kasan: MODULE_VADDR is not available on all archs
...
127 files changed, 3093 insertions, 1351 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 3a9d65c912e7..1e4a6cc1b6ea 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -175,6 +175,7 @@ read the file /proc/PID/status: | |||
175 | VmLib: 1412 kB | 175 | VmLib: 1412 kB |
176 | VmPTE: 20 kb | 176 | VmPTE: 20 kb |
177 | VmSwap: 0 kB | 177 | VmSwap: 0 kB |
178 | HugetlbPages: 0 kB | ||
178 | Threads: 1 | 179 | Threads: 1 |
179 | SigQ: 0/28578 | 180 | SigQ: 0/28578 |
180 | SigPnd: 0000000000000000 | 181 | SigPnd: 0000000000000000 |
@@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1) | |||
238 | VmPTE size of page table entries | 239 | VmPTE size of page table entries |
239 | VmPMD size of second level page tables | 240 | VmPMD size of second level page tables |
240 | VmSwap size of swap usage (the number of referred swapents) | 241 | VmSwap size of swap usage (the number of referred swapents) |
242 | HugetlbPages size of hugetlb memory portions | ||
241 | Threads number of threads | 243 | Threads number of threads |
242 | SigQ number of signals queued/max. number for queue | 244 | SigQ number of signals queued/max. number for queue |
243 | SigPnd bitmap of pending signals for the thread | 245 | SigPnd bitmap of pending signals for the thread |
@@ -424,12 +426,15 @@ Private_Clean: 0 kB | |||
424 | Private_Dirty: 0 kB | 426 | Private_Dirty: 0 kB |
425 | Referenced: 892 kB | 427 | Referenced: 892 kB |
426 | Anonymous: 0 kB | 428 | Anonymous: 0 kB |
429 | AnonHugePages: 0 kB | ||
430 | Shared_Hugetlb: 0 kB | ||
431 | Private_Hugetlb: 0 kB | ||
427 | Swap: 0 kB | 432 | Swap: 0 kB |
428 | SwapPss: 0 kB | 433 | SwapPss: 0 kB |
429 | KernelPageSize: 4 kB | 434 | KernelPageSize: 4 kB |
430 | MMUPageSize: 4 kB | 435 | MMUPageSize: 4 kB |
431 | Locked: 374 kB | 436 | Locked: 0 kB |
432 | VmFlags: rd ex mr mw me de | 437 | VmFlags: rd ex mr mw me dw |
433 | 438 | ||
434 | the first of these lines shows the same information as is displayed for the | 439 | the first of these lines shows the same information as is displayed for the |
435 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping | 440 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping |
@@ -449,9 +454,14 @@ accessed. | |||
449 | "Anonymous" shows the amount of memory that does not belong to any file. Even | 454 | "Anonymous" shows the amount of memory that does not belong to any file. Even |
450 | a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE | 455 | a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE |
451 | and a page is modified, the file page is replaced by a private anonymous copy. | 456 | and a page is modified, the file page is replaced by a private anonymous copy. |
452 | "Swap" shows how much would-be-anonymous memory is also used, but out on | 457 | "AnonHugePages" shows the ammount of memory backed by transparent hugepage. |
453 | swap. | 458 | "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by |
459 | hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical | ||
460 | reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. | ||
461 | "Swap" shows how much would-be-anonymous memory is also used, but out on swap. | ||
454 | "SwapPss" shows proportional swap share of this mapping. | 462 | "SwapPss" shows proportional swap share of this mapping. |
463 | "Locked" indicates whether the mapping is locked in memory or not. | ||
464 | |||
455 | "VmFlags" field deserves a separate description. This member represents the kernel | 465 | "VmFlags" field deserves a separate description. This member represents the kernel |
456 | flags associated with the particular virtual memory area in two letter encoded | 466 | flags associated with the particular virtual memory area in two letter encoded |
457 | manner. The codes are the following: | 467 | manner. The codes are the following: |
@@ -475,7 +485,6 @@ manner. The codes are the following: | |||
475 | ac - area is accountable | 485 | ac - area is accountable |
476 | nr - swap space is not reserved for the area | 486 | nr - swap space is not reserved for the area |
477 | ht - area uses huge tlb pages | 487 | ht - area uses huge tlb pages |
478 | nl - non-linear mapping | ||
479 | ar - architecture specific flag | 488 | ar - architecture specific flag |
480 | dd - do not include area into core dump | 489 | dd - do not include area into core dump |
481 | sd - soft-dirty flag | 490 | sd - soft-dirty flag |
@@ -815,9 +824,6 @@ varies by architecture and compile options. The following is from a | |||
815 | 824 | ||
816 | > cat /proc/meminfo | 825 | > cat /proc/meminfo |
817 | 826 | ||
818 | The "Locked" indicates whether the mapping is locked in memory or not. | ||
819 | |||
820 | |||
821 | MemTotal: 16344972 kB | 827 | MemTotal: 16344972 kB |
822 | MemFree: 13634064 kB | 828 | MemFree: 13634064 kB |
823 | MemAvailable: 14836172 kB | 829 | MemAvailable: 14836172 kB |
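The new HugetlbPages line in /proc/PID/status (and the Shared_Hugetlb/Private_Hugetlb lines in smaps) is read like any other status field. A minimal userspace sketch, not part of this patch, that prints the new counter alongside VmSwap:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* "HugetlbPages:" is the counter added by this series */
		if (!strncmp(line, "HugetlbPages:", 13) ||
		    !strncmp(line, "VmSwap:", 7))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}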
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index 0d32355a4c34..aa1e0c91e368 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt | |||
@@ -1,36 +1,34 @@ | |||
1 | Kernel address sanitizer | 1 | KernelAddressSanitizer (KASAN) |
2 | ================ | 2 | ============================== |
3 | 3 | ||
4 | 0. Overview | 4 | 0. Overview |
5 | =========== | 5 | =========== |
6 | 6 | ||
7 | Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides | 7 | KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides |
8 | a fast and comprehensive solution for finding use-after-free and out-of-bounds | 8 | a fast and comprehensive solution for finding use-after-free and out-of-bounds |
9 | bugs. | 9 | bugs. |
10 | 10 | ||
11 | KASan uses compile-time instrumentation for checking every memory access, | 11 | KASAN uses compile-time instrumentation for checking every memory access, |
12 | therefore you will need a gcc version of 4.9.2 or later. KASan could detect out | 12 | therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is |
13 | of bounds accesses to stack or global variables, but only if gcc 5.0 or later was | 13 | required for detection of out-of-bounds accesses to stack or global variables. |
14 | used to built the kernel. | ||
15 | 14 | ||
16 | Currently KASan is supported only for x86_64 architecture and requires that the | 15 | Currently KASAN is supported only for x86_64 architecture and requires the |
17 | kernel be built with the SLUB allocator. | 16 | kernel to be built with the SLUB allocator. |
18 | 17 | ||
19 | 1. Usage | 18 | 1. Usage |
20 | ========= | 19 | ======== |
21 | 20 | ||
22 | To enable KASAN configure kernel with: | 21 | To enable KASAN configure kernel with: |
23 | 22 | ||
24 | CONFIG_KASAN = y | 23 | CONFIG_KASAN = y |
25 | 24 | ||
26 | and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline | 25 | and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and |
27 | is compiler instrumentation types. The former produces smaller binary the | 26 | inline are compiler instrumentation types. The former produces a smaller binary, |
28 | latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version | 27 | while the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC |
29 | of 5.0 or later. | 28 | version 5.0 or later. |
30 | 29 | ||
31 | Currently KASAN works only with the SLUB memory allocator. | 30 | Currently KASAN works only with the SLUB memory allocator. |
32 | For better bug detection and nicer report, enable CONFIG_STACKTRACE and put | 31 | For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. |
33 | at least 'slub_debug=U' in the boot cmdline. | ||
34 | 32 | ||
35 | To disable instrumentation for specific files or directories, add a line | 33 | To disable instrumentation for specific files or directories, add a line |
36 | similar to the following to the respective kernel Makefile: | 34 | similar to the following to the respective kernel Makefile: |
@@ -42,7 +40,7 @@ similar to the following to the respective kernel Makefile: | |||
42 | KASAN_SANITIZE := n | 40 | KASAN_SANITIZE := n |
43 | 41 | ||
44 | 1.1 Error reports | 42 | 1.1 Error reports |
45 | ========== | 43 | ================= |
46 | 44 | ||
47 | A typical out of bounds access report looks like this: | 45 | A typical out of bounds access report looks like this: |
48 | 46 | ||
@@ -119,14 +117,16 @@ Memory state around the buggy address: | |||
119 | ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | 117 | ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb |
120 | ================================================================== | 118 | ================================================================== |
121 | 119 | ||
122 | First sections describe slub object where bad access happened. | 120 | The header of the report describes what kind of bug happened and what kind of |
123 | See 'SLUB Debug output' section in Documentation/vm/slub.txt for details. | 121 | access caused it. It's followed by the description of the accessed slub object |
122 | (see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and | ||
123 | the description of the accessed memory page. | ||
124 | 124 | ||
125 | In the last section the report shows memory state around the accessed address. | 125 | In the last section the report shows memory state around the accessed address. |
126 | Reading this part requires some more understanding of how KASAN works. | 126 | Reading this part requires some understanding of how KASAN works. |
127 | 127 | ||
128 | Each 8 bytes of memory are encoded in one shadow byte as accessible, | 128 | The state of each 8 aligned bytes of memory is encoded in one shadow byte. |
129 | partially accessible, freed or they can be part of a redzone. | 129 | Those 8 bytes can be accessible, partially accessible, freed or be a redzone. |
130 | We use the following encoding for each shadow byte: 0 means that all 8 bytes | 130 | We use the following encoding for each shadow byte: 0 means that all 8 bytes |
131 | of the corresponding memory region are accessible; number N (1 <= N <= 7) means | 131 | of the corresponding memory region are accessible; number N (1 <= N <= 7) means |
132 | that the first N bytes are accessible, and other (8 - N) bytes are not; | 132 | that the first N bytes are accessible, and other (8 - N) bytes are not; |
@@ -139,7 +139,7 @@ the accessed address is partially accessible. | |||
139 | 139 | ||
140 | 140 | ||
141 | 2. Implementation details | 141 | 2. Implementation details |
142 | ======================== | 142 | ========================= |
143 | 143 | ||
144 | From a high level, our approach to memory error detection is similar to that | 144 | From a high level, our approach to memory error detection is similar to that |
145 | of kmemcheck: use shadow memory to record whether each byte of memory is safe | 145 | of kmemcheck: use shadow memory to record whether each byte of memory is safe |
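The shadow-byte encoding described in the error-report section above can be summarized in a few lines of C. This is an illustrative sketch only; byte_is_accessible() is not a kernel function, and the actual shadow lookup (address >> 3 plus a per-configuration offset) is omitted:

#include <stdbool.h>
#include <stdint.h>

/* Classify a 1-byte access at 'addr' given the shadow byte covering its
 * 8-byte region, following the encoding described above. */
static bool byte_is_accessible(uintptr_t addr, int8_t shadow)
{
	if (shadow == 0)	/* all 8 bytes of the region are accessible */
		return true;
	if (shadow < 0)		/* negative values: freed memory or a redzone */
		return false;
	/* 1 <= shadow <= 7: only the first 'shadow' bytes are accessible */
	return (addr & 7) < shadow;
}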
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 816bf2fe55f5..84c0214b64a7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1275,6 +1275,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1275 | Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0. | 1275 | Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0. |
1276 | Default: 1024 | 1276 | Default: 1024 |
1277 | 1277 | ||
1278 | hardlockup_all_cpu_backtrace= | ||
1279 | [KNL] Should the hard-lockup detector generate | ||
1280 | backtraces on all cpus. | ||
1281 | Format: <integer> | ||
1282 | |||
1278 | hashdist= [KNL,NUMA] Large hashes allocated during boot | 1283 | hashdist= [KNL,NUMA] Large hashes allocated during boot |
1279 | are distributed across NUMA nodes. Defaults on | 1284 | are distributed across NUMA nodes. Defaults on |
1280 | for 64-bit NUMA, off otherwise. | 1285 | for 64-bit NUMA, off otherwise. |
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt index 22dd6af2e4bd..4a6e33e1af61 100644 --- a/Documentation/lockup-watchdogs.txt +++ b/Documentation/lockup-watchdogs.txt | |||
@@ -20,8 +20,9 @@ kernel mode for more than 10 seconds (see "Implementation" below for | |||
20 | details), without letting other interrupts have a chance to run. | 20 | details), without letting other interrupts have a chance to run. |
21 | Similarly to the softlockup case, the current stack trace is displayed | 21 | Similarly to the softlockup case, the current stack trace is displayed |
22 | upon detection and the system will stay locked up unless the default | 22 | upon detection and the system will stay locked up unless the default |
23 | behavior is changed, which can be done through a compile time knob, | 23 | behavior is changed, which can be done through a sysctl, |
24 | "BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog" | 24 | 'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", |
25 | and a kernel parameter, "nmi_watchdog" | ||
25 | (see "Documentation/kernel-parameters.txt" for details). | 26 | (see "Documentation/kernel-parameters.txt" for details). |
26 | 27 | ||
27 | The panic option can be used in combination with panic_timeout (this | 28 | The panic option can be used in combination with panic_timeout (this |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6fccb69c03e7..af70d1541d3a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel: | |||
33 | - domainname | 33 | - domainname |
34 | - hostname | 34 | - hostname |
35 | - hotplug | 35 | - hotplug |
36 | - hardlockup_all_cpu_backtrace | ||
36 | - hung_task_panic | 37 | - hung_task_panic |
37 | - hung_task_check_count | 38 | - hung_task_check_count |
38 | - hung_task_timeout_secs | 39 | - hung_task_timeout_secs |
@@ -293,6 +294,17 @@ domain names are in general different. For a detailed discussion | |||
293 | see the hostname(1) man page. | 294 | see the hostname(1) man page. |
294 | 295 | ||
295 | ============================================================== | 296 | ============================================================== |
297 | hardlockup_all_cpu_backtrace: | ||
298 | |||
299 | This value controls the hard lockup detector behavior when a hard | ||
300 | lockup condition is detected as to whether or not to gather further | ||
301 | debug information. If enabled, arch-specific all-CPU stack dumping | ||
302 | will be initiated. | ||
303 | |||
304 | 0: do nothing. This is the default behavior. | ||
305 | |||
306 | 1: on detection capture more debug information. | ||
307 | ============================================================== | ||
296 | 308 | ||
297 | hotplug: | 309 | hotplug: |
298 | 310 | ||
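As with the other watchdog sysctls, the new knob can be flipped at runtime by writing to its /proc/sys file. A hedged sketch (requires root; the file exists only when the hard-lockup detector is available):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/hardlockup_all_cpu_backtrace", "w");

	if (!f) {
		perror("hardlockup_all_cpu_backtrace");
		return 1;
	}
	fputs("1\n", f);	/* 1: dump all-CPU backtraces on a hard lockup */
	return fclose(f) ? 1 : 0;
}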
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 6513fe2d90b8..fea5c0864170 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration | |||
@@ -92,29 +92,26 @@ Steps: | |||
92 | 92 | ||
93 | 2. Insure that writeback is complete. | 93 | 2. Insure that writeback is complete. |
94 | 94 | ||
95 | 3. Prep the new page that we want to move to. It is locked | 95 | 3. Lock the new page that we want to move to. It is locked so that accesses to |
96 | and set to not being uptodate so that all accesses to the new | 96 | this (not yet uptodate) page immediately lock while the move is in progress. |
97 | page immediately lock while the move is in progress. | ||
98 | 97 | ||
99 | 4. The new page is prepped with some settings from the old page so that | 98 | 4. All the page table references to the page are converted to migration |
100 | accesses to the new page will discover a page with the correct settings. | 99 | entries. This decreases the mapcount of a page. If the resulting |
101 | 100 | mapcount is not zero then we do not migrate the page. All user space | |
102 | 5. All the page table references to the page are converted | 101 | processes that attempt to access the page will now wait on the page lock. |
103 | to migration entries or dropped (nonlinear vmas). | ||
104 | This decrease the mapcount of a page. If the resulting | ||
105 | mapcount is not zero then we do not migrate the page. | ||
106 | All user space processes that attempt to access the page | ||
107 | will now wait on the page lock. | ||
108 | 102 | ||
109 | 6. The radix tree lock is taken. This will cause all processes trying | 103 | 5. The radix tree lock is taken. This will cause all processes trying |
110 | to access the page via the mapping to block on the radix tree spinlock. | 104 | to access the page via the mapping to block on the radix tree spinlock. |
111 | 105 | ||
112 | 7. The refcount of the page is examined and we back out if references remain | 106 | 6. The refcount of the page is examined and we back out if references remain |
113 | otherwise we know that we are the only one referencing this page. | 107 | otherwise we know that we are the only one referencing this page. |
114 | 108 | ||
115 | 8. The radix tree is checked and if it does not contain the pointer to this | 109 | 7. The radix tree is checked and if it does not contain the pointer to this |
116 | page then we back out because someone else modified the radix tree. | 110 | page then we back out because someone else modified the radix tree. |
117 | 111 | ||
112 | 8. The new page is prepped with some settings from the old page so that | ||
113 | accesses to the new page will discover a page with the correct settings. | ||
114 | |||
118 | 9. The radix tree is changed to point to the new page. | 115 | 9. The radix tree is changed to point to the new page. |
119 | 116 | ||
120 | 10. The reference count of the old page is dropped because the radix tree | 117 | 10. The reference count of the old page is dropped because the radix tree |
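The migration sequence above is driven from inside the kernel, but it can be exercised from userspace; one way, not part of this patch, is the move_pages(2) syscall from libnuma (link with -lnuma; the target node 0 below is only an assumption for illustration):

#include <numaif.h>	/* move_pages(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	void *page;
	int node = 0;		/* assumed target node */
	int status = -1;

	if (posix_memalign(&page, pagesize, pagesize))
		return 1;
	*(volatile char *)page = 1;	/* fault the page in first */

	/* pid 0 means the calling process; migrate one page to 'node' */
	if (move_pages(0, 1, &page, &node, &status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page now on node %d\n", status);
	free(page);
	return 0;
}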
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 8143b9e8373d..8a282687ee06 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt | |||
@@ -170,6 +170,16 @@ A lower value leads to gain less thp performance. Value of | |||
170 | max_ptes_none can waste cpu time very little, you can | 170 | max_ptes_none can waste cpu time very little, you can |
171 | ignore it. | 171 | ignore it. |
172 | 172 | ||
173 | max_ptes_swap specifies how many pages can be brought in from | ||
174 | swap when collapsing a group of pages into a transparent huge page. | ||
175 | |||
176 | /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap | ||
177 | |||
178 | A higher value can cause excessive swap IO and waste | ||
179 | memory. A lower value can prevent THPs from being | ||
180 | collapsed, resulting in fewer pages being collapsed into | ||
181 | THPs, and lower memory access performance. | ||
182 | |||
173 | == Boot parameter == | 183 | == Boot parameter == |
174 | 184 | ||
175 | You can change the sysfs boot time defaults of Transparent Hugepage | 185 | You can change the sysfs boot time defaults of Transparent Hugepage |
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 32ee3a67dba2..fa3b527086fa 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt | |||
@@ -531,83 +531,20 @@ map. | |||
531 | 531 | ||
532 | try_to_unmap() is always called, by either vmscan for reclaim or for page | 532 | try_to_unmap() is always called, by either vmscan for reclaim or for page |
533 | migration, with the argument page locked and isolated from the LRU. Separate | 533 | migration, with the argument page locked and isolated from the LRU. Separate |
534 | functions handle anonymous and mapped file pages, as these types of pages have | 534 | functions handle anonymous and mapped file and KSM pages, as these types of |
535 | different reverse map mechanisms. | 535 | pages have different reverse map lookup mechanisms, with different locking. |
536 | 536 | In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(), | |
537 | (*) try_to_unmap_anon() | 537 | it will call try_to_unmap_one() for every VMA which might contain the page. |
538 | 538 | ||
539 | To unmap anonymous pages, each VMA in the list anchored in the anon_vma | 539 | When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED |
540 | must be visited - at least until a VM_LOCKED VMA is encountered. If the | 540 | VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it, |
541 | page is being unmapped for migration, VM_LOCKED VMAs do not stop the | 541 | and return SWAP_MLOCK to indicate that the page is unevictable: and the scan |
542 | process because mlocked pages are migratable. However, for reclaim, if | 542 | stops there. |
543 | the page is mapped into a VM_LOCKED VMA, the scan stops. | 543 | |
544 | 544 | mlock_vma_page() is called while holding the page table's lock (in addition | |
545 | try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of | 545 | to the page lock, and the rmap lock): to serialize against concurrent mlock or |
546 | the mm_struct to which the VMA belongs. If this is successful, it will | 546 | munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim, |
547 | mlock the page via mlock_vma_page() - we wouldn't have gotten to | 547 | holepunching, and truncation of file pages and their anonymous COWed pages. |
548 | try_to_unmap_anon() if the page were already mlocked - and will return | ||
549 | SWAP_MLOCK, indicating that the page is unevictable. | ||
550 | |||
551 | If the mmap semaphore cannot be acquired, we are not sure whether the page | ||
552 | is really unevictable or not. In this case, try_to_unmap_anon() will | ||
553 | return SWAP_AGAIN. | ||
554 | |||
555 | (*) try_to_unmap_file() - linear mappings | ||
556 | |||
557 | Unmapping of a mapped file page works the same as for anonymous mappings, | ||
558 | except that the scan visits all VMAs that map the page's index/page offset | ||
559 | in the page's mapping's reverse map priority search tree. It also visits | ||
560 | each VMA in the page's mapping's non-linear list, if the list is | ||
561 | non-empty. | ||
562 | |||
563 | As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file | ||
564 | page, try_to_unmap_file() will attempt to acquire the associated | ||
565 | mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this | ||
566 | is successful, and SWAP_AGAIN, if not. | ||
567 | |||
568 | (*) try_to_unmap_file() - non-linear mappings | ||
569 | |||
570 | If a page's mapping contains a non-empty non-linear mapping VMA list, then | ||
571 | try_to_un{map|lock}() must also visit each VMA in that list to determine | ||
572 | whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit | ||
573 | all VMAs in the non-linear list to ensure that the pages is not/should not | ||
574 | be mlocked. | ||
575 | |||
576 | If a VM_LOCKED VMA is found in the list, the scan could terminate. | ||
577 | However, there is no easy way to determine whether the page is actually | ||
578 | mapped in a given VMA - either for unmapping or testing whether the | ||
579 | VM_LOCKED VMA actually pins the page. | ||
580 | |||
581 | try_to_unmap_file() handles non-linear mappings by scanning a certain | ||
582 | number of pages - a "cluster" - in each non-linear VMA associated with the | ||
583 | page's mapping, for each file mapped page that vmscan tries to unmap. If | ||
584 | this happens to unmap the page we're trying to unmap, try_to_unmap() will | ||
585 | notice this on return (page_mapcount(page) will be 0) and return | ||
586 | SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to | ||
587 | recirculate this page. We take advantage of the cluster scan in | ||
588 | try_to_unmap_cluster() as follows: | ||
589 | |||
590 | For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the | ||
591 | mmap semaphore of the associated mm_struct for read without blocking. | ||
592 | |||
593 | If this attempt is successful and the VMA is VM_LOCKED, | ||
594 | try_to_unmap_cluster() will retain the mmap semaphore for the scan; | ||
595 | otherwise it drops it here. | ||
596 | |||
597 | Then, for each page in the cluster, if we're holding the mmap semaphore | ||
598 | for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to | ||
599 | mlock the page. This call is a no-op if the page is already locked, | ||
600 | but will mlock any pages in the non-linear mapping that happen to be | ||
601 | unlocked. | ||
602 | |||
603 | If one of the pages so mlocked is the page passed in to try_to_unmap(), | ||
604 | try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default | ||
605 | SWAP_AGAIN. This will allow vmscan to cull the page, rather than | ||
606 | recirculating it on the inactive list. | ||
607 | |||
608 | Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it | ||
609 | returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED | ||
610 | VMA, but couldn't be mlocked. | ||
611 | 548 | ||
612 | 549 | ||
613 | try_to_munlock() REVERSE MAP SCAN | 550 | try_to_munlock() REVERSE MAP SCAN |
@@ -623,29 +560,15 @@ all PTEs from the page. For this purpose, the unevictable/mlock infrastructure | |||
623 | introduced a variant of try_to_unmap() called try_to_munlock(). | 560 | introduced a variant of try_to_unmap() called try_to_munlock(). |
624 | 561 | ||
625 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and | 562 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and |
626 | mapped file pages with an additional argument specifying unlock versus unmap | 563 | mapped file and KSM pages with a flag argument specifying unlock versus unmap |
627 | processing. Again, these functions walk the respective reverse maps looking | 564 | processing. Again, these functions walk the respective reverse maps looking |
628 | for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file | 565 | for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case, |
629 | pages mapped in linear VMAs, as in the try_to_unmap() case, the functions | 566 | the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This |
630 | attempt to acquire the associated mmap semaphore, mlock the page via | 567 | undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page. |
631 | mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the | ||
632 | pre-clearing of the page's PG_mlocked done by munlock_vma_page. | ||
633 | |||
634 | If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap | ||
635 | semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to | ||
636 | recycle the page on the inactive list and hope that it has better luck with the | ||
637 | page next time. | ||
638 | |||
639 | For file pages mapped into non-linear VMAs, the try_to_munlock() logic works | ||
640 | slightly differently. On encountering a VM_LOCKED non-linear VMA that might | ||
641 | map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the | ||
642 | page. munlock_vma_page() will just leave the page unlocked and let vmscan deal | ||
643 | with it - the usual fallback position. | ||
644 | 568 | ||
645 | Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's | 569 | Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's |
646 | reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. | 570 | reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. |
647 | However, the scan can terminate when it encounters a VM_LOCKED VMA and can | 571 | However, the scan can terminate when it encounters a VM_LOCKED VMA. |
648 | successfully acquire the VMA's mmap semaphore for read and mlock the page. | ||
649 | Although try_to_munlock() might be called a great many times when munlocking a | 572 | Although try_to_munlock() might be called a great many times when munlocking a |
650 | large region or tearing down a large address space that has been mlocked via | 573 | large region or tearing down a large address space that has been mlocked via |
651 | mlockall(), overall this is a fairly rare event. | 574 | mlockall(), overall this is a fairly rare event. |
@@ -673,11 +596,6 @@ Some examples of these unevictable pages on the LRU lists are: | |||
673 | (3) mlocked pages that could not be isolated from the LRU and moved to the | 596 | (3) mlocked pages that could not be isolated from the LRU and moved to the |
674 | unevictable list in mlock_vma_page(). | 597 | unevictable list in mlock_vma_page(). |
675 | 598 | ||
676 | (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't | ||
677 | acquire the VMA's mmap semaphore to test the flags and set PageMlocked. | ||
678 | munlock_vma_page() was forced to let the page back on to the normal LRU | ||
679 | list for vmscan to handle. | ||
680 | |||
681 | shrink_inactive_list() also diverts any unevictable pages that it finds on the | 599 | shrink_inactive_list() also diverts any unevictable pages that it finds on the |
682 | inactive lists to the appropriate zone's unevictable list. | 600 | inactive lists to the appropriate zone's unevictable list. |
683 | 601 | ||
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 0086b472bc2b..f2f949671798 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h | |||
@@ -37,6 +37,9 @@ | |||
37 | 37 | ||
38 | #define MCL_CURRENT 8192 /* lock all currently mapped pages */ | 38 | #define MCL_CURRENT 8192 /* lock all currently mapped pages */ |
39 | #define MCL_FUTURE 16384 /* lock all additions to address space */ | 39 | #define MCL_FUTURE 16384 /* lock all additions to address space */ |
40 | #define MCL_ONFAULT 32768 /* lock all pages that are faulted in */ | ||
41 | |||
42 | #define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ | ||
40 | 43 | ||
41 | #define MADV_NORMAL 0 /* no further special treatment */ | 44 | #define MADV_NORMAL 0 /* no further special treatment */ |
42 | #define MADV_RANDOM 1 /* expect random page references */ | 45 | #define MADV_RANDOM 1 /* expect random page references */ |
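Taken together, the new MCL_ONFAULT / MLOCK_ONFAULT flags and the mlock2() syscall (wired up in the x86 syscall tables further down) give userspace lock-on-fault semantics. A hedged sketch; the fallback constants are assumptions for x86_64 (MCL_ONFAULT differs per architecture, as the hunks in this series show), and SYS_mlock2/MLOCK_ONFAULT may not exist in installed headers yet:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT	0x01	/* lock pages after they are faulted in */
#endif
#ifndef MCL_ONFAULT
#define MCL_ONFAULT	4	/* assumed generic/x86 value; alpha, sparc, powerpc differ */
#endif
#ifndef SYS_mlock2
#define SYS_mlock2	325	/* x86_64 number from syscall_64.tbl below */
#endif

int main(void)
{
	size_t len = 1 << 20;
	char *buf = malloc(len);

	if (!buf)
		return 1;
	/* Lock the range, but let pages become resident lazily on fault */
	if (syscall(SYS_mlock2, buf, len, MLOCK_ONFAULT))
		perror("mlock2");
	/* Or lock everything currently mapped, also on fault */
	if (mlockall(MCL_CURRENT | MCL_ONFAULT))
		perror("mlockall");
	memset(buf, 0, len);	/* pages are locked as they fault in here */
	free(buf);
	return 0;
}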
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 00b7f7de28a1..7d5f4c736a16 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c | |||
@@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
803 | } | 803 | } |
804 | } | 804 | } |
805 | } else { | 805 | } else { |
806 | fault = probe_kernel_address(instrptr, instr); | 806 | fault = probe_kernel_address((void *)instrptr, instr); |
807 | instr = __mem_to_opcode_arm(instr); | 807 | instr = __mem_to_opcode_arm(instr); |
808 | } | 808 | } |
809 | 809 | ||
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index cfcb876cae6b..97c03f468924 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h | |||
@@ -61,6 +61,12 @@ | |||
61 | */ | 61 | */ |
62 | #define MCL_CURRENT 1 /* lock all current mappings */ | 62 | #define MCL_CURRENT 1 /* lock all current mappings */ |
63 | #define MCL_FUTURE 2 /* lock all future mappings */ | 63 | #define MCL_FUTURE 2 /* lock all future mappings */ |
64 | #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ | ||
65 | |||
66 | /* | ||
67 | * Flags for mlock | ||
68 | */ | ||
69 | #define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ | ||
64 | 70 | ||
65 | #define MADV_NORMAL 0 /* no further special treatment */ | 71 | #define MADV_NORMAL 0 /* no further special treatment */ |
66 | #define MADV_RANDOM 1 /* expect random page references */ | 72 | #define MADV_RANDOM 1 /* expect random page references */ |
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 294d251ca7b2..ecc3ae1ca28e 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h | |||
@@ -31,6 +31,9 @@ | |||
31 | 31 | ||
32 | #define MCL_CURRENT 1 /* lock all current mappings */ | 32 | #define MCL_CURRENT 1 /* lock all current mappings */ |
33 | #define MCL_FUTURE 2 /* lock all future mappings */ | 33 | #define MCL_FUTURE 2 /* lock all future mappings */ |
34 | #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ | ||
35 | |||
36 | #define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ | ||
34 | 37 | ||
35 | #define MADV_NORMAL 0 /* no further special treatment */ | 38 | #define MADV_NORMAL 0 /* no further special treatment */ |
36 | #define MADV_RANDOM 1 /* expect random page references */ | 39 | #define MADV_RANDOM 1 /* expect random page references */ |
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 6ea26df0a73c..03c06ba7464f 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h | |||
@@ -22,6 +22,7 @@ | |||
22 | 22 | ||
23 | #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ | 23 | #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ |
24 | #define MCL_FUTURE 0x4000 /* lock all additions to address space */ | 24 | #define MCL_FUTURE 0x4000 /* lock all additions to address space */ |
25 | #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ | ||
25 | 26 | ||
26 | #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ | 27 | #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ |
27 | #define MAP_NONBLOCK 0x10000 /* do not block on IO */ | 28 | #define MAP_NONBLOCK 0x10000 /* do not block on IO */ |
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 8b9502adaf79..8d8a541211d0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c | |||
@@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void) | |||
80 | setup_nr_node_ids(); | 80 | setup_nr_node_ids(); |
81 | 81 | ||
82 | /* allocate the map */ | 82 | /* allocate the map */ |
83 | for (node = 0; node < nr_node_ids; node++) | 83 | for_each_node(node) |
84 | alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); | 84 | alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); |
85 | 85 | ||
86 | /* cpumask_of_node() will now work */ | 86 | /* cpumask_of_node() will now work */ |
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ebc1f412cf49..13b9bcf5485e 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c | |||
@@ -999,7 +999,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs) | |||
999 | ret = get_user(regs->nip, &inst); | 999 | ret = get_user(regs->nip, &inst); |
1000 | pagefault_enable(); | 1000 | pagefault_enable(); |
1001 | } else { | 1001 | } else { |
1002 | ret = probe_kernel_address(regs->nip, inst); | 1002 | ret = probe_kernel_address((void *)regs->nip, inst); |
1003 | } | 1003 | } |
1004 | 1004 | ||
1005 | if (mcheck_handle_load(regs, inst)) { | 1005 | if (mcheck_handle_load(regs, inst)) { |
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index 0b14df33cffa..9765896ecb2c 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h | |||
@@ -17,6 +17,7 @@ | |||
17 | 17 | ||
18 | #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ | 18 | #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ |
19 | #define MCL_FUTURE 0x4000 /* lock all additions to address space */ | 19 | #define MCL_FUTURE 0x4000 /* lock all additions to address space */ |
20 | #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ | ||
20 | 21 | ||
21 | #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ | 22 | #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ |
22 | #define MAP_NONBLOCK 0x10000 /* do not block on IO */ | 23 | #define MAP_NONBLOCK 0x10000 /* do not block on IO */ |
diff --git a/arch/tile/include/uapi/asm/mman.h b/arch/tile/include/uapi/asm/mman.h index 81b8fc348d63..63ee13faf17d 100644 --- a/arch/tile/include/uapi/asm/mman.h +++ b/arch/tile/include/uapi/asm/mman.h | |||
@@ -36,6 +36,7 @@ | |||
36 | */ | 36 | */ |
37 | #define MCL_CURRENT 1 /* lock all current mappings */ | 37 | #define MCL_CURRENT 1 /* lock all current mappings */ |
38 | #define MCL_FUTURE 2 /* lock all future mappings */ | 38 | #define MCL_FUTURE 2 /* lock all future mappings */ |
39 | #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ | ||
39 | 40 | ||
40 | 41 | ||
41 | #endif /* _ASM_TILE_MMAN_H */ | 42 | #endif /* _ASM_TILE_MMAN_H */ |
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d553e54171b..2ee62dba0373 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile | |||
@@ -9,13 +9,13 @@ | |||
9 | # Changed by many, many contributors over the years. | 9 | # Changed by many, many contributors over the years. |
10 | # | 10 | # |
11 | 11 | ||
12 | KASAN_SANITIZE := n | ||
13 | |||
12 | # If you want to preset the SVGA mode, uncomment the next line and | 14 | # If you want to preset the SVGA mode, uncomment the next line and |
13 | # set SVGA_MODE to whatever number you want. | 15 | # set SVGA_MODE to whatever number you want. |
14 | # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. | 16 | # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. |
15 | # The number is the same as you would ordinarily press at bootup. | 17 | # The number is the same as you would ordinarily press at bootup. |
16 | 18 | ||
17 | KASAN_SANITIZE := n | ||
18 | |||
19 | SVGA_MODE := -DSVGA_MODE=NORMAL_VGA | 19 | SVGA_MODE := -DSVGA_MODE=NORMAL_VGA |
20 | 20 | ||
21 | targets := vmlinux.bin setup.bin setup.elf bzImage | 21 | targets := vmlinux.bin setup.bin setup.elf bzImage |
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index caa2c712d1e7..f17705e1332c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl | |||
@@ -382,3 +382,4 @@ | |||
382 | 373 i386 shutdown sys_shutdown | 382 | 373 i386 shutdown sys_shutdown |
383 | 374 i386 userfaultfd sys_userfaultfd | 383 | 374 i386 userfaultfd sys_userfaultfd |
384 | 375 i386 membarrier sys_membarrier | 384 | 375 i386 membarrier sys_membarrier |
385 | 376 i386 mlock2 sys_mlock2 | ||
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 278842fdf1f6..314a90bfc09c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl | |||
@@ -331,6 +331,7 @@ | |||
331 | 322 64 execveat stub_execveat | 331 | 322 64 execveat stub_execveat |
332 | 323 common userfaultfd sys_userfaultfd | 332 | 323 common userfaultfd sys_userfaultfd |
333 | 324 common membarrier sys_membarrier | 333 | 324 common membarrier sys_membarrier |
334 | 325 common mlock2 sys_mlock2 | ||
334 | 335 | ||
335 | # | 336 | # |
336 | # x32-specific system call numbers start at 512 to avoid cache impact | 337 | # x32-specific system call numbers start at 512 to avoid cache impact |
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9ce5da27b136..d470cf219a2d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c | |||
@@ -126,5 +126,5 @@ void __init kasan_init(void) | |||
126 | __flush_tlb_all(); | 126 | __flush_tlb_all(); |
127 | init_task.kasan_depth = 0; | 127 | init_task.kasan_depth = 0; |
128 | 128 | ||
129 | pr_info("Kernel address sanitizer initialized\n"); | 129 | pr_info("KernelAddressSanitizer initialized\n"); |
130 | } | 130 | } |
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 201aec0e0446..360944e1da52 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h | |||
@@ -74,6 +74,12 @@ | |||
74 | */ | 74 | */ |
75 | #define MCL_CURRENT 1 /* lock all current mappings */ | 75 | #define MCL_CURRENT 1 /* lock all current mappings */ |
76 | #define MCL_FUTURE 2 /* lock all future mappings */ | 76 | #define MCL_FUTURE 2 /* lock all future mappings */ |
77 | #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ | ||
78 | |||
79 | /* | ||
80 | * Flags for mlock | ||
81 | */ | ||
82 | #define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ | ||
77 | 83 | ||
78 | #define MADV_NORMAL 0 /* no further special treatment */ | 84 | #define MADV_NORMAL 0 /* no further special treatment */ |
79 | #define MADV_RANDOM 1 /* expect random page references */ | 85 | #define MADV_RANDOM 1 /* expect random page references */ |
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index f23fd86697ea..7bf835f85bc8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c | |||
@@ -231,7 +231,8 @@ out_unlock: | |||
231 | if (res < 0 && fl->fl_type != F_UNLCK) { | 231 | if (res < 0 && fl->fl_type != F_UNLCK) { |
232 | fl_type = fl->fl_type; | 232 | fl_type = fl->fl_type; |
233 | fl->fl_type = F_UNLCK; | 233 | fl->fl_type = F_UNLCK; |
234 | res = locks_lock_file_wait(filp, fl); | 234 | /* Even if this fails we want to return the remote error */ |
235 | locks_lock_file_wait(filp, fl); | ||
235 | fl->fl_type = fl_type; | 236 | fl->fl_type = fl_type; |
236 | } | 237 | } |
237 | out: | 238 | out: |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7378169e90be..206a68b1db1a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb) | |||
2149 | iput(old_inode); | 2149 | iput(old_inode); |
2150 | old_inode = inode; | 2150 | old_inode = inode; |
2151 | 2151 | ||
2152 | filemap_fdatawait(mapping); | 2152 | /* |
2153 | * We keep the error status of individual mapping so that | ||
2154 | * applications can catch the writeback error using fsync(2). | ||
2155 | * See filemap_fdatawait_keep_errors() for details. | ||
2156 | */ | ||
2157 | filemap_fdatawait_keep_errors(mapping); | ||
2153 | 2158 | ||
2154 | cond_resched(); | 2159 | cond_resched(); |
2155 | 2160 | ||
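The comment added above refers to the userspace contract that an I/O error hit during background writeback is eventually reported through fsync(2). A minimal sketch of that, not part of the patch:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	int fd = open(path, O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	/* If writeback of this file failed, fsync() reports it (often EIO) */
	if (fsync(fd) < 0)
		fprintf(stderr, "writeback error: %s\n", strerror(errno));
	close(fd);
	return 0;
}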
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a7fdbd868474..a709d80c8ebc 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c | |||
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, | |||
81 | unsigned int max_pages; | 81 | unsigned int max_pages; |
82 | int i; | 82 | int i; |
83 | 83 | ||
84 | max_pages = min(nr_pages, BIO_MAX_PAGES); | 84 | max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); |
85 | 85 | ||
86 | bio = bio_alloc(GFP_NOFS, max_pages); | 86 | bio = bio_alloc(GFP_NOFS, max_pages); |
87 | BUG_ON(!bio); | 87 | BUG_ON(!bio); |
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, | |||
171 | unsigned int max_pages; | 171 | unsigned int max_pages; |
172 | int i; | 172 | int i; |
173 | 173 | ||
174 | max_pages = min(nr_pages, BIO_MAX_PAGES); | 174 | max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); |
175 | 175 | ||
176 | bio = bio_alloc(GFP_NOFS, max_pages); | 176 | bio = bio_alloc(GFP_NOFS, max_pages); |
177 | BUG_ON(!bio); | 177 | BUG_ON(!bio); |
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6b6f0d472ae8..fd98e5100cab 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c | |||
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) | |||
83 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); | 83 | inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); |
84 | inode = igrab(mark->inode); | 84 | inode = igrab(mark->inode); |
85 | if (inode) { | 85 | if (inode) { |
86 | /* | ||
87 | * IN_ALL_EVENTS represents all of the mask bits | ||
88 | * that we expose to userspace. There is at | ||
89 | * least one bit (FS_EVENT_ON_CHILD) which is | ||
90 | * used only internally to the kernel. | ||
91 | */ | ||
92 | u32 mask = mark->mask & IN_ALL_EVENTS; | ||
86 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", | 93 | seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", |
87 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, | 94 | inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, |
88 | mark->mask, mark->ignored_mask); | 95 | mask, mark->ignored_mask); |
89 | show_mark_fhandle(m, inode); | 96 | show_mark_fhandle(m, inode); |
90 | seq_putc(m, '\n'); | 97 | seq_putc(m, '\n'); |
91 | iput(inode); | 98 | iput(inode); |
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5b1e2a497e51..b8d08d0d0a4d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, | |||
706 | int ret; | 706 | int ret; |
707 | unsigned flags = 0; | 707 | unsigned flags = 0; |
708 | 708 | ||
709 | /* don't allow invalid bits: we don't want flags set */ | 709 | /* |
710 | * We share a lot of code with fs/dnotify. We also share | ||
711 | * the bit layout between inotify's IN_* and the fsnotify | ||
712 | * FS_*. This check ensures that only the inotify IN_* | ||
713 | * bits get passed in and set in watches/events. | ||
714 | */ | ||
715 | if (unlikely(mask & ~ALL_INOTIFY_BITS)) | ||
716 | return -EINVAL; | ||
717 | /* | ||
718 | * Require at least one valid bit set in the mask. | ||
719 | * Without _something_ set, we would have no events to | ||
720 | * watch for. | ||
721 | */ | ||
710 | if (unlikely(!(mask & ALL_INOTIFY_BITS))) | 722 | if (unlikely(!(mask & ALL_INOTIFY_BITS))) |
711 | return -EINVAL; | 723 | return -EINVAL; |
712 | 724 | ||
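From userspace, the effect of the new check is that a watch mask containing bits inotify does not expose is rejected with EINVAL instead of being silently accepted. A sketch; the 0x08000000 bit is used purely as an example of a non-inotify bit:

#include <stdio.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init1(IN_NONBLOCK);

	if (fd < 0)
		return 1;
	/* Valid: every event inotify exposes to userspace */
	if (inotify_add_watch(fd, "/tmp", IN_ALL_EVENTS) < 0)
		perror("IN_ALL_EVENTS watch");
	/* A mask with a stray non-inotify bit now fails with EINVAL,
	 * where it was previously accepted with the bit set. */
	if (inotify_add_watch(fd, "/tmp", IN_MODIFY | 0x08000000) < 0)
		perror("watch with invalid bit");
	return 0;
}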
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b11d90eca6..7f604727f487 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
589 | ret = -EIO; | 589 | ret = -EIO; |
590 | goto bail; | 590 | goto bail; |
591 | } | 591 | } |
592 | set_buffer_new(bh_result); | ||
592 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 593 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
593 | } | 594 | } |
594 | 595 | ||
@@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
864 | is_overwrite = ocfs2_is_overwrite(osb, inode, offset); | 865 | is_overwrite = ocfs2_is_overwrite(osb, inode, offset); |
865 | if (is_overwrite < 0) { | 866 | if (is_overwrite < 0) { |
866 | mlog_errno(is_overwrite); | 867 | mlog_errno(is_overwrite); |
868 | ret = is_overwrite; | ||
867 | ocfs2_inode_unlock(inode, 1); | 869 | ocfs2_inode_unlock(inode, 1); |
868 | goto clean_orphan; | 870 | goto clean_orphan; |
869 | } | 871 | } |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index fa15debcc02b..ddddef0021a0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -219,7 +219,8 @@ struct o2hb_region { | |||
219 | unsigned hr_unclean_stop:1, | 219 | unsigned hr_unclean_stop:1, |
220 | hr_aborted_start:1, | 220 | hr_aborted_start:1, |
221 | hr_item_pinned:1, | 221 | hr_item_pinned:1, |
222 | hr_item_dropped:1; | 222 | hr_item_dropped:1, |
223 | hr_node_deleted:1; | ||
223 | 224 | ||
224 | /* protected by the hr_callback_sem */ | 225 | /* protected by the hr_callback_sem */ |
225 | struct task_struct *hr_task; | 226 | struct task_struct *hr_task; |
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data) | |||
1078 | set_user_nice(current, MIN_NICE); | 1079 | set_user_nice(current, MIN_NICE); |
1079 | 1080 | ||
1080 | /* Pin node */ | 1081 | /* Pin node */ |
1081 | o2nm_depend_this_node(); | 1082 | ret = o2nm_depend_this_node(); |
1083 | if (ret) { | ||
1084 | mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); | ||
1085 | reg->hr_node_deleted = 1; | ||
1086 | wake_up(&o2hb_steady_queue); | ||
1087 | return 0; | ||
1088 | } | ||
1082 | 1089 | ||
1083 | while (!kthread_should_stop() && | 1090 | while (!kthread_should_stop() && |
1084 | !reg->hr_unclean_stop && !reg->hr_aborted_start) { | 1091 | !reg->hr_unclean_stop && !reg->hr_aborted_start) { |
@@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1787 | spin_unlock(&o2hb_live_lock); | 1794 | spin_unlock(&o2hb_live_lock); |
1788 | 1795 | ||
1789 | ret = wait_event_interruptible(o2hb_steady_queue, | 1796 | ret = wait_event_interruptible(o2hb_steady_queue, |
1790 | atomic_read(®->hr_steady_iterations) == 0); | 1797 | atomic_read(®->hr_steady_iterations) == 0 || |
1798 | reg->hr_node_deleted); | ||
1791 | if (ret) { | 1799 | if (ret) { |
1792 | atomic_set(®->hr_steady_iterations, 0); | 1800 | atomic_set(®->hr_steady_iterations, 0); |
1793 | reg->hr_aborted_start = 1; | 1801 | reg->hr_aborted_start = 1; |
@@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1798 | goto out3; | 1806 | goto out3; |
1799 | } | 1807 | } |
1800 | 1808 | ||
1809 | if (reg->hr_node_deleted) { | ||
1810 | ret = -EINVAL; | ||
1811 | goto out3; | ||
1812 | } | ||
1813 | |||
1801 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ | 1814 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ |
1802 | spin_lock(&o2hb_live_lock); | 1815 | spin_lock(&o2hb_live_lock); |
1803 | hb_task = reg->hr_task; | 1816 | hb_task = reg->hr_task; |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6918f30d02cd..2ee7fe747cea 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) | |||
1866 | int status; | 1866 | int status; |
1867 | unsigned int backoff; | 1867 | unsigned int backoff; |
1868 | unsigned int total_backoff = 0; | 1868 | unsigned int total_backoff = 0; |
1869 | char wq_name[O2NM_MAX_NAME_LEN]; | ||
1869 | 1870 | ||
1870 | BUG_ON(!dlm); | 1871 | BUG_ON(!dlm); |
1871 | 1872 | ||
@@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) | |||
1895 | goto bail; | 1896 | goto bail; |
1896 | } | 1897 | } |
1897 | 1898 | ||
1898 | dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); | 1899 | snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); |
1900 | dlm->dlm_worker = create_singlethread_workqueue(wq_name); | ||
1899 | if (!dlm->dlm_worker) { | 1901 | if (!dlm->dlm_worker) { |
1900 | status = -ENOMEM; | 1902 | status = -ENOMEM; |
1901 | mlog_errno(status); | 1903 | mlog_errno(status); |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 58eaa5c0d387..9e4f862d20fe 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) | |||
205 | mlog(0, "starting dlm recovery thread...\n"); | 205 | mlog(0, "starting dlm recovery thread...\n"); |
206 | 206 | ||
207 | dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, | 207 | dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, |
208 | "dlm_reco_thread"); | 208 | "dlm_reco-%s", dlm->name); |
209 | if (IS_ERR(dlm->dlm_reco_thread_task)) { | 209 | if (IS_ERR(dlm->dlm_reco_thread_task)) { |
210 | mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); | 210 | mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); |
211 | dlm->dlm_reco_thread_task = NULL; | 211 | dlm->dlm_reco_thread_task = NULL; |
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2e5e6d5fffe8..c5f6c241ecd7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) | |||
493 | { | 493 | { |
494 | mlog(0, "Starting dlm_thread...\n"); | 494 | mlog(0, "Starting dlm_thread...\n"); |
495 | 495 | ||
496 | dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); | 496 | dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", |
497 | dlm->name); | ||
497 | if (IS_ERR(dlm->dlm_thread_task)) { | 498 | if (IS_ERR(dlm->dlm_thread_task)) { |
498 | mlog_errno(PTR_ERR(dlm->dlm_thread_task)); | 499 | mlog_errno(PTR_ERR(dlm->dlm_thread_task)); |
499 | dlm->dlm_thread_task = NULL; | 500 | dlm->dlm_thread_task = NULL; |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c91103c1333..20276e340339 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) | |||
2998 | } | 2998 | } |
2999 | 2999 | ||
3000 | /* launch downconvert thread */ | 3000 | /* launch downconvert thread */ |
3001 | osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); | 3001 | osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", |
3002 | osb->uuid_str); | ||
3002 | if (IS_ERR(osb->dc_task)) { | 3003 | if (IS_ERR(osb->dc_task)) { |
3003 | status = PTR_ERR(osb->dc_task); | 3004 | status = PTR_ERR(osb->dc_task); |
3004 | osb->dc_task = NULL; | 3005 | osb->dc_task = NULL; |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..aac8b86f312e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -112,6 +112,8 @@ struct ocfs2_inode_info | |||
112 | #define OCFS2_INODE_OPEN_DIRECT 0x00000020 | 112 | #define OCFS2_INODE_OPEN_DIRECT 0x00000020 |
113 | /* Tell the inode wipe code it's not in orphan dir */ | 113 | /* Tell the inode wipe code it's not in orphan dir */ |
114 | #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 | 114 | #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 |
115 | /* Entry in orphan dir with 'dio-' prefix */ | ||
116 | #define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 | ||
115 | 117 | ||
116 | static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) | 118 | static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) |
117 | { | 119 | { |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff82b28462a6..13534f4fe5b5 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) | |||
1090 | /* Launch the commit thread */ | 1090 | /* Launch the commit thread */ |
1091 | if (!local) { | 1091 | if (!local) { |
1092 | osb->commit_task = kthread_run(ocfs2_commit_thread, osb, | 1092 | osb->commit_task = kthread_run(ocfs2_commit_thread, osb, |
1093 | "ocfs2cmt"); | 1093 | "ocfs2cmt-%s", osb->uuid_str); |
1094 | if (IS_ERR(osb->commit_task)) { | 1094 | if (IS_ERR(osb->commit_task)) { |
1095 | status = PTR_ERR(osb->commit_task); | 1095 | status = PTR_ERR(osb->commit_task); |
1096 | osb->commit_task = NULL; | 1096 | osb->commit_task = NULL; |
@@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) | |||
1507 | goto out; | 1507 | goto out; |
1508 | 1508 | ||
1509 | osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, | 1509 | osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, |
1510 | "ocfs2rec"); | 1510 | "ocfs2rec-%s", osb->uuid_str); |
1511 | if (IS_ERR(osb->recovery_thread_task)) { | 1511 | if (IS_ERR(osb->recovery_thread_task)) { |
1512 | mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); | 1512 | mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); |
1513 | osb->recovery_thread_task = NULL; | 1513 | osb->recovery_thread_task = NULL; |
@@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv { | |||
2021 | struct dir_context ctx; | 2021 | struct dir_context ctx; |
2022 | struct inode *head; | 2022 | struct inode *head; |
2023 | struct ocfs2_super *osb; | 2023 | struct ocfs2_super *osb; |
2024 | enum ocfs2_orphan_reco_type orphan_reco_type; | ||
2024 | }; | 2025 | }; |
2025 | 2026 | ||
2026 | static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, | 2027 | static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, |
@@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, | |||
2036 | if (name_len == 2 && !strncmp("..", name, 2)) | 2037 | if (name_len == 2 && !strncmp("..", name, 2)) |
2037 | return 0; | 2038 | return 0; |
2038 | 2039 | ||
2040 | /* do not include dio entry in case of orphan scan */ | ||
2041 | if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && | ||
2042 | (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, | ||
2043 | OCFS2_DIO_ORPHAN_PREFIX_LEN))) | ||
2044 | return 0; | ||
2045 | |||
2039 | /* Skip bad inodes so that recovery can continue */ | 2046 | /* Skip bad inodes so that recovery can continue */ |
2040 | iter = ocfs2_iget(p->osb, ino, | 2047 | iter = ocfs2_iget(p->osb, ino, |
2041 | OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); | 2048 | OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); |
2042 | if (IS_ERR(iter)) | 2049 | if (IS_ERR(iter)) |
2043 | return 0; | 2050 | return 0; |
2044 | 2051 | ||
2052 | if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, | ||
2053 | OCFS2_DIO_ORPHAN_PREFIX_LEN)) | ||
2054 | OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; | ||
2055 | |||
2045 | /* Skip inodes which are already added to recover list, since dio may | 2056 | /* Skip inodes which are already added to recover list, since dio may |
2046 | * happen concurrently with unlink/rename */ | 2057 | * happen concurrently with unlink/rename */ |
2047 | if (OCFS2_I(iter)->ip_next_orphan) { | 2058 | if (OCFS2_I(iter)->ip_next_orphan) { |
@@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, | |||
2060 | 2071 | ||
2061 | static int ocfs2_queue_orphans(struct ocfs2_super *osb, | 2072 | static int ocfs2_queue_orphans(struct ocfs2_super *osb, |
2062 | int slot, | 2073 | int slot, |
2063 | struct inode **head) | 2074 | struct inode **head, |
2075 | enum ocfs2_orphan_reco_type orphan_reco_type) | ||
2064 | { | 2076 | { |
2065 | int status; | 2077 | int status; |
2066 | struct inode *orphan_dir_inode = NULL; | 2078 | struct inode *orphan_dir_inode = NULL; |
2067 | struct ocfs2_orphan_filldir_priv priv = { | 2079 | struct ocfs2_orphan_filldir_priv priv = { |
2068 | .ctx.actor = ocfs2_orphan_filldir, | 2080 | .ctx.actor = ocfs2_orphan_filldir, |
2069 | .osb = osb, | 2081 | .osb = osb, |
2070 | .head = *head | 2082 | .head = *head, |
2083 | .orphan_reco_type = orphan_reco_type | ||
2071 | }; | 2084 | }; |
2072 | 2085 | ||
2073 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | 2086 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, |
@@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2170 | trace_ocfs2_recover_orphans(slot); | 2183 | trace_ocfs2_recover_orphans(slot); |
2171 | 2184 | ||
2172 | ocfs2_mark_recovering_orphan_dir(osb, slot); | 2185 | ocfs2_mark_recovering_orphan_dir(osb, slot); |
2173 | ret = ocfs2_queue_orphans(osb, slot, &inode); | 2186 | ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); |
2174 | ocfs2_clear_recovering_orphan_dir(osb, slot); | 2187 | ocfs2_clear_recovering_orphan_dir(osb, slot); |
2175 | 2188 | ||
2176 | /* Error here should be noted, but we want to continue with as | 2189 | /* Error here should be noted, but we want to continue with as |
@@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2186 | iter = oi->ip_next_orphan; | 2199 | iter = oi->ip_next_orphan; |
2187 | oi->ip_next_orphan = NULL; | 2200 | oi->ip_next_orphan = NULL; |
2188 | 2201 | ||
2189 | mutex_lock(&inode->i_mutex); | 2202 | if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { |
2190 | ret = ocfs2_rw_lock(inode, 1); | 2203 | mutex_lock(&inode->i_mutex); |
2191 | if (ret < 0) { | 2204 | ret = ocfs2_rw_lock(inode, 1); |
2192 | mlog_errno(ret); | 2205 | if (ret < 0) { |
2193 | goto next; | 2206 | mlog_errno(ret); |
2194 | } | 2207 | goto unlock_mutex; |
2195 | /* | 2208 | } |
2196 | * We need to take and drop the inode lock to | 2209 | /* |
2197 | * force read inode from disk. | 2210 | * We need to take and drop the inode lock to |
2198 | */ | 2211 | * force read inode from disk. |
2199 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | 2212 | */ |
2200 | if (ret) { | 2213 | ret = ocfs2_inode_lock(inode, &di_bh, 1); |
2201 | mlog_errno(ret); | 2214 | if (ret) { |
2202 | goto unlock_rw; | 2215 | mlog_errno(ret); |
2203 | } | 2216 | goto unlock_rw; |
2217 | } | ||
2204 | 2218 | ||
2205 | di = (struct ocfs2_dinode *)di_bh->b_data; | 2219 | di = (struct ocfs2_dinode *)di_bh->b_data; |
2206 | 2220 | ||
2207 | if (inode->i_nlink == 0) { | 2221 | if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { |
2222 | ret = ocfs2_truncate_file(inode, di_bh, | ||
2223 | i_size_read(inode)); | ||
2224 | if (ret < 0) { | ||
2225 | if (ret != -ENOSPC) | ||
2226 | mlog_errno(ret); | ||
2227 | goto unlock_inode; | ||
2228 | } | ||
2229 | |||
2230 | ret = ocfs2_del_inode_from_orphan(osb, inode, | ||
2231 | di_bh, 0, 0); | ||
2232 | if (ret) | ||
2233 | mlog_errno(ret); | ||
2234 | } | ||
2235 | unlock_inode: | ||
2236 | ocfs2_inode_unlock(inode, 1); | ||
2237 | brelse(di_bh); | ||
2238 | di_bh = NULL; | ||
2239 | unlock_rw: | ||
2240 | ocfs2_rw_unlock(inode, 1); | ||
2241 | unlock_mutex: | ||
2242 | mutex_unlock(&inode->i_mutex); | ||
2243 | |||
2244 | /* clear dio flag in ocfs2_inode_info */ | ||
2245 | oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; | ||
2246 | } else { | ||
2208 | spin_lock(&oi->ip_lock); | 2247 | spin_lock(&oi->ip_lock); |
2209 | /* Set the proper information to get us going into | 2248 | /* Set the proper information to get us going into |
2210 | * ocfs2_delete_inode. */ | 2249 | * ocfs2_delete_inode. */ |
@@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2212 | spin_unlock(&oi->ip_lock); | 2251 | spin_unlock(&oi->ip_lock); |
2213 | } | 2252 | } |
2214 | 2253 | ||
2215 | if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && | ||
2216 | (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { | ||
2217 | ret = ocfs2_truncate_file(inode, di_bh, | ||
2218 | i_size_read(inode)); | ||
2219 | if (ret < 0) { | ||
2220 | if (ret != -ENOSPC) | ||
2221 | mlog_errno(ret); | ||
2222 | goto unlock_inode; | ||
2223 | } | ||
2224 | |||
2225 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); | ||
2226 | if (ret) | ||
2227 | mlog_errno(ret); | ||
2228 | } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ | ||
2229 | unlock_inode: | ||
2230 | ocfs2_inode_unlock(inode, 1); | ||
2231 | brelse(di_bh); | ||
2232 | di_bh = NULL; | ||
2233 | unlock_rw: | ||
2234 | ocfs2_rw_unlock(inode, 1); | ||
2235 | next: | ||
2236 | mutex_unlock(&inode->i_mutex); | ||
2237 | iput(inode); | 2254 | iput(inode); |
2238 | inode = iter; | 2255 | inode = iter; |
2239 | } | 2256 | } |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b7dfac226b1e..3b48ac25d8a7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, | |||
106 | static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); | 106 | static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); |
107 | /* An orphan dir name is an 8 byte value, printed as a hex string */ | 107 | /* An orphan dir name is an 8 byte value, printed as a hex string */ |
108 | #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) | 108 | #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) |
109 | #define OCFS2_DIO_ORPHAN_PREFIX "dio-" | ||
110 | #define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 | ||
111 | 109 | ||
112 | static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | 110 | static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, |
113 | unsigned int flags) | 111 | unsigned int flags) |
@@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
657 | return status; | 655 | return status; |
658 | } | 656 | } |
659 | 657 | ||
660 | return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, | 658 | status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, |
661 | parent_fe_bh, handle, inode_ac, | 659 | parent_fe_bh, handle, inode_ac, |
662 | fe_blkno, suballoc_loc, suballoc_bit); | 660 | fe_blkno, suballoc_loc, suballoc_bit); |
661 | if (status < 0) { | ||
662 | u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); | ||
663 | int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, | ||
664 | inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); | ||
665 | if (tmp) | ||
666 | mlog_errno(tmp); | ||
667 | } | ||
668 | |||
669 | return status; | ||
663 | } | 670 | } |
664 | 671 | ||
665 | static int ocfs2_mkdir(struct inode *dir, | 672 | static int ocfs2_mkdir(struct inode *dir, |
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e173329eb830..1155918d6784 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h | |||
@@ -26,6 +26,9 @@ | |||
26 | #ifndef OCFS2_NAMEI_H | 26 | #ifndef OCFS2_NAMEI_H |
27 | #define OCFS2_NAMEI_H | 27 | #define OCFS2_NAMEI_H |
28 | 28 | ||
29 | #define OCFS2_DIO_ORPHAN_PREFIX "dio-" | ||
30 | #define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 | ||
31 | |||
29 | extern const struct inode_operations ocfs2_dir_iops; | 32 | extern const struct inode_operations ocfs2_dir_iops; |
30 | 33 | ||
31 | struct dentry *ocfs2_get_parent(struct dentry *child); | 34 | struct dentry *ocfs2_get_parent(struct dentry *child); |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e5d57cd32505..252119860e6c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
2920 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); | 2920 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); |
2921 | struct page *page; | 2921 | struct page *page; |
2922 | pgoff_t page_index; | 2922 | pgoff_t page_index; |
2923 | unsigned int from, to, readahead_pages; | 2923 | unsigned int from, to; |
2924 | loff_t offset, end, map_end; | 2924 | loff_t offset, end, map_end; |
2925 | struct address_space *mapping = inode->i_mapping; | 2925 | struct address_space *mapping = inode->i_mapping; |
2926 | 2926 | ||
2927 | trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, | 2927 | trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, |
2928 | new_cluster, new_len); | 2928 | new_cluster, new_len); |
2929 | 2929 | ||
2930 | readahead_pages = | ||
2931 | (ocfs2_cow_contig_clusters(sb) << | ||
2932 | OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; | ||
2933 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; | 2930 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; |
2934 | end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); | 2931 | end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); |
2935 | /* | 2932 | /* |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d83d2602cf2b..fc6d25f6d444 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, | |||
1920 | status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, | 1920 | status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, |
1921 | res, &bits_left); | 1921 | res, &bits_left); |
1922 | if (!status) { | 1922 | if (!status) { |
1923 | hint = ocfs2_group_from_res(res); | 1923 | if (ocfs2_is_cluster_bitmap(ac->ac_inode)) |
1924 | hint = res->sr_bg_blkno; | ||
1925 | else | ||
1926 | hint = ocfs2_group_from_res(res); | ||
1924 | goto set_hint; | 1927 | goto set_hint; |
1925 | } | 1928 | } |
1926 | if (status < 0 && status != -ENOSPC) { | 1929 | if (status < 0 && status != -ENOSPC) { |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 29595af32866..bd3e9e68125b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, | |||
1032 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | 1032 | return simple_read_from_buffer(buf, count, ppos, buffer, len); |
1033 | } | 1033 | } |
1034 | 1034 | ||
1035 | /* | ||
1036 | * /proc/pid/oom_adj exists solely for backwards compatibility with previous | ||
1037 | * kernels. The effective policy is defined by oom_score_adj, which has a | ||
1038 | * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. | ||
1039 | * Values written to oom_adj are simply mapped linearly to oom_score_adj. | ||
1040 | * Processes that become oom disabled via oom_adj will still be oom disabled | ||
1041 | * with this implementation. | ||
1042 | * | ||
1043 | * oom_adj cannot be removed since existing userspace binaries use it. | ||
1044 | */ | ||
1035 | static ssize_t oom_adj_write(struct file *file, const char __user *buf, | 1045 | static ssize_t oom_adj_write(struct file *file, const char __user *buf, |
1036 | size_t count, loff_t *ppos) | 1046 | size_t count, loff_t *ppos) |
1037 | { | 1047 | { |
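The comment added above oom_adj_write() describes a purely linear translation from the legacy oom_adj scale to oom_score_adj. As a rough userspace illustration only (oom_adj_to_score_adj() is a made-up helper; the constants are the long-standing ABI values OOM_ADJUST_MAX = 15, OOM_DISABLE = -17 and OOM_SCORE_ADJ_MAX = 1000, not code taken from this patch):

#include <stdio.h>

/* Hypothetical helper, not from this patch: the linear mapping the new
 * comment describes, using the historical ABI constants. */
static int oom_adj_to_score_adj(int oom_adj)
{
	if (oom_adj == 15)		/* OOM_ADJUST_MAX maps to ... */
		return 1000;		/* ... OOM_SCORE_ADJ_MAX */
	return oom_adj * 1000 / 17;	/* linear scale; -17 still lands on -1000 */
}

int main(void)
{
	/* -1000 keeps the task oom disabled, matching the comment's claim */
	printf("%d %d %d\n", oom_adj_to_score_adj(-17),
	       oom_adj_to_score_adj(0), oom_adj_to_score_adj(15));
	return 0;
}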
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b029d426c558..187b3b5f242e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
70 | ptes >> 10, | 70 | ptes >> 10, |
71 | pmds >> 10, | 71 | pmds >> 10, |
72 | swap << (PAGE_SHIFT-10)); | 72 | swap << (PAGE_SHIFT-10)); |
73 | hugetlb_report_usage(m, mm); | ||
73 | } | 74 | } |
74 | 75 | ||
75 | unsigned long task_vsize(struct mm_struct *mm) | 76 | unsigned long task_vsize(struct mm_struct *mm) |
@@ -446,6 +447,8 @@ struct mem_size_stats { | |||
446 | unsigned long anonymous; | 447 | unsigned long anonymous; |
447 | unsigned long anonymous_thp; | 448 | unsigned long anonymous_thp; |
448 | unsigned long swap; | 449 | unsigned long swap; |
450 | unsigned long shared_hugetlb; | ||
451 | unsigned long private_hugetlb; | ||
449 | u64 pss; | 452 | u64 pss; |
450 | u64 swap_pss; | 453 | u64 swap_pss; |
451 | }; | 454 | }; |
@@ -625,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
625 | seq_putc(m, '\n'); | 628 | seq_putc(m, '\n'); |
626 | } | 629 | } |
627 | 630 | ||
631 | #ifdef CONFIG_HUGETLB_PAGE | ||
632 | static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, | ||
633 | unsigned long addr, unsigned long end, | ||
634 | struct mm_walk *walk) | ||
635 | { | ||
636 | struct mem_size_stats *mss = walk->private; | ||
637 | struct vm_area_struct *vma = walk->vma; | ||
638 | struct page *page = NULL; | ||
639 | |||
640 | if (pte_present(*pte)) { | ||
641 | page = vm_normal_page(vma, addr, *pte); | ||
642 | } else if (is_swap_pte(*pte)) { | ||
643 | swp_entry_t swpent = pte_to_swp_entry(*pte); | ||
644 | |||
645 | if (is_migration_entry(swpent)) | ||
646 | page = migration_entry_to_page(swpent); | ||
647 | } | ||
648 | if (page) { | ||
649 | int mapcount = page_mapcount(page); | ||
650 | |||
651 | if (mapcount >= 2) | ||
652 | mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); | ||
653 | else | ||
654 | mss->private_hugetlb += huge_page_size(hstate_vma(vma)); | ||
655 | } | ||
656 | return 0; | ||
657 | } | ||
658 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
659 | |||
628 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 660 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
629 | { | 661 | { |
630 | struct vm_area_struct *vma = v; | 662 | struct vm_area_struct *vma = v; |
631 | struct mem_size_stats mss; | 663 | struct mem_size_stats mss; |
632 | struct mm_walk smaps_walk = { | 664 | struct mm_walk smaps_walk = { |
633 | .pmd_entry = smaps_pte_range, | 665 | .pmd_entry = smaps_pte_range, |
666 | #ifdef CONFIG_HUGETLB_PAGE | ||
667 | .hugetlb_entry = smaps_hugetlb_range, | ||
668 | #endif | ||
634 | .mm = vma->vm_mm, | 669 | .mm = vma->vm_mm, |
635 | .private = &mss, | 670 | .private = &mss, |
636 | }; | 671 | }; |
@@ -652,6 +687,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
652 | "Referenced: %8lu kB\n" | 687 | "Referenced: %8lu kB\n" |
653 | "Anonymous: %8lu kB\n" | 688 | "Anonymous: %8lu kB\n" |
654 | "AnonHugePages: %8lu kB\n" | 689 | "AnonHugePages: %8lu kB\n" |
690 | "Shared_Hugetlb: %8lu kB\n" | ||
691 | "Private_Hugetlb: %7lu kB\n" | ||
655 | "Swap: %8lu kB\n" | 692 | "Swap: %8lu kB\n" |
656 | "SwapPss: %8lu kB\n" | 693 | "SwapPss: %8lu kB\n" |
657 | "KernelPageSize: %8lu kB\n" | 694 | "KernelPageSize: %8lu kB\n" |
@@ -667,6 +704,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
667 | mss.referenced >> 10, | 704 | mss.referenced >> 10, |
668 | mss.anonymous >> 10, | 705 | mss.anonymous >> 10, |
669 | mss.anonymous_thp >> 10, | 706 | mss.anonymous_thp >> 10, |
707 | mss.shared_hugetlb >> 10, | ||
708 | mss.private_hugetlb >> 10, | ||
670 | mss.swap >> 10, | 709 | mss.swap >> 10, |
671 | (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), | 710 | (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), |
672 | vma_kernel_pagesize(vma) >> 10, | 711 | vma_kernel_pagesize(vma) >> 10, |
@@ -753,19 +792,27 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
753 | pte_t ptent = *pte; | 792 | pte_t ptent = *pte; |
754 | 793 | ||
755 | if (pte_present(ptent)) { | 794 | if (pte_present(ptent)) { |
795 | ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); | ||
756 | ptent = pte_wrprotect(ptent); | 796 | ptent = pte_wrprotect(ptent); |
757 | ptent = pte_clear_soft_dirty(ptent); | 797 | ptent = pte_clear_soft_dirty(ptent); |
798 | ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); | ||
758 | } else if (is_swap_pte(ptent)) { | 799 | } else if (is_swap_pte(ptent)) { |
759 | ptent = pte_swp_clear_soft_dirty(ptent); | 800 | ptent = pte_swp_clear_soft_dirty(ptent); |
801 | set_pte_at(vma->vm_mm, addr, pte, ptent); | ||
760 | } | 802 | } |
761 | |||
762 | set_pte_at(vma->vm_mm, addr, pte, ptent); | ||
763 | } | 803 | } |
804 | #else | ||
805 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | ||
806 | unsigned long addr, pte_t *pte) | ||
807 | { | ||
808 | } | ||
809 | #endif | ||
764 | 810 | ||
811 | #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
765 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | 812 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
766 | unsigned long addr, pmd_t *pmdp) | 813 | unsigned long addr, pmd_t *pmdp) |
767 | { | 814 | { |
768 | pmd_t pmd = *pmdp; | 815 | pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); |
769 | 816 | ||
770 | pmd = pmd_wrprotect(pmd); | 817 | pmd = pmd_wrprotect(pmd); |
771 | pmd = pmd_clear_soft_dirty(pmd); | 818 | pmd = pmd_clear_soft_dirty(pmd); |
@@ -775,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | |||
775 | 822 | ||
776 | set_pmd_at(vma->vm_mm, addr, pmdp, pmd); | 823 | set_pmd_at(vma->vm_mm, addr, pmdp, pmd); |
777 | } | 824 | } |
778 | |||
779 | #else | 825 | #else |
780 | |||
781 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | ||
782 | unsigned long addr, pte_t *pte) | ||
783 | { | ||
784 | } | ||
785 | |||
786 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | 826 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
787 | unsigned long addr, pmd_t *pmdp) | 827 | unsigned long addr, pmd_t *pmdp) |
788 | { | 828 | { |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) | |||
86 | 86 | ||
87 | static void fdatawait_one_bdev(struct block_device *bdev, void *arg) | 87 | static void fdatawait_one_bdev(struct block_device *bdev, void *arg) |
88 | { | 88 | { |
89 | filemap_fdatawait(bdev->bd_inode->i_mapping); | 89 | /* |
90 | * We keep the error status of each individual mapping so that | ||
91 | * applications can catch the writeback error using fsync(2). | ||
92 | * See filemap_fdatawait_keep_errors() for details. | ||
93 | */ | ||
94 | filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); | ||
90 | } | 95 | } |
91 | 96 | ||
92 | /* | 97 | /* |
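The comment added to fdatawait_one_bdev() is about preserving per-mapping writeback errors across sync(2). From userspace, the pattern it enables looks roughly like the sketch below (illustrative only, plain POSIX calls; fd here would be a descriptor on the block device itself):

#include <stdio.h>
#include <unistd.h>

/* Because the bdev wait path now keeps the mapping's error status, a
 * later fsync(2) on the device can still report a writeback failure
 * that a system-wide sync() already waited on. */
int flush_and_check(int fd)
{
	sync();				/* global writeback; errors are kept */

	if (fsync(fd) < 0) {		/* per-fd check still sees them */
		perror("fsync");
		return -1;
	}
	return 0;
}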
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index aa8f61cf3a19..4cd4ddf64cc7 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -15,7 +15,8 @@ | |||
15 | /* For more detailed tracepoint output */ | 15 | /* For more detailed tracepoint output */ |
16 | #define COMPACT_NO_SUITABLE_PAGE 5 | 16 | #define COMPACT_NO_SUITABLE_PAGE 5 |
17 | #define COMPACT_NOT_SUITABLE_ZONE 6 | 17 | #define COMPACT_NOT_SUITABLE_ZONE 6 |
18 | /* When adding new state, please change compaction_status_string, too */ | 18 | #define COMPACT_CONTENDED 7 |
19 | /* When adding new states, please adjust include/trace/events/compaction.h */ | ||
19 | 20 | ||
20 | /* Used to signal whether compaction detected need_sched() or lock contention */ | 21 | /* Used to signal whether compaction detected need_sched() or lock contention */ |
21 | /* No contention detected */ | 22 | /* No contention detected */ |
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 8efb40e61d6e..0e3110a0b771 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h | |||
@@ -210,6 +210,23 @@ | |||
210 | #define __visible __attribute__((externally_visible)) | 210 | #define __visible __attribute__((externally_visible)) |
211 | #endif | 211 | #endif |
212 | 212 | ||
213 | |||
214 | #if GCC_VERSION >= 40900 && !defined(__CHECKER__) | ||
215 | /* | ||
216 | * __assume_aligned(n, k): Tell the optimizer that the returned | ||
217 | * pointer can be assumed to be k modulo n. The second argument is | ||
218 | * optional (default 0), so we use a variadic macro to make the | ||
219 | * shorthand. | ||
220 | * | ||
221 | * Beware: Do not apply this to functions which may return | ||
222 | * ERR_PTRs. Also, it is probably unwise to apply it to functions | ||
223 | * returning extra information in the low bits (but in that case the | ||
224 | * compiler should see some alignment anyway, when the return value is | ||
225 | * massaged by 'flags = ptr & 3; ptr &= ~3;'). | ||
226 | */ | ||
227 | #define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) | ||
228 | #endif | ||
229 | |||
213 | /* | 230 | /* |
214 | * GCC 'asm goto' miscompiles certain code sequences: | 231 | * GCC 'asm goto' miscompiles certain code sequences: |
215 | * | 232 | * |
diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 52a459ff75f4..4dac1036594f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h | |||
@@ -417,6 +417,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s | |||
417 | #define __visible | 417 | #define __visible |
418 | #endif | 418 | #endif |
419 | 419 | ||
420 | /* | ||
421 | * Assume alignment of return value. | ||
422 | */ | ||
423 | #ifndef __assume_aligned | ||
424 | #define __assume_aligned(a, ...) | ||
425 | #endif | ||
426 | |||
427 | |||
420 | /* Are two types/vars the same type (ignoring qualifiers)? */ | 428 | /* Are two types/vars the same type (ignoring qualifiers)? */ |
421 | #ifndef __same_type | 429 | #ifndef __same_type |
422 | # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) | 430 | # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) |
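Taken together with the gcc-only definition above, __assume_aligned() is an annotation for allocator-style functions whose return value has a known alignment; on compilers without the attribute it compiles away. A hedged sketch of how such an annotation might be used (alloc_cacheline() is a made-up example, and the macro is re-derived locally so the snippet stands alone):

#include <stdlib.h>

/* Local stand-in for the kernel macro so this sketch compiles by itself;
 * in-tree code would simply include <linux/compiler.h>. */
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))
#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
#else
#define __assume_aligned(a, ...)
#endif

/* Promise the optimizer a 64-byte-aligned result; never use this on a
 * function that can return ERR_PTR()-style encoded values. */
void *alloc_cacheline(size_t size) __assume_aligned(64);

void *alloc_cacheline(size_t size)
{
	/* round up so the C11 "size is a multiple of alignment" rule holds */
	return aligned_alloc(64, (size + 63) & ~(size_t)63);
}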
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1b357997cac5..5a1311942358 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void); | |||
93 | 93 | ||
94 | extern void rebuild_sched_domains(void); | 94 | extern void rebuild_sched_domains(void); |
95 | 95 | ||
96 | extern void cpuset_print_task_mems_allowed(struct task_struct *p); | 96 | extern void cpuset_print_current_mems_allowed(void); |
97 | 97 | ||
98 | /* | 98 | /* |
99 | * read_mems_allowed_begin is required when making decisions involving | 99 | * read_mems_allowed_begin is required when making decisions involving |
@@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void) | |||
219 | partition_sched_domains(1, NULL, NULL); | 219 | partition_sched_domains(1, NULL, NULL); |
220 | } | 220 | } |
221 | 221 | ||
222 | static inline void cpuset_print_task_mems_allowed(struct task_struct *p) | 222 | static inline void cpuset_print_current_mems_allowed(void) |
223 | { | 223 | { |
224 | } | 224 | } |
225 | 225 | ||
diff --git a/include/linux/fs.h b/include/linux/fs.h index 49749688156d..9a1cb8c605e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2409,6 +2409,7 @@ extern int write_inode_now(struct inode *, int); | |||
2409 | extern int filemap_fdatawrite(struct address_space *); | 2409 | extern int filemap_fdatawrite(struct address_space *); |
2410 | extern int filemap_flush(struct address_space *); | 2410 | extern int filemap_flush(struct address_space *); |
2411 | extern int filemap_fdatawait(struct address_space *); | 2411 | extern int filemap_fdatawait(struct address_space *); |
2412 | extern void filemap_fdatawait_keep_errors(struct address_space *); | ||
2412 | extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, | 2413 | extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, |
2413 | loff_t lend); | 2414 | loff_t lend); |
2414 | extern int filemap_write_and_wait(struct address_space *mapping); | 2415 | extern int filemap_write_and_wait(struct address_space *mapping); |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5e35379f58a5..685c262e0be8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | |||
483 | #define hugepages_supported() (HPAGE_SHIFT != 0) | 483 | #define hugepages_supported() (HPAGE_SHIFT != 0) |
484 | #endif | 484 | #endif |
485 | 485 | ||
486 | void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); | ||
487 | |||
488 | static inline void hugetlb_count_add(long l, struct mm_struct *mm) | ||
489 | { | ||
490 | atomic_long_add(l, &mm->hugetlb_usage); | ||
491 | } | ||
492 | |||
493 | static inline void hugetlb_count_sub(long l, struct mm_struct *mm) | ||
494 | { | ||
495 | atomic_long_sub(l, &mm->hugetlb_usage); | ||
496 | } | ||
486 | #else /* CONFIG_HUGETLB_PAGE */ | 497 | #else /* CONFIG_HUGETLB_PAGE */ |
487 | struct hstate {}; | 498 | struct hstate {}; |
488 | #define alloc_huge_page(v, a, r) NULL | 499 | #define alloc_huge_page(v, a, r) NULL |
@@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | |||
519 | { | 530 | { |
520 | return &mm->page_table_lock; | 531 | return &mm->page_table_lock; |
521 | } | 532 | } |
533 | |||
534 | static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) | ||
535 | { | ||
536 | } | ||
537 | |||
538 | static inline void hugetlb_count_sub(long l, struct mm_struct *mm) | ||
539 | { | ||
540 | } | ||
522 | #endif /* CONFIG_HUGETLB_PAGE */ | 541 | #endif /* CONFIG_HUGETLB_PAGE */ |
523 | 542 | ||
524 | static inline spinlock_t *huge_pte_lock(struct hstate *h, | 543 | static inline spinlock_t *huge_pte_lock(struct hstate *h, |
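hugetlb_count_add()/hugetlb_count_sub() maintain the new per-mm hugetlb_usage counter (added to mm_struct further down), and hugetlb_report_usage() is what turns it into the HugetlbPages line of /proc/PID/status. The reporting side lives elsewhere in this series, in mm/hugetlb.c; roughly, and not verbatim, it amounts to:

/* Approximate sketch of the reporting helper declared above: the counter
 * is kept in base pages, so kB is a shift by (PAGE_SHIFT - 10). */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
}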
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index c518eb589260..24daf8fc4d7c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type, | |||
89 | phys_addr_t base, phys_addr_t size, | 89 | phys_addr_t base, phys_addr_t size, |
90 | int nid, unsigned long flags); | 90 | int nid, unsigned long flags); |
91 | 91 | ||
92 | int memblock_remove_range(struct memblock_type *type, | ||
93 | phys_addr_t base, | ||
94 | phys_addr_t size); | ||
95 | |||
96 | void __next_mem_range(u64 *idx, int nid, ulong flags, | 92 | void __next_mem_range(u64 *idx, int nid, ulong flags, |
97 | struct memblock_type *type_a, | 93 | struct memblock_type *type_a, |
98 | struct memblock_type *type_b, phys_addr_t *out_start, | 94 | struct memblock_type *type_b, phys_addr_t *out_start, |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 27251ed428f7..cd0e2413c358 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -301,8 +301,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); | |||
301 | void mem_cgroup_uncharge(struct page *page); | 301 | void mem_cgroup_uncharge(struct page *page); |
302 | void mem_cgroup_uncharge_list(struct list_head *page_list); | 302 | void mem_cgroup_uncharge_list(struct list_head *page_list); |
303 | 303 | ||
304 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | 304 | void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage); |
305 | bool lrucare); | ||
306 | 305 | ||
307 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); | 306 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); |
308 | struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); | 307 | struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); |
@@ -384,7 +383,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | |||
384 | return mz->lru_size[lru]; | 383 | return mz->lru_size[lru]; |
385 | } | 384 | } |
386 | 385 | ||
387 | static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | 386 | static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) |
388 | { | 387 | { |
389 | unsigned long inactive_ratio; | 388 | unsigned long inactive_ratio; |
390 | unsigned long inactive; | 389 | unsigned long inactive; |
@@ -403,24 +402,26 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
403 | return inactive * inactive_ratio < active; | 402 | return inactive * inactive_ratio < active; |
404 | } | 403 | } |
405 | 404 | ||
405 | void mem_cgroup_handle_over_high(void); | ||
406 | |||
406 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | 407 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, |
407 | struct task_struct *p); | 408 | struct task_struct *p); |
408 | 409 | ||
409 | static inline void mem_cgroup_oom_enable(void) | 410 | static inline void mem_cgroup_oom_enable(void) |
410 | { | 411 | { |
411 | WARN_ON(current->memcg_oom.may_oom); | 412 | WARN_ON(current->memcg_may_oom); |
412 | current->memcg_oom.may_oom = 1; | 413 | current->memcg_may_oom = 1; |
413 | } | 414 | } |
414 | 415 | ||
415 | static inline void mem_cgroup_oom_disable(void) | 416 | static inline void mem_cgroup_oom_disable(void) |
416 | { | 417 | { |
417 | WARN_ON(!current->memcg_oom.may_oom); | 418 | WARN_ON(!current->memcg_may_oom); |
418 | current->memcg_oom.may_oom = 0; | 419 | current->memcg_may_oom = 0; |
419 | } | 420 | } |
420 | 421 | ||
421 | static inline bool task_in_memcg_oom(struct task_struct *p) | 422 | static inline bool task_in_memcg_oom(struct task_struct *p) |
422 | { | 423 | { |
423 | return p->memcg_oom.memcg; | 424 | return p->memcg_in_oom; |
424 | } | 425 | } |
425 | 426 | ||
426 | bool mem_cgroup_oom_synchronize(bool wait); | 427 | bool mem_cgroup_oom_synchronize(bool wait); |
@@ -537,9 +538,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
537 | { | 538 | { |
538 | } | 539 | } |
539 | 540 | ||
540 | static inline void mem_cgroup_migrate(struct page *oldpage, | 541 | static inline void mem_cgroup_replace_page(struct page *old, struct page *new) |
541 | struct page *newpage, | ||
542 | bool lrucare) | ||
543 | { | 542 | { |
544 | } | 543 | } |
545 | 544 | ||
@@ -585,10 +584,10 @@ static inline bool mem_cgroup_disabled(void) | |||
585 | return true; | 584 | return true; |
586 | } | 585 | } |
587 | 586 | ||
588 | static inline int | 587 | static inline bool |
589 | mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | 588 | mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) |
590 | { | 589 | { |
591 | return 1; | 590 | return true; |
592 | } | 591 | } |
593 | 592 | ||
594 | static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | 593 | static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) |
@@ -622,6 +621,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) | |||
622 | { | 621 | { |
623 | } | 622 | } |
624 | 623 | ||
624 | static inline void mem_cgroup_handle_over_high(void) | ||
625 | { | ||
626 | } | ||
627 | |||
625 | static inline void mem_cgroup_oom_enable(void) | 628 | static inline void mem_cgroup_oom_enable(void) |
626 | { | 629 | { |
627 | } | 630 | } |
@@ -748,11 +751,10 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
748 | * conditions, but because they are pretty simple, they are expected to be | 751 | * conditions, but because they are pretty simple, they are expected to be |
749 | * fast. | 752 | * fast. |
750 | */ | 753 | */ |
751 | bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, | 754 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, |
752 | int order); | 755 | struct mem_cgroup *memcg); |
753 | void __memcg_kmem_commit_charge(struct page *page, | 756 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); |
754 | struct mem_cgroup *memcg, int order); | 757 | void __memcg_kmem_uncharge(struct page *page, int order); |
755 | void __memcg_kmem_uncharge_pages(struct page *page, int order); | ||
756 | 758 | ||
757 | /* | 759 | /* |
758 | * helper for accessing a memcg's index. It will be used as an index in the | 760 |
@@ -767,77 +769,42 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) | |||
767 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); | 769 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); |
768 | void __memcg_kmem_put_cache(struct kmem_cache *cachep); | 770 | void __memcg_kmem_put_cache(struct kmem_cache *cachep); |
769 | 771 | ||
770 | struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); | 772 | static inline bool __memcg_kmem_bypass(gfp_t gfp) |
771 | |||
772 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
773 | unsigned long nr_pages); | ||
774 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); | ||
775 | |||
776 | /** | ||
777 | * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. | ||
778 | * @gfp: the gfp allocation flags. | ||
779 | * @memcg: a pointer to the memcg this was charged against. | ||
780 | * @order: allocation order. | ||
781 | * | ||
782 | * returns true if the memcg where the current task belongs can hold this | ||
783 | * allocation. | ||
784 | * | ||
785 | * We return true automatically if this allocation is not to be accounted to | ||
786 | * any memcg. | ||
787 | */ | ||
788 | static inline bool | ||
789 | memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) | ||
790 | { | 773 | { |
791 | if (!memcg_kmem_enabled()) | 774 | if (!memcg_kmem_enabled()) |
792 | return true; | 775 | return true; |
793 | |||
794 | if (gfp & __GFP_NOACCOUNT) | 776 | if (gfp & __GFP_NOACCOUNT) |
795 | return true; | 777 | return true; |
796 | /* | ||
797 | * __GFP_NOFAIL allocations will move on even if charging is not | ||
798 | * possible. Therefore we don't even try, and have this allocation | ||
799 | * unaccounted. We could in theory charge it forcibly, but we hope | ||
800 | * those allocations are rare, and won't be worth the trouble. | ||
801 | */ | ||
802 | if (gfp & __GFP_NOFAIL) | ||
803 | return true; | ||
804 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) | 778 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) |
805 | return true; | 779 | return true; |
806 | 780 | return false; | |
807 | /* If the test is dying, just let it go. */ | ||
808 | if (unlikely(fatal_signal_pending(current))) | ||
809 | return true; | ||
810 | |||
811 | return __memcg_kmem_newpage_charge(gfp, memcg, order); | ||
812 | } | 781 | } |
813 | 782 | ||
814 | /** | 783 | /** |
815 | * memcg_kmem_uncharge_pages: uncharge pages from memcg | 784 | * memcg_kmem_charge: charge a kmem page |
816 | * @page: pointer to struct page being freed | 785 | * @page: page to charge |
817 | * @order: allocation order. | 786 | * @gfp: reclaim mode |
787 | * @order: allocation order | ||
788 | * | ||
789 | * Returns 0 on success, an error code on failure. | ||
818 | */ | 790 | */ |
819 | static inline void | 791 | static __always_inline int memcg_kmem_charge(struct page *page, |
820 | memcg_kmem_uncharge_pages(struct page *page, int order) | 792 | gfp_t gfp, int order) |
821 | { | 793 | { |
822 | if (memcg_kmem_enabled()) | 794 | if (__memcg_kmem_bypass(gfp)) |
823 | __memcg_kmem_uncharge_pages(page, order); | 795 | return 0; |
796 | return __memcg_kmem_charge(page, gfp, order); | ||
824 | } | 797 | } |
825 | 798 | ||
826 | /** | 799 | /** |
827 | * memcg_kmem_commit_charge: embeds correct memcg in a page | 800 | * memcg_kmem_uncharge: uncharge a kmem page |
828 | * @page: pointer to struct page recently allocated | 801 | * @page: page to uncharge |
829 | * @memcg: the memcg structure we charged against | 802 | * @order: allocation order |
830 | * @order: allocation order. | ||
831 | * | ||
832 | * Needs to be called after memcg_kmem_newpage_charge, regardless of success or | ||
833 | * failure of the allocation. if @page is NULL, this function will revert the | ||
834 | * charges. Otherwise, it will commit @page to @memcg. | ||
835 | */ | 803 | */ |
836 | static inline void | 804 | static __always_inline void memcg_kmem_uncharge(struct page *page, int order) |
837 | memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) | ||
838 | { | 805 | { |
839 | if (memcg_kmem_enabled() && memcg) | 806 | if (memcg_kmem_enabled()) |
840 | __memcg_kmem_commit_charge(page, memcg, order); | 807 | __memcg_kmem_uncharge(page, order); |
841 | } | 808 | } |
842 | 809 | ||
843 | /** | 810 | /** |
@@ -850,17 +817,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) | |||
850 | static __always_inline struct kmem_cache * | 817 | static __always_inline struct kmem_cache * |
851 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | 818 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) |
852 | { | 819 | { |
853 | if (!memcg_kmem_enabled()) | 820 | if (__memcg_kmem_bypass(gfp)) |
854 | return cachep; | ||
855 | if (gfp & __GFP_NOACCOUNT) | ||
856 | return cachep; | ||
857 | if (gfp & __GFP_NOFAIL) | ||
858 | return cachep; | ||
859 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) | ||
860 | return cachep; | 821 | return cachep; |
861 | if (unlikely(fatal_signal_pending(current))) | ||
862 | return cachep; | ||
863 | |||
864 | return __memcg_kmem_get_cache(cachep); | 822 | return __memcg_kmem_get_cache(cachep); |
865 | } | 823 | } |
866 | 824 | ||
@@ -869,13 +827,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | |||
869 | if (memcg_kmem_enabled()) | 827 | if (memcg_kmem_enabled()) |
870 | __memcg_kmem_put_cache(cachep); | 828 | __memcg_kmem_put_cache(cachep); |
871 | } | 829 | } |
872 | |||
873 | static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) | ||
874 | { | ||
875 | if (!memcg_kmem_enabled()) | ||
876 | return NULL; | ||
877 | return __mem_cgroup_from_kmem(ptr); | ||
878 | } | ||
879 | #else | 830 | #else |
880 | #define for_each_memcg_cache_index(_idx) \ | 831 | #define for_each_memcg_cache_index(_idx) \ |
881 | for (; NULL; ) | 832 | for (; NULL; ) |
@@ -890,18 +841,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
890 | return false; | 841 | return false; |
891 | } | 842 | } |
892 | 843 | ||
893 | static inline bool | 844 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) |
894 | memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) | ||
895 | { | ||
896 | return true; | ||
897 | } | ||
898 | |||
899 | static inline void memcg_kmem_uncharge_pages(struct page *page, int order) | ||
900 | { | 845 | { |
846 | return 0; | ||
901 | } | 847 | } |
902 | 848 | ||
903 | static inline void | 849 | static inline void memcg_kmem_uncharge(struct page *page, int order) |
904 | memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) | ||
905 | { | 850 | { |
906 | } | 851 | } |
907 | 852 | ||
@@ -927,11 +872,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
927 | static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | 872 | static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) |
928 | { | 873 | { |
929 | } | 874 | } |
930 | |||
931 | static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) | ||
932 | { | ||
933 | return NULL; | ||
934 | } | ||
935 | #endif /* CONFIG_MEMCG_KMEM */ | 875 | #endif /* CONFIG_MEMCG_KMEM */ |
936 | #endif /* _LINUX_MEMCONTROL_H */ | 876 | #endif /* _LINUX_MEMCONTROL_H */ |
937 | |||
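The net effect of this memcontrol.h rework is that kmem page accounting collapses from the newpage_charge/commit_charge/uncharge_pages triple into a single try/undo pair, with __memcg_kmem_bypass() hoisting the common early-exit checks. A hedged sketch of the resulting calling pattern (not the actual page allocator code):

/* Sketch only: how a kmem allocation site pairs the new helpers. */
static struct page *kmem_alloc_pages_sketch(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_pages(gfp, order);

	if (!page)
		return NULL;

	/* a no-op for __GFP_NOACCOUNT, interrupt context, kthreads, ... */
	if (memcg_kmem_charge(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	return page;
}

static void kmem_free_pages_sketch(struct page *page, unsigned int order)
{
	memcg_kmem_uncharge(page, order);	/* reverses memcg_kmem_charge() */
	__free_pages(page, order);
}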
diff --git a/include/linux/mm.h b/include/linux/mm.h index 80001de019ba..906c46a05707 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp); | |||
139 | 139 | ||
140 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ | 140 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ |
141 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ | 141 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ |
142 | #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ | ||
142 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ | 143 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ |
143 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ | 144 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ |
144 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 145 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp); | |||
202 | /* This mask defines which mm->def_flags a process can inherit its parent */ | 203 | /* This mask defines which mm->def_flags a process can inherit its parent */ |
203 | #define VM_INIT_DEF_MASK VM_NOHUGEPAGE | 204 | #define VM_INIT_DEF_MASK VM_NOHUGEPAGE |
204 | 205 | ||
206 | /* This mask is used to clear all the VMA flags used by mlock */ | ||
207 | #define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) | ||
208 | |||
205 | /* | 209 | /* |
206 | * mapping from the currently active vm_flags protection bits (the | 210 | * mapping from the currently active vm_flags protection bits (the |
207 | * low four bits) to a page protection mask.. | 211 | * low four bits) to a page protection mask.. |
@@ -1606,8 +1610,10 @@ static inline void pgtable_init(void) | |||
1606 | 1610 | ||
1607 | static inline bool pgtable_page_ctor(struct page *page) | 1611 | static inline bool pgtable_page_ctor(struct page *page) |
1608 | { | 1612 | { |
1613 | if (!ptlock_init(page)) | ||
1614 | return false; | ||
1609 | inc_zone_page_state(page, NR_PAGETABLE); | 1615 | inc_zone_page_state(page, NR_PAGETABLE); |
1610 | return ptlock_init(page); | 1616 | return true; |
1611 | } | 1617 | } |
1612 | 1618 | ||
1613 | static inline void pgtable_page_dtor(struct page *page) | 1619 | static inline void pgtable_page_dtor(struct page *page) |
@@ -2036,8 +2042,6 @@ void page_cache_async_readahead(struct address_space *mapping, | |||
2036 | pgoff_t offset, | 2042 | pgoff_t offset, |
2037 | unsigned long size); | 2043 | unsigned long size); |
2038 | 2044 | ||
2039 | unsigned long max_sane_readahead(unsigned long nr); | ||
2040 | |||
2041 | /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ | 2045 | /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ |
2042 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); | 2046 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
2043 | 2047 | ||
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, | |||
2137 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | 2141 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ |
2138 | #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ | 2142 | #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ |
2139 | #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ | 2143 | #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ |
2144 | #define FOLL_MLOCK 0x1000 /* lock present pages */ | ||
2140 | 2145 | ||
2141 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 2146 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
2142 | void *data); | 2147 | void *data); |
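VM_LOCKONFAULT and VM_LOCKED_CLEAR_MASK, defined earlier in this mm.h diff, are consumed by the mlock2()/munlock rework elsewhere in the series; the mask exists so callers can strip every mlock-related bit in one step instead of open-coding ~VM_LOCKED. A hedged sketch of the intended flag handling (made-up helper name):

/* Illustration only: compute a vma's new flags for mlock2()/munlock(). */
static vm_flags_t mlock_vma_flags_sketch(vm_flags_t old, bool lock, bool onfault)
{
	vm_flags_t flags = old & VM_LOCKED_CLEAR_MASK;	/* drop VM_LOCKED and VM_LOCKONFAULT */

	if (lock) {
		flags |= VM_LOCKED;
		if (onfault)
			flags |= VM_LOCKONFAULT;	/* populate pages only on fault */
	}
	return flags;
}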
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3d6baa7d4534..0a85da25a822 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -486,6 +486,9 @@ struct mm_struct { | |||
486 | /* address of the bounds directory */ | 486 | /* address of the bounds directory */ |
487 | void __user *bd_addr; | 487 | void __user *bd_addr; |
488 | #endif | 488 | #endif |
489 | #ifdef CONFIG_HUGETLB_PAGE | ||
490 | atomic_long_t hugetlb_usage; | ||
491 | #endif | ||
489 | }; | 492 | }; |
490 | 493 | ||
491 | static inline void mm_init_cpumask(struct mm_struct *mm) | 494 | static inline void mm_init_cpumask(struct mm_struct *mm) |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d94347737292..2d7e660cdefe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -823,8 +823,7 @@ enum memmap_context { | |||
823 | MEMMAP_HOTPLUG, | 823 | MEMMAP_HOTPLUG, |
824 | }; | 824 | }; |
825 | extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, | 825 | extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, |
826 | unsigned long size, | 826 | unsigned long size); |
827 | enum memmap_context context); | ||
828 | 827 | ||
829 | extern void lruvec_init(struct lruvec *lruvec); | 828 | extern void lruvec_init(struct lruvec *lruvec); |
830 | 829 | ||
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 78488e099ce7..7ec5b86735f3 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
@@ -73,6 +73,7 @@ extern int watchdog_user_enabled; | |||
73 | extern int watchdog_thresh; | 73 | extern int watchdog_thresh; |
74 | extern unsigned long *watchdog_cpumask_bits; | 74 | extern unsigned long *watchdog_cpumask_bits; |
75 | extern int sysctl_softlockup_all_cpu_backtrace; | 75 | extern int sysctl_softlockup_all_cpu_backtrace; |
76 | extern int sysctl_hardlockup_all_cpu_backtrace; | ||
76 | struct ctl_table; | 77 | struct ctl_table; |
77 | extern int proc_watchdog(struct ctl_table *, int , | 78 | extern int proc_watchdog(struct ctl_table *, int , |
78 | void __user *, size_t *, loff_t *); | 79 | void __user *, size_t *, loff_t *); |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 416509e26d6d..a525e5067484 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -256,7 +256,7 @@ PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) | |||
256 | * Must use a macro here due to header dependency issues. page_zone() is not | 256 | * Must use a macro here due to header dependency issues. page_zone() is not |
257 | * available at this point. | 257 | * available at this point. |
258 | */ | 258 | */ |
259 | #define PageHighMem(__p) is_highmem(page_zone(__p)) | 259 | #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) |
260 | #else | 260 | #else |
261 | PAGEFLAG_FALSE(HighMem) | 261 | PAGEFLAG_FALSE(HighMem) |
262 | #endif | 262 | #endif |
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 17fa4f8de3a6..7e62920a3a94 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h | |||
@@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct page_counter *counter) | |||
36 | 36 | ||
37 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); | 37 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); |
38 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); | 38 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); |
39 | int page_counter_try_charge(struct page_counter *counter, | 39 | bool page_counter_try_charge(struct page_counter *counter, |
40 | unsigned long nr_pages, | 40 | unsigned long nr_pages, |
41 | struct page_counter **fail); | 41 | struct page_counter **fail); |
42 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); | 42 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); |
43 | int page_counter_limit(struct page_counter *counter, unsigned long limit); | 43 | int page_counter_limit(struct page_counter *counter, unsigned long limit); |
44 | int page_counter_memparse(const char *buf, const char *max, | 44 | int page_counter_memparse(const char *buf, const char *max, |
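page_counter_try_charge() changing from an int error code to a bool flips the sense of every caller's check: success is now true rather than 0, and on failure *fail identifies the counter whose limit was hit. A sketch of a call site under the new convention (the surrounding reclaim/OOM fallback is elided):

/* Sketch of the new bool calling convention. */
static bool try_charge_sketch(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *fail;

	if (page_counter_try_charge(counter, nr_pages, &fail))
		return true;		/* charged all the way up the hierarchy */

	/* over limit: 'fail' points at the offending counter; real callers
	 * fall back to reclaim or OOM handling here */
	return false;
}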
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4effb1025fbb..eeb5066a44fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, | |||
384 | void __user *buffer, | 384 | void __user *buffer, |
385 | size_t *lenp, loff_t *ppos); | 385 | size_t *lenp, loff_t *ppos); |
386 | extern unsigned int softlockup_panic; | 386 | extern unsigned int softlockup_panic; |
387 | extern unsigned int hardlockup_panic; | ||
387 | void lockup_detector_init(void); | 388 | void lockup_detector_init(void); |
388 | #else | 389 | #else |
389 | static inline void touch_softlockup_watchdog(void) | 390 | static inline void touch_softlockup_watchdog(void) |
@@ -1460,7 +1461,9 @@ struct task_struct { | |||
1460 | unsigned sched_reset_on_fork:1; | 1461 | unsigned sched_reset_on_fork:1; |
1461 | unsigned sched_contributes_to_load:1; | 1462 | unsigned sched_contributes_to_load:1; |
1462 | unsigned sched_migrated:1; | 1463 | unsigned sched_migrated:1; |
1463 | 1464 | #ifdef CONFIG_MEMCG | |
1465 | unsigned memcg_may_oom:1; | ||
1466 | #endif | ||
1464 | #ifdef CONFIG_MEMCG_KMEM | 1467 | #ifdef CONFIG_MEMCG_KMEM |
1465 | unsigned memcg_kmem_skip_account:1; | 1468 | unsigned memcg_kmem_skip_account:1; |
1466 | #endif | 1469 | #endif |
@@ -1791,12 +1794,12 @@ struct task_struct { | |||
1791 | unsigned long trace_recursion; | 1794 | unsigned long trace_recursion; |
1792 | #endif /* CONFIG_TRACING */ | 1795 | #endif /* CONFIG_TRACING */ |
1793 | #ifdef CONFIG_MEMCG | 1796 | #ifdef CONFIG_MEMCG |
1794 | struct memcg_oom_info { | 1797 | struct mem_cgroup *memcg_in_oom; |
1795 | struct mem_cgroup *memcg; | 1798 | gfp_t memcg_oom_gfp_mask; |
1796 | gfp_t gfp_mask; | 1799 | int memcg_oom_order; |
1797 | int order; | 1800 | |
1798 | unsigned int may_oom:1; | 1801 | /* number of pages to reclaim on returning to userland */ |
1799 | } memcg_oom; | 1802 | unsigned int memcg_nr_pages_over_high; |
1800 | #endif | 1803 | #endif |
1801 | #ifdef CONFIG_UPROBES | 1804 | #ifdef CONFIG_UPROBES |
1802 | struct uprobe_task *utask; | 1805 | struct uprobe_task *utask; |
diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e37d448ed91..7c82e3b307a3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -111,7 +111,7 @@ struct mem_cgroup; | |||
111 | * struct kmem_cache related prototypes | 111 | * struct kmem_cache related prototypes |
112 | */ | 112 | */ |
113 | void __init kmem_cache_init(void); | 113 | void __init kmem_cache_init(void); |
114 | int slab_is_available(void); | 114 | bool slab_is_available(void); |
115 | 115 | ||
116 | struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, | 116 | struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, |
117 | unsigned long, | 117 | unsigned long, |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a460e2ef2843..a156b82dd14c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, | |||
887 | 887 | ||
888 | asmlinkage long sys_membarrier(int cmd, int flags); | 888 | asmlinkage long sys_membarrier(int cmd, int flags); |
889 | 889 | ||
890 | asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); | ||
891 | |||
890 | #endif | 892 | #endif |
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 84d497297c5f..26c152122a42 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/ptrace.h> | 50 | #include <linux/ptrace.h> |
51 | #include <linux/security.h> | 51 | #include <linux/security.h> |
52 | #include <linux/task_work.h> | 52 | #include <linux/task_work.h> |
53 | #include <linux/memcontrol.h> | ||
53 | struct linux_binprm; | 54 | struct linux_binprm; |
54 | 55 | ||
55 | /* | 56 | /* |
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) | |||
188 | smp_mb__after_atomic(); | 189 | smp_mb__after_atomic(); |
189 | if (unlikely(current->task_works)) | 190 | if (unlikely(current->task_works)) |
190 | task_work_run(); | 191 | task_work_run(); |
192 | |||
193 | mem_cgroup_handle_over_high(); | ||
191 | } | 194 | } |
192 | 195 | ||
193 | #endif /* <linux/tracehook.h> */ | 196 | #endif /* <linux/tracehook.h> */ |
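Calling mem_cgroup_handle_over_high() from tracehook_notify_resume() means a task that pushed its memcg over the "high" threshold performs the reclaim itself on its way back to userland, driven by the memcg_nr_pages_over_high counter added to task_struct earlier. Very roughly, and with reclaim_over_high() as a purely hypothetical stand-in for the memcontrol.c internals:

/* Hedged sketch; reclaim_over_high() is an invented name, not the
 * helper the real mm/memcontrol.c implementation uses. */
void mem_cgroup_handle_over_high(void)
{
	unsigned int nr_pages = current->memcg_nr_pages_over_high;

	if (likely(!nr_pages))
		return;				/* fast path: nothing deferred */

	current->memcg_nr_pages_over_high = 0;
	reclaim_over_high(current, nr_pages, GFP_KERNEL);
}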
diff --git a/include/linux/types.h b/include/linux/types.h index c314989d9158..70d8500bddf1 100644 --- a/include/linux/types.h +++ b/include/linux/types.h | |||
@@ -205,11 +205,25 @@ struct ustat { | |||
205 | * struct callback_head - callback structure for use with RCU and task_work | 205 | * struct callback_head - callback structure for use with RCU and task_work |
206 | * @next: next update requests in a list | 206 | * @next: next update requests in a list |
207 | * @func: actual update function to call after the grace period. | 207 | * @func: actual update function to call after the grace period. |
208 | * | ||
209 | * The struct is aligned to the size of a pointer. On most architectures this | ||
210 | * happens naturally due to ABI requirements, but some architectures (like | ||
211 | * CRIS) have a weird ABI and we need to ask for it explicitly. | ||
212 | * | ||
213 | * The alignment is required to guarantee that bits 0 and 1 of @next will be | ||
214 | * clear under normal conditions -- as long as we use call_rcu(), | ||
215 | * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue the callback. | ||
216 | * | ||
217 | * This guarantee is important for a few reasons: | ||
218 | * - future call_rcu_lazy() will make use of lower bits in the pointer; | ||
219 | * - the structure shares storage space in struct page with @compound_head, | ||
220 | * which encodes PageTail() in bit 0. The guarantee is needed to avoid | ||
221 | * false-positive PageTail(). | ||
208 | */ | 222 | */ |
209 | struct callback_head { | 223 | struct callback_head { |
210 | struct callback_head *next; | 224 | struct callback_head *next; |
211 | void (*func)(struct callback_head *head); | 225 | void (*func)(struct callback_head *head); |
212 | }; | 226 | } __attribute__((aligned(sizeof(void *)))); |
213 | #define rcu_head callback_head | 227 | #define rcu_head callback_head |
214 | 228 | ||
215 | typedef void (*rcu_callback_t)(struct rcu_head *head); | 229 | typedef void (*rcu_callback_t)(struct rcu_head *head); |
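The explicit alignment on struct callback_head guarantees that the low bits of any callback_head address are zero, which is what lets bit 0 of the storage it shares with page->compound_head safely encode PageTail(). A self-contained userspace check of that property (the struct is mirrored here purely for illustration):

/* Userspace mirror of the patched struct, for illustration only. */
struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

/* Pointer-size alignment keeps the low bits of any callback_head
 * address clear, so they can double as tag bits such as PageTail(). */
_Static_assert(_Alignof(struct callback_head) >= sizeof(void *),
	       "callback_head must be pointer-aligned");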
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d6f2c2c5b043..558129af828a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h | |||
@@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to, | |||
75 | 75 | ||
76 | #endif /* ARCH_HAS_NOCACHE_UACCESS */ | 76 | #endif /* ARCH_HAS_NOCACHE_UACCESS */ |
77 | 77 | ||
78 | /** | ||
79 | * probe_kernel_address(): safely attempt to read from a location | ||
80 | * @addr: address to read from - its type is type typeof(retval)* | ||
81 | * @retval: read into this variable | ||
82 | * | ||
83 | * Safely read from address @addr into variable @retval. If a kernel fault | ||
84 | * happens, handle that and return -EFAULT. | ||
85 | * We ensure that the __get_user() is executed in atomic context so that | ||
86 | * do_page_fault() doesn't attempt to take mmap_sem. This makes | ||
87 | * probe_kernel_address() suitable for use within regions where the caller | ||
88 | * already holds mmap_sem, or other locks which nest inside mmap_sem. | ||
89 | * This must be a macro because __get_user() needs to know the types of the | ||
90 | * args. | ||
91 | * | ||
92 | * We don't include enough header files to be able to do the set_fs(). We | ||
93 | * require that the probe_kernel_address() caller will do that. | ||
94 | */ | ||
95 | #define probe_kernel_address(addr, retval) \ | ||
96 | ({ \ | ||
97 | long ret; \ | ||
98 | mm_segment_t old_fs = get_fs(); \ | ||
99 | \ | ||
100 | set_fs(KERNEL_DS); \ | ||
101 | pagefault_disable(); \ | ||
102 | ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ | ||
103 | pagefault_enable(); \ | ||
104 | set_fs(old_fs); \ | ||
105 | ret; \ | ||
106 | }) | ||
107 | |||
108 | /* | 78 | /* |
109 | * probe_kernel_read(): safely attempt to read from a location | 79 | * probe_kernel_read(): safely attempt to read from a location |
110 | * @dst: pointer to the buffer that shall take the data | 80 | * @dst: pointer to the buffer that shall take the data |
@@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size | |||
131 | 101 | ||
132 | extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); | 102 | extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); |
133 | 103 | ||
104 | /** | ||
105 | * probe_kernel_address(): safely attempt to read from a location | ||
106 | * @addr: address to read from | ||
107 | * @retval: read into this variable | ||
108 | * | ||
109 | * Returns 0 on success, or -EFAULT. | ||
110 | */ | ||
111 | #define probe_kernel_address(addr, retval) \ | ||
112 | probe_kernel_read(&retval, addr, sizeof(retval)) | ||
113 | |||
134 | #endif /* __LINUX_UACCESS_H__ */ | 114 | #endif /* __LINUX_UACCESS_H__ */ |
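With the macro reduced to a wrapper around probe_kernel_read(), callers keep the same "0 or -EFAULT" contract. A kernel-context sketch of typical use, reading a word that may point at unmapped kernel memory (illustrative only, not taken from a specific caller):

        #include <linux/uaccess.h>
        #include <linux/printk.h>

        /* Illustrative only: print a word from a possibly invalid kernel
         * address without risking a fault-induced oops. */
        static void dump_word(const unsigned long *addr)
        {
                unsigned long val;

                if (probe_kernel_address(addr, val))
                        pr_cont(" (bad address %p)", addr);
                else
                        pr_cont(" %016lx", val);
        }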
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32dc973..e623d392db0c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -14,12 +14,12 @@ | |||
14 | #endif | 14 | #endif |
15 | 15 | ||
16 | #ifdef CONFIG_HIGHMEM | 16 | #ifdef CONFIG_HIGHMEM |
17 | #define HIGHMEM_ZONE(xx) , xx##_HIGH | 17 | #define HIGHMEM_ZONE(xx) xx##_HIGH, |
18 | #else | 18 | #else |
19 | #define HIGHMEM_ZONE(xx) | 19 | #define HIGHMEM_ZONE(xx) |
20 | #endif | 20 | #endif |
21 | 21 | ||
22 | #define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE | 22 | #define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE |
23 | 23 | ||
24 | enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | 24 | enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, |
25 | FOR_ALL_ZONES(PGALLOC), | 25 | FOR_ALL_ZONES(PGALLOC), |
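The comma moves from the front of the HighMem entry to the back of each preceding entry, so every conditional zone macro carries its own trailing comma and the list stays well-formed whichever zones are configured in. A small userspace mock of the same pattern; DMA_ZONE()/DMA32_ZONE() are flattened to plain entries here for brevity:

        #include <stdio.h>

        /* Redefine HIGHMEM_ZONE(xx) to expand to nothing (as the
         * !CONFIG_HIGHMEM branch does) and the enum still compiles. */
        #define HIGHMEM_ZONE(xx) xx##_HIGH,
        #define FOR_ALL_ZONES(xx) xx##_DMA, xx##_DMA32, xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE

        enum vm_event_demo { FOR_ALL_ZONES(PGALLOC), NR_DEMO_ITEMS };

        int main(void)
        {
                printf("PGALLOC_HIGH=%d PGALLOC_MOVABLE=%d NR=%d\n",
                       PGALLOC_HIGH, PGALLOC_MOVABLE, NR_DEMO_ITEMS);
                return 0;
        }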
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7f7100..5dbc8b0ee567 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, | |||
161 | } | 161 | } |
162 | 162 | ||
163 | #ifdef CONFIG_NUMA | 163 | #ifdef CONFIG_NUMA |
164 | /* | ||
165 | * Determine the per node value of a stat item. This function | ||
166 | * is called frequently in a NUMA machine, so try to be as | ||
167 | * frugal as possible. | ||
168 | */ | ||
169 | static inline unsigned long node_page_state(int node, | ||
170 | enum zone_stat_item item) | ||
171 | { | ||
172 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
173 | |||
174 | return | ||
175 | #ifdef CONFIG_ZONE_DMA | ||
176 | zone_page_state(&zones[ZONE_DMA], item) + | ||
177 | #endif | ||
178 | #ifdef CONFIG_ZONE_DMA32 | ||
179 | zone_page_state(&zones[ZONE_DMA32], item) + | ||
180 | #endif | ||
181 | #ifdef CONFIG_HIGHMEM | ||
182 | zone_page_state(&zones[ZONE_HIGHMEM], item) + | ||
183 | #endif | ||
184 | zone_page_state(&zones[ZONE_NORMAL], item) + | ||
185 | zone_page_state(&zones[ZONE_MOVABLE], item); | ||
186 | } | ||
187 | 164 | ||
165 | extern unsigned long node_page_state(int node, enum zone_stat_item item); | ||
188 | extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); | 166 | extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); |
189 | 167 | ||
190 | #else | 168 | #else |
@@ -269,7 +247,6 @@ static inline void __dec_zone_page_state(struct page *page, | |||
269 | 247 | ||
270 | #define set_pgdat_percpu_threshold(pgdat, callback) { } | 248 | #define set_pgdat_percpu_threshold(pgdat, callback) { } |
271 | 249 | ||
272 | static inline void refresh_cpu_vm_stats(int cpu) { } | ||
273 | static inline void refresh_zone_stat_thresholds(void) { } | 250 | static inline void refresh_zone_stat_thresholds(void) { } |
274 | static inline void cpu_vm_stats_fold(int cpu) { } | 251 | static inline void cpu_vm_stats_fold(int cpu) { } |
275 | 252 | ||
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 9a6a3fe0fb51..c92d1e1cbad9 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -9,6 +9,62 @@ | |||
9 | #include <linux/tracepoint.h> | 9 | #include <linux/tracepoint.h> |
10 | #include <trace/events/gfpflags.h> | 10 | #include <trace/events/gfpflags.h> |
11 | 11 | ||
12 | #define COMPACTION_STATUS \ | ||
13 | EM( COMPACT_DEFERRED, "deferred") \ | ||
14 | EM( COMPACT_SKIPPED, "skipped") \ | ||
15 | EM( COMPACT_CONTINUE, "continue") \ | ||
16 | EM( COMPACT_PARTIAL, "partial") \ | ||
17 | EM( COMPACT_COMPLETE, "complete") \ | ||
18 | EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ | ||
19 | EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ | ||
20 | EMe(COMPACT_CONTENDED, "contended") | ||
21 | |||
22 | #ifdef CONFIG_ZONE_DMA | ||
23 | #define IFDEF_ZONE_DMA(X) X | ||
24 | #else | ||
25 | #define IFDEF_ZONE_DMA(X) | ||
26 | #endif | ||
27 | |||
28 | #ifdef CONFIG_ZONE_DMA32 | ||
29 | #define IFDEF_ZONE_DMA32(X) X | ||
30 | #else | ||
31 | #define IFDEF_ZONE_DMA32(X) | ||
32 | #endif | ||
33 | |||
34 | #ifdef CONFIG_HIGHMEM | ||
35 | #define IFDEF_ZONE_HIGHMEM(X) X | ||
36 | #else | ||
37 | #define IFDEF_ZONE_HIGHMEM(X) | ||
38 | #endif | ||
39 | |||
40 | #define ZONE_TYPE \ | ||
41 | IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ | ||
42 | IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ | ||
43 | EM (ZONE_NORMAL, "Normal") \ | ||
44 | IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ | ||
45 | EMe(ZONE_MOVABLE,"Movable") | ||
46 | |||
47 | /* | ||
48 | * First define the enums in the above macros to be exported to userspace | ||
49 | * via TRACE_DEFINE_ENUM(). | ||
50 | */ | ||
51 | #undef EM | ||
52 | #undef EMe | ||
53 | #define EM(a, b) TRACE_DEFINE_ENUM(a); | ||
54 | #define EMe(a, b) TRACE_DEFINE_ENUM(a); | ||
55 | |||
56 | COMPACTION_STATUS | ||
57 | ZONE_TYPE | ||
58 | |||
59 | /* | ||
60 | * Now redefine the EM() and EMe() macros to map the enums to the strings | ||
61 | * that will be printed in the output. | ||
62 | */ | ||
63 | #undef EM | ||
64 | #undef EMe | ||
65 | #define EM(a, b) {a, b}, | ||
66 | #define EMe(a, b) {a, b} | ||
67 | |||
12 | DECLARE_EVENT_CLASS(mm_compaction_isolate_template, | 68 | DECLARE_EVENT_CLASS(mm_compaction_isolate_template, |
13 | 69 | ||
14 | TP_PROTO( | 70 | TP_PROTO( |
@@ -161,7 +217,7 @@ TRACE_EVENT(mm_compaction_end, | |||
161 | __entry->free_pfn, | 217 | __entry->free_pfn, |
162 | __entry->zone_end, | 218 | __entry->zone_end, |
163 | __entry->sync ? "sync" : "async", | 219 | __entry->sync ? "sync" : "async", |
164 | compaction_status_string[__entry->status]) | 220 | __print_symbolic(__entry->status, COMPACTION_STATUS)) |
165 | ); | 221 | ); |
166 | 222 | ||
167 | TRACE_EVENT(mm_compaction_try_to_compact_pages, | 223 | TRACE_EVENT(mm_compaction_try_to_compact_pages, |
@@ -201,23 +257,23 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template, | |||
201 | 257 | ||
202 | TP_STRUCT__entry( | 258 | TP_STRUCT__entry( |
203 | __field(int, nid) | 259 | __field(int, nid) |
204 | __field(char *, name) | 260 | __field(enum zone_type, idx) |
205 | __field(int, order) | 261 | __field(int, order) |
206 | __field(int, ret) | 262 | __field(int, ret) |
207 | ), | 263 | ), |
208 | 264 | ||
209 | TP_fast_assign( | 265 | TP_fast_assign( |
210 | __entry->nid = zone_to_nid(zone); | 266 | __entry->nid = zone_to_nid(zone); |
211 | __entry->name = (char *)zone->name; | 267 | __entry->idx = zone_idx(zone); |
212 | __entry->order = order; | 268 | __entry->order = order; |
213 | __entry->ret = ret; | 269 | __entry->ret = ret; |
214 | ), | 270 | ), |
215 | 271 | ||
216 | TP_printk("node=%d zone=%-8s order=%d ret=%s", | 272 | TP_printk("node=%d zone=%-8s order=%d ret=%s", |
217 | __entry->nid, | 273 | __entry->nid, |
218 | __entry->name, | 274 | __print_symbolic(__entry->idx, ZONE_TYPE), |
219 | __entry->order, | 275 | __entry->order, |
220 | compaction_status_string[__entry->ret]) | 276 | __print_symbolic(__entry->ret, COMPACTION_STATUS)) |
221 | ); | 277 | ); |
222 | 278 | ||
223 | DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, | 279 | DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, |
@@ -247,7 +303,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, | |||
247 | 303 | ||
248 | TP_STRUCT__entry( | 304 | TP_STRUCT__entry( |
249 | __field(int, nid) | 305 | __field(int, nid) |
250 | __field(char *, name) | 306 | __field(enum zone_type, idx) |
251 | __field(int, order) | 307 | __field(int, order) |
252 | __field(unsigned int, considered) | 308 | __field(unsigned int, considered) |
253 | __field(unsigned int, defer_shift) | 309 | __field(unsigned int, defer_shift) |
@@ -256,7 +312,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, | |||
256 | 312 | ||
257 | TP_fast_assign( | 313 | TP_fast_assign( |
258 | __entry->nid = zone_to_nid(zone); | 314 | __entry->nid = zone_to_nid(zone); |
259 | __entry->name = (char *)zone->name; | 315 | __entry->idx = zone_idx(zone); |
260 | __entry->order = order; | 316 | __entry->order = order; |
261 | __entry->considered = zone->compact_considered; | 317 | __entry->considered = zone->compact_considered; |
262 | __entry->defer_shift = zone->compact_defer_shift; | 318 | __entry->defer_shift = zone->compact_defer_shift; |
@@ -265,7 +321,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, | |||
265 | 321 | ||
266 | TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", | 322 | TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", |
267 | __entry->nid, | 323 | __entry->nid, |
268 | __entry->name, | 324 | __print_symbolic(__entry->idx, ZONE_TYPE), |
269 | __entry->order, | 325 | __entry->order, |
270 | __entry->order_failed, | 326 | __entry->order_failed, |
271 | __entry->considered, | 327 | __entry->considered, |
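The EM()/EMe() dance is the usual tracepoint trick: the same list macro is expanded once into TRACE_DEFINE_ENUM() statements, so userspace tools can resolve the enum values, and once into the {value, "string"} pairs consumed by __print_symbolic(). A cut-down userspace analogue of the second expansion, showing how a status value turns into a name (simplified, not the real trace machinery):

        #include <stdio.h>

        enum { COMPACT_SKIPPED, COMPACT_PARTIAL, COMPACT_COMPLETE };

        #define DEMO_STATUS                         \
                EM( COMPACT_SKIPPED,  "skipped")    \
                EM( COMPACT_PARTIAL,  "partial")    \
                EMe(COMPACT_COMPLETE, "complete")

        struct sym { int val; const char *name; };

        #define EM(a, b)  {a, b},
        #define EMe(a, b) {a, b}
        static const struct sym table[] = { DEMO_STATUS };
        #undef EM
        #undef EMe

        static const char *status_name(int status)
        {
                for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                        if (table[i].val == status)
                                return table[i].name;
                return "unknown";
        }

        int main(void)
        {
                printf("status %d -> %s\n", COMPACT_PARTIAL, status_name(COMPACT_PARTIAL));
                return 0;
        }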
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ddc3b36f1046..a74dd84bbb6d 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h | |||
@@ -25,6 +25,11 @@ | |||
25 | # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ | 25 | # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ |
26 | #endif | 26 | #endif |
27 | 27 | ||
28 | /* | ||
29 | * Flags for mlock | ||
30 | */ | ||
31 | #define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ | ||
32 | |||
28 | #define MS_ASYNC 1 /* sync memory asynchronously */ | 33 | #define MS_ASYNC 1 /* sync memory asynchronously */ |
29 | #define MS_INVALIDATE 2 /* invalidate the caches */ | 34 | #define MS_INVALIDATE 2 /* invalidate the caches */ |
30 | #define MS_SYNC 4 /* synchronous memory sync */ | 35 | #define MS_SYNC 4 /* synchronous memory sync */ |
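From userspace, the flag is meant to be passed to the new mlock2() syscall: the range is locked, but pages are only pinned as they are first touched. A hedged sketch follows, calling through syscall(2) since a libc wrapper may not exist yet; the fallback number 284 is the asm-generic value from the unistd.h hunk further down and is an assumption on other architectures.

        #define _GNU_SOURCE
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <stdlib.h>
        #include <stdio.h>

        #ifndef MLOCK_ONFAULT
        #define MLOCK_ONFAULT 0x01      /* mirrors the new uapi flag */
        #endif
        #ifndef __NR_mlock2
        #define __NR_mlock2 284         /* asm-generic value; arch-specific elsewhere */
        #endif

        int main(void)
        {
                size_t len = 1 << 20;
                char *buf = malloc(len);

                if (!buf)
                        return 1;
                /* Lock the whole range, but fault pages in lazily on first touch. */
                if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT))
                        perror("mlock2");
                buf[0] = 1;             /* this page is now resident and locked */
                free(buf);
                return 0;
        }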
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index e9fe6fd2a074..7162cd4cca73 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h | |||
@@ -17,5 +17,6 @@ | |||
17 | 17 | ||
18 | #define MCL_CURRENT 1 /* lock all current mappings */ | 18 | #define MCL_CURRENT 1 /* lock all current mappings */ |
19 | #define MCL_FUTURE 2 /* lock all future mappings */ | 19 | #define MCL_FUTURE 2 /* lock all future mappings */ |
20 | #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ | ||
20 | 21 | ||
21 | #endif /* __ASM_GENERIC_MMAN_H */ | 22 | #endif /* __ASM_GENERIC_MMAN_H */ |
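The same semantics are reachable through mlockall(): combining the new MCL_ONFAULT with MCL_CURRENT and/or MCL_FUTURE locks mappings without prefaulting them. A brief sketch; MCL_ONFAULT may be missing from older libc headers, hence the guarded define, and older kernels will simply reject the flag with EINVAL.

        #include <sys/mman.h>
        #include <stdio.h>

        #ifndef MCL_ONFAULT
        #define MCL_ONFAULT 4           /* value from the hunk above */
        #endif

        int main(void)
        {
                /* Lock current and future mappings, but only as pages fault in. */
                if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))
                        perror("mlockall");
                return 0;
        }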
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ee124009e12a..1324b0292ec2 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h | |||
@@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) | |||
713 | __SYSCALL(__NR_userfaultfd, sys_userfaultfd) | 713 | __SYSCALL(__NR_userfaultfd, sys_userfaultfd) |
714 | #define __NR_membarrier 283 | 714 | #define __NR_membarrier 283 |
715 | __SYSCALL(__NR_membarrier, sys_membarrier) | 715 | __SYSCALL(__NR_membarrier, sys_membarrier) |
716 | #define __NR_mlock2 284 | ||
717 | __SYSCALL(__NR_mlock2, sys_mlock2) | ||
716 | 718 | ||
717 | #undef __NR_syscalls | 719 | #undef __NR_syscalls |
718 | #define __NR_syscalls 284 | 720 | #define __NR_syscalls 285 |
719 | 721 | ||
720 | /* | 722 | /* |
721 | * All syscalls below here should go away really, | 723 | * All syscalls below here should go away really, |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d7ccb87a6714..10ae73611d80 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2598 | } | 2598 | } |
2599 | 2599 | ||
2600 | /** | 2600 | /** |
2601 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2601 | * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed |
2602 | * @tsk: pointer to task_struct of some task. | ||
2603 | * | 2602 | * |
2604 | * Description: Prints @task's name, cpuset name, and cached copy of its | 2603 | * Description: Prints current's name, cpuset name, and cached copy of its |
2605 | * mems_allowed to the kernel log. | 2604 | * mems_allowed to the kernel log. |
2606 | */ | 2605 | */ |
2607 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) | 2606 | void cpuset_print_current_mems_allowed(void) |
2608 | { | 2607 | { |
2609 | struct cgroup *cgrp; | 2608 | struct cgroup *cgrp; |
2610 | 2609 | ||
2611 | rcu_read_lock(); | 2610 | rcu_read_lock(); |
2612 | 2611 | ||
2613 | cgrp = task_cs(tsk)->css.cgroup; | 2612 | cgrp = task_cs(current)->css.cgroup; |
2614 | pr_info("%s cpuset=", tsk->comm); | 2613 | pr_info("%s cpuset=", current->comm); |
2615 | pr_cont_cgroup_name(cgrp); | 2614 | pr_cont_cgroup_name(cgrp); |
2616 | pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); | 2615 | pr_cont(" mems_allowed=%*pbl\n", |
2616 | nodemask_pr_args(&current->mems_allowed)); | ||
2617 | 2617 | ||
2618 | rcu_read_unlock(); | 2618 | rcu_read_unlock(); |
2619 | } | 2619 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index 825ecc32454d..f97f2c449f5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -455,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
455 | tmp->vm_mm = mm; | 455 | tmp->vm_mm = mm; |
456 | if (anon_vma_fork(tmp, mpnt)) | 456 | if (anon_vma_fork(tmp, mpnt)) |
457 | goto fail_nomem_anon_vma_fork; | 457 | goto fail_nomem_anon_vma_fork; |
458 | tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); | 458 | tmp->vm_flags &= |
459 | ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); | ||
459 | tmp->vm_next = tmp->vm_prev = NULL; | 460 | tmp->vm_next = tmp->vm_prev = NULL; |
460 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | 461 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; |
461 | file = tmp->vm_file; | 462 | file = tmp->vm_file; |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a02decf15583..0623787ec67a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock); | |||
194 | cond_syscall(sys_munlock); | 194 | cond_syscall(sys_munlock); |
195 | cond_syscall(sys_mlockall); | 195 | cond_syscall(sys_mlockall); |
196 | cond_syscall(sys_munlockall); | 196 | cond_syscall(sys_munlockall); |
197 | cond_syscall(sys_mlock2); | ||
197 | cond_syscall(sys_mincore); | 198 | cond_syscall(sys_mincore); |
198 | cond_syscall(sys_madvise); | 199 | cond_syscall(sys_madvise); |
199 | cond_syscall(sys_mremap); | 200 | cond_syscall(sys_mremap); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 96c856b04081..dc6858d6639e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = { | |||
888 | .extra1 = &zero, | 888 | .extra1 = &zero, |
889 | .extra2 = &one, | 889 | .extra2 = &one, |
890 | }, | 890 | }, |
891 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
892 | { | ||
893 | .procname = "hardlockup_panic", | ||
894 | .data = &hardlockup_panic, | ||
895 | .maxlen = sizeof(int), | ||
896 | .mode = 0644, | ||
897 | .proc_handler = proc_dointvec_minmax, | ||
898 | .extra1 = &zero, | ||
899 | .extra2 = &one, | ||
900 | }, | ||
901 | #endif | ||
891 | #ifdef CONFIG_SMP | 902 | #ifdef CONFIG_SMP |
892 | { | 903 | { |
893 | .procname = "softlockup_all_cpu_backtrace", | 904 | .procname = "softlockup_all_cpu_backtrace", |
@@ -898,6 +909,15 @@ static struct ctl_table kern_table[] = { | |||
898 | .extra1 = &zero, | 909 | .extra1 = &zero, |
899 | .extra2 = &one, | 910 | .extra2 = &one, |
900 | }, | 911 | }, |
912 | { | ||
913 | .procname = "hardlockup_all_cpu_backtrace", | ||
914 | .data = &sysctl_hardlockup_all_cpu_backtrace, | ||
915 | .maxlen = sizeof(int), | ||
916 | .mode = 0644, | ||
917 | .proc_handler = proc_dointvec_minmax, | ||
918 | .extra1 = &zero, | ||
919 | .extra2 = &one, | ||
920 | }, | ||
901 | #endif /* CONFIG_SMP */ | 921 | #endif /* CONFIG_SMP */ |
902 | #endif | 922 | #endif |
903 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 923 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
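The new knobs land under /proc/sys/kernel/. A minimal sketch of flipping hardlockup_panic at runtime (requires root and a kernel built with CONFIG_HARDLOCKUP_DETECTOR); it is the programmatic equivalent of echo 1 > /proc/sys/kernel/hardlockup_panic.

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/proc/sys/kernel/hardlockup_panic", O_WRONLY);

                if (fd < 0 || write(fd, "1\n", 2) != 2)
                        perror("hardlockup_panic");
                if (fd >= 0)
                        close(fd);
                return 0;
        }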
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 64ed1c37bd1f..18f34cf75f74 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10; | |||
57 | 57 | ||
58 | #ifdef CONFIG_SMP | 58 | #ifdef CONFIG_SMP |
59 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 59 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
60 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | ||
60 | #else | 61 | #else |
61 | #define sysctl_softlockup_all_cpu_backtrace 0 | 62 | #define sysctl_softlockup_all_cpu_backtrace 0 |
63 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
62 | #endif | 64 | #endif |
63 | static struct cpumask watchdog_cpumask __read_mostly; | 65 | static struct cpumask watchdog_cpumask __read_mostly; |
64 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 66 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn; | |||
110 | * Should we panic when a soft-lockup or hard-lockup occurs: | 112 | * Should we panic when a soft-lockup or hard-lockup occurs: |
111 | */ | 113 | */ |
112 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 114 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
113 | static int hardlockup_panic = | 115 | unsigned int __read_mostly hardlockup_panic = |
114 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 116 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
117 | static unsigned long hardlockup_allcpu_dumped; | ||
115 | /* | 118 | /* |
116 | * We may not want to enable hard lockup detection by default in all cases, | 119 | * We may not want to enable hard lockup detection by default in all cases, |
117 | * for example when running the kernel as a guest on a hypervisor. In these | 120 | * for example when running the kernel as a guest on a hypervisor. In these |
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) | |||
173 | return 1; | 176 | return 1; |
174 | } | 177 | } |
175 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | 178 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); |
179 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) | ||
180 | { | ||
181 | sysctl_hardlockup_all_cpu_backtrace = | ||
182 | !!simple_strtol(str, NULL, 0); | ||
183 | return 1; | ||
184 | } | ||
185 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); | ||
176 | #endif | 186 | #endif |
177 | 187 | ||
178 | /* | 188 | /* |
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void) | |||
263 | 273 | ||
264 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 274 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
265 | /* watchdog detector functions */ | 275 | /* watchdog detector functions */ |
266 | static int is_hardlockup(void) | 276 | static bool is_hardlockup(void) |
267 | { | 277 | { |
268 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | 278 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
269 | 279 | ||
270 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) | 280 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
271 | return 1; | 281 | return true; |
272 | 282 | ||
273 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | 283 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
274 | return 0; | 284 | return false; |
275 | } | 285 | } |
276 | #endif | 286 | #endif |
277 | 287 | ||
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) | |||
279 | { | 289 | { |
280 | unsigned long now = get_timestamp(); | 290 | unsigned long now = get_timestamp(); |
281 | 291 | ||
282 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { | 292 | if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh) {
283 | /* Warn about unreasonable delays. */ | 293 | /* Warn about unreasonable delays. */ |
284 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 294 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
285 | return now - touch_ts; | 295 | return now - touch_ts; |
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
318 | */ | 328 | */ |
319 | if (is_hardlockup()) { | 329 | if (is_hardlockup()) { |
320 | int this_cpu = smp_processor_id(); | 330 | int this_cpu = smp_processor_id(); |
331 | struct pt_regs *regs = get_irq_regs(); | ||
321 | 332 | ||
322 | /* only print hardlockups once */ | 333 | /* only print hardlockups once */ |
323 | if (__this_cpu_read(hard_watchdog_warn) == true) | 334 | if (__this_cpu_read(hard_watchdog_warn) == true) |
324 | return; | 335 | return; |
325 | 336 | ||
326 | if (hardlockup_panic) | 337 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
327 | panic("Watchdog detected hard LOCKUP on cpu %d", | 338 | print_modules(); |
328 | this_cpu); | 339 | print_irqtrace_events(current); |
340 | if (regs) | ||
341 | show_regs(regs); | ||
329 | else | 342 | else |
330 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", | 343 | dump_stack(); |
331 | this_cpu); | 344 | |
345 | /* | ||
346 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
347 | * generating interleaving traces | ||
348 | */ | ||
349 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
350 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
351 | trigger_allbutself_cpu_backtrace(); | ||
352 | |||
353 | if (hardlockup_panic) | ||
354 | panic("Hard LOCKUP"); | ||
332 | 355 | ||
333 | __this_cpu_write(hard_watchdog_warn, true); | 356 | __this_cpu_write(hard_watchdog_warn, true); |
334 | return; | 357 | return; |
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void) | |||
347 | static int watchdog_nmi_enable(unsigned int cpu); | 370 | static int watchdog_nmi_enable(unsigned int cpu); |
348 | static void watchdog_nmi_disable(unsigned int cpu); | 371 | static void watchdog_nmi_disable(unsigned int cpu); |
349 | 372 | ||
373 | static int watchdog_enable_all_cpus(void); | ||
374 | static void watchdog_disable_all_cpus(void); | ||
375 | |||
350 | /* watchdog kicker functions */ | 376 | /* watchdog kicker functions */ |
351 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 377 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
352 | { | 378 | { |
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
651 | 677 | ||
652 | /* | 678 | /* |
653 | * park all watchdog threads that are specified in 'watchdog_cpumask' | 679 | * park all watchdog threads that are specified in 'watchdog_cpumask' |
680 | * | ||
681 | * This function returns an error if kthread_park() of a watchdog thread | ||
682 | * fails. In this situation, the watchdog threads of some CPUs can already | ||
683 | * be parked and the watchdog threads of other CPUs can still be runnable. | ||
684 | * Callers are expected to handle this special condition as appropriate in | ||
685 | * their context. | ||
686 | * | ||
687 | * This function may only be called in a context that is protected against | ||
688 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
654 | */ | 689 | */ |
655 | static int watchdog_park_threads(void) | 690 | static int watchdog_park_threads(void) |
656 | { | 691 | { |
657 | int cpu, ret = 0; | 692 | int cpu, ret = 0; |
658 | 693 | ||
659 | get_online_cpus(); | ||
660 | for_each_watchdog_cpu(cpu) { | 694 | for_each_watchdog_cpu(cpu) { |
661 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); | 695 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); |
662 | if (ret) | 696 | if (ret) |
663 | break; | 697 | break; |
664 | } | 698 | } |
665 | if (ret) { | ||
666 | for_each_watchdog_cpu(cpu) | ||
667 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
668 | } | ||
669 | put_online_cpus(); | ||
670 | 699 | ||
671 | return ret; | 700 | return ret; |
672 | } | 701 | } |
673 | 702 | ||
674 | /* | 703 | /* |
675 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' | 704 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' |
705 | * | ||
706 | * This function may only be called in a context that is protected against | ||
707 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
676 | */ | 708 | */ |
677 | static void watchdog_unpark_threads(void) | 709 | static void watchdog_unpark_threads(void) |
678 | { | 710 | { |
679 | int cpu; | 711 | int cpu; |
680 | 712 | ||
681 | get_online_cpus(); | ||
682 | for_each_watchdog_cpu(cpu) | 713 | for_each_watchdog_cpu(cpu) |
683 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | 714 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); |
684 | put_online_cpus(); | ||
685 | } | 715 | } |
686 | 716 | ||
687 | /* | 717 | /* |
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void) | |||
691 | { | 721 | { |
692 | int ret = 0; | 722 | int ret = 0; |
693 | 723 | ||
724 | get_online_cpus(); | ||
694 | mutex_lock(&watchdog_proc_mutex); | 725 | mutex_lock(&watchdog_proc_mutex); |
695 | /* | 726 | /* |
696 | * Multiple suspend requests can be active in parallel (counted by | 727 | * Multiple suspend requests can be active in parallel (counted by |
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void) | |||
704 | 735 | ||
705 | if (ret == 0) | 736 | if (ret == 0) |
706 | watchdog_suspended++; | 737 | watchdog_suspended++; |
738 | else { | ||
739 | watchdog_disable_all_cpus(); | ||
740 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
741 | watchdog_enabled = 0; | ||
742 | } | ||
707 | 743 | ||
708 | mutex_unlock(&watchdog_proc_mutex); | 744 | mutex_unlock(&watchdog_proc_mutex); |
709 | 745 | ||
@@ -726,12 +762,20 @@ void lockup_detector_resume(void) | |||
726 | watchdog_unpark_threads(); | 762 | watchdog_unpark_threads(); |
727 | 763 | ||
728 | mutex_unlock(&watchdog_proc_mutex); | 764 | mutex_unlock(&watchdog_proc_mutex); |
765 | put_online_cpus(); | ||
729 | } | 766 | } |
730 | 767 | ||
731 | static void update_watchdog_all_cpus(void) | 768 | static int update_watchdog_all_cpus(void) |
732 | { | 769 | { |
733 | watchdog_park_threads(); | 770 | int ret; |
771 | |||
772 | ret = watchdog_park_threads(); | ||
773 | if (ret) | ||
774 | return ret; | ||
775 | |||
734 | watchdog_unpark_threads(); | 776 | watchdog_unpark_threads(); |
777 | |||
778 | return 0; | ||
735 | } | 779 | } |
736 | 780 | ||
737 | static int watchdog_enable_all_cpus(void) | 781 | static int watchdog_enable_all_cpus(void) |
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void) | |||
750 | * Enable/disable the lockup detectors or | 794 | * Enable/disable the lockup detectors or |
751 | * change the sample period 'on the fly'. | 795 | * change the sample period 'on the fly'. |
752 | */ | 796 | */ |
753 | update_watchdog_all_cpus(); | 797 | err = update_watchdog_all_cpus(); |
798 | |||
799 | if (err) { | ||
800 | watchdog_disable_all_cpus(); | ||
801 | pr_err("Failed to update lockup detectors, disabled\n"); | ||
802 | } | ||
754 | } | 803 | } |
755 | 804 | ||
805 | if (err) | ||
806 | watchdog_enabled = 0; | ||
807 | |||
756 | return err; | 808 | return err; |
757 | } | 809 | } |
758 | 810 | ||
759 | /* prepare/enable/disable routines */ | ||
760 | /* sysctl functions */ | ||
761 | #ifdef CONFIG_SYSCTL | ||
762 | static void watchdog_disable_all_cpus(void) | 811 | static void watchdog_disable_all_cpus(void) |
763 | { | 812 | { |
764 | if (watchdog_running) { | 813 | if (watchdog_running) { |
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void) | |||
767 | } | 816 | } |
768 | } | 817 | } |
769 | 818 | ||
819 | #ifdef CONFIG_SYSCTL | ||
820 | |||
770 | /* | 821 | /* |
771 | * Update the run state of the lockup detectors. | 822 | * Update the run state of the lockup detectors. |
772 | */ | 823 | */ |
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
808 | int err, old, new; | 859 | int err, old, new; |
809 | int *watchdog_param = (int *)table->data; | 860 | int *watchdog_param = (int *)table->data; |
810 | 861 | ||
862 | get_online_cpus(); | ||
811 | mutex_lock(&watchdog_proc_mutex); | 863 | mutex_lock(&watchdog_proc_mutex); |
812 | 864 | ||
813 | if (watchdog_suspended) { | 865 | if (watchdog_suspended) { |
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
849 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | 901 | } while (cmpxchg(&watchdog_enabled, old, new) != old); |
850 | 902 | ||
851 | /* | 903 | /* |
852 | * Update the run state of the lockup detectors. | 904 | * Update the run state of the lockup detectors. There is _no_ |
853 | * Restore 'watchdog_enabled' on failure. | 905 | * need to check the value returned by proc_watchdog_update() |
906 | * and to restore the previous value of 'watchdog_enabled' as | ||
907 | * both lockup detectors are disabled if proc_watchdog_update() | ||
908 | * returns an error. | ||
854 | */ | 909 | */ |
855 | err = proc_watchdog_update(); | 910 | err = proc_watchdog_update(); |
856 | if (err) | ||
857 | watchdog_enabled = old; | ||
858 | } | 911 | } |
859 | out: | 912 | out: |
860 | mutex_unlock(&watchdog_proc_mutex); | 913 | mutex_unlock(&watchdog_proc_mutex); |
914 | put_online_cpus(); | ||
861 | return err; | 915 | return err; |
862 | } | 916 | } |
863 | 917 | ||
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
899 | { | 953 | { |
900 | int err, old; | 954 | int err, old; |
901 | 955 | ||
956 | get_online_cpus(); | ||
902 | mutex_lock(&watchdog_proc_mutex); | 957 | mutex_lock(&watchdog_proc_mutex); |
903 | 958 | ||
904 | if (watchdog_suspended) { | 959 | if (watchdog_suspended) { |
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
914 | goto out; | 969 | goto out; |
915 | 970 | ||
916 | /* | 971 | /* |
917 | * Update the sample period. | 972 | * Update the sample period. Restore on failure. |
918 | * Restore 'watchdog_thresh' on failure. | ||
919 | */ | 973 | */ |
920 | set_sample_period(); | 974 | set_sample_period(); |
921 | err = proc_watchdog_update(); | 975 | err = proc_watchdog_update(); |
922 | if (err) | 976 | if (err) { |
923 | watchdog_thresh = old; | 977 | watchdog_thresh = old; |
978 | set_sample_period(); | ||
979 | } | ||
924 | out: | 980 | out: |
925 | mutex_unlock(&watchdog_proc_mutex); | 981 | mutex_unlock(&watchdog_proc_mutex); |
982 | put_online_cpus(); | ||
926 | return err; | 983 | return err; |
927 | } | 984 | } |
928 | 985 | ||
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
937 | { | 994 | { |
938 | int err; | 995 | int err; |
939 | 996 | ||
997 | get_online_cpus(); | ||
940 | mutex_lock(&watchdog_proc_mutex); | 998 | mutex_lock(&watchdog_proc_mutex); |
941 | 999 | ||
942 | if (watchdog_suspended) { | 1000 | if (watchdog_suspended) { |
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
964 | } | 1022 | } |
965 | out: | 1023 | out: |
966 | mutex_unlock(&watchdog_proc_mutex); | 1024 | mutex_unlock(&watchdog_proc_mutex); |
1025 | put_online_cpus(); | ||
967 | return err; | 1026 | return err; |
968 | } | 1027 | } |
969 | 1028 | ||
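The net effect on the proc handlers is a common locking shape: CPU hotplug is excluded around the whole update so that parking and unparking the per-CPU watchdog threads cannot race with CPUs coming or going, and a failed update now leaves both detectors disabled instead of half-parked. A condensed sketch of that caller pattern, using the names from the file above:

        /* Condensed sketch of the pattern the proc handlers now follow. */
        static int watchdog_update_pattern(void)
        {
                int err;

                get_online_cpus();                      /* pin the set of online CPUs */
                mutex_lock(&watchdog_proc_mutex);
                err = proc_watchdog_update();           /* park + unpark, or disable on error */
                mutex_unlock(&watchdog_proc_mutex);
                put_online_cpus();
                return err;
        }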
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 39f24d6721e5..0fee5acd5aa0 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
@@ -15,8 +15,7 @@ config KASAN | |||
15 | global variables requires gcc 5.0 or later. | 15 | global variables requires gcc 5.0 or later. |
16 | This feature consumes about 1/8 of available memory and brings about | 16 | This feature consumes about 1/8 of available memory and brings about |
17 | ~x3 performance slowdown. | 17 | ~x3 performance slowdown. |
18 | For better error detection enable CONFIG_STACKTRACE, | 18 | For better error detection enable CONFIG_STACKTRACE. |
19 | and add slub_debug=U to boot cmdline. | ||
20 | 19 | ||
21 | choice | 20 | choice |
22 | prompt "Instrumentation type" | 21 | prompt "Instrumentation type" |
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c1efb1b61017..c32f3b0048dc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
@@ -138,6 +138,71 @@ static noinline void __init kmalloc_oob_16(void) | |||
138 | kfree(ptr2); | 138 | kfree(ptr2); |
139 | } | 139 | } |
140 | 140 | ||
141 | static noinline void __init kmalloc_oob_memset_2(void) | ||
142 | { | ||
143 | char *ptr; | ||
144 | size_t size = 8; | ||
145 | |||
146 | pr_info("out-of-bounds in memset2\n"); | ||
147 | ptr = kmalloc(size, GFP_KERNEL); | ||
148 | if (!ptr) { | ||
149 | pr_err("Allocation failed\n"); | ||
150 | return; | ||
151 | } | ||
152 | |||
153 | memset(ptr+7, 0, 2); | ||
154 | kfree(ptr); | ||
155 | } | ||
156 | |||
157 | static noinline void __init kmalloc_oob_memset_4(void) | ||
158 | { | ||
159 | char *ptr; | ||
160 | size_t size = 8; | ||
161 | |||
162 | pr_info("out-of-bounds in memset4\n"); | ||
163 | ptr = kmalloc(size, GFP_KERNEL); | ||
164 | if (!ptr) { | ||
165 | pr_err("Allocation failed\n"); | ||
166 | return; | ||
167 | } | ||
168 | |||
169 | memset(ptr+5, 0, 4); | ||
170 | kfree(ptr); | ||
171 | } | ||
172 | |||
173 | |||
174 | static noinline void __init kmalloc_oob_memset_8(void) | ||
175 | { | ||
176 | char *ptr; | ||
177 | size_t size = 8; | ||
178 | |||
179 | pr_info("out-of-bounds in memset8\n"); | ||
180 | ptr = kmalloc(size, GFP_KERNEL); | ||
181 | if (!ptr) { | ||
182 | pr_err("Allocation failed\n"); | ||
183 | return; | ||
184 | } | ||
185 | |||
186 | memset(ptr+1, 0, 8); | ||
187 | kfree(ptr); | ||
188 | } | ||
189 | |||
190 | static noinline void __init kmalloc_oob_memset_16(void) | ||
191 | { | ||
192 | char *ptr; | ||
193 | size_t size = 16; | ||
194 | |||
195 | pr_info("out-of-bounds in memset16\n"); | ||
196 | ptr = kmalloc(size, GFP_KERNEL); | ||
197 | if (!ptr) { | ||
198 | pr_err("Allocation failed\n"); | ||
199 | return; | ||
200 | } | ||
201 | |||
202 | memset(ptr+1, 0, 16); | ||
203 | kfree(ptr); | ||
204 | } | ||
205 | |||
141 | static noinline void __init kmalloc_oob_in_memset(void) | 206 | static noinline void __init kmalloc_oob_in_memset(void) |
142 | { | 207 | { |
143 | char *ptr; | 208 | char *ptr; |
@@ -264,6 +329,10 @@ static int __init kmalloc_tests_init(void) | |||
264 | kmalloc_oob_krealloc_less(); | 329 | kmalloc_oob_krealloc_less(); |
265 | kmalloc_oob_16(); | 330 | kmalloc_oob_16(); |
266 | kmalloc_oob_in_memset(); | 331 | kmalloc_oob_in_memset(); |
332 | kmalloc_oob_memset_2(); | ||
333 | kmalloc_oob_memset_4(); | ||
334 | kmalloc_oob_memset_8(); | ||
335 | kmalloc_oob_memset_16(); | ||
267 | kmalloc_uaf(); | 336 | kmalloc_uaf(); |
268 | kmalloc_uaf_memset(); | 337 | kmalloc_uaf_memset(); |
269 | kmalloc_uaf2(); | 338 | kmalloc_uaf2(); |
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index fcad8322ef36..d3116be5a00f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c | |||
@@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage, | |||
199 | struct balloon_dev_info *balloon = balloon_page_device(page); | 199 | struct balloon_dev_info *balloon = balloon_page_device(page); |
200 | int rc = -EAGAIN; | 200 | int rc = -EAGAIN; |
201 | 201 | ||
202 | /* | 202 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
203 | * Block others from accessing the 'newpage' when we get around to | 203 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
204 | * establishing additional references. We should be the only one | ||
205 | * holding a reference to the 'newpage' at this point. | ||
206 | */ | ||
207 | BUG_ON(!trylock_page(newpage)); | ||
208 | 204 | ||
209 | if (WARN_ON(!__is_movable_balloon_page(page))) { | 205 | if (WARN_ON(!__is_movable_balloon_page(page))) { |
210 | dump_page(page, "not movable balloon page"); | 206 | dump_page(page, "not movable balloon page"); |
211 | unlock_page(newpage); | ||
212 | return rc; | 207 | return rc; |
213 | } | 208 | } |
214 | 209 | ||
215 | if (balloon && balloon->migratepage) | 210 | if (balloon && balloon->migratepage) |
216 | rc = balloon->migratepage(balloon, newpage, page, mode); | 211 | rc = balloon->migratepage(balloon, newpage, page, mode); |
217 | 212 | ||
218 | unlock_page(newpage); | ||
219 | return rc; | 213 | return rc; |
220 | } | 214 | } |
221 | #endif /* CONFIG_BALLOON_COMPACTION */ | 215 | #endif /* CONFIG_BALLOON_COMPACTION */ |
diff --git a/mm/cma.c b/mm/cma.c --- a/mm/cma.c +++ b/mm/cma.c | |||
@@ -363,7 +363,9 @@ err: | |||
363 | */ | 363 | */ |
364 | struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) | 364 | struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) |
365 | { | 365 | { |
366 | unsigned long mask, offset, pfn, start = 0; | 366 | unsigned long mask, offset; |
367 | unsigned long pfn = -1; | ||
368 | unsigned long start = 0; | ||
367 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 369 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
368 | struct page *page = NULL; | 370 | struct page *page = NULL; |
369 | int ret; | 371 | int ret; |
@@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) | |||
418 | start = bitmap_no + mask + 1; | 420 | start = bitmap_no + mask + 1; |
419 | } | 421 | } |
420 | 422 | ||
421 | trace_cma_alloc(page ? pfn : -1UL, page, count, align); | 423 | trace_cma_alloc(pfn, page, count, align); |
422 | 424 | ||
423 | pr_debug("%s(): returned %p\n", __func__, page); | 425 | pr_debug("%s(): returned %p\n", __func__, page); |
424 | return page; | 426 | return page; |
diff --git a/mm/compaction.c b/mm/compaction.c index c5c627aae996..de3e1e71cd9f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 37 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
38 | #ifdef CONFIG_TRACEPOINTS | ||
39 | static const char *const compaction_status_string[] = { | ||
40 | "deferred", | ||
41 | "skipped", | ||
42 | "continue", | ||
43 | "partial", | ||
44 | "complete", | ||
45 | "no_suitable_page", | ||
46 | "not_suitable_zone", | ||
47 | }; | ||
48 | #endif | ||
49 | 38 | ||
50 | #define CREATE_TRACE_POINTS | 39 | #define CREATE_TRACE_POINTS |
51 | #include <trace/events/compaction.h> | 40 | #include <trace/events/compaction.h> |
@@ -1197,6 +1186,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1197 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; | 1186 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
1198 | } | 1187 | } |
1199 | 1188 | ||
1189 | /* | ||
1190 | * order == -1 is expected when compacting via | ||
1191 | * /proc/sys/vm/compact_memory | ||
1192 | */ | ||
1193 | static inline bool is_via_compact_memory(int order) | ||
1194 | { | ||
1195 | return order == -1; | ||
1196 | } | ||
1197 | |||
1200 | static int __compact_finished(struct zone *zone, struct compact_control *cc, | 1198 | static int __compact_finished(struct zone *zone, struct compact_control *cc, |
1201 | const int migratetype) | 1199 | const int migratetype) |
1202 | { | 1200 | { |
@@ -1204,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, | |||
1204 | unsigned long watermark; | 1202 | unsigned long watermark; |
1205 | 1203 | ||
1206 | if (cc->contended || fatal_signal_pending(current)) | 1204 | if (cc->contended || fatal_signal_pending(current)) |
1207 | return COMPACT_PARTIAL; | 1205 | return COMPACT_CONTENDED; |
1208 | 1206 | ||
1209 | /* Compaction run completes if the migrate and free scanner meet */ | 1207 | /* Compaction run completes if the migrate and free scanner meet */ |
1210 | if (compact_scanners_met(cc)) { | 1208 | if (compact_scanners_met(cc)) { |
@@ -1223,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, | |||
1223 | return COMPACT_COMPLETE; | 1221 | return COMPACT_COMPLETE; |
1224 | } | 1222 | } |
1225 | 1223 | ||
1226 | /* | 1224 | if (is_via_compact_memory(cc->order)) |
1227 | * order == -1 is expected when compacting via | ||
1228 | * /proc/sys/vm/compact_memory | ||
1229 | */ | ||
1230 | if (cc->order == -1) | ||
1231 | return COMPACT_CONTINUE; | 1225 | return COMPACT_CONTINUE; |
1232 | 1226 | ||
1233 | /* Compaction run is not finished if the watermark is not met */ | 1227 | /* Compaction run is not finished if the watermark is not met */ |
@@ -1290,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, | |||
1290 | int fragindex; | 1284 | int fragindex; |
1291 | unsigned long watermark; | 1285 | unsigned long watermark; |
1292 | 1286 | ||
1293 | /* | 1287 | if (is_via_compact_memory(order)) |
1294 | * order == -1 is expected when compacting via | ||
1295 | * /proc/sys/vm/compact_memory | ||
1296 | */ | ||
1297 | if (order == -1) | ||
1298 | return COMPACT_CONTINUE; | 1288 | return COMPACT_CONTINUE; |
1299 | 1289 | ||
1300 | watermark = low_wmark_pages(zone); | 1290 | watermark = low_wmark_pages(zone); |
@@ -1403,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1403 | 1393 | ||
1404 | switch (isolate_migratepages(zone, cc)) { | 1394 | switch (isolate_migratepages(zone, cc)) { |
1405 | case ISOLATE_ABORT: | 1395 | case ISOLATE_ABORT: |
1406 | ret = COMPACT_PARTIAL; | 1396 | ret = COMPACT_CONTENDED; |
1407 | putback_movable_pages(&cc->migratepages); | 1397 | putback_movable_pages(&cc->migratepages); |
1408 | cc->nr_migratepages = 0; | 1398 | cc->nr_migratepages = 0; |
1409 | goto out; | 1399 | goto out; |
@@ -1434,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1434 | * and we want compact_finished() to detect it | 1424 | * and we want compact_finished() to detect it |
1435 | */ | 1425 | */ |
1436 | if (err == -ENOMEM && !compact_scanners_met(cc)) { | 1426 | if (err == -ENOMEM && !compact_scanners_met(cc)) { |
1437 | ret = COMPACT_PARTIAL; | 1427 | ret = COMPACT_CONTENDED; |
1438 | goto out; | 1428 | goto out; |
1439 | } | 1429 | } |
1440 | } | 1430 | } |
@@ -1487,6 +1477,9 @@ out: | |||
1487 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, | 1477 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
1488 | cc->free_pfn, end_pfn, sync, ret); | 1478 | cc->free_pfn, end_pfn, sync, ret); |
1489 | 1479 | ||
1480 | if (ret == COMPACT_CONTENDED) | ||
1481 | ret = COMPACT_PARTIAL; | ||
1482 | |||
1490 | return ret; | 1483 | return ret; |
1491 | } | 1484 | } |
1492 | 1485 | ||
@@ -1658,10 +1651,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1658 | * this makes sure we compact the whole zone regardless of | 1651 | * this makes sure we compact the whole zone regardless of |
1659 | * cached scanner positions. | 1652 | * cached scanner positions. |
1660 | */ | 1653 | */ |
1661 | if (cc->order == -1) | 1654 | if (is_via_compact_memory(cc->order)) |
1662 | __reset_isolation_suitable(zone); | 1655 | __reset_isolation_suitable(zone); |
1663 | 1656 | ||
1664 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | 1657 | if (is_via_compact_memory(cc->order) || |
1658 | !compaction_deferred(zone, cc->order)) | ||
1665 | compact_zone(zone, cc); | 1659 | compact_zone(zone, cc); |
1666 | 1660 | ||
1667 | if (cc->order > 0) { | 1661 | if (cc->order > 0) { |
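Where the special order comes from: writing to /proc/sys/vm/compact_memory builds a compact_control with order == -1, which is exactly the condition is_via_compact_memory() now names. A sketch of such a caller; the field set is illustrative, see compact_node()/sysctl_compaction_handler() in this file for the real thing.

        static void compact_whole_node_sketch(int nid)
        {
                struct compact_control cc = {
                        .order = -1,            /* is_via_compact_memory() == true */
                        .mode = MIGRATE_SYNC,
                        .ignore_skip_hint = true,
                };

                __compact_pgdat(NODE_DATA(nid), &cc);
        }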
diff --git a/mm/debug.c b/mm/debug.c index 6c1b3ea61bfd..e784110fb51d 100644 --- a/mm/debug.c +++ b/mm/debug.c | |||
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = { | |||
125 | {VM_GROWSDOWN, "growsdown" }, | 125 | {VM_GROWSDOWN, "growsdown" }, |
126 | {VM_PFNMAP, "pfnmap" }, | 126 | {VM_PFNMAP, "pfnmap" }, |
127 | {VM_DENYWRITE, "denywrite" }, | 127 | {VM_DENYWRITE, "denywrite" }, |
128 | {VM_LOCKONFAULT, "lockonfault" }, | ||
128 | {VM_LOCKED, "locked" }, | 129 | {VM_LOCKED, "locked" }, |
129 | {VM_IO, "io" }, | 130 | {VM_IO, "io" }, |
130 | {VM_SEQ_READ, "seqread" }, | 131 | {VM_SEQ_READ, "seqread" }, |
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 17ae14b5aefa..6d5717bd7197 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c | |||
@@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) | |||
126 | /* | 126 | /* |
127 | * Mappings have to be page-aligned | 127 | * Mappings have to be page-aligned |
128 | */ | 128 | */ |
129 | offset = phys_addr & ~PAGE_MASK; | 129 | offset = offset_in_page(phys_addr); |
130 | phys_addr &= PAGE_MASK; | 130 | phys_addr &= PAGE_MASK; |
131 | size = PAGE_ALIGN(last_addr + 1) - phys_addr; | 131 | size = PAGE_ALIGN(last_addr + 1) - phys_addr; |
132 | 132 | ||
@@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) | |||
189 | if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) | 189 | if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) |
190 | return; | 190 | return; |
191 | 191 | ||
192 | offset = virt_addr & ~PAGE_MASK; | 192 | offset = offset_in_page(virt_addr); |
193 | nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; | 193 | nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; |
194 | 194 | ||
195 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; | 195 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
@@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) | |||
234 | char *p; | 234 | char *p; |
235 | 235 | ||
236 | while (size) { | 236 | while (size) { |
237 | slop = src & ~PAGE_MASK; | 237 | slop = offset_in_page(src); |
238 | clen = size; | 238 | clen = size; |
239 | if (clen > MAX_MAP_CHUNK - slop) | 239 | if (clen > MAX_MAP_CHUNK - slop) |
240 | clen = MAX_MAP_CHUNK - slop; | 240 | clen = MAX_MAP_CHUNK - slop; |
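offset_in_page() is just a named form of the open-coded mask, which is what makes this a pure readability change. A standalone illustration of the equivalence, with PAGE_SIZE hard-coded to 4K for the demo:

        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define PAGE_SIZE  (1UL << PAGE_SHIFT)
        #define PAGE_MASK  (~(PAGE_SIZE - 1))
        /* Same shape as the kernel helper: the byte offset within a page. */
        #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

        int main(void)
        {
                unsigned long phys = 0x12345678;

                /* prints 0x678: identical to the open-coded "addr & ~PAGE_MASK" */
                printf("offset_in_page(0x%lx) = 0x%lx\n", phys, offset_in_page(phys));
                return 0;
        }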
diff --git a/mm/filemap.c b/mm/filemap.c index 327910c2400c..58e04e26f996 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping) | |||
331 | } | 331 | } |
332 | EXPORT_SYMBOL(filemap_flush); | 332 | EXPORT_SYMBOL(filemap_flush); |
333 | 333 | ||
334 | /** | 334 | static int __filemap_fdatawait_range(struct address_space *mapping, |
335 | * filemap_fdatawait_range - wait for writeback to complete | 335 | loff_t start_byte, loff_t end_byte) |
336 | * @mapping: address space structure to wait for | ||
337 | * @start_byte: offset in bytes where the range starts | ||
338 | * @end_byte: offset in bytes where the range ends (inclusive) | ||
339 | * | ||
340 | * Walk the list of under-writeback pages of the given address space | ||
341 | * in the given range and wait for all of them. | ||
342 | */ | ||
343 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | ||
344 | loff_t end_byte) | ||
345 | { | 336 | { |
346 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; | 337 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; |
347 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; | 338 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; |
348 | struct pagevec pvec; | 339 | struct pagevec pvec; |
349 | int nr_pages; | 340 | int nr_pages; |
350 | int ret2, ret = 0; | 341 | int ret = 0; |
351 | 342 | ||
352 | if (end_byte < start_byte) | 343 | if (end_byte < start_byte) |
353 | goto out; | 344 | goto out; |
@@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | |||
374 | cond_resched(); | 365 | cond_resched(); |
375 | } | 366 | } |
376 | out: | 367 | out: |
368 | return ret; | ||
369 | } | ||
370 | |||
371 | /** | ||
372 | * filemap_fdatawait_range - wait for writeback to complete | ||
373 | * @mapping: address space structure to wait for | ||
374 | * @start_byte: offset in bytes where the range starts | ||
375 | * @end_byte: offset in bytes where the range ends (inclusive) | ||
376 | * | ||
377 | * Walk the list of under-writeback pages of the given address space | ||
378 | * in the given range and wait for all of them. Check error status of | ||
379 | * the address space and return it. | ||
380 | * | ||
381 | * Since the error status of the address space is cleared by this function, | ||
382 | * callers are responsible for checking the return value and handling and/or | ||
383 | * reporting the error. | ||
384 | */ | ||
385 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | ||
386 | loff_t end_byte) | ||
387 | { | ||
388 | int ret, ret2; | ||
389 | |||
390 | ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); | ||
377 | ret2 = filemap_check_errors(mapping); | 391 | ret2 = filemap_check_errors(mapping); |
378 | if (!ret) | 392 | if (!ret) |
379 | ret = ret2; | 393 | ret = ret2; |
@@ -383,11 +397,38 @@ out: | |||
383 | EXPORT_SYMBOL(filemap_fdatawait_range); | 397 | EXPORT_SYMBOL(filemap_fdatawait_range); |
384 | 398 | ||
385 | /** | 399 | /** |
400 | * filemap_fdatawait_keep_errors - wait for writeback without clearing errors | ||
401 | * @mapping: address space structure to wait for | ||
402 | * | ||
403 | * Walk the list of under-writeback pages of the given address space | ||
404 | * and wait for all of them. Unlike filemap_fdatawait(), this function | ||
405 | * does not clear error status of the address space. | ||
406 | * | ||
407 | * Use this function if callers don't handle errors themselves. Expected | ||
408 | * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), | ||
409 | * fsfreeze(8) | ||
410 | */ | ||
411 | void filemap_fdatawait_keep_errors(struct address_space *mapping) | ||
412 | { | ||
413 | loff_t i_size = i_size_read(mapping->host); | ||
414 | |||
415 | if (i_size == 0) | ||
416 | return; | ||
417 | |||
418 | __filemap_fdatawait_range(mapping, 0, i_size - 1); | ||
419 | } | ||
420 | |||
421 | /** | ||
386 | * filemap_fdatawait - wait for all under-writeback pages to complete | 422 | * filemap_fdatawait - wait for all under-writeback pages to complete |
387 | * @mapping: address space structure to wait for | 423 | * @mapping: address space structure to wait for |
388 | * | 424 | * |
389 | * Walk the list of under-writeback pages of the given address space | 425 | * Walk the list of under-writeback pages of the given address space |
390 | * and wait for all of them. | 426 | * and wait for all of them. Check error status of the address space |
427 | * and return it. | ||
428 | * | ||
429 | * Since the error status of the address space is cleared by this function, | ||
430 | * callers are responsible for checking the return value and handling and/or | ||
431 | * reporting the error. | ||
391 | */ | 432 | */ |
392 | int filemap_fdatawait(struct address_space *mapping) | 433 | int filemap_fdatawait(struct address_space *mapping) |
393 | { | 434 | { |
@@ -510,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
510 | __inc_zone_page_state(new, NR_SHMEM); | 551 | __inc_zone_page_state(new, NR_SHMEM); |
511 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 552 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
512 | mem_cgroup_end_page_stat(memcg); | 553 | mem_cgroup_end_page_stat(memcg); |
513 | mem_cgroup_migrate(old, new, true); | 554 | mem_cgroup_replace_page(old, new); |
514 | radix_tree_preload_end(); | 555 | radix_tree_preload_end(); |
515 | if (freepage) | 556 | if (freepage) |
516 | freepage(old); | 557 | freepage(old); |
@@ -1807,7 +1848,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, | |||
1807 | struct file *file, | 1848 | struct file *file, |
1808 | pgoff_t offset) | 1849 | pgoff_t offset) |
1809 | { | 1850 | { |
1810 | unsigned long ra_pages; | ||
1811 | struct address_space *mapping = file->f_mapping; | 1851 | struct address_space *mapping = file->f_mapping; |
1812 | 1852 | ||
1813 | /* If we don't want any read-ahead, don't bother */ | 1853 | /* If we don't want any read-ahead, don't bother */ |
@@ -1836,10 +1876,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, | |||
1836 | /* | 1876 | /* |
1837 | * mmap read-around | 1877 | * mmap read-around |
1838 | */ | 1878 | */ |
1839 | ra_pages = max_sane_readahead(ra->ra_pages); | 1879 | ra->start = max_t(long, 0, offset - ra->ra_pages / 2); |
1840 | ra->start = max_t(long, 0, offset - ra_pages / 2); | 1880 | ra->size = ra->ra_pages; |
1841 | ra->size = ra_pages; | 1881 | ra->async_size = ra->ra_pages / 4; |
1842 | ra->async_size = ra_pages / 4; | ||
1843 | ra_submit(ra, mapping, file); | 1882 | ra_submit(ra, mapping, file); |
1844 | } | 1883 | } |
1845 | 1884 | ||
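The split gives two flavours of waiting: data-integrity callers keep using filemap_fdatawait()/filemap_fdatawait_range() and must consume the returned error, while system-wide flushers such as sync(2) or fsfreeze(8) use the new keep-errors variant so they do not clear an error bit that a later fsync() is supposed to report. A hedged kernel-context sketch of a call site making that choice (the helper name is invented for illustration):

        /* Illustrative only: choosing the right wait primitive for the caller. */
        static int wait_on_mapping(struct address_space *mapping, bool data_integrity)
        {
                if (!data_integrity) {
                        /* sync(2)-style flusher: don't steal the error from fsync(). */
                        filemap_fdatawait_keep_errors(mapping);
                        return 0;
                }
                /* fsync()-style caller: the error is ours to report. */
                return filemap_fdatawait(mapping);
        }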
diff --git a/mm/frame_vector.c b/mm/frame_vector.c index cdabcb93c6a6..7cf2b7163222 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | 9 | ||
10 | /* | 10 | /** |
11 | * get_vaddr_frames() - map virtual addresses to pfns | 11 | * get_vaddr_frames() - map virtual addresses to pfns |
12 | * @start: starting user address | 12 | * @start: starting user address |
13 | * @nr_frames: number of pages / pfns from start to map | 13 | * @nr_frames: number of pages / pfns from start to map |
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -129,7 +129,7 @@ retry: | |||
129 | */ | 129 | */ |
130 | mark_page_accessed(page); | 130 | mark_page_accessed(page); |
131 | } | 131 | } |
132 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { | 132 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
133 | /* | 133 | /* |
134 | * The preliminary mapping check is mainly to avoid the | 134 | * The preliminary mapping check is mainly to avoid the |
135 | * pointless overhead of lock_page on the ZERO_PAGE | 135 | * pointless overhead of lock_page on the ZERO_PAGE |
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
299 | unsigned int fault_flags = 0; | 299 | unsigned int fault_flags = 0; |
300 | int ret; | 300 | int ret; |
301 | 301 | ||
302 | /* mlock all present pages, but do not fault in new pages */ | ||
303 | if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) | ||
304 | return -ENOENT; | ||
302 | /* For mm_populate(), just skip the stack guard page. */ | 305 | /* For mm_populate(), just skip the stack guard page. */ |
303 | if ((*flags & FOLL_POPULATE) && | 306 | if ((*flags & FOLL_POPULATE) && |
304 | (stack_guard_page_start(vma, address) || | 307 | (stack_guard_page_start(vma, address) || |
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, | |||
890 | VM_BUG_ON_VMA(end > vma->vm_end, vma); | 893 | VM_BUG_ON_VMA(end > vma->vm_end, vma); |
891 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | 894 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); |
892 | 895 | ||
893 | gup_flags = FOLL_TOUCH | FOLL_POPULATE; | 896 | gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; |
897 | if (vma->vm_flags & VM_LOCKONFAULT) | ||
898 | gup_flags &= ~FOLL_POPULATE; | ||
899 | |||
894 | /* | 900 | /* |
895 | * We want to touch writable mappings with a write fault in order | 901 | * We want to touch writable mappings with a write fault in order |
896 | * to break COW, except for shared mappings because these don't COW | 902 | * to break COW, except for shared mappings because these don't COW |
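Illustration (not part of the patch): a small standalone model of how the new FOLL_MLOCK / FOLL_POPULATE combination behaves. Only the decision logic mirrors the hunks above; the flag values themselves are placeholders. populate_vma_page_range() drops FOLL_POPULATE for VM_LOCKONFAULT VMAs, and faultin_page() bails out with -ENOENT when asked to mlock without populating, so only pages that are already present get mlocked.

#include <stdio.h>
#include <errno.h>

/* Placeholder flag values, for illustration only. */
#define FOLL_TOUCH	0x01
#define FOLL_POPULATE	0x02
#define FOLL_MLOCK	0x04

#define VM_LOCKED	0x01
#define VM_LOCKONFAULT	0x02

/* Mirrors the gup_flags setup in populate_vma_page_range(). */
static unsigned int populate_gup_flags(unsigned long vm_flags)
{
	unsigned int gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;

	if (vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
	return gup_flags;
}

/* Mirrors the early return added to faultin_page(): mlock present
 * pages, but do not fault in new ones. */
static int faultin_page_model(unsigned int flags)
{
	if ((flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	return 0;	/* would go on to fault the page in */
}

int main(void)
{
	unsigned int plain = populate_gup_flags(VM_LOCKED);
	unsigned int onfault = populate_gup_flags(VM_LOCKED | VM_LOCKONFAULT);

	printf("VM_LOCKED:              faultin -> %d\n", faultin_page_model(plain));
	printf("VM_LOCKED|LOCKONFAULT:  faultin -> %d\n", faultin_page_model(onfault));
	return 0;
}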
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3fd0311c3ba7..f5c08b46fef8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1307 | pmd, _pmd, 1)) | 1307 | pmd, _pmd, 1)) |
1308 | update_mmu_cache_pmd(vma, addr, pmd); | 1308 | update_mmu_cache_pmd(vma, addr, pmd); |
1309 | } | 1309 | } |
1310 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { | 1310 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
1311 | if (page->mapping && trylock_page(page)) { | 1311 | if (page->mapping && trylock_page(page)) { |
1312 | lru_add_drain(); | 1312 | lru_add_drain(); |
1313 | if (page->mapping) | 1313 | if (page->mapping) |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cc773483624..74ef0c6a25dd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1437,7 +1437,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
1437 | dissolve_free_huge_page(pfn_to_page(pfn)); | 1437 | dissolve_free_huge_page(pfn_to_page(pfn)); |
1438 | } | 1438 | } |
1439 | 1439 | ||
1440 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | 1440 | /* |
1441 | * There are 3 ways this can get called: | ||
1442 | * 1. With vma+addr: we use the VMA's memory policy | ||
1443 | * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge | ||
1444 | * page from any node, and let the buddy allocator itself figure | ||
1445 | * it out. | ||
1446 | * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page | ||
1447 | * strictly from 'nid' | ||
1448 | */ | ||
1449 | static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, | ||
1450 | struct vm_area_struct *vma, unsigned long addr, int nid) | ||
1451 | { | ||
1452 | int order = huge_page_order(h); | ||
1453 | gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; | ||
1454 | unsigned int cpuset_mems_cookie; | ||
1455 | |||
1456 | /* | ||
1457 | * We need a VMA to get a memory policy. If we do not | ||
1458 | * have one, we use the 'nid' argument. | ||
1459 | * | ||
1460 | * The mempolicy stuff below has some non-inlined bits | ||
1461 | * and calls ->vm_ops. That makes it hard to optimize at | ||
1462 | * compile-time, even when NUMA is off and it does | ||
1463 | * nothing. This helps the compiler optimize it out. | ||
1464 | */ | ||
1465 | if (!IS_ENABLED(CONFIG_NUMA) || !vma) { | ||
1466 | /* | ||
1467 | * If a specific node is requested, make sure to | ||
1468 | * get memory from there, but only when a node | ||
1469 | * is explicitly specified. | ||
1470 | */ | ||
1471 | if (nid != NUMA_NO_NODE) | ||
1472 | gfp |= __GFP_THISNODE; | ||
1473 | /* | ||
1474 | * Make sure to call something that can handle | ||
1475 | * nid=NUMA_NO_NODE | ||
1476 | */ | ||
1477 | return alloc_pages_node(nid, gfp, order); | ||
1478 | } | ||
1479 | |||
1480 | /* | ||
1481 | * OK, so we have a VMA. Fetch the mempolicy and try to | ||
1482 | * allocate a huge page with it. We will only reach this | ||
1483 | * when CONFIG_NUMA=y. | ||
1484 | */ | ||
1485 | do { | ||
1486 | struct page *page; | ||
1487 | struct mempolicy *mpol; | ||
1488 | struct zonelist *zl; | ||
1489 | nodemask_t *nodemask; | ||
1490 | |||
1491 | cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1492 | zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); | ||
1493 | mpol_cond_put(mpol); | ||
1494 | page = __alloc_pages_nodemask(gfp, order, zl, nodemask); | ||
1495 | if (page) | ||
1496 | return page; | ||
1497 | } while (read_mems_allowed_retry(cpuset_mems_cookie)); | ||
1498 | |||
1499 | return NULL; | ||
1500 | } | ||
1501 | |||
1502 | /* | ||
1503 | * There are two ways to allocate a huge page: | ||
1504 | * 1. When you have a VMA and an address (like a fault) | ||
1505 | * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) | ||
1506 | * | ||
1507 | * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in | ||
1508 | * this case which signifies that the allocation should be done with | ||
1509 | * respect for the VMA's memory policy. | ||
1510 | * | ||
1511 | * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This | ||
1512 | * implies that memory policies will not be taken in to account. | ||
1513 | */ | ||
1514 | static struct page *__alloc_buddy_huge_page(struct hstate *h, | ||
1515 | struct vm_area_struct *vma, unsigned long addr, int nid) | ||
1441 | { | 1516 | { |
1442 | struct page *page; | 1517 | struct page *page; |
1443 | unsigned int r_nid; | 1518 | unsigned int r_nid; |
@@ -1446,6 +1521,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
1446 | return NULL; | 1521 | return NULL; |
1447 | 1522 | ||
1448 | /* | 1523 | /* |
1524 | * Make sure that anyone specifying 'nid' is not also specifying a VMA. | ||
1525 | * This makes sure the caller is picking _one_ of the modes with which | ||
1526 | * we can call this function, not both. | ||
1527 | */ | ||
1528 | if (vma || (addr != -1)) { | ||
1529 | VM_WARN_ON_ONCE(addr == -1); | ||
1530 | VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); | ||
1531 | } | ||
1532 | /* | ||
1449 | * Assume we will successfully allocate the surplus page to | 1533 | * Assume we will successfully allocate the surplus page to |
1450 | * prevent racing processes from causing the surplus to exceed | 1534 | * prevent racing processes from causing the surplus to exceed |
1451 | * overcommit | 1535 | * overcommit |
@@ -1478,14 +1562,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
1478 | } | 1562 | } |
1479 | spin_unlock(&hugetlb_lock); | 1563 | spin_unlock(&hugetlb_lock); |
1480 | 1564 | ||
1481 | if (nid == NUMA_NO_NODE) | 1565 | page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); |
1482 | page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| | ||
1483 | __GFP_REPEAT|__GFP_NOWARN, | ||
1484 | huge_page_order(h)); | ||
1485 | else | ||
1486 | page = __alloc_pages_node(nid, | ||
1487 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | ||
1488 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | ||
1489 | 1566 | ||
1490 | spin_lock(&hugetlb_lock); | 1567 | spin_lock(&hugetlb_lock); |
1491 | if (page) { | 1568 | if (page) { |
@@ -1510,6 +1587,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
1510 | } | 1587 | } |
1511 | 1588 | ||
1512 | /* | 1589 | /* |
1590 | * Allocate a huge page from 'nid'. Note, 'nid' may be | ||
1591 | * NUMA_NO_NODE, which means that it may be allocated | ||
1592 | * anywhere. | ||
1593 | */ | ||
1594 | static | ||
1595 | struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) | ||
1596 | { | ||
1597 | unsigned long addr = -1; | ||
1598 | |||
1599 | return __alloc_buddy_huge_page(h, NULL, addr, nid); | ||
1600 | } | ||
1601 | |||
1602 | /* | ||
1603 | * Use the VMA's mpolicy to allocate a huge page from the buddy. | ||
1604 | */ | ||
1605 | static | ||
1606 | struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, | ||
1607 | struct vm_area_struct *vma, unsigned long addr) | ||
1608 | { | ||
1609 | return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); | ||
1610 | } | ||
1611 | |||
1612 | /* | ||
1513 | * This allocation function is useful in the context where vma is irrelevant. | 1613 | * This allocation function is useful in the context where vma is irrelevant. |
1514 | * E.g. soft-offlining uses this function because it only cares physical | 1614 | * E.g. soft-offlining uses this function because it only cares physical |
1515 | * address of error page. | 1615 | * address of error page. |
@@ -1524,7 +1624,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) | |||
1524 | spin_unlock(&hugetlb_lock); | 1624 | spin_unlock(&hugetlb_lock); |
1525 | 1625 | ||
1526 | if (!page) | 1626 | if (!page) |
1527 | page = alloc_buddy_huge_page(h, nid); | 1627 | page = __alloc_buddy_huge_page_no_mpol(h, nid); |
1528 | 1628 | ||
1529 | return page; | 1629 | return page; |
1530 | } | 1630 | } |
@@ -1554,7 +1654,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
1554 | retry: | 1654 | retry: |
1555 | spin_unlock(&hugetlb_lock); | 1655 | spin_unlock(&hugetlb_lock); |
1556 | for (i = 0; i < needed; i++) { | 1656 | for (i = 0; i < needed; i++) { |
1557 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1657 | page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); |
1558 | if (!page) { | 1658 | if (!page) { |
1559 | alloc_ok = false; | 1659 | alloc_ok = false; |
1560 | break; | 1660 | break; |
@@ -1787,7 +1887,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1787 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); | 1887 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); |
1788 | if (!page) { | 1888 | if (!page) { |
1789 | spin_unlock(&hugetlb_lock); | 1889 | spin_unlock(&hugetlb_lock); |
1790 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1890 | page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); |
1791 | if (!page) | 1891 | if (!page) |
1792 | goto out_uncharge_cgroup; | 1892 | goto out_uncharge_cgroup; |
1793 | 1893 | ||
@@ -2376,7 +2476,7 @@ struct node_hstate { | |||
2376 | struct kobject *hugepages_kobj; | 2476 | struct kobject *hugepages_kobj; |
2377 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 2477 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
2378 | }; | 2478 | }; |
2379 | struct node_hstate node_hstates[MAX_NUMNODES]; | 2479 | static struct node_hstate node_hstates[MAX_NUMNODES]; |
2380 | 2480 | ||
2381 | /* | 2481 | /* |
2382 | * A subset of global hstate attributes for node devices | 2482 | * A subset of global hstate attributes for node devices |
@@ -2790,6 +2890,12 @@ void hugetlb_show_meminfo(void) | |||
2790 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); | 2890 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); |
2791 | } | 2891 | } |
2792 | 2892 | ||
2893 | void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) | ||
2894 | { | ||
2895 | seq_printf(m, "HugetlbPages:\t%8lu kB\n", | ||
2896 | atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); | ||
2897 | } | ||
2898 | |||
2793 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ | 2899 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ |
2794 | unsigned long hugetlb_total_pages(void) | 2900 | unsigned long hugetlb_total_pages(void) |
2795 | { | 2901 | { |
@@ -3025,6 +3131,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3025 | get_page(ptepage); | 3131 | get_page(ptepage); |
3026 | page_dup_rmap(ptepage); | 3132 | page_dup_rmap(ptepage); |
3027 | set_huge_pte_at(dst, addr, dst_pte, entry); | 3133 | set_huge_pte_at(dst, addr, dst_pte, entry); |
3134 | hugetlb_count_add(pages_per_huge_page(h), dst); | ||
3028 | } | 3135 | } |
3029 | spin_unlock(src_ptl); | 3136 | spin_unlock(src_ptl); |
3030 | spin_unlock(dst_ptl); | 3137 | spin_unlock(dst_ptl); |
@@ -3105,6 +3212,7 @@ again: | |||
3105 | if (huge_pte_dirty(pte)) | 3212 | if (huge_pte_dirty(pte)) |
3106 | set_page_dirty(page); | 3213 | set_page_dirty(page); |
3107 | 3214 | ||
3215 | hugetlb_count_sub(pages_per_huge_page(h), mm); | ||
3108 | page_remove_rmap(page); | 3216 | page_remove_rmap(page); |
3109 | force_flush = !__tlb_remove_page(tlb, page); | 3217 | force_flush = !__tlb_remove_page(tlb, page); |
3110 | if (force_flush) { | 3218 | if (force_flush) { |
@@ -3509,6 +3617,7 @@ retry: | |||
3509 | && (vma->vm_flags & VM_SHARED))); | 3617 | && (vma->vm_flags & VM_SHARED))); |
3510 | set_huge_pte_at(mm, address, ptep, new_pte); | 3618 | set_huge_pte_at(mm, address, ptep, new_pte); |
3511 | 3619 | ||
3620 | hugetlb_count_add(pages_per_huge_page(h), mm); | ||
3512 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | 3621 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
3513 | /* Optimization, do the COW without a second fault */ | 3622 | /* Optimization, do the COW without a second fault */ |
3514 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); | 3623 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); |
@@ -4028,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, | |||
4028 | unsigned long s_end = sbase + PUD_SIZE; | 4137 | unsigned long s_end = sbase + PUD_SIZE; |
4029 | 4138 | ||
4030 | /* Allow segments to share if only one is marked locked */ | 4139 | /* Allow segments to share if only one is marked locked */ |
4031 | unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; | 4140 | unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; |
4032 | unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; | 4141 | unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; |
4033 | 4142 | ||
4034 | /* | 4143 | /* |
4035 | * match the virtual addresses, permission and the alignment of the | 4144 | * match the virtual addresses, permission and the alignment of the |
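Illustration (not part of the patch): the refactored hugetlb allocator above has two entry modes, either a (vma, addr) pair with the VMA's mempolicy and nid == NUMA_NO_NODE, or no VMA with nid selecting the node (where NUMA_NO_NODE means "any node" and leaves __GFP_THISNODE clear). A minimal userspace sketch of that calling convention and the gfp decision; the gfp bit values and the vma type here are invented placeholders.

#include <assert.h>
#include <stdio.h>

#define NUMA_NO_NODE	(-1)

/* Placeholder gfp bits, for illustration only. */
#define GFP_BASE	0x1
#define __GFP_THISNODE	0x2

struct vma { int unused; };	/* stand-in for struct vm_area_struct */

/*
 * Models the convention enforced by __alloc_buddy_huge_page(): either a
 * (vma, addr) pair is given and nid must be NUMA_NO_NODE, or no VMA is
 * given and nid selects the node (NUMA_NO_NODE meaning "any").
 */
static unsigned int pick_gfp(const struct vma *vma, long addr, int nid)
{
	unsigned int gfp = GFP_BASE;

	if (vma || addr != -1) {
		assert(addr != -1);
		assert(nid == NUMA_NO_NODE);
		/* would follow the VMA's mempolicy here */
		return gfp;
	}
	if (nid != NUMA_NO_NODE)
		gfp |= __GFP_THISNODE;	/* pin the allocation to 'nid' */
	return gfp;
}

int main(void)
{
	struct vma v = { 0 };

	printf("vma path            -> gfp=%#x\n", pick_gfp(&v, 0x1000, NUMA_NO_NODE));
	printf("no vma, nid=2       -> gfp=%#x\n", pick_gfp(NULL, -1, 2));
	printf("no vma, any node    -> gfp=%#x\n", pick_gfp(NULL, -1, NUMA_NO_NODE));
	return 0;
}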
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 6e0057439a46..33d59abe91f1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -186,7 +186,8 @@ again: | |||
186 | } | 186 | } |
187 | rcu_read_unlock(); | 187 | rcu_read_unlock(); |
188 | 188 | ||
189 | ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); | 189 | if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) |
190 | ret = -ENOMEM; | ||
190 | css_put(&h_cg->css); | 191 | css_put(&h_cg->css); |
191 | done: | 192 | done: |
192 | *ptr = h_cg; | 193 | *ptr = h_cg; |
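Illustration (not part of the patch): the hunk above reflects page_counter_try_charge() switching to a bool return, true on success, so the caller maps failure to -ENOMEM itself; the memcontrol.c hunks further down assume the same convention. A toy userspace model of that caller pattern, with an invented counter structure.

#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

/* Toy stand-in for struct page_counter, for illustration only. */
struct page_counter {
	unsigned long usage;
	unsigned long limit;
};

/* Models the new convention: true on success, false when the charge
 * would exceed the limit (no error code returned any more). */
static bool page_counter_try_charge(struct page_counter *c,
				    unsigned long nr_pages,
				    struct page_counter **fail)
{
	if (c->usage + nr_pages > c->limit) {
		*fail = c;
		return false;
	}
	c->usage += nr_pages;
	return true;
}

int main(void)
{
	struct page_counter hugepage = { .usage = 0, .limit = 4 };
	struct page_counter *counter;
	int ret = 0;

	/* Caller side, as in the hugetlb_cgroup charge path above. */
	if (!page_counter_try_charge(&hugepage, 8, &counter))
		ret = -ENOMEM;
	printf("ret=%d usage=%lu\n", ret, hugepage.usage);
	return 0;
}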
diff --git a/mm/internal.h b/mm/internal.h index bc0fa9a69e46..d4b807d6c963 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page); | |||
271 | extern void clear_page_mlock(struct page *page); | 271 | extern void clear_page_mlock(struct page *page); |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * mlock_migrate_page - called only from migrate_page_copy() to | 274 | * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() |
275 | * migrate the Mlocked page flag; update statistics. | 275 | * (because that does not go through the full procedure of migration ptes): |
276 | * to migrate the Mlocked page flag; update statistics. | ||
276 | */ | 277 | */ |
277 | static inline void mlock_migrate_page(struct page *newpage, struct page *page) | 278 | static inline void mlock_migrate_page(struct page *newpage, struct page *page) |
278 | { | 279 | { |
279 | if (TestClearPageMlocked(page)) { | 280 | if (TestClearPageMlocked(page)) { |
280 | unsigned long flags; | ||
281 | int nr_pages = hpage_nr_pages(page); | 281 | int nr_pages = hpage_nr_pages(page); |
282 | 282 | ||
283 | local_irq_save(flags); | 283 | /* Holding pmd lock, no change in irq context: __mod is safe */ |
284 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); | 284 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
285 | SetPageMlocked(newpage); | 285 | SetPageMlocked(newpage); |
286 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); | 286 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
287 | local_irq_restore(flags); | ||
288 | } | 287 | } |
289 | } | 288 | } |
290 | 289 | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 8da211411b57..d41b21bce6a0 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. |
5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | 5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> |
6 | * | 6 | * |
7 | * Some of code borrowed from https://github.com/xairy/linux by | 7 | * Some code borrowed from https://github.com/xairy/kasan-prototype by |
8 | * Andrey Konovalov <adech.fo@gmail.com> | 8 | * Andrey Konovalov <adech.fo@gmail.com> |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
@@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr) | |||
86 | if (memory_is_poisoned_1(addr + 1)) | 86 | if (memory_is_poisoned_1(addr + 1)) |
87 | return true; | 87 | return true; |
88 | 88 | ||
89 | /* | ||
90 | * If single shadow byte covers 2-byte access, we don't | ||
91 | * need to do anything more. Otherwise, test the first | ||
92 | * shadow byte. | ||
93 | */ | ||
89 | if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) | 94 | if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) |
90 | return false; | 95 | return false; |
91 | 96 | ||
@@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr) | |||
103 | if (memory_is_poisoned_1(addr + 3)) | 108 | if (memory_is_poisoned_1(addr + 3)) |
104 | return true; | 109 | return true; |
105 | 110 | ||
111 | /* | ||
112 | * If single shadow byte covers 4-byte access, we don't | ||
113 | * need to do anything more. Otherwise, test the first | ||
114 | * shadow byte. | ||
115 | */ | ||
106 | if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) | 116 | if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) |
107 | return false; | 117 | return false; |
108 | 118 | ||
@@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr) | |||
120 | if (memory_is_poisoned_1(addr + 7)) | 130 | if (memory_is_poisoned_1(addr + 7)) |
121 | return true; | 131 | return true; |
122 | 132 | ||
123 | if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) | 133 | /* |
134 | * If single shadow byte covers 8-byte access, we don't | ||
135 | * need to do anything more. Otherwise, test the first | ||
136 | * shadow byte. | ||
137 | */ | ||
138 | if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) | ||
124 | return false; | 139 | return false; |
125 | 140 | ||
126 | return unlikely(*(u8 *)shadow_addr); | 141 | return unlikely(*(u8 *)shadow_addr); |
@@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) | |||
139 | if (unlikely(shadow_first_bytes)) | 154 | if (unlikely(shadow_first_bytes)) |
140 | return true; | 155 | return true; |
141 | 156 | ||
142 | if (likely(IS_ALIGNED(addr, 8))) | 157 | /* |
158 | * If two shadow bytes covers 16-byte access, we don't | ||
159 | * need to do anything more. Otherwise, test the last | ||
160 | * shadow byte. | ||
161 | */ | ||
162 | if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) | ||
143 | return false; | 163 | return false; |
144 | 164 | ||
145 | return memory_is_poisoned_1(addr + 15); | 165 | return memory_is_poisoned_1(addr + 15); |
@@ -203,7 +223,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, | |||
203 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | 223 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); |
204 | 224 | ||
205 | if (unlikely(ret != (unsigned long)last_shadow || | 225 | if (unlikely(ret != (unsigned long)last_shadow || |
206 | ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | 226 | ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) |
207 | return true; | 227 | return true; |
208 | } | 228 | } |
209 | return false; | 229 | return false; |
@@ -235,18 +255,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | |||
235 | static __always_inline void check_memory_region(unsigned long addr, | 255 | static __always_inline void check_memory_region(unsigned long addr, |
236 | size_t size, bool write) | 256 | size_t size, bool write) |
237 | { | 257 | { |
238 | struct kasan_access_info info; | ||
239 | |||
240 | if (unlikely(size == 0)) | 258 | if (unlikely(size == 0)) |
241 | return; | 259 | return; |
242 | 260 | ||
243 | if (unlikely((void *)addr < | 261 | if (unlikely((void *)addr < |
244 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | 262 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { |
245 | info.access_addr = (void *)addr; | 263 | kasan_report(addr, size, write, _RET_IP_); |
246 | info.access_size = size; | ||
247 | info.is_write = write; | ||
248 | info.ip = _RET_IP_; | ||
249 | kasan_report_user_access(&info); | ||
250 | return; | 264 | return; |
251 | } | 265 | } |
252 | 266 | ||
@@ -524,7 +538,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, | |||
524 | 538 | ||
525 | static int __init kasan_memhotplug_init(void) | 539 | static int __init kasan_memhotplug_init(void) |
526 | { | 540 | { |
527 | pr_err("WARNING: KASan doesn't support memory hot-add\n"); | 541 | pr_err("WARNING: KASAN doesn't support memory hot-add\n"); |
528 | pr_err("Memory hot-add will be disabled\n"); | 542 | pr_err("Memory hot-add will be disabled\n"); |
529 | 543 | ||
530 | hotplug_memory_notifier(kasan_mem_notifier, 0); | 544 | hotplug_memory_notifier(kasan_mem_notifier, 0); |
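Illustration (not part of the patch): the alignment checks above rely on the generic KASAN shadow layout, where one shadow byte describes an 8-byte granule (KASAN_SHADOW_SCALE_SHIFT = 3), so an 8- or 16-byte access starting on a granule boundary is fully covered by the shadow byte(s) already tested. A minimal userspace sketch of the address-to-shadow mapping implied by kasan_shadow_to_mem(); the shadow offset used here is a placeholder, the real one is arch-specific.

#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT	3
#define KASAN_SHADOW_SCALE_SIZE		(1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK		(KASAN_SHADOW_SCALE_SIZE - 1)

/* Placeholder shadow offset; the real value is architecture-specific. */
#define KASAN_SHADOW_OFFSET		0x100000UL

/* mem -> shadow, the inverse of kasan_shadow_to_mem() in kasan.h. */
static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long a = 0x1000;	/* granule-aligned: one shadow byte covers an 8-byte access */
	unsigned long b = 0x1004;	/* unaligned: an 8-byte access spills into the next granule */

	printf("0x%lx -> shadow 0x%lx, aligned=%d\n",
	       a, mem_to_shadow(a), (a & KASAN_SHADOW_MASK) == 0);
	printf("0x%lx -> shadow 0x%lx, aligned=%d\n",
	       b, mem_to_shadow(b), (b & KASAN_SHADOW_MASK) == 0);
	return 0;
}

The aligned case is exactly what the new IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE) tests in memory_is_poisoned_8() and memory_is_poisoned_16() detect; the unaligned case must also consult the following shadow byte.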
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c242adf6bc85..4f6c62e5c21e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -54,16 +54,13 @@ struct kasan_global { | |||
54 | #endif | 54 | #endif |
55 | }; | 55 | }; |
56 | 56 | ||
57 | void kasan_report_error(struct kasan_access_info *info); | ||
58 | void kasan_report_user_access(struct kasan_access_info *info); | ||
59 | |||
60 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | 57 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) |
61 | { | 58 | { |
62 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | 59 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) |
63 | << KASAN_SHADOW_SCALE_SHIFT); | 60 | << KASAN_SHADOW_SCALE_SHIFT); |
64 | } | 61 | } |
65 | 62 | ||
66 | static inline bool kasan_enabled(void) | 63 | static inline bool kasan_report_enabled(void) |
67 | { | 64 | { |
68 | return !current->kasan_depth; | 65 | return !current->kasan_depth; |
69 | } | 66 | } |
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e07c94fbd0ac..12f222d0224b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. |
5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> | 5 | * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> |
6 | * | 6 | * |
7 | * Some of code borrowed from https://github.com/xairy/linux by | 7 | * Some code borrowed from https://github.com/xairy/kasan-prototype by |
8 | * Andrey Konovalov <adech.fo@gmail.com> | 8 | * Andrey Konovalov <adech.fo@gmail.com> |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/string.h> | 22 | #include <linux/string.h> |
23 | #include <linux/types.h> | 23 | #include <linux/types.h> |
24 | #include <linux/kasan.h> | 24 | #include <linux/kasan.h> |
25 | #include <linux/module.h> | ||
25 | 26 | ||
26 | #include <asm/sections.h> | 27 | #include <asm/sections.h> |
27 | 28 | ||
@@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size) | |||
48 | 49 | ||
49 | static void print_error_description(struct kasan_access_info *info) | 50 | static void print_error_description(struct kasan_access_info *info) |
50 | { | 51 | { |
51 | const char *bug_type = "unknown crash"; | 52 | const char *bug_type = "unknown-crash"; |
52 | u8 shadow_val; | 53 | u8 *shadow_addr; |
53 | 54 | ||
54 | info->first_bad_addr = find_first_bad_addr(info->access_addr, | 55 | info->first_bad_addr = find_first_bad_addr(info->access_addr, |
55 | info->access_size); | 56 | info->access_size); |
56 | 57 | ||
57 | shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); | 58 | shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); |
58 | 59 | ||
59 | switch (shadow_val) { | 60 | /* |
60 | case KASAN_FREE_PAGE: | 61 | * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look |
61 | case KASAN_KMALLOC_FREE: | 62 | * at the next shadow byte to determine the type of the bad access. |
62 | bug_type = "use after free"; | 63 | */ |
64 | if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) | ||
65 | shadow_addr++; | ||
66 | |||
67 | switch (*shadow_addr) { | ||
68 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
69 | /* | ||
70 | * In theory it's still possible to see these shadow values | ||
71 | * due to a data race in the kernel code. | ||
72 | */ | ||
73 | bug_type = "out-of-bounds"; | ||
63 | break; | 74 | break; |
64 | case KASAN_PAGE_REDZONE: | 75 | case KASAN_PAGE_REDZONE: |
65 | case KASAN_KMALLOC_REDZONE: | 76 | case KASAN_KMALLOC_REDZONE: |
77 | bug_type = "slab-out-of-bounds"; | ||
78 | break; | ||
66 | case KASAN_GLOBAL_REDZONE: | 79 | case KASAN_GLOBAL_REDZONE: |
67 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | 80 | bug_type = "global-out-of-bounds"; |
68 | bug_type = "out of bounds access"; | ||
69 | break; | 81 | break; |
70 | case KASAN_STACK_LEFT: | 82 | case KASAN_STACK_LEFT: |
71 | case KASAN_STACK_MID: | 83 | case KASAN_STACK_MID: |
72 | case KASAN_STACK_RIGHT: | 84 | case KASAN_STACK_RIGHT: |
73 | case KASAN_STACK_PARTIAL: | 85 | case KASAN_STACK_PARTIAL: |
74 | bug_type = "out of bounds on stack"; | 86 | bug_type = "stack-out-of-bounds"; |
87 | break; | ||
88 | case KASAN_FREE_PAGE: | ||
89 | case KASAN_KMALLOC_FREE: | ||
90 | bug_type = "use-after-free"; | ||
75 | break; | 91 | break; |
76 | } | 92 | } |
77 | 93 | ||
78 | pr_err("BUG: KASan: %s in %pS at addr %p\n", | 94 | pr_err("BUG: KASAN: %s in %pS at addr %p\n", |
79 | bug_type, (void *)info->ip, | 95 | bug_type, (void *)info->ip, |
80 | info->access_addr); | 96 | info->access_addr); |
81 | pr_err("%s of size %zu by task %s/%d\n", | 97 | pr_err("%s of size %zu by task %s/%d\n", |
@@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info) | |||
85 | 101 | ||
86 | static inline bool kernel_or_module_addr(const void *addr) | 102 | static inline bool kernel_or_module_addr(const void *addr) |
87 | { | 103 | { |
88 | return (addr >= (void *)_stext && addr < (void *)_end) | 104 | if (addr >= (void *)_stext && addr < (void *)_end) |
89 | || (addr >= (void *)MODULES_VADDR | 105 | return true; |
90 | && addr < (void *)MODULES_END); | 106 | if (is_module_address((unsigned long)addr)) |
107 | return true; | ||
108 | return false; | ||
91 | } | 109 | } |
92 | 110 | ||
93 | static inline bool init_task_stack_addr(const void *addr) | 111 | static inline bool init_task_stack_addr(const void *addr) |
@@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr) | |||
161 | for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { | 179 | for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { |
162 | const void *kaddr = kasan_shadow_to_mem(shadow_row); | 180 | const void *kaddr = kasan_shadow_to_mem(shadow_row); |
163 | char buffer[4 + (BITS_PER_LONG/8)*2]; | 181 | char buffer[4 + (BITS_PER_LONG/8)*2]; |
182 | char shadow_buf[SHADOW_BYTES_PER_ROW]; | ||
164 | 183 | ||
165 | snprintf(buffer, sizeof(buffer), | 184 | snprintf(buffer, sizeof(buffer), |
166 | (i == 0) ? ">%p: " : " %p: ", kaddr); | 185 | (i == 0) ? ">%p: " : " %p: ", kaddr); |
167 | 186 | /* | |
168 | kasan_disable_current(); | 187 | * We should not pass a shadow pointer to generic |
188 | * function, because generic functions may try to | ||
189 | * access kasan mapping for the passed address. | ||
190 | */ | ||
191 | memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); | ||
169 | print_hex_dump(KERN_ERR, buffer, | 192 | print_hex_dump(KERN_ERR, buffer, |
170 | DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, | 193 | DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, |
171 | shadow_row, SHADOW_BYTES_PER_ROW, 0); | 194 | shadow_buf, SHADOW_BYTES_PER_ROW, 0); |
172 | kasan_enable_current(); | ||
173 | 195 | ||
174 | if (row_is_guilty(shadow_row, shadow)) | 196 | if (row_is_guilty(shadow_row, shadow)) |
175 | pr_err("%*c\n", | 197 | pr_err("%*c\n", |
@@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr) | |||
182 | 204 | ||
183 | static DEFINE_SPINLOCK(report_lock); | 205 | static DEFINE_SPINLOCK(report_lock); |
184 | 206 | ||
185 | void kasan_report_error(struct kasan_access_info *info) | 207 | static void kasan_report_error(struct kasan_access_info *info) |
186 | { | ||
187 | unsigned long flags; | ||
188 | |||
189 | spin_lock_irqsave(&report_lock, flags); | ||
190 | pr_err("=================================" | ||
191 | "=================================\n"); | ||
192 | print_error_description(info); | ||
193 | print_address_description(info); | ||
194 | print_shadow_for_address(info->first_bad_addr); | ||
195 | pr_err("=================================" | ||
196 | "=================================\n"); | ||
197 | spin_unlock_irqrestore(&report_lock, flags); | ||
198 | } | ||
199 | |||
200 | void kasan_report_user_access(struct kasan_access_info *info) | ||
201 | { | 208 | { |
202 | unsigned long flags; | 209 | unsigned long flags; |
210 | const char *bug_type; | ||
203 | 211 | ||
212 | /* | ||
213 | * Make sure we don't end up in loop. | ||
214 | */ | ||
215 | kasan_disable_current(); | ||
204 | spin_lock_irqsave(&report_lock, flags); | 216 | spin_lock_irqsave(&report_lock, flags); |
205 | pr_err("=================================" | 217 | pr_err("=================================" |
206 | "=================================\n"); | 218 | "=================================\n"); |
207 | pr_err("BUG: KASan: user-memory-access on address %p\n", | 219 | if (info->access_addr < |
208 | info->access_addr); | 220 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { |
209 | pr_err("%s of size %zu by task %s/%d\n", | 221 | if ((unsigned long)info->access_addr < PAGE_SIZE) |
210 | info->is_write ? "Write" : "Read", | 222 | bug_type = "null-ptr-deref"; |
211 | info->access_size, current->comm, task_pid_nr(current)); | 223 | else if ((unsigned long)info->access_addr < TASK_SIZE) |
212 | dump_stack(); | 224 | bug_type = "user-memory-access"; |
225 | else | ||
226 | bug_type = "wild-memory-access"; | ||
227 | pr_err("BUG: KASAN: %s on address %p\n", | ||
228 | bug_type, info->access_addr); | ||
229 | pr_err("%s of size %zu by task %s/%d\n", | ||
230 | info->is_write ? "Write" : "Read", | ||
231 | info->access_size, current->comm, | ||
232 | task_pid_nr(current)); | ||
233 | dump_stack(); | ||
234 | } else { | ||
235 | print_error_description(info); | ||
236 | print_address_description(info); | ||
237 | print_shadow_for_address(info->first_bad_addr); | ||
238 | } | ||
213 | pr_err("=================================" | 239 | pr_err("=================================" |
214 | "=================================\n"); | 240 | "=================================\n"); |
241 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | ||
215 | spin_unlock_irqrestore(&report_lock, flags); | 242 | spin_unlock_irqrestore(&report_lock, flags); |
243 | kasan_enable_current(); | ||
216 | } | 244 | } |
217 | 245 | ||
218 | void kasan_report(unsigned long addr, size_t size, | 246 | void kasan_report(unsigned long addr, size_t size, |
@@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size, | |||
220 | { | 248 | { |
221 | struct kasan_access_info info; | 249 | struct kasan_access_info info; |
222 | 250 | ||
223 | if (likely(!kasan_enabled())) | 251 | if (likely(!kasan_report_enabled())) |
224 | return; | 252 | return; |
225 | 253 | ||
226 | info.access_addr = (void *)addr; | 254 | info.access_addr = (void *)addr; |
227 | info.access_size = size; | 255 | info.access_size = size; |
228 | info.is_write = is_write; | 256 | info.is_write = is_write; |
229 | info.ip = ip; | 257 | info.ip = ip; |
258 | |||
230 | kasan_report_error(&info); | 259 | kasan_report_error(&info); |
231 | } | 260 | } |
232 | 261 | ||
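Illustration (not part of the patch): the rewritten kasan_report_error() above classifies accesses that fall below the shadow-covered range purely by address. A standalone sketch of that classification; PAGE_SIZE and TASK_SIZE here are placeholder values, the real ones are architecture-dependent.

#include <stdio.h>

/* Placeholder values, for illustration only. */
#define PAGE_SIZE	0x1000ULL
#define TASK_SIZE	0x00007fffffffffffULL

static const char *classify_bad_access(unsigned long long addr)
{
	if (addr < PAGE_SIZE)
		return "null-ptr-deref";
	if (addr < TASK_SIZE)
		return "user-memory-access";
	return "wild-memory-access";
}

int main(void)
{
	printf("%s\n", classify_bad_access(0x10));
	printf("%s\n", classify_bad_access(0x400000));
	printf("%s\n", classify_bad_access(0xdead000000000000ULL));
	return 0;
}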
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 77191eccdc6f..19423a45d7d7 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object) | |||
479 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | 479 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) |
480 | { | 480 | { |
481 | unsigned long flags; | 481 | unsigned long flags; |
482 | struct kmemleak_object *object = NULL; | 482 | struct kmemleak_object *object; |
483 | 483 | ||
484 | rcu_read_lock(); | 484 | rcu_read_lock(); |
485 | read_lock_irqsave(&kmemleak_lock, flags); | 485 | read_lock_irqsave(&kmemleak_lock, flags); |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
475 | flush_dcache_page(page); | 475 | flush_dcache_page(page); |
476 | } else { | 476 | } else { |
477 | put_page(page); | 477 | put_page(page); |
478 | out: page = NULL; | 478 | out: |
479 | page = NULL; | ||
479 | } | 480 | } |
480 | up_read(&mm->mmap_sem); | 481 | up_read(&mm->mmap_sem); |
481 | return page; | 482 | return page; |
@@ -625,7 +626,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
625 | unlock_page(page); | 626 | unlock_page(page); |
626 | put_page(page); | 627 | put_page(page); |
627 | 628 | ||
628 | if (stable_node->hlist.first) | 629 | if (!hlist_empty(&stable_node->hlist)) |
629 | ksm_pages_sharing--; | 630 | ksm_pages_sharing--; |
630 | else | 631 | else |
631 | ksm_pages_shared--; | 632 | ksm_pages_shared--; |
@@ -1021,8 +1022,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
1021 | if (page == kpage) /* ksm page forked */ | 1022 | if (page == kpage) /* ksm page forked */ |
1022 | return 0; | 1023 | return 0; |
1023 | 1024 | ||
1024 | if (!(vma->vm_flags & VM_MERGEABLE)) | ||
1025 | goto out; | ||
1026 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | 1025 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) |
1027 | goto out; | 1026 | goto out; |
1028 | BUG_ON(PageTransCompound(page)); | 1027 | BUG_ON(PageTransCompound(page)); |
@@ -1087,10 +1086,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, | |||
1087 | int err = -EFAULT; | 1086 | int err = -EFAULT; |
1088 | 1087 | ||
1089 | down_read(&mm->mmap_sem); | 1088 | down_read(&mm->mmap_sem); |
1090 | if (ksm_test_exit(mm)) | 1089 | vma = find_mergeable_vma(mm, rmap_item->address); |
1091 | goto out; | 1090 | if (!vma) |
1092 | vma = find_vma(mm, rmap_item->address); | ||
1093 | if (!vma || vma->vm_start > rmap_item->address) | ||
1094 | goto out; | 1091 | goto out; |
1095 | 1092 | ||
1096 | err = try_to_merge_one_page(vma, page, kpage); | 1093 | err = try_to_merge_one_page(vma, page, kpage); |
@@ -1177,8 +1174,18 @@ again: | |||
1177 | cond_resched(); | 1174 | cond_resched(); |
1178 | stable_node = rb_entry(*new, struct stable_node, node); | 1175 | stable_node = rb_entry(*new, struct stable_node, node); |
1179 | tree_page = get_ksm_page(stable_node, false); | 1176 | tree_page = get_ksm_page(stable_node, false); |
1180 | if (!tree_page) | 1177 | if (!tree_page) { |
1181 | return NULL; | 1178 | /* |
1179 | * If we walked over a stale stable_node, | ||
1180 | * get_ksm_page() will call rb_erase() and it | ||
1181 | * may rebalance the tree from under us. So | ||
1182 | * restart the search from scratch. Returning | ||
1183 | * NULL would be safe too, but we'd generate | ||
1184 | * false negative insertions just because some | ||
1185 | * stable_node was stale. | ||
1186 | */ | ||
1187 | goto again; | ||
1188 | } | ||
1182 | 1189 | ||
1183 | ret = memcmp_pages(page, tree_page); | 1190 | ret = memcmp_pages(page, tree_page); |
1184 | put_page(tree_page); | 1191 | put_page(tree_page); |
@@ -1254,12 +1261,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
1254 | unsigned long kpfn; | 1261 | unsigned long kpfn; |
1255 | struct rb_root *root; | 1262 | struct rb_root *root; |
1256 | struct rb_node **new; | 1263 | struct rb_node **new; |
1257 | struct rb_node *parent = NULL; | 1264 | struct rb_node *parent; |
1258 | struct stable_node *stable_node; | 1265 | struct stable_node *stable_node; |
1259 | 1266 | ||
1260 | kpfn = page_to_pfn(kpage); | 1267 | kpfn = page_to_pfn(kpage); |
1261 | nid = get_kpfn_nid(kpfn); | 1268 | nid = get_kpfn_nid(kpfn); |
1262 | root = root_stable_tree + nid; | 1269 | root = root_stable_tree + nid; |
1270 | again: | ||
1271 | parent = NULL; | ||
1263 | new = &root->rb_node; | 1272 | new = &root->rb_node; |
1264 | 1273 | ||
1265 | while (*new) { | 1274 | while (*new) { |
@@ -1269,8 +1278,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
1269 | cond_resched(); | 1278 | cond_resched(); |
1270 | stable_node = rb_entry(*new, struct stable_node, node); | 1279 | stable_node = rb_entry(*new, struct stable_node, node); |
1271 | tree_page = get_ksm_page(stable_node, false); | 1280 | tree_page = get_ksm_page(stable_node, false); |
1272 | if (!tree_page) | 1281 | if (!tree_page) { |
1273 | return NULL; | 1282 | /* |
1283 | * If we walked over a stale stable_node, | ||
1284 | * get_ksm_page() will call rb_erase() and it | ||
1285 | * may rebalance the tree from under us. So | ||
1286 | * restart the search from scratch. Returning | ||
1287 | * NULL would be safe too, but we'd generate | ||
1288 | * false negative insertions just because some | ||
1289 | * stable_node was stale. | ||
1290 | */ | ||
1291 | goto again; | ||
1292 | } | ||
1274 | 1293 | ||
1275 | ret = memcmp_pages(kpage, tree_page); | 1294 | ret = memcmp_pages(kpage, tree_page); |
1276 | put_page(tree_page); | 1295 | put_page(tree_page); |
@@ -1340,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1340 | cond_resched(); | 1359 | cond_resched(); |
1341 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1360 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1342 | tree_page = get_mergeable_page(tree_rmap_item); | 1361 | tree_page = get_mergeable_page(tree_rmap_item); |
1343 | if (IS_ERR_OR_NULL(tree_page)) | 1362 | if (!tree_page) |
1344 | return NULL; | 1363 | return NULL; |
1345 | 1364 | ||
1346 | /* | 1365 | /* |
@@ -1914,9 +1933,11 @@ again: | |||
1914 | struct anon_vma_chain *vmac; | 1933 | struct anon_vma_chain *vmac; |
1915 | struct vm_area_struct *vma; | 1934 | struct vm_area_struct *vma; |
1916 | 1935 | ||
1936 | cond_resched(); | ||
1917 | anon_vma_lock_read(anon_vma); | 1937 | anon_vma_lock_read(anon_vma); |
1918 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1938 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1919 | 0, ULONG_MAX) { | 1939 | 0, ULONG_MAX) { |
1940 | cond_resched(); | ||
1920 | vma = vmac->vma; | 1941 | vma = vmac->vma; |
1921 | if (rmap_item->address < vma->vm_start || | 1942 | if (rmap_item->address < vma->vm_start || |
1922 | rmap_item->address >= vma->vm_end) | 1943 | rmap_item->address >= vma->vm_end) |
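Illustration (not part of the patch): both stable-tree walks above now restart from the root when get_ksm_page() finds a stale stable_node, because dropping the stale node can rebalance the rbtree under the walker, and returning NULL would produce false-negative insertions. A toy userspace model of that restart-on-stale pattern, using a sorted array instead of an rbtree.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define N 8

/* Toy stable tree: a sorted array of keys; key 0 marks a stale slot. */
static int tree[N] = { 2, 0 /* stale */, 5, 9 };
static int nr = 4;

/* Models get_ksm_page(..., false): a stale entry is dropped from the
 * structure, which shifts (rebalances) everything after it. */
static bool validate(int idx)
{
	if (tree[idx] != 0)
		return true;
	memmove(&tree[idx], &tree[idx + 1], (nr - idx - 1) * sizeof(int));
	nr--;
	return false;
}

static int search(int key)
{
again:
	for (int lo = 0, hi = nr - 1; lo <= hi; ) {
		int mid = (lo + hi) / 2;

		if (!validate(mid)) {
			/* The structure changed under us: restart from
			 * scratch instead of reporting "not found". */
			goto again;
		}
		if (tree[mid] == key)
			return mid;
		if (tree[mid] < key)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return -1;
}

int main(void)
{
	printf("index of 9: %d (nr=%d)\n", search(9), nr);
	return 0;
}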
diff --git a/mm/list_lru.c b/mm/list_lru.c index e1da19fac1b3..afc71ea9a381 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
@@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru) | |||
42 | #ifdef CONFIG_MEMCG_KMEM | 42 | #ifdef CONFIG_MEMCG_KMEM |
43 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | 43 | static inline bool list_lru_memcg_aware(struct list_lru *lru) |
44 | { | 44 | { |
45 | /* | ||
46 | * This needs node 0 to be always present, even | ||
47 | * in the systems supporting sparse numa ids. | ||
48 | */ | ||
45 | return !!lru->node[0].memcg_lrus; | 49 | return !!lru->node[0].memcg_lrus; |
46 | } | 50 | } |
47 | 51 | ||
@@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | |||
59 | return &nlru->lru; | 63 | return &nlru->lru; |
60 | } | 64 | } |
61 | 65 | ||
66 | static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) | ||
67 | { | ||
68 | struct page *page; | ||
69 | |||
70 | if (!memcg_kmem_enabled()) | ||
71 | return NULL; | ||
72 | page = virt_to_head_page(ptr); | ||
73 | return page->mem_cgroup; | ||
74 | } | ||
75 | |||
62 | static inline struct list_lru_one * | 76 | static inline struct list_lru_one * |
63 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | 77 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) |
64 | { | 78 | { |
@@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | |||
377 | { | 391 | { |
378 | int i; | 392 | int i; |
379 | 393 | ||
380 | for (i = 0; i < nr_node_ids; i++) { | 394 | if (!memcg_aware) |
381 | if (!memcg_aware) | 395 | return 0; |
382 | lru->node[i].memcg_lrus = NULL; | 396 | |
383 | else if (memcg_init_list_lru_node(&lru->node[i])) | 397 | for_each_node(i) { |
398 | if (memcg_init_list_lru_node(&lru->node[i])) | ||
384 | goto fail; | 399 | goto fail; |
385 | } | 400 | } |
386 | return 0; | 401 | return 0; |
387 | fail: | 402 | fail: |
388 | for (i = i - 1; i >= 0; i--) | 403 | for (i = i - 1; i >= 0; i--) { |
404 | if (!lru->node[i].memcg_lrus) | ||
405 | continue; | ||
389 | memcg_destroy_list_lru_node(&lru->node[i]); | 406 | memcg_destroy_list_lru_node(&lru->node[i]); |
407 | } | ||
390 | return -ENOMEM; | 408 | return -ENOMEM; |
391 | } | 409 | } |
392 | 410 | ||
@@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) | |||
397 | if (!list_lru_memcg_aware(lru)) | 415 | if (!list_lru_memcg_aware(lru)) |
398 | return; | 416 | return; |
399 | 417 | ||
400 | for (i = 0; i < nr_node_ids; i++) | 418 | for_each_node(i) |
401 | memcg_destroy_list_lru_node(&lru->node[i]); | 419 | memcg_destroy_list_lru_node(&lru->node[i]); |
402 | } | 420 | } |
403 | 421 | ||
@@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru, | |||
409 | if (!list_lru_memcg_aware(lru)) | 427 | if (!list_lru_memcg_aware(lru)) |
410 | return 0; | 428 | return 0; |
411 | 429 | ||
412 | for (i = 0; i < nr_node_ids; i++) { | 430 | for_each_node(i) { |
413 | if (memcg_update_list_lru_node(&lru->node[i], | 431 | if (memcg_update_list_lru_node(&lru->node[i], |
414 | old_size, new_size)) | 432 | old_size, new_size)) |
415 | goto fail; | 433 | goto fail; |
416 | } | 434 | } |
417 | return 0; | 435 | return 0; |
418 | fail: | 436 | fail: |
419 | for (i = i - 1; i >= 0; i--) | 437 | for (i = i - 1; i >= 0; i--) { |
438 | if (!lru->node[i].memcg_lrus) | ||
439 | continue; | ||
440 | |||
420 | memcg_cancel_update_list_lru_node(&lru->node[i], | 441 | memcg_cancel_update_list_lru_node(&lru->node[i], |
421 | old_size, new_size); | 442 | old_size, new_size); |
443 | } | ||
422 | return -ENOMEM; | 444 | return -ENOMEM; |
423 | } | 445 | } |
424 | 446 | ||
@@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, | |||
430 | if (!list_lru_memcg_aware(lru)) | 452 | if (!list_lru_memcg_aware(lru)) |
431 | return; | 453 | return; |
432 | 454 | ||
433 | for (i = 0; i < nr_node_ids; i++) | 455 | for_each_node(i) |
434 | memcg_cancel_update_list_lru_node(&lru->node[i], | 456 | memcg_cancel_update_list_lru_node(&lru->node[i], |
435 | old_size, new_size); | 457 | old_size, new_size); |
436 | } | 458 | } |
@@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, | |||
485 | if (!list_lru_memcg_aware(lru)) | 507 | if (!list_lru_memcg_aware(lru)) |
486 | return; | 508 | return; |
487 | 509 | ||
488 | for (i = 0; i < nr_node_ids; i++) | 510 | for_each_node(i) |
489 | memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); | 511 | memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); |
490 | } | 512 | } |
491 | 513 | ||
@@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, | |||
522 | if (!lru->node) | 544 | if (!lru->node) |
523 | goto out; | 545 | goto out; |
524 | 546 | ||
525 | for (i = 0; i < nr_node_ids; i++) { | 547 | for_each_node(i) { |
526 | spin_lock_init(&lru->node[i].lock); | 548 | spin_lock_init(&lru->node[i].lock); |
527 | if (key) | 549 | if (key) |
528 | lockdep_set_class(&lru->node[i].lock, key); | 550 | lockdep_set_class(&lru->node[i].lock, key); |
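Illustration (not part of the patch): the list_lru hunks above switch node iteration to for_each_node() so sparse node IDs work, and the error paths now skip nodes whose memcg_lrus was never allocated before rolling back. A toy userspace model of that rollback; the sparse online-node set and the per-node structure are invented stand-ins.

#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 8

/* Toy per-node state; a NULL memcg_lrus means "never initialised". */
struct lru_node {
	int *memcg_lrus;
};

/* Stand-in for for_each_node(): only these IDs exist, deliberately sparse. */
static const int online_nodes[] = { 0, 2, 5 };

static int init_node(struct lru_node *n, int nid, int fail_at)
{
	if (nid == fail_at)
		return -1;
	n->memcg_lrus = calloc(4, sizeof(int));
	return n->memcg_lrus ? 0 : -1;
}

static int init_all(struct lru_node *nodes, int fail_at)
{
	size_t k;
	int i;

	for (k = 0; k < sizeof(online_nodes) / sizeof(online_nodes[0]); k++) {
		i = online_nodes[k];
		if (init_node(&nodes[i], i, fail_at))
			goto fail;
	}
	return 0;
fail:
	/* Roll back, skipping nodes whose state was never allocated
	 * (holes in the node ID space, or nodes after the failure). */
	for (i = 0; i < MAX_NODES; i++) {
		if (!nodes[i].memcg_lrus)
			continue;
		free(nodes[i].memcg_lrus);
		nodes[i].memcg_lrus = NULL;
	}
	return -1;
}

int main(void)
{
	struct lru_node nodes[MAX_NODES] = { { NULL } };

	printf("init_all -> %d\n", init_all(nodes, 5));
	return 0;
}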
diff --git a/mm/maccess.c b/mm/maccess.c index 34fe24759ed1..d159b1c96e48 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
@@ -13,6 +13,11 @@ | |||
13 | * | 13 | * |
14 | * Safely read from address @src to the buffer at @dst. If a kernel fault | 14 | * Safely read from address @src to the buffer at @dst. If a kernel fault |
15 | * happens, handle that and return -EFAULT. | 15 | * happens, handle that and return -EFAULT. |
16 | * | ||
17 | * We ensure that the copy_from_user is executed in atomic context so that | ||
18 | * do_page_fault() doesn't attempt to take mmap_sem. This makes | ||
19 | * probe_kernel_read() suitable for use within regions where the caller | ||
20 | * already holds mmap_sem, or other locks which nest inside mmap_sem. | ||
16 | */ | 21 | */ |
17 | 22 | ||
18 | long __weak probe_kernel_read(void *dst, const void *src, size_t size) | 23 | long __weak probe_kernel_read(void *dst, const void *src, size_t size) |
@@ -99,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) | |||
99 | pagefault_enable(); | 104 | pagefault_enable(); |
100 | set_fs(old_fs); | 105 | set_fs(old_fs); |
101 | 106 | ||
102 | return ret < 0 ? ret : src - unsafe_addr; | 107 | return ret ? -EFAULT : src - unsafe_addr; |
103 | } | 108 | } |
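Illustration (not part of the patch): the strncpy_from_unsafe() fix above treats any nonzero result from the inner byte copy as a fault and returns -EFAULT, instead of only forwarding negative values. A toy userspace model of that return convention; probe_byte() is an invented stand-in for the arch fault-handling copy.

#include <stdio.h>
#include <errno.h>

/* Stand-in for the fault-handling byte copy: returns nonzero (not
 * necessarily negative) when the source "faults". */
static int probe_byte(char *dst, const char *src, int fault)
{
	if (fault)
		return 1;
	*dst = *src;
	return 0;
}

/* Models the fixed convention: any failure becomes -EFAULT, success
 * returns the number of bytes copied including the trailing NUL. */
static long strncpy_from_unsafe_model(char *dst, const char *unsafe_addr,
				      long count, int fault)
{
	const char *src = unsafe_addr;
	int ret;

	if (count <= 0)
		return 0;
	do {
		ret = probe_byte(dst++, src++, fault);
	} while (dst[-1] && ret == 0 && src - unsafe_addr < count);

	return ret ? -EFAULT : src - unsafe_addr;
}

int main(void)
{
	char buf[16] = { 0 };

	printf("ok:    %ld\n", strncpy_from_unsafe_model(buf, "hi", sizeof(buf), 0));
	printf("fault: %ld\n", strncpy_from_unsafe_model(buf, "hi", sizeof(buf), 1));
	return 0;
}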
diff --git a/mm/memblock.c b/mm/memblock.c index 1c7b647e5897..d300f1329814 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
706 | return 0; | 706 | return 0; |
707 | } | 707 | } |
708 | 708 | ||
709 | int __init_memblock memblock_remove_range(struct memblock_type *type, | 709 | static int __init_memblock memblock_remove_range(struct memblock_type *type, |
710 | phys_addr_t base, phys_addr_t size) | 710 | phys_addr_t base, phys_addr_t size) |
711 | { | 711 | { |
712 | int start_rgn, end_rgn; | 712 | int start_rgn, end_rgn; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b732edfddb76..bc502e590366 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -62,6 +62,7 @@ | |||
62 | #include <linux/oom.h> | 62 | #include <linux/oom.h> |
63 | #include <linux/lockdep.h> | 63 | #include <linux/lockdep.h> |
64 | #include <linux/file.h> | 64 | #include <linux/file.h> |
65 | #include <linux/tracehook.h> | ||
65 | #include "internal.h" | 66 | #include "internal.h" |
66 | #include <net/sock.h> | 67 | #include <net/sock.h> |
67 | #include <net/ip.h> | 68 | #include <net/ip.h> |
@@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1661 | 1662 | ||
1662 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 1663 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
1663 | { | 1664 | { |
1664 | if (!current->memcg_oom.may_oom) | 1665 | if (!current->memcg_may_oom) |
1665 | return; | 1666 | return; |
1666 | /* | 1667 | /* |
1667 | * We are in the middle of the charge context here, so we | 1668 | * We are in the middle of the charge context here, so we |
@@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | |||
1678 | * and when we know whether the fault was overall successful. | 1679 | * and when we know whether the fault was overall successful. |
1679 | */ | 1680 | */ |
1680 | css_get(&memcg->css); | 1681 | css_get(&memcg->css); |
1681 | current->memcg_oom.memcg = memcg; | 1682 | current->memcg_in_oom = memcg; |
1682 | current->memcg_oom.gfp_mask = mask; | 1683 | current->memcg_oom_gfp_mask = mask; |
1683 | current->memcg_oom.order = order; | 1684 | current->memcg_oom_order = order; |
1684 | } | 1685 | } |
1685 | 1686 | ||
1686 | /** | 1687 | /** |
@@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | |||
1702 | */ | 1703 | */ |
1703 | bool mem_cgroup_oom_synchronize(bool handle) | 1704 | bool mem_cgroup_oom_synchronize(bool handle) |
1704 | { | 1705 | { |
1705 | struct mem_cgroup *memcg = current->memcg_oom.memcg; | 1706 | struct mem_cgroup *memcg = current->memcg_in_oom; |
1706 | struct oom_wait_info owait; | 1707 | struct oom_wait_info owait; |
1707 | bool locked; | 1708 | bool locked; |
1708 | 1709 | ||
@@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
1730 | if (locked && !memcg->oom_kill_disable) { | 1731 | if (locked && !memcg->oom_kill_disable) { |
1731 | mem_cgroup_unmark_under_oom(memcg); | 1732 | mem_cgroup_unmark_under_oom(memcg); |
1732 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1733 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1733 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, | 1734 | mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, |
1734 | current->memcg_oom.order); | 1735 | current->memcg_oom_order); |
1735 | } else { | 1736 | } else { |
1736 | schedule(); | 1737 | schedule(); |
1737 | mem_cgroup_unmark_under_oom(memcg); | 1738 | mem_cgroup_unmark_under_oom(memcg); |
@@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
1748 | memcg_oom_recover(memcg); | 1749 | memcg_oom_recover(memcg); |
1749 | } | 1750 | } |
1750 | cleanup: | 1751 | cleanup: |
1751 | current->memcg_oom.memcg = NULL; | 1752 | current->memcg_in_oom = NULL; |
1752 | css_put(&memcg->css); | 1753 | css_put(&memcg->css); |
1753 | return true; | 1754 | return true; |
1754 | } | 1755 | } |
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
1972 | return NOTIFY_OK; | 1973 | return NOTIFY_OK; |
1973 | } | 1974 | } |
1974 | 1975 | ||
1976 | /* | ||
1977 | * Scheduled by try_charge() to be executed from the userland return path | ||
1978 | * and reclaims memory over the high limit. | ||
1979 | */ | ||
1980 | void mem_cgroup_handle_over_high(void) | ||
1981 | { | ||
1982 | unsigned int nr_pages = current->memcg_nr_pages_over_high; | ||
1983 | struct mem_cgroup *memcg, *pos; | ||
1984 | |||
1985 | if (likely(!nr_pages)) | ||
1986 | return; | ||
1987 | |||
1988 | pos = memcg = get_mem_cgroup_from_mm(current->mm); | ||
1989 | |||
1990 | do { | ||
1991 | if (page_counter_read(&pos->memory) <= pos->high) | ||
1992 | continue; | ||
1993 | mem_cgroup_events(pos, MEMCG_HIGH, 1); | ||
1994 | try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); | ||
1995 | } while ((pos = parent_mem_cgroup(pos))); | ||
1996 | |||
1997 | css_put(&memcg->css); | ||
1998 | current->memcg_nr_pages_over_high = 0; | ||
1999 | } | ||
2000 | |||
1975 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2001 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
1976 | unsigned int nr_pages) | 2002 | unsigned int nr_pages) |
1977 | { | 2003 | { |
@@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1982 | unsigned long nr_reclaimed; | 2008 | unsigned long nr_reclaimed; |
1983 | bool may_swap = true; | 2009 | bool may_swap = true; |
1984 | bool drained = false; | 2010 | bool drained = false; |
1985 | int ret = 0; | ||
1986 | 2011 | ||
1987 | if (mem_cgroup_is_root(memcg)) | 2012 | if (mem_cgroup_is_root(memcg)) |
1988 | goto done; | 2013 | return 0; |
1989 | retry: | 2014 | retry: |
1990 | if (consume_stock(memcg, nr_pages)) | 2015 | if (consume_stock(memcg, nr_pages)) |
1991 | goto done; | 2016 | return 0; |
1992 | 2017 | ||
1993 | if (!do_swap_account || | 2018 | if (!do_swap_account || |
1994 | !page_counter_try_charge(&memcg->memsw, batch, &counter)) { | 2019 | page_counter_try_charge(&memcg->memsw, batch, &counter)) { |
1995 | if (!page_counter_try_charge(&memcg->memory, batch, &counter)) | 2020 | if (page_counter_try_charge(&memcg->memory, batch, &counter)) |
1996 | goto done_restock; | 2021 | goto done_restock; |
1997 | if (do_swap_account) | 2022 | if (do_swap_account) |
1998 | page_counter_uncharge(&memcg->memsw, batch); | 2023 | page_counter_uncharge(&memcg->memsw, batch); |
@@ -2016,7 +2041,7 @@ retry: | |||
2016 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 2041 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2017 | fatal_signal_pending(current) || | 2042 | fatal_signal_pending(current) || |
2018 | current->flags & PF_EXITING)) | 2043 | current->flags & PF_EXITING)) |
2019 | goto bypass; | 2044 | goto force; |
2020 | 2045 | ||
2021 | if (unlikely(task_in_memcg_oom(current))) | 2046 | if (unlikely(task_in_memcg_oom(current))) |
2022 | goto nomem; | 2047 | goto nomem; |
@@ -2062,38 +2087,54 @@ retry: | |||
2062 | goto retry; | 2087 | goto retry; |
2063 | 2088 | ||
2064 | if (gfp_mask & __GFP_NOFAIL) | 2089 | if (gfp_mask & __GFP_NOFAIL) |
2065 | goto bypass; | 2090 | goto force; |
2066 | 2091 | ||
2067 | if (fatal_signal_pending(current)) | 2092 | if (fatal_signal_pending(current)) |
2068 | goto bypass; | 2093 | goto force; |
2069 | 2094 | ||
2070 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | 2095 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); |
2071 | 2096 | ||
2072 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2097 | mem_cgroup_oom(mem_over_limit, gfp_mask, |
2098 | get_order(nr_pages * PAGE_SIZE)); | ||
2073 | nomem: | 2099 | nomem: |
2074 | if (!(gfp_mask & __GFP_NOFAIL)) | 2100 | if (!(gfp_mask & __GFP_NOFAIL)) |
2075 | return -ENOMEM; | 2101 | return -ENOMEM; |
2076 | bypass: | 2102 | force: |
2077 | return -EINTR; | 2103 | /* |
2104 | * The allocation either can't fail or will lead to more memory | ||
2105 | * being freed very soon. Allow memory usage go over the limit | ||
2106 | * temporarily by force charging it. | ||
2107 | */ | ||
2108 | page_counter_charge(&memcg->memory, nr_pages); | ||
2109 | if (do_swap_account) | ||
2110 | page_counter_charge(&memcg->memsw, nr_pages); | ||
2111 | css_get_many(&memcg->css, nr_pages); | ||
2112 | |||
2113 | return 0; | ||
2078 | 2114 | ||
2079 | done_restock: | 2115 | done_restock: |
2080 | css_get_many(&memcg->css, batch); | 2116 | css_get_many(&memcg->css, batch); |
2081 | if (batch > nr_pages) | 2117 | if (batch > nr_pages) |
2082 | refill_stock(memcg, batch - nr_pages); | 2118 | refill_stock(memcg, batch - nr_pages); |
2083 | if (!(gfp_mask & __GFP_WAIT)) | 2119 | |
2084 | goto done; | ||
2085 | /* | 2120 | /* |
2086 | * If the hierarchy is above the normal consumption range, | 2121 | * If the hierarchy is above the normal consumption range, schedule |
2087 | * make the charging task trim their excess contribution. | 2122 | * reclaim on returning to userland. We can perform reclaim here |
2123 | * if __GFP_WAIT but let's always punt for simplicity and so that | ||
2124 | * GFP_KERNEL can consistently be used during reclaim. @memcg is | ||
2125 | * not recorded as it most likely matches current's and won't | ||
2126 | * change in the meantime. As high limit is checked again before | ||
2127 | * reclaim, the cost of mismatch is negligible. | ||
2088 | */ | 2128 | */ |
2089 | do { | 2129 | do { |
2090 | if (page_counter_read(&memcg->memory) <= memcg->high) | 2130 | if (page_counter_read(&memcg->memory) > memcg->high) { |
2091 | continue; | 2131 | current->memcg_nr_pages_over_high += nr_pages; |
2092 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | 2132 | set_notify_resume(current); |
2093 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | 2133 | break; |
2134 | } | ||
2094 | } while ((memcg = parent_mem_cgroup(memcg))); | 2135 | } while ((memcg = parent_mem_cgroup(memcg))); |
2095 | done: | 2136 | |
2096 | return ret; | 2137 | return 0; |
2097 | } | 2138 | } |
2098 | 2139 | ||
2099 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | 2140 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) |
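Illustration (not part of the patch): the try_charge() rework above stops reclaiming over-high usage synchronously. Together with mem_cgroup_handle_over_high() added earlier in this file, the excess is recorded in current->memcg_nr_pages_over_high, a resume notification is set, and reclaim runs on the way back to userland, walking up the hierarchy. Below is a toy userspace model of that bookkeeping; the task and memcg structures, and the "reclaim" that simply subtracts pages, are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* Toy memcg with a parent pointer and a high watermark. */
struct memcg {
	struct memcg *parent;
	unsigned long usage;
	unsigned long high;
};

/* Toy task state mirroring the fields the patch uses. */
struct task {
	unsigned int memcg_nr_pages_over_high;
	bool notify_resume;
};

/* Charge path: never reclaim here, just remember the excess and ask
 * for a callback on return to userland. */
static void charge(struct task *t, struct memcg *m, unsigned int nr_pages)
{
	struct memcg *pos;

	for (pos = m; pos; pos = pos->parent)
		pos->usage += nr_pages;

	for (pos = m; pos; pos = pos->parent) {
		if (pos->usage > pos->high) {
			t->memcg_nr_pages_over_high += nr_pages;
			t->notify_resume = true;
			break;
		}
	}
}

/* Return-to-userland path: reclaim the recorded excess from every
 * ancestor that is above its high limit. */
static void handle_over_high(struct task *t, struct memcg *m)
{
	unsigned int nr_pages = t->memcg_nr_pages_over_high;
	struct memcg *pos;

	if (!nr_pages)
		return;
	for (pos = m; pos; pos = pos->parent) {
		if (pos->usage <= pos->high)
			continue;
		pos->usage -= nr_pages;	/* stands in for reclaim */
	}
	t->memcg_nr_pages_over_high = 0;
	t->notify_resume = false;
}

int main(void)
{
	struct memcg root = { .high = 100 };
	struct memcg child = { .parent = &root, .high = 4 };
	struct task t = { 0 };

	charge(&t, &child, 8);
	printf("deferred=%u child.usage=%lu\n",
	       t.memcg_nr_pages_over_high, child.usage);
	handle_over_high(&t, &child);
	printf("after resume: child.usage=%lu\n", child.usage);
	return 0;
}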
@@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2174 | } | 2215 | } |
2175 | 2216 | ||
2176 | #ifdef CONFIG_MEMCG_KMEM | 2217 | #ifdef CONFIG_MEMCG_KMEM |
2177 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
2178 | unsigned long nr_pages) | ||
2179 | { | ||
2180 | struct page_counter *counter; | ||
2181 | int ret = 0; | ||
2182 | |||
2183 | ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); | ||
2184 | if (ret < 0) | ||
2185 | return ret; | ||
2186 | |||
2187 | ret = try_charge(memcg, gfp, nr_pages); | ||
2188 | if (ret == -EINTR) { | ||
2189 | /* | ||
2190 | * try_charge() chose to bypass to root due to OOM kill or | ||
2191 | * fatal signal. Since our only options are to either fail | ||
2192 | * the allocation or charge it to this cgroup, do it as a | ||
2193 | * temporary condition. But we can't fail. From a kmem/slab | ||
2194 | * perspective, the cache has already been selected, by | ||
2195 | * mem_cgroup_kmem_get_cache(), so it is too late to change | ||
2196 | * our minds. | ||
2197 | * | ||
2198 | * This condition will only trigger if the task entered | ||
2199 | * memcg_charge_kmem in a sane state, but was OOM-killed | ||
2200 | * during try_charge() above. Tasks that were already dying | ||
2201 | * when the allocation triggers should have been already | ||
2202 | * directed to the root cgroup in memcontrol.h | ||
2203 | */ | ||
2204 | page_counter_charge(&memcg->memory, nr_pages); | ||
2205 | if (do_swap_account) | ||
2206 | page_counter_charge(&memcg->memsw, nr_pages); | ||
2207 | css_get_many(&memcg->css, nr_pages); | ||
2208 | ret = 0; | ||
2209 | } else if (ret) | ||
2210 | page_counter_uncharge(&memcg->kmem, nr_pages); | ||
2211 | |||
2212 | return ret; | ||
2213 | } | ||
2214 | |||
2215 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) | ||
2216 | { | ||
2217 | page_counter_uncharge(&memcg->memory, nr_pages); | ||
2218 | if (do_swap_account) | ||
2219 | page_counter_uncharge(&memcg->memsw, nr_pages); | ||
2220 | |||
2221 | page_counter_uncharge(&memcg->kmem, nr_pages); | ||
2222 | |||
2223 | css_put_many(&memcg->css, nr_pages); | ||
2224 | } | ||
2225 | |||
2226 | static int memcg_alloc_cache_id(void) | 2218 | static int memcg_alloc_cache_id(void) |
2227 | { | 2219 | { |
2228 | int id, size; | 2220 | int id, size; |
@@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) | |||
2384 | css_put(&cachep->memcg_params.memcg->css); | 2376 | css_put(&cachep->memcg_params.memcg->css); |
2385 | } | 2377 | } |
2386 | 2378 | ||
2387 | /* | 2379 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, |
2388 | * We need to verify if the allocation against current->mm->owner's memcg is | 2380 | struct mem_cgroup *memcg) |
2389 | * possible for the given order. But the page is not allocated yet, so we'll | ||
2390 | * need a further commit step to do the final arrangements. | ||
2391 | * | ||
2392 | * It is possible for the task to switch cgroups in this mean time, so at | ||
2393 | * commit time, we can't rely on task conversion any longer. We'll then use | ||
2394 | * the handle argument to return to the caller which cgroup we should commit | ||
2395 | * against. We could also return the memcg directly and avoid the pointer | ||
2396 | * passing, but a boolean return value gives better semantics considering | ||
2397 | * the compiled-out case as well. | ||
2398 | * | ||
2399 | * Returning true means the allocation is possible. | ||
2400 | */ | ||
2401 | bool | ||
2402 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | ||
2403 | { | 2381 | { |
2404 | struct mem_cgroup *memcg; | 2382 | unsigned int nr_pages = 1 << order; |
2383 | struct page_counter *counter; | ||
2405 | int ret; | 2384 | int ret; |
2406 | 2385 | ||
2407 | *_memcg = NULL; | 2386 | if (!memcg_kmem_is_active(memcg)) |
2387 | return 0; | ||
2408 | 2388 | ||
2409 | memcg = get_mem_cgroup_from_mm(current->mm); | 2389 | if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) |
2390 | return -ENOMEM; | ||
2410 | 2391 | ||
2411 | if (!memcg_kmem_is_active(memcg)) { | 2392 | ret = try_charge(memcg, gfp, nr_pages); |
2412 | css_put(&memcg->css); | 2393 | if (ret) { |
2413 | return true; | 2394 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2395 | return ret; | ||
2414 | } | 2396 | } |
2415 | 2397 | ||
2416 | ret = memcg_charge_kmem(memcg, gfp, 1 << order); | 2398 | page->mem_cgroup = memcg; |
2417 | if (!ret) | ||
2418 | *_memcg = memcg; | ||
2419 | 2399 | ||
2420 | css_put(&memcg->css); | 2400 | return 0; |
2421 | return (ret == 0); | ||
2422 | } | 2401 | } |
2423 | 2402 | ||
2424 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | 2403 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) |
2425 | int order) | ||
2426 | { | 2404 | { |
2427 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 2405 | struct mem_cgroup *memcg; |
2406 | int ret; | ||
2428 | 2407 | ||
2429 | /* The page allocation failed. Revert */ | 2408 | memcg = get_mem_cgroup_from_mm(current->mm); |
2430 | if (!page) { | 2409 | ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); |
2431 | memcg_uncharge_kmem(memcg, 1 << order); | 2410 | css_put(&memcg->css); |
2432 | return; | 2411 | return ret; |
2433 | } | ||
2434 | page->mem_cgroup = memcg; | ||
2435 | } | 2412 | } |
2436 | 2413 | ||
2437 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | 2414 | void __memcg_kmem_uncharge(struct page *page, int order) |
2438 | { | 2415 | { |
2439 | struct mem_cgroup *memcg = page->mem_cgroup; | 2416 | struct mem_cgroup *memcg = page->mem_cgroup; |
2417 | unsigned int nr_pages = 1 << order; | ||
2440 | 2418 | ||
2441 | if (!memcg) | 2419 | if (!memcg) |
2442 | return; | 2420 | return; |
2443 | 2421 | ||
2444 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); | 2422 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
2445 | 2423 | ||
2446 | memcg_uncharge_kmem(memcg, 1 << order); | 2424 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2447 | page->mem_cgroup = NULL; | 2425 | page_counter_uncharge(&memcg->memory, nr_pages); |
2448 | } | 2426 | if (do_swap_account) |
2449 | 2427 | page_counter_uncharge(&memcg->memsw, nr_pages); | |
2450 | struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) | ||
2451 | { | ||
2452 | struct mem_cgroup *memcg = NULL; | ||
2453 | struct kmem_cache *cachep; | ||
2454 | struct page *page; | ||
2455 | |||
2456 | page = virt_to_head_page(ptr); | ||
2457 | if (PageSlab(page)) { | ||
2458 | cachep = page->slab_cache; | ||
2459 | if (!is_root_cache(cachep)) | ||
2460 | memcg = cachep->memcg_params.memcg; | ||
2461 | } else | ||
2462 | /* page allocated by alloc_kmem_pages */ | ||
2463 | memcg = page->mem_cgroup; | ||
2464 | 2428 | ||
2465 | return memcg; | 2429 | page->mem_cgroup = NULL; |
2430 | css_put_many(&memcg->css, nr_pages); | ||
2466 | } | 2431 | } |
2467 | #endif /* CONFIG_MEMCG_KMEM */ | 2432 | #endif /* CONFIG_MEMCG_KMEM */ |
2468 | 2433 | ||
@@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, | |||
2836 | return val; | 2801 | return val; |
2837 | } | 2802 | } |
2838 | 2803 | ||
2839 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | 2804 | static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
2840 | { | 2805 | { |
2841 | u64 val; | 2806 | unsigned long val; |
2842 | 2807 | ||
2843 | if (mem_cgroup_is_root(memcg)) { | 2808 | if (mem_cgroup_is_root(memcg)) { |
2844 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); | 2809 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); |
@@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
2851 | else | 2816 | else |
2852 | val = page_counter_read(&memcg->memsw); | 2817 | val = page_counter_read(&memcg->memsw); |
2853 | } | 2818 | } |
2854 | return val << PAGE_SHIFT; | 2819 | return val; |
2855 | } | 2820 | } |
2856 | 2821 | ||
2857 | enum { | 2822 | enum { |
@@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | |||
2885 | switch (MEMFILE_ATTR(cft->private)) { | 2850 | switch (MEMFILE_ATTR(cft->private)) { |
2886 | case RES_USAGE: | 2851 | case RES_USAGE: |
2887 | if (counter == &memcg->memory) | 2852 | if (counter == &memcg->memory) |
2888 | return mem_cgroup_usage(memcg, false); | 2853 | return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; |
2889 | if (counter == &memcg->memsw) | 2854 | if (counter == &memcg->memsw) |
2890 | return mem_cgroup_usage(memcg, true); | 2855 | return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; |
2891 | return (u64)page_counter_read(counter) * PAGE_SIZE; | 2856 | return (u64)page_counter_read(counter) * PAGE_SIZE; |
2892 | case RES_LIMIT: | 2857 | case RES_LIMIT: |
2893 | return (u64)counter->limit * PAGE_SIZE; | 2858 | return (u64)counter->limit * PAGE_SIZE; |
@@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
3387 | ret = page_counter_memparse(args, "-1", &threshold); | 3352 | ret = page_counter_memparse(args, "-1", &threshold); |
3388 | if (ret) | 3353 | if (ret) |
3389 | return ret; | 3354 | return ret; |
3390 | threshold <<= PAGE_SHIFT; | ||
3391 | 3355 | ||
3392 | mutex_lock(&memcg->thresholds_lock); | 3356 | mutex_lock(&memcg->thresholds_lock); |
3393 | 3357 | ||
@@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
4406 | mc.precharge += count; | 4370 | mc.precharge += count; |
4407 | return ret; | 4371 | return ret; |
4408 | } | 4372 | } |
4409 | if (ret == -EINTR) { | ||
4410 | cancel_charge(root_mem_cgroup, count); | ||
4411 | return ret; | ||
4412 | } | ||
4413 | 4373 | ||
4414 | /* Try charges one by one with reclaim */ | 4374 | /* Try charges one by one with reclaim */ |
4415 | while (count--) { | 4375 | while (count--) { |
4416 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); | 4376 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); |
4417 | /* | ||
4418 | * In case of failure, any residual charges against | ||
4419 | * mc.to will be dropped by mem_cgroup_clear_mc() | ||
4420 | * later on. However, cancel any charges that are | ||
4421 | * bypassed to root right away or they'll be lost. | ||
4422 | */ | ||
4423 | if (ret == -EINTR) | ||
4424 | cancel_charge(root_mem_cgroup, 1); | ||
4425 | if (ret) | 4377 | if (ret) |
4426 | return ret; | 4378 | return ret; |
4427 | mc.precharge++; | 4379 | mc.precharge++; |
@@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page, | |||
4576 | goto out; | 4528 | goto out; |
4577 | 4529 | ||
4578 | /* | 4530 | /* |
4579 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup | 4531 | * Prevent mem_cgroup_replace_page() from looking at |
4580 | * of its source page while we change it: page migration takes | 4532 | * page->mem_cgroup of its source page while we change it. |
4581 | * both pages off the LRU, but page cache replacement doesn't. | ||
4582 | */ | 4533 | */ |
4583 | if (!trylock_page(page)) | 4534 | if (!trylock_page(page)) |
4584 | goto out; | 4535 | goto out; |
@@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
5085 | static u64 memory_current_read(struct cgroup_subsys_state *css, | 5036 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
5086 | struct cftype *cft) | 5037 | struct cftype *cft) |
5087 | { | 5038 | { |
5088 | return mem_cgroup_usage(mem_cgroup_from_css(css), false); | 5039 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5040 | |||
5041 | return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; | ||
5089 | } | 5042 | } |
5090 | 5043 | ||
5091 | static int memory_low_show(struct seq_file *m, void *v) | 5044 | static int memory_low_show(struct seq_file *m, void *v) |
@@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v) | |||
5197 | static struct cftype memory_files[] = { | 5150 | static struct cftype memory_files[] = { |
5198 | { | 5151 | { |
5199 | .name = "current", | 5152 | .name = "current", |
5153 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5200 | .read_u64 = memory_current_read, | 5154 | .read_u64 = memory_current_read, |
5201 | }, | 5155 | }, |
5202 | { | 5156 | { |
@@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
5340 | ret = try_charge(memcg, gfp_mask, nr_pages); | 5294 | ret = try_charge(memcg, gfp_mask, nr_pages); |
5341 | 5295 | ||
5342 | css_put(&memcg->css); | 5296 | css_put(&memcg->css); |
5343 | |||
5344 | if (ret == -EINTR) { | ||
5345 | memcg = root_mem_cgroup; | ||
5346 | ret = 0; | ||
5347 | } | ||
5348 | out: | 5297 | out: |
5349 | *memcgp = memcg; | 5298 | *memcgp = memcg; |
5350 | return ret; | 5299 | return ret; |
@@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
5559 | } | 5508 | } |
5560 | 5509 | ||
5561 | /** | 5510 | /** |
5562 | * mem_cgroup_migrate - migrate a charge to another page | 5511 | * mem_cgroup_replace_page - migrate a charge to another page |
5563 | * @oldpage: currently charged page | 5512 | * @oldpage: currently charged page |
5564 | * @newpage: page to transfer the charge to | 5513 | * @newpage: page to transfer the charge to |
5565 | * @lrucare: either or both pages might be on the LRU already | 5514 | * @lrucare: either or both pages might be on the LRU already |
@@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
5568 | * | 5517 | * |
5569 | * Both pages must be locked, @newpage->mapping must be set up. | 5518 | * Both pages must be locked, @newpage->mapping must be set up. |
5570 | */ | 5519 | */ |
5571 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | 5520 | void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) |
5572 | bool lrucare) | ||
5573 | { | 5521 | { |
5574 | struct mem_cgroup *memcg; | 5522 | struct mem_cgroup *memcg; |
5575 | int isolated; | 5523 | int isolated; |
5576 | 5524 | ||
5577 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | 5525 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
5578 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | 5526 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
5579 | VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); | ||
5580 | VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); | ||
5581 | VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); | 5527 | VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); |
5582 | VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), | 5528 | VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), |
5583 | newpage); | 5529 | newpage); |
@@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
5589 | if (newpage->mem_cgroup) | 5535 | if (newpage->mem_cgroup) |
5590 | return; | 5536 | return; |
5591 | 5537 | ||
5592 | /* | 5538 | /* Swapcache readahead pages can get replaced before being charged */ |
5593 | * Swapcache readahead pages can get migrated before being | ||
5594 | * charged, and migration from compaction can happen to an | ||
5595 | * uncharged page when the PFN walker finds a page that | ||
5596 | * reclaim just put back on the LRU but has not released yet. | ||
5597 | */ | ||
5598 | memcg = oldpage->mem_cgroup; | 5539 | memcg = oldpage->mem_cgroup; |
5599 | if (!memcg) | 5540 | if (!memcg) |
5600 | return; | 5541 | return; |
5601 | 5542 | ||
5602 | if (lrucare) | 5543 | lock_page_lru(oldpage, &isolated); |
5603 | lock_page_lru(oldpage, &isolated); | ||
5604 | |||
5605 | oldpage->mem_cgroup = NULL; | 5544 | oldpage->mem_cgroup = NULL; |
5545 | unlock_page_lru(oldpage, isolated); | ||
5606 | 5546 | ||
5607 | if (lrucare) | 5547 | commit_charge(newpage, memcg, true); |
5608 | unlock_page_lru(oldpage, isolated); | ||
5609 | |||
5610 | commit_charge(newpage, memcg, lrucare); | ||
5611 | } | 5548 | } |
5612 | 5549 | ||
5613 | /* | 5550 | /* |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 95882692e747..16a0ec385320 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/memory_hotplug.h> | 56 | #include <linux/memory_hotplug.h> |
57 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
58 | #include <linux/kfifo.h> | 58 | #include <linux/kfifo.h> |
59 | #include <linux/ratelimit.h> | ||
59 | #include "internal.h" | 60 | #include "internal.h" |
60 | #include "ras/ras_event.h" | 61 | #include "ras/ras_event.h" |
61 | 62 | ||
@@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void) | |||
1403 | } | 1404 | } |
1404 | core_initcall(memory_failure_init); | 1405 | core_initcall(memory_failure_init); |
1405 | 1406 | ||
1407 | #define unpoison_pr_info(fmt, pfn, rs) \ | ||
1408 | ({ \ | ||
1409 | if (__ratelimit(rs)) \ | ||
1410 | pr_info(fmt, pfn); \ | ||
1411 | }) | ||
1412 | |||
1406 | /** | 1413 | /** |
1407 | * unpoison_memory - Unpoison a previously poisoned page | 1414 | * unpoison_memory - Unpoison a previously poisoned page |
1408 | * @pfn: Page number of the to be unpoisoned page | 1415 | * @pfn: Page number of the to be unpoisoned page |
@@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn) | |||
1421 | struct page *p; | 1428 | struct page *p; |
1422 | int freeit = 0; | 1429 | int freeit = 0; |
1423 | unsigned int nr_pages; | 1430 | unsigned int nr_pages; |
1431 | static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
1432 | DEFAULT_RATELIMIT_BURST); | ||
1424 | 1433 | ||
1425 | if (!pfn_valid(pfn)) | 1434 | if (!pfn_valid(pfn)) |
1426 | return -ENXIO; | 1435 | return -ENXIO; |
@@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn) | |||
1429 | page = compound_head(p); | 1438 | page = compound_head(p); |
1430 | 1439 | ||
1431 | if (!PageHWPoison(p)) { | 1440 | if (!PageHWPoison(p)) { |
1432 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); | 1441 | unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", |
1442 | pfn, &unpoison_rs); | ||
1433 | return 0; | 1443 | return 0; |
1434 | } | 1444 | } |
1435 | 1445 | ||
1436 | if (page_count(page) > 1) { | 1446 | if (page_count(page) > 1) { |
1437 | pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); | 1447 | unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", |
1448 | pfn, &unpoison_rs); | ||
1438 | return 0; | 1449 | return 0; |
1439 | } | 1450 | } |
1440 | 1451 | ||
1441 | if (page_mapped(page)) { | 1452 | if (page_mapped(page)) { |
1442 | pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); | 1453 | unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", |
1454 | pfn, &unpoison_rs); | ||
1443 | return 0; | 1455 | return 0; |
1444 | } | 1456 | } |
1445 | 1457 | ||
1446 | if (page_mapping(page)) { | 1458 | if (page_mapping(page)) { |
1447 | pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", | 1459 | unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", |
1448 | pfn); | 1460 | pfn, &unpoison_rs); |
1449 | return 0; | 1461 | return 0; |
1450 | } | 1462 | } |
1451 | 1463 | ||
@@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn) | |||
1455 | * In such case, we yield to memory_failure() and make unpoison fail. | 1467 | * In such case, we yield to memory_failure() and make unpoison fail. |
1456 | */ | 1468 | */ |
1457 | if (!PageHuge(page) && PageTransHuge(page)) { | 1469 | if (!PageHuge(page) && PageTransHuge(page)) { |
1458 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | 1470 | unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", |
1471 | pfn, &unpoison_rs); | ||
1459 | return 0; | 1472 | return 0; |
1460 | } | 1473 | } |
1461 | 1474 | ||
@@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn) | |||
1469 | * to the end. | 1482 | * to the end. |
1470 | */ | 1483 | */ |
1471 | if (PageHuge(page)) { | 1484 | if (PageHuge(page)) { |
1472 | pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | 1485 | unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", |
1486 | pfn, &unpoison_rs); | ||
1473 | return 0; | 1487 | return 0; |
1474 | } | 1488 | } |
1475 | if (TestClearPageHWPoison(p)) | 1489 | if (TestClearPageHWPoison(p)) |
1476 | num_poisoned_pages_dec(); | 1490 | num_poisoned_pages_dec(); |
1477 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1491 | unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", |
1492 | pfn, &unpoison_rs); | ||
1478 | return 0; | 1493 | return 0; |
1479 | } | 1494 | } |
1480 | 1495 | ||
@@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn) | |||
1486 | * the free buddy page pool. | 1501 | * the free buddy page pool. |
1487 | */ | 1502 | */ |
1488 | if (TestClearPageHWPoison(page)) { | 1503 | if (TestClearPageHWPoison(page)) { |
1489 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1504 | unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", |
1505 | pfn, &unpoison_rs); | ||
1490 | num_poisoned_pages_sub(nr_pages); | 1506 | num_poisoned_pages_sub(nr_pages); |
1491 | freeit = 1; | 1507 | freeit = 1; |
1492 | if (PageHuge(page)) | 1508 | if (PageHuge(page)) |
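unpoison_pr_info() above is the usual ratelimit idiom: a static ratelimit state shared by the call sites, with __ratelimit() gating each pr_info(). In isolation the pattern looks like the sketch below; the function name and message are made up for illustration.

#include <linux/printk.h>
#include <linux/ratelimit.h>

static void example_report(unsigned long pfn)
{
        /* One shared state: at most DEFAULT_RATELIMIT_BURST messages per interval. */
        static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        if (__ratelimit(&example_rs))
                pr_info("example: event on pfn %#lx\n", pfn);
}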
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0780d118d26e..67d488ab495e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone, | |||
339 | unsigned long start_pfn, unsigned long num_pages) | 339 | unsigned long start_pfn, unsigned long num_pages) |
340 | { | 340 | { |
341 | if (!zone_is_initialized(zone)) | 341 | if (!zone_is_initialized(zone)) |
342 | return init_currently_empty_zone(zone, start_pfn, num_pages, | 342 | return init_currently_empty_zone(zone, start_pfn, num_pages); |
343 | MEMMAP_HOTPLUG); | 343 | |
344 | return 0; | 344 | return 0; |
345 | } | 345 | } |
346 | 346 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index 842ecd7aaf7f..2834faba719a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Memory Migration functionality - linux/mm/migration.c | 2 | * Memory Migration functionality - linux/mm/migrate.c |
3 | * | 3 | * |
4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter | 4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter |
5 | * | 5 | * |
@@ -30,7 +30,7 @@ | |||
30 | #include <linux/mempolicy.h> | 30 | #include <linux/mempolicy.h> |
31 | #include <linux/vmalloc.h> | 31 | #include <linux/vmalloc.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/memcontrol.h> | 33 | #include <linux/backing-dev.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | 36 | #include <linux/hugetlb_cgroup.h> |
@@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
171 | else | 171 | else |
172 | page_add_file_rmap(new); | 172 | page_add_file_rmap(new); |
173 | 173 | ||
174 | if (vma->vm_flags & VM_LOCKED) | ||
175 | mlock_vma_page(new); | ||
176 | |||
174 | /* No need to invalidate - it was non-present before */ | 177 | /* No need to invalidate - it was non-present before */ |
175 | update_mmu_cache(vma, addr, ptep); | 178 | update_mmu_cache(vma, addr, ptep); |
176 | unlock: | 179 | unlock: |
@@ -311,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
311 | struct buffer_head *head, enum migrate_mode mode, | 314 | struct buffer_head *head, enum migrate_mode mode, |
312 | int extra_count) | 315 | int extra_count) |
313 | { | 316 | { |
317 | struct zone *oldzone, *newzone; | ||
318 | int dirty; | ||
314 | int expected_count = 1 + extra_count; | 319 | int expected_count = 1 + extra_count; |
315 | void **pslot; | 320 | void **pslot; |
316 | 321 | ||
@@ -318,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
318 | /* Anonymous page without mapping */ | 323 | /* Anonymous page without mapping */ |
319 | if (page_count(page) != expected_count) | 324 | if (page_count(page) != expected_count) |
320 | return -EAGAIN; | 325 | return -EAGAIN; |
326 | |||
327 | /* No turning back from here */ | ||
328 | set_page_memcg(newpage, page_memcg(page)); | ||
329 | newpage->index = page->index; | ||
330 | newpage->mapping = page->mapping; | ||
331 | if (PageSwapBacked(page)) | ||
332 | SetPageSwapBacked(newpage); | ||
333 | |||
321 | return MIGRATEPAGE_SUCCESS; | 334 | return MIGRATEPAGE_SUCCESS; |
322 | } | 335 | } |
323 | 336 | ||
337 | oldzone = page_zone(page); | ||
338 | newzone = page_zone(newpage); | ||
339 | |||
324 | spin_lock_irq(&mapping->tree_lock); | 340 | spin_lock_irq(&mapping->tree_lock); |
325 | 341 | ||
326 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 342 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
@@ -353,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
353 | } | 369 | } |
354 | 370 | ||
355 | /* | 371 | /* |
356 | * Now we know that no one else is looking at the page. | 372 | * Now we know that no one else is looking at the page: |
373 | * no turning back from here. | ||
357 | */ | 374 | */ |
375 | set_page_memcg(newpage, page_memcg(page)); | ||
376 | newpage->index = page->index; | ||
377 | newpage->mapping = page->mapping; | ||
378 | if (PageSwapBacked(page)) | ||
379 | SetPageSwapBacked(newpage); | ||
380 | |||
358 | get_page(newpage); /* add cache reference */ | 381 | get_page(newpage); /* add cache reference */ |
359 | if (PageSwapCache(page)) { | 382 | if (PageSwapCache(page)) { |
360 | SetPageSwapCache(newpage); | 383 | SetPageSwapCache(newpage); |
361 | set_page_private(newpage, page_private(page)); | 384 | set_page_private(newpage, page_private(page)); |
362 | } | 385 | } |
363 | 386 | ||
387 | /* Move dirty while page refs frozen and newpage not yet exposed */ | ||
388 | dirty = PageDirty(page); | ||
389 | if (dirty) { | ||
390 | ClearPageDirty(page); | ||
391 | SetPageDirty(newpage); | ||
392 | } | ||
393 | |||
364 | radix_tree_replace_slot(pslot, newpage); | 394 | radix_tree_replace_slot(pslot, newpage); |
365 | 395 | ||
366 | /* | 396 | /* |
@@ -370,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
370 | */ | 400 | */ |
371 | page_unfreeze_refs(page, expected_count - 1); | 401 | page_unfreeze_refs(page, expected_count - 1); |
372 | 402 | ||
403 | spin_unlock(&mapping->tree_lock); | ||
404 | /* Leave irq disabled to prevent preemption while updating stats */ | ||
405 | |||
373 | /* | 406 | /* |
374 | * If moved to a different zone then also account | 407 | * If moved to a different zone then also account |
375 | * the page for that zone. Other VM counters will be | 408 | * the page for that zone. Other VM counters will be |
@@ -380,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
380 | * via NR_FILE_PAGES and NR_ANON_PAGES if they | 413 | * via NR_FILE_PAGES and NR_ANON_PAGES if they |
381 | * are mapped to swap space. | 414 | * are mapped to swap space. |
382 | */ | 415 | */ |
383 | __dec_zone_page_state(page, NR_FILE_PAGES); | 416 | if (newzone != oldzone) { |
384 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 417 | __dec_zone_state(oldzone, NR_FILE_PAGES); |
385 | if (!PageSwapCache(page) && PageSwapBacked(page)) { | 418 | __inc_zone_state(newzone, NR_FILE_PAGES); |
386 | __dec_zone_page_state(page, NR_SHMEM); | 419 | if (PageSwapBacked(page) && !PageSwapCache(page)) { |
387 | __inc_zone_page_state(newpage, NR_SHMEM); | 420 | __dec_zone_state(oldzone, NR_SHMEM); |
421 | __inc_zone_state(newzone, NR_SHMEM); | ||
422 | } | ||
423 | if (dirty && mapping_cap_account_dirty(mapping)) { | ||
424 | __dec_zone_state(oldzone, NR_FILE_DIRTY); | ||
425 | __inc_zone_state(newzone, NR_FILE_DIRTY); | ||
426 | } | ||
388 | } | 427 | } |
389 | spin_unlock_irq(&mapping->tree_lock); | 428 | local_irq_enable(); |
390 | 429 | ||
391 | return MIGRATEPAGE_SUCCESS; | 430 | return MIGRATEPAGE_SUCCESS; |
392 | } | 431 | } |
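The rewritten tail above drops the single spin_unlock_irq(): the mapping lock is released first while interrupts stay disabled, so the zone counters can still be moved with the non-atomic __dec_zone_state()/__inc_zone_state() helpers, and only then does local_irq_enable() run. In isolation the pattern is roughly the following sketch (lock, zones and counter are placeholders for illustration).

static void example_move_zone_stat(struct zone *oldzone, struct zone *newzone,
                                   spinlock_t *lock)
{
        spin_lock_irq(lock);                    /* take lock, irqs off */
        /* ... update the shared structure (radix tree slot, refcounts) ... */
        spin_unlock(lock);                      /* drop lock, keep irqs off */

        if (newzone != oldzone) {
                __dec_zone_state(oldzone, NR_FILE_PAGES);  /* non-atomic, needs irqs off */
                __inc_zone_state(newzone, NR_FILE_PAGES);
        }
        local_irq_enable();                     /* stats done, irqs back on */
}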
@@ -401,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
401 | int expected_count; | 440 | int expected_count; |
402 | void **pslot; | 441 | void **pslot; |
403 | 442 | ||
404 | if (!mapping) { | ||
405 | if (page_count(page) != 1) | ||
406 | return -EAGAIN; | ||
407 | return MIGRATEPAGE_SUCCESS; | ||
408 | } | ||
409 | |||
410 | spin_lock_irq(&mapping->tree_lock); | 443 | spin_lock_irq(&mapping->tree_lock); |
411 | 444 | ||
412 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 445 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
@@ -424,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
424 | return -EAGAIN; | 457 | return -EAGAIN; |
425 | } | 458 | } |
426 | 459 | ||
460 | set_page_memcg(newpage, page_memcg(page)); | ||
461 | newpage->index = page->index; | ||
462 | newpage->mapping = page->mapping; | ||
427 | get_page(newpage); | 463 | get_page(newpage); |
428 | 464 | ||
429 | radix_tree_replace_slot(pslot, newpage); | 465 | radix_tree_replace_slot(pslot, newpage); |
@@ -510,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
510 | if (PageMappedToDisk(page)) | 546 | if (PageMappedToDisk(page)) |
511 | SetPageMappedToDisk(newpage); | 547 | SetPageMappedToDisk(newpage); |
512 | 548 | ||
513 | if (PageDirty(page)) { | 549 | /* Move dirty on pages not done by migrate_page_move_mapping() */ |
514 | clear_page_dirty_for_io(page); | 550 | if (PageDirty(page)) |
515 | /* | 551 | SetPageDirty(newpage); |
516 | * Want to mark the page and the radix tree as dirty, and | ||
517 | * redo the accounting that clear_page_dirty_for_io undid, | ||
518 | * but we can't use set_page_dirty because that function | ||
519 | * is actually a signal that all of the page has become dirty. | ||
520 | * Whereas only part of our page may be dirty. | ||
521 | */ | ||
522 | if (PageSwapBacked(page)) | ||
523 | SetPageDirty(newpage); | ||
524 | else | ||
525 | __set_page_dirty_nobuffers(newpage); | ||
526 | } | ||
527 | 552 | ||
528 | if (page_is_young(page)) | 553 | if (page_is_young(page)) |
529 | set_page_young(newpage); | 554 | set_page_young(newpage); |
@@ -537,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
537 | cpupid = page_cpupid_xchg_last(page, -1); | 562 | cpupid = page_cpupid_xchg_last(page, -1); |
538 | page_cpupid_xchg_last(newpage, cpupid); | 563 | page_cpupid_xchg_last(newpage, cpupid); |
539 | 564 | ||
540 | mlock_migrate_page(newpage, page); | ||
541 | ksm_migrate_page(newpage, page); | 565 | ksm_migrate_page(newpage, page); |
542 | /* | 566 | /* |
543 | * Please do not reorder this without considering how mm/ksm.c's | 567 | * Please do not reorder this without considering how mm/ksm.c's |
@@ -721,33 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
721 | * MIGRATEPAGE_SUCCESS - success | 745 | * MIGRATEPAGE_SUCCESS - success |
722 | */ | 746 | */ |
723 | static int move_to_new_page(struct page *newpage, struct page *page, | 747 | static int move_to_new_page(struct page *newpage, struct page *page, |
724 | int page_was_mapped, enum migrate_mode mode) | 748 | enum migrate_mode mode) |
725 | { | 749 | { |
726 | struct address_space *mapping; | 750 | struct address_space *mapping; |
727 | int rc; | 751 | int rc; |
728 | 752 | ||
729 | /* | 753 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
730 | * Block others from accessing the page when we get around to | 754 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
731 | * establishing additional references. We are the only one | ||
732 | * holding a reference to the new page at this point. | ||
733 | */ | ||
734 | if (!trylock_page(newpage)) | ||
735 | BUG(); | ||
736 | |||
737 | /* Prepare mapping for the new page.*/ | ||
738 | newpage->index = page->index; | ||
739 | newpage->mapping = page->mapping; | ||
740 | if (PageSwapBacked(page)) | ||
741 | SetPageSwapBacked(newpage); | ||
742 | |||
743 | /* | ||
744 | * Indirectly called below, migrate_page_copy() copies PG_dirty and thus | ||
745 | * needs newpage's memcg set to transfer memcg dirty page accounting. | ||
746 | * So perform memcg migration in two steps: | ||
747 | * 1. set newpage->mem_cgroup (here) | ||
748 | * 2. clear page->mem_cgroup (below) | ||
749 | */ | ||
750 | set_page_memcg(newpage, page_memcg(page)); | ||
751 | 755 | ||
752 | mapping = page_mapping(page); | 756 | mapping = page_mapping(page); |
753 | if (!mapping) | 757 | if (!mapping) |
@@ -759,23 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
759 | * space which also has its own migratepage callback. This | 763 | * space which also has its own migratepage callback. This |
760 | * is the most common path for page migration. | 764 | * is the most common path for page migration. |
761 | */ | 765 | */ |
762 | rc = mapping->a_ops->migratepage(mapping, | 766 | rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); |
763 | newpage, page, mode); | ||
764 | else | 767 | else |
765 | rc = fallback_migrate_page(mapping, newpage, page, mode); | 768 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
766 | 769 | ||
767 | if (rc != MIGRATEPAGE_SUCCESS) { | 770 | /* |
768 | set_page_memcg(newpage, NULL); | 771 | * When successful, old pagecache page->mapping must be cleared before |
769 | newpage->mapping = NULL; | 772 | * page is freed; but stats require that PageAnon be left as PageAnon. |
770 | } else { | 773 | */ |
774 | if (rc == MIGRATEPAGE_SUCCESS) { | ||
771 | set_page_memcg(page, NULL); | 775 | set_page_memcg(page, NULL); |
772 | if (page_was_mapped) | 776 | if (!PageAnon(page)) |
773 | remove_migration_ptes(page, newpage); | 777 | page->mapping = NULL; |
774 | page->mapping = NULL; | ||
775 | } | 778 | } |
776 | |||
777 | unlock_page(newpage); | ||
778 | |||
779 | return rc; | 779 | return rc; |
780 | } | 780 | } |
781 | 781 | ||
@@ -824,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
824 | goto out_unlock; | 824 | goto out_unlock; |
825 | wait_on_page_writeback(page); | 825 | wait_on_page_writeback(page); |
826 | } | 826 | } |
827 | |||
827 | /* | 828 | /* |
828 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 829 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
829 | * we cannot notice that anon_vma is freed while we migrates a page. | 830 | * we cannot notice that anon_vma is freed while we migrates a page. |
@@ -831,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
831 | * of migration. File cache pages are no problem because of page_lock() | 832 | * of migration. File cache pages are no problem because of page_lock() |
832 | * File Caches may use write_page() or lock_page() in migration, then, | 833 | * File Caches may use write_page() or lock_page() in migration, then, |
833 | * just care Anon page here. | 834 | * just care Anon page here. |
835 | * | ||
836 | * Only page_get_anon_vma() understands the subtleties of | ||
837 | * getting a hold on an anon_vma from outside one of its mms. | ||
838 | * But if we cannot get anon_vma, then we won't need it anyway, | ||
839 | * because that implies that the anon page is no longer mapped | ||
840 | * (and cannot be remapped so long as we hold the page lock). | ||
834 | */ | 841 | */ |
835 | if (PageAnon(page) && !PageKsm(page)) { | 842 | if (PageAnon(page) && !PageKsm(page)) |
836 | /* | ||
837 | * Only page_lock_anon_vma_read() understands the subtleties of | ||
838 | * getting a hold on an anon_vma from outside one of its mms. | ||
839 | */ | ||
840 | anon_vma = page_get_anon_vma(page); | 843 | anon_vma = page_get_anon_vma(page); |
841 | if (anon_vma) { | 844 | |
842 | /* | 845 | /* |
843 | * Anon page | 846 | * Block others from accessing the new page when we get around to |
844 | */ | 847 | * establishing additional references. We are usually the only one |
845 | } else if (PageSwapCache(page)) { | 848 | * holding a reference to newpage at this point. We used to have a BUG |
846 | /* | 849 | * here if trylock_page(newpage) fails, but would like to allow for |
847 | * We cannot be sure that the anon_vma of an unmapped | 850 | * cases where there might be a race with the previous use of newpage. |
848 | * swapcache page is safe to use because we don't | 851 | * This is much like races on refcount of oldpage: just don't BUG(). |
849 | * know in advance if the VMA that this page belonged | 852 | */ |
850 | * to still exists. If the VMA and others sharing the | 853 | if (unlikely(!trylock_page(newpage))) |
851 | * data have been freed, then the anon_vma could | 854 | goto out_unlock; |
852 | * already be invalid. | ||
853 | * | ||
854 | * To avoid this possibility, swapcache pages get | ||
855 | * migrated but are not remapped when migration | ||
856 | * completes | ||
857 | */ | ||
858 | } else { | ||
859 | goto out_unlock; | ||
860 | } | ||
861 | } | ||
862 | 855 | ||
863 | if (unlikely(isolated_balloon_page(page))) { | 856 | if (unlikely(isolated_balloon_page(page))) { |
864 | /* | 857 | /* |
@@ -869,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
869 | * the page migration right away (protected by page lock). | 862 | * the page migration right away (protected by page lock). |
870 | */ | 863 | */ |
871 | rc = balloon_page_migrate(newpage, page, mode); | 864 | rc = balloon_page_migrate(newpage, page, mode); |
872 | goto out_unlock; | 865 | goto out_unlock_both; |
873 | } | 866 | } |
874 | 867 | ||
875 | /* | 868 | /* |
@@ -888,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
888 | VM_BUG_ON_PAGE(PageAnon(page), page); | 881 | VM_BUG_ON_PAGE(PageAnon(page), page); |
889 | if (page_has_private(page)) { | 882 | if (page_has_private(page)) { |
890 | try_to_free_buffers(page); | 883 | try_to_free_buffers(page); |
891 | goto out_unlock; | 884 | goto out_unlock_both; |
892 | } | 885 | } |
893 | goto skip_unmap; | 886 | } else if (page_mapped(page)) { |
894 | } | 887 | /* Establish migration ptes */ |
895 | 888 | VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, | |
896 | /* Establish migration ptes or remove ptes */ | 889 | page); |
897 | if (page_mapped(page)) { | ||
898 | try_to_unmap(page, | 890 | try_to_unmap(page, |
899 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 891 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
900 | page_was_mapped = 1; | 892 | page_was_mapped = 1; |
901 | } | 893 | } |
902 | 894 | ||
903 | skip_unmap: | ||
904 | if (!page_mapped(page)) | 895 | if (!page_mapped(page)) |
905 | rc = move_to_new_page(newpage, page, page_was_mapped, mode); | 896 | rc = move_to_new_page(newpage, page, mode); |
906 | 897 | ||
907 | if (rc && page_was_mapped) | 898 | if (page_was_mapped) |
908 | remove_migration_ptes(page, page); | 899 | remove_migration_ptes(page, |
900 | rc == MIGRATEPAGE_SUCCESS ? newpage : page); | ||
909 | 901 | ||
902 | out_unlock_both: | ||
903 | unlock_page(newpage); | ||
904 | out_unlock: | ||
910 | /* Drop an anon_vma reference if we took one */ | 905 | /* Drop an anon_vma reference if we took one */ |
911 | if (anon_vma) | 906 | if (anon_vma) |
912 | put_anon_vma(anon_vma); | 907 | put_anon_vma(anon_vma); |
913 | |||
914 | out_unlock: | ||
915 | unlock_page(page); | 908 | unlock_page(page); |
916 | out: | 909 | out: |
917 | return rc; | 910 | return rc; |
@@ -937,10 +930,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
937 | int force, enum migrate_mode mode, | 930 | int force, enum migrate_mode mode, |
938 | enum migrate_reason reason) | 931 | enum migrate_reason reason) |
939 | { | 932 | { |
940 | int rc = 0; | 933 | int rc = MIGRATEPAGE_SUCCESS; |
941 | int *result = NULL; | 934 | int *result = NULL; |
942 | struct page *newpage = get_new_page(page, private, &result); | 935 | struct page *newpage; |
943 | 936 | ||
937 | newpage = get_new_page(page, private, &result); | ||
944 | if (!newpage) | 938 | if (!newpage) |
945 | return -ENOMEM; | 939 | return -ENOMEM; |
946 | 940 | ||
@@ -954,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
954 | goto out; | 948 | goto out; |
955 | 949 | ||
956 | rc = __unmap_and_move(page, newpage, force, mode); | 950 | rc = __unmap_and_move(page, newpage, force, mode); |
951 | if (rc == MIGRATEPAGE_SUCCESS) | ||
952 | put_new_page = NULL; | ||
957 | 953 | ||
958 | out: | 954 | out: |
959 | if (rc != -EAGAIN) { | 955 | if (rc != -EAGAIN) { |
@@ -980,10 +976,9 @@ out: | |||
980 | * it. Otherwise, putback_lru_page() will drop the reference grabbed | 976 | * it. Otherwise, putback_lru_page() will drop the reference grabbed |
981 | * during isolation. | 977 | * during isolation. |
982 | */ | 978 | */ |
983 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { | 979 | if (put_new_page) |
984 | ClearPageSwapBacked(newpage); | ||
985 | put_new_page(newpage, private); | 980 | put_new_page(newpage, private); |
986 | } else if (unlikely(__is_movable_balloon_page(newpage))) { | 981 | else if (unlikely(__is_movable_balloon_page(newpage))) { |
987 | /* drop our reference, page already in the balloon */ | 982 | /* drop our reference, page already in the balloon */ |
988 | put_page(newpage); | 983 | put_page(newpage); |
989 | } else | 984 | } else |
@@ -1021,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1021 | struct page *hpage, int force, | 1016 | struct page *hpage, int force, |
1022 | enum migrate_mode mode) | 1017 | enum migrate_mode mode) |
1023 | { | 1018 | { |
1024 | int rc = 0; | 1019 | int rc = -EAGAIN; |
1025 | int *result = NULL; | 1020 | int *result = NULL; |
1026 | int page_was_mapped = 0; | 1021 | int page_was_mapped = 0; |
1027 | struct page *new_hpage; | 1022 | struct page *new_hpage; |
@@ -1043,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1043 | if (!new_hpage) | 1038 | if (!new_hpage) |
1044 | return -ENOMEM; | 1039 | return -ENOMEM; |
1045 | 1040 | ||
1046 | rc = -EAGAIN; | ||
1047 | |||
1048 | if (!trylock_page(hpage)) { | 1041 | if (!trylock_page(hpage)) { |
1049 | if (!force || mode != MIGRATE_SYNC) | 1042 | if (!force || mode != MIGRATE_SYNC) |
1050 | goto out; | 1043 | goto out; |
@@ -1054,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1054 | if (PageAnon(hpage)) | 1047 | if (PageAnon(hpage)) |
1055 | anon_vma = page_get_anon_vma(hpage); | 1048 | anon_vma = page_get_anon_vma(hpage); |
1056 | 1049 | ||
1050 | if (unlikely(!trylock_page(new_hpage))) | ||
1051 | goto put_anon; | ||
1052 | |||
1057 | if (page_mapped(hpage)) { | 1053 | if (page_mapped(hpage)) { |
1058 | try_to_unmap(hpage, | 1054 | try_to_unmap(hpage, |
1059 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 1055 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
@@ -1061,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1061 | } | 1057 | } |
1062 | 1058 | ||
1063 | if (!page_mapped(hpage)) | 1059 | if (!page_mapped(hpage)) |
1064 | rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); | 1060 | rc = move_to_new_page(new_hpage, hpage, mode); |
1061 | |||
1062 | if (page_was_mapped) | ||
1063 | remove_migration_ptes(hpage, | ||
1064 | rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); | ||
1065 | 1065 | ||
1066 | if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) | 1066 | unlock_page(new_hpage); |
1067 | remove_migration_ptes(hpage, hpage); | ||
1068 | 1067 | ||
1068 | put_anon: | ||
1069 | if (anon_vma) | 1069 | if (anon_vma) |
1070 | put_anon_vma(anon_vma); | 1070 | put_anon_vma(anon_vma); |
1071 | 1071 | ||
1072 | if (rc == MIGRATEPAGE_SUCCESS) | 1072 | if (rc == MIGRATEPAGE_SUCCESS) { |
1073 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1073 | hugetlb_cgroup_migrate(hpage, new_hpage); |
1074 | put_new_page = NULL; | ||
1075 | } | ||
1074 | 1076 | ||
1075 | unlock_page(hpage); | 1077 | unlock_page(hpage); |
1076 | out: | 1078 | out: |
@@ -1082,7 +1084,7 @@ out: | |||
1082 | * it. Otherwise, put_page() will drop the reference grabbed during | 1084 | * it. Otherwise, put_page() will drop the reference grabbed during |
1083 | * isolation. | 1085 | * isolation. |
1084 | */ | 1086 | */ |
1085 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) | 1087 | if (put_new_page) |
1086 | put_new_page(new_hpage, private); | 1088 | put_new_page(new_hpage, private); |
1087 | else | 1089 | else |
1088 | putback_active_hugepage(new_hpage); | 1090 | putback_active_hugepage(new_hpage); |
@@ -1112,7 +1114,7 @@ out: | |||
1112 | * | 1114 | * |
1113 | * The function returns after 10 attempts or if no pages are movable any more | 1115 | * The function returns after 10 attempts or if no pages are movable any more |
1114 | * because the list has become empty or no retryable pages exist any more. | 1116 | * because the list has become empty or no retryable pages exist any more. |
1115 | * The caller should call putback_lru_pages() to return pages to the LRU | 1117 | * The caller should call putback_movable_pages() to return pages to the LRU |
1116 | * or free list only if ret != 0. | 1118 | * or free list only if ret != 0. |
1117 | * | 1119 | * |
1118 | * Returns the number of pages that were not migrated, or an error code. | 1120 | * Returns the number of pages that were not migrated, or an error code. |
@@ -1169,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1169 | } | 1171 | } |
1170 | } | 1172 | } |
1171 | } | 1173 | } |
1172 | rc = nr_failed + retry; | 1174 | nr_failed += retry; |
1175 | rc = nr_failed; | ||
1173 | out: | 1176 | out: |
1174 | if (nr_succeeded) | 1177 | if (nr_succeeded) |
1175 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | 1178 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); |
@@ -1786,7 +1789,6 @@ fail_putback: | |||
1786 | SetPageActive(page); | 1789 | SetPageActive(page); |
1787 | if (TestClearPageUnevictable(new_page)) | 1790 | if (TestClearPageUnevictable(new_page)) |
1788 | SetPageUnevictable(page); | 1791 | SetPageUnevictable(page); |
1789 | mlock_migrate_page(page, new_page); | ||
1790 | 1792 | ||
1791 | unlock_page(new_page); | 1793 | unlock_page(new_page); |
1792 | put_page(new_page); /* Free it */ | 1794 | put_page(new_page); /* Free it */ |
@@ -1828,8 +1830,9 @@ fail_putback: | |||
1828 | goto fail_putback; | 1830 | goto fail_putback; |
1829 | } | 1831 | } |
1830 | 1832 | ||
1831 | mem_cgroup_migrate(page, new_page, false); | 1833 | mlock_migrate_page(new_page, page); |
1832 | 1834 | set_page_memcg(new_page, page_memcg(page)); | |
1835 | set_page_memcg(page, NULL); | ||
1833 | page_remove_rmap(page); | 1836 | page_remove_rmap(page); |
1834 | 1837 | ||
1835 | spin_unlock(ptl); | 1838 | spin_unlock(ptl); |
diff --git a/mm/mincore.c b/mm/mincore.c index be25efde64a4..14bb9fb37f0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, | |||
234 | 234 | ||
235 | /* This also avoids any overflows on PAGE_CACHE_ALIGN */ | 235 | /* This also avoids any overflows on PAGE_CACHE_ALIGN */ |
236 | pages = len >> PAGE_SHIFT; | 236 | pages = len >> PAGE_SHIFT; |
237 | pages += (len & ~PAGE_MASK) != 0; | 237 | pages += (offset_in_page(len)) != 0; |
238 | 238 | ||
239 | if (!access_ok(VERIFY_WRITE, vec, pages)) | 239 | if (!access_ok(VERIFY_WRITE, vec, pages)) |
240 | return -EFAULT; | 240 | return -EFAULT; |
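This and several later hunks replace open-coded masking with offset_in_page(). The helper itself is not shown in this diff; it presumably reduces to the same expression, along the lines of the sketch below (treat the exact definition as an assumption, the real one lives in include/linux/mm.h).

/* Assumed shape of the helper: byte offset of an address or length within a page. */
#define offset_in_page(p)       ((unsigned long)(p) & ~PAGE_MASK)

/* Old and new forms of the mincore() check therefore compute the same thing:
 *      pages += (len & ~PAGE_MASK) != 0;
 *      pages += offset_in_page(len) != 0;
 */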
diff --git a/mm/mlock.c b/mm/mlock.c index 25936680064f..339d9e0949b6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, | |||
422 | void munlock_vma_pages_range(struct vm_area_struct *vma, | 422 | void munlock_vma_pages_range(struct vm_area_struct *vma, |
423 | unsigned long start, unsigned long end) | 423 | unsigned long start, unsigned long end) |
424 | { | 424 | { |
425 | vma->vm_flags &= ~VM_LOCKED; | 425 | vma->vm_flags &= VM_LOCKED_CLEAR_MASK; |
426 | 426 | ||
427 | while (start < end) { | 427 | while (start < end) { |
428 | struct page *page = NULL; | 428 | struct page *page = NULL; |
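VM_LOCKED_CLEAR_MASK is used here but defined elsewhere; presumably it is the complement of both lock bits, so a single mask strips VM_LOCKED and VM_LOCKONFAULT together. Roughly (an assumption about the header definition, not shown in this diff):

/* Assumed definition in include/linux/mm.h: */
#define VM_LOCKED_CLEAR_MASK    (~(VM_LOCKED | VM_LOCKONFAULT))

/* so that:  vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 * clears both lock flags while leaving all other VMA flags alone. */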
@@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
506 | 506 | ||
507 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || | 507 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
508 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) | 508 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) |
509 | goto out; /* don't set VM_LOCKED, don't count */ | 509 | /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ |
510 | goto out; | ||
510 | 511 | ||
511 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 512 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
512 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | 513 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, |
@@ -554,13 +555,14 @@ out: | |||
554 | return ret; | 555 | return ret; |
555 | } | 556 | } |
556 | 557 | ||
557 | static int do_mlock(unsigned long start, size_t len, int on) | 558 | static int apply_vma_lock_flags(unsigned long start, size_t len, |
559 | vm_flags_t flags) | ||
558 | { | 560 | { |
559 | unsigned long nstart, end, tmp; | 561 | unsigned long nstart, end, tmp; |
560 | struct vm_area_struct * vma, * prev; | 562 | struct vm_area_struct * vma, * prev; |
561 | int error; | 563 | int error; |
562 | 564 | ||
563 | VM_BUG_ON(start & ~PAGE_MASK); | 565 | VM_BUG_ON(offset_in_page(start)); |
564 | VM_BUG_ON(len != PAGE_ALIGN(len)); | 566 | VM_BUG_ON(len != PAGE_ALIGN(len)); |
565 | end = start + len; | 567 | end = start + len; |
566 | if (end < start) | 568 | if (end < start) |
@@ -576,14 +578,11 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
576 | prev = vma; | 578 | prev = vma; |
577 | 579 | ||
578 | for (nstart = start ; ; ) { | 580 | for (nstart = start ; ; ) { |
579 | vm_flags_t newflags; | 581 | vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; |
580 | 582 | ||
581 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 583 | newflags |= flags; |
582 | |||
583 | newflags = vma->vm_flags & ~VM_LOCKED; | ||
584 | if (on) | ||
585 | newflags |= VM_LOCKED; | ||
586 | 584 | ||
585 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | ||
587 | tmp = vma->vm_end; | 586 | tmp = vma->vm_end; |
588 | if (tmp > end) | 587 | if (tmp > end) |
589 | tmp = end; | 588 | tmp = end; |
@@ -605,7 +604,7 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
605 | return error; | 604 | return error; |
606 | } | 605 | } |
607 | 606 | ||
608 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 607 | static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) |
609 | { | 608 | { |
610 | unsigned long locked; | 609 | unsigned long locked; |
611 | unsigned long lock_limit; | 610 | unsigned long lock_limit; |
@@ -616,7 +615,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
616 | 615 | ||
617 | lru_add_drain_all(); /* flush pagevec */ | 616 | lru_add_drain_all(); /* flush pagevec */ |
618 | 617 | ||
619 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 618 | len = PAGE_ALIGN(len + (offset_in_page(start))); |
620 | start &= PAGE_MASK; | 619 | start &= PAGE_MASK; |
621 | 620 | ||
622 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 621 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
@@ -629,7 +628,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
629 | 628 | ||
630 | /* check against resource limits */ | 629 | /* check against resource limits */ |
631 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 630 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
632 | error = do_mlock(start, len, 1); | 631 | error = apply_vma_lock_flags(start, len, flags); |
633 | 632 | ||
634 | up_write(¤t->mm->mmap_sem); | 633 | up_write(¤t->mm->mmap_sem); |
635 | if (error) | 634 | if (error) |
@@ -641,37 +640,75 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
641 | return 0; | 640 | return 0; |
642 | } | 641 | } |
643 | 642 | ||
643 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | ||
644 | { | ||
645 | return do_mlock(start, len, VM_LOCKED); | ||
646 | } | ||
647 | |||
648 | SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) | ||
649 | { | ||
650 | vm_flags_t vm_flags = VM_LOCKED; | ||
651 | |||
652 | if (flags & ~MLOCK_ONFAULT) | ||
653 | return -EINVAL; | ||
654 | |||
655 | if (flags & MLOCK_ONFAULT) | ||
656 | vm_flags |= VM_LOCKONFAULT; | ||
657 | |||
658 | return do_mlock(start, len, vm_flags); | ||
659 | } | ||
660 | |||
644 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | 661 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) |
645 | { | 662 | { |
646 | int ret; | 663 | int ret; |
647 | 664 | ||
648 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 665 | len = PAGE_ALIGN(len + (offset_in_page(start))); |
649 | start &= PAGE_MASK; | 666 | start &= PAGE_MASK; |
650 | 667 | ||
651 | down_write(¤t->mm->mmap_sem); | 668 | down_write(¤t->mm->mmap_sem); |
652 | ret = do_mlock(start, len, 0); | 669 | ret = apply_vma_lock_flags(start, len, 0); |
653 | up_write(¤t->mm->mmap_sem); | 670 | up_write(¤t->mm->mmap_sem); |
654 | 671 | ||
655 | return ret; | 672 | return ret; |
656 | } | 673 | } |
657 | 674 | ||
658 | static int do_mlockall(int flags) | 675 | /* |
676 | * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) | ||
677 | * and translate into the appropriate modifications to mm->def_flags and/or the | ||
678 | * flags for all current VMAs. | ||
679 | * | ||
680 | * There are a couple of subtleties with this. If mlockall() is called multiple | ||
681 | * times with different flags, the values do not necessarily stack. If mlockall | ||
682 | * is called once including the MCL_FUTURE flag and then a second time without | ||
683 | * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. | ||
684 | */ | ||
685 | static int apply_mlockall_flags(int flags) | ||
659 | { | 686 | { |
660 | struct vm_area_struct * vma, * prev = NULL; | 687 | struct vm_area_struct * vma, * prev = NULL; |
688 | vm_flags_t to_add = 0; | ||
661 | 689 | ||
662 | if (flags & MCL_FUTURE) | 690 | current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; |
691 | if (flags & MCL_FUTURE) { | ||
663 | current->mm->def_flags |= VM_LOCKED; | 692 | current->mm->def_flags |= VM_LOCKED; |
664 | else | 693 | |
665 | current->mm->def_flags &= ~VM_LOCKED; | 694 | if (flags & MCL_ONFAULT) |
666 | if (flags == MCL_FUTURE) | 695 | current->mm->def_flags |= VM_LOCKONFAULT; |
667 | goto out; | 696 | |
697 | if (!(flags & MCL_CURRENT)) | ||
698 | goto out; | ||
699 | } | ||
700 | |||
701 | if (flags & MCL_CURRENT) { | ||
702 | to_add |= VM_LOCKED; | ||
703 | if (flags & MCL_ONFAULT) | ||
704 | to_add |= VM_LOCKONFAULT; | ||
705 | } | ||
668 | 706 | ||
669 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | 707 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { |
670 | vm_flags_t newflags; | 708 | vm_flags_t newflags; |
671 | 709 | ||
672 | newflags = vma->vm_flags & ~VM_LOCKED; | 710 | newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; |
673 | if (flags & MCL_CURRENT) | 711 | newflags |= to_add; |
674 | newflags |= VM_LOCKED; | ||
675 | 712 | ||
676 | /* Ignore errors */ | 713 | /* Ignore errors */ |
677 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 714 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
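The new mlock2() syscall takes the same (start, len) pair as mlock() plus a flags word, where MLOCK_ONFAULT requests lock-on-fault instead of up-front population. A hedged userspace sketch follows; the syscall number and flag value come from the kernel's uapi headers on a kernel that carries this series, and the fallback definition below is an assumption for illustration.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01      /* assumed fallback; the real value comes from the uapi headers */
#endif

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 16 * page;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

#ifdef SYS_mlock2
        /* Lock on fault: the VMA gets VM_LOCKED|VM_LOCKONFAULT, pages pin as touched. */
        if (syscall(SYS_mlock2, buf, len, MLOCK_ONFAULT))
                perror("mlock2");
#else
        /* Headers without mlock2: fall back to eager mlock(). */
        if (mlock(buf, len))
                perror("mlock");
#endif

        memset(buf, 0, len);    /* each write fault now locks that page */
        munmap(buf, len);
        return 0;
}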
@@ -684,14 +721,13 @@ out: | |||
684 | SYSCALL_DEFINE1(mlockall, int, flags) | 721 | SYSCALL_DEFINE1(mlockall, int, flags) |
685 | { | 722 | { |
686 | unsigned long lock_limit; | 723 | unsigned long lock_limit; |
687 | int ret = -EINVAL; | 724 | int ret; |
688 | 725 | ||
689 | if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) | 726 | if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) |
690 | goto out; | 727 | return -EINVAL; |
691 | 728 | ||
692 | ret = -EPERM; | ||
693 | if (!can_do_mlock()) | 729 | if (!can_do_mlock()) |
694 | goto out; | 730 | return -EPERM; |
695 | 731 | ||
696 | if (flags & MCL_CURRENT) | 732 | if (flags & MCL_CURRENT) |
697 | lru_add_drain_all(); /* flush pagevec */ | 733 | lru_add_drain_all(); /* flush pagevec */ |
@@ -704,11 +740,11 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
704 | 740 | ||
705 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 741 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
706 | capable(CAP_IPC_LOCK)) | 742 | capable(CAP_IPC_LOCK)) |
707 | ret = do_mlockall(flags); | 743 | ret = apply_mlockall_flags(flags); |
708 | up_write(¤t->mm->mmap_sem); | 744 | up_write(¤t->mm->mmap_sem); |
709 | if (!ret && (flags & MCL_CURRENT)) | 745 | if (!ret && (flags & MCL_CURRENT)) |
710 | mm_populate(0, TASK_SIZE); | 746 | mm_populate(0, TASK_SIZE); |
711 | out: | 747 | |
712 | return ret; | 748 | return ret; |
713 | } | 749 | } |
714 | 750 | ||
@@ -717,7 +753,7 @@ SYSCALL_DEFINE0(munlockall) | |||
717 | int ret; | 753 | int ret; |
718 | 754 | ||
719 | down_write(¤t->mm->mmap_sem); | 755 | down_write(¤t->mm->mmap_sem); |
720 | ret = do_mlockall(0); | 756 | ret = apply_mlockall_flags(0); |
721 | up_write(¤t->mm->mmap_sem); | 757 | up_write(¤t->mm->mmap_sem); |
722 | return ret; | 758 | return ret; |
723 | } | 759 | } |
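
Illustrative user-space sketch of the flag semantics that apply_mlockall_flags() implements above: with MCL_ONFAULT, pages are locked only as they are faulted in, so a large, mostly untouched allocation is no longer populated up front. This assumes a kernel with this series applied; MCL_ONFAULT may be missing from older libc headers, so the x86 value (4) is supplied as a fallback, and the call needs CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK to succeed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MCL_ONFAULT
#define MCL_ONFAULT 4	/* x86 value; check asm-generic/mman-common.h for your arch */
#endif

int main(void)
{
	char *buf = malloc(64 << 20);	/* 64 MB that will stay mostly untouched */

	if (!buf)
		return 1;

	/*
	 * Lock current and future mappings, but let pages become resident
	 * only as they are faulted in (VM_LOCKONFAULT) instead of populating
	 * the whole 64 MB up front as plain MCL_CURRENT would.
	 * Needs CAP_IPC_LOCK or a large enough RLIMIT_MEMLOCK.
	 */
	if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) != 0) {
		perror("mlockall");
		return 1;
	}

	memset(buf, 0, 4096);	/* only this first page is locked and resident now */
	printf("locked on fault: RSS grows only as pages are touched\n");

	munlockall();
	free(buf);
	return 0;
}
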
@@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, | |||
1302 | * that it represents a valid section of the address space. | 1302 | * that it represents a valid section of the address space. |
1303 | */ | 1303 | */ |
1304 | addr = get_unmapped_area(file, addr, len, pgoff, flags); | 1304 | addr = get_unmapped_area(file, addr, len, pgoff, flags); |
1305 | if (addr & ~PAGE_MASK) | 1305 | if (offset_in_page(addr)) |
1306 | return addr; | 1306 | return addr; |
1307 | 1307 | ||
1308 | /* Do simple checking here so the lower-level routines won't have | 1308 | /* Do simple checking here so the lower-level routines won't have |
@@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1412 | unsigned long, fd, unsigned long, pgoff) | 1412 | unsigned long, fd, unsigned long, pgoff) |
1413 | { | 1413 | { |
1414 | struct file *file = NULL; | 1414 | struct file *file = NULL; |
1415 | unsigned long retval = -EBADF; | 1415 | unsigned long retval; |
1416 | 1416 | ||
1417 | if (!(flags & MAP_ANONYMOUS)) { | 1417 | if (!(flags & MAP_ANONYMOUS)) { |
1418 | audit_mmap_fd(fd, flags); | 1418 | audit_mmap_fd(fd, flags); |
1419 | file = fget(fd); | 1419 | file = fget(fd); |
1420 | if (!file) | 1420 | if (!file) |
1421 | goto out; | 1421 | return -EBADF; |
1422 | if (is_file_hugepages(file)) | 1422 | if (is_file_hugepages(file)) |
1423 | len = ALIGN(len, huge_page_size(hstate_file(file))); | 1423 | len = ALIGN(len, huge_page_size(hstate_file(file))); |
1424 | retval = -EINVAL; | 1424 | retval = -EINVAL; |
@@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1453 | out_fput: | 1453 | out_fput: |
1454 | if (file) | 1454 | if (file) |
1455 | fput(file); | 1455 | fput(file); |
1456 | out: | ||
1457 | return retval; | 1456 | return retval; |
1458 | } | 1457 | } |
1459 | 1458 | ||
@@ -1473,7 +1472,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | |||
1473 | 1472 | ||
1474 | if (copy_from_user(&a, arg, sizeof(a))) | 1473 | if (copy_from_user(&a, arg, sizeof(a))) |
1475 | return -EFAULT; | 1474 | return -EFAULT; |
1476 | if (a.offset & ~PAGE_MASK) | 1475 | if (offset_in_page(a.offset)) |
1477 | return -EINVAL; | 1476 | return -EINVAL; |
1478 | 1477 | ||
1479 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | 1478 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, |
@@ -1562,7 +1561,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1562 | } | 1561 | } |
1563 | 1562 | ||
1564 | /* Clear old maps */ | 1563 | /* Clear old maps */ |
1565 | error = -ENOMEM; | ||
1566 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, | 1564 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
1567 | &rb_parent)) { | 1565 | &rb_parent)) { |
1568 | if (do_munmap(mm, addr, len)) | 1566 | if (do_munmap(mm, addr, len)) |
@@ -1663,7 +1661,7 @@ out: | |||
1663 | vma == get_gate_vma(current->mm))) | 1661 | vma == get_gate_vma(current->mm))) |
1664 | mm->locked_vm += (len >> PAGE_SHIFT); | 1662 | mm->locked_vm += (len >> PAGE_SHIFT); |
1665 | else | 1663 | else |
1666 | vma->vm_flags &= ~VM_LOCKED; | 1664 | vma->vm_flags &= VM_LOCKED_CLEAR_MASK; |
1667 | } | 1665 | } |
1668 | 1666 | ||
1669 | if (file) | 1667 | if (file) |
@@ -1989,7 +1987,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1989 | * can happen with large stack limits and large mmap() | 1987 | * can happen with large stack limits and large mmap() |
1990 | * allocations. | 1988 | * allocations. |
1991 | */ | 1989 | */ |
1992 | if (addr & ~PAGE_MASK) { | 1990 | if (offset_in_page(addr)) { |
1993 | VM_BUG_ON(addr != -ENOMEM); | 1991 | VM_BUG_ON(addr != -ENOMEM); |
1994 | info.flags = 0; | 1992 | info.flags = 0; |
1995 | info.low_limit = TASK_UNMAPPED_BASE; | 1993 | info.low_limit = TASK_UNMAPPED_BASE; |
@@ -2025,7 +2023,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
2025 | 2023 | ||
2026 | if (addr > TASK_SIZE - len) | 2024 | if (addr > TASK_SIZE - len) |
2027 | return -ENOMEM; | 2025 | return -ENOMEM; |
2028 | if (addr & ~PAGE_MASK) | 2026 | if (offset_in_page(addr)) |
2029 | return -EINVAL; | 2027 | return -EINVAL; |
2030 | 2028 | ||
2031 | addr = arch_rebalance_pgtables(addr, len); | 2029 | addr = arch_rebalance_pgtables(addr, len); |
@@ -2047,7 +2045,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
2047 | return vma; | 2045 | return vma; |
2048 | 2046 | ||
2049 | rb_node = mm->mm_rb.rb_node; | 2047 | rb_node = mm->mm_rb.rb_node; |
2050 | vma = NULL; | ||
2051 | 2048 | ||
2052 | while (rb_node) { | 2049 | while (rb_node) { |
2053 | struct vm_area_struct *tmp; | 2050 | struct vm_area_struct *tmp; |
@@ -2139,10 +2136,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
2139 | if (security_vm_enough_memory_mm(mm, grow)) | 2136 | if (security_vm_enough_memory_mm(mm, grow)) |
2140 | return -ENOMEM; | 2137 | return -ENOMEM; |
2141 | 2138 | ||
2142 | /* Ok, everything looks good - let it rip */ | ||
2143 | if (vma->vm_flags & VM_LOCKED) | ||
2144 | mm->locked_vm += grow; | ||
2145 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | ||
2146 | return 0; | 2139 | return 0; |
2147 | } | 2140 | } |
2148 | 2141 | ||
@@ -2153,6 +2146,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
2153 | */ | 2146 | */ |
2154 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | 2147 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) |
2155 | { | 2148 | { |
2149 | struct mm_struct *mm = vma->vm_mm; | ||
2156 | int error; | 2150 | int error; |
2157 | 2151 | ||
2158 | if (!(vma->vm_flags & VM_GROWSUP)) | 2152 | if (!(vma->vm_flags & VM_GROWSUP)) |
@@ -2202,15 +2196,19 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
2202 | * So, we reuse mm->page_table_lock to guard | 2196 | * So, we reuse mm->page_table_lock to guard |
2203 | * against concurrent vma expansions. | 2197 | * against concurrent vma expansions. |
2204 | */ | 2198 | */ |
2205 | spin_lock(&vma->vm_mm->page_table_lock); | 2199 | spin_lock(&mm->page_table_lock); |
2200 | if (vma->vm_flags & VM_LOCKED) | ||
2201 | mm->locked_vm += grow; | ||
2202 | vm_stat_account(mm, vma->vm_flags, | ||
2203 | vma->vm_file, grow); | ||
2206 | anon_vma_interval_tree_pre_update_vma(vma); | 2204 | anon_vma_interval_tree_pre_update_vma(vma); |
2207 | vma->vm_end = address; | 2205 | vma->vm_end = address; |
2208 | anon_vma_interval_tree_post_update_vma(vma); | 2206 | anon_vma_interval_tree_post_update_vma(vma); |
2209 | if (vma->vm_next) | 2207 | if (vma->vm_next) |
2210 | vma_gap_update(vma->vm_next); | 2208 | vma_gap_update(vma->vm_next); |
2211 | else | 2209 | else |
2212 | vma->vm_mm->highest_vm_end = address; | 2210 | mm->highest_vm_end = address; |
2213 | spin_unlock(&vma->vm_mm->page_table_lock); | 2211 | spin_unlock(&mm->page_table_lock); |
2214 | 2212 | ||
2215 | perf_event_mmap(vma); | 2213 | perf_event_mmap(vma); |
2216 | } | 2214 | } |
@@ -2218,7 +2216,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
2218 | } | 2216 | } |
2219 | vma_unlock_anon_vma(vma); | 2217 | vma_unlock_anon_vma(vma); |
2220 | khugepaged_enter_vma_merge(vma, vma->vm_flags); | 2218 | khugepaged_enter_vma_merge(vma, vma->vm_flags); |
2221 | validate_mm(vma->vm_mm); | 2219 | validate_mm(mm); |
2222 | return error; | 2220 | return error; |
2223 | } | 2221 | } |
2224 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 2222 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -2229,6 +2227,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
2229 | int expand_downwards(struct vm_area_struct *vma, | 2227 | int expand_downwards(struct vm_area_struct *vma, |
2230 | unsigned long address) | 2228 | unsigned long address) |
2231 | { | 2229 | { |
2230 | struct mm_struct *mm = vma->vm_mm; | ||
2232 | int error; | 2231 | int error; |
2233 | 2232 | ||
2234 | /* | 2233 | /* |
@@ -2273,13 +2272,17 @@ int expand_downwards(struct vm_area_struct *vma, | |||
2273 | * So, we reuse mm->page_table_lock to guard | 2272 | * So, we reuse mm->page_table_lock to guard |
2274 | * against concurrent vma expansions. | 2273 | * against concurrent vma expansions. |
2275 | */ | 2274 | */ |
2276 | spin_lock(&vma->vm_mm->page_table_lock); | 2275 | spin_lock(&mm->page_table_lock); |
2276 | if (vma->vm_flags & VM_LOCKED) | ||
2277 | mm->locked_vm += grow; | ||
2278 | vm_stat_account(mm, vma->vm_flags, | ||
2279 | vma->vm_file, grow); | ||
2277 | anon_vma_interval_tree_pre_update_vma(vma); | 2280 | anon_vma_interval_tree_pre_update_vma(vma); |
2278 | vma->vm_start = address; | 2281 | vma->vm_start = address; |
2279 | vma->vm_pgoff -= grow; | 2282 | vma->vm_pgoff -= grow; |
2280 | anon_vma_interval_tree_post_update_vma(vma); | 2283 | anon_vma_interval_tree_post_update_vma(vma); |
2281 | vma_gap_update(vma); | 2284 | vma_gap_update(vma); |
2282 | spin_unlock(&vma->vm_mm->page_table_lock); | 2285 | spin_unlock(&mm->page_table_lock); |
2283 | 2286 | ||
2284 | perf_event_mmap(vma); | 2287 | perf_event_mmap(vma); |
2285 | } | 2288 | } |
@@ -2287,7 +2290,7 @@ int expand_downwards(struct vm_area_struct *vma, | |||
2287 | } | 2290 | } |
2288 | vma_unlock_anon_vma(vma); | 2291 | vma_unlock_anon_vma(vma); |
2289 | khugepaged_enter_vma_merge(vma, vma->vm_flags); | 2292 | khugepaged_enter_vma_merge(vma, vma->vm_flags); |
2290 | validate_mm(vma->vm_mm); | 2293 | validate_mm(mm); |
2291 | return error; | 2294 | return error; |
2292 | } | 2295 | } |
2293 | 2296 | ||
@@ -2536,7 +2539,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2536 | unsigned long end; | 2539 | unsigned long end; |
2537 | struct vm_area_struct *vma, *prev, *last; | 2540 | struct vm_area_struct *vma, *prev, *last; |
2538 | 2541 | ||
2539 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) | 2542 | if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) |
2540 | return -EINVAL; | 2543 | return -EINVAL; |
2541 | 2544 | ||
2542 | len = PAGE_ALIGN(len); | 2545 | len = PAGE_ALIGN(len); |
@@ -2734,7 +2737,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2734 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2737 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2735 | 2738 | ||
2736 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | 2739 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
2737 | if (error & ~PAGE_MASK) | 2740 | if (offset_in_page(error)) |
2738 | return error; | 2741 | return error; |
2739 | 2742 | ||
2740 | error = mlock_future_check(mm, mm->def_flags, len); | 2743 | error = mlock_future_check(mm, mm->def_flags, len); |
@@ -3049,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma, | |||
3049 | static struct vm_area_struct *__install_special_mapping( | 3052 | static struct vm_area_struct *__install_special_mapping( |
3050 | struct mm_struct *mm, | 3053 | struct mm_struct *mm, |
3051 | unsigned long addr, unsigned long len, | 3054 | unsigned long addr, unsigned long len, |
3052 | unsigned long vm_flags, const struct vm_operations_struct *ops, | 3055 | unsigned long vm_flags, void *priv, |
3053 | void *priv) | 3056 | const struct vm_operations_struct *ops) |
3054 | { | 3057 | { |
3055 | int ret; | 3058 | int ret; |
3056 | struct vm_area_struct *vma; | 3059 | struct vm_area_struct *vma; |
@@ -3099,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping( | |||
3099 | unsigned long addr, unsigned long len, | 3102 | unsigned long addr, unsigned long len, |
3100 | unsigned long vm_flags, const struct vm_special_mapping *spec) | 3103 | unsigned long vm_flags, const struct vm_special_mapping *spec) |
3101 | { | 3104 | { |
3102 | return __install_special_mapping(mm, addr, len, vm_flags, | 3105 | return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, |
3103 | &special_mapping_vmops, (void *)spec); | 3106 | &special_mapping_vmops); |
3104 | } | 3107 | } |
3105 | 3108 | ||
3106 | int install_special_mapping(struct mm_struct *mm, | 3109 | int install_special_mapping(struct mm_struct *mm, |
@@ -3108,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm, | |||
3108 | unsigned long vm_flags, struct page **pages) | 3111 | unsigned long vm_flags, struct page **pages) |
3109 | { | 3112 | { |
3110 | struct vm_area_struct *vma = __install_special_mapping( | 3113 | struct vm_area_struct *vma = __install_special_mapping( |
3111 | mm, addr, len, vm_flags, &legacy_special_mapping_vmops, | 3114 | mm, addr, len, vm_flags, (void *)pages, |
3112 | (void *)pages); | 3115 | &legacy_special_mapping_vmops); |
3113 | 3116 | ||
3114 | return PTR_ERR_OR_ZERO(vma); | 3117 | return PTR_ERR_OR_ZERO(vma); |
3115 | } | 3118 | } |
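
The repeated addr & ~PAGE_MASK -> offset_in_page(addr) conversions in this file are cosmetic: the helper is the same mask operation wrapped in a macro. The stand-alone sketch below (4 KB page size hard-coded for illustration) shows why the page-offset bits also work as the error check on get_unmapped_area()'s return value: a negative errno cast to unsigned long always has non-zero offset bits.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
	unsigned long mapped = 0x7f0000000000UL;	/* page-aligned address */
	unsigned long failed = (unsigned long)-12;	/* -ENOMEM encoded in the return value */

	printf("aligned address -> offset %lu\n", offset_in_page(mapped));	/* 0 */
	printf("-ENOMEM return  -> offset %lu\n", offset_in_page(failed));	/* non-zero */
	return 0;
}
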
diff --git a/mm/mremap.c b/mm/mremap.c index 5a71cce8c6ea..c25bc6268e46 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | |||
401 | unsigned long charged = 0; | 401 | unsigned long charged = 0; |
402 | unsigned long map_flags; | 402 | unsigned long map_flags; |
403 | 403 | ||
404 | if (new_addr & ~PAGE_MASK) | 404 | if (offset_in_page(new_addr)) |
405 | goto out; | 405 | goto out; |
406 | 406 | ||
407 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | 407 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) |
@@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | |||
435 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + | 435 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + |
436 | ((addr - vma->vm_start) >> PAGE_SHIFT), | 436 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
437 | map_flags); | 437 | map_flags); |
438 | if (ret & ~PAGE_MASK) | 438 | if (offset_in_page(ret)) |
439 | goto out1; | 439 | goto out1; |
440 | 440 | ||
441 | ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); | 441 | ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); |
442 | if (!(ret & ~PAGE_MASK)) | 442 | if (!(offset_in_page(ret))) |
443 | goto out; | 443 | goto out; |
444 | out1: | 444 | out1: |
445 | vm_unacct_memory(charged); | 445 | vm_unacct_memory(charged); |
@@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
484 | if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) | 484 | if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) |
485 | return ret; | 485 | return ret; |
486 | 486 | ||
487 | if (addr & ~PAGE_MASK) | 487 | if (offset_in_page(addr)) |
488 | return ret; | 488 | return ret; |
489 | 489 | ||
490 | old_len = PAGE_ALIGN(old_len); | 490 | old_len = PAGE_ALIGN(old_len); |
@@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
566 | vma->vm_pgoff + | 566 | vma->vm_pgoff + |
567 | ((addr - vma->vm_start) >> PAGE_SHIFT), | 567 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
568 | map_flags); | 568 | map_flags); |
569 | if (new_addr & ~PAGE_MASK) { | 569 | if (offset_in_page(new_addr)) { |
570 | ret = new_addr; | 570 | ret = new_addr; |
571 | goto out; | 571 | goto out; |
572 | } | 572 | } |
@@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
574 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); | 574 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); |
575 | } | 575 | } |
576 | out: | 576 | out: |
577 | if (ret & ~PAGE_MASK) { | 577 | if (offset_in_page(ret)) { |
578 | vm_unacct_memory(charged); | 578 | vm_unacct_memory(charged); |
579 | locked = 0; | 579 | locked = 0; |
580 | } | 580 | } |
diff --git a/mm/msync.c b/mm/msync.c index bb04d53ae852..24e612fefa04 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
38 | 38 | ||
39 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 39 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) |
40 | goto out; | 40 | goto out; |
41 | if (start & ~PAGE_MASK) | 41 | if (offset_in_page(start)) |
42 | goto out; | 42 | goto out; |
43 | if ((flags & MS_ASYNC) && (flags & MS_SYNC)) | 43 | if ((flags & MS_ASYNC) && (flags & MS_SYNC)) |
44 | goto out; | 44 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index ab14a2014dea..92be862c859b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void) | |||
578 | return; | 578 | return; |
579 | 579 | ||
580 | last = rb_entry(lastp, struct vm_region, vm_rb); | 580 | last = rb_entry(lastp, struct vm_region, vm_rb); |
581 | BUG_ON(unlikely(last->vm_end <= last->vm_start)); | 581 | BUG_ON(last->vm_end <= last->vm_start); |
582 | BUG_ON(unlikely(last->vm_top < last->vm_end)); | 582 | BUG_ON(last->vm_top < last->vm_end); |
583 | 583 | ||
584 | while ((p = rb_next(lastp))) { | 584 | while ((p = rb_next(lastp))) { |
585 | region = rb_entry(p, struct vm_region, vm_rb); | 585 | region = rb_entry(p, struct vm_region, vm_rb); |
586 | last = rb_entry(lastp, struct vm_region, vm_rb); | 586 | last = rb_entry(lastp, struct vm_region, vm_rb); |
587 | 587 | ||
588 | BUG_ON(unlikely(region->vm_end <= region->vm_start)); | 588 | BUG_ON(region->vm_end <= region->vm_start); |
589 | BUG_ON(unlikely(region->vm_top < region->vm_end)); | 589 | BUG_ON(region->vm_top < region->vm_end); |
590 | BUG_ON(unlikely(region->vm_start < last->vm_top)); | 590 | BUG_ON(region->vm_start < last->vm_top); |
591 | 591 | ||
592 | lastp = p; | 592 | lastp = p; |
593 | } | 593 | } |
@@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | |||
1497 | 1497 | ||
1498 | if (copy_from_user(&a, arg, sizeof(a))) | 1498 | if (copy_from_user(&a, arg, sizeof(a))) |
1499 | return -EFAULT; | 1499 | return -EFAULT; |
1500 | if (a.offset & ~PAGE_MASK) | 1500 | if (offset_in_page(a.offset)) |
1501 | return -EINVAL; | 1501 | return -EINVAL; |
1502 | 1502 | ||
1503 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | 1503 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, |
@@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1653 | goto erase_whole_vma; | 1653 | goto erase_whole_vma; |
1654 | if (start < vma->vm_start || end > vma->vm_end) | 1654 | if (start < vma->vm_start || end > vma->vm_end) |
1655 | return -EINVAL; | 1655 | return -EINVAL; |
1656 | if (start & ~PAGE_MASK) | 1656 | if (offset_in_page(start)) |
1657 | return -EINVAL; | 1657 | return -EINVAL; |
1658 | if (end != vma->vm_end && end & ~PAGE_MASK) | 1658 | if (end != vma->vm_end && offset_in_page(end)) |
1659 | return -EINVAL; | 1659 | return -EINVAL; |
1660 | if (start != vma->vm_start && end != vma->vm_end) { | 1660 | if (start != vma->vm_start && end != vma->vm_end) { |
1661 | ret = split_vma(mm, vma, start, 1); | 1661 | ret = split_vma(mm, vma, start, 1); |
@@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr, | |||
1736 | if (old_len == 0 || new_len == 0) | 1736 | if (old_len == 0 || new_len == 0) |
1737 | return (unsigned long) -EINVAL; | 1737 | return (unsigned long) -EINVAL; |
1738 | 1738 | ||
1739 | if (addr & ~PAGE_MASK) | 1739 | if (offset_in_page(addr)) |
1740 | return -EINVAL; | 1740 | return -EINVAL; |
1741 | 1741 | ||
1742 | if (flags & MREMAP_FIXED && new_addr != addr) | 1742 | if (flags & MREMAP_FIXED && new_addr != addr) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ecc0bcaecc5..e4778285d8d1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
377 | static void dump_header(struct oom_control *oc, struct task_struct *p, | 377 | static void dump_header(struct oom_control *oc, struct task_struct *p, |
378 | struct mem_cgroup *memcg) | 378 | struct mem_cgroup *memcg) |
379 | { | 379 | { |
380 | task_lock(current); | ||
381 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 380 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
382 | "oom_score_adj=%hd\n", | 381 | "oom_score_adj=%hd\n", |
383 | current->comm, oc->gfp_mask, oc->order, | 382 | current->comm, oc->gfp_mask, oc->order, |
384 | current->signal->oom_score_adj); | 383 | current->signal->oom_score_adj); |
385 | cpuset_print_task_mems_allowed(current); | 384 | cpuset_print_current_mems_allowed(); |
386 | task_unlock(current); | ||
387 | dump_stack(); | 385 | dump_stack(); |
388 | if (memcg) | 386 | if (memcg) |
389 | mem_cgroup_print_oom_info(memcg, p); | 387 | mem_cgroup_print_oom_info(memcg, p); |
@@ -476,6 +474,24 @@ void oom_killer_enable(void) | |||
476 | oom_killer_disabled = false; | 474 | oom_killer_disabled = false; |
477 | } | 475 | } |
478 | 476 | ||
477 | /* | ||
478 | * task->mm can be NULL if the task is the exited group leader. So to | ||
479 | * determine whether the task is using a particular mm, we examine all the | ||
480 | * task's threads: if one of those is using this mm then this task was also | ||
481 | * using it. | ||
482 | */ | ||
483 | static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) | ||
484 | { | ||
485 | struct task_struct *t; | ||
486 | |||
487 | for_each_thread(p, t) { | ||
488 | struct mm_struct *t_mm = READ_ONCE(t->mm); | ||
489 | if (t_mm) | ||
490 | return t_mm == mm; | ||
491 | } | ||
492 | return false; | ||
493 | } | ||
494 | |||
479 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 495 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
480 | /* | 496 | /* |
481 | * Must be called while holding a reference to p, which will be released upon | 497 | * Must be called while holding a reference to p, which will be released upon |
@@ -509,10 +525,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
509 | if (__ratelimit(&oom_rs)) | 525 | if (__ratelimit(&oom_rs)) |
510 | dump_header(oc, p, memcg); | 526 | dump_header(oc, p, memcg); |
511 | 527 | ||
512 | task_lock(p); | ||
513 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", | 528 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", |
514 | message, task_pid_nr(p), p->comm, points); | 529 | message, task_pid_nr(p), p->comm, points); |
515 | task_unlock(p); | ||
516 | 530 | ||
517 | /* | 531 | /* |
518 | * If any of p's children has a different mm and is eligible for kill, | 532 | * If any of p's children has a different mm and is eligible for kill, |
@@ -525,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
525 | list_for_each_entry(child, &t->children, sibling) { | 539 | list_for_each_entry(child, &t->children, sibling) { |
526 | unsigned int child_points; | 540 | unsigned int child_points; |
527 | 541 | ||
528 | if (child->mm == p->mm) | 542 | if (process_shares_mm(child, p->mm)) |
529 | continue; | 543 | continue; |
530 | /* | 544 | /* |
531 | * oom_badness() returns 0 if the thread is unkillable | 545 | * oom_badness() returns 0 if the thread is unkillable |
@@ -552,8 +566,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
552 | victim = p; | 566 | victim = p; |
553 | } | 567 | } |
554 | 568 | ||
555 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 569 | /* Get a reference to safely compare mm after task_unlock(victim) */ |
556 | mm = victim->mm; | 570 | mm = victim->mm; |
571 | atomic_inc(&mm->mm_count); | ||
572 | /* | ||
573 | * We should send SIGKILL before setting TIF_MEMDIE in order to prevent | ||
574 | * the OOM victim from depleting the memory reserves from the user | ||
575 | * space under its control. | ||
576 | */ | ||
577 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | ||
557 | mark_oom_victim(victim); | 578 | mark_oom_victim(victim); |
558 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 579 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
559 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 580 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
@@ -571,21 +592,21 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
571 | * pending fatal signal. | 592 | * pending fatal signal. |
572 | */ | 593 | */ |
573 | rcu_read_lock(); | 594 | rcu_read_lock(); |
574 | for_each_process(p) | 595 | for_each_process(p) { |
575 | if (p->mm == mm && !same_thread_group(p, victim) && | 596 | if (!process_shares_mm(p, mm)) |
576 | !(p->flags & PF_KTHREAD)) { | 597 | continue; |
577 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | 598 | if (same_thread_group(p, victim)) |
578 | continue; | 599 | continue; |
600 | if (unlikely(p->flags & PF_KTHREAD)) | ||
601 | continue; | ||
602 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
603 | continue; | ||
579 | 604 | ||
580 | task_lock(p); /* Protect ->comm from prctl() */ | 605 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
581 | pr_err("Kill process %d (%s) sharing same memory\n", | 606 | } |
582 | task_pid_nr(p), p->comm); | ||
583 | task_unlock(p); | ||
584 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | ||
585 | } | ||
586 | rcu_read_unlock(); | 607 | rcu_read_unlock(); |
587 | 608 | ||
588 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 609 | mmdrop(mm); |
589 | put_task_struct(victim); | 610 | put_task_struct(victim); |
590 | } | 611 | } |
591 | #undef K | 612 | #undef K |
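
A toy model of the process_shares_mm() helper added above. It shows why the thread walk matters: a multithreaded victim whose group leader has already exited has mm == NULL on the leader, so the old p->mm == mm comparison would skip it. The struct and list below are simplified stand-ins for the kernel's task and mm structures, not real kernel code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mm { int id; };

struct task {
	struct mm *mm;		/* NULL once this thread has exited */
	struct task *next;	/* next thread in the same group */
};

/* First live thread in the group decides; READ_ONCE() is elided here. */
static bool process_shares_mm(struct task *group, struct mm *mm)
{
	struct task *t;

	for (t = group; t; t = t->next) {
		struct mm *t_mm = t->mm;

		if (t_mm)
			return t_mm == mm;
	}
	return false;	/* every thread already dropped its mm */
}

int main(void)
{
	struct mm shared = { .id = 1 };
	struct task worker = { .mm = &shared, .next = NULL };
	struct task leader = { .mm = NULL, .next = &worker };	/* exited group leader */

	printf("old check (leader.mm == &shared): %s\n",
	       leader.mm == &shared ? "yes" : "no");		/* misses the sharer */
	printf("process_shares_mm(): %s\n",
	       process_shares_mm(&leader, &shared) ? "yes" : "no");
	return 0;
}
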
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805bbad2e24e..446bb36ee59d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); | |||
3428 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) | 3428 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) |
3429 | { | 3429 | { |
3430 | struct page *page; | 3430 | struct page *page; |
3431 | struct mem_cgroup *memcg = NULL; | ||
3432 | 3431 | ||
3433 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
3434 | return NULL; | ||
3435 | page = alloc_pages(gfp_mask, order); | 3432 | page = alloc_pages(gfp_mask, order); |
3436 | memcg_kmem_commit_charge(page, memcg, order); | 3433 | if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { |
3434 | __free_pages(page, order); | ||
3435 | page = NULL; | ||
3436 | } | ||
3437 | return page; | 3437 | return page; |
3438 | } | 3438 | } |
3439 | 3439 | ||
3440 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | 3440 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) |
3441 | { | 3441 | { |
3442 | struct page *page; | 3442 | struct page *page; |
3443 | struct mem_cgroup *memcg = NULL; | ||
3444 | 3443 | ||
3445 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
3446 | return NULL; | ||
3447 | page = alloc_pages_node(nid, gfp_mask, order); | 3444 | page = alloc_pages_node(nid, gfp_mask, order); |
3448 | memcg_kmem_commit_charge(page, memcg, order); | 3445 | if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { |
3446 | __free_pages(page, order); | ||
3447 | page = NULL; | ||
3448 | } | ||
3449 | return page; | 3449 | return page; |
3450 | } | 3450 | } |
3451 | 3451 | ||
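
Toy sketch of the reordered kmem accounting in alloc_kmem_pages() above: allocate first, then try to charge the memcg, and hand the pages back if the charge is refused. malloc()/free() stand in for alloc_pages()/__free_pages(), and the charge policy is invented for the demo (the real memcg_kmem_charge() also takes gfp flags).

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented charge policy: pretend the memcg refuses anything above order 3. */
static int memcg_kmem_charge(void *page, unsigned int order)
{
	(void)page;
	return order <= 3 ? 0 : -ENOMEM;
}

static void *alloc_kmem_pages(unsigned int order)
{
	void *page = malloc((size_t)4096 << order);	/* stands in for alloc_pages() */

	if (page && memcg_kmem_charge(page, order) != 0) {
		free(page);	/* charge refused: give the pages back */
		page = NULL;
	}
	return page;
}

int main(void)
{
	void *small = alloc_kmem_pages(2);
	void *large = alloc_kmem_pages(5);

	printf("order-2: %s\n", small ? "allocated and charged" : "rejected");
	printf("order-5: %s\n", large ? "allocated and charged" : "rejected");
	free(small);
	free(large);	/* free(NULL) is a no-op */
	return 0;
}
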
@@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | |||
3455 | */ | 3455 | */ |
3456 | void __free_kmem_pages(struct page *page, unsigned int order) | 3456 | void __free_kmem_pages(struct page *page, unsigned int order) |
3457 | { | 3457 | { |
3458 | memcg_kmem_uncharge_pages(page, order); | 3458 | memcg_kmem_uncharge(page, order); |
3459 | __free_pages(page, order); | 3459 | __free_pages(page, order); |
3460 | } | 3460 | } |
3461 | 3461 | ||
@@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
4900 | 4900 | ||
4901 | int __meminit init_currently_empty_zone(struct zone *zone, | 4901 | int __meminit init_currently_empty_zone(struct zone *zone, |
4902 | unsigned long zone_start_pfn, | 4902 | unsigned long zone_start_pfn, |
4903 | unsigned long size, | 4903 | unsigned long size) |
4904 | enum memmap_context context) | ||
4905 | { | 4904 | { |
4906 | struct pglist_data *pgdat = zone->zone_pgdat; | 4905 | struct pglist_data *pgdat = zone->zone_pgdat; |
4907 | int ret; | 4906 | int ret; |
@@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5413 | 5412 | ||
5414 | set_pageblock_order(); | 5413 | set_pageblock_order(); |
5415 | setup_usemap(pgdat, zone, zone_start_pfn, size); | 5414 | setup_usemap(pgdat, zone, zone_start_pfn, size); |
5416 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 5415 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
5417 | size, MEMMAP_EARLY); | ||
5418 | BUG_ON(ret); | 5416 | BUG_ON(ret); |
5419 | memmap_init(size, nid, j, zone_start_pfn); | 5417 | memmap_init(size, nid, j, zone_start_pfn); |
5420 | zone_start_pfn += size; | 5418 | zone_start_pfn += size; |
@@ -5423,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5423 | 5421 | ||
5424 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | 5422 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
5425 | { | 5423 | { |
5424 | unsigned long __maybe_unused offset = 0; | ||
5425 | |||
5426 | /* Skip empty nodes */ | 5426 | /* Skip empty nodes */ |
5427 | if (!pgdat->node_spanned_pages) | 5427 | if (!pgdat->node_spanned_pages) |
5428 | return; | 5428 | return; |
@@ -5439,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
5439 | * for the buddy allocator to function correctly. | 5439 | * for the buddy allocator to function correctly. |
5440 | */ | 5440 | */ |
5441 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 5441 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
5442 | offset = pgdat->node_start_pfn - start; | ||
5442 | end = pgdat_end_pfn(pgdat); | 5443 | end = pgdat_end_pfn(pgdat); |
5443 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 5444 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
5444 | size = (end - start) * sizeof(struct page); | 5445 | size = (end - start) * sizeof(struct page); |
@@ -5446,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
5446 | if (!map) | 5447 | if (!map) |
5447 | map = memblock_virt_alloc_node_nopanic(size, | 5448 | map = memblock_virt_alloc_node_nopanic(size, |
5448 | pgdat->node_id); | 5449 | pgdat->node_id); |
5449 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 5450 | pgdat->node_mem_map = map + offset; |
5450 | } | 5451 | } |
5451 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 5452 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
5452 | /* | 5453 | /* |
@@ -5454,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
5454 | */ | 5455 | */ |
5455 | if (pgdat == NODE_DATA(0)) { | 5456 | if (pgdat == NODE_DATA(0)) { |
5456 | mem_map = NODE_DATA(0)->node_mem_map; | 5457 | mem_map = NODE_DATA(0)->node_mem_map; |
5457 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5458 | #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) |
5458 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | 5459 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
5459 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); | 5460 | mem_map -= offset; |
5460 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5461 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5461 | } | 5462 | } |
5462 | #endif | 5463 | #endif |
@@ -5668,13 +5669,17 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5668 | */ | 5669 | */ |
5669 | required_movablecore = | 5670 | required_movablecore = |
5670 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); | 5671 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); |
5672 | required_movablecore = min(totalpages, required_movablecore); | ||
5671 | corepages = totalpages - required_movablecore; | 5673 | corepages = totalpages - required_movablecore; |
5672 | 5674 | ||
5673 | required_kernelcore = max(required_kernelcore, corepages); | 5675 | required_kernelcore = max(required_kernelcore, corepages); |
5674 | } | 5676 | } |
5675 | 5677 | ||
5676 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 5678 | /* |
5677 | if (!required_kernelcore) | 5679 | * If kernelcore was not specified or kernelcore size is larger |
5680 | * than totalpages, there is no ZONE_MOVABLE. | ||
5681 | */ | ||
5682 | if (!required_kernelcore || required_kernelcore >= totalpages) | ||
5678 | goto out; | 5683 | goto out; |
5679 | 5684 | ||
5680 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5685 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
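
Quick demonstration of the unsigned wrap-around that the new min(totalpages, required_movablecore) clamp above avoids: if movablecore= asks for more memory than the machine has, the old subtraction wrapped and the derived kernelcore ballooned. The page counts below are invented for display.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long totalpages = 1UL << 20;		/* 4 GB worth of 4 KB pages */
	unsigned long required_movablecore = 2UL << 20;	/* movablecore= asked for 8 GB */
	unsigned long required_kernelcore = 0;
	unsigned long corepages;

	/* old behaviour: unsigned subtraction wraps, kernelcore balloons */
	corepages = totalpages - required_movablecore;
	printf("unclamped: corepages=%lu kernelcore=%lu\n",
	       corepages, max_ul(required_kernelcore, corepages));

	/* new behaviour: movablecore can never exceed the pages that exist */
	required_movablecore = min_ul(totalpages, required_movablecore);
	corepages = totalpages - required_movablecore;
	required_kernelcore = max_ul(required_kernelcore, corepages);
	printf("clamped:   corepages=%lu kernelcore=%lu\n",
	       corepages, required_kernelcore);
	return 0;
}
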
diff --git a/mm/page_counter.c b/mm/page_counter.c index 11b4beda14ba..7c6a63d2c27f 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c | |||
@@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) | |||
56 | * @nr_pages: number of pages to charge | 56 | * @nr_pages: number of pages to charge |
57 | * @fail: points first counter to hit its limit, if any | 57 | * @fail: points first counter to hit its limit, if any |
58 | * | 58 | * |
59 | * Returns 0 on success, or -ENOMEM and @fail if the counter or one of | 59 | * Returns %true on success, or %false and @fail if the counter or one |
60 | * its ancestors has hit its configured limit. | 60 | * of its ancestors has hit its configured limit. |
61 | */ | 61 | */ |
62 | int page_counter_try_charge(struct page_counter *counter, | 62 | bool page_counter_try_charge(struct page_counter *counter, |
63 | unsigned long nr_pages, | 63 | unsigned long nr_pages, |
64 | struct page_counter **fail) | 64 | struct page_counter **fail) |
65 | { | 65 | { |
66 | struct page_counter *c; | 66 | struct page_counter *c; |
67 | 67 | ||
@@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter, | |||
99 | if (new > c->watermark) | 99 | if (new > c->watermark) |
100 | c->watermark = new; | 100 | c->watermark = new; |
101 | } | 101 | } |
102 | return 0; | 102 | return true; |
103 | 103 | ||
104 | failed: | 104 | failed: |
105 | for (c = counter; c != *fail; c = c->parent) | 105 | for (c = counter; c != *fail; c = c->parent) |
106 | page_counter_cancel(c, nr_pages); | 106 | page_counter_cancel(c, nr_pages); |
107 | 107 | ||
108 | return -ENOMEM; | 108 | return false; |
109 | } | 109 | } |
110 | 110 | ||
111 | /** | 111 | /** |
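
Minimal sketch of the new calling convention: page_counter_try_charge() now reports success as a bool instead of 0/-ENOMEM, so callers branch on the return value directly. This is a single-level toy counter with no ancestor walk, watermark tracking, or atomics, unlike the kernel version.

#include <stdbool.h>
#include <stdio.h>

struct page_counter {
	long count;
	long limit;
};

static bool page_counter_try_charge(struct page_counter *c, long nr_pages)
{
	long new_count = c->count + nr_pages;

	if (new_count > c->limit)
		return false;	/* was: return -ENOMEM */
	c->count = new_count;
	return true;		/* was: return 0 */
}

int main(void)
{
	struct page_counter memcg = { .count = 0, .limit = 8 };

	/* Callers now test the boolean directly instead of checking for -ENOMEM. */
	if (!page_counter_try_charge(&memcg, 16))
		printf("charge of 16 pages refused, usage stays %ld\n", memcg.count);
	if (page_counter_try_charge(&memcg, 4))
		printf("charge of 4 pages accepted, usage now %ld\n", memcg.count);
	return 0;
}
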
diff --git a/mm/percpu.c b/mm/percpu.c index a63b4d82a141..8a943b97a053 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1554 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); | 1554 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); |
1555 | #ifdef CONFIG_SMP | 1555 | #ifdef CONFIG_SMP |
1556 | PCPU_SETUP_BUG_ON(!ai->static_size); | 1556 | PCPU_SETUP_BUG_ON(!ai->static_size); |
1557 | PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); | 1557 | PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); |
1558 | #endif | 1558 | #endif |
1559 | PCPU_SETUP_BUG_ON(!base_addr); | 1559 | PCPU_SETUP_BUG_ON(!base_addr); |
1560 | PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); | 1560 | PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); |
1561 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); | 1561 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); |
1562 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); | 1562 | PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); |
1563 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); | 1563 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); |
1564 | PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); | 1564 | PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); |
1565 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1565 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
@@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( | |||
1806 | 1806 | ||
1807 | alloc_size = roundup(min_unit_size, atom_size); | 1807 | alloc_size = roundup(min_unit_size, atom_size); |
1808 | upa = alloc_size / min_unit_size; | 1808 | upa = alloc_size / min_unit_size; |
1809 | while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) | 1809 | while (alloc_size % upa || (offset_in_page(alloc_size / upa))) |
1810 | upa--; | 1810 | upa--; |
1811 | max_upa = upa; | 1811 | max_upa = upa; |
1812 | 1812 | ||
@@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( | |||
1838 | for (upa = max_upa; upa; upa--) { | 1838 | for (upa = max_upa; upa; upa--) { |
1839 | int allocs = 0, wasted = 0; | 1839 | int allocs = 0, wasted = 0; |
1840 | 1840 | ||
1841 | if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) | 1841 | if (alloc_size % upa || (offset_in_page(alloc_size / upa))) |
1842 | continue; | 1842 | continue; |
1843 | 1843 | ||
1844 | for (group = 0; group < nr_groups; group++) { | 1844 | for (group = 0; group < nr_groups; group++) { |
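
Stand-alone sketch of the units-per-allocation search that the two converted loops above perform in pcpu_build_alloc_info(): walk upa downward until the per-unit size divides the allocation evenly and stays page aligned. The 2 MB atom and 96 KB minimum unit size are made-up inputs; the real code also rounds the sizes and weighs wasted space per group.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
	unsigned long alloc_size = 2UL << 20;		/* 2 MB allocation atom */
	unsigned long min_unit_size = 96UL << 10;	/* 96 KB needed per CPU */
	unsigned long upa = alloc_size / min_unit_size;	/* start at 21 and walk down */

	while (alloc_size % upa || offset_in_page(alloc_size / upa))
		upa--;

	printf("max units per 2 MB allocation: %lu (unit size %lu KB)\n",
	       upa, (alloc_size / upa) >> 10);		/* 16 units of 128 KB */
	return 0;
}
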
diff --git a/mm/readahead.c b/mm/readahead.c index 24682f6f4cfd..998ad592f408 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
213 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 213 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
214 | return -EINVAL; | 214 | return -EINVAL; |
215 | 215 | ||
216 | nr_to_read = max_sane_readahead(nr_to_read); | 216 | nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); |
217 | while (nr_to_read) { | 217 | while (nr_to_read) { |
218 | int err; | 218 | int err; |
219 | 219 | ||
@@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | 234 | ||
235 | #define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) | ||
236 | /* | ||
237 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | ||
238 | * sensible upper limit. | ||
239 | */ | ||
240 | unsigned long max_sane_readahead(unsigned long nr) | ||
241 | { | ||
242 | return min(nr, MAX_READAHEAD); | ||
243 | } | ||
244 | |||
245 | /* | 235 | /* |
246 | * Set the initial window size, round to next power of 2 and square | 236 | * Set the initial window size, round to next power of 2 and square |
247 | * for small size, x 4 for medium, and x 2 for large | 237 | * for small size, x 4 for medium, and x 2 for large |
@@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping, | |||
380 | bool hit_readahead_marker, pgoff_t offset, | 370 | bool hit_readahead_marker, pgoff_t offset, |
381 | unsigned long req_size) | 371 | unsigned long req_size) |
382 | { | 372 | { |
383 | unsigned long max = max_sane_readahead(ra->ra_pages); | 373 | unsigned long max = ra->ra_pages; |
384 | pgoff_t prev_offset; | 374 | pgoff_t prev_offset; |
385 | 375 | ||
386 | /* | 376 | /* |
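
Small sketch contrasting the removed max_sane_readahead() clamp with the new per-device limit: instead of a global 512-page ceiling, force_page_cache_readahead() now trusts the backing device's own ra_pages. The request size and ra_pages value below are illustrative only.

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL
#define MAX_READAHEAD ((512 * 4096) / PAGE_CACHE_SIZE)	/* the removed global cap */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long nr_to_read = 2048;	/* pages the caller asked for */
	unsigned long bdi_ra_pages = 32;	/* e.g. a device with a small readahead window */

	printf("old clamp: %lu pages\n", min_ul(nr_to_read, MAX_READAHEAD));	/* 512 */
	printf("new clamp: %lu pages\n", min_ul(nr_to_read, bdi_ra_pages));	/* 32 */
	return 0;
}
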
@@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1304 | int ret = SWAP_AGAIN; | 1304 | int ret = SWAP_AGAIN; |
1305 | enum ttu_flags flags = (enum ttu_flags)arg; | 1305 | enum ttu_flags flags = (enum ttu_flags)arg; |
1306 | 1306 | ||
1307 | /* munlock has nothing to gain from examining un-locked vmas */ | ||
1308 | if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) | ||
1309 | goto out; | ||
1310 | |||
1307 | pte = page_check_address(page, mm, address, &ptl, 0); | 1311 | pte = page_check_address(page, mm, address, &ptl, 0); |
1308 | if (!pte) | 1312 | if (!pte) |
1309 | goto out; | 1313 | goto out; |
@@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1314 | * skipped over this mm) then we should reactivate it. | 1318 | * skipped over this mm) then we should reactivate it. |
1315 | */ | 1319 | */ |
1316 | if (!(flags & TTU_IGNORE_MLOCK)) { | 1320 | if (!(flags & TTU_IGNORE_MLOCK)) { |
1317 | if (vma->vm_flags & VM_LOCKED) | 1321 | if (vma->vm_flags & VM_LOCKED) { |
1318 | goto out_mlock; | 1322 | /* Holding pte lock, we do *not* need mmap_sem here */ |
1319 | 1323 | mlock_vma_page(page); | |
1324 | ret = SWAP_MLOCK; | ||
1325 | goto out_unmap; | ||
1326 | } | ||
1320 | if (flags & TTU_MUNLOCK) | 1327 | if (flags & TTU_MUNLOCK) |
1321 | goto out_unmap; | 1328 | goto out_unmap; |
1322 | } | 1329 | } |
@@ -1352,7 +1359,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1352 | update_hiwater_rss(mm); | 1359 | update_hiwater_rss(mm); |
1353 | 1360 | ||
1354 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1361 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
1355 | if (!PageHuge(page)) { | 1362 | if (PageHuge(page)) { |
1363 | hugetlb_count_sub(1 << compound_order(page), mm); | ||
1364 | } else { | ||
1356 | if (PageAnon(page)) | 1365 | if (PageAnon(page)) |
1357 | dec_mm_counter(mm, MM_ANONPAGES); | 1366 | dec_mm_counter(mm, MM_ANONPAGES); |
1358 | else | 1367 | else |
@@ -1370,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1370 | dec_mm_counter(mm, MM_ANONPAGES); | 1379 | dec_mm_counter(mm, MM_ANONPAGES); |
1371 | else | 1380 | else |
1372 | dec_mm_counter(mm, MM_FILEPAGES); | 1381 | dec_mm_counter(mm, MM_FILEPAGES); |
1382 | } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { | ||
1383 | swp_entry_t entry; | ||
1384 | pte_t swp_pte; | ||
1385 | /* | ||
1386 | * Store the pfn of the page in a special migration | ||
1387 | * pte. do_swap_page() will wait until the migration | ||
1388 | * pte is removed and then restart fault handling. | ||
1389 | */ | ||
1390 | entry = make_migration_entry(page, pte_write(pteval)); | ||
1391 | swp_pte = swp_entry_to_pte(entry); | ||
1392 | if (pte_soft_dirty(pteval)) | ||
1393 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | ||
1394 | set_pte_at(mm, address, pte, swp_pte); | ||
1373 | } else if (PageAnon(page)) { | 1395 | } else if (PageAnon(page)) { |
1374 | swp_entry_t entry = { .val = page_private(page) }; | 1396 | swp_entry_t entry = { .val = page_private(page) }; |
1375 | pte_t swp_pte; | 1397 | pte_t swp_pte; |
1376 | 1398 | /* | |
1377 | if (PageSwapCache(page)) { | 1399 | * Store the swap location in the pte. |
1378 | /* | 1400 | * See handle_pte_fault() ... |
1379 | * Store the swap location in the pte. | 1401 | */ |
1380 | * See handle_pte_fault() ... | 1402 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
1381 | */ | 1403 | if (swap_duplicate(entry) < 0) { |
1382 | if (swap_duplicate(entry) < 0) { | 1404 | set_pte_at(mm, address, pte, pteval); |
1383 | set_pte_at(mm, address, pte, pteval); | 1405 | ret = SWAP_FAIL; |
1384 | ret = SWAP_FAIL; | 1406 | goto out_unmap; |
1385 | goto out_unmap; | 1407 | } |
1386 | } | 1408 | if (list_empty(&mm->mmlist)) { |
1387 | if (list_empty(&mm->mmlist)) { | 1409 | spin_lock(&mmlist_lock); |
1388 | spin_lock(&mmlist_lock); | 1410 | if (list_empty(&mm->mmlist)) |
1389 | if (list_empty(&mm->mmlist)) | 1411 | list_add(&mm->mmlist, &init_mm.mmlist); |
1390 | list_add(&mm->mmlist, &init_mm.mmlist); | 1412 | spin_unlock(&mmlist_lock); |
1391 | spin_unlock(&mmlist_lock); | ||
1392 | } | ||
1393 | dec_mm_counter(mm, MM_ANONPAGES); | ||
1394 | inc_mm_counter(mm, MM_SWAPENTS); | ||
1395 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { | ||
1396 | /* | ||
1397 | * Store the pfn of the page in a special migration | ||
1398 | * pte. do_swap_page() will wait until the migration | ||
1399 | * pte is removed and then restart fault handling. | ||
1400 | */ | ||
1401 | BUG_ON(!(flags & TTU_MIGRATION)); | ||
1402 | entry = make_migration_entry(page, pte_write(pteval)); | ||
1403 | } | 1413 | } |
1414 | dec_mm_counter(mm, MM_ANONPAGES); | ||
1415 | inc_mm_counter(mm, MM_SWAPENTS); | ||
1404 | swp_pte = swp_entry_to_pte(entry); | 1416 | swp_pte = swp_entry_to_pte(entry); |
1405 | if (pte_soft_dirty(pteval)) | 1417 | if (pte_soft_dirty(pteval)) |
1406 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1418 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1407 | set_pte_at(mm, address, pte, swp_pte); | 1419 | set_pte_at(mm, address, pte, swp_pte); |
1408 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | ||
1409 | (flags & TTU_MIGRATION)) { | ||
1410 | /* Establish migration entry for a file page */ | ||
1411 | swp_entry_t entry; | ||
1412 | entry = make_migration_entry(page, pte_write(pteval)); | ||
1413 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
1414 | } else | 1420 | } else |
1415 | dec_mm_counter(mm, MM_FILEPAGES); | 1421 | dec_mm_counter(mm, MM_FILEPAGES); |
1416 | 1422 | ||
@@ -1419,31 +1425,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1419 | 1425 | ||
1420 | out_unmap: | 1426 | out_unmap: |
1421 | pte_unmap_unlock(pte, ptl); | 1427 | pte_unmap_unlock(pte, ptl); |
1422 | if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) | 1428 | if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) |
1423 | mmu_notifier_invalidate_page(mm, address); | 1429 | mmu_notifier_invalidate_page(mm, address); |
1424 | out: | 1430 | out: |
1425 | return ret; | 1431 | return ret; |
1426 | |||
1427 | out_mlock: | ||
1428 | pte_unmap_unlock(pte, ptl); | ||
1429 | |||
1430 | |||
1431 | /* | ||
1432 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | ||
1433 | * unstable result and race. Plus, We can't wait here because | ||
1434 | * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. | ||
1435 | * if trylock failed, the page remain in evictable lru and later | ||
1436 | * vmscan could retry to move the page to unevictable lru if the | ||
1437 | * page is actually mlocked. | ||
1438 | */ | ||
1439 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
1440 | if (vma->vm_flags & VM_LOCKED) { | ||
1441 | mlock_vma_page(page); | ||
1442 | ret = SWAP_MLOCK; | ||
1443 | } | ||
1444 | up_read(&vma->vm_mm->mmap_sem); | ||
1445 | } | ||
1446 | return ret; | ||
1447 | } | 1432 | } |
1448 | 1433 | ||
1449 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1434 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
@@ -1607,6 +1592,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
1607 | struct vm_area_struct *vma = avc->vma; | 1592 | struct vm_area_struct *vma = avc->vma; |
1608 | unsigned long address = vma_address(page, vma); | 1593 | unsigned long address = vma_address(page, vma); |
1609 | 1594 | ||
1595 | cond_resched(); | ||
1596 | |||
1610 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | 1597 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1611 | continue; | 1598 | continue; |
1612 | 1599 | ||
@@ -1656,6 +1643,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1656 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1643 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1657 | unsigned long address = vma_address(page, vma); | 1644 | unsigned long address = vma_address(page, vma); |
1658 | 1645 | ||
1646 | cond_resched(); | ||
1647 | |||
1659 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | 1648 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1660 | continue; | 1649 | continue; |
1661 | 1650 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 48ce82926d93..3b8b73928398 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
548 | struct inode *inode = dentry->d_inode; | 548 | struct inode *inode = dentry->d_inode; |
549 | struct shmem_inode_info *info = SHMEM_I(inode); | 549 | struct shmem_inode_info *info = SHMEM_I(inode); |
550 | 550 | ||
551 | spin_lock(&info->lock); | 551 | if (info->alloced - info->swapped != inode->i_mapping->nrpages) { |
552 | shmem_recalc_inode(inode); | 552 | spin_lock(&info->lock); |
553 | spin_unlock(&info->lock); | 553 | shmem_recalc_inode(inode); |
554 | 554 | spin_unlock(&info->lock); | |
555 | } | ||
555 | generic_fillattr(inode, stat); | 556 | generic_fillattr(inode, stat); |
556 | |||
557 | return 0; | 557 | return 0; |
558 | } | 558 | } |
559 | 559 | ||
@@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
586 | } | 586 | } |
587 | if (newsize <= oldsize) { | 587 | if (newsize <= oldsize) { |
588 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | 588 | loff_t holebegin = round_up(newsize, PAGE_SIZE); |
589 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 589 | if (oldsize > holebegin) |
590 | shmem_truncate_range(inode, newsize, (loff_t)-1); | 590 | unmap_mapping_range(inode->i_mapping, |
591 | holebegin, 0, 1); | ||
592 | if (info->alloced) | ||
593 | shmem_truncate_range(inode, | ||
594 | newsize, (loff_t)-1); | ||
591 | /* unmap again to remove racily COWed private pages */ | 595 | /* unmap again to remove racily COWed private pages */ |
592 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 596 | if (oldsize > holebegin) |
597 | unmap_mapping_range(inode->i_mapping, | ||
598 | holebegin, 0, 1); | ||
593 | } | 599 | } |
594 | } | 600 | } |
595 | 601 | ||
@@ -1023,7 +1029,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1023 | */ | 1029 | */ |
1024 | oldpage = newpage; | 1030 | oldpage = newpage; |
1025 | } else { | 1031 | } else { |
1026 | mem_cgroup_migrate(oldpage, newpage, true); | 1032 | mem_cgroup_replace_page(oldpage, newpage); |
1027 | lru_cache_add_anon(newpage); | 1033 | lru_cache_add_anon(newpage); |
1028 | *pagep = newpage; | 1034 | *pagep = newpage; |
1029 | } | 1035 | } |
@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) | |||
282 | 282 | ||
283 | #define CFLGS_OFF_SLAB (0x80000000UL) | 283 | #define CFLGS_OFF_SLAB (0x80000000UL) |
284 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 284 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
285 | #define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) | ||
285 | 286 | ||
286 | #define BATCHREFILL_LIMIT 16 | 287 | #define BATCHREFILL_LIMIT 16 |
287 | /* | 288 | /* |
@@ -1592,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1592 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1593 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1593 | flags |= __GFP_RECLAIMABLE; | 1594 | flags |= __GFP_RECLAIMABLE; |
1594 | 1595 | ||
1595 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | ||
1596 | return NULL; | ||
1597 | |||
1598 | page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1596 | page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1599 | if (!page) { | 1597 | if (!page) { |
1600 | memcg_uncharge_slab(cachep, cachep->gfporder); | ||
1601 | slab_out_of_memory(cachep, flags, nodeid); | 1598 | slab_out_of_memory(cachep, flags, nodeid); |
1602 | return NULL; | 1599 | return NULL; |
1603 | } | 1600 | } |
1604 | 1601 | ||
1602 | if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { | ||
1603 | __free_pages(page, cachep->gfporder); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1605 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | 1607 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ |
1606 | if (page_is_pfmemalloc(page)) | 1608 | if (page_is_pfmemalloc(page)) |
1607 | pfmemalloc_active = true; | 1609 | pfmemalloc_active = true; |
@@ -1653,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | |||
1653 | 1655 | ||
1654 | if (current->reclaim_state) | 1656 | if (current->reclaim_state) |
1655 | current->reclaim_state->reclaimed_slab += nr_freed; | 1657 | current->reclaim_state->reclaimed_slab += nr_freed; |
1656 | __free_pages(page, cachep->gfporder); | 1658 | __free_kmem_pages(page, cachep->gfporder); |
1657 | memcg_uncharge_slab(cachep, cachep->gfporder); | ||
1658 | } | 1659 | } |
1659 | 1660 | ||
1660 | static void kmem_rcu_free(struct rcu_head *head) | 1661 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2212 | * it too early on. Always use on-slab management when | 2213 | * it too early on. Always use on-slab management when |
2213 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | 2214 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) |
2214 | */ | 2215 | */ |
2215 | if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && | 2216 | if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && |
2216 | !(flags & SLAB_NOLEAKTRACE)) | 2217 | !(flags & SLAB_NOLEAKTRACE)) |
2217 | /* | 2218 | /* |
2218 | * Size is large, assume best to place the slab management obj | 2219 | * Size is large, assume best to place the slab management obj |
@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2276 | /* | 2277 | /* |
2277 | * This is a possibility for one of the kmalloc_{dma,}_caches. | 2278 | * This is a possibility for one of the kmalloc_{dma,}_caches. |
2278 | * But since we go off slab only for object size greater than | 2279 | * But since we go off slab only for object size greater than |
2279 | * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created | 2280 | * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created |
2280 | * in ascending order,this should not happen at all. | 2281 | * in ascending order,this should not happen at all. |
2281 | * But leave a BUG_ON for some lucky dude. | 2282 | * But leave a BUG_ON for some lucky dude. |
2282 | */ | 2283 | */ |
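
Quick arithmetic behind the new OFF_SLAB_MIN_SIZE cutoff used in the hunks above: the threshold for placing slab management off-slab becomes max(PAGE_SIZE/32, KMALLOC_MIN_SIZE + 1), so the second term only takes over when the minimum kmalloc size (driven by cache-line alignment) reaches PAGE_SIZE/32. The configurations below are examples, not values probed from a running kernel.

#include <stdio.h>

static unsigned long off_slab_min_size(unsigned long page_size,
				       unsigned long kmalloc_min_size)
{
	unsigned long a = page_size >> 5;
	unsigned long b = kmalloc_min_size + 1;

	return a > b ? a : b;	/* max_t(size_t, ...) in the kernel macro */
}

int main(void)
{
	/* x86-64: 4 KB pages, 8-byte minimum kmalloc object */
	printf("4K page,  KMALLOC_MIN_SIZE 8:   %lu\n", off_slab_min_size(4096, 8));
	/* 128-byte cache lines: the KMALLOC_MIN_SIZE + 1 term takes over */
	printf("4K page,  KMALLOC_MIN_SIZE 128: %lu\n", off_slab_min_size(4096, 128));
	/* 64 KB pages: the PAGE_SIZE/32 term dominates again */
	printf("64K page, KMALLOC_MIN_SIZE 128: %lu\n", off_slab_min_size(65536, 128));
	return 0;
}
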
@@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); | |||
181 | list_for_each_entry(iter, &(root)->memcg_params.list, \ | 181 | list_for_each_entry(iter, &(root)->memcg_params.list, \ |
182 | memcg_params.list) | 182 | memcg_params.list) |
183 | 183 | ||
184 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
185 | list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ | ||
186 | memcg_params.list) | ||
187 | |||
188 | static inline bool is_root_cache(struct kmem_cache *s) | 184 | static inline bool is_root_cache(struct kmem_cache *s) |
189 | { | 185 | { |
190 | return s->memcg_params.is_root_cache; | 186 | return s->memcg_params.is_root_cache; |
@@ -240,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
240 | return s->memcg_params.root_cache; | 236 | return s->memcg_params.root_cache; |
241 | } | 237 | } |
242 | 238 | ||
243 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, | 239 | static __always_inline int memcg_charge_slab(struct page *page, |
244 | gfp_t gfp, int order) | 240 | gfp_t gfp, int order, |
241 | struct kmem_cache *s) | ||
245 | { | 242 | { |
246 | if (!memcg_kmem_enabled()) | 243 | if (!memcg_kmem_enabled()) |
247 | return 0; | 244 | return 0; |
248 | if (is_root_cache(s)) | 245 | if (is_root_cache(s)) |
249 | return 0; | 246 | return 0; |
250 | return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); | 247 | return __memcg_kmem_charge_memcg(page, gfp, order, |
251 | } | 248 | s->memcg_params.memcg); |
252 | |||
253 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | ||
254 | { | ||
255 | if (!memcg_kmem_enabled()) | ||
256 | return; | ||
257 | if (is_root_cache(s)) | ||
258 | return; | ||
259 | memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); | ||
260 | } | 249 | } |
261 | 250 | ||
262 | extern void slab_init_memcg_params(struct kmem_cache *); | 251 | extern void slab_init_memcg_params(struct kmem_cache *); |
@@ -265,8 +254,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); | |||
265 | 254 | ||
266 | #define for_each_memcg_cache(iter, root) \ | 255 | #define for_each_memcg_cache(iter, root) \ |
267 | for ((void)(iter), (void)(root); 0; ) | 256 | for ((void)(iter), (void)(root); 0; ) |
268 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
269 | for ((void)(iter), (void)(tmp), (void)(root); 0; ) | ||
270 | 257 | ||
271 | static inline bool is_root_cache(struct kmem_cache *s) | 258 | static inline bool is_root_cache(struct kmem_cache *s) |
272 | { | 259 | { |
@@ -295,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
295 | return s; | 282 | return s; |
296 | } | 283 | } |
297 | 284 | ||
298 | static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | 285 | static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, |
286 | struct kmem_cache *s) | ||
299 | { | 287 | { |
300 | return 0; | 288 | return 0; |
301 | } | 289 | } |
302 | 290 | ||
303 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | static inline void slab_init_memcg_params(struct kmem_cache *s) | 291 | static inline void slab_init_memcg_params(struct kmem_cache *s) |
308 | { | 292 | { |
309 | } | 293 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 5ce4faeb16fb..d88e97c10a2e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags, | |||
316 | return ALIGN(align, sizeof(void *)); | 316 | return ALIGN(align, sizeof(void *)); |
317 | } | 317 | } |
318 | 318 | ||
319 | static struct kmem_cache * | 319 | static struct kmem_cache *create_cache(const char *name, |
320 | do_kmem_cache_create(const char *name, size_t object_size, size_t size, | 320 | size_t object_size, size_t size, size_t align, |
321 | size_t align, unsigned long flags, void (*ctor)(void *), | 321 | unsigned long flags, void (*ctor)(void *), |
322 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 322 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
323 | { | 323 | { |
324 | struct kmem_cache *s; | 324 | struct kmem_cache *s; |
325 | int err; | 325 | int err; |
@@ -384,7 +384,7 @@ struct kmem_cache * | |||
384 | kmem_cache_create(const char *name, size_t size, size_t align, | 384 | kmem_cache_create(const char *name, size_t size, size_t align, |
385 | unsigned long flags, void (*ctor)(void *)) | 385 | unsigned long flags, void (*ctor)(void *)) |
386 | { | 386 | { |
387 | struct kmem_cache *s; | 387 | struct kmem_cache *s = NULL; |
388 | const char *cache_name; | 388 | const char *cache_name; |
389 | int err; | 389 | int err; |
390 | 390 | ||
@@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
396 | 396 | ||
397 | err = kmem_cache_sanity_check(name, size); | 397 | err = kmem_cache_sanity_check(name, size); |
398 | if (err) { | 398 | if (err) { |
399 | s = NULL; /* suppress uninit var warning */ | ||
400 | goto out_unlock; | 399 | goto out_unlock; |
401 | } | 400 | } |
402 | 401 | ||
@@ -418,9 +417,9 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
418 | goto out_unlock; | 417 | goto out_unlock; |
419 | } | 418 | } |
420 | 419 | ||
421 | s = do_kmem_cache_create(cache_name, size, size, | 420 | s = create_cache(cache_name, size, size, |
422 | calculate_alignment(flags, align, size), | 421 | calculate_alignment(flags, align, size), |
423 | flags, ctor, NULL, NULL); | 422 | flags, ctor, NULL, NULL); |
424 | if (IS_ERR(s)) { | 423 | if (IS_ERR(s)) { |
425 | err = PTR_ERR(s); | 424 | err = PTR_ERR(s); |
426 | kfree_const(cache_name); | 425 | kfree_const(cache_name); |
@@ -448,29 +447,20 @@ out_unlock: | |||
448 | } | 447 | } |
449 | EXPORT_SYMBOL(kmem_cache_create); | 448 | EXPORT_SYMBOL(kmem_cache_create); |
450 | 449 | ||
451 | static int do_kmem_cache_shutdown(struct kmem_cache *s, | 450 | static int shutdown_cache(struct kmem_cache *s, |
452 | struct list_head *release, bool *need_rcu_barrier) | 451 | struct list_head *release, bool *need_rcu_barrier) |
453 | { | 452 | { |
454 | if (__kmem_cache_shutdown(s) != 0) { | 453 | if (__kmem_cache_shutdown(s) != 0) |
455 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
456 | "Slab cache still has objects\n", s->name); | ||
457 | dump_stack(); | ||
458 | return -EBUSY; | 454 | return -EBUSY; |
459 | } | ||
460 | 455 | ||
461 | if (s->flags & SLAB_DESTROY_BY_RCU) | 456 | if (s->flags & SLAB_DESTROY_BY_RCU) |
462 | *need_rcu_barrier = true; | 457 | *need_rcu_barrier = true; |
463 | 458 | ||
464 | #ifdef CONFIG_MEMCG_KMEM | ||
465 | if (!is_root_cache(s)) | ||
466 | list_del(&s->memcg_params.list); | ||
467 | #endif | ||
468 | list_move(&s->list, release); | 459 | list_move(&s->list, release); |
469 | return 0; | 460 | return 0; |
470 | } | 461 | } |
471 | 462 | ||
472 | static void do_kmem_cache_release(struct list_head *release, | 463 | static void release_caches(struct list_head *release, bool need_rcu_barrier) |
473 | bool need_rcu_barrier) | ||
474 | { | 464 | { |
475 | struct kmem_cache *s, *s2; | 465 | struct kmem_cache *s, *s2; |
476 | 466 | ||
@@ -536,10 +526,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
536 | if (!cache_name) | 526 | if (!cache_name) |
537 | goto out_unlock; | 527 | goto out_unlock; |
538 | 528 | ||
539 | s = do_kmem_cache_create(cache_name, root_cache->object_size, | 529 | s = create_cache(cache_name, root_cache->object_size, |
540 | root_cache->size, root_cache->align, | 530 | root_cache->size, root_cache->align, |
541 | root_cache->flags, root_cache->ctor, | 531 | root_cache->flags, root_cache->ctor, |
542 | memcg, root_cache); | 532 | memcg, root_cache); |
543 | /* | 533 | /* |
544 | * If we could not create a memcg cache, do not complain, because | 534 | * If we could not create a memcg cache, do not complain, because |
545 | * that's not critical at all as we can always proceed with the root | 535 | * that's not critical at all as we can always proceed with the root |
@@ -598,6 +588,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) | |||
598 | put_online_cpus(); | 588 | put_online_cpus(); |
599 | } | 589 | } |
600 | 590 | ||
591 | static int __shutdown_memcg_cache(struct kmem_cache *s, | ||
592 | struct list_head *release, bool *need_rcu_barrier) | ||
593 | { | ||
594 | BUG_ON(is_root_cache(s)); | ||
595 | |||
596 | if (shutdown_cache(s, release, need_rcu_barrier)) | ||
597 | return -EBUSY; | ||
598 | |||
599 | list_del(&s->memcg_params.list); | ||
600 | return 0; | ||
601 | } | ||
602 | |||
601 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) | 603 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) |
602 | { | 604 | { |
603 | LIST_HEAD(release); | 605 | LIST_HEAD(release); |
@@ -615,14 +617,76 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) | |||
615 | * The cgroup is about to be freed and therefore has no charges | 617 | * The cgroup is about to be freed and therefore has no charges |
616 | * left. Hence, all its caches must be empty by now. | 618 | * left. Hence, all its caches must be empty by now. |
617 | */ | 619 | */ |
618 | BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); | 620 | BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); |
619 | } | 621 | } |
620 | mutex_unlock(&slab_mutex); | 622 | mutex_unlock(&slab_mutex); |
621 | 623 | ||
622 | put_online_mems(); | 624 | put_online_mems(); |
623 | put_online_cpus(); | 625 | put_online_cpus(); |
624 | 626 | ||
625 | do_kmem_cache_release(&release, need_rcu_barrier); | 627 | release_caches(&release, need_rcu_barrier); |
628 | } | ||
629 | |||
630 | static int shutdown_memcg_caches(struct kmem_cache *s, | ||
631 | struct list_head *release, bool *need_rcu_barrier) | ||
632 | { | ||
633 | struct memcg_cache_array *arr; | ||
634 | struct kmem_cache *c, *c2; | ||
635 | LIST_HEAD(busy); | ||
636 | int i; | ||
637 | |||
638 | BUG_ON(!is_root_cache(s)); | ||
639 | |||
640 | /* | ||
641 | * First, shutdown active caches, i.e. caches that belong to online | ||
642 | * memory cgroups. | ||
643 | */ | ||
644 | arr = rcu_dereference_protected(s->memcg_params.memcg_caches, | ||
645 | lockdep_is_held(&slab_mutex)); | ||
646 | for_each_memcg_cache_index(i) { | ||
647 | c = arr->entries[i]; | ||
648 | if (!c) | ||
649 | continue; | ||
650 | if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) | ||
651 | /* | ||
652 | * The cache still has objects. Move it to a temporary | ||
653 | * list so as not to try to destroy it for a second | ||
654 | * time while iterating over inactive caches below. | ||
655 | */ | ||
656 | list_move(&c->memcg_params.list, &busy); | ||
657 | else | ||
658 | /* | ||
659 | * The cache is empty and will be destroyed soon. Clear | ||
660 | * the pointer to it in the memcg_caches array so that | ||
661 | * it will never be accessed even if the root cache | ||
662 | * stays alive. | ||
663 | */ | ||
664 | arr->entries[i] = NULL; | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Second, shutdown all caches left from memory cgroups that are now | ||
669 | * offline. | ||
670 | */ | ||
671 | list_for_each_entry_safe(c, c2, &s->memcg_params.list, | ||
672 | memcg_params.list) | ||
673 | __shutdown_memcg_cache(c, release, need_rcu_barrier); | ||
674 | |||
675 | list_splice(&busy, &s->memcg_params.list); | ||
676 | |||
677 | /* | ||
678 | * A cache being destroyed must be empty. In particular, this means | ||
679 | * that all per memcg caches attached to it must be empty too. | ||
680 | */ | ||
681 | if (!list_empty(&s->memcg_params.list)) | ||
682 | return -EBUSY; | ||
683 | return 0; | ||
684 | } | ||
685 | #else | ||
686 | static inline int shutdown_memcg_caches(struct kmem_cache *s, | ||
687 | struct list_head *release, bool *need_rcu_barrier) | ||
688 | { | ||
689 | return 0; | ||
626 | } | 690 | } |
627 | #endif /* CONFIG_MEMCG_KMEM */ | 691 | #endif /* CONFIG_MEMCG_KMEM */ |
628 | 692 | ||
@@ -635,16 +699,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) | |||
635 | 699 | ||
636 | void kmem_cache_destroy(struct kmem_cache *s) | 700 | void kmem_cache_destroy(struct kmem_cache *s) |
637 | { | 701 | { |
638 | struct kmem_cache *c, *c2; | ||
639 | LIST_HEAD(release); | 702 | LIST_HEAD(release); |
640 | bool need_rcu_barrier = false; | 703 | bool need_rcu_barrier = false; |
641 | bool busy = false; | 704 | int err; |
642 | 705 | ||
643 | if (unlikely(!s)) | 706 | if (unlikely(!s)) |
644 | return; | 707 | return; |
645 | 708 | ||
646 | BUG_ON(!is_root_cache(s)); | ||
647 | |||
648 | get_online_cpus(); | 709 | get_online_cpus(); |
649 | get_online_mems(); | 710 | get_online_mems(); |
650 | 711 | ||
@@ -654,21 +715,22 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
654 | if (s->refcount) | 715 | if (s->refcount) |
655 | goto out_unlock; | 716 | goto out_unlock; |
656 | 717 | ||
657 | for_each_memcg_cache_safe(c, c2, s) { | 718 | err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); |
658 | if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) | 719 | if (!err) |
659 | busy = true; | 720 | err = shutdown_cache(s, &release, &need_rcu_barrier); |
660 | } | ||
661 | |||
662 | if (!busy) | ||
663 | do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); | ||
664 | 721 | ||
722 | if (err) { | ||
723 | pr_err("kmem_cache_destroy %s: " | ||
724 | "Slab cache still has objects\n", s->name); | ||
725 | dump_stack(); | ||
726 | } | ||
665 | out_unlock: | 727 | out_unlock: |
666 | mutex_unlock(&slab_mutex); | 728 | mutex_unlock(&slab_mutex); |
667 | 729 | ||
668 | put_online_mems(); | 730 | put_online_mems(); |
669 | put_online_cpus(); | 731 | put_online_cpus(); |
670 | 732 | ||
671 | do_kmem_cache_release(&release, need_rcu_barrier); | 733 | release_caches(&release, need_rcu_barrier); |
672 | } | 734 | } |
673 | EXPORT_SYMBOL(kmem_cache_destroy); | 735 | EXPORT_SYMBOL(kmem_cache_destroy); |
674 | 736 | ||
@@ -692,7 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
692 | } | 754 | } |
693 | EXPORT_SYMBOL(kmem_cache_shrink); | 755 | EXPORT_SYMBOL(kmem_cache_shrink); |
694 | 756 | ||
695 | int slab_is_available(void) | 757 | bool slab_is_available(void) |
696 | { | 758 | { |
697 | return slab_state >= UP; | 759 | return slab_state >= UP; |
698 | } | 760 | } |
@@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | |||
459 | /* | 459 | /* |
460 | * Debug settings: | 460 | * Debug settings: |
461 | */ | 461 | */ |
462 | #ifdef CONFIG_SLUB_DEBUG_ON | 462 | #if defined(CONFIG_SLUB_DEBUG_ON) |
463 | static int slub_debug = DEBUG_DEFAULT_FLAGS; | 463 | static int slub_debug = DEBUG_DEFAULT_FLAGS; |
464 | #elif defined(CONFIG_KASAN) | ||
465 | static int slub_debug = SLAB_STORE_USER; | ||
464 | #else | 466 | #else |
465 | static int slub_debug; | 467 | static int slub_debug; |
466 | #endif | 468 | #endif |
@@ -1328,16 +1330,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, | |||
1328 | 1330 | ||
1329 | flags |= __GFP_NOTRACK; | 1331 | flags |= __GFP_NOTRACK; |
1330 | 1332 | ||
1331 | if (memcg_charge_slab(s, flags, order)) | ||
1332 | return NULL; | ||
1333 | |||
1334 | if (node == NUMA_NO_NODE) | 1333 | if (node == NUMA_NO_NODE) |
1335 | page = alloc_pages(flags, order); | 1334 | page = alloc_pages(flags, order); |
1336 | else | 1335 | else |
1337 | page = __alloc_pages_node(node, flags, order); | 1336 | page = __alloc_pages_node(node, flags, order); |
1338 | 1337 | ||
1339 | if (!page) | 1338 | if (page && memcg_charge_slab(page, flags, order, s)) { |
1340 | memcg_uncharge_slab(s, order); | 1339 | __free_pages(page, order); |
1340 | page = NULL; | ||
1341 | } | ||
1341 | 1342 | ||
1342 | return page; | 1343 | return page; |
1343 | } | 1344 | } |
@@ -1476,8 +1477,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1476 | page_mapcount_reset(page); | 1477 | page_mapcount_reset(page); |
1477 | if (current->reclaim_state) | 1478 | if (current->reclaim_state) |
1478 | current->reclaim_state->reclaimed_slab += pages; | 1479 | current->reclaim_state->reclaimed_slab += pages; |
1479 | __free_pages(page, order); | 1480 | __free_kmem_pages(page, order); |
1480 | memcg_uncharge_slab(s, order); | ||
1481 | } | 1481 | } |
1482 | 1482 | ||
1483 | #define need_reserve_slab_rcu \ | 1483 | #define need_reserve_slab_rcu \ |
@@ -2912,20 +2912,15 @@ static inline int slab_order(int size, int min_objects, | |||
2912 | if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) | 2912 | if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) |
2913 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; | 2913 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; |
2914 | 2914 | ||
2915 | for (order = max(min_order, | 2915 | for (order = max(min_order, get_order(min_objects * size + reserved)); |
2916 | fls(min_objects * size - 1) - PAGE_SHIFT); | ||
2917 | order <= max_order; order++) { | 2916 | order <= max_order; order++) { |
2918 | 2917 | ||
2919 | unsigned long slab_size = PAGE_SIZE << order; | 2918 | unsigned long slab_size = PAGE_SIZE << order; |
2920 | 2919 | ||
2921 | if (slab_size < min_objects * size + reserved) | ||
2922 | continue; | ||
2923 | |||
2924 | rem = (slab_size - reserved) % size; | 2920 | rem = (slab_size - reserved) % size; |
2925 | 2921 | ||
2926 | if (rem <= slab_size / fract_leftover) | 2922 | if (rem <= slab_size / fract_leftover) |
2927 | break; | 2923 | break; |
2928 | |||
2929 | } | 2924 | } |
2930 | 2925 | ||
2931 | return order; | 2926 | return order; |
@@ -2943,7 +2938,7 @@ static inline int calculate_order(int size, int reserved) | |||
2943 | * works by first attempting to generate a layout with | 2938 | * works by first attempting to generate a layout with |
2944 | * the best configuration and backing off gradually. | 2939 | * the best configuration and backing off gradually. |
2945 | * | 2940 | * |
2946 | * First we reduce the acceptable waste in a slab. Then | 2941 | * First we increase the acceptable waste in a slab. Then |
2947 | * we reduce the minimum objects required in a slab. | 2942 | * we reduce the minimum objects required in a slab. |
2948 | */ | 2943 | */ |
2949 | min_objects = slub_min_objects; | 2944 | min_objects = slub_min_objects; |
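The reworked slab_order() loop above now starts at get_order(min_objects * size + reserved) and accepts the first order whose leftover, (slab_size - reserved) % size, stays within slab_size / fract_leftover; calculate_order() then relaxes fract_leftover and min_objects step by step. Below is a minimal user-space sketch of that selection rule only; the 4 KiB page size, the simplified get_order(), and the sample parameters are illustrative assumptions, not the kernel's code or tunables.

	/* Standalone model of the order-selection loop; constants are illustrative. */
	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assume 4 KiB pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* smallest order such that (PAGE_SIZE << order) >= size */
	static int get_order(unsigned long size)
	{
		int order = 0;

		while ((PAGE_SIZE << order) < size)
			order++;
		return order;
	}

	static int model_slab_order(unsigned long size, unsigned long min_objects,
				    int max_order, unsigned long fract_leftover,
				    unsigned long reserved)
	{
		int order;

		for (order = get_order(min_objects * size + reserved);
		     order <= max_order; order++) {
			unsigned long slab_size = PAGE_SIZE << order;
			unsigned long rem = (slab_size - reserved) % size;

			/* accept the first order whose waste is small enough */
			if (rem <= slab_size / fract_leftover)
				break;
		}
		return order;
	}

	int main(void)
	{
		/* 700-byte objects, at least 8 per slab, waste capped at 1/16 */
		printf("chosen order: %d\n", model_slab_order(700, 8, 3, 16, 0));
		return 0;
	}

With these sample numbers, 8 * 700 = 5600 bytes needs order 1 (8192 bytes), and the leftover 8192 % 700 = 492 is under 8192 / 16 = 512, so order 1 is chosen immediately.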
@@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, | |||
309 | { | 309 | { |
310 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) | 310 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) |
311 | return -EINVAL; | 311 | return -EINVAL; |
312 | if (unlikely(offset & ~PAGE_MASK)) | 312 | if (unlikely(offset_in_page(offset))) |
313 | return -EINVAL; | 313 | return -EINVAL; |
314 | 314 | ||
315 | return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | 315 | return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); |
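This hunk, and the mm/vmalloc.c hunks below, replace open-coded "addr & ~PAGE_MASK" tests with offset_in_page(), which names the sub-page offset of an address or size. A small user-space sketch of the idiom follows; the macro body mirrors what the kernel helper is assumed to do, and the 4 KiB page size is picked for illustration.

	/* User-space sketch; PAGE_SIZE and the macro body are illustrative assumptions. */
	#include <assert.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assume 4 KiB pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	/* sub-page offset of an address or size: the bits below the page boundary */
	#define offset_in_page(x)	((unsigned long)(x) & ~PAGE_MASK)

	int main(void)
	{
		assert(offset_in_page(PAGE_SIZE * 3) == 0);	/* page-aligned: no offset */
		assert(offset_in_page(PAGE_SIZE * 3 + 5) == 5);	/* 5 bytes into a page */

		printf("offset_in_page(0x12345678) = %#lx\n",
		       offset_in_page(0x12345678UL));		/* prints 0x678 */
		return 0;
	}

A nonzero result means the value is not page-aligned, which is exactly what the BUG_ON() checks below rely on.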
diff --git a/mm/vmacache.c b/mm/vmacache.c index b6e3662fe339..fd09dc9c6812 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
52 | * Also handle the case where a kernel thread has adopted this mm via use_mm(). | 52 | * Also handle the case where a kernel thread has adopted this mm via use_mm(). |
53 | * That kernel thread's vmacache is not applicable to this mm. | 53 | * That kernel thread's vmacache is not applicable to this mm. |
54 | */ | 54 | */ |
55 | static bool vmacache_valid_mm(struct mm_struct *mm) | 55 | static inline bool vmacache_valid_mm(struct mm_struct *mm) |
56 | { | 56 | { |
57 | return current->mm == mm && !(current->flags & PF_KTHREAD); | 57 | return current->mm == mm && !(current->flags & PF_KTHREAD); |
58 | } | 58 | } |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af3a519e40c2..9db9ef5e8481 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, | |||
358 | struct vmap_area *first; | 358 | struct vmap_area *first; |
359 | 359 | ||
360 | BUG_ON(!size); | 360 | BUG_ON(!size); |
361 | BUG_ON(size & ~PAGE_MASK); | 361 | BUG_ON(offset_in_page(size)); |
362 | BUG_ON(!is_power_of_2(align)); | 362 | BUG_ON(!is_power_of_2(align)); |
363 | 363 | ||
364 | va = kmalloc_node(sizeof(struct vmap_area), | 364 | va = kmalloc_node(sizeof(struct vmap_area), |
@@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
936 | void *vaddr = NULL; | 936 | void *vaddr = NULL; |
937 | unsigned int order; | 937 | unsigned int order; |
938 | 938 | ||
939 | BUG_ON(size & ~PAGE_MASK); | 939 | BUG_ON(offset_in_page(size)); |
940 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 940 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
941 | if (WARN_ON(size == 0)) { | 941 | if (WARN_ON(size == 0)) { |
942 | /* | 942 | /* |
@@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size) | |||
989 | unsigned int order; | 989 | unsigned int order; |
990 | struct vmap_block *vb; | 990 | struct vmap_block *vb; |
991 | 991 | ||
992 | BUG_ON(size & ~PAGE_MASK); | 992 | BUG_ON(offset_in_page(size)); |
993 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 993 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
994 | 994 | ||
995 | flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); | 995 | flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); |
@@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) | |||
1902 | while (count) { | 1902 | while (count) { |
1903 | unsigned long offset, length; | 1903 | unsigned long offset, length; |
1904 | 1904 | ||
1905 | offset = (unsigned long)addr & ~PAGE_MASK; | 1905 | offset = offset_in_page(addr); |
1906 | length = PAGE_SIZE - offset; | 1906 | length = PAGE_SIZE - offset; |
1907 | if (length > count) | 1907 | if (length > count) |
1908 | length = count; | 1908 | length = count; |
@@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
1941 | while (count) { | 1941 | while (count) { |
1942 | unsigned long offset, length; | 1942 | unsigned long offset, length; |
1943 | 1943 | ||
1944 | offset = (unsigned long)addr & ~PAGE_MASK; | 1944 | offset = offset_in_page(addr); |
1945 | length = PAGE_SIZE - offset; | 1945 | length = PAGE_SIZE - offset; |
1946 | if (length > count) | 1946 | if (length > count) |
1947 | length = count; | 1947 | length = count; |
@@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2392 | bool purged = false; | 2392 | bool purged = false; |
2393 | 2393 | ||
2394 | /* verify parameters and allocate data structures */ | 2394 | /* verify parameters and allocate data structures */ |
2395 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); | 2395 | BUG_ON(offset_in_page(align) || !is_power_of_2(align)); |
2396 | for (last_area = 0, area = 0; area < nr_vms; area++) { | 2396 | for (last_area = 0, area = 0; area < nr_vms; area++) { |
2397 | start = offsets[area]; | 2397 | start = offsets[area]; |
2398 | end = start + sizes[area]; | 2398 | end = start + sizes[area]; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index e7057af54b6e..55721b619aee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc) | |||
194 | 194 | ||
195 | static unsigned long zone_reclaimable_pages(struct zone *zone) | 195 | static unsigned long zone_reclaimable_pages(struct zone *zone) |
196 | { | 196 | { |
197 | int nr; | 197 | unsigned long nr; |
198 | 198 | ||
199 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 199 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
200 | zone_page_state(zone, NR_INACTIVE_FILE); | 200 | zone_page_state(zone, NR_INACTIVE_FILE); |
@@ -1859,17 +1859,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1859 | } | 1859 | } |
1860 | 1860 | ||
1861 | #ifdef CONFIG_SWAP | 1861 | #ifdef CONFIG_SWAP |
1862 | static int inactive_anon_is_low_global(struct zone *zone) | 1862 | static bool inactive_anon_is_low_global(struct zone *zone) |
1863 | { | 1863 | { |
1864 | unsigned long active, inactive; | 1864 | unsigned long active, inactive; |
1865 | 1865 | ||
1866 | active = zone_page_state(zone, NR_ACTIVE_ANON); | 1866 | active = zone_page_state(zone, NR_ACTIVE_ANON); |
1867 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | 1867 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); |
1868 | 1868 | ||
1869 | if (inactive * zone->inactive_ratio < active) | 1869 | return inactive * zone->inactive_ratio < active; |
1870 | return 1; | ||
1871 | |||
1872 | return 0; | ||
1873 | } | 1870 | } |
1874 | 1871 | ||
1875 | /** | 1872 | /** |
@@ -1879,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) | |||
1879 | * Returns true if the zone does not have enough inactive anon pages, | 1876 | * Returns true if the zone does not have enough inactive anon pages, |
1880 | * meaning some active anon pages need to be deactivated. | 1877 | * meaning some active anon pages need to be deactivated. |
1881 | */ | 1878 | */ |
1882 | static int inactive_anon_is_low(struct lruvec *lruvec) | 1879 | static bool inactive_anon_is_low(struct lruvec *lruvec) |
1883 | { | 1880 | { |
1884 | /* | 1881 | /* |
1885 | * If we don't have swap space, anonymous page deactivation | 1882 | * If we don't have swap space, anonymous page deactivation |
1886 | * is pointless. | 1883 | * is pointless. |
1887 | */ | 1884 | */ |
1888 | if (!total_swap_pages) | 1885 | if (!total_swap_pages) |
1889 | return 0; | 1886 | return false; |
1890 | 1887 | ||
1891 | if (!mem_cgroup_disabled()) | 1888 | if (!mem_cgroup_disabled()) |
1892 | return mem_cgroup_inactive_anon_is_low(lruvec); | 1889 | return mem_cgroup_inactive_anon_is_low(lruvec); |
@@ -1894,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) | |||
1894 | return inactive_anon_is_low_global(lruvec_zone(lruvec)); | 1891 | return inactive_anon_is_low_global(lruvec_zone(lruvec)); |
1895 | } | 1892 | } |
1896 | #else | 1893 | #else |
1897 | static inline int inactive_anon_is_low(struct lruvec *lruvec) | 1894 | static inline bool inactive_anon_is_low(struct lruvec *lruvec) |
1898 | { | 1895 | { |
1899 | return 0; | 1896 | return false; |
1900 | } | 1897 | } |
1901 | #endif | 1898 | #endif |
1902 | 1899 | ||
@@ -1914,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) | |||
1914 | * This uses a different ratio than the anonymous pages, because | 1911 | * This uses a different ratio than the anonymous pages, because |
1915 | * the page cache uses a use-once replacement algorithm. | 1912 | * the page cache uses a use-once replacement algorithm. |
1916 | */ | 1913 | */ |
1917 | static int inactive_file_is_low(struct lruvec *lruvec) | 1914 | static bool inactive_file_is_low(struct lruvec *lruvec) |
1918 | { | 1915 | { |
1919 | unsigned long inactive; | 1916 | unsigned long inactive; |
1920 | unsigned long active; | 1917 | unsigned long active; |
@@ -1925,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) | |||
1925 | return active > inactive; | 1922 | return active > inactive; |
1926 | } | 1923 | } |
1927 | 1924 | ||
1928 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) | 1925 | static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
1929 | { | 1926 | { |
1930 | if (is_file_lru(lru)) | 1927 | if (is_file_lru(lru)) |
1931 | return inactive_file_is_low(lruvec); | 1928 | return inactive_file_is_low(lruvec); |
@@ -3696,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | |||
3696 | } | 3693 | } |
3697 | 3694 | ||
3698 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | 3695 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ |
3699 | static long zone_pagecache_reclaimable(struct zone *zone) | 3696 | static unsigned long zone_pagecache_reclaimable(struct zone *zone) |
3700 | { | 3697 | { |
3701 | long nr_pagecache_reclaimable; | 3698 | unsigned long nr_pagecache_reclaimable; |
3702 | long delta = 0; | 3699 | unsigned long delta = 0; |
3703 | 3700 | ||
3704 | /* | 3701 | /* |
3705 | * If RECLAIM_UNMAP is set, then all file pages are considered | 3702 | * If RECLAIM_UNMAP is set, then all file pages are considered |
diff --git a/mm/vmstat.c b/mm/vmstat.c index fbf14485a049..ffcb4f58bf3e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) | |||
591 | else | 591 | else |
592 | __inc_zone_state(z, NUMA_OTHER); | 592 | __inc_zone_state(z, NUMA_OTHER); |
593 | } | 593 | } |
594 | |||
595 | /* | ||
596 | * Determine the per node value of a stat item. | ||
597 | */ | ||
598 | unsigned long node_page_state(int node, enum zone_stat_item item) | ||
599 | { | ||
600 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
601 | |||
602 | return | ||
603 | #ifdef CONFIG_ZONE_DMA | ||
604 | zone_page_state(&zones[ZONE_DMA], item) + | ||
605 | #endif | ||
606 | #ifdef CONFIG_ZONE_DMA32 | ||
607 | zone_page_state(&zones[ZONE_DMA32], item) + | ||
608 | #endif | ||
609 | #ifdef CONFIG_HIGHMEM | ||
610 | zone_page_state(&zones[ZONE_HIGHMEM], item) + | ||
611 | #endif | ||
612 | zone_page_state(&zones[ZONE_NORMAL], item) + | ||
613 | zone_page_state(&zones[ZONE_MOVABLE], item); | ||
614 | } | ||
615 | |||
594 | #endif | 616 | #endif |
595 | 617 | ||
596 | #ifdef CONFIG_COMPACTION | 618 | #ifdef CONFIG_COMPACTION |
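The node_page_state() helper added above simply sums a zone stat item over every zone belonging to the node, with the DMA/DMA32/HIGHMEM terms compiled in only when those zones exist. Here is a toy user-space model of that summation; the zone list, the single counter, and the sample numbers are made up for illustration.

	/* Toy model: per-node stat = sum of that stat over the node's zones. */
	#include <stdio.h>

	enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

	struct zone {
		unsigned long nr_active_file;		/* one made-up stat item */
	};

	static unsigned long zone_page_state(const struct zone *z)
	{
		return z->nr_active_file;
	}

	static unsigned long node_page_state_model(const struct zone zones[MAX_NR_ZONES])
	{
		unsigned long nr = 0;
		int i;

		/* the real helper adds the DMA/DMA32/HIGHMEM terms only when configured */
		for (i = 0; i < MAX_NR_ZONES; i++)
			nr += zone_page_state(&zones[i]);
		return nr;
	}

	int main(void)
	{
		struct zone node0[MAX_NR_ZONES] = {
			[ZONE_DMA]     = { .nr_active_file = 10 },
			[ZONE_DMA32]   = { .nr_active_file = 200 },
			[ZONE_NORMAL]  = { .nr_active_file = 3000 },
			[ZONE_MOVABLE] = { .nr_active_file = 0 },
		};

		printf("node 0 active file pages: %lu\n", node_page_state_model(node0));
		return 0;
	}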
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 3c53cac15de1..e4bb1de1d526 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
@@ -5,6 +5,8 @@ BINARIES = compaction_test | |||
5 | BINARIES += hugepage-mmap | 5 | BINARIES += hugepage-mmap |
6 | BINARIES += hugepage-shm | 6 | BINARIES += hugepage-shm |
7 | BINARIES += map_hugetlb | 7 | BINARIES += map_hugetlb |
8 | BINARIES += mlock2-tests | ||
9 | BINARIES += on-fault-limit | ||
8 | BINARIES += thuge-gen | 10 | BINARIES += thuge-gen |
9 | BINARIES += transhuge-stress | 11 | BINARIES += transhuge-stress |
10 | BINARIES += userfaultfd | 12 | BINARIES += userfaultfd |
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c new file mode 100644 index 000000000000..4431994aade2 --- /dev/null +++ b/tools/testing/selftests/vm/mlock2-tests.c | |||
@@ -0,0 +1,736 @@ | |||
1 | #include <sys/mman.h> | ||
2 | #include <stdint.h> | ||
3 | #include <stdio.h> | ||
4 | #include <stdlib.h> | ||
5 | #include <unistd.h> | ||
6 | #include <string.h> | ||
7 | #include <sys/time.h> | ||
8 | #include <sys/resource.h> | ||
9 | #include <syscall.h> | ||
10 | #include <errno.h> | ||
11 | #include <stdbool.h> | ||
12 | |||
13 | #ifndef MLOCK_ONFAULT | ||
14 | #define MLOCK_ONFAULT 1 | ||
15 | #endif | ||
16 | |||
17 | #ifndef MCL_ONFAULT | ||
18 | #define MCL_ONFAULT (MCL_FUTURE << 1) | ||
19 | #endif | ||
20 | |||
21 | static int mlock2_(void *start, size_t len, int flags) | ||
22 | { | ||
23 | #ifdef __NR_mlock2 | ||
24 | return syscall(__NR_mlock2, start, len, flags); | ||
25 | #else | ||
26 | errno = ENOSYS; | ||
27 | return -1; | ||
28 | #endif | ||
29 | } | ||
30 | |||
31 | struct vm_boundaries { | ||
32 | unsigned long start; | ||
33 | unsigned long end; | ||
34 | }; | ||
35 | |||
36 | static int get_vm_area(unsigned long addr, struct vm_boundaries *area) | ||
37 | { | ||
38 | FILE *file; | ||
39 | int ret = 1; | ||
40 | char line[1024] = {0}; | ||
41 | char *end_addr; | ||
42 | char *stop; | ||
43 | unsigned long start; | ||
44 | unsigned long end; | ||
45 | |||
46 | if (!area) | ||
47 | return ret; | ||
48 | |||
49 | file = fopen("/proc/self/maps", "r"); | ||
50 | if (!file) { | ||
51 | perror("fopen"); | ||
52 | return ret; | ||
53 | } | ||
54 | |||
55 | memset(area, 0, sizeof(struct vm_boundaries)); | ||
56 | |||
57 | while(fgets(line, 1024, file)) { | ||
58 | end_addr = strchr(line, '-'); | ||
59 | if (!end_addr) { | ||
60 | printf("cannot parse /proc/self/maps\n"); | ||
61 | goto out; | ||
62 | } | ||
63 | *end_addr = '\0'; | ||
64 | end_addr++; | ||
65 | stop = strchr(end_addr, ' '); | ||
66 | if (!stop) { | ||
67 | printf("cannot parse /proc/self/maps\n"); | ||
68 | goto out; | ||
69 | } | ||
70 | *stop = '\0'; | ||
71 | |||
72 | sscanf(line, "%lx", &start); | ||
73 | sscanf(end_addr, "%lx", &end); | ||
74 | |||
75 | if (start <= addr && end > addr) { | ||
76 | area->start = start; | ||
77 | area->end = end; | ||
78 | ret = 0; | ||
79 | goto out; | ||
80 | } | ||
81 | } | ||
82 | out: | ||
83 | fclose(file); | ||
84 | return ret; | ||
85 | } | ||
86 | |||
87 | static uint64_t get_pageflags(unsigned long addr) | ||
88 | { | ||
89 | FILE *file; | ||
90 | uint64_t pfn; | ||
91 | unsigned long offset; | ||
92 | |||
93 | file = fopen("/proc/self/pagemap", "r"); | ||
94 | if (!file) { | ||
95 | perror("fopen pagemap"); | ||
96 | _exit(1); | ||
97 | } | ||
98 | |||
99 | offset = addr / getpagesize() * sizeof(pfn); | ||
100 | |||
101 | if (fseek(file, offset, SEEK_SET)) { | ||
102 | perror("fseek pagemap"); | ||
103 | _exit(1); | ||
104 | } | ||
105 | |||
106 | if (fread(&pfn, sizeof(pfn), 1, file) != 1) { | ||
107 | perror("fread pagemap"); | ||
108 | _exit(1); | ||
109 | } | ||
110 | |||
111 | fclose(file); | ||
112 | return pfn; | ||
113 | } | ||
114 | |||
115 | static uint64_t get_kpageflags(unsigned long pfn) | ||
116 | { | ||
117 | uint64_t flags; | ||
118 | FILE *file; | ||
119 | |||
120 | file = fopen("/proc/kpageflags", "r"); | ||
121 | if (!file) { | ||
122 | perror("fopen kpageflags"); | ||
123 | _exit(1); | ||
124 | } | ||
125 | |||
126 | if (fseek(file, pfn * sizeof(flags), SEEK_SET)) { | ||
127 | perror("fseek kpageflags"); | ||
128 | _exit(1); | ||
129 | } | ||
130 | |||
131 | if (fread(&flags, sizeof(flags), 1, file) != 1) { | ||
132 | perror("fread kpageflags"); | ||
133 | _exit(1); | ||
134 | } | ||
135 | |||
136 | fclose(file); | ||
137 | return flags; | ||
138 | } | ||
139 | |||
140 | static FILE *seek_to_smaps_entry(unsigned long addr) | ||
141 | { | ||
142 | FILE *file; | ||
143 | char *line = NULL; | ||
144 | size_t size = 0; | ||
145 | unsigned long start, end; | ||
146 | char perms[5]; | ||
147 | unsigned long offset; | ||
148 | char dev[32]; | ||
149 | unsigned long inode; | ||
150 | char path[BUFSIZ]; | ||
151 | |||
152 | file = fopen("/proc/self/smaps", "r"); | ||
153 | if (!file) { | ||
154 | perror("fopen smaps"); | ||
155 | _exit(1); | ||
156 | } | ||
157 | |||
158 | while (getline(&line, &size, file) > 0) { | ||
159 | if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", | ||
160 | &start, &end, perms, &offset, dev, &inode, path) < 6) | ||
161 | goto next; | ||
162 | |||
163 | if (start <= addr && addr < end) | ||
164 | goto out; | ||
165 | |||
166 | next: | ||
167 | free(line); | ||
168 | line = NULL; | ||
169 | size = 0; | ||
170 | } | ||
171 | |||
172 | fclose(file); | ||
173 | file = NULL; | ||
174 | |||
175 | out: | ||
176 | free(line); | ||
177 | return file; | ||
178 | } | ||
179 | |||
180 | #define VMFLAGS "VmFlags:" | ||
181 | |||
182 | static bool is_vmflag_set(unsigned long addr, const char *vmflag) | ||
183 | { | ||
184 | char *line = NULL; | ||
185 | char *flags; | ||
186 | size_t size = 0; | ||
187 | bool ret = false; | ||
188 | FILE *smaps; | ||
189 | |||
190 | smaps = seek_to_smaps_entry(addr); | ||
191 | if (!smaps) { | ||
192 | printf("Unable to parse /proc/self/smaps\n"); | ||
193 | goto out; | ||
194 | } | ||
195 | |||
196 | while (getline(&line, &size, smaps) > 0) { | ||
197 | if (!strstr(line, VMFLAGS)) { | ||
198 | free(line); | ||
199 | line = NULL; | ||
200 | size = 0; | ||
201 | continue; | ||
202 | } | ||
203 | |||
204 | flags = line + strlen(VMFLAGS); | ||
205 | ret = (strstr(flags, vmflag) != NULL); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | out: | ||
210 | free(line); | ||
211 | fclose(smaps); | ||
212 | return ret; | ||
213 | } | ||
214 | |||
215 | #define SIZE "Size:" | ||
216 | #define RSS "Rss:" | ||
217 | #define LOCKED "lo" | ||
218 | |||
219 | static bool is_vma_lock_on_fault(unsigned long addr) | ||
220 | { | ||
221 | bool ret = false; | ||
222 | bool locked; | ||
223 | FILE *smaps = NULL; | ||
224 | unsigned long vma_size, vma_rss; | ||
225 | char *line = NULL; | ||
226 | char *value; | ||
227 | size_t size = 0; | ||
228 | |||
229 | locked = is_vmflag_set(addr, LOCKED); | ||
230 | if (!locked) | ||
231 | goto out; | ||
232 | |||
233 | smaps = seek_to_smaps_entry(addr); | ||
234 | if (!smaps) { | ||
235 | printf("Unable to parse /proc/self/smaps\n"); | ||
236 | goto out; | ||
237 | } | ||
238 | |||
239 | while (getline(&line, &size, smaps) > 0) { | ||
240 | if (!strstr(line, SIZE)) { | ||
241 | free(line); | ||
242 | line = NULL; | ||
243 | size = 0; | ||
244 | continue; | ||
245 | } | ||
246 | |||
247 | value = line + strlen(SIZE); | ||
248 | if (sscanf(value, "%lu kB", &vma_size) < 1) { | ||
249 | printf("Unable to parse smaps entry for Size\n"); | ||
250 | goto out; | ||
251 | } | ||
252 | break; | ||
253 | } | ||
254 | |||
255 | while (getline(&line, &size, smaps) > 0) { | ||
256 | if (!strstr(line, RSS)) { | ||
257 | free(line); | ||
258 | line = NULL; | ||
259 | size = 0; | ||
260 | continue; | ||
261 | } | ||
262 | |||
263 | value = line + strlen(RSS); | ||
264 | if (sscanf(value, "%lu kB", &vma_rss) < 1) { | ||
265 | printf("Unable to parse smaps entry for Rss\n"); | ||
266 | goto out; | ||
267 | } | ||
268 | break; | ||
269 | } | ||
270 | |||
271 | ret = locked && (vma_rss < vma_size); | ||
272 | out: | ||
273 | free(line); | ||
274 | if (smaps) | ||
275 | fclose(smaps); | ||
276 | return ret; | ||
277 | } | ||
278 | |||
279 | #define PRESENT_BIT 0x8000000000000000 | ||
280 | #define PFN_MASK 0x007FFFFFFFFFFFFF | ||
281 | #define UNEVICTABLE_BIT (1UL << 18) | ||
282 | |||
283 | static int lock_check(char *map) | ||
284 | { | ||
285 | unsigned long page_size = getpagesize(); | ||
286 | uint64_t page1_flags, page2_flags; | ||
287 | |||
288 | page1_flags = get_pageflags((unsigned long)map); | ||
289 | page2_flags = get_pageflags((unsigned long)map + page_size); | ||
290 | |||
291 | /* Both pages should be present */ | ||
292 | if (((page1_flags & PRESENT_BIT) == 0) || | ||
293 | ((page2_flags & PRESENT_BIT) == 0)) { | ||
294 | printf("Failed to make both pages present\n"); | ||
295 | return 1; | ||
296 | } | ||
297 | |||
298 | page1_flags = get_kpageflags(page1_flags & PFN_MASK); | ||
299 | page2_flags = get_kpageflags(page2_flags & PFN_MASK); | ||
300 | |||
301 | /* Both pages should be unevictable */ | ||
302 | if (((page1_flags & UNEVICTABLE_BIT) == 0) || | ||
303 | ((page2_flags & UNEVICTABLE_BIT) == 0)) { | ||
304 | printf("Failed to make both pages unevictable\n"); | ||
305 | return 1; | ||
306 | } | ||
307 | |||
308 | if (!is_vmflag_set((unsigned long)map, LOCKED)) { | ||
309 | printf("VMA flag %s is missing on page 1\n", LOCKED); | ||
310 | return 1; | ||
311 | } | ||
312 | |||
313 | if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) { | ||
314 | printf("VMA flag %s is missing on page 2\n", LOCKED); | ||
315 | return 1; | ||
316 | } | ||
317 | |||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | static int unlock_lock_check(char *map) | ||
322 | { | ||
323 | unsigned long page_size = getpagesize(); | ||
324 | uint64_t page1_flags, page2_flags; | ||
325 | |||
326 | page1_flags = get_pageflags((unsigned long)map); | ||
327 | page2_flags = get_pageflags((unsigned long)map + page_size); | ||
328 | page1_flags = get_kpageflags(page1_flags & PFN_MASK); | ||
329 | page2_flags = get_kpageflags(page2_flags & PFN_MASK); | ||
330 | |||
331 | if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) { | ||
332 | printf("A page is still marked unevictable after unlock\n"); | ||
333 | return 1; | ||
334 | } | ||
335 | |||
336 | if (is_vmflag_set((unsigned long)map, LOCKED)) { | ||
337 | printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); | ||
338 | return 1; | ||
339 | } | ||
340 | |||
341 | if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) { | ||
342 | printf("VMA flag %s is present on page 2 after unlock\n", LOCKED); | ||
343 | return 1; | ||
344 | } | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | static int test_mlock_lock() | ||
350 | { | ||
351 | char *map; | ||
352 | int ret = 1; | ||
353 | unsigned long page_size = getpagesize(); | ||
354 | |||
355 | map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, | ||
356 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
357 | if (map == MAP_FAILED) { | ||
358 | perror("test_mlock_locked mmap"); | ||
359 | goto out; | ||
360 | } | ||
361 | |||
362 | if (mlock2_(map, 2 * page_size, 0)) { | ||
363 | if (errno == ENOSYS) { | ||
364 | printf("Cannot call new mlock family, skipping test\n"); | ||
365 | _exit(0); | ||
366 | } | ||
367 | perror("mlock2(0)"); | ||
368 | goto unmap; | ||
369 | } | ||
370 | |||
371 | if (lock_check(map)) | ||
372 | goto unmap; | ||
373 | |||
374 | /* Now unlock and recheck attributes */ | ||
375 | if (munlock(map, 2 * page_size)) { | ||
376 | perror("munlock()"); | ||
377 | goto unmap; | ||
378 | } | ||
379 | |||
380 | ret = unlock_lock_check(map); | ||
381 | |||
382 | unmap: | ||
383 | munmap(map, 2 * page_size); | ||
384 | out: | ||
385 | return ret; | ||
386 | } | ||
387 | |||
388 | static int onfault_check(char *map) | ||
389 | { | ||
390 | unsigned long page_size = getpagesize(); | ||
391 | uint64_t page1_flags, page2_flags; | ||
392 | |||
393 | page1_flags = get_pageflags((unsigned long)map); | ||
394 | page2_flags = get_pageflags((unsigned long)map + page_size); | ||
395 | |||
396 | /* Neither page should be present */ | ||
397 | if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) { | ||
398 | printf("Pages were made present by MLOCK_ONFAULT\n"); | ||
399 | return 1; | ||
400 | } | ||
401 | |||
402 | *map = 'a'; | ||
403 | page1_flags = get_pageflags((unsigned long)map); | ||
404 | page2_flags = get_pageflags((unsigned long)map + page_size); | ||
405 | |||
406 | /* Only page 1 should be present */ | ||
407 | if ((page1_flags & PRESENT_BIT) == 0) { | ||
408 | printf("Page 1 is not present after fault\n"); | ||
409 | return 1; | ||
410 | } else if (page2_flags & PRESENT_BIT) { | ||
411 | printf("Page 2 was made present\n"); | ||
412 | return 1; | ||
413 | } | ||
414 | |||
415 | page1_flags = get_kpageflags(page1_flags & PFN_MASK); | ||
416 | |||
417 | /* Page 1 should be unevictable */ | ||
418 | if ((page1_flags & UNEVICTABLE_BIT) == 0) { | ||
419 | printf("Failed to make faulted page unevictable\n"); | ||
420 | return 1; | ||
421 | } | ||
422 | |||
423 | if (!is_vma_lock_on_fault((unsigned long)map)) { | ||
424 | printf("VMA is not marked for lock on fault\n"); | ||
425 | return 1; | ||
426 | } | ||
427 | |||
428 | if (!is_vma_lock_on_fault((unsigned long)map + page_size)) { | ||
429 | printf("VMA is not marked for lock on fault\n"); | ||
430 | return 1; | ||
431 | } | ||
432 | |||
433 | return 0; | ||
434 | } | ||
435 | |||
436 | static int unlock_onfault_check(char *map) | ||
437 | { | ||
438 | unsigned long page_size = getpagesize(); | ||
439 | uint64_t page1_flags; | ||
440 | |||
441 | page1_flags = get_pageflags((unsigned long)map); | ||
442 | page1_flags = get_kpageflags(page1_flags & PFN_MASK); | ||
443 | |||
444 | if (page1_flags & UNEVICTABLE_BIT) { | ||
445 | printf("Page 1 is still marked unevictable after unlock\n"); | ||
446 | return 1; | ||
447 | } | ||
448 | |||
449 | if (is_vma_lock_on_fault((unsigned long)map) || | ||
450 | is_vma_lock_on_fault((unsigned long)map + page_size)) { | ||
451 | printf("VMA is still lock on fault after unlock\n"); | ||
452 | return 1; | ||
453 | } | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | static int test_mlock_onfault() | ||
459 | { | ||
460 | char *map; | ||
461 | int ret = 1; | ||
462 | unsigned long page_size = getpagesize(); | ||
463 | |||
464 | map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, | ||
465 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
466 | if (map == MAP_FAILED) { | ||
467 | perror("test_mlock_locked mmap"); | ||
468 | goto out; | ||
469 | } | ||
470 | |||
471 | if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { | ||
472 | if (errno == ENOSYS) { | ||
473 | printf("Cannot call new mlock family, skipping test\n"); | ||
474 | _exit(0); | ||
475 | } | ||
476 | perror("mlock2(MLOCK_ONFAULT)"); | ||
477 | goto unmap; | ||
478 | } | ||
479 | |||
480 | if (onfault_check(map)) | ||
481 | goto unmap; | ||
482 | |||
483 | /* Now unlock and recheck attributes */ | ||
484 | if (munlock(map, 2 * page_size)) { | ||
485 | if (errno == ENOSYS) { | ||
486 | printf("Cannot call new mlock family, skipping test\n"); | ||
487 | _exit(0); | ||
488 | } | ||
489 | perror("munlock()"); | ||
490 | goto unmap; | ||
491 | } | ||
492 | |||
493 | ret = unlock_onfault_check(map); | ||
494 | unmap: | ||
495 | munmap(map, 2 * page_size); | ||
496 | out: | ||
497 | return ret; | ||
498 | } | ||
499 | |||
500 | static int test_lock_onfault_of_present() | ||
501 | { | ||
502 | char *map; | ||
503 | int ret = 1; | ||
504 | unsigned long page_size = getpagesize(); | ||
505 | uint64_t page1_flags, page2_flags; | ||
506 | |||
507 | map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, | ||
508 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
509 | if (map == MAP_FAILED) { | ||
510 | perror("test_mlock_locked mmap"); | ||
511 | goto out; | ||
512 | } | ||
513 | |||
514 | *map = 'a'; | ||
515 | |||
516 | if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { | ||
517 | if (errno == ENOSYS) { | ||
518 | printf("Cannot call new mlock family, skipping test\n"); | ||
519 | _exit(0); | ||
520 | } | ||
521 | perror("mlock2(MLOCK_ONFAULT)"); | ||
522 | goto unmap; | ||
523 | } | ||
524 | |||
525 | page1_flags = get_pageflags((unsigned long)map); | ||
526 | page2_flags = get_pageflags((unsigned long)map + page_size); | ||
527 | page1_flags = get_kpageflags(page1_flags & PFN_MASK); | ||
528 | page2_flags = get_kpageflags(page2_flags & PFN_MASK); | ||
529 | |||
530 | /* Page 1 should be unevictable */ | ||
531 | if ((page1_flags & UNEVICTABLE_BIT) == 0) { | ||
532 | printf("Failed to make present page unevictable\n"); | ||
533 | goto unmap; | ||
534 | } | ||
535 | |||
536 | if (!is_vma_lock_on_fault((unsigned long)map) || | ||
537 | !is_vma_lock_on_fault((unsigned long)map + page_size)) { | ||
538 | printf("VMA with present pages is not marked lock on fault\n"); | ||
539 | goto unmap; | ||
540 | } | ||
541 | ret = 0; | ||
542 | unmap: | ||
543 | munmap(map, 2 * page_size); | ||
544 | out: | ||
545 | return ret; | ||
546 | } | ||
547 | |||
548 | static int test_munlockall() | ||
549 | { | ||
550 | char *map; | ||
551 | int ret = 1; | ||
552 | unsigned long page_size = getpagesize(); | ||
553 | |||
554 | map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, | ||
555 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
556 | |||
557 | if (map == MAP_FAILED) { | ||
558 | perror("test_munlockall mmap"); | ||
559 | goto out; | ||
560 | } | ||
561 | |||
562 | if (mlockall(MCL_CURRENT)) { | ||
563 | perror("mlockall(MCL_CURRENT)"); | ||
564 | goto out; | ||
565 | } | ||
566 | |||
567 | if (lock_check(map)) | ||
568 | goto unmap; | ||
569 | |||
570 | if (munlockall()) { | ||
571 | perror("munlockall()"); | ||
572 | goto unmap; | ||
573 | } | ||
574 | |||
575 | if (unlock_lock_check(map)) | ||
576 | goto unmap; | ||
577 | |||
578 | munmap(map, 2 * page_size); | ||
579 | |||
580 | map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, | ||
581 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
582 | |||
583 | if (map == MAP_FAILED) { | ||
584 | perror("test_munlockall second mmap"); | ||
585 | goto out; | ||
586 | } | ||
587 | |||
588 | if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { | ||
589 | perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); | ||
590 | goto unmap; | ||
591 | } | ||
592 | |||
593 | if (onfault_check(map)) | ||
594 | goto unmap; | ||
595 | |||
596 | if (munlockall()) { | ||
597 | perror("munlockall()"); | ||
598 | goto unmap; | ||
599 | } | ||
600 | |||
601 | if (unlock_onfault_check(map)) | ||
602 | goto unmap; | ||
603 | |||
604 | if (mlockall(MCL_CURRENT | MCL_FUTURE)) { | ||
605 | perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); | ||
606 | goto out; | ||
607 | } | ||
608 | |||
609 | if (lock_check(map)) | ||
610 | goto unmap; | ||
611 | |||
612 | if (munlockall()) { | ||
613 | perror("munlockall()"); | ||
614 | goto unmap; | ||
615 | } | ||
616 | |||
617 | ret = unlock_lock_check(map); | ||
618 | |||
619 | unmap: | ||
620 | munmap(map, 2 * page_size); | ||
621 | out: | ||
622 | munlockall(); | ||
623 | return ret; | ||
624 | } | ||
625 | |||
626 | static int test_vma_management(bool call_mlock) | ||
627 | { | ||
628 | int ret = 1; | ||
629 | void *map; | ||
630 | unsigned long page_size = getpagesize(); | ||
631 | struct vm_boundaries page1; | ||
632 | struct vm_boundaries page2; | ||
633 | struct vm_boundaries page3; | ||
634 | |||
635 | map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, | ||
636 | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); | ||
637 | if (map == MAP_FAILED) { | ||
638 | perror("mmap()"); | ||
639 | return ret; | ||
640 | } | ||
641 | |||
642 | if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { | ||
643 | if (errno == ENOSYS) { | ||
644 | printf("Cannot call new mlock family, skipping test\n"); | ||
645 | _exit(0); | ||
646 | } | ||
647 | perror("mlock(ONFAULT)\n"); | ||
648 | goto out; | ||
649 | } | ||
650 | |||
651 | if (get_vm_area((unsigned long)map, &page1) || | ||
652 | get_vm_area((unsigned long)map + page_size, &page2) || | ||
653 | get_vm_area((unsigned long)map + page_size * 2, &page3)) { | ||
654 | printf("couldn't find mapping in /proc/self/maps\n"); | ||
655 | goto out; | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * Before we unlock a portion, we need to know that all three pages are | ||
660 | * in the same VMA. If they are not, we abort this test (note that this | ||
661 | * is not a failure). | ||
662 | */ | ||
663 | if (page1.start != page2.start || page2.start != page3.start) { | ||
664 | printf("VMAs are not merged to start, aborting test\n"); | ||
665 | ret = 0; | ||
666 | goto out; | ||
667 | } | ||
668 | |||
669 | if (munlock(map + page_size, page_size)) { | ||
670 | perror("munlock()"); | ||
671 | goto out; | ||
672 | } | ||
673 | |||
674 | if (get_vm_area((unsigned long)map, &page1) || | ||
675 | get_vm_area((unsigned long)map + page_size, &page2) || | ||
676 | get_vm_area((unsigned long)map + page_size * 2, &page3)) { | ||
677 | printf("couldn't find mapping in /proc/self/maps\n"); | ||
678 | goto out; | ||
679 | } | ||
680 | |||
681 | /* All three VMAs should be different */ | ||
682 | if (page1.start == page2.start || page2.start == page3.start) { | ||
683 | printf("failed to split VMA for munlock\n"); | ||
684 | goto out; | ||
685 | } | ||
686 | |||
687 | /* Now unlock the first and third page and check the VMAs again */ | ||
688 | if (munlock(map, page_size * 3)) { | ||
689 | perror("munlock()"); | ||
690 | goto out; | ||
691 | } | ||
692 | |||
693 | if (get_vm_area((unsigned long)map, &page1) || | ||
694 | get_vm_area((unsigned long)map + page_size, &page2) || | ||
695 | get_vm_area((unsigned long)map + page_size * 2, &page3)) { | ||
696 | printf("couldn't find mapping in /proc/self/maps\n"); | ||
697 | goto out; | ||
698 | } | ||
699 | |||
700 | /* Now all three VMAs should be the same */ | ||
701 | if (page1.start != page2.start || page2.start != page3.start) { | ||
702 | printf("failed to merge VMAs after munlock\n"); | ||
703 | goto out; | ||
704 | } | ||
705 | |||
706 | ret = 0; | ||
707 | out: | ||
708 | munmap(map, 3 * page_size); | ||
709 | return ret; | ||
710 | } | ||
711 | |||
712 | static int test_mlockall(int (test_function)(bool call_mlock)) | ||
713 | { | ||
714 | int ret = 1; | ||
715 | |||
716 | if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { | ||
717 | perror("mlockall"); | ||
718 | return ret; | ||
719 | } | ||
720 | |||
721 | ret = test_function(false); | ||
722 | munlockall(); | ||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | int main(int argc, char **argv) | ||
727 | { | ||
728 | int ret = 0; | ||
729 | ret += test_mlock_lock(); | ||
730 | ret += test_mlock_onfault(); | ||
731 | ret += test_munlockall(); | ||
732 | ret += test_lock_onfault_of_present(); | ||
733 | ret += test_vma_management(true); | ||
734 | ret += test_mlockall(test_vma_management); | ||
735 | return ret; | ||
736 | } | ||
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c new file mode 100644 index 000000000000..245acccce42d --- /dev/null +++ b/tools/testing/selftests/vm/on-fault-limit.c | |||
@@ -0,0 +1,47 @@ | |||
1 | #include <sys/mman.h> | ||
2 | #include <stdio.h> | ||
3 | #include <unistd.h> | ||
4 | #include <string.h> | ||
5 | #include <sys/time.h> | ||
6 | #include <sys/resource.h> | ||
7 | |||
8 | #ifndef MCL_ONFAULT | ||
9 | #define MCL_ONFAULT (MCL_FUTURE << 1) | ||
10 | #endif | ||
11 | |||
12 | static int test_limit(void) | ||
13 | { | ||
14 | int ret = 1; | ||
15 | struct rlimit lims; | ||
16 | void *map; | ||
17 | |||
18 | if (getrlimit(RLIMIT_MEMLOCK, &lims)) { | ||
19 | perror("getrlimit"); | ||
20 | return ret; | ||
21 | } | ||
22 | |||
23 | if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { | ||
24 | perror("mlockall"); | ||
25 | return ret; | ||
26 | } | ||
27 | |||
28 | map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, | ||
29 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0); | ||
30 | if (map != MAP_FAILED) { | ||
31 | printf("mmap should have failed, but didn't\n"); | ||
32 | munmap(map, 2 * lims.rlim_max); | ||
33 | } else { | ||
34 | ret = 0; | ||
35 | } | ||
36 | |||
37 | munlockall(); | ||
38 | return ret; | ||
39 | } | ||
40 | |||
41 | int main(int argc, char **argv) | ||
42 | { | ||
43 | int ret = 0; | ||
44 | |||
45 | ret += test_limit(); | ||
46 | return ret; | ||
47 | } | ||
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 9179ce8df485..2df21b3bb26d 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests | |||
@@ -106,4 +106,26 @@ else | |||
106 | echo "[PASS]" | 106 | echo "[PASS]" |
107 | fi | 107 | fi |
108 | 108 | ||
109 | echo "--------------------" | ||
110 | echo "running on-fault-limit" | ||
111 | echo "--------------------" | ||
112 | sudo -u nobody ./on-fault-limit | ||
113 | if [ $? -ne 0 ]; then | ||
114 | echo "[FAIL]" | ||
115 | exitcode=1 | ||
116 | else | ||
117 | echo "[PASS]" | ||
118 | fi | ||
119 | |||
120 | echo "--------------------" | ||
121 | echo "running mlock2-tests" | ||
122 | echo "--------------------" | ||
123 | ./mlock2-tests | ||
124 | if [ $? -ne 0 ]; then | ||
125 | echo "[FAIL]" | ||
126 | exitcode=1 | ||
127 | else | ||
128 | echo "[PASS]" | ||
129 | fi | ||
130 | |||
109 | exit $exitcode | 131 | exit $exitcode |
diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh new file mode 100644 index 000000000000..35b039864b77 --- /dev/null +++ b/tools/vm/slabinfo-gnuplot.sh | |||
@@ -0,0 +1,275 @@ | |||
1 | #!/bin/sh | ||
2 | |||
3 | # Sergey Senozhatsky, 2015 | ||
4 | # sergey.senozhatsky.work@gmail.com | ||
5 | # | ||
6 | # This software is licensed under the terms of the GNU General Public | ||
7 | # License version 2, as published by the Free Software Foundation, and | ||
8 | # may be copied, distributed, and modified under those terms. | ||
9 | # | ||
10 | # This program is distributed in the hope that it will be useful, | ||
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | # GNU General Public License for more details. | ||
14 | |||
15 | |||
16 | # This program is intended to plot `slabinfo -X' stats collected, | ||
17 | # for example, using the following command: | ||
18 | # while [ 1 ]; do slabinfo -X >> stats; sleep 1; done | ||
19 | # | ||
20 | # Use `slabinfo-gnuplot.sh stats' to pre-process collected records | ||
21 | # and generate graphs (totals, slabs sorted by size, slabs sorted | ||
22 | # by loss). | ||
23 | # | ||
24 | # Graphs can be [individually] regenerated with different ranges and | ||
25 | # sizes (-r %d,%d and -s %d,%d options). | ||
26 | # | ||
27 | # To visually compare N `totals' graphs, do | ||
28 | # slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals | ||
29 | # | ||
30 | |||
31 | min_slab_name_size=11 | ||
32 | xmin=0 | ||
33 | xmax=0 | ||
34 | width=1500 | ||
35 | height=700 | ||
36 | mode=preprocess | ||
37 | |||
38 | usage() | ||
39 | { | ||
40 | echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]" | ||
41 | echo "FILEs must contain 'slabinfo -X' samples" | ||
42 | echo "-t - plot totals for FILE(s)" | ||
43 | echo "-l - plot slabs stats for FILE(s)" | ||
44 | echo "-s %d,%d - set image width and height" | ||
45 | echo "-r %d,%d - use data samples from a given range" | ||
46 | } | ||
47 | |||
48 | check_file_exist() | ||
49 | { | ||
50 | if [ ! -f "$1" ]; then | ||
51 | echo "File '$1' does not exist" | ||
52 | exit 1 | ||
53 | fi | ||
54 | } | ||
55 | |||
56 | do_slabs_plotting() | ||
57 | { | ||
58 | local file=$1 | ||
59 | local out_file | ||
60 | local range="every ::$xmin" | ||
61 | local xtic="" | ||
62 | local xtic_rotate="norotate" | ||
63 | local lines=2000000 | ||
64 | local wc_lines | ||
65 | |||
66 | check_file_exist "$file" | ||
67 | |||
68 | out_file=`basename "$file"` | ||
69 | if [ $xmax -ne 0 ]; then | ||
70 | range="$range::$xmax" | ||
71 | lines=$((xmax-xmin)) | ||
72 | fi | ||
73 | |||
74 | wc_lines=`cat "$file" | wc -l` | ||
75 | if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then | ||
76 | wc_lines=$lines | ||
77 | fi | ||
78 | |||
79 | if [ "$wc_lines" -lt "$lines" ]; then | ||
80 | lines=$wc_lines | ||
81 | fi | ||
82 | |||
83 | if [ $((width / lines)) -gt $min_slab_name_size ]; then | ||
84 | xtic=":xtic(1)" | ||
85 | xtic_rotate=90 | ||
86 | fi | ||
87 | |||
88 | gnuplot -p << EOF | ||
89 | #!/usr/bin/env gnuplot | ||
90 | |||
91 | set terminal png enhanced size $width,$height large | ||
92 | set output '$out_file.png' | ||
93 | set autoscale xy | ||
94 | set xlabel 'samples' | ||
95 | set ylabel 'bytes' | ||
96 | set style histogram columnstacked title textcolor lt -1 | ||
97 | set style fill solid 0.15 | ||
98 | set xtics rotate $xtic_rotate | ||
99 | set key left above Left title reverse | ||
100 | |||
101 | plot "$file" $range u 2$xtic title 'SIZE' with boxes,\ | ||
102 | '' $range u 3 title 'LOSS' with boxes | ||
103 | EOF | ||
104 | |||
105 | if [ $? -eq 0 ]; then | ||
106 | echo "$out_file.png" | ||
107 | fi | ||
108 | } | ||
109 | |||
110 | do_totals_plotting() | ||
111 | { | ||
112 | local gnuplot_cmd="" | ||
113 | local range="every ::$xmin" | ||
114 | local file="" | ||
115 | |||
116 | if [ $xmax -ne 0 ]; then | ||
117 | range="$range::$xmax" | ||
118 | fi | ||
119 | |||
120 | for i in "${t_files[@]}"; do | ||
121 | check_file_exist "$i" | ||
122 | |||
123 | file="$file"`basename "$i"` | ||
124 | gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ | ||
125 | '$i Memory usage' with lines," | ||
126 | gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ | ||
127 | '$i Loss' with lines," | ||
128 | done | ||
129 | |||
130 | gnuplot -p << EOF | ||
131 | #!/usr/bin/env gnuplot | ||
132 | |||
133 | set terminal png enhanced size $width,$height large | ||
134 | set autoscale xy | ||
135 | set output '$file.png' | ||
136 | set xlabel 'samples' | ||
137 | set ylabel 'bytes' | ||
138 | set key left above Left title reverse | ||
139 | |||
140 | plot $gnuplot_cmd | ||
141 | EOF | ||
142 | |||
143 | if [ $? -eq 0 ]; then | ||
144 | echo "$file.png" | ||
145 | fi | ||
146 | } | ||
147 | |||
148 | do_preprocess() | ||
149 | { | ||
150 | local out | ||
151 | local lines | ||
152 | local in=$1 | ||
153 | |||
154 | check_file_exist "$in" | ||
155 | |||
156 | # use only the 'TOP' slabs (biggest memory usage or loss) | ||
157 | let lines=3 | ||
158 | out=`basename "$in"`"-slabs-by-loss" | ||
159 | `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ | ||
160 | egrep -iv '\-\-|Name|Slabs'\ | ||
161 | | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` | ||
162 | if [ $? -eq 0 ]; then | ||
163 | do_slabs_plotting "$out" | ||
164 | fi | ||
165 | |||
166 | let lines=3 | ||
167 | out=`basename "$in"`"-slabs-by-size" | ||
168 | `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ | ||
169 | egrep -iv '\-\-|Name|Slabs'\ | ||
170 | | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` | ||
171 | if [ $? -eq 0 ]; then | ||
172 | do_slabs_plotting "$out" | ||
173 | fi | ||
174 | |||
175 | out=`basename "$in"`"-totals" | ||
176 | `cat "$in" | grep "Memory used" |\ | ||
177 | awk '{print $3" "$7}' > "$out"` | ||
178 | if [ $? -eq 0 ]; then | ||
179 | t_files[0]=$out | ||
180 | do_totals_plotting | ||
181 | fi | ||
182 | } | ||
183 | |||
184 | parse_opts() | ||
185 | { | ||
186 | local opt | ||
187 | |||
188 | while getopts "tlr::s::h" opt; do | ||
189 | case $opt in | ||
190 | t) | ||
191 | mode=totals | ||
192 | ;; | ||
193 | l) | ||
194 | mode=slabs | ||
195 | ;; | ||
196 | s) | ||
197 | array=(${OPTARG//,/ }) | ||
198 | width=${array[0]} | ||
199 | height=${array[1]} | ||
200 | ;; | ||
201 | r) | ||
202 | array=(${OPTARG//,/ }) | ||
203 | xmin=${array[0]} | ||
204 | xmax=${array[1]} | ||
205 | ;; | ||
206 | h) | ||
207 | usage | ||
208 | exit 0 | ||
209 | ;; | ||
210 | \?) | ||
211 | echo "Invalid option: -$OPTARG" >&2 | ||
212 | exit 1 | ||
213 | ;; | ||
214 | :) | ||
215 | echo "-$OPTARG requires an argument." >&2 | ||
216 | exit 1 | ||
217 | ;; | ||
218 | esac | ||
219 | done | ||
220 | |||
221 | return $OPTIND | ||
222 | } | ||
223 | |||
224 | parse_args() | ||
225 | { | ||
226 | local idx=0 | ||
227 | local p | ||
228 | |||
229 | for p in "$@"; do | ||
230 | case $mode in | ||
231 | preprocess) | ||
232 | files[$idx]=$p | ||
233 | idx=$((idx+1)) | ||
234 | ;; | ||
235 | totals) | ||
236 | t_files[$idx]=$p | ||
237 | idx=$((idx+1)) | ||
238 | ;; | ||
239 | slabs) | ||
240 | files[$idx]=$p | ||
241 | idx=$((idx+1)) | ||
242 | ;; | ||
243 | esac | ||
244 | done | ||
245 | } | ||
246 | |||
247 | parse_opts "$@" | ||
248 | argstart=$? | ||
249 | parse_args "${@:$argstart}" | ||
250 | |||
251 | if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then | ||
252 | usage | ||
253 | exit 1 | ||
254 | fi | ||
255 | |||
256 | case $mode in | ||
257 | preprocess) | ||
258 | for i in "${files[@]}"; do | ||
259 | do_preprocess "$i" | ||
260 | done | ||
261 | ;; | ||
262 | totals) | ||
263 | do_totals_plotting | ||
264 | ;; | ||
265 | slabs) | ||
266 | for i in "${files[@]}"; do | ||
267 | do_slabs_plotting "$i" | ||
268 | done | ||
269 | ;; | ||
270 | *) | ||
271 | echo "Unknown mode $mode" >&2 | ||
272 | usage | ||
273 | exit 1 | ||
274 | ;; | ||
275 | esac | ||
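For reference, an end-to-end session with the new script could look like the sketch below (a minimal sketch; the `stats' file name, the sample range and the run1/run2 files are only illustrative):

    # on the target machine, collect one `slabinfo -X' sample per second
    while [ 1 ]; do slabinfo -X >> stats; sleep 1; done

    # pre-process the samples and render the totals/by-size/by-loss graphs
    slabinfo-gnuplot.sh stats

    # re-plot only samples 100..400 of the totals data at 1024x768
    slabinfo-gnuplot.sh -s 1024,768 -r 100,400 -t stats-totals

    # visually compare the totals of two separate collection runs
    slabinfo-gnuplot.sh -t run1-totals run2-totals

The pre-processing pass writes three derived files next to the input (`stats-totals', `stats-slabs-by-size', `stats-slabs-by-loss'); the -t and -l modes then operate on those derived files rather than on the raw samples.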
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 808d5a9d5dcf..86e698d07e20 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c | |||
@@ -53,39 +53,43 @@ struct aliasinfo { | |||
53 | struct slabinfo *slab; | 53 | struct slabinfo *slab; |
54 | } aliasinfo[MAX_ALIASES]; | 54 | } aliasinfo[MAX_ALIASES]; |
55 | 55 | ||
56 | int slabs = 0; | 56 | int slabs; |
57 | int actual_slabs = 0; | 57 | int actual_slabs; |
58 | int aliases = 0; | 58 | int aliases; |
59 | int alias_targets = 0; | 59 | int alias_targets; |
60 | int highest_node = 0; | 60 | int highest_node; |
61 | 61 | ||
62 | char buffer[4096]; | 62 | char buffer[4096]; |
63 | 63 | ||
64 | int show_empty = 0; | 64 | int show_empty; |
65 | int show_report = 0; | 65 | int show_report; |
66 | int show_alias = 0; | 66 | int show_alias; |
67 | int show_slab = 0; | 67 | int show_slab; |
68 | int skip_zero = 1; | 68 | int skip_zero = 1; |
69 | int show_numa = 0; | 69 | int show_numa; |
70 | int show_track = 0; | 70 | int show_track; |
71 | int show_first_alias = 0; | 71 | int show_first_alias; |
72 | int validate = 0; | 72 | int validate; |
73 | int shrink = 0; | 73 | int shrink; |
74 | int show_inverted = 0; | 74 | int show_inverted; |
75 | int show_single_ref = 0; | 75 | int show_single_ref; |
76 | int show_totals = 0; | 76 | int show_totals; |
77 | int sort_size = 0; | 77 | int sort_size; |
78 | int sort_active = 0; | 78 | int sort_active; |
79 | int set_debug = 0; | 79 | int set_debug; |
80 | int show_ops = 0; | 80 | int show_ops; |
81 | int show_activity = 0; | 81 | int show_activity; |
82 | int output_lines = -1; | ||
83 | int sort_loss; | ||
84 | int extended_totals; | ||
85 | int show_bytes; | ||
82 | 86 | ||
83 | /* Debug options */ | 87 | /* Debug options */ |
84 | int sanity = 0; | 88 | int sanity; |
85 | int redzone = 0; | 89 | int redzone; |
86 | int poison = 0; | 90 | int poison; |
87 | int tracking = 0; | 91 | int tracking; |
88 | int tracing = 0; | 92 | int tracing; |
89 | 93 | ||
90 | int page_size; | 94 | int page_size; |
91 | 95 | ||
@@ -124,6 +128,10 @@ static void usage(void) | |||
124 | "-v|--validate Validate slabs\n" | 128 | "-v|--validate Validate slabs\n" |
125 | "-z|--zero Include empty slabs\n" | 129 | "-z|--zero Include empty slabs\n" |
126 | "-1|--1ref Single reference\n" | 130 | "-1|--1ref Single reference\n" |
131 | "-N|--lines=K Show the first K slabs\n" | ||
132 | "-L|--Loss Sort by loss\n" | ||
133 | "-X|--Xtotals Show extended summary information\n" | ||
134 | "-B|--Bytes Show size in bytes\n" | ||
127 | "\nValid debug options (FZPUT may be combined)\n" | 135 | "\nValid debug options (FZPUT may be combined)\n" |
128 | "a / A Switch on all debug options (=FZUP)\n" | 136 | "a / A Switch on all debug options (=FZUP)\n" |
129 | "- Switch off all debug options\n" | 137 | "- Switch off all debug options\n" |
@@ -225,15 +233,17 @@ static int store_size(char *buffer, unsigned long value) | |||
225 | char trailer = 0; | 233 | char trailer = 0; |
226 | int n; | 234 | int n; |
227 | 235 | ||
228 | if (value > 1000000000UL) { | 236 | if (!show_bytes) { |
229 | divisor = 100000000UL; | 237 | if (value > 1000000000UL) { |
230 | trailer = 'G'; | 238 | divisor = 100000000UL; |
231 | } else if (value > 1000000UL) { | 239 | trailer = 'G'; |
232 | divisor = 100000UL; | 240 | } else if (value > 1000000UL) { |
233 | trailer = 'M'; | 241 | divisor = 100000UL; |
234 | } else if (value > 1000UL) { | 242 | trailer = 'M'; |
235 | divisor = 100; | 243 | } else if (value > 1000UL) { |
236 | trailer = 'K'; | 244 | divisor = 100; |
245 | trailer = 'K'; | ||
246 | } | ||
237 | } | 247 | } |
238 | 248 | ||
239 | value /= divisor; | 249 | value /= divisor; |
@@ -297,10 +307,12 @@ int line = 0; | |||
297 | static void first_line(void) | 307 | static void first_line(void) |
298 | { | 308 | { |
299 | if (show_activity) | 309 | if (show_activity) |
300 | printf("Name Objects Alloc Free %%Fast Fallb O CmpX UL\n"); | 310 | printf("Name Objects Alloc Free" |
311 | " %%Fast Fallb O CmpX UL\n"); | ||
301 | else | 312 | else |
302 | printf("Name Objects Objsize Space " | 313 | printf("Name Objects Objsize %s " |
303 | "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); | 314 | "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", |
315 | sort_loss ? " Loss" : "Space"); | ||
304 | } | 316 | } |
305 | 317 | ||
306 | /* | 318 | /* |
@@ -333,6 +345,11 @@ static unsigned long slab_activity(struct slabinfo *s) | |||
333 | s->alloc_slowpath + s->free_slowpath; | 345 | s->alloc_slowpath + s->free_slowpath; |
334 | } | 346 | } |
335 | 347 | ||
348 | static unsigned long slab_waste(struct slabinfo *s) | ||
349 | { | ||
350 | return slab_size(s) - s->objects * s->object_size; | ||
351 | } | ||
352 | |||
336 | static void slab_numa(struct slabinfo *s, int mode) | 353 | static void slab_numa(struct slabinfo *s, int mode) |
337 | { | 354 | { |
338 | int node; | 355 | int node; |
@@ -504,7 +521,7 @@ static void report(struct slabinfo *s) | |||
504 | if (strcmp(s->name, "*") == 0) | 521 | if (strcmp(s->name, "*") == 0) |
505 | return; | 522 | return; |
506 | 523 | ||
507 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", | 524 | printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", |
508 | s->name, s->aliases, s->order, s->objects); | 525 | s->name, s->aliases, s->order, s->objects); |
509 | if (s->hwcache_align) | 526 | if (s->hwcache_align) |
510 | printf("** Hardware cacheline aligned\n"); | 527 | printf("** Hardware cacheline aligned\n"); |
@@ -561,7 +578,10 @@ static void slabcache(struct slabinfo *s) | |||
561 | if (show_empty && s->slabs) | 578 | if (show_empty && s->slabs) |
562 | return; | 579 | return; |
563 | 580 | ||
564 | store_size(size_str, slab_size(s)); | 581 | if (sort_loss == 0) |
582 | store_size(size_str, slab_size(s)); | ||
583 | else | ||
584 | store_size(size_str, slab_waste(s)); | ||
565 | snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, | 585 | snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, |
566 | s->partial, s->cpu_slabs); | 586 | s->partial, s->cpu_slabs); |
567 | 587 | ||
@@ -602,15 +622,15 @@ static void slabcache(struct slabinfo *s) | |||
602 | total_free ? (s->free_fastpath * 100 / total_free) : 0, | 622 | total_free ? (s->free_fastpath * 100 / total_free) : 0, |
603 | s->order_fallback, s->order, s->cmpxchg_double_fail, | 623 | s->order_fallback, s->order, s->cmpxchg_double_fail, |
604 | s->cmpxchg_double_cpu_fail); | 624 | s->cmpxchg_double_cpu_fail); |
605 | } | 625 | } else { |
606 | else | 626 | printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", |
607 | printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", | ||
608 | s->name, s->objects, s->object_size, size_str, dist_str, | 627 | s->name, s->objects, s->object_size, size_str, dist_str, |
609 | s->objs_per_slab, s->order, | 628 | s->objs_per_slab, s->order, |
610 | s->slabs ? (s->partial * 100) / s->slabs : 100, | 629 | s->slabs ? (s->partial * 100) / s->slabs : 100, |
611 | s->slabs ? (s->objects * s->object_size * 100) / | 630 | s->slabs ? (s->objects * s->object_size * 100) / |
612 | (s->slabs * (page_size << s->order)) : 100, | 631 | (s->slabs * (page_size << s->order)) : 100, |
613 | flags); | 632 | flags); |
633 | } | ||
614 | } | 634 | } |
615 | 635 | ||
616 | /* | 636 | /* |
@@ -918,84 +938,88 @@ static void totals(void) | |||
918 | 938 | ||
919 | printf("Slabcache Totals\n"); | 939 | printf("Slabcache Totals\n"); |
920 | printf("----------------\n"); | 940 | printf("----------------\n"); |
921 | printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n", | 941 | printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", |
922 | slabs, aliases, alias_targets, used_slabs); | 942 | slabs, aliases, alias_targets, used_slabs); |
923 | 943 | ||
924 | store_size(b1, total_size);store_size(b2, total_waste); | 944 | store_size(b1, total_size);store_size(b2, total_waste); |
925 | store_size(b3, total_waste * 100 / total_used); | 945 | store_size(b3, total_waste * 100 / total_used); |
926 | printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3); | 946 | printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); |
927 | 947 | ||
928 | store_size(b1, total_objects);store_size(b2, total_partobj); | 948 | store_size(b1, total_objects);store_size(b2, total_partobj); |
929 | store_size(b3, total_partobj * 100 / total_objects); | 949 | store_size(b3, total_partobj * 100 / total_objects); |
930 | printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3); | 950 | printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); |
931 | 951 | ||
932 | printf("\n"); | 952 | printf("\n"); |
933 | printf("Per Cache Average Min Max Total\n"); | 953 | printf("Per Cache Average " |
934 | printf("---------------------------------------------------------\n"); | 954 | "Min Max Total\n"); |
955 | printf("---------------------------------------" | ||
956 | "-------------------------------------\n"); | ||
935 | 957 | ||
936 | store_size(b1, avg_objects);store_size(b2, min_objects); | 958 | store_size(b1, avg_objects);store_size(b2, min_objects); |
937 | store_size(b3, max_objects);store_size(b4, total_objects); | 959 | store_size(b3, max_objects);store_size(b4, total_objects); |
938 | printf("#Objects %10s %10s %10s %10s\n", | 960 | printf("#Objects %15s %15s %15s %15s\n", |
939 | b1, b2, b3, b4); | 961 | b1, b2, b3, b4); |
940 | 962 | ||
941 | store_size(b1, avg_slabs);store_size(b2, min_slabs); | 963 | store_size(b1, avg_slabs);store_size(b2, min_slabs); |
942 | store_size(b3, max_slabs);store_size(b4, total_slabs); | 964 | store_size(b3, max_slabs);store_size(b4, total_slabs); |
943 | printf("#Slabs %10s %10s %10s %10s\n", | 965 | printf("#Slabs %15s %15s %15s %15s\n", |
944 | b1, b2, b3, b4); | 966 | b1, b2, b3, b4); |
945 | 967 | ||
946 | store_size(b1, avg_partial);store_size(b2, min_partial); | 968 | store_size(b1, avg_partial);store_size(b2, min_partial); |
947 | store_size(b3, max_partial);store_size(b4, total_partial); | 969 | store_size(b3, max_partial);store_size(b4, total_partial); |
948 | printf("#PartSlab %10s %10s %10s %10s\n", | 970 | printf("#PartSlab %15s %15s %15s %15s\n", |
949 | b1, b2, b3, b4); | 971 | b1, b2, b3, b4); |
950 | store_size(b1, avg_ppart);store_size(b2, min_ppart); | 972 | store_size(b1, avg_ppart);store_size(b2, min_ppart); |
951 | store_size(b3, max_ppart); | 973 | store_size(b3, max_ppart); |
952 | store_size(b4, total_partial * 100 / total_slabs); | 974 | store_size(b4, total_partial * 100 / total_slabs); |
953 | printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n", | 975 | printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", |
954 | b1, b2, b3, b4); | 976 | b1, b2, b3, b4); |
955 | 977 | ||
956 | store_size(b1, avg_partobj);store_size(b2, min_partobj); | 978 | store_size(b1, avg_partobj);store_size(b2, min_partobj); |
957 | store_size(b3, max_partobj); | 979 | store_size(b3, max_partobj); |
958 | store_size(b4, total_partobj); | 980 | store_size(b4, total_partobj); |
959 | printf("PartObjs %10s %10s %10s %10s\n", | 981 | printf("PartObjs %15s %15s %15s %15s\n", |
960 | b1, b2, b3, b4); | 982 | b1, b2, b3, b4); |
961 | 983 | ||
962 | store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); | 984 | store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); |
963 | store_size(b3, max_ppartobj); | 985 | store_size(b3, max_ppartobj); |
964 | store_size(b4, total_partobj * 100 / total_objects); | 986 | store_size(b4, total_partobj * 100 / total_objects); |
965 | printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n", | 987 | printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", |
966 | b1, b2, b3, b4); | 988 | b1, b2, b3, b4); |
967 | 989 | ||
968 | store_size(b1, avg_size);store_size(b2, min_size); | 990 | store_size(b1, avg_size);store_size(b2, min_size); |
969 | store_size(b3, max_size);store_size(b4, total_size); | 991 | store_size(b3, max_size);store_size(b4, total_size); |
970 | printf("Memory %10s %10s %10s %10s\n", | 992 | printf("Memory %15s %15s %15s %15s\n", |
971 | b1, b2, b3, b4); | 993 | b1, b2, b3, b4); |
972 | 994 | ||
973 | store_size(b1, avg_used);store_size(b2, min_used); | 995 | store_size(b1, avg_used);store_size(b2, min_used); |
974 | store_size(b3, max_used);store_size(b4, total_used); | 996 | store_size(b3, max_used);store_size(b4, total_used); |
975 | printf("Used %10s %10s %10s %10s\n", | 997 | printf("Used %15s %15s %15s %15s\n", |
976 | b1, b2, b3, b4); | 998 | b1, b2, b3, b4); |
977 | 999 | ||
978 | store_size(b1, avg_waste);store_size(b2, min_waste); | 1000 | store_size(b1, avg_waste);store_size(b2, min_waste); |
979 | store_size(b3, max_waste);store_size(b4, total_waste); | 1001 | store_size(b3, max_waste);store_size(b4, total_waste); |
980 | printf("Loss %10s %10s %10s %10s\n", | 1002 | printf("Loss %15s %15s %15s %15s\n", |
981 | b1, b2, b3, b4); | 1003 | b1, b2, b3, b4); |
982 | 1004 | ||
983 | printf("\n"); | 1005 | printf("\n"); |
984 | printf("Per Object Average Min Max\n"); | 1006 | printf("Per Object Average " |
985 | printf("---------------------------------------------\n"); | 1007 | "Min Max\n"); |
1008 | printf("---------------------------------------" | ||
1009 | "--------------------\n"); | ||
986 | 1010 | ||
987 | store_size(b1, avg_memobj);store_size(b2, min_memobj); | 1011 | store_size(b1, avg_memobj);store_size(b2, min_memobj); |
988 | store_size(b3, max_memobj); | 1012 | store_size(b3, max_memobj); |
989 | printf("Memory %10s %10s %10s\n", | 1013 | printf("Memory %15s %15s %15s\n", |
990 | b1, b2, b3); | 1014 | b1, b2, b3); |
991 | store_size(b1, avg_objsize);store_size(b2, min_objsize); | 1015 | store_size(b1, avg_objsize);store_size(b2, min_objsize); |
992 | store_size(b3, max_objsize); | 1016 | store_size(b3, max_objsize); |
993 | printf("User %10s %10s %10s\n", | 1017 | printf("User %15s %15s %15s\n", |
994 | b1, b2, b3); | 1018 | b1, b2, b3); |
995 | 1019 | ||
996 | store_size(b1, avg_objwaste);store_size(b2, min_objwaste); | 1020 | store_size(b1, avg_objwaste);store_size(b2, min_objwaste); |
997 | store_size(b3, max_objwaste); | 1021 | store_size(b3, max_objwaste); |
998 | printf("Loss %10s %10s %10s\n", | 1022 | printf("Loss %15s %15s %15s\n", |
999 | b1, b2, b3); | 1023 | b1, b2, b3); |
1000 | } | 1024 | } |
1001 | 1025 | ||
@@ -1011,6 +1035,8 @@ static void sort_slabs(void) | |||
1011 | result = slab_size(s1) < slab_size(s2); | 1035 | result = slab_size(s1) < slab_size(s2); |
1012 | else if (sort_active) | 1036 | else if (sort_active) |
1013 | result = slab_activity(s1) < slab_activity(s2); | 1037 | result = slab_activity(s1) < slab_activity(s2); |
1038 | else if (sort_loss) | ||
1039 | result = slab_waste(s1) < slab_waste(s2); | ||
1014 | else | 1040 | else |
1015 | result = strcasecmp(s1->name, s2->name); | 1041 | result = strcasecmp(s1->name, s2->name); |
1016 | 1042 | ||
@@ -1095,7 +1121,7 @@ static void alias(void) | |||
1095 | active = a->slab->name; | 1121 | active = a->slab->name; |
1096 | } | 1122 | } |
1097 | else | 1123 | else |
1098 | printf("%-20s -> %s\n", a->name, a->slab->name); | 1124 | printf("%-15s -> %s\n", a->name, a->slab->name); |
1099 | } | 1125 | } |
1100 | if (active) | 1126 | if (active) |
1101 | printf("\n"); | 1127 | printf("\n"); |
@@ -1241,12 +1267,16 @@ static void read_slab_dir(void) | |||
1241 | static void output_slabs(void) | 1267 | static void output_slabs(void) |
1242 | { | 1268 | { |
1243 | struct slabinfo *slab; | 1269 | struct slabinfo *slab; |
1270 | int lines = output_lines; | ||
1244 | 1271 | ||
1245 | for (slab = slabinfo; slab < slabinfo + slabs; slab++) { | 1272 | for (slab = slabinfo; (slab < slabinfo + slabs) && |
1273 | lines != 0; slab++) { | ||
1246 | 1274 | ||
1247 | if (slab->alias) | 1275 | if (slab->alias) |
1248 | continue; | 1276 | continue; |
1249 | 1277 | ||
1278 | if (lines != -1) | ||
1279 | lines--; | ||
1250 | 1280 | ||
1251 | if (show_numa) | 1281 | if (show_numa) |
1252 | slab_numa(slab, 0); | 1282 | slab_numa(slab, 0); |
@@ -1267,24 +1297,54 @@ static void output_slabs(void) | |||
1267 | } | 1297 | } |
1268 | } | 1298 | } |
1269 | 1299 | ||
1300 | static void xtotals(void) | ||
1301 | { | ||
1302 | totals(); | ||
1303 | |||
1304 | link_slabs(); | ||
1305 | rename_slabs(); | ||
1306 | |||
1307 | printf("\nSlabs sorted by size\n"); | ||
1308 | printf("--------------------\n"); | ||
1309 | sort_loss = 0; | ||
1310 | sort_size = 1; | ||
1311 | sort_slabs(); | ||
1312 | output_slabs(); | ||
1313 | |||
1314 | printf("\nSlabs sorted by loss\n"); | ||
1315 | printf("--------------------\n"); | ||
1316 | line = 0; | ||
1317 | sort_loss = 1; | ||
1318 | sort_size = 0; | ||
1319 | sort_slabs(); | ||
1320 | output_slabs(); | ||
1321 | printf("\n"); | ||
1322 | } | ||
1323 | |||
1270 | struct option opts[] = { | 1324 | struct option opts[] = { |
1271 | { "aliases", 0, NULL, 'a' }, | 1325 | { "aliases", no_argument, NULL, 'a' }, |
1272 | { "activity", 0, NULL, 'A' }, | 1326 | { "activity", no_argument, NULL, 'A' }, |
1273 | { "debug", 2, NULL, 'd' }, | 1327 | { "debug", optional_argument, NULL, 'd' }, |
1274 | { "display-activity", 0, NULL, 'D' }, | 1328 | { "display-activity", no_argument, NULL, 'D' }, |
1275 | { "empty", 0, NULL, 'e' }, | 1329 | { "empty", no_argument, NULL, 'e' }, |
1276 | { "first-alias", 0, NULL, 'f' }, | 1330 | { "first-alias", no_argument, NULL, 'f' }, |
1277 | { "help", 0, NULL, 'h' }, | 1331 | { "help", no_argument, NULL, 'h' }, |
1278 | { "inverted", 0, NULL, 'i'}, | 1332 | { "inverted", no_argument, NULL, 'i'}, |
1279 | { "numa", 0, NULL, 'n' }, | 1333 | { "slabs", no_argument, NULL, 'l' }, |
1280 | { "ops", 0, NULL, 'o' }, | 1334 | { "numa", no_argument, NULL, 'n' }, |
1281 | { "report", 0, NULL, 'r' }, | 1335 | { "ops", no_argument, NULL, 'o' }, |
1282 | { "shrink", 0, NULL, 's' }, | 1336 | { "shrink", no_argument, NULL, 's' }, |
1283 | { "slabs", 0, NULL, 'l' }, | 1337 | { "report", no_argument, NULL, 'r' }, |
1284 | { "track", 0, NULL, 't'}, | 1338 | { "Size", no_argument, NULL, 'S'}, |
1285 | { "validate", 0, NULL, 'v' }, | 1339 | { "tracking", no_argument, NULL, 't'}, |
1286 | { "zero", 0, NULL, 'z' }, | 1340 | { "Totals", no_argument, NULL, 'T'}, |
1287 | { "1ref", 0, NULL, '1'}, | 1341 | { "validate", no_argument, NULL, 'v' }, |
1342 | { "zero", no_argument, NULL, 'z' }, | ||
1343 | { "1ref", no_argument, NULL, '1'}, | ||
1344 | { "lines", required_argument, NULL, 'N'}, | ||
1345 | { "Loss", no_argument, NULL, 'L'}, | ||
1346 | { "Xtotals", no_argument, NULL, 'X'}, | ||
1347 | { "Bytes", no_argument, NULL, 'B'}, | ||
1288 | { NULL, 0, NULL, 0 } | 1348 | { NULL, 0, NULL, 0 } |
1289 | }; | 1349 | }; |
1290 | 1350 | ||
@@ -1296,7 +1356,7 @@ int main(int argc, char *argv[]) | |||
1296 | 1356 | ||
1297 | page_size = getpagesize(); | 1357 | page_size = getpagesize(); |
1298 | 1358 | ||
1299 | while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS", | 1359 | while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB", |
1300 | opts, NULL)) != -1) | 1360 | opts, NULL)) != -1) |
1301 | switch (c) { | 1361 | switch (c) { |
1302 | case '1': | 1362 | case '1': |
@@ -1358,7 +1418,25 @@ int main(int argc, char *argv[]) | |||
1358 | case 'S': | 1418 | case 'S': |
1359 | sort_size = 1; | 1419 | sort_size = 1; |
1360 | break; | 1420 | break; |
1361 | 1421 | case 'N': | |
1422 | if (optarg) { | ||
1423 | output_lines = atoi(optarg); | ||
1424 | if (output_lines < 1) | ||
1425 | output_lines = 1; | ||
1426 | } | ||
1427 | break; | ||
1428 | case 'L': | ||
1429 | sort_loss = 1; | ||
1430 | break; | ||
1431 | case 'X': | ||
1432 | if (output_lines == -1) | ||
1433 | output_lines = 1; | ||
1434 | extended_totals = 1; | ||
1435 | show_bytes = 1; | ||
1436 | break; | ||
1437 | case 'B': | ||
1438 | show_bytes = 1; | ||
1439 | break; | ||
1362 | default: | 1440 | default: |
1363 | fatal("%s: Invalid option '%c'\n", argv[0], optopt); | 1441 | fatal("%s: Invalid option '%c'\n", argv[0], optopt); |
1364 | 1442 | ||
@@ -1378,12 +1456,13 @@ int main(int argc, char *argv[]) | |||
1378 | fatal("%s: Invalid pattern '%s' code %d\n", | 1456 | fatal("%s: Invalid pattern '%s' code %d\n", |
1379 | argv[0], pattern_source, err); | 1457 | argv[0], pattern_source, err); |
1380 | read_slab_dir(); | 1458 | read_slab_dir(); |
1381 | if (show_alias) | 1459 | if (show_alias) { |
1382 | alias(); | 1460 | alias(); |
1383 | else | 1461 | } else if (extended_totals) { |
1384 | if (show_totals) | 1462 | xtotals(); |
1463 | } else if (show_totals) { | ||
1385 | totals(); | 1464 | totals(); |
1386 | else { | 1465 | } else { |
1387 | link_slabs(); | 1466 | link_slabs(); |
1388 | rename_slabs(); | 1467 | rename_slabs(); |
1389 | sort_slabs(); | 1468 | sort_slabs(); |