author     Linus Torvalds <torvalds@linux-foundation.org>   2014-06-04 19:55:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-06-04 19:55:13 -0400
commit     00170fdd0846df7cdb5ad421d3a340440f930b8f (patch)
tree       1883cfbda846cd65faed011bda54a52c1d40ecdd
parent     d09cc3659db494aca4b3bb2393c533fb4946b794 (diff)
parent     3ff6db3287e8a5e8f5bb9529b8e1259ca6b10def (diff)
Merge branch 'akpm' (patchbomb from Andrew) into next
Merge misc updates from Andrew Morton:
- a few fixes for 3.16. Cc'ed to stable so they'll get there somehow.
- various misc fixes and cleanups
- most of the ocfs2 queue. Review is slow...
- most of MM. The MM queue is pretty huge this time, but not much in
the way of feature work.
- some tweaks under kernel/
- printk maintenance work
- updates to lib/
- checkpatch updates
- tweaks to init/
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (276 commits)
fs/autofs4/dev-ioctl.c: add __init to autofs_dev_ioctl_init
fs/ncpfs/getopt.c: replace simple_strtoul by kstrtoul
init/main.c: remove an ifdef
kthreads: kill CLONE_KERNEL, change kernel_thread(kernel_init) to avoid CLONE_SIGHAND
init/main.c: add initcall_blacklist kernel parameter
init/main.c: don't use pr_debug()
fs/binfmt_flat.c: make old_reloc() static
fs/binfmt_elf.c: fix bool assignements
fs/efs: convert printk(KERN_DEBUG to pr_debug
fs/efs: add pr_fmt / use __func__
fs/efs: convert printk to pr_foo()
scripts/checkpatch.pl: device_initcall is not the only __initcall substitute
checkpatch: check stable email address
checkpatch: warn on unnecessary void function return statements
checkpatch: prefer kstrto<foo> to sscanf(buf, "%<lhuidx>", &bar);
checkpatch: add warning for kmalloc/kzalloc with multiply
checkpatch: warn on #defines ending in semicolon
checkpatch: make --strict a default for files in drivers/net and net/
checkpatch: always warn on missing blank line after variable declaration block
checkpatch: fix wildcard DT compatible string checking
...
279 files changed, 4712 insertions, 3514 deletions
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 7fe0546c504a..6b6bef31e956 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -660,15 +660,23 @@ There are a number of driver model diagnostic macros in <linux/device.h>
 which you should use to make sure messages are matched to the right device
 and driver, and are tagged with the right level: dev_err(), dev_warn(),
 dev_info(), and so forth. For messages that aren't associated with a
-particular device, <linux/printk.h> defines pr_debug() and pr_info().
+particular device, <linux/printk.h> defines pr_notice(), pr_info(),
+pr_warn(), pr_err(), etc.
 
 Coming up with good debugging messages can be quite a challenge; and once
-you have them, they can be a huge help for remote troubleshooting. Such
-messages should be compiled out when the DEBUG symbol is not defined (that
-is, by default they are not included). When you use dev_dbg() or pr_debug(),
-that's automatic. Many subsystems have Kconfig options to turn on -DDEBUG.
-A related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to the
-ones already enabled by DEBUG.
+you have them, they can be a huge help for remote troubleshooting. However
+debug message printing is handled differently than printing other non-debug
+messages. While the other pr_XXX() functions print unconditionally,
+pr_debug() does not; it is compiled out by default, unless either DEBUG is
+defined or CONFIG_DYNAMIC_DEBUG is set. That is true for dev_dbg() also,
+and a related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to
+the ones already enabled by DEBUG.
+
+Many subsystems have Kconfig debug options to turn on -DDEBUG in the
+corresponding Makefile; in other cases specific files #define DEBUG. And
+when a debug message should be unconditionally printed, such as if it is
+already inside a debug-related #ifdef secton, printk(KERN_DEBUG ...) can be
+used.
 
 
 Chapter 14: Allocating memory
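A hedged illustration of the convention the hunk above documents; the function name example_report() and the message strings are invented for this sketch, not taken from the patch. pr_debug() and dev_dbg() are compiled out unless DEBUG or CONFIG_DYNAMIC_DEBUG is in effect, while pr_info() and an explicit printk(KERN_DEBUG ...) always print:

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/printk.h>
	#include <linux/device.h>

	static void example_report(struct device *dev, int err)
	{
		pr_info("setup finished\n");              /* prints unconditionally */
		pr_debug("status word: %#x\n", err);      /* no-op unless DEBUG or dynamic debug */
		dev_dbg(dev, "per-device debug data\n");  /* same rule, tagged with the device */

	#ifdef DEBUG
		/* inside a debug-only region, unconditional KERN_DEBUG output is fine */
		printk(KERN_DEBUG "example: err=%d\n", err);
	#endif
	}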
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 2622115276aa..4937e6fff9b4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -270,6 +270,11 @@ When oom event notifier is registered, event will be delivered.
 
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
+WARNING: Current implementation lacks reclaim support. That means allocation
+attempts will fail when close to the limit even if there are plenty of
+kmem available for reclaim. That makes this option unusable in real
+life so DO NOT SELECT IT unless for development purposes.
+
 With the Kernel memory extension, the Memory Controller is able to limit
 the amount of kernel memory used by the system. Kernel memory is fundamentally
 different than user memory, since it can't be swapped out, which makes it
@@ -535,17 +540,15 @@ Note:
 
 5.3 swappiness
 
-Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
+Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is
+triggered by this cgroup's hard limit. The tunable in the root cgroup
+corresponds to the global swappiness setting.
+
 Please note that unlike the global swappiness, memcg knob set to 0
 really prevents from any swapping even if there is a swap storage
 available. This might lead to memcg OOM killer if there are no file
 pages to reclaim.
 
-Following cgroups' swappiness can't be changed.
-- root cgroup (uses /proc/sys/vm/swappiness).
-- a cgroup which uses hierarchy and it has other cgroup(s) below it.
-- a cgroup which uses hierarchy and not the root of hierarchy.
-
 5.4 failcnt
 
 A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
@@ -754,7 +757,6 @@ You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
 
 #echo 1 > memory.oom_control
 
-This operation is only allowed to the top cgroup of a sub-hierarchy.
 If OOM-killer is disabled, tasks under cgroup will hang/sleep
 in memory cgroup's OOM-waitqueue when they request accountable memory.
 
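A userspace sketch of poking the two knobs discussed in these hunks; the cgroup mount point /sys/fs/cgroup/memory and the group name "demo" are assumptions for illustration, not part of the patch:

	#include <stdio.h>

	static int write_knob(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%s\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* bias this memcg away from swap, then disable its OOM killer */
		if (write_knob("/sys/fs/cgroup/memory/demo/memory.swappiness", "0"))
			perror("memory.swappiness");
		if (write_knob("/sys/fs/cgroup/memory/demo/memory.oom_control", "1"))
			perror("memory.oom_control");
		return 0;
	}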
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af55e13ace8f..9973a7e2e0ac 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -630,8 +630,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Also note the kernel might malfunction if you disable
 			some critical bits.
 
-	cma=nn[MG]	[ARM,KNL]
-			Sets the size of kernel global memory area for contiguous
+	cma=nn[MG]@[start[MG][-end[MG]]]
+			[ARM,X86,KNL]
+			Sets the size of kernel global memory area for
+			contiguous memory allocations and optionally the
+			placement constraint by the physical address range of
 			memory allocations. For more information, see
 			include/linux/dma-contiguous.h
 
@@ -1309,6 +1312,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			for working out where the kernel is dying during
 			startup.
 
+	initcall_blacklist=  [KNL] Do not execute a comma-separated list of
+			initcall functions. Useful for debugging built-in
+			modules and initcalls.
+
 	initrd=		[BOOT] Specify the location of the initial ramdisk
 
 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
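As a purely illustrative combination of the two options documented above (the size, the placement range, and the initcall names are hypothetical, not taken from the patch), a boot command line could contain:

	cma=64M@0-512M initcall_blacklist=foo_driver_init,bar_init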
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 58340d50f8a6..f304edb8fbe7 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -88,16 +88,21 @@ phase by hand.
 
 1.3. Unit of Memory online/offline operation
 ------------
-Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
-into chunks of the same size. The chunk is called a "section". The size of
-a section is architecture dependent. For example, power uses 16MiB, ia64 uses
-1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called "sections". The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
 
-To determine the size of sections, please read this file:
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see Section 3.)
+
+To determine the size (in bytes) of a memory block please read this file:
 
 /sys/devices/system/memory/block_size_bytes
 
-This file shows the size of sections in byte.
 
 -----------------------
 2. Kernel Configuration
@@ -123,42 +128,35 @@ config options.
 (CONFIG_ACPI_CONTAINER).
 This option can be kernel module too.
 
+
 --------------------------------
-4 sysfs files for memory hotplug
+3 sysfs files for memory hotplug
 --------------------------------
-All sections have their device information in sysfs. Each section is part of
-a memory block under /sys/devices/system/memory as
+All memory blocks have their device information in sysfs. Each memory block
+is described under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is the section id.)
+(XXX is the memory block id.)
 
-Now, XXX is defined as (start_address_of_section / section_size) of the first
-section contained in the memory block. The files 'phys_index' and
-'end_phys_index' under each directory report the beginning and end section id's
-for the memory block covered by the sysfs directory. It is expected that all
+For the memory block covered by the sysfs directory. It is expected that all
 memory sections in this range are present and no memory holes exist in the
 range. Currently there is no way to determine if there is a memory hole, but
 the existence of one should not affect the hotplug capabilities of the memory
 block.
 
-For example, assume 1GiB section size. A device for a memory starting at
+For example, assume 1GiB memory block size. A device for a memory starting at
 0x100000000 is /sys/device/system/memory/memory4
 (0x100000000 / 1Gib = 4)
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each section, you can see 4 or 5 files, the end_phys_index file being
-a recent addition and not present on older kernels.
+Under each memory block, you can see 4 files:
 
-/sys/devices/system/memory/memoryXXX/start_phys_index
-/sys/devices/system/memory/memoryXXX/end_phys_index
+/sys/devices/system/memory/memoryXXX/phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id of the first section
-in the memory block, same as XXX.
-'end_phys_index' : read-only and contains section id of the last section
-in the memory block.
+'phys_index' : read-only and contains memory block id, same as XXX.
 'state' : read-write
 at read: contains online/offline state of memory.
 at write: user can specify "online_kernel",
@@ -185,6 +183,7 @@ For example:
 A backlink will also be created:
 /sys/devices/system/memory/memory9/node0 -> ../../node/node0
 
+
 --------------------------------
 4. Physical memory hot-add phase
 --------------------------------
@@ -227,11 +226,10 @@ You can tell the physical address of new memory to the kernel by
 
 % echo start_address_of_new_memory > /sys/devices/system/memory/probe
 
-Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
-memory range is hot-added. In this case, hotplug script is not called (in
-current implementation). You'll have to online memory by yourself.
-Please see "How to online memory" in this text.
-
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size] memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself. Please see "How to online memory" in this text.
 
 
 ------------------------------
@@ -240,36 +238,36 @@ Please see "How to online memory" in this text.
 
 5.1. State of memory
 ------------
-To see (online/offline) state of memory section, read 'state' file.
+To see (online/offline) state of a memory block, read 'state' file.
 
 % cat /sys/device/system/memory/memoryXXX/state
 
 
-If the memory section is online, you'll read "online".
-If the memory section is offline, you'll read "offline".
+If the memory block is online, you'll read "online".
+If the memory block is offline, you'll read "offline".
 
 
 5.2. How to online memory
 ------------
 Even if the memory is hot-added, it is not at ready-to-use state.
-For using newly added memory, you have to "online" the memory section.
+For using newly added memory, you have to "online" the memory block.
 
-For onlining, you have to write "online" to the section's state file as:
+For onlining, you have to write "online" to the memory block's state file as:
 
 % echo online > /sys/devices/system/memory/memoryXXX/state
 
-This onlining will not change the ZONE type of the target memory section,
-If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+This onlining will not change the ZONE type of the target memory block,
+If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
 
 % echo online_movable > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE)
 
-And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
 
 % echo online_kernel > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL)
 
-After this, section memoryXXX's state will be 'online' and the amount of
+After this, memory block XXX's state will be 'online' and the amount of
 available memory will be increased.
 
 Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
@@ -284,22 +282,22 @@ This may be changed in future.
 6.1 Memory offline and ZONE_MOVABLE
 ------------
 Memory offlining is more complicated than memory online. Because memory offline
-has to make the whole memory section be unused, memory offline can fail if
-the section includes memory which cannot be freed.
+has to make the whole memory block be unused, memory offline can fail if
+the memory block includes memory which cannot be freed.
 
 In general, memory offline can use 2 techniques.
 
-(1) reclaim and free all memory in the section.
-(2) migrate all pages in the section.
+(1) reclaim and free all memory in the memory block.
+(2) migrate all pages in the memory block.
 
 In the current implementation, Linux's memory offline uses method (2), freeing
-all pages in the section by page migration. But not all pages are
+all pages in the memory block by page migration. But not all pages are
 migratable. Under current Linux, migratable pages are anonymous pages and
-page caches. For offlining a section by migration, the kernel has to guarantee
-that the section contains only migratable pages.
+page caches. For offlining a memory block by migration, the kernel has to
+guarantee that the memory block contains only migratable pages.
 
-Now, a boot option for making a section which consists of migratable pages is
-supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+Now, a boot option for making a memory block which consists of migratable pages
+is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
 create ZONE_MOVABLE...a zone which is just used for movable pages.
 (See also Documentation/kernel-parameters.txt)
 
@@ -315,28 +313,27 @@ creates ZONE_MOVABLE as following.
 Size of memory for movable pages (for offline) is ZZZZ.
 
 
-Note) Unfortunately, there is no information to show which section belongs
+Note: Unfortunately, there is no information to show which memory block belongs
 to ZONE_MOVABLE. This is TBD.
 
 
 6.2. How to offline memory
 ------------
-You can offline a section by using the same sysfs interface that was used in
-memory onlining.
+You can offline a memory block by using the same sysfs interface that was used
+in memory onlining.
 
 % echo offline > /sys/devices/system/memory/memoryXXX/state
 
-If offline succeeds, the state of the memory section is changed to be "offline".
+If offline succeeds, the state of the memory block is changed to be "offline".
 If it fails, some error core (like -EBUSY) will be returned by the kernel.
-Even if a section does not belong to ZONE_MOVABLE, you can try to offline it.
-If it doesn't contain 'unmovable' memory, you'll get success.
+Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
+it. If it doesn't contain 'unmovable' memory, you'll get success.
 
-A section under ZONE_MOVABLE is considered to be able to be offlined easily.
-But under some busy state, it may return -EBUSY. Even if a memory section
-cannot be offlined due to -EBUSY, you can retry offlining it and may be able to
-offline it (or not).
-(For example, a page is referred to by some kernel internal call and released
-soon.)
+A memory block under ZONE_MOVABLE is considered to be able to be offlined
+easily. But under some busy state, it may return -EBUSY. Even if a memory
+block cannot be offlined due to -EBUSY, you can retry offlining it and may be
+able to offline it (or not). (For example, a page is referred to by some kernel
+internal call and released soon.)
 
 Consideration:
 Memory hotplug's design direction is to make the possibility of memory offlining
@@ -373,11 +370,11 @@ MEMORY_GOING_OFFLINE
 Generated to begin the process of offlining memory. Allocations are no
 longer possible from the memory but some of the memory to be offlined
 is still in use. The callback can be used to free memory known to a
-subsystem from the indicated memory section.
+subsystem from the indicated memory block.
 
 MEMORY_CANCEL_OFFLINE
 Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
-the section that we attempted to offline.
+the memory block that we attempted to offline.
 
 MEMORY_OFFLINE
 Generated after offlining memory is complete.
@@ -413,8 +410,8 @@ node if necessary.
 --------------
 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
   sysctl or new control file.
-- showing memory section and physical device relationship.
-- showing memory section is under ZONE_MOVABLE or not
+- showing memory block and physical device relationship.
+- showing memory block is under ZONE_MOVABLE or not
 - test and make it better memory offlining.
 - support HugeTLB page migration and offlining.
 - memmap removing at memory offline.
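A minimal userspace sketch of the sysfs interface described in the document above (not part of the patch): it reads the memory block size, which the kernel reports as a hexadecimal value, then onlines block 4 from the 0x100000000 example. Running it requires root and an actual memory4 block.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long block_size = 0;
		FILE *f = fopen("/sys/devices/system/memory/block_size_bytes", "r");

		if (!f || fscanf(f, "%llx", &block_size) != 1) {
			perror("block_size_bytes");
			return 1;
		}
		fclose(f);
		printf("memory block size: %llu bytes\n", block_size);

		/* online memory block 4, as in the example above */
		f = fopen("/sys/devices/system/memory/memory4/state", "w");
		if (!f) {
			perror("memory4/state");
			return 1;
		}
		fprintf(f, "online\n");
		return fclose(f) ? 1 : 0;
	}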
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dd9d0e33b443..bd4b34c03738 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -746,8 +746,8 @@ Changing this takes effect whenever an application requests memory.
 vfs_cache_pressure
 ------------------
 
-Controls the tendency of the kernel to reclaim the memory which is used for
-caching of directory and inode objects.
+This percentage value controls the tendency of the kernel to reclaim
+the memory which is used for caching of directory and inode objects.
 
 At the default value of vfs_cache_pressure=100 the kernel will attempt to
 reclaim dentries and inodes at a "fair" rate with respect to pagecache and
@@ -757,6 +757,11 @@ never reclaim dentries and inodes due to memory pressure and this can easily
 lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
 causes the kernel to prefer to reclaim dentries and inodes.
 
+Increasing vfs_cache_pressure significantly beyond 100 may have negative
+performance impact. Reclaim code needs to take various locks to find freeable
+directory and inode objects. With vfs_cache_pressure=1000, it will look for
+ten times more freeable objects than there are.
+
 ==============================================================
 
 zone_reclaim_mode:
@@ -772,16 +777,17 @@ This is value ORed together of
 2 = Zone reclaim writes dirty pages out
 4 = Zone reclaim swaps pages
 
-zone_reclaim_mode is set during bootup to 1 if it is determined that pages
-from remote zones will cause a measurable performance reduction. The
-page allocator will then reclaim easily reusable pages (those page
-cache pages that are currently not used) before allocating off node pages.
-
-It may be beneficial to switch off zone reclaim if the system is
-used for a file server and all of memory should be used for caching files
-from disk. In that case the caching effect is more important than
+zone_reclaim_mode is disabled by default. For file servers or workloads
+that benefit from having their data cached, zone_reclaim_mode should be
+left disabled as the caching effect is likely to be more important than
 data locality.
 
+zone_reclaim may be enabled if it's known that the workload is partitioned
+such that each partition fits within a NUMA node and that accessing remote
+memory would cause a measurable performance reduction. The page allocator
+will then reclaim easily reusable pages (those page cache pages that are
+currently not used) before allocating off node pages.
+
 Allowing zone reclaim to write out pages stops processes that are
 writing large amounts of data from dirtying pages on other nodes. Zone
 reclaim will write out dirty pages if a zone fills up and so effectively
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
index 550068466605..6ae89a9edf2a 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.txt
@@ -84,6 +84,11 @@ PR_MCE_KILL
 	PR_MCE_KILL_EARLY: Early kill
 	PR_MCE_KILL_LATE: Late kill
 	PR_MCE_KILL_DEFAULT: Use system global default
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
 PR_MCE_KILL_GET
 	return current mode
 
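A userspace sketch of the advice added in this hunk (not part of the patch): a dedicated handler thread opts in to early SIGBUS(BUS_MCEERR_AO) delivery via prctl(), while the rest of the process keeps the default policy; installing the actual SIGBUS handler is left out. Build with -pthread.

	#include <sys/prctl.h>
	#include <pthread.h>
	#include <stdio.h>

	static void *mce_handler_thread(void *arg)
	{
		(void)arg;
		/* only this thread asks for early delivery of AO SIGBUS */
		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
			perror("prctl(PR_MCE_KILL)");
		/* ... install a SIGBUS handler and wait for events here ... */
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, mce_handler_thread, NULL);
		pthread_join(tid, NULL);
		return 0;
	}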
diff --git a/MAINTAINERS b/MAINTAINERS
index e433e45814af..7d101d5ba953 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3882,6 +3882,11 @@ L: kvm@vger.kernel.org
 S: Supported
 F: drivers/uio/uio_pci_generic.c
 
+GET_MAINTAINER SCRIPT
+M: Joe Perches <joe@perches.com>
+S: Maintained
+F: scripts/get_maintainer.pl
+
 GFS2 FILE SYSTEM
 M: Steven Whitehouse <swhiteho@redhat.com>
 L: cluster-devel@redhat.com
@@ -4006,9 +4011,8 @@ S: Odd Fixes
 F: drivers/media/usb/hdpvr/
 
 HWPOISON MEMORY FAILURE HANDLING
-M: Andi Kleen <andi@firstfloor.org>
+M: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
 L: linux-mm@kvack.org
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison
 S: Maintained
 F: mm/memory-failure.c
 F: mm/hwpoison-inject.c
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 73a7450ee622..1badf9b84b51 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -86,12 +86,13 @@ static void show_faulting_vma(unsigned long address, char *buf)
 	unsigned long ino = 0;
 	dev_t dev = 0;
 	char *nm = buf;
+	struct mm_struct *active_mm = current->active_mm;
 
 	/* can't use print_vma_addr() yet as it doesn't check for
 	 * non-inclusive vma
 	 */
-
-	vma = find_vma(current->active_mm, address);
+	down_read(&active_mm->mmap_sem);
+	vma = find_vma(active_mm, address);
 
 	/* check against the find_vma( ) behaviour which returns the next VMA
 	 * if the container VMA is not found
@@ -110,9 +111,10 @@ static void show_faulting_vma(unsigned long address, char *buf)
 			vma->vm_start < TASK_UNMAPPED_BASE ?
 				address : address - vma->vm_start,
 			nm, vma->vm_start, vma->vm_end);
-	} else {
+	} else
 		pr_info(" @No matching VMA found\n");
-	}
+
+	up_read(&active_mm->mmap_sem);
 }
 
 static void show_ecr_verbose(struct pt_regs *regs)
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index 54ee6163c181..66781bf34077 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -56,8 +56,3 @@ int pmd_huge(pmd_t pmd)
 {
 	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
 }
-
-int pmd_huge_support(void)
-{
-	return 1;
-}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 31eb959e9aa8..023747bf4dd7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ int pud_huge(pud_t pud)
 #endif
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 static __init int setup_hugepagesz(char *opt)
 {
 	unsigned long ps = memparse(opt, &opt);
diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index c35414bdf7bd..c8c8ff9eff61 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -12,7 +12,6 @@
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_FADVISE64
 #define __ARCH_WANT_SYS_GETPGRP
diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h
index 5cc7d1991e48..0f40fed1ba25 100644
--- a/arch/cris/include/asm/unistd.h
+++ b/arch/cris/include/asm/unistd.h
@@ -15,7 +15,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h
index 70ec7293dce7..17b5df8fc28a 100644
--- a/arch/frv/include/asm/unistd.h
+++ b/arch/frv/include/asm/unistd.h
@@ -13,7 +13,6 @@
 /* #define __ARCH_WANT_SYS_GETHOSTNAME */
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-/* #define __ARCH_WANT_SYS_SGETMASK */
 /* #define __ARCH_WANT_SYS_SIGNAL */
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 3202aa74e0d6..6437ca21f61b 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -21,7 +21,8 @@
 #define PENALTY_FOR_NODE_WITH_CPUS 255
 
 /*
- * Distance above which we begin to use zone reclaim
+ * Nodes within this distance are eligible for reclaim by zone_reclaim() when
+ * zone_reclaim_mode is enabled.
  */
 #define RECLAIM_DISTANCE 15
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 68232db98baa..76069c18ee42 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -114,11 +114,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 0;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
 {
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 33afa56ad47a..1fcdd344c7ad 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -13,7 +13,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 042431509b56..3c52fa6d0f8e 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -110,11 +110,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmd, int write)
 {
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index b14232b6878f..fd56a8f66489 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -19,7 +19,6 @@
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index 413d6c612bec..e55813029d5a 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -29,7 +29,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_UTIME
 #define __ARCH_WANT_SYS_WAITPID
 #define __ARCH_WANT_SYS_SOCKETCALL
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 77e0ae036e7c..4ec8ee10d371 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -84,11 +84,6 @@ int pud_huge(pud_t pud)
 	return (pud_val(pud) & _PAGE_HUGE) != 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h
index 9d4e2d1ef90e..0522468f488b 100644
--- a/arch/mn10300/include/asm/unistd.h
+++ b/arch/mn10300/include/asm/unistd.h
@@ -26,7 +26,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index 74d835820ee7..5f4c68daa261 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 3ebb188c3ff5..d98c1ecc3266 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
 	return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
 }
 
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t pte)
+{
+	return pte_val(pte) & (_PAGE_PRESENT);
+}
+
 #define pte_numa pte_numa
 static inline int pte_numa(pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c9202151079f..6c8a8c5a37a1 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -9,12 +9,8 @@ struct device_node;
 #ifdef CONFIG_NUMA
 
 /*
- * Before going off node we want the VM to try and reclaim from the local
- * node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
- * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
- * 20, we never reclaim and go off node straight away.
- *
- * To fix this we choose a smaller value of RECLAIM_DISTANCE.
+ * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that
+ * all zones on all nodes will be eligible for zone_reclaim().
  */
 #define RECLAIM_DISTANCE 10
 
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 9b892bbd9d84..5ce5552ab9f5 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -29,7 +29,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index eb923654ba80..7e70ae968e5f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -86,11 +86,6 @@ int pgd_huge(pgd_t pgd)
 	 */
 	return ((pgd_val(pgd) & 0x3) != 0x0);
 }
-
-int pmd_huge_support(void)
-{
-	return 1;
-}
 #else
 int pmd_huge(pmd_t pmd)
 {
@@ -106,11 +101,6 @@ int pgd_huge(pgd_t pgd)
 {
 	return 0;
 }
-
-int pmd_huge_support(void)
-{
-	return 0;
-}
 #endif
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 0727a55d87d9..0ff66a7e29bb 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -220,11 +220,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmdp, int write)
 {
diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h
index e77816c4b9bc..126fe8340b22 100644
--- a/arch/sh/include/asm/unistd.h
+++ b/arch/sh/include/asm/unistd.h
@@ -11,7 +11,6 @@
 # define __ARCH_WANT_SYS_GETHOSTNAME
 # define __ARCH_WANT_SYS_IPC
 # define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_SGETMASK
 # define __ARCH_WANT_SYS_SIGNAL
 # define __ARCH_WANT_SYS_TIME
 # define __ARCH_WANT_SYS_UTIME
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index f9173766ec4b..2197fc584186 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -52,7 +52,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	int i;
 
 	for (i = 0; i < sh_ubc->num_events; i++) {
-		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
 
 		if (!*slot) {
 			*slot = bp;
@@ -84,7 +84,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 	int i;
 
 	for (i = 0; i < sh_ubc->num_events; i++) {
-		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
 
 		if (*slot == bp) {
 			*slot = NULL;
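The sh/ conversions in this and the following files all follow one pattern: the deprecated __get_cpu_var() lvalue accessor is replaced with this_cpu_ptr() where a pointer is needed and with __this_cpu_read()/__this_cpu_write() for scalar accesses. A hedged sketch of the idiom; demo_counter and demo_update() are invented names, not from the tree:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(int, demo_counter);

	static void demo_update(int v)
	{
		int *p = this_cpu_ptr(&demo_counter);	/* was: &__get_cpu_var(demo_counter) */

		*p = v;
		__this_cpu_write(demo_counter, v + 1);	/* was: __get_cpu_var(demo_counter) = v + 1 */
		(void)__this_cpu_read(demo_counter);	/* was: __get_cpu_var(demo_counter) */
	}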
diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 42b46e61a2d5..83acbf3f6de8 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c | |||
@@ -102,7 +102,7 @@ int __kprobes kprobe_handle_illslot(unsigned long pc) | |||
102 | 102 | ||
103 | void __kprobes arch_remove_kprobe(struct kprobe *p) | 103 | void __kprobes arch_remove_kprobe(struct kprobe *p) |
104 | { | 104 | { |
105 | struct kprobe *saved = &__get_cpu_var(saved_next_opcode); | 105 | struct kprobe *saved = this_cpu_ptr(&saved_next_opcode); |
106 | 106 | ||
107 | if (saved->addr) { | 107 | if (saved->addr) { |
108 | arch_disarm_kprobe(p); | 108 | arch_disarm_kprobe(p); |
@@ -111,7 +111,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) | |||
111 | saved->addr = NULL; | 111 | saved->addr = NULL; |
112 | saved->opcode = 0; | 112 | saved->opcode = 0; |
113 | 113 | ||
114 | saved = &__get_cpu_var(saved_next_opcode2); | 114 | saved = this_cpu_ptr(&saved_next_opcode2); |
115 | if (saved->addr) { | 115 | if (saved->addr) { |
116 | arch_disarm_kprobe(saved); | 116 | arch_disarm_kprobe(saved); |
117 | 117 | ||
@@ -129,14 +129,14 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) | |||
129 | 129 | ||
130 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) | 130 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) |
131 | { | 131 | { |
132 | __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; | 132 | __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); |
133 | kcb->kprobe_status = kcb->prev_kprobe.status; | 133 | kcb->kprobe_status = kcb->prev_kprobe.status; |
134 | } | 134 | } |
135 | 135 | ||
136 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | 136 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, |
137 | struct kprobe_ctlblk *kcb) | 137 | struct kprobe_ctlblk *kcb) |
138 | { | 138 | { |
139 | __get_cpu_var(current_kprobe) = p; | 139 | __this_cpu_write(current_kprobe, p); |
140 | } | 140 | } |
141 | 141 | ||
142 | /* | 142 | /* |
@@ -146,15 +146,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | |||
146 | */ | 146 | */ |
147 | static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | 147 | static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) |
148 | { | 148 | { |
149 | __get_cpu_var(saved_current_opcode).addr = (kprobe_opcode_t *)regs->pc; | 149 | __this_cpu_write(saved_current_opcode.addr, (kprobe_opcode_t *)regs->pc); |
150 | 150 | ||
151 | if (p != NULL) { | 151 | if (p != NULL) { |
152 | struct kprobe *op1, *op2; | 152 | struct kprobe *op1, *op2; |
153 | 153 | ||
154 | arch_disarm_kprobe(p); | 154 | arch_disarm_kprobe(p); |
155 | 155 | ||
156 | op1 = &__get_cpu_var(saved_next_opcode); | 156 | op1 = this_cpu_ptr(&saved_next_opcode); |
157 | op2 = &__get_cpu_var(saved_next_opcode2); | 157 | op2 = this_cpu_ptr(&saved_next_opcode2); |
158 | 158 | ||
159 | if (OPCODE_JSR(p->opcode) || OPCODE_JMP(p->opcode)) { | 159 | if (OPCODE_JSR(p->opcode) || OPCODE_JMP(p->opcode)) { |
160 | unsigned int reg_nr = ((p->opcode >> 8) & 0x000F); | 160 | unsigned int reg_nr = ((p->opcode >> 8) & 0x000F); |
@@ -249,7 +249,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) | |||
249 | kcb->kprobe_status = KPROBE_REENTER; | 249 | kcb->kprobe_status = KPROBE_REENTER; |
250 | return 1; | 250 | return 1; |
251 | } else { | 251 | } else { |
252 | p = __get_cpu_var(current_kprobe); | 252 | p = __this_cpu_read(current_kprobe); |
253 | if (p->break_handler && p->break_handler(p, regs)) { | 253 | if (p->break_handler && p->break_handler(p, regs)) { |
254 | goto ss_probe; | 254 | goto ss_probe; |
255 | } | 255 | } |
@@ -336,9 +336,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) | |||
336 | continue; | 336 | continue; |
337 | 337 | ||
338 | if (ri->rp && ri->rp->handler) { | 338 | if (ri->rp && ri->rp->handler) { |
339 | __get_cpu_var(current_kprobe) = &ri->rp->kp; | 339 | __this_cpu_write(current_kprobe, &ri->rp->kp); |
340 | ri->rp->handler(ri, regs); | 340 | ri->rp->handler(ri, regs); |
341 | __get_cpu_var(current_kprobe) = NULL; | 341 | __this_cpu_write(current_kprobe, NULL); |
342 | } | 342 | } |
343 | 343 | ||
344 | orig_ret_address = (unsigned long)ri->ret_addr; | 344 | orig_ret_address = (unsigned long)ri->ret_addr; |
@@ -383,19 +383,19 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) | |||
383 | cur->post_handler(cur, regs, 0); | 383 | cur->post_handler(cur, regs, 0); |
384 | } | 384 | } |
385 | 385 | ||
386 | p = &__get_cpu_var(saved_next_opcode); | 386 | p = this_cpu_ptr(&saved_next_opcode); |
387 | if (p->addr) { | 387 | if (p->addr) { |
388 | arch_disarm_kprobe(p); | 388 | arch_disarm_kprobe(p); |
389 | p->addr = NULL; | 389 | p->addr = NULL; |
390 | p->opcode = 0; | 390 | p->opcode = 0; |
391 | 391 | ||
392 | addr = __get_cpu_var(saved_current_opcode).addr; | 392 | addr = __this_cpu_read(saved_current_opcode.addr); |
393 | __get_cpu_var(saved_current_opcode).addr = NULL; | 393 | __this_cpu_write(saved_current_opcode.addr, NULL); |
394 | 394 | ||
395 | p = get_kprobe(addr); | 395 | p = get_kprobe(addr); |
396 | arch_arm_kprobe(p); | 396 | arch_arm_kprobe(p); |
397 | 397 | ||
398 | p = &__get_cpu_var(saved_next_opcode2); | 398 | p = this_cpu_ptr(&saved_next_opcode2); |
399 | if (p->addr) { | 399 | if (p->addr) { |
400 | arch_disarm_kprobe(p); | 400 | arch_disarm_kprobe(p); |
401 | p->addr = NULL; | 401 | p->addr = NULL; |
@@ -511,7 +511,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | |||
511 | if (kprobe_handler(args->regs)) { | 511 | if (kprobe_handler(args->regs)) { |
512 | ret = NOTIFY_STOP; | 512 | ret = NOTIFY_STOP; |
513 | } else { | 513 | } else { |
514 | p = __get_cpu_var(current_kprobe); | 514 | p = __this_cpu_read(current_kprobe); |
515 | if (p->break_handler && | 515 | if (p->break_handler && |
516 | p->break_handler(p, args->regs)) | 516 | p->break_handler(p, args->regs)) |
517 | ret = NOTIFY_STOP; | 517 | ret = NOTIFY_STOP; |
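The sh/kprobes hunks above follow the tree-wide __get_cpu_var() removal: a plain read becomes __this_cpu_read(), a plain write becomes __this_cpu_write(), and taking the address of a per-CPU variable becomes this_cpu_ptr(). A minimal sketch of the three conversions, using hypothetical per-CPU variables and assuming <linux/percpu.h> and <linux/kprobes.h> (preemption is already disabled in these exception handlers, as in the real code):

        /* Hypothetical per-CPU state mirroring the accessors converted above. */
        static DEFINE_PER_CPU(struct kprobe *, cur_probe);
        static DEFINE_PER_CPU(struct kprobe, saved_op);

        static void percpu_accessor_sketch(struct kprobe *p)
        {
                struct kprobe *op;

                __this_cpu_write(cur_probe, p);  /* was: __get_cpu_var(cur_probe) = p;  */
                p = __this_cpu_read(cur_probe);  /* was: p = __get_cpu_var(cur_probe);  */
                op = this_cpu_ptr(&saved_op);    /* was: op = &__get_cpu_var(saved_op); */
                op->addr = NULL;
        }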
diff --git a/arch/sh/kernel/localtimer.c b/arch/sh/kernel/localtimer.c index 8bfc6dfa8b94..b880a7e2ace7 100644 --- a/arch/sh/kernel/localtimer.c +++ b/arch/sh/kernel/localtimer.c | |||
@@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct clock_event_device, local_clockevent); | |||
32 | */ | 32 | */ |
33 | void local_timer_interrupt(void) | 33 | void local_timer_interrupt(void) |
34 | { | 34 | { |
35 | struct clock_event_device *clk = &__get_cpu_var(local_clockevent); | 35 | struct clock_event_device *clk = this_cpu_ptr(&local_clockevent); |
36 | 36 | ||
37 | irq_enter(); | 37 | irq_enter(); |
38 | clk->event_handler(clk); | 38 | clk->event_handler(clk); |
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index b9cefebda55c..02331672b6db 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c | |||
@@ -227,7 +227,7 @@ again: | |||
227 | 227 | ||
228 | static void sh_pmu_stop(struct perf_event *event, int flags) | 228 | static void sh_pmu_stop(struct perf_event *event, int flags) |
229 | { | 229 | { |
230 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 230 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
231 | struct hw_perf_event *hwc = &event->hw; | 231 | struct hw_perf_event *hwc = &event->hw; |
232 | int idx = hwc->idx; | 232 | int idx = hwc->idx; |
233 | 233 | ||
@@ -245,7 +245,7 @@ static void sh_pmu_stop(struct perf_event *event, int flags) | |||
245 | 245 | ||
246 | static void sh_pmu_start(struct perf_event *event, int flags) | 246 | static void sh_pmu_start(struct perf_event *event, int flags) |
247 | { | 247 | { |
248 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 248 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
249 | struct hw_perf_event *hwc = &event->hw; | 249 | struct hw_perf_event *hwc = &event->hw; |
250 | int idx = hwc->idx; | 250 | int idx = hwc->idx; |
251 | 251 | ||
@@ -262,7 +262,7 @@ static void sh_pmu_start(struct perf_event *event, int flags) | |||
262 | 262 | ||
263 | static void sh_pmu_del(struct perf_event *event, int flags) | 263 | static void sh_pmu_del(struct perf_event *event, int flags) |
264 | { | 264 | { |
265 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 265 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
266 | 266 | ||
267 | sh_pmu_stop(event, PERF_EF_UPDATE); | 267 | sh_pmu_stop(event, PERF_EF_UPDATE); |
268 | __clear_bit(event->hw.idx, cpuc->used_mask); | 268 | __clear_bit(event->hw.idx, cpuc->used_mask); |
@@ -272,7 +272,7 @@ static void sh_pmu_del(struct perf_event *event, int flags) | |||
272 | 272 | ||
273 | static int sh_pmu_add(struct perf_event *event, int flags) | 273 | static int sh_pmu_add(struct perf_event *event, int flags) |
274 | { | 274 | { |
275 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 275 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
276 | struct hw_perf_event *hwc = &event->hw; | 276 | struct hw_perf_event *hwc = &event->hw; |
277 | int idx = hwc->idx; | 277 | int idx = hwc->idx; |
278 | int ret = -EAGAIN; | 278 | int ret = -EAGAIN; |
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 86a7936a980b..fc5acfc93c92 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c | |||
@@ -111,7 +111,7 @@ void play_dead_common(void) | |||
111 | irq_ctx_exit(raw_smp_processor_id()); | 111 | irq_ctx_exit(raw_smp_processor_id()); |
112 | mb(); | 112 | mb(); |
113 | 113 | ||
114 | __get_cpu_var(cpu_state) = CPU_DEAD; | 114 | __this_cpu_write(cpu_state, CPU_DEAD); |
115 | local_irq_disable(); | 115 | local_irq_disable(); |
116 | } | 116 | } |
117 | 117 | ||
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 0d676a41081e..d7762349ea48 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c | |||
@@ -83,11 +83,6 @@ int pud_huge(pud_t pud) | |||
83 | return 0; | 83 | return 0; |
84 | } | 84 | } |
85 | 85 | ||
86 | int pmd_huge_support(void) | ||
87 | { | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 86 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
92 | pmd_t *pmd, int write) | 87 | pmd_t *pmd, int write) |
93 | { | 88 | { |
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index dfa53fdd5cbc..0aac1e8f2968 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h | |||
@@ -25,7 +25,6 @@ | |||
25 | #define __ARCH_WANT_SYS_ALARM | 25 | #define __ARCH_WANT_SYS_ALARM |
26 | #define __ARCH_WANT_SYS_GETHOSTNAME | 26 | #define __ARCH_WANT_SYS_GETHOSTNAME |
27 | #define __ARCH_WANT_SYS_PAUSE | 27 | #define __ARCH_WANT_SYS_PAUSE |
28 | #define __ARCH_WANT_SYS_SGETMASK | ||
29 | #define __ARCH_WANT_SYS_SIGNAL | 28 | #define __ARCH_WANT_SYS_SIGNAL |
30 | #define __ARCH_WANT_SYS_TIME | 29 | #define __ARCH_WANT_SYS_TIME |
31 | #define __ARCH_WANT_SYS_UTIME | 30 | #define __ARCH_WANT_SYS_UTIME |
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 9bd9ce80bf77..d329537739c6 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c | |||
@@ -231,11 +231,6 @@ int pud_huge(pud_t pud) | |||
231 | return 0; | 231 | return 0; |
232 | } | 232 | } |
233 | 233 | ||
234 | int pmd_huge_support(void) | ||
235 | { | ||
236 | return 0; | ||
237 | } | ||
238 | |||
239 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 234 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
240 | pmd_t *pmd, int write) | 235 | pmd_t *pmd, int write) |
241 | { | 236 | { |
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index 004ba568d93f..33294fdc402e 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c | |||
@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) | |||
417 | if (put_page_testzero(page)) { | 417 | if (put_page_testzero(page)) { |
418 | homecache_change_page_home(page, order, PAGE_HOME_HASH); | 418 | homecache_change_page_home(page, order, PAGE_HOME_HASH); |
419 | if (order == 0) { | 419 | if (order == 0) { |
420 | free_hot_cold_page(page, 0); | 420 | free_hot_cold_page(page, false); |
421 | } else { | 421 | } else { |
422 | init_page_count(page); | 422 | init_page_count(page); |
423 | __free_pages(page, order); | 423 | __free_pages(page, order); |
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 0cb3bbaa580c..e514899e1100 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c | |||
@@ -166,11 +166,6 @@ int pud_huge(pud_t pud) | |||
166 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); | 166 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); |
167 | } | 167 | } |
168 | 168 | ||
169 | int pmd_huge_support(void) | ||
170 | { | ||
171 | return 1; | ||
172 | } | ||
173 | |||
174 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 169 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
175 | pmd_t *pmd, int write) | 170 | pmd_t *pmd, int write) |
176 | { | 171 | { |
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c index 13068ee22f33..bf012b2b71a9 100644 --- a/arch/unicore32/mm/ioremap.c +++ b/arch/unicore32/mm/ioremap.c | |||
@@ -144,11 +144,11 @@ void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn, | |||
144 | * Don't allow RAM to be mapped | 144 | * Don't allow RAM to be mapped |
145 | */ | 145 | */ |
146 | if (pfn_valid(pfn)) { | 146 | if (pfn_valid(pfn)) { |
147 | printk(KERN_WARNING "BUG: Your driver calls ioremap() on\n" | 147 | WARN(1, "BUG: Your driver calls ioremap() on\n" |
148 | "system memory. This leads to architecturally\n" | 148 | "system memory. This leads to architecturally\n" |
149 | "unpredictable behaviour, and ioremap() will fail in\n" | 149 | "unpredictable behaviour, and ioremap() will fail in\n" |
150 | "the next kernel release. Please fix your driver.\n"); | 150 | "the next kernel release. Please fix your driver.\n"); |
151 | WARN_ON(1); | 151 | return NULL; |
152 | } | 152 | } |
153 | 153 | ||
154 | type = get_mem_type(mtype); | 154 | type = get_mem_type(mtype); |
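The unicore32 change above folds the old printk() + WARN_ON(1) pair into a single WARN(1, ...) and now refuses the mapping by returning NULL. Since WARN() also returns its condition, the check and the diagnostic could equally be combined into one statement; an equivalent, slightly more compact form of the same guard might look like this (a sketch, not the driver's actual code):

        /* WARN() prints the message plus a backtrace and returns the condition. */
        if (WARN(pfn_valid(pfn),
                 "BUG: Your driver calls ioremap() on system memory.  This leads to\n"
                 "architecturally unpredictable behaviour, and ioremap() will fail in\n"
                 "the next kernel release.  Please fix your driver.\n"))
                return NULL;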
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a01d4335029..272b493ea1bf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -26,7 +26,7 @@ config X86 | |||
26 | select ARCH_MIGHT_HAVE_PC_SERIO | 26 | select ARCH_MIGHT_HAVE_PC_SERIO |
27 | select HAVE_AOUT if X86_32 | 27 | select HAVE_AOUT if X86_32 |
28 | select HAVE_UNSTABLE_SCHED_CLOCK | 28 | select HAVE_UNSTABLE_SCHED_CLOCK |
29 | select ARCH_SUPPORTS_NUMA_BALANCING | 29 | select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 |
30 | select ARCH_SUPPORTS_INT128 if X86_64 | 30 | select ARCH_SUPPORTS_INT128 if X86_64 |
31 | select ARCH_WANTS_PROT_NUMA_PROT_NONE | 31 | select ARCH_WANTS_PROT_NUMA_PROT_NONE |
32 | select HAVE_IDE | 32 | select HAVE_IDE |
@@ -41,7 +41,7 @@ config X86 | |||
41 | select ARCH_WANT_OPTIONAL_GPIOLIB | 41 | select ARCH_WANT_OPTIONAL_GPIOLIB |
42 | select ARCH_WANT_FRAME_POINTERS | 42 | select ARCH_WANT_FRAME_POINTERS |
43 | select HAVE_DMA_ATTRS | 43 | select HAVE_DMA_ATTRS |
44 | select HAVE_DMA_CONTIGUOUS if !SWIOTLB | 44 | select HAVE_DMA_CONTIGUOUS |
45 | select HAVE_KRETPROBES | 45 | select HAVE_KRETPROBES |
46 | select GENERIC_EARLY_IOREMAP | 46 | select GENERIC_EARLY_IOREMAP |
47 | select HAVE_OPTPROBES | 47 | select HAVE_OPTPROBES |
@@ -105,7 +105,7 @@ config X86 | |||
105 | select HAVE_ARCH_SECCOMP_FILTER | 105 | select HAVE_ARCH_SECCOMP_FILTER |
106 | select BUILDTIME_EXTABLE_SORT | 106 | select BUILDTIME_EXTABLE_SORT |
107 | select GENERIC_CMOS_UPDATE | 107 | select GENERIC_CMOS_UPDATE |
108 | select HAVE_ARCH_SOFT_DIRTY | 108 | select HAVE_ARCH_SOFT_DIRTY if X86_64 |
109 | select CLOCKSOURCE_WATCHDOG | 109 | select CLOCKSOURCE_WATCHDOG |
110 | select GENERIC_CLOCKEVENTS | 110 | select GENERIC_CLOCKEVENTS |
111 | select ARCH_CLOCKSOURCE_DATA | 111 | select ARCH_CLOCKSOURCE_DATA |
@@ -1874,6 +1874,10 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK | |||
1874 | def_bool y | 1874 | def_bool y |
1875 | depends on X86_64 || X86_PAE | 1875 | depends on X86_64 || X86_PAE |
1876 | 1876 | ||
1877 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | ||
1878 | def_bool y | ||
1879 | depends on X86_64 && HUGETLB_PAGE && MIGRATION | ||
1880 | |||
1877 | menu "Power management and ACPI options" | 1881 | menu "Power management and ACPI options" |
1878 | 1882 | ||
1879 | config ARCH_HIBERNATION_HEADER | 1883 | config ARCH_HIBERNATION_HEADER |
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6e4ce2df87cf..958b90f761e5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -176,8 +176,6 @@ int mce_available(struct cpuinfo_x86 *c); | |||
176 | DECLARE_PER_CPU(unsigned, mce_exception_count); | 176 | DECLARE_PER_CPU(unsigned, mce_exception_count); |
177 | DECLARE_PER_CPU(unsigned, mce_poll_count); | 177 | DECLARE_PER_CPU(unsigned, mce_poll_count); |
178 | 178 | ||
179 | extern atomic_t mce_entry; | ||
180 | |||
181 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | 179 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); |
182 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | 180 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); |
183 | 181 | ||
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 0d193e234647..206a87fdd22d 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h | |||
@@ -62,66 +62,14 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi | |||
62 | return ((value >> rightshift) & mask) << leftshift; | 62 | return ((value >> rightshift) & mask) << leftshift; |
63 | } | 63 | } |
64 | 64 | ||
65 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
66 | |||
67 | /* | ||
68 | * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and | ||
69 | * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset | ||
70 | * into this range. | ||
71 | */ | ||
72 | #define PTE_FILE_MAX_BITS 28 | ||
73 | #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) | ||
74 | #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) | ||
75 | #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) | ||
76 | #define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) | ||
77 | #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) | ||
78 | #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) | ||
79 | #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) | ||
80 | |||
81 | #define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) | ||
82 | #define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) | ||
83 | #define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) | ||
84 | |||
85 | #define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) | ||
86 | #define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) | ||
87 | #define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) | ||
88 | |||
89 | static __always_inline pgoff_t pte_to_pgoff(pte_t pte) | ||
90 | { | ||
91 | return (pgoff_t) | ||
92 | (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + | ||
93 | pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + | ||
94 | pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + | ||
95 | pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4)); | ||
96 | } | ||
97 | |||
98 | static __always_inline pte_t pgoff_to_pte(pgoff_t off) | ||
99 | { | ||
100 | return (pte_t){ | ||
101 | .pte_low = | ||
102 | pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + | ||
103 | pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + | ||
104 | pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + | ||
105 | pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) + | ||
106 | _PAGE_FILE, | ||
107 | }; | ||
108 | } | ||
109 | |||
110 | #else /* CONFIG_MEM_SOFT_DIRTY */ | ||
111 | |||
112 | /* | 65 | /* |
113 | * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, | 66 | * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, |
114 | * split up the 29 bits of offset into this range. | 67 | * split up the 29 bits of offset into this range. |
115 | */ | 68 | */ |
116 | #define PTE_FILE_MAX_BITS 29 | 69 | #define PTE_FILE_MAX_BITS 29 |
117 | #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) | 70 | #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) |
118 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE | ||
119 | #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) | 71 | #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) |
120 | #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) | 72 | #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) |
121 | #else | ||
122 | #define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) | ||
123 | #define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) | ||
124 | #endif | ||
125 | #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) | 73 | #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) |
126 | #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) | 74 | #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) |
127 | 75 | ||
@@ -150,16 +98,9 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) | |||
150 | }; | 98 | }; |
151 | } | 99 | } |
152 | 100 | ||
153 | #endif /* CONFIG_MEM_SOFT_DIRTY */ | ||
154 | |||
155 | /* Encode and de-code a swap entry */ | 101 | /* Encode and de-code a swap entry */ |
156 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE | ||
157 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) | 102 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) |
158 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) | 103 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) |
159 | #else | ||
160 | #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) | ||
161 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) | ||
162 | #endif | ||
163 | 104 | ||
164 | #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) | 105 | #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) |
165 | 106 | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b459ddf27d64..0ec056012618 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte) | |||
131 | 131 | ||
132 | static inline int pte_special(pte_t pte) | 132 | static inline int pte_special(pte_t pte) |
133 | { | 133 | { |
134 | return pte_flags(pte) & _PAGE_SPECIAL; | 134 | return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == |
135 | (_PAGE_PRESENT|_PAGE_SPECIAL); | ||
135 | } | 136 | } |
136 | 137 | ||
137 | static inline unsigned long pte_pfn(pte_t pte) | 138 | static inline unsigned long pte_pfn(pte_t pte) |
@@ -296,6 +297,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) | |||
296 | return pmd_clear_flags(pmd, _PAGE_PRESENT); | 297 | return pmd_clear_flags(pmd, _PAGE_PRESENT); |
297 | } | 298 | } |
298 | 299 | ||
300 | #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY | ||
299 | static inline int pte_soft_dirty(pte_t pte) | 301 | static inline int pte_soft_dirty(pte_t pte) |
300 | { | 302 | { |
301 | return pte_flags(pte) & _PAGE_SOFT_DIRTY; | 303 | return pte_flags(pte) & _PAGE_SOFT_DIRTY; |
@@ -331,6 +333,8 @@ static inline int pte_file_soft_dirty(pte_t pte) | |||
331 | return pte_flags(pte) & _PAGE_SOFT_DIRTY; | 333 | return pte_flags(pte) & _PAGE_SOFT_DIRTY; |
332 | } | 334 | } |
333 | 335 | ||
336 | #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ | ||
337 | |||
334 | /* | 338 | /* |
335 | * Mask out unsupported bits in a present pgprot. Non-present pgprots | 339 | * Mask out unsupported bits in a present pgprot. Non-present pgprots |
336 | * can use those bits for other purposes, so leave them be. | 340 | * can use those bits for other purposes, so leave them be. |
@@ -452,6 +456,12 @@ static inline int pte_present(pte_t a) | |||
452 | _PAGE_NUMA); | 456 | _PAGE_NUMA); |
453 | } | 457 | } |
454 | 458 | ||
459 | #define pte_present_nonuma pte_present_nonuma | ||
460 | static inline int pte_present_nonuma(pte_t a) | ||
461 | { | ||
462 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | ||
463 | } | ||
464 | |||
455 | #define pte_accessible pte_accessible | 465 | #define pte_accessible pte_accessible |
456 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) | 466 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) |
457 | { | 467 | { |
@@ -858,23 +868,25 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, | |||
858 | { | 868 | { |
859 | } | 869 | } |
860 | 870 | ||
871 | #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY | ||
861 | static inline pte_t pte_swp_mksoft_dirty(pte_t pte) | 872 | static inline pte_t pte_swp_mksoft_dirty(pte_t pte) |
862 | { | 873 | { |
863 | VM_BUG_ON(pte_present(pte)); | 874 | VM_BUG_ON(pte_present_nonuma(pte)); |
864 | return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); | 875 | return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); |
865 | } | 876 | } |
866 | 877 | ||
867 | static inline int pte_swp_soft_dirty(pte_t pte) | 878 | static inline int pte_swp_soft_dirty(pte_t pte) |
868 | { | 879 | { |
869 | VM_BUG_ON(pte_present(pte)); | 880 | VM_BUG_ON(pte_present_nonuma(pte)); |
870 | return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; | 881 | return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; |
871 | } | 882 | } |
872 | 883 | ||
873 | static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) | 884 | static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) |
874 | { | 885 | { |
875 | VM_BUG_ON(pte_present(pte)); | 886 | VM_BUG_ON(pte_present_nonuma(pte)); |
876 | return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); | 887 | return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); |
877 | } | 888 | } |
889 | #endif | ||
878 | 890 | ||
879 | #include <asm-generic/pgtable.h> | 891 | #include <asm-generic/pgtable.h> |
880 | #endif /* __ASSEMBLY__ */ | 892 | #endif /* __ASSEMBLY__ */ |
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e22c1dbf7feb..5be9063545d2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -143,12 +143,12 @@ static inline int pgd_large(pgd_t pgd) { return 0; } | |||
143 | #define pte_unmap(pte) ((void)(pte))/* NOP */ | 143 | #define pte_unmap(pte) ((void)(pte))/* NOP */ |
144 | 144 | ||
145 | /* Encode and de-code a swap entry */ | 145 | /* Encode and de-code a swap entry */ |
146 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE | ||
147 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) | 146 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) |
148 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) | 147 | #ifdef CONFIG_NUMA_BALANCING |
148 | /* Automatic NUMA balancing needs to be distinguishable from swap entries */ | ||
149 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) | ||
149 | #else | 150 | #else |
150 | #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) | 151 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) |
151 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) | ||
152 | #endif | 152 | #endif |
153 | 153 | ||
154 | #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) | 154 | #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index eb3d44945133..f216963760e5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -16,15 +16,26 @@ | |||
16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | 16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ |
17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ | 17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ |
18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | 18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ |
19 | #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ | 19 | #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ |
20 | #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ | 20 | #define _PAGE_BIT_SOFTW2 10 /* " */ |
21 | #define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ | 21 | #define _PAGE_BIT_SOFTW3 11 /* " */ |
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 |
25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ | 25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ |
26 | #define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ | ||
27 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ | ||
28 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ | ||
26 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | 29 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ |
27 | 30 | ||
31 | /* | ||
32 | * Swap offsets on configurations that allow automatic NUMA balancing use the | ||
33 | * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from | ||
34 | * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the | ||
35 | * maximum possible swap space from 16TB to 8TB. | ||
36 | */ | ||
37 | #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) | ||
38 | |||
28 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ | 39 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ |
29 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ | 40 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ |
30 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL | 41 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL |
@@ -40,7 +51,7 @@ | |||
40 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) | 51 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) |
41 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) | 52 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) |
42 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) | 53 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
43 | #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) | 54 | #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) |
44 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) | 55 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) |
45 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) | 56 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
46 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 57 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
@@ -61,8 +72,6 @@ | |||
61 | * they do not conflict with each other. | 72 | * they do not conflict with each other. |
62 | */ | 73 | */ |
63 | 74 | ||
64 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN | ||
65 | |||
66 | #ifdef CONFIG_MEM_SOFT_DIRTY | 75 | #ifdef CONFIG_MEM_SOFT_DIRTY |
67 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) | 76 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) |
68 | #else | 77 | #else |
@@ -70,6 +79,21 @@ | |||
70 | #endif | 79 | #endif |
71 | 80 | ||
72 | /* | 81 | /* |
82 | * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page | ||
83 | * that is not present. The hinting fault gathers numa placement statistics | ||
84 | * (see pte_numa()). The bit is always zero when the PTE is not present. | ||
85 | * | ||
86 | * The bit picked must be always zero when the pmd is present and not | ||
87 | * present, so that we don't lose information when we set it while | ||
88 | * atomically clearing the present bit. | ||
89 | */ | ||
90 | #ifdef CONFIG_NUMA_BALANCING | ||
91 | #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) | ||
92 | #else | ||
93 | #define _PAGE_NUMA (_AT(pteval_t, 0)) | ||
94 | #endif | ||
95 | |||
96 | /* | ||
73 | * Tracking soft dirty bit when a page goes to a swap is tricky. | 97 | * Tracking soft dirty bit when a page goes to a swap is tricky. |
74 | * We need a bit which can be stored in pte _and_ not conflict | 98 | * We need a bit which can be stored in pte _and_ not conflict |
75 | * with swap entry format. On x86 bits 6 and 7 are *not* involved | 99 | * with swap entry format. On x86 bits 6 and 7 are *not* involved |
@@ -94,26 +118,6 @@ | |||
94 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) | 118 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
95 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 119 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
96 | 120 | ||
97 | /* | ||
98 | * _PAGE_NUMA indicates that this page will trigger a numa hinting | ||
99 | * minor page fault to gather numa placement statistics (see | ||
100 | * pte_numa()). The bit picked (8) is within the range between | ||
101 | * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't | ||
102 | * require changes to the swp entry format because that bit is always | ||
103 | * zero when the pte is not present. | ||
104 | * | ||
105 | * The bit picked must be always zero when the pmd is present and not | ||
106 | * present, so that we don't lose information when we set it while | ||
107 | * atomically clearing the present bit. | ||
108 | * | ||
109 | * Because we shared the same bit (8) with _PAGE_PROTNONE this can be | ||
110 | * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE | ||
111 | * couldn't reach, like handle_mm_fault() (see access_error in | ||
112 | * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for | ||
113 | * handle_mm_fault() to be invoked). | ||
114 | */ | ||
115 | #define _PAGE_NUMA _PAGE_PROTNONE | ||
116 | |||
117 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 121 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
118 | _PAGE_ACCESSED | _PAGE_DIRTY) | 122 | _PAGE_ACCESSED | _PAGE_DIRTY) |
119 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ | 123 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ |
@@ -122,8 +126,8 @@ | |||
122 | /* Set of bits not changed in pte_modify */ | 126 | /* Set of bits not changed in pte_modify */ |
123 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ | 127 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ |
124 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ | 128 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ |
125 | _PAGE_SOFT_DIRTY) | 129 | _PAGE_SOFT_DIRTY | _PAGE_NUMA) |
126 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) | 130 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) |
127 | 131 | ||
128 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) | 132 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) |
129 | #define _PAGE_CACHE_WB (0) | 133 | #define _PAGE_CACHE_WB (0) |
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 977f1761a25d..ab05d73e2bb7 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h | |||
@@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void) | |||
29 | 29 | ||
30 | static inline void dma_mark_clean(void *addr, size_t size) {} | 30 | static inline void dma_mark_clean(void *addr, size_t size) {} |
31 | 31 | ||
32 | extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, | ||
33 | dma_addr_t *dma_handle, gfp_t flags, | ||
34 | struct dma_attrs *attrs); | ||
35 | extern void x86_swiotlb_free_coherent(struct device *dev, size_t size, | ||
36 | void *vaddr, dma_addr_t dma_addr, | ||
37 | struct dma_attrs *attrs); | ||
38 | |||
32 | #endif /* _ASM_X86_SWIOTLB_H */ | 39 | #endif /* _ASM_X86_SWIOTLB_H */ |
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3f556c6a0157..2b19caa4081c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h | |||
@@ -41,7 +41,6 @@ | |||
41 | # define __ARCH_WANT_SYS_OLD_GETRLIMIT | 41 | # define __ARCH_WANT_SYS_OLD_GETRLIMIT |
42 | # define __ARCH_WANT_SYS_OLD_UNAME | 42 | # define __ARCH_WANT_SYS_OLD_UNAME |
43 | # define __ARCH_WANT_SYS_PAUSE | 43 | # define __ARCH_WANT_SYS_PAUSE |
44 | # define __ARCH_WANT_SYS_SGETMASK | ||
45 | # define __ARCH_WANT_SYS_SIGNAL | 44 | # define __ARCH_WANT_SYS_SIGNAL |
46 | # define __ARCH_WANT_SYS_SIGPENDING | 45 | # define __ARCH_WANT_SYS_SIGPENDING |
47 | # define __ARCH_WANT_SYS_SIGPROCMASK | 46 | # define __ARCH_WANT_SYS_SIGPROCMASK |
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b574b295a2f9..8e3842fc8bea 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c | |||
@@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, | |||
512 | dma_addr_t dma_addr, struct dma_attrs *attrs) | 512 | dma_addr_t dma_addr, struct dma_attrs *attrs) |
513 | { | 513 | { |
514 | gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); | 514 | gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); |
515 | free_pages((unsigned long)vaddr, get_order(size)); | 515 | dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); |
516 | } | 516 | } |
517 | 517 | ||
518 | static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) | 518 | static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6cc800381d14..bb92f38153b2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); | |||
60 | 60 | ||
61 | #define SPINUNIT 100 /* 100ns */ | 61 | #define SPINUNIT 100 /* 100ns */ |
62 | 62 | ||
63 | atomic_t mce_entry; | ||
64 | |||
65 | DEFINE_PER_CPU(unsigned, mce_exception_count); | 63 | DEFINE_PER_CPU(unsigned, mce_exception_count); |
66 | 64 | ||
67 | struct mce_bank *mce_banks __read_mostly; | 65 | struct mce_bank *mce_banks __read_mostly; |
@@ -1040,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1040 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); | 1038 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); |
1041 | char *msg = "Unknown"; | 1039 | char *msg = "Unknown"; |
1042 | 1040 | ||
1043 | atomic_inc(&mce_entry); | ||
1044 | |||
1045 | this_cpu_inc(mce_exception_count); | 1041 | this_cpu_inc(mce_exception_count); |
1046 | 1042 | ||
1047 | if (!cfg->banks) | 1043 | if (!cfg->banks) |
@@ -1171,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1171 | mce_report_event(regs); | 1167 | mce_report_event(regs); |
1172 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); | 1168 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); |
1173 | out: | 1169 | out: |
1174 | atomic_dec(&mce_entry); | ||
1175 | sync_core(); | 1170 | sync_core(); |
1176 | } | 1171 | } |
1177 | EXPORT_SYMBOL_GPL(do_machine_check); | 1172 | EXPORT_SYMBOL_GPL(do_machine_check); |
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f4bf20..eda1a865641e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) | |||
172 | */ | 172 | */ |
173 | load_ucode_bsp(); | 173 | load_ucode_bsp(); |
174 | 174 | ||
175 | if (console_loglevel == 10) | 175 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
176 | early_printk("Kernel alive\n"); | 176 | early_printk("Kernel alive\n"); |
177 | 177 | ||
178 | clear_page(init_level4_pgt); | 178 | clear_page(init_level4_pgt); |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f7d0672481fd..a25e202bb319 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -97,12 +97,17 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, | |||
97 | 97 | ||
98 | dma_mask = dma_alloc_coherent_mask(dev, flag); | 98 | dma_mask = dma_alloc_coherent_mask(dev, flag); |
99 | 99 | ||
100 | flag |= __GFP_ZERO; | 100 | flag &= ~__GFP_ZERO; |
101 | again: | 101 | again: |
102 | page = NULL; | 102 | page = NULL; |
103 | /* CMA can be used only in the context which permits sleeping */ | 103 | /* CMA can be used only in the context which permits sleeping */ |
104 | if (flag & __GFP_WAIT) | 104 | if (flag & __GFP_WAIT) { |
105 | page = dma_alloc_from_contiguous(dev, count, get_order(size)); | 105 | page = dma_alloc_from_contiguous(dev, count, get_order(size)); |
106 | if (page && page_to_phys(page) + size > dma_mask) { | ||
107 | dma_release_from_contiguous(dev, page, count); | ||
108 | page = NULL; | ||
109 | } | ||
110 | } | ||
106 | /* fallback */ | 111 | /* fallback */ |
107 | if (!page) | 112 | if (!page) |
108 | page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); | 113 | page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); |
@@ -120,7 +125,7 @@ again: | |||
120 | 125 | ||
121 | return NULL; | 126 | return NULL; |
122 | } | 127 | } |
123 | 128 | memset(page_address(page), 0, size); | |
124 | *dma_addr = addr; | 129 | *dma_addr = addr; |
125 | return page_address(page); | 130 | return page_address(page); |
126 | } | 131 | } |
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6c483ba98b9c..77dd0ad58be4 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <asm/iommu_table.h> | 14 | #include <asm/iommu_table.h> |
15 | int swiotlb __read_mostly; | 15 | int swiotlb __read_mostly; |
16 | 16 | ||
17 | static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, | 17 | void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, |
18 | dma_addr_t *dma_handle, gfp_t flags, | 18 | dma_addr_t *dma_handle, gfp_t flags, |
19 | struct dma_attrs *attrs) | 19 | struct dma_attrs *attrs) |
20 | { | 20 | { |
@@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, | |||
28 | return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); | 28 | return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); |
29 | } | 29 | } |
30 | 30 | ||
31 | static void x86_swiotlb_free_coherent(struct device *dev, size_t size, | 31 | void x86_swiotlb_free_coherent(struct device *dev, size_t size, |
32 | void *vaddr, dma_addr_t dma_addr, | 32 | void *vaddr, dma_addr_t dma_addr, |
33 | struct dma_attrs *attrs) | 33 | struct dma_attrs *attrs) |
34 | { | 34 | { |
35 | swiotlb_free_coherent(dev, size, vaddr, dma_addr); | 35 | if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) |
36 | swiotlb_free_coherent(dev, size, vaddr, dma_addr); | ||
37 | else | ||
38 | dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); | ||
36 | } | 39 | } |
37 | 40 | ||
38 | static struct dma_map_ops swiotlb_dma_ops = { | 41 | static struct dma_map_ops swiotlb_dma_ops = { |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 09c76d265550..78a0e6298922 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) | |||
1119 | setup_real_mode(); | 1119 | setup_real_mode(); |
1120 | 1120 | ||
1121 | memblock_set_current_limit(get_max_mapped()); | 1121 | memblock_set_current_limit(get_max_mapped()); |
1122 | dma_contiguous_reserve(0); | 1122 | dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); |
1123 | 1123 | ||
1124 | /* | 1124 | /* |
1125 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. | 1125 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. |
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8c9f647ff9e1..8b977ebf9388 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
58 | { | 58 | { |
59 | return NULL; | 59 | return NULL; |
60 | } | 60 | } |
61 | |||
62 | int pmd_huge_support(void) | ||
63 | { | ||
64 | return 0; | ||
65 | } | ||
66 | #else | 61 | #else |
67 | 62 | ||
68 | struct page * | 63 | struct page * |
@@ -80,11 +75,6 @@ int pud_huge(pud_t pud) | |||
80 | { | 75 | { |
81 | return !!(pud_val(pud) & _PAGE_PSE); | 76 | return !!(pud_val(pud) & _PAGE_PSE); |
82 | } | 77 | } |
83 | |||
84 | int pmd_huge_support(void) | ||
85 | { | ||
86 | return 1; | ||
87 | } | ||
88 | #endif | 78 | #endif |
89 | 79 | ||
90 | #ifdef CONFIG_HUGETLB_PAGE | 80 | #ifdef CONFIG_HUGETLB_PAGE |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f35c66c5959a..b92591fa8970 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -1230,17 +1230,43 @@ const char *arch_vma_name(struct vm_area_struct *vma) | |||
1230 | return NULL; | 1230 | return NULL; |
1231 | } | 1231 | } |
1232 | 1232 | ||
1233 | #ifdef CONFIG_X86_UV | 1233 | static unsigned long probe_memory_block_size(void) |
1234 | unsigned long memory_block_size_bytes(void) | ||
1235 | { | 1234 | { |
1235 | /* start from 2g */ | ||
1236 | unsigned long bz = 1UL<<31; | ||
1237 | |||
1238 | #ifdef CONFIG_X86_UV | ||
1236 | if (is_uv_system()) { | 1239 | if (is_uv_system()) { |
1237 | printk(KERN_INFO "UV: memory block size 2GB\n"); | 1240 | printk(KERN_INFO "UV: memory block size 2GB\n"); |
1238 | return 2UL * 1024 * 1024 * 1024; | 1241 | return 2UL * 1024 * 1024 * 1024; |
1239 | } | 1242 | } |
1240 | return MIN_MEMORY_BLOCK_SIZE; | ||
1241 | } | ||
1242 | #endif | 1243 | #endif |
1243 | 1244 | ||
1245 | /* less than 64g installed */ | ||
1246 | if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) | ||
1247 | return MIN_MEMORY_BLOCK_SIZE; | ||
1248 | |||
1249 | /* get the tail size */ | ||
1250 | while (bz > MIN_MEMORY_BLOCK_SIZE) { | ||
1251 | if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) | ||
1252 | break; | ||
1253 | bz >>= 1; | ||
1254 | } | ||
1255 | |||
1256 | printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); | ||
1257 | |||
1258 | return bz; | ||
1259 | } | ||
1260 | |||
1261 | static unsigned long memory_block_size_probed; | ||
1262 | unsigned long memory_block_size_bytes(void) | ||
1263 | { | ||
1264 | if (!memory_block_size_probed) | ||
1265 | memory_block_size_probed = probe_memory_block_size(); | ||
1266 | |||
1267 | return memory_block_size_probed; | ||
1268 | } | ||
1269 | |||
1244 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 1270 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
1245 | /* | 1271 | /* |
1246 | * Initialise the sparsemem vmemmap using huge-pages at the PMD level. | 1272 | * Initialise the sparsemem vmemmap using huge-pages at the PMD level. |
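probe_memory_block_size() above keeps the UV special case, returns MIN_MEMORY_BLOCK_SIZE below 64 GB of RAM, and otherwise halves a 2 GB starting value until it evenly divides the end of physical memory. A worked example of that loop in plain C, assuming the 128 MB section size used on x86-64 (the memory sizes are made up):

        unsigned long end = 130UL << 30;        /* 130 GB of RAM           */
        unsigned long bz  = 1UL << 31;          /* start from 2 GB         */

        while (bz > (128UL << 20)) {            /* MIN_MEMORY_BLOCK_SIZE   */
                if (!(end & (bz - 1)))          /* bz evenly divides end?  */
                        break;
                bz >>= 1;
        }
        /* 130 GB is 65 * 2 GB, so bz stays at 2 GB; with 129 GB the loop
         * would drop to 1 GB, with 129.5 GB to 512 MB, and so on.        */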
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1d045f9c390f..a32b706c401a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -559,7 +559,7 @@ static void __init numa_clear_kernel_node_hotplug(void) | |||
559 | int i, nid; | 559 | int i, nid; |
560 | nodemask_t numa_kernel_nodes = NODE_MASK_NONE; | 560 | nodemask_t numa_kernel_nodes = NODE_MASK_NONE; |
561 | unsigned long start, end; | 561 | unsigned long start, end; |
562 | struct memblock_type *type = &memblock.reserved; | 562 | struct memblock_region *r; |
563 | 563 | ||
564 | /* | 564 | /* |
565 | * At this time, all memory regions reserved by memblock are | 565 | * At this time, all memory regions reserved by memblock are |
@@ -573,8 +573,8 @@ static void __init numa_clear_kernel_node_hotplug(void) | |||
573 | } | 573 | } |
574 | 574 | ||
575 | /* Mark all kernel nodes. */ | 575 | /* Mark all kernel nodes. */ |
576 | for (i = 0; i < type->cnt; i++) | 576 | for_each_memblock(reserved, r) |
577 | node_set(type->regions[i].nid, numa_kernel_nodes); | 577 | node_set(r->nid, numa_kernel_nodes); |
578 | 578 | ||
579 | /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ | 579 | /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ |
580 | for (i = 0; i < numa_meminfo.nr_blks; i++) { | 580 | for (i = 0; i < numa_meminfo.nr_blks; i++) { |
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 461bc8289024..6629f397b467 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c | |||
@@ -35,7 +35,7 @@ enum { | |||
35 | 35 | ||
36 | static int pte_testbit(pte_t pte) | 36 | static int pte_testbit(pte_t pte) |
37 | { | 37 | { |
38 | return pte_flags(pte) & _PAGE_UNUSED1; | 38 | return pte_flags(pte) & _PAGE_SOFTW1; |
39 | } | 39 | } |
40 | 40 | ||
41 | struct split_state { | 41 | struct split_state { |
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 9d8a509c9730..5ceda85b8687 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c | |||
@@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, | |||
173 | { | 173 | { |
174 | void *vaddr; | 174 | void *vaddr; |
175 | 175 | ||
176 | vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs); | 176 | vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs); |
177 | if (!vaddr) | ||
178 | vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags); | ||
179 | *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); | 177 | *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); |
180 | return vaddr; | 178 | return vaddr; |
181 | } | 179 | } |
@@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, | |||
183 | /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ | 181 | /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ |
184 | static struct dma_map_ops sta2x11_dma_ops = { | 182 | static struct dma_map_ops sta2x11_dma_ops = { |
185 | .alloc = sta2x11_swiotlb_alloc_coherent, | 183 | .alloc = sta2x11_swiotlb_alloc_coherent, |
186 | .free = swiotlb_free_coherent, | 184 | .free = x86_swiotlb_free_coherent, |
187 | .map_page = swiotlb_map_page, | 185 | .map_page = swiotlb_map_page, |
188 | .unmap_page = swiotlb_unmap_page, | 186 | .unmap_page = swiotlb_unmap_page, |
189 | .map_sg = swiotlb_map_sg_attrs, | 187 | .map_sg = swiotlb_map_sg_attrs, |
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index be27da60dc8f..c89c93320c12 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c | |||
@@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask; | |||
85 | * Default is all stack dumps go to the console and buffer. | 85 | * Default is all stack dumps go to the console and buffer. |
86 | * Lower level to send to log buffer only. | 86 | * Lower level to send to log buffer only. |
87 | */ | 87 | */ |
88 | static int uv_nmi_loglevel = 7; | 88 | static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; |
89 | module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); | 89 | module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); |
90 | 90 | ||
91 | /* | 91 | /* |
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 4b7b4522b64f..23b8726962af 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig | |||
@@ -258,7 +258,7 @@ endchoice | |||
258 | 258 | ||
259 | config CMA_ALIGNMENT | 259 | config CMA_ALIGNMENT |
260 | int "Maximum PAGE_SIZE order of alignment for contiguous buffers" | 260 | int "Maximum PAGE_SIZE order of alignment for contiguous buffers" |
261 | range 4 9 | 261 | range 4 12 |
262 | default 8 | 262 | default 8 |
263 | help | 263 | help |
264 | DMA mapping framework by default aligns all buffers to the smallest | 264 | DMA mapping framework by default aligns all buffers to the smallest |
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index c34ec3364243..83969f8c5727 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c | |||
@@ -60,11 +60,22 @@ struct cma *dma_contiguous_default_area; | |||
60 | */ | 60 | */ |
61 | static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M; | 61 | static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M; |
62 | static phys_addr_t size_cmdline = -1; | 62 | static phys_addr_t size_cmdline = -1; |
63 | static phys_addr_t base_cmdline; | ||
64 | static phys_addr_t limit_cmdline; | ||
63 | 65 | ||
64 | static int __init early_cma(char *p) | 66 | static int __init early_cma(char *p) |
65 | { | 67 | { |
66 | pr_debug("%s(%s)\n", __func__, p); | 68 | pr_debug("%s(%s)\n", __func__, p); |
67 | size_cmdline = memparse(p, &p); | 69 | size_cmdline = memparse(p, &p); |
70 | if (*p != '@') | ||
71 | return 0; | ||
72 | base_cmdline = memparse(p + 1, &p); | ||
73 | if (*p != '-') { | ||
74 | limit_cmdline = base_cmdline + size_cmdline; | ||
75 | return 0; | ||
76 | } | ||
77 | limit_cmdline = memparse(p + 1, &p); | ||
78 | |||
68 | return 0; | 79 | return 0; |
69 | } | 80 | } |
70 | early_param("cma", early_cma); | 81 | early_param("cma", early_cma); |
@@ -108,11 +119,18 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) | |||
108 | void __init dma_contiguous_reserve(phys_addr_t limit) | 119 | void __init dma_contiguous_reserve(phys_addr_t limit) |
109 | { | 120 | { |
110 | phys_addr_t selected_size = 0; | 121 | phys_addr_t selected_size = 0; |
122 | phys_addr_t selected_base = 0; | ||
123 | phys_addr_t selected_limit = limit; | ||
124 | bool fixed = false; | ||
111 | 125 | ||
112 | pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); | 126 | pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); |
113 | 127 | ||
114 | if (size_cmdline != -1) { | 128 | if (size_cmdline != -1) { |
115 | selected_size = size_cmdline; | 129 | selected_size = size_cmdline; |
130 | selected_base = base_cmdline; | ||
131 | selected_limit = min_not_zero(limit_cmdline, limit); | ||
132 | if (base_cmdline + size_cmdline == limit_cmdline) | ||
133 | fixed = true; | ||
116 | } else { | 134 | } else { |
117 | #ifdef CONFIG_CMA_SIZE_SEL_MBYTES | 135 | #ifdef CONFIG_CMA_SIZE_SEL_MBYTES |
118 | selected_size = size_bytes; | 136 | selected_size = size_bytes; |
@@ -129,10 +147,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit) | |||
129 | pr_debug("%s: reserving %ld MiB for global area\n", __func__, | 147 | pr_debug("%s: reserving %ld MiB for global area\n", __func__, |
130 | (unsigned long)selected_size / SZ_1M); | 148 | (unsigned long)selected_size / SZ_1M); |
131 | 149 | ||
132 | dma_contiguous_reserve_area(selected_size, 0, limit, | 150 | dma_contiguous_reserve_area(selected_size, selected_base, |
133 | &dma_contiguous_default_area); | 151 | selected_limit, |
152 | &dma_contiguous_default_area, | ||
153 | fixed); | ||
134 | } | 154 | } |
135 | }; | 155 | } |
136 | 156 | ||
137 | static DEFINE_MUTEX(cma_mutex); | 157 | static DEFINE_MUTEX(cma_mutex); |
138 | 158 | ||
@@ -189,15 +209,20 @@ core_initcall(cma_init_reserved_areas); | |||
189 | * @base: Base address of the reserved area optional, use 0 for any | 209 | * @base: Base address of the reserved area optional, use 0 for any |
190 | * @limit: End address of the reserved memory (optional, 0 for any). | 210 | * @limit: End address of the reserved memory (optional, 0 for any). |
191 | * @res_cma: Pointer to store the created cma region. | 211 | * @res_cma: Pointer to store the created cma region. |
212 | * @fixed: hint about where to place the reserved area | ||
192 | * | 213 | * |
193 | * This function reserves memory from early allocator. It should be | 214 | * This function reserves memory from early allocator. It should be |
194 | * called by arch specific code once the early allocator (memblock or bootmem) | 215 | * called by arch specific code once the early allocator (memblock or bootmem) |
195 | * has been activated and all other subsystems have already allocated/reserved | 216 | * has been activated and all other subsystems have already allocated/reserved |
196 | * memory. This function allows to create custom reserved areas for specific | 217 | * memory. This function allows to create custom reserved areas for specific |
197 | * devices. | 218 | * devices. |
219 | * | ||
220 | * If @fixed is true, reserve contiguous area at exactly @base. If false, | ||
221 | * reserve in range from @base to @limit. | ||
198 | */ | 222 | */ |
199 | int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, | 223 | int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, |
200 | phys_addr_t limit, struct cma **res_cma) | 224 | phys_addr_t limit, struct cma **res_cma, |
225 | bool fixed) | ||
201 | { | 226 | { |
202 | struct cma *cma = &cma_areas[cma_area_count]; | 227 | struct cma *cma = &cma_areas[cma_area_count]; |
203 | phys_addr_t alignment; | 228 | phys_addr_t alignment; |
@@ -223,18 +248,15 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, | |||
223 | limit &= ~(alignment - 1); | 248 | limit &= ~(alignment - 1); |
224 | 249 | ||
225 | /* Reserve memory */ | 250 | /* Reserve memory */ |
226 | if (base) { | 251 | if (base && fixed) { |
227 | if (memblock_is_region_reserved(base, size) || | 252 | if (memblock_is_region_reserved(base, size) || |
228 | memblock_reserve(base, size) < 0) { | 253 | memblock_reserve(base, size) < 0) { |
229 | ret = -EBUSY; | 254 | ret = -EBUSY; |
230 | goto err; | 255 | goto err; |
231 | } | 256 | } |
232 | } else { | 257 | } else { |
233 | /* | 258 | phys_addr_t addr = memblock_alloc_range(size, alignment, base, |
234 | * Use __memblock_alloc_base() since | 259 | limit); |
235 | * memblock_alloc_base() panic()s. | ||
236 | */ | ||
237 | phys_addr_t addr = __memblock_alloc_base(size, alignment, limit); | ||
238 | if (!addr) { | 260 | if (!addr) { |
239 | ret = -ENOMEM; | 261 | ret = -ENOMEM; |
240 | goto err; | 262 | goto err; |
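Taken together, the dma-contiguous changes above let the cma= boot parameter carry a placement as well as a size: as the parser reads it, the accepted forms are cma=size, cma=size@base and cma=size@base-limit, and dma_contiguous_reserve() treats the area as fixed-placement exactly when base + size == limit. A hedged illustration of the resulting options and of the new reserve call (the addresses below are made up, and the range form is still capped by the architecture limit):

        /* Boot command line forms accepted by early_cma() above:
         *   cma=64M          64 MB anywhere below the arch limit (old behaviour)
         *   cma=64M@512M     64 MB fixed at physical 512 MB
         *   cma=64M@512M-1G  64 MB placed somewhere inside [512 MB, 1 GB)
         */

        /* Equivalent explicit reservation using the new 'fixed' argument: */
        struct cma *area;
        int ret = dma_contiguous_reserve_area(SZ_64M, SZ_512M,
                                              SZ_512M + SZ_64M, &area,
                                              true /* fixed */);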
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bece691cb5d9..89f752dd8465 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -118,16 +118,6 @@ static ssize_t show_mem_start_phys_index(struct device *dev, | |||
118 | return sprintf(buf, "%08lx\n", phys_index); | 118 | return sprintf(buf, "%08lx\n", phys_index); |
119 | } | 119 | } |
120 | 120 | ||
121 | static ssize_t show_mem_end_phys_index(struct device *dev, | ||
122 | struct device_attribute *attr, char *buf) | ||
123 | { | ||
124 | struct memory_block *mem = to_memory_block(dev); | ||
125 | unsigned long phys_index; | ||
126 | |||
127 | phys_index = mem->end_section_nr / sections_per_block; | ||
128 | return sprintf(buf, "%08lx\n", phys_index); | ||
129 | } | ||
130 | |||
131 | /* | 121 | /* |
132 | * Show whether the section of memory is likely to be hot-removable | 122 | * Show whether the section of memory is likely to be hot-removable |
133 | */ | 123 | */ |
@@ -384,7 +374,6 @@ static ssize_t show_phys_device(struct device *dev, | |||
384 | } | 374 | } |
385 | 375 | ||
386 | static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); | 376 | static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); |
387 | static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); | ||
388 | static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); | 377 | static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); |
389 | static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); | 378 | static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); |
390 | static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); | 379 | static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); |
@@ -529,7 +518,6 @@ struct memory_block *find_memory_block(struct mem_section *section) | |||
529 | 518 | ||
530 | static struct attribute *memory_memblk_attrs[] = { | 519 | static struct attribute *memory_memblk_attrs[] = { |
531 | &dev_attr_phys_index.attr, | 520 | &dev_attr_phys_index.attr, |
532 | &dev_attr_end_phys_index.attr, | ||
533 | &dev_attr_state.attr, | 521 | &dev_attr_state.attr, |
534 | &dev_attr_phys_device.attr, | 522 | &dev_attr_phys_device.attr, |
535 | &dev_attr_removable.attr, | 523 | &dev_attr_removable.attr, |
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e73b85cf0756..c7d138eca731 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c | |||
@@ -200,11 +200,11 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) | |||
200 | 200 | ||
201 | copy = min_t(size_t, n, PAGE_SIZE - offset); | 201 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
202 | if (!brd_insert_page(brd, sector)) | 202 | if (!brd_insert_page(brd, sector)) |
203 | return -ENOMEM; | 203 | return -ENOSPC; |
204 | if (copy < n) { | 204 | if (copy < n) { |
205 | sector += copy >> SECTOR_SHIFT; | 205 | sector += copy >> SECTOR_SHIFT; |
206 | if (!brd_insert_page(brd, sector)) | 206 | if (!brd_insert_page(brd, sector)) |
207 | return -ENOMEM; | 207 | return -ENOSPC; |
208 | } | 208 | } |
209 | return 0; | 209 | return 0; |
210 | } | 210 | } |
@@ -360,6 +360,15 @@ out: | |||
360 | bio_endio(bio, err); | 360 | bio_endio(bio, err); |
361 | } | 361 | } |
362 | 362 | ||
363 | static int brd_rw_page(struct block_device *bdev, sector_t sector, | ||
364 | struct page *page, int rw) | ||
365 | { | ||
366 | struct brd_device *brd = bdev->bd_disk->private_data; | ||
367 | int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector); | ||
368 | page_endio(page, rw & WRITE, err); | ||
369 | return err; | ||
370 | } | ||
371 | |||
363 | #ifdef CONFIG_BLK_DEV_XIP | 372 | #ifdef CONFIG_BLK_DEV_XIP |
364 | static int brd_direct_access(struct block_device *bdev, sector_t sector, | 373 | static int brd_direct_access(struct block_device *bdev, sector_t sector, |
365 | void **kaddr, unsigned long *pfn) | 374 | void **kaddr, unsigned long *pfn) |
@@ -375,7 +384,7 @@ static int brd_direct_access(struct block_device *bdev, sector_t sector, | |||
375 | return -ERANGE; | 384 | return -ERANGE; |
376 | page = brd_insert_page(brd, sector); | 385 | page = brd_insert_page(brd, sector); |
377 | if (!page) | 386 | if (!page) |
378 | return -ENOMEM; | 387 | return -ENOSPC; |
379 | *kaddr = page_address(page); | 388 | *kaddr = page_address(page); |
380 | *pfn = page_to_pfn(page); | 389 | *pfn = page_to_pfn(page); |
381 | 390 | ||
@@ -419,6 +428,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode, | |||
419 | 428 | ||
420 | static const struct block_device_operations brd_fops = { | 429 | static const struct block_device_operations brd_fops = { |
421 | .owner = THIS_MODULE, | 430 | .owner = THIS_MODULE, |
431 | .rw_page = brd_rw_page, | ||
422 | .ioctl = brd_ioctl, | 432 | .ioctl = brd_ioctl, |
423 | #ifdef CONFIG_BLK_DEV_XIP | 433 | #ifdef CONFIG_BLK_DEV_XIP |
424 | .direct_access = brd_direct_access, | 434 | .direct_access = brd_direct_access, |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9849b5233bf4..48eccb350180 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -572,10 +572,10 @@ static void zram_bio_discard(struct zram *zram, u32 index, | |||
572 | * skipping this logical block is appropriate here. | 572 | * skipping this logical block is appropriate here. |
573 | */ | 573 | */ |
574 | if (offset) { | 574 | if (offset) { |
575 | if (n < offset) | 575 | if (n <= (PAGE_SIZE - offset)) |
576 | return; | 576 | return; |
577 | 577 | ||
578 | n -= offset; | 578 | n -= (PAGE_SIZE - offset); |
579 | index++; | 579 | index++; |
580 | } | 580 | } |
581 | 581 | ||
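Note: the zram discard fix above corrects which byte count is skipped when a discard request starts part-way into a page. A worked example, assuming PAGE_SIZE == 4096:

	/* Discard starting 512 bytes into a logical block: offset = 512,
	 * bytes left until the next page boundary = PAGE_SIZE - offset = 3584.
	 *
	 * old code:  if (n < 512)    return;    n -= 512;   // skipped too little
	 * new code:  if (n <= 3584)  return;    n -= 3584;  // skip the rest of the
	 *                                                   // partially covered page
	 */

With the corrected amount, the partially covered page is never freed and the index/length bookkeeping for the following full pages stays right.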
diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 6c1885eedfdf..800158714473 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c | |||
@@ -467,14 +467,17 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, | |||
467 | goto err_free; | 467 | goto err_free; |
468 | } | 468 | } |
469 | 469 | ||
470 | down_read(¤t->mm->mmap_sem); | ||
470 | vma = find_vma(current->mm, userptr); | 471 | vma = find_vma(current->mm, userptr); |
471 | if (!vma) { | 472 | if (!vma) { |
473 | up_read(¤t->mm->mmap_sem); | ||
472 | DRM_ERROR("failed to get vm region.\n"); | 474 | DRM_ERROR("failed to get vm region.\n"); |
473 | ret = -EFAULT; | 475 | ret = -EFAULT; |
474 | goto err_free_pages; | 476 | goto err_free_pages; |
475 | } | 477 | } |
476 | 478 | ||
477 | if (vma->vm_end < userptr + size) { | 479 | if (vma->vm_end < userptr + size) { |
480 | up_read(¤t->mm->mmap_sem); | ||
478 | DRM_ERROR("vma is too small.\n"); | 481 | DRM_ERROR("vma is too small.\n"); |
479 | ret = -EFAULT; | 482 | ret = -EFAULT; |
480 | goto err_free_pages; | 483 | goto err_free_pages; |
@@ -482,6 +485,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, | |||
482 | 485 | ||
483 | g2d_userptr->vma = exynos_gem_get_vma(vma); | 486 | g2d_userptr->vma = exynos_gem_get_vma(vma); |
484 | if (!g2d_userptr->vma) { | 487 | if (!g2d_userptr->vma) { |
488 | up_read(¤t->mm->mmap_sem); | ||
485 | DRM_ERROR("failed to copy vma.\n"); | 489 | DRM_ERROR("failed to copy vma.\n"); |
486 | ret = -ENOMEM; | 490 | ret = -ENOMEM; |
487 | goto err_free_pages; | 491 | goto err_free_pages; |
@@ -492,10 +496,12 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, | |||
492 | ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK, | 496 | ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK, |
493 | npages, pages, vma); | 497 | npages, pages, vma); |
494 | if (ret < 0) { | 498 | if (ret < 0) { |
499 | up_read(¤t->mm->mmap_sem); | ||
495 | DRM_ERROR("failed to get user pages from userptr.\n"); | 500 | DRM_ERROR("failed to get user pages from userptr.\n"); |
496 | goto err_put_vma; | 501 | goto err_put_vma; |
497 | } | 502 | } |
498 | 503 | ||
504 | up_read(¤t->mm->mmap_sem); | ||
499 | g2d_userptr->pages = pages; | 505 | g2d_userptr->pages = pages; |
500 | 506 | ||
501 | sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); | 507 | sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); |
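Note: the exynos hunk above is purely a locking fix: find_vma() and any use of the vma it returns are only safe while mmap_sem is held for reading, so the lookup is now bracketed by down_read()/up_read(), with the lock dropped on every error path. A minimal sketch of the pattern being enforced (the example_* name is hypothetical):

	static int example_lookup_user_range(unsigned long userptr, size_t size)
	{
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, userptr);
		if (!vma || vma->vm_end < userptr + size) {
			up_read(&current->mm->mmap_sem);	/* drop on error paths too */
			return -EFAULT;
		}
		/* ... copy the vma / pin its pages while the lock is held ... */
		up_read(&current->mm->mmap_sem);
		return 0;
	}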
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f256ffc02e29..6bb32773c3ac 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/dmi.h> | 39 | #include <linux/dmi.h> |
40 | #include <linux/pci-ats.h> | 40 | #include <linux/pci-ats.h> |
41 | #include <linux/memblock.h> | 41 | #include <linux/memblock.h> |
42 | #include <linux/dma-contiguous.h> | ||
42 | #include <asm/irq_remapping.h> | 43 | #include <asm/irq_remapping.h> |
43 | #include <asm/cacheflush.h> | 44 | #include <asm/cacheflush.h> |
44 | #include <asm/iommu.h> | 45 | #include <asm/iommu.h> |
@@ -3193,7 +3194,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, | |||
3193 | dma_addr_t *dma_handle, gfp_t flags, | 3194 | dma_addr_t *dma_handle, gfp_t flags, |
3194 | struct dma_attrs *attrs) | 3195 | struct dma_attrs *attrs) |
3195 | { | 3196 | { |
3196 | void *vaddr; | 3197 | struct page *page = NULL; |
3197 | int order; | 3198 | int order; |
3198 | 3199 | ||
3199 | size = PAGE_ALIGN(size); | 3200 | size = PAGE_ALIGN(size); |
@@ -3208,17 +3209,31 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, | |||
3208 | flags |= GFP_DMA32; | 3209 | flags |= GFP_DMA32; |
3209 | } | 3210 | } |
3210 | 3211 | ||
3211 | vaddr = (void *)__get_free_pages(flags, order); | 3212 | if (flags & __GFP_WAIT) { |
3212 | if (!vaddr) | 3213 | unsigned int count = size >> PAGE_SHIFT; |
3214 | |||
3215 | page = dma_alloc_from_contiguous(dev, count, order); | ||
3216 | if (page && iommu_no_mapping(dev) && | ||
3217 | page_to_phys(page) + size > dev->coherent_dma_mask) { | ||
3218 | dma_release_from_contiguous(dev, page, count); | ||
3219 | page = NULL; | ||
3220 | } | ||
3221 | } | ||
3222 | |||
3223 | if (!page) | ||
3224 | page = alloc_pages(flags, order); | ||
3225 | if (!page) | ||
3213 | return NULL; | 3226 | return NULL; |
3214 | memset(vaddr, 0, size); | 3227 | memset(page_address(page), 0, size); |
3215 | 3228 | ||
3216 | *dma_handle = __intel_map_single(dev, virt_to_bus(vaddr), size, | 3229 | *dma_handle = __intel_map_single(dev, page_to_phys(page), size, |
3217 | DMA_BIDIRECTIONAL, | 3230 | DMA_BIDIRECTIONAL, |
3218 | dev->coherent_dma_mask); | 3231 | dev->coherent_dma_mask); |
3219 | if (*dma_handle) | 3232 | if (*dma_handle) |
3220 | return vaddr; | 3233 | return page_address(page); |
3221 | free_pages((unsigned long)vaddr, order); | 3234 | if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) |
3235 | __free_pages(page, order); | ||
3236 | |||
3222 | return NULL; | 3237 | return NULL; |
3223 | } | 3238 | } |
3224 | 3239 | ||
@@ -3226,12 +3241,14 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, | |||
3226 | dma_addr_t dma_handle, struct dma_attrs *attrs) | 3241 | dma_addr_t dma_handle, struct dma_attrs *attrs) |
3227 | { | 3242 | { |
3228 | int order; | 3243 | int order; |
3244 | struct page *page = virt_to_page(vaddr); | ||
3229 | 3245 | ||
3230 | size = PAGE_ALIGN(size); | 3246 | size = PAGE_ALIGN(size); |
3231 | order = get_order(size); | 3247 | order = get_order(size); |
3232 | 3248 | ||
3233 | intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); | 3249 | intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); |
3234 | free_pages((unsigned long)vaddr, order); | 3250 | if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) |
3251 | __free_pages(page, order); | ||
3235 | } | 3252 | } |
3236 | 3253 | ||
3237 | static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, | 3254 | static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, |
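Note: intel_alloc_coherent() now tries the contiguous memory allocator (CMA) first for sleepable allocations and falls back to the buddy allocator, and the free side mirrors that with dma_release_from_contiguous() falling back to __free_pages(). A condensed sketch of the pairing, assuming the 3.16-era helpers from <linux/dma-contiguous.h> and a page-aligned size (example_* names are hypothetical):

	static struct page *example_alloc(struct device *dev, size_t size, gfp_t flags)
	{
		unsigned int order = get_order(size);
		struct page *page = NULL;

		if (flags & __GFP_WAIT)		/* CMA allocations may sleep */
			page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, order);
		if (!page)
			page = alloc_pages(flags, order);	/* buddy fallback */
		return page;
	}

	static void example_free(struct device *dev, struct page *page, size_t size)
	{
		/* returns false when the pages did not come from the CMA area */
		if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
			__free_pages(page, get_order(size));
	}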
diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index 43926cd25ae8..5066a7ef7b6c 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c | |||
@@ -473,7 +473,7 @@ static struct nubus_dev* __init | |||
473 | if (slot == 0 && (unsigned long)dir.base % 2) | 473 | if (slot == 0 && (unsigned long)dir.base % 2) |
474 | dir.base += 1; | 474 | dir.base += 1; |
475 | 475 | ||
476 | if (console_loglevel >= 10) | 476 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
477 | printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", | 477 | printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", |
478 | parent->base, dir.base); | 478 | parent->base, dir.base); |
479 | 479 | ||
@@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board, | |||
568 | 568 | ||
569 | printk(KERN_INFO " video modes supported:\n"); | 569 | printk(KERN_INFO " video modes supported:\n"); |
570 | nubus_get_subdir(parent, &dir); | 570 | nubus_get_subdir(parent, &dir); |
571 | if (console_loglevel >= 10) | 571 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
572 | printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", | 572 | printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", |
573 | parent->base, dir.base); | 573 | parent->base, dir.base); |
574 | 574 | ||
@@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board, | |||
629 | 629 | ||
630 | printk(KERN_INFO " vendor info:\n"); | 630 | printk(KERN_INFO " vendor info:\n"); |
631 | nubus_get_subdir(parent, &dir); | 631 | nubus_get_subdir(parent, &dir); |
632 | if (console_loglevel >= 10) | 632 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
633 | printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", | 633 | printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", |
634 | parent->base, dir.base); | 634 | parent->base, dir.base); |
635 | 635 | ||
@@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot, | |||
654 | struct nubus_dirent ent; | 654 | struct nubus_dirent ent; |
655 | 655 | ||
656 | nubus_get_subdir(parent, &dir); | 656 | nubus_get_subdir(parent, &dir); |
657 | if (console_loglevel >= 10) | 657 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
658 | printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", | 658 | printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", |
659 | parent->base, dir.base); | 659 | parent->base, dir.base); |
660 | 660 | ||
@@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) | |||
753 | if (nubus_readdir(&dir, &ent) == -1) | 753 | if (nubus_readdir(&dir, &ent) == -1) |
754 | goto badrom; | 754 | goto badrom; |
755 | 755 | ||
756 | if (console_loglevel >= 10) | 756 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
757 | printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); | 757 | printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); |
758 | /* This one takes us to where we want to go. */ | 758 | /* This one takes us to where we want to go. */ |
759 | if (nubus_readdir(&dir, &ent) == -1) | 759 | if (nubus_readdir(&dir, &ent) == -1) |
760 | goto badrom; | 760 | goto badrom; |
761 | if (console_loglevel >= 10) | 761 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
762 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); | 762 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); |
763 | nubus_get_subdir(&ent, &dir); | 763 | nubus_get_subdir(&ent, &dir); |
764 | 764 | ||
765 | /* Resource ID 01, also an "Unknown Macintosh" */ | 765 | /* Resource ID 01, also an "Unknown Macintosh" */ |
766 | if (nubus_readdir(&dir, &ent) == -1) | 766 | if (nubus_readdir(&dir, &ent) == -1) |
767 | goto badrom; | 767 | goto badrom; |
768 | if (console_loglevel >= 10) | 768 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
769 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); | 769 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); |
770 | 770 | ||
771 | /* FIXME: the first one is *not* always the right one. We | 771 | /* FIXME: the first one is *not* always the right one. We |
@@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) | |||
780 | path to that address... */ | 780 | path to that address... */ |
781 | if (nubus_readdir(&dir, &ent) == -1) | 781 | if (nubus_readdir(&dir, &ent) == -1) |
782 | goto badrom; | 782 | goto badrom; |
783 | if (console_loglevel >= 10) | 783 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) |
784 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); | 784 | printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); |
785 | 785 | ||
786 | /* Bwahahahaha... */ | 786 | /* Bwahahahaha... */ |
@@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes) | |||
816 | board->fblock = rp; | 816 | board->fblock = rp; |
817 | 817 | ||
818 | /* Dump the format block for debugging purposes */ | 818 | /* Dump the format block for debugging purposes */ |
819 | if (console_loglevel >= 10) { | 819 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) { |
820 | int i; | 820 | int i; |
821 | printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", | 821 | printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", |
822 | slot, rp); | 822 | slot, rp); |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ecdf412..b767a64e49d9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
@@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key) | |||
88 | int i; | 88 | int i; |
89 | 89 | ||
90 | i = key - '0'; | 90 | i = key - '0'; |
91 | console_loglevel = 7; | 91 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; |
92 | printk("Loglevel set to %d\n", i); | 92 | printk("Loglevel set to %d\n", i); |
93 | console_loglevel = i; | 93 | console_loglevel = i; |
94 | } | 94 | } |
@@ -343,7 +343,7 @@ static void send_sig_all(int sig) | |||
343 | static void sysrq_handle_term(int key) | 343 | static void sysrq_handle_term(int key) |
344 | { | 344 | { |
345 | send_sig_all(SIGTERM); | 345 | send_sig_all(SIGTERM); |
346 | console_loglevel = 8; | 346 | console_loglevel = CONSOLE_LOGLEVEL_DEBUG; |
347 | } | 347 | } |
348 | static struct sysrq_key_op sysrq_term_op = { | 348 | static struct sysrq_key_op sysrq_term_op = { |
349 | .handler = sysrq_handle_term, | 349 | .handler = sysrq_handle_term, |
@@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = { | |||
387 | static void sysrq_handle_kill(int key) | 387 | static void sysrq_handle_kill(int key) |
388 | { | 388 | { |
389 | send_sig_all(SIGKILL); | 389 | send_sig_all(SIGKILL); |
390 | console_loglevel = 8; | 390 | console_loglevel = CONSOLE_LOGLEVEL_DEBUG; |
391 | } | 391 | } |
392 | static struct sysrq_key_op sysrq_kill_op = { | 392 | static struct sysrq_key_op sysrq_kill_op = { |
393 | .handler = sysrq_handle_kill, | 393 | .handler = sysrq_handle_kill, |
@@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask) | |||
520 | * routing in the consumers of /proc/kmsg. | 520 | * routing in the consumers of /proc/kmsg. |
521 | */ | 521 | */ |
522 | orig_log_level = console_loglevel; | 522 | orig_log_level = console_loglevel; |
523 | console_loglevel = 7; | 523 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; |
524 | printk(KERN_INFO "SysRq : "); | 524 | printk(KERN_INFO "SysRq : "); |
525 | 525 | ||
526 | op_p = __sysrq_get_key_op(key); | 526 | op_p = __sysrq_get_key_op(key); |
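Note: the nubus and sysrq hunks above replace magic console loglevel numbers with named constants from <linux/printk.h>. Judging only from the replacements themselves (10 becomes CONSOLE_LOGLEVEL_DEBUG, 7 becomes CONSOLE_LOGLEVEL_DEFAULT), the definitions introduced by this series look roughly like:

	#define CONSOLE_LOGLEVEL_DEFAULT 7	/* anything more serious than KERN_DEBUG */
	#define CONSOLE_LOGLEVEL_DEBUG   10	/* show debug messages on the console */

One behavioural nuance: the sysrq term/kill handlers previously set console_loglevel to 8, so switching them to CONSOLE_LOGLEVEL_DEBUG nudges that level up slightly rather than being a pure rename.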
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 14da82564f4e..6894b085f0ee 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c | |||
@@ -537,7 +537,7 @@ static struct attribute_group v9fs_attr_group = { | |||
537 | * | 537 | * |
538 | */ | 538 | */ |
539 | 539 | ||
540 | static int v9fs_sysfs_init(void) | 540 | static int __init v9fs_sysfs_init(void) |
541 | { | 541 | { |
542 | v9fs_kobj = kobject_create_and_add("9p", fs_kobj); | 542 | v9fs_kobj = kobject_create_and_add("9p", fs_kobj); |
543 | if (!v9fs_kobj) | 543 | if (!v9fs_kobj) |
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4d0c2e0be7e5..0b3bfa303dda 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c | |||
@@ -42,7 +42,6 @@ | |||
42 | 42 | ||
43 | /** | 43 | /** |
44 | * struct p9_rdir - readdir accounting | 44 | * struct p9_rdir - readdir accounting |
45 | * @mutex: mutex protecting readdir | ||
46 | * @head: start offset of current dirread buffer | 45 | * @head: start offset of current dirread buffer |
47 | * @tail: end offset of current dirread buffer | 46 | * @tail: end offset of current dirread buffer |
48 | * @buf: dirread buffer | 47 | * @buf: dirread buffer |
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 59e3fe3d56c0..96e550760699 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c | |||
@@ -681,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count, | |||
681 | /** | 681 | /** |
682 | * v9fs_cached_file_read - read from a file | 682 | * v9fs_cached_file_read - read from a file |
683 | * @filp: file pointer to read | 683 | * @filp: file pointer to read |
684 | * @udata: user data buffer to read data into | 684 | * @data: user data buffer to read data into |
685 | * @count: size of buffer | 685 | * @count: size of buffer |
686 | * @offset: offset at which to read data | 686 | * @offset: offset at which to read data |
687 | * | 687 | * |
@@ -698,7 +698,7 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, | |||
698 | /** | 698 | /** |
699 | * v9fs_mmap_file_read - read from a file | 699 | * v9fs_mmap_file_read - read from a file |
700 | * @filp: file pointer to read | 700 | * @filp: file pointer to read |
701 | * @udata: user data buffer to read data into | 701 | * @data: user data buffer to read data into |
702 | * @count: size of buffer | 702 | * @count: size of buffer |
703 | * @offset: offset at which to read data | 703 | * @offset: offset at which to read data |
704 | * | 704 | * |
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 53161ec058a7..00d140fb2263 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c | |||
@@ -580,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags) | |||
580 | * v9fs_remove - helper function to remove files and directories | 580 | * v9fs_remove - helper function to remove files and directories |
581 | * @dir: directory inode that is being deleted | 581 | * @dir: directory inode that is being deleted |
582 | * @dentry: dentry that is being deleted | 582 | * @dentry: dentry that is being deleted |
583 | * @rmdir: removing a directory | 583 | * @flags: removing a directory |
584 | * | 584 | * |
585 | */ | 585 | */ |
586 | 586 | ||
@@ -778,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode | |||
778 | * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode | 778 | * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode |
779 | * @dir: inode that is being walked from | 779 | * @dir: inode that is being walked from |
780 | * @dentry: dentry that is being walked to? | 780 | * @dentry: dentry that is being walked to? |
781 | * @nameidata: path data | 781 | * @flags: lookup flags (unused) |
782 | * | 782 | * |
783 | */ | 783 | */ |
784 | 784 | ||
@@ -1324,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) | |||
1324 | * v9fs_vfs_mkspecial - create a special file | 1324 | * v9fs_vfs_mkspecial - create a special file |
1325 | * @dir: inode to create special file in | 1325 | * @dir: inode to create special file in |
1326 | * @dentry: dentry to create | 1326 | * @dentry: dentry to create |
1327 | * @mode: mode to create special file | 1327 | * @perm: mode to create special file |
1328 | * @extension: 9p2000.u format extension string representing special file | 1328 | * @extension: 9p2000.u format extension string representing special file |
1329 | * | 1329 | * |
1330 | */ | 1330 | */ |
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 59dc8e87647f..1fa85aae24df 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c | |||
@@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags) | |||
226 | * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. | 226 | * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. |
227 | * @dir: directory inode that is being created | 227 | * @dir: directory inode that is being created |
228 | * @dentry: dentry that is being deleted | 228 | * @dentry: dentry that is being deleted |
229 | * @mode: create permissions | 229 | * @omode: create permissions |
230 | * | 230 | * |
231 | */ | 231 | */ |
232 | 232 | ||
@@ -375,7 +375,7 @@ err_clunk_old_fid: | |||
375 | * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory | 375 | * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory |
376 | * @dir: inode that is being unlinked | 376 | * @dir: inode that is being unlinked |
377 | * @dentry: dentry that is being unlinked | 377 | * @dentry: dentry that is being unlinked |
378 | * @mode: mode for new directory | 378 | * @omode: mode for new directory |
379 | * | 379 | * |
380 | */ | 380 | */ |
381 | 381 | ||
@@ -607,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) | |||
607 | * v9fs_stat2inode_dotl - populate an inode structure with stat info | 607 | * v9fs_stat2inode_dotl - populate an inode structure with stat info |
608 | * @stat: stat structure | 608 | * @stat: stat structure |
609 | * @inode: inode to populate | 609 | * @inode: inode to populate |
610 | * @sb: superblock of filesystem | ||
611 | * | 610 | * |
612 | */ | 611 | */ |
613 | 612 | ||
@@ -808,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, | |||
808 | * v9fs_vfs_mknod_dotl - create a special file | 807 | * v9fs_vfs_mknod_dotl - create a special file |
809 | * @dir: inode destination for new link | 808 | * @dir: inode destination for new link |
810 | * @dentry: dentry for file | 809 | * @dentry: dentry for file |
811 | * @mode: mode for creation | 810 | * @omode: mode for creation |
812 | * @rdev: device associated with special file | 811 | * @rdev: device associated with special file |
813 | * | 812 | * |
814 | */ | 813 | */ |
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 232e03d4780d..5b570b6efa28 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c | |||
@@ -737,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); | |||
737 | MODULE_ALIAS("devname:autofs"); | 737 | MODULE_ALIAS("devname:autofs"); |
738 | 738 | ||
739 | /* Register/deregister misc character device */ | 739 | /* Register/deregister misc character device */ |
740 | int autofs_dev_ioctl_init(void) | 740 | int __init autofs_dev_ioctl_init(void) |
741 | { | 741 | { |
742 | int r; | 742 | int r; |
743 | 743 | ||
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index aa3cb626671e..dabc73ab900f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -1686,7 +1686,7 @@ static size_t get_note_info_size(struct elf_note_info *info) | |||
1686 | static int write_note_info(struct elf_note_info *info, | 1686 | static int write_note_info(struct elf_note_info *info, |
1687 | struct coredump_params *cprm) | 1687 | struct coredump_params *cprm) |
1688 | { | 1688 | { |
1689 | bool first = 1; | 1689 | bool first = true; |
1690 | struct elf_thread_core_info *t = info->thread; | 1690 | struct elf_thread_core_info *t = info->thread; |
1691 | 1691 | ||
1692 | do { | 1692 | do { |
@@ -1710,7 +1710,7 @@ static int write_note_info(struct elf_note_info *info, | |||
1710 | !writenote(&t->notes[i], cprm)) | 1710 | !writenote(&t->notes[i], cprm)) |
1711 | return 0; | 1711 | return 0; |
1712 | 1712 | ||
1713 | first = 0; | 1713 | first = false; |
1714 | t = t->next; | 1714 | t = t->next; |
1715 | } while (t); | 1715 | } while (t); |
1716 | 1716 | ||
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index d50bbe59da1e..f723cd3a455c 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c | |||
@@ -380,7 +380,7 @@ failed: | |||
380 | 380 | ||
381 | /****************************************************************************/ | 381 | /****************************************************************************/ |
382 | 382 | ||
383 | void old_reloc(unsigned long rl) | 383 | static void old_reloc(unsigned long rl) |
384 | { | 384 | { |
385 | #ifdef DEBUG | 385 | #ifdef DEBUG |
386 | char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; | 386 | char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 552a8d13bc32..83fba15cc394 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -363,6 +363,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) | |||
363 | } | 363 | } |
364 | EXPORT_SYMBOL(blkdev_fsync); | 364 | EXPORT_SYMBOL(blkdev_fsync); |
365 | 365 | ||
366 | /** | ||
367 | * bdev_read_page() - Start reading a page from a block device | ||
368 | * @bdev: The device to read the page from | ||
369 | * @sector: The offset on the device to read the page to (need not be aligned) | ||
370 | * @page: The page to read | ||
371 | * | ||
372 | * On entry, the page should be locked. It will be unlocked when the page | ||
373 | * has been read. If the block driver implements rw_page synchronously, | ||
374 | * that will be true on exit from this function, but it need not be. | ||
375 | * | ||
376 | * Errors returned by this function are usually "soft", eg out of memory, or | ||
377 | * queue full; callers should try a different route to read this page rather | ||
378 | * than propagate an error back up the stack. | ||
379 | * | ||
380 | * Return: negative errno if an error occurs, 0 if submission was successful. | ||
381 | */ | ||
382 | int bdev_read_page(struct block_device *bdev, sector_t sector, | ||
383 | struct page *page) | ||
384 | { | ||
385 | const struct block_device_operations *ops = bdev->bd_disk->fops; | ||
386 | if (!ops->rw_page) | ||
387 | return -EOPNOTSUPP; | ||
388 | return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); | ||
389 | } | ||
390 | EXPORT_SYMBOL_GPL(bdev_read_page); | ||
391 | |||
392 | /** | ||
393 | * bdev_write_page() - Start writing a page to a block device | ||
394 | * @bdev: The device to write the page to | ||
395 | * @sector: The offset on the device to write the page to (need not be aligned) | ||
396 | * @page: The page to write | ||
397 | * @wbc: The writeback_control for the write | ||
398 | * | ||
399 | * On entry, the page should be locked and not currently under writeback. | ||
400 | * On exit, if the write started successfully, the page will be unlocked and | ||
401 | * under writeback. If the write failed already (eg the driver failed to | ||
402 | * queue the page to the device), the page will still be locked. If the | ||
403 | * caller is a ->writepage implementation, it will need to unlock the page. | ||
404 | * | ||
405 | * Errors returned by this function are usually "soft", eg out of memory, or | ||
406 | * queue full; callers should try a different route to write this page rather | ||
407 | * than propagate an error back up the stack. | ||
408 | * | ||
409 | * Return: negative errno if an error occurs, 0 if submission was successful. | ||
410 | */ | ||
411 | int bdev_write_page(struct block_device *bdev, sector_t sector, | ||
412 | struct page *page, struct writeback_control *wbc) | ||
413 | { | ||
414 | int result; | ||
415 | int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; | ||
416 | const struct block_device_operations *ops = bdev->bd_disk->fops; | ||
417 | if (!ops->rw_page) | ||
418 | return -EOPNOTSUPP; | ||
419 | set_page_writeback(page); | ||
420 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); | ||
421 | if (result) | ||
422 | end_page_writeback(page); | ||
423 | else | ||
424 | unlock_page(page); | ||
425 | return result; | ||
426 | } | ||
427 | EXPORT_SYMBOL_GPL(bdev_write_page); | ||
428 | |||
366 | /* | 429 | /* |
367 | * pseudo-fs | 430 | * pseudo-fs |
368 | */ | 431 | */ |
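Note: the kerneldoc added above spells out the contract for the new helpers, including that errors are "soft" and callers should fall back to another route. A hypothetical caller following that contract might look like this (the example_* names are placeholders):

	/* Try the synchronous ->rw_page route first; on any error, including
	 * -EOPNOTSUPP when the driver has no ->rw_page, fall back to the bio path.
	 */
	static int example_read_one_page(struct block_device *bdev, sector_t sector,
					 struct page *page)
	{
		if (bdev_read_page(bdev, sector, page) == 0)
			return 0;	/* page is unlocked when the read completes */

		return example_submit_read_bio(bdev, sector, page);	/* hypothetical */
	}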
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f29a54e454d4..4cd0ac983f91 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -4510,7 +4510,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) | |||
4510 | spin_unlock(&eb->refs_lock); | 4510 | spin_unlock(&eb->refs_lock); |
4511 | } | 4511 | } |
4512 | 4512 | ||
4513 | static void mark_extent_buffer_accessed(struct extent_buffer *eb) | 4513 | static void mark_extent_buffer_accessed(struct extent_buffer *eb, |
4514 | struct page *accessed) | ||
4514 | { | 4515 | { |
4515 | unsigned long num_pages, i; | 4516 | unsigned long num_pages, i; |
4516 | 4517 | ||
@@ -4519,7 +4520,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) | |||
4519 | num_pages = num_extent_pages(eb->start, eb->len); | 4520 | num_pages = num_extent_pages(eb->start, eb->len); |
4520 | for (i = 0; i < num_pages; i++) { | 4521 | for (i = 0; i < num_pages; i++) { |
4521 | struct page *p = extent_buffer_page(eb, i); | 4522 | struct page *p = extent_buffer_page(eb, i); |
4522 | mark_page_accessed(p); | 4523 | if (p != accessed) |
4524 | mark_page_accessed(p); | ||
4523 | } | 4525 | } |
4524 | } | 4526 | } |
4525 | 4527 | ||
@@ -4533,7 +4535,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, | |||
4533 | start >> PAGE_CACHE_SHIFT); | 4535 | start >> PAGE_CACHE_SHIFT); |
4534 | if (eb && atomic_inc_not_zero(&eb->refs)) { | 4536 | if (eb && atomic_inc_not_zero(&eb->refs)) { |
4535 | rcu_read_unlock(); | 4537 | rcu_read_unlock(); |
4536 | mark_extent_buffer_accessed(eb); | 4538 | mark_extent_buffer_accessed(eb, NULL); |
4537 | return eb; | 4539 | return eb; |
4538 | } | 4540 | } |
4539 | rcu_read_unlock(); | 4541 | rcu_read_unlock(); |
@@ -4581,7 +4583,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, | |||
4581 | spin_unlock(&mapping->private_lock); | 4583 | spin_unlock(&mapping->private_lock); |
4582 | unlock_page(p); | 4584 | unlock_page(p); |
4583 | page_cache_release(p); | 4585 | page_cache_release(p); |
4584 | mark_extent_buffer_accessed(exists); | 4586 | mark_extent_buffer_accessed(exists, p); |
4585 | goto free_eb; | 4587 | goto free_eb; |
4586 | } | 4588 | } |
4587 | 4589 | ||
@@ -4596,7 +4598,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, | |||
4596 | attach_extent_buffer_page(eb, p); | 4598 | attach_extent_buffer_page(eb, p); |
4597 | spin_unlock(&mapping->private_lock); | 4599 | spin_unlock(&mapping->private_lock); |
4598 | WARN_ON(PageDirty(p)); | 4600 | WARN_ON(PageDirty(p)); |
4599 | mark_page_accessed(p); | ||
4600 | eb->pages[i] = p; | 4601 | eb->pages[i] = p; |
4601 | if (!PageUptodate(p)) | 4602 | if (!PageUptodate(p)) |
4602 | uptodate = 0; | 4603 | uptodate = 0; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ae6af072b635..74272a3f9d9b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -470,11 +470,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) | |||
470 | for (i = 0; i < num_pages; i++) { | 470 | for (i = 0; i < num_pages; i++) { |
471 | /* page checked is some magic around finding pages that | 471 | /* page checked is some magic around finding pages that |
472 | * have been modified without going through btrfs_set_page_dirty | 472 | * have been modified without going through btrfs_set_page_dirty |
473 | * clear it here | 473 | * clear it here. There should be no need to mark the pages |
474 | * accessed as prepare_pages should have marked them accessed | ||
475 | * in prepare_pages via find_or_create_page() | ||
474 | */ | 476 | */ |
475 | ClearPageChecked(pages[i]); | 477 | ClearPageChecked(pages[i]); |
476 | unlock_page(pages[i]); | 478 | unlock_page(pages[i]); |
477 | mark_page_accessed(pages[i]); | ||
478 | page_cache_release(pages[i]); | 479 | page_cache_release(pages[i]); |
479 | } | 480 | } |
480 | } | 481 | } |
diff --git a/fs/buffer.c b/fs/buffer.c index 6a8110c03a47..eba6e4f621ce 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) | |||
227 | int all_mapped = 1; | 227 | int all_mapped = 1; |
228 | 228 | ||
229 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); | 229 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); |
230 | page = find_get_page(bd_mapping, index); | 230 | page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); |
231 | if (!page) | 231 | if (!page) |
232 | goto out; | 232 | goto out; |
233 | 233 | ||
@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) | |||
1366 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); | 1366 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); |
1367 | 1367 | ||
1368 | if (bh == NULL) { | 1368 | if (bh == NULL) { |
1369 | /* __find_get_block_slow will mark the page accessed */ | ||
1369 | bh = __find_get_block_slow(bdev, block); | 1370 | bh = __find_get_block_slow(bdev, block); |
1370 | if (bh) | 1371 | if (bh) |
1371 | bh_lru_install(bh); | 1372 | bh_lru_install(bh); |
1372 | } | 1373 | } else |
1373 | if (bh) | ||
1374 | touch_buffer(bh); | 1374 | touch_buffer(bh); |
1375 | |||
1375 | return bh; | 1376 | return bh; |
1376 | } | 1377 | } |
1377 | EXPORT_SYMBOL(__find_get_block); | 1378 | EXPORT_SYMBOL(__find_get_block); |
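Note: with this change, __find_get_block_slow() asks the page cache lookup itself to mark the page accessed, and __find_get_block() only calls touch_buffer() on the LRU-hit path, so a buffer is no longer aged twice per lookup. The find_get_page_flags() helper is added elsewhere in this series; its assumed semantics here are approximately:

	/* Assumed behaviour of the lookup used above:
	 *
	 *	page = find_get_page(mapping, index);
	 *	if (page)
	 *		mark_page_accessed(page);
	 *
	 * i.e. FGP_ACCESSED folds the accessed-marking into the lookup so callers
	 * do not have to repeat it.
	 */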
@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page); | |||
1483 | /* | 1484 | /* |
1484 | * Called when truncating a buffer on a page completely. | 1485 | * Called when truncating a buffer on a page completely. |
1485 | */ | 1486 | */ |
1487 | |||
1488 | /* Bits that are cleared during an invalidate */ | ||
1489 | #define BUFFER_FLAGS_DISCARD \ | ||
1490 | (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ | ||
1491 | 1 << BH_Delay | 1 << BH_Unwritten) | ||
1492 | |||
1486 | static void discard_buffer(struct buffer_head * bh) | 1493 | static void discard_buffer(struct buffer_head * bh) |
1487 | { | 1494 | { |
1495 | unsigned long b_state, b_state_old; | ||
1496 | |||
1488 | lock_buffer(bh); | 1497 | lock_buffer(bh); |
1489 | clear_buffer_dirty(bh); | 1498 | clear_buffer_dirty(bh); |
1490 | bh->b_bdev = NULL; | 1499 | bh->b_bdev = NULL; |
1491 | clear_buffer_mapped(bh); | 1500 | b_state = bh->b_state; |
1492 | clear_buffer_req(bh); | 1501 | for (;;) { |
1493 | clear_buffer_new(bh); | 1502 | b_state_old = cmpxchg(&bh->b_state, b_state, |
1494 | clear_buffer_delay(bh); | 1503 | (b_state & ~BUFFER_FLAGS_DISCARD)); |
1495 | clear_buffer_unwritten(bh); | 1504 | if (b_state_old == b_state) |
1505 | break; | ||
1506 | b_state = b_state_old; | ||
1507 | } | ||
1496 | unlock_buffer(bh); | 1508 | unlock_buffer(bh); |
1497 | } | 1509 | } |
1498 | 1510 | ||
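Note: discard_buffer() now clears all of the relevant state bits in a single atomic read-modify-write instead of five separate clear_buffer_*() calls; the cmpxchg() retry loop preserves any unrelated bits another CPU may flip in the meantime. Absent concurrency, the loop reduces to:

	bh->b_state &= ~BUFFER_FLAGS_DISCARD;	/* illustration only: not atomic */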
@@ -2879,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page); | |||
2879 | 2891 | ||
2880 | /* | 2892 | /* |
2881 | * The generic ->writepage function for buffer-backed address_spaces | 2893 | * The generic ->writepage function for buffer-backed address_spaces |
2882 | * this form passes in the end_io handler used to finish the IO. | ||
2883 | */ | 2894 | */ |
2884 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, | 2895 | int block_write_full_page(struct page *page, get_block_t *get_block, |
2885 | struct writeback_control *wbc, bh_end_io_t *handler) | 2896 | struct writeback_control *wbc) |
2886 | { | 2897 | { |
2887 | struct inode * const inode = page->mapping->host; | 2898 | struct inode * const inode = page->mapping->host; |
2888 | loff_t i_size = i_size_read(inode); | 2899 | loff_t i_size = i_size_read(inode); |
@@ -2892,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, | |||
2892 | /* Is the page fully inside i_size? */ | 2903 | /* Is the page fully inside i_size? */ |
2893 | if (page->index < end_index) | 2904 | if (page->index < end_index) |
2894 | return __block_write_full_page(inode, page, get_block, wbc, | 2905 | return __block_write_full_page(inode, page, get_block, wbc, |
2895 | handler); | 2906 | end_buffer_async_write); |
2896 | 2907 | ||
2897 | /* Is the page fully outside i_size? (truncate in progress) */ | 2908 | /* Is the page fully outside i_size? (truncate in progress) */ |
2898 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2909 | offset = i_size & (PAGE_CACHE_SIZE-1); |
@@ -2915,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, | |||
2915 | * writes to that region are not written out to the file." | 2926 | * writes to that region are not written out to the file." |
2916 | */ | 2927 | */ |
2917 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2928 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2918 | return __block_write_full_page(inode, page, get_block, wbc, handler); | 2929 | return __block_write_full_page(inode, page, get_block, wbc, |
2919 | } | 2930 | end_buffer_async_write); |
2920 | EXPORT_SYMBOL(block_write_full_page_endio); | ||
2921 | |||
2922 | /* | ||
2923 | * The generic ->writepage function for buffer-backed address_spaces | ||
2924 | */ | ||
2925 | int block_write_full_page(struct page *page, get_block_t *get_block, | ||
2926 | struct writeback_control *wbc) | ||
2927 | { | ||
2928 | return block_write_full_page_endio(page, get_block, wbc, | ||
2929 | end_buffer_async_write); | ||
2930 | } | 2931 | } |
2931 | EXPORT_SYMBOL(block_write_full_page); | 2932 | EXPORT_SYMBOL(block_write_full_page); |
2932 | 2933 | ||
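Note: with block_write_full_page_endio() folded away, block_write_full_page() always completes through end_buffer_async_write(), and existing callers keep working unchanged. The common ->writepage pattern stays the usual one-liner, e.g. (a filesystem-specific get_block callback shown purely for illustration):

	static int example_writepage(struct page *page, struct writeback_control *wbc)
	{
		return block_write_full_page(page, ext2_get_block, wbc);
	}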
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b5f0a3b91f18..bd4a3c167091 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h | |||
@@ -24,6 +24,12 @@ | |||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | 24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #ifdef pr_fmt | ||
28 | #undef pr_fmt | ||
29 | #endif | ||
30 | |||
31 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
32 | |||
27 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
28 | #include <linux/list.h> | 34 | #include <linux/list.h> |
29 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
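Note: defining pr_fmt() here (with the defensive #undef) makes every pr_err()/pr_info()/pr_debug() in configfs pick up a KBUILD_MODNAME prefix automatically, which is why the literal "configfs: " strings are dropped from the messages in the hunks that follow. Sketch of the effect:

	pr_err("Tried to unregister non-subsystem!\n");
	/* expands to approximately
	 *	printk(KERN_ERR KBUILD_MODNAME ": " "Tried to unregister non-subsystem!\n")
	 * so the log line still reads "configfs: Tried to unregister non-subsystem!"
	 */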
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index e081acbac2e7..668dcabc5695 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c | |||
@@ -940,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item, | |||
940 | #ifdef DEBUG | 940 | #ifdef DEBUG |
941 | static void configfs_dump_one(struct configfs_dirent *sd, int level) | 941 | static void configfs_dump_one(struct configfs_dirent *sd, int level) |
942 | { | 942 | { |
943 | printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); | 943 | pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); |
944 | 944 | ||
945 | #define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); | 945 | #define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); |
946 | type_print(CONFIGFS_ROOT); | 946 | type_print(CONFIGFS_ROOT); |
947 | type_print(CONFIGFS_DIR); | 947 | type_print(CONFIGFS_DIR); |
948 | type_print(CONFIGFS_ITEM_ATTR); | 948 | type_print(CONFIGFS_ITEM_ATTR); |
@@ -1699,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) | |||
1699 | struct dentry *root = dentry->d_sb->s_root; | 1699 | struct dentry *root = dentry->d_sb->s_root; |
1700 | 1700 | ||
1701 | if (dentry->d_parent != root) { | 1701 | if (dentry->d_parent != root) { |
1702 | printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); | 1702 | pr_err("Tried to unregister non-subsystem!\n"); |
1703 | return; | 1703 | return; |
1704 | } | 1704 | } |
1705 | 1705 | ||
@@ -1709,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) | |||
1709 | mutex_lock(&configfs_symlink_mutex); | 1709 | mutex_lock(&configfs_symlink_mutex); |
1710 | spin_lock(&configfs_dirent_lock); | 1710 | spin_lock(&configfs_dirent_lock); |
1711 | if (configfs_detach_prep(dentry, NULL)) { | 1711 | if (configfs_detach_prep(dentry, NULL)) { |
1712 | printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); | 1712 | pr_err("Tried to unregister non-empty subsystem!\n"); |
1713 | } | 1713 | } |
1714 | spin_unlock(&configfs_dirent_lock); | 1714 | spin_unlock(&configfs_dirent_lock); |
1715 | mutex_unlock(&configfs_symlink_mutex); | 1715 | mutex_unlock(&configfs_symlink_mutex); |
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index a9d35b0e06cf..5946ad98053f 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c | |||
@@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, | |||
168 | * In practice the maximum level of locking depth is | 168 | * In practice the maximum level of locking depth is |
169 | * already reached. Just inform about possible reasons. | 169 | * already reached. Just inform about possible reasons. |
170 | */ | 170 | */ |
171 | printk(KERN_INFO "configfs: Too many levels of inodes" | 171 | pr_info("Too many levels of inodes for the locking correctness validator.\n"); |
172 | " for the locking correctness validator.\n"); | 172 | pr_info("Spurious warnings may appear.\n"); |
173 | printk(KERN_INFO "Spurious warnings may appear.\n"); | ||
174 | } | 173 | } |
175 | } | 174 | } |
176 | } | 175 | } |
diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 50cee7f9110b..e65f9ffbb999 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c | |||
@@ -19,7 +19,7 @@ | |||
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | * | 20 | * |
21 | * Based on kobject: | 21 | * Based on kobject: |
22 | * kobject is Copyright (c) 2002-2003 Patrick Mochel | 22 | * kobject is Copyright (c) 2002-2003 Patrick Mochel |
23 | * | 23 | * |
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | 24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. |
25 | * | 25 | * |
@@ -35,9 +35,9 @@ | |||
35 | #include <linux/configfs.h> | 35 | #include <linux/configfs.h> |
36 | 36 | ||
37 | 37 | ||
38 | static inline struct config_item * to_item(struct list_head * entry) | 38 | static inline struct config_item *to_item(struct list_head *entry) |
39 | { | 39 | { |
40 | return container_of(entry,struct config_item,ci_entry); | 40 | return container_of(entry, struct config_item, ci_entry); |
41 | } | 41 | } |
42 | 42 | ||
43 | /* Evil kernel */ | 43 | /* Evil kernel */ |
@@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref); | |||
47 | * config_item_init - initialize item. | 47 | * config_item_init - initialize item. |
48 | * @item: item in question. | 48 | * @item: item in question. |
49 | */ | 49 | */ |
50 | void config_item_init(struct config_item * item) | 50 | void config_item_init(struct config_item *item) |
51 | { | 51 | { |
52 | kref_init(&item->ci_kref); | 52 | kref_init(&item->ci_kref); |
53 | INIT_LIST_HEAD(&item->ci_entry); | 53 | INIT_LIST_HEAD(&item->ci_entry); |
54 | } | 54 | } |
55 | EXPORT_SYMBOL(config_item_init); | ||
55 | 56 | ||
56 | /** | 57 | /** |
57 | * config_item_set_name - Set the name of an item | 58 | * config_item_set_name - Set the name of an item |
58 | * @item: item. | 59 | * @item: item. |
59 | * @name: name. | 60 | * @fmt: The vsnprintf()'s format string. |
60 | * | 61 | * |
61 | * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a | 62 | * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a |
62 | * dynamically allocated string that @item->ci_name points to. | 63 | * dynamically allocated string that @item->ci_name points to. |
63 | * Otherwise, use the static @item->ci_namebuf array. | 64 | * Otherwise, use the static @item->ci_namebuf array. |
64 | */ | 65 | */ |
65 | int config_item_set_name(struct config_item * item, const char * fmt, ...) | 66 | int config_item_set_name(struct config_item *item, const char *fmt, ...) |
66 | { | 67 | { |
67 | int error = 0; | 68 | int error = 0; |
68 | int limit = CONFIGFS_ITEM_NAME_LEN; | 69 | int limit = CONFIGFS_ITEM_NAME_LEN; |
69 | int need; | 70 | int need; |
70 | va_list args; | 71 | va_list args; |
71 | char * name; | 72 | char *name; |
72 | 73 | ||
73 | /* | 74 | /* |
74 | * First, try the static array | 75 | * First, try the static array |
75 | */ | 76 | */ |
76 | va_start(args,fmt); | 77 | va_start(args, fmt); |
77 | need = vsnprintf(item->ci_namebuf,limit,fmt,args); | 78 | need = vsnprintf(item->ci_namebuf, limit, fmt, args); |
78 | va_end(args); | 79 | va_end(args); |
79 | if (need < limit) | 80 | if (need < limit) |
80 | name = item->ci_namebuf; | 81 | name = item->ci_namebuf; |
@@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) | |||
83 | * Need more space? Allocate it and try again | 84 | * Need more space? Allocate it and try again |
84 | */ | 85 | */ |
85 | limit = need + 1; | 86 | limit = need + 1; |
86 | name = kmalloc(limit,GFP_KERNEL); | 87 | name = kmalloc(limit, GFP_KERNEL); |
87 | if (!name) { | 88 | if (!name) { |
88 | error = -ENOMEM; | 89 | error = -ENOMEM; |
89 | goto Done; | 90 | goto Done; |
90 | } | 91 | } |
91 | va_start(args,fmt); | 92 | va_start(args, fmt); |
92 | need = vsnprintf(name,limit,fmt,args); | 93 | need = vsnprintf(name, limit, fmt, args); |
93 | va_end(args); | 94 | va_end(args); |
94 | 95 | ||
95 | /* Still? Give up. */ | 96 | /* Still? Give up. */ |
@@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) | |||
109 | Done: | 110 | Done: |
110 | return error; | 111 | return error; |
111 | } | 112 | } |
112 | |||
113 | EXPORT_SYMBOL(config_item_set_name); | 113 | EXPORT_SYMBOL(config_item_set_name); |
114 | 114 | ||
115 | void config_item_init_type_name(struct config_item *item, | 115 | void config_item_init_type_name(struct config_item *item, |
@@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name, | |||
131 | } | 131 | } |
132 | EXPORT_SYMBOL(config_group_init_type_name); | 132 | EXPORT_SYMBOL(config_group_init_type_name); |
133 | 133 | ||
134 | struct config_item * config_item_get(struct config_item * item) | 134 | struct config_item *config_item_get(struct config_item *item) |
135 | { | 135 | { |
136 | if (item) | 136 | if (item) |
137 | kref_get(&item->ci_kref); | 137 | kref_get(&item->ci_kref); |
138 | return item; | 138 | return item; |
139 | } | 139 | } |
140 | EXPORT_SYMBOL(config_item_get); | ||
140 | 141 | ||
141 | static void config_item_cleanup(struct config_item * item) | 142 | static void config_item_cleanup(struct config_item *item) |
142 | { | 143 | { |
143 | struct config_item_type * t = item->ci_type; | 144 | struct config_item_type *t = item->ci_type; |
144 | struct config_group * s = item->ci_group; | 145 | struct config_group *s = item->ci_group; |
145 | struct config_item * parent = item->ci_parent; | 146 | struct config_item *parent = item->ci_parent; |
146 | 147 | ||
147 | pr_debug("config_item %s: cleaning up\n",config_item_name(item)); | 148 | pr_debug("config_item %s: cleaning up\n", config_item_name(item)); |
148 | if (item->ci_name != item->ci_namebuf) | 149 | if (item->ci_name != item->ci_namebuf) |
149 | kfree(item->ci_name); | 150 | kfree(item->ci_name); |
150 | item->ci_name = NULL; | 151 | item->ci_name = NULL; |
@@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref) | |||
167 | * | 168 | * |
168 | * Decrement the refcount, and if 0, call config_item_cleanup(). | 169 | * Decrement the refcount, and if 0, call config_item_cleanup(). |
169 | */ | 170 | */ |
170 | void config_item_put(struct config_item * item) | 171 | void config_item_put(struct config_item *item) |
171 | { | 172 | { |
172 | if (item) | 173 | if (item) |
173 | kref_put(&item->ci_kref, config_item_release); | 174 | kref_put(&item->ci_kref, config_item_release); |
174 | } | 175 | } |
176 | EXPORT_SYMBOL(config_item_put); | ||
175 | 177 | ||
176 | /** | 178 | /** |
177 | * config_group_init - initialize a group for use | 179 | * config_group_init - initialize a group for use |
178 | * @k: group | 180 | * @group: config_group |
179 | */ | 181 | */ |
180 | void config_group_init(struct config_group *group) | 182 | void config_group_init(struct config_group *group) |
181 | { | 183 | { |
182 | config_item_init(&group->cg_item); | 184 | config_item_init(&group->cg_item); |
183 | INIT_LIST_HEAD(&group->cg_children); | 185 | INIT_LIST_HEAD(&group->cg_children); |
184 | } | 186 | } |
187 | EXPORT_SYMBOL(config_group_init); | ||
185 | 188 | ||
186 | /** | 189 | /** |
187 | * config_group_find_item - search for item in group. | 190 | * config_group_find_item - search for item in group. |
@@ -195,11 +198,11 @@ void config_group_init(struct config_group *group) | |||
195 | struct config_item *config_group_find_item(struct config_group *group, | 198 | struct config_item *config_group_find_item(struct config_group *group, |
196 | const char *name) | 199 | const char *name) |
197 | { | 200 | { |
198 | struct list_head * entry; | 201 | struct list_head *entry; |
199 | struct config_item * ret = NULL; | 202 | struct config_item *ret = NULL; |
200 | 203 | ||
201 | list_for_each(entry,&group->cg_children) { | 204 | list_for_each(entry, &group->cg_children) { |
202 | struct config_item * item = to_item(entry); | 205 | struct config_item *item = to_item(entry); |
203 | if (config_item_name(item) && | 206 | if (config_item_name(item) && |
204 | !strcmp(config_item_name(item), name)) { | 207 | !strcmp(config_item_name(item), name)) { |
205 | ret = config_item_get(item); | 208 | ret = config_item_get(item); |
@@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group, | |||
208 | } | 211 | } |
209 | return ret; | 212 | return ret; |
210 | } | 213 | } |
211 | |||
212 | EXPORT_SYMBOL(config_item_init); | ||
213 | EXPORT_SYMBOL(config_group_init); | ||
214 | EXPORT_SYMBOL(config_item_get); | ||
215 | EXPORT_SYMBOL(config_item_put); | ||
216 | EXPORT_SYMBOL(config_group_find_item); | 214 | EXPORT_SYMBOL(config_group_find_item); |
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 7f26c3cf75ae..f6c285833390 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c | |||
@@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) | |||
85 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | 85 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ |
86 | inc_nlink(inode); | 86 | inc_nlink(inode); |
87 | } else { | 87 | } else { |
88 | pr_debug("configfs: could not get root inode\n"); | 88 | pr_debug("could not get root inode\n"); |
89 | return -ENOMEM; | 89 | return -ENOMEM; |
90 | } | 90 | } |
91 | 91 | ||
@@ -155,7 +155,7 @@ static int __init configfs_init(void) | |||
155 | 155 | ||
156 | return 0; | 156 | return 0; |
157 | out4: | 157 | out4: |
158 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); | 158 | pr_err("Unable to register filesystem!\n"); |
159 | configfs_inode_exit(); | 159 | configfs_inode_exit(); |
160 | out3: | 160 | out3: |
161 | kobject_put(config_kobj); | 161 | kobject_put(config_kobj); |
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index becc725a1953..0a48886e069c 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c | |||
@@ -83,7 +83,7 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) | |||
83 | return 0; | 83 | return 0; |
84 | } | 84 | } |
85 | 85 | ||
86 | static struct dentry_operations efivarfs_d_ops = { | 86 | static const struct dentry_operations efivarfs_d_ops = { |
87 | .d_compare = efivarfs_d_compare, | 87 | .d_compare = efivarfs_d_compare, |
88 | .d_hash = efivarfs_d_hash, | 88 | .d_hash = efivarfs_d_hash, |
89 | .d_delete = always_delete_dentry, | 89 | .d_delete = always_delete_dentry, |
diff --git a/fs/efs/dir.c b/fs/efs/dir.c index b72307ccdf7a..ce63b24f7c3e 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c | |||
@@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) | |||
26 | int slot; | 26 | int slot; |
27 | 27 | ||
28 | if (inode->i_size & (EFS_DIRBSIZE-1)) | 28 | if (inode->i_size & (EFS_DIRBSIZE-1)) |
29 | printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); | 29 | pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", |
30 | __func__); | ||
30 | 31 | ||
31 | /* work out where this entry can be found */ | 32 | /* work out where this entry can be found */ |
32 | block = ctx->pos >> EFS_DIRBSIZE_BITS; | 33 | block = ctx->pos >> EFS_DIRBSIZE_BITS; |
@@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) | |||
43 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); | 44 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); |
44 | 45 | ||
45 | if (!bh) { | 46 | if (!bh) { |
46 | printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); | 47 | pr_err("%s(): failed to read dir block %d\n", |
48 | __func__, block); | ||
47 | break; | 49 | break; |
48 | } | 50 | } |
49 | 51 | ||
50 | dirblock = (struct efs_dir *) bh->b_data; | 52 | dirblock = (struct efs_dir *) bh->b_data; |
51 | 53 | ||
52 | if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { | 54 | if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { |
53 | printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); | 55 | pr_err("%s(): invalid directory block\n", __func__); |
54 | brelse(bh); | 56 | brelse(bh); |
55 | break; | 57 | break; |
56 | } | 58 | } |
@@ -69,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) | |||
69 | inodenum = be32_to_cpu(dirslot->inode); | 71 | inodenum = be32_to_cpu(dirslot->inode); |
70 | namelen = dirslot->namelen; | 72 | namelen = dirslot->namelen; |
71 | nameptr = dirslot->name; | 73 | nameptr = dirslot->name; |
72 | 74 | pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", | |
73 | #ifdef DEBUG | 75 | __func__, block, slot, dirblock->slots-1, |
74 | printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); | 76 | inodenum, nameptr, namelen); |
75 | #endif | ||
76 | if (!namelen) | 77 | if (!namelen) |
77 | continue; | 78 | continue; |
78 | /* found the next entry */ | 79 | /* found the next entry */ |
@@ -80,7 +81,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) | |||
80 | 81 | ||
81 | /* sanity check */ | 82 | /* sanity check */ |
82 | if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { | 83 | if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { |
83 | printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); | 84 | pr_warn("directory entry %d exceeds directory block\n", |
85 | slot); | ||
84 | continue; | 86 | continue; |
85 | } | 87 | } |
86 | 88 | ||
diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5528926ac7f6..5bbf9612140c 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h | |||
@@ -7,6 +7,12 @@ | |||
7 | #ifndef _EFS_EFS_H_ | 7 | #ifndef _EFS_EFS_H_ |
8 | #define _EFS_EFS_H_ | 8 | #define _EFS_EFS_H_ |
9 | 9 | ||
10 | #ifdef pr_fmt | ||
11 | #undef pr_fmt | ||
12 | #endif | ||
13 | |||
14 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
15 | |||
10 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
11 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
12 | 18 | ||
diff --git a/fs/efs/file.c b/fs/efs/file.c index 1ccb364ffa63..a37dcee46866 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c | |||
@@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock, | |||
22 | /* | 22 | /* |
23 | * i have no idea why this happens as often as it does | 23 | * i have no idea why this happens as often as it does |
24 | */ | 24 | */ |
25 | printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", | 25 | pr_warn("%s(): block %d >= %ld (filesize %ld)\n", |
26 | block, | 26 | __func__, block, inode->i_blocks, inode->i_size); |
27 | inode->i_blocks, | ||
28 | inode->i_size); | ||
29 | #endif | 27 | #endif |
30 | return 0; | 28 | return 0; |
31 | } | 29 | } |
@@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, | |||
38 | int efs_bmap(struct inode *inode, efs_block_t block) { | 36 | int efs_bmap(struct inode *inode, efs_block_t block) { |
39 | 37 | ||
40 | if (block < 0) { | 38 | if (block < 0) { |
41 | printk(KERN_WARNING "EFS: bmap(): block < 0\n"); | 39 | pr_warn("%s(): block < 0\n", __func__); |
42 | return 0; | 40 | return 0; |
43 | } | 41 | } |
44 | 42 | ||
@@ -48,10 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) { | |||
48 | /* | 46 | /* |
49 | * i have no idea why this happens as often as it does | 47 | * i have no idea why this happens as often as it does |
50 | */ | 48 | */ |
51 | printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", | 49 | pr_warn("%s(): block %d >= %ld (filesize %ld)\n", |
52 | block, | 50 | __func__, block, inode->i_blocks, inode->i_size); |
53 | inode->i_blocks, | ||
54 | inode->i_size); | ||
55 | #endif | 51 | #endif |
56 | return 0; | 52 | return 0; |
57 | } | 53 | } |
diff --git a/fs/efs/inode.c b/fs/efs/inode.c index d15ccf20f1b3..079d20306ee1 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c | |||
@@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) | |||
89 | 89 | ||
90 | bh = sb_bread(inode->i_sb, block); | 90 | bh = sb_bread(inode->i_sb, block); |
91 | if (!bh) { | 91 | if (!bh) { |
92 | printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); | 92 | pr_warn("%s() failed at block %d\n", __func__, block); |
93 | goto read_inode_error; | 93 | goto read_inode_error; |
94 | } | 94 | } |
95 | 95 | ||
@@ -130,19 +130,16 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) | |||
130 | for(i = 0; i < EFS_DIRECTEXTENTS; i++) { | 130 | for(i = 0; i < EFS_DIRECTEXTENTS; i++) { |
131 | extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); | 131 | extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); |
132 | if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { | 132 | if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { |
133 | printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); | 133 | pr_warn("extent %d has bad magic number in inode %lu\n", |
134 | i, inode->i_ino); | ||
134 | brelse(bh); | 135 | brelse(bh); |
135 | goto read_inode_error; | 136 | goto read_inode_error; |
136 | } | 137 | } |
137 | } | 138 | } |
138 | 139 | ||
139 | brelse(bh); | 140 | brelse(bh); |
140 | 141 | pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", | |
141 | #ifdef DEBUG | 142 | inode->i_ino, in->numextents, inode->i_mode); |
142 | printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", | ||
143 | inode->i_ino, in->numextents, inode->i_mode); | ||
144 | #endif | ||
145 | |||
146 | switch (inode->i_mode & S_IFMT) { | 143 | switch (inode->i_mode & S_IFMT) { |
147 | case S_IFDIR: | 144 | case S_IFDIR: |
148 | inode->i_op = &efs_dir_inode_operations; | 145 | inode->i_op = &efs_dir_inode_operations; |
@@ -162,7 +159,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) | |||
162 | init_special_inode(inode, inode->i_mode, device); | 159 | init_special_inode(inode, inode->i_mode, device); |
163 | break; | 160 | break; |
164 | default: | 161 | default: |
165 | printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); | 162 | pr_warn("unsupported inode mode %o\n", inode->i_mode); |
166 | goto read_inode_error; | 163 | goto read_inode_error; |
167 | break; | 164 | break; |
168 | } | 165 | } |
@@ -171,7 +168,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) | |||
171 | return inode; | 168 | return inode; |
172 | 169 | ||
173 | read_inode_error: | 170 | read_inode_error: |
174 | printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); | 171 | pr_warn("failed to read inode %lu\n", inode->i_ino); |
175 | iget_failed(inode); | 172 | iget_failed(inode); |
176 | return ERR_PTR(-EIO); | 173 | return ERR_PTR(-EIO); |
177 | } | 174 | } |
@@ -216,7 +213,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
216 | 213 | ||
217 | /* if we only have one extent then nothing can be found */ | 214 | /* if we only have one extent then nothing can be found */ |
218 | if (in->numextents == 1) { | 215 | if (in->numextents == 1) { |
219 | printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); | 216 | pr_err("%s() failed to map (1 extent)\n", __func__); |
220 | return 0; | 217 | return 0; |
221 | } | 218 | } |
222 | 219 | ||
@@ -234,13 +231,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
234 | } | 231 | } |
235 | } | 232 | } |
236 | 233 | ||
237 | printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); | 234 | pr_err("%s() failed to map block %u (dir)\n", __func__, block); |
238 | return 0; | 235 | return 0; |
239 | } | 236 | } |
240 | 237 | ||
241 | #ifdef DEBUG | 238 | pr_debug("%s(): indirect search for logical block %u\n", |
242 | printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); | 239 | __func__, block); |
243 | #endif | ||
244 | direxts = in->extents[0].cooked.ex_offset; | 240 | direxts = in->extents[0].cooked.ex_offset; |
245 | indexts = in->numextents; | 241 | indexts = in->numextents; |
246 | 242 | ||
@@ -262,7 +258,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
262 | 258 | ||
263 | if (dirext == direxts) { | 259 | if (dirext == direxts) { |
264 | /* should never happen */ | 260 | /* should never happen */ |
265 | printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); | 261 | pr_err("couldn't find direct extent for indirect extent %d (block %u)\n", |
262 | cur, block); | ||
266 | if (bh) brelse(bh); | 263 | if (bh) brelse(bh); |
267 | return 0; | 264 | return 0; |
268 | } | 265 | } |
@@ -279,12 +276,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
279 | 276 | ||
280 | bh = sb_bread(inode->i_sb, iblock); | 277 | bh = sb_bread(inode->i_sb, iblock); |
281 | if (!bh) { | 278 | if (!bh) { |
282 | printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); | 279 | pr_err("%s() failed at block %d\n", |
280 | __func__, iblock); | ||
283 | return 0; | 281 | return 0; |
284 | } | 282 | } |
285 | #ifdef DEBUG | 283 | pr_debug("%s(): read indirect extent block %d\n", |
286 | printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); | 284 | __func__, iblock); |
287 | #endif | ||
288 | first = 0; | 285 | first = 0; |
289 | lastblock = iblock; | 286 | lastblock = iblock; |
290 | } | 287 | } |
@@ -294,7 +291,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
294 | extent_copy(&(exts[ioffset]), &ext); | 291 | extent_copy(&(exts[ioffset]), &ext); |
295 | 292 | ||
296 | if (ext.cooked.ex_magic != 0) { | 293 | if (ext.cooked.ex_magic != 0) { |
297 | printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); | 294 | pr_err("extent %d has bad magic number in block %d\n", |
295 | cur, iblock); | ||
298 | if (bh) brelse(bh); | 296 | if (bh) brelse(bh); |
299 | return 0; | 297 | return 0; |
300 | } | 298 | } |
@@ -306,7 +304,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { | |||
306 | } | 304 | } |
307 | } | 305 | } |
308 | if (bh) brelse(bh); | 306 | if (bh) brelse(bh); |
309 | printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); | 307 | pr_err("%s() failed to map block %u (indir)\n", __func__, block); |
310 | return 0; | 308 | return 0; |
311 | } | 309 | } |
312 | 310 | ||
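The inode.c hunks also drop the #ifdef DEBUG / #endif wrappers around the old KERN_DEBUG printks, since the converted pr_debug() calls compile to a no-op unless DEBUG is defined for the unit (or dynamic debug is enabled) and still type-check their arguments either way. A hedged sketch of re-enabling them at build time; the function and argument names are illustrative only:

    /* illustrative: defining DEBUG for this unit turns pr_debug() back on */
    #define DEBUG
    #include <linux/printk.h>

    static void efs_example(unsigned long ino, int numextents, unsigned int mode)
    {
        /* without DEBUG this expands to a no-op that still checks the format args */
        pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n",
                 ino, numextents, mode);
    }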
diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 96f66d213a19..356c044e2cd3 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c | |||
@@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) | |||
23 | efs_block_t block; | 23 | efs_block_t block; |
24 | 24 | ||
25 | if (inode->i_size & (EFS_DIRBSIZE-1)) | 25 | if (inode->i_size & (EFS_DIRBSIZE-1)) |
26 | printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); | 26 | pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", |
27 | __func__); | ||
27 | 28 | ||
28 | for(block = 0; block < inode->i_blocks; block++) { | 29 | for(block = 0; block < inode->i_blocks; block++) { |
29 | 30 | ||
30 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); | 31 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); |
31 | if (!bh) { | 32 | if (!bh) { |
32 | printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); | 33 | pr_err("%s(): failed to read dir block %d\n", |
34 | __func__, block); | ||
33 | return 0; | 35 | return 0; |
34 | } | 36 | } |
35 | 37 | ||
36 | dirblock = (struct efs_dir *) bh->b_data; | 38 | dirblock = (struct efs_dir *) bh->b_data; |
37 | 39 | ||
38 | if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { | 40 | if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { |
39 | printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); | 41 | pr_err("%s(): invalid directory block\n", __func__); |
40 | brelse(bh); | 42 | brelse(bh); |
41 | return(0); | 43 | return(0); |
42 | } | 44 | } |
diff --git a/fs/efs/super.c b/fs/efs/super.c index 3befcc9f5d63..7fca462ea4e3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c | |||
@@ -134,7 +134,7 @@ static const struct export_operations efs_export_ops = { | |||
134 | 134 | ||
135 | static int __init init_efs_fs(void) { | 135 | static int __init init_efs_fs(void) { |
136 | int err; | 136 | int err; |
137 | printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); | 137 | pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); |
138 | err = init_inodecache(); | 138 | err = init_inodecache(); |
139 | if (err) | 139 | if (err) |
140 | goto out1; | 140 | goto out1; |
@@ -179,12 +179,12 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { | |||
179 | csum += be32_to_cpu(cs); | 179 | csum += be32_to_cpu(cs); |
180 | } | 180 | } |
181 | if (csum) { | 181 | if (csum) { |
182 | printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); | 182 | pr_warn("SGI disklabel: checksum bad, label corrupted\n"); |
183 | return 0; | 183 | return 0; |
184 | } | 184 | } |
185 | 185 | ||
186 | #ifdef DEBUG | 186 | #ifdef DEBUG |
187 | printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); | 187 | pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); |
188 | 188 | ||
189 | for(i = 0; i < NVDIR; i++) { | 189 | for(i = 0; i < NVDIR; i++) { |
190 | int j; | 190 | int j; |
@@ -196,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { | |||
196 | name[j] = (char) 0; | 196 | name[j] = (char) 0; |
197 | 197 | ||
198 | if (name[0]) { | 198 | if (name[0]) { |
199 | printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", | 199 | pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", |
200 | name, | 200 | name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), |
201 | (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), | ||
202 | (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); | 201 | (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); |
203 | } | 202 | } |
204 | } | 203 | } |
@@ -211,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { | |||
211 | } | 210 | } |
212 | #ifdef DEBUG | 211 | #ifdef DEBUG |
213 | if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { | 212 | if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { |
214 | printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", | 213 | pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", |
215 | i, | 214 | i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), |
216 | (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), | 215 | (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), |
217 | (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), | 216 | pt_type, (pt_entry->pt_name) ? |
218 | pt_type, | 217 | pt_entry->pt_name : "unknown"); |
219 | (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); | ||
220 | } | 218 | } |
221 | #endif | 219 | #endif |
222 | if (IS_EFS(pt_type)) { | 220 | if (IS_EFS(pt_type)) { |
@@ -226,11 +224,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { | |||
226 | } | 224 | } |
227 | 225 | ||
228 | if (slice == -1) { | 226 | if (slice == -1) { |
229 | printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); | 227 | pr_notice("partition table contained no EFS partitions\n"); |
230 | #ifdef DEBUG | 228 | #ifdef DEBUG |
231 | } else { | 229 | } else { |
232 | printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", | 230 | pr_info("using slice %d (type %s, offset 0x%x)\n", slice, |
233 | slice, | ||
234 | (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", | 231 | (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", |
235 | sblock); | 232 | sblock); |
236 | #endif | 233 | #endif |
@@ -268,7 +265,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) | |||
268 | 265 | ||
269 | s->s_magic = EFS_SUPER_MAGIC; | 266 | s->s_magic = EFS_SUPER_MAGIC; |
270 | if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { | 267 | if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { |
271 | printk(KERN_ERR "EFS: device does not support %d byte blocks\n", | 268 | pr_err("device does not support %d byte blocks\n", |
272 | EFS_BLOCKSIZE); | 269 | EFS_BLOCKSIZE); |
273 | return -EINVAL; | 270 | return -EINVAL; |
274 | } | 271 | } |
@@ -277,7 +274,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) | |||
277 | bh = sb_bread(s, 0); | 274 | bh = sb_bread(s, 0); |
278 | 275 | ||
279 | if (!bh) { | 276 | if (!bh) { |
280 | printk(KERN_ERR "EFS: cannot read volume header\n"); | 277 | pr_err("cannot read volume header\n"); |
281 | return -EINVAL; | 278 | return -EINVAL; |
282 | } | 279 | } |
283 | 280 | ||
@@ -295,13 +292,14 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) | |||
295 | 292 | ||
296 | bh = sb_bread(s, sb->fs_start + EFS_SUPER); | 293 | bh = sb_bread(s, sb->fs_start + EFS_SUPER); |
297 | if (!bh) { | 294 | if (!bh) { |
298 | printk(KERN_ERR "EFS: cannot read superblock\n"); | 295 | pr_err("cannot read superblock\n"); |
299 | return -EINVAL; | 296 | return -EINVAL; |
300 | } | 297 | } |
301 | 298 | ||
302 | if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { | 299 | if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { |
303 | #ifdef DEBUG | 300 | #ifdef DEBUG |
304 | printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); | 301 | pr_warn("invalid superblock at block %u\n", |
302 | sb->fs_start + EFS_SUPER); | ||
305 | #endif | 303 | #endif |
306 | brelse(bh); | 304 | brelse(bh); |
307 | return -EINVAL; | 305 | return -EINVAL; |
@@ -310,7 +308,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) | |||
310 | 308 | ||
311 | if (!(s->s_flags & MS_RDONLY)) { | 309 | if (!(s->s_flags & MS_RDONLY)) { |
312 | #ifdef DEBUG | 310 | #ifdef DEBUG |
313 | printk(KERN_INFO "EFS: forcing read-only mode\n"); | 311 | pr_info("forcing read-only mode\n"); |
314 | #endif | 312 | #endif |
315 | s->s_flags |= MS_RDONLY; | 313 | s->s_flags |= MS_RDONLY; |
316 | } | 314 | } |
@@ -318,13 +316,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) | |||
318 | s->s_export_op = &efs_export_ops; | 316 | s->s_export_op = &efs_export_ops; |
319 | root = efs_iget(s, EFS_ROOTINODE); | 317 | root = efs_iget(s, EFS_ROOTINODE); |
320 | if (IS_ERR(root)) { | 318 | if (IS_ERR(root)) { |
321 | printk(KERN_ERR "EFS: get root inode failed\n"); | 319 | pr_err("get root inode failed\n"); |
322 | return PTR_ERR(root); | 320 | return PTR_ERR(root); |
323 | } | 321 | } |
324 | 322 | ||
325 | s->s_root = d_make_root(root); | 323 | s->s_root = d_make_root(root); |
326 | if (!(s->s_root)) { | 324 | if (!(s->s_root)) { |
327 | printk(KERN_ERR "EFS: get root dentry failed\n"); | 325 | pr_err("get root dentry failed\n"); |
328 | return -ENOMEM; | 326 | return -ENOMEM; |
329 | } | 327 | } |
330 | 328 | ||
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 48a359dd286e..b01fbfb51f43 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c | |||
@@ -259,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len, | |||
259 | 259 | ||
260 | /** | 260 | /** |
261 | * get_name - default export_operations->get_name function | 261 | * get_name - default export_operations->get_name function |
262 | * @dentry: the directory in which to find a name | 262 | * @path: the directory in which to find a name |
263 | * @name: a pointer to a %NAME_MAX+1 char buffer to store the name | 263 | * @name: a pointer to a %NAME_MAX+1 char buffer to store the name |
264 | * @child: the dentry for the child directory. | 264 | * @child: the dentry for the child directory. |
265 | * | 265 | * |
@@ -337,7 +337,7 @@ out: | |||
337 | /** | 337 | /** |
338 | * export_encode_fh - default export_operations->encode_fh function | 338 | * export_encode_fh - default export_operations->encode_fh function |
339 | * @inode: the object to encode | 339 | * @inode: the object to encode |
340 | * @fh: where to store the file handle fragment | 340 | * @fid: where to store the file handle fragment |
341 | * @max_len: maximum length to store there | 341 | * @max_len: maximum length to store there |
342 | * @parent: parent directory inode, if wanted | 342 | * @parent: parent directory inode, if wanted |
343 | * | 343 | * |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c8238a26818c..afe8a133e3d1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1044 | * allocating. If we are looking at the buddy cache we would | 1044 | * allocating. If we are looking at the buddy cache we would |
1045 | * have taken a reference using ext4_mb_load_buddy and that | 1045 | * have taken a reference using ext4_mb_load_buddy and that |
1046 | * would have pinned buddy page to page cache. | 1046 | * would have pinned buddy page to page cache. |
1047 | * The call to ext4_mb_get_buddy_page_lock will mark the | ||
1048 | * page accessed. | ||
1047 | */ | 1049 | */ |
1048 | ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); | 1050 | ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); |
1049 | if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { | 1051 | if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { |
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1062 | ret = -EIO; | 1064 | ret = -EIO; |
1063 | goto err; | 1065 | goto err; |
1064 | } | 1066 | } |
1065 | mark_page_accessed(page); | ||
1066 | 1067 | ||
1067 | if (e4b.bd_buddy_page == NULL) { | 1068 | if (e4b.bd_buddy_page == NULL) { |
1068 | /* | 1069 | /* |
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1082 | ret = -EIO; | 1083 | ret = -EIO; |
1083 | goto err; | 1084 | goto err; |
1084 | } | 1085 | } |
1085 | mark_page_accessed(page); | ||
1086 | err: | 1086 | err: |
1087 | ext4_mb_put_buddy_page_lock(&e4b); | 1087 | ext4_mb_put_buddy_page_lock(&e4b); |
1088 | return ret; | 1088 | return ret; |
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1141 | 1141 | ||
1142 | /* we could use find_or_create_page(), but it locks page | 1142 | /* we could use find_or_create_page(), but it locks page |
1143 | * what we'd like to avoid in fast path ... */ | 1143 | * what we'd like to avoid in fast path ... */ |
1144 | page = find_get_page(inode->i_mapping, pnum); | 1144 | page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); |
1145 | if (page == NULL || !PageUptodate(page)) { | 1145 | if (page == NULL || !PageUptodate(page)) { |
1146 | if (page) | 1146 | if (page) |
1147 | /* | 1147 | /* |
@@ -1176,15 +1176,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1176 | ret = -EIO; | 1176 | ret = -EIO; |
1177 | goto err; | 1177 | goto err; |
1178 | } | 1178 | } |
1179 | |||
1180 | /* Pages marked accessed already */ | ||
1179 | e4b->bd_bitmap_page = page; | 1181 | e4b->bd_bitmap_page = page; |
1180 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | 1182 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); |
1181 | mark_page_accessed(page); | ||
1182 | 1183 | ||
1183 | block++; | 1184 | block++; |
1184 | pnum = block / blocks_per_page; | 1185 | pnum = block / blocks_per_page; |
1185 | poff = block % blocks_per_page; | 1186 | poff = block % blocks_per_page; |
1186 | 1187 | ||
1187 | page = find_get_page(inode->i_mapping, pnum); | 1188 | page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); |
1188 | if (page == NULL || !PageUptodate(page)) { | 1189 | if (page == NULL || !PageUptodate(page)) { |
1189 | if (page) | 1190 | if (page) |
1190 | page_cache_release(page); | 1191 | page_cache_release(page); |
@@ -1209,9 +1210,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1209 | ret = -EIO; | 1210 | ret = -EIO; |
1210 | goto err; | 1211 | goto err; |
1211 | } | 1212 | } |
1213 | |||
1214 | /* Pages marked accessed already */ | ||
1212 | e4b->bd_buddy_page = page; | 1215 | e4b->bd_buddy_page = page; |
1213 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | 1216 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); |
1214 | mark_page_accessed(page); | ||
1215 | 1217 | ||
1216 | BUG_ON(e4b->bd_bitmap_page == NULL); | 1218 | BUG_ON(e4b->bd_bitmap_page == NULL); |
1217 | BUG_ON(e4b->bd_buddy_page == NULL); | 1219 | BUG_ON(e4b->bd_buddy_page == NULL); |
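The mballoc conversion folds the separate mark_page_accessed() calls into the page-cache lookup itself: find_get_page_flags() with FGP_ACCESSED marks the page accessed while the reference is taken, and (per the added comments) pages created on the buddy-init path are already marked at creation. A minimal sketch of the lookup pattern; the mapping, index and helper name are placeholders, not code from the patch:

    #include <linux/pagemap.h>

    static struct page *lookup_buddy_page(struct address_space *mapping,
                                          pgoff_t index)
    {
        /* takes a reference and marks the page accessed in one step,
         * replacing find_get_page() + mark_page_accessed() */
        struct page *page = find_get_page_flags(mapping, index, FGP_ACCESSED);

        if (page && !PageUptodate(page)) {
            page_cache_release(page);   /* drop the ref; caller re-reads the block */
            page = NULL;
        }
        return page;
    }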
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c18d95b50540..1a64e7a52b84 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -429,7 +429,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
429 | block_start = bh_offset(bh); | 429 | block_start = bh_offset(bh); |
430 | if (block_start >= len) { | 430 | if (block_start >= len) { |
431 | /* | 431 | /* |
432 | * Comments copied from block_write_full_page_endio: | 432 | * Comments copied from block_write_full_page: |
433 | * | 433 | * |
434 | * The page straddles i_size. It must be zeroed out on | 434 | * The page straddles i_size. It must be zeroed out on |
435 | * each and every writepage invocation because it may | 435 | * each and every writepage invocation because it may |
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4aa521aa9bc3..c405b8f17054 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c | |||
@@ -69,7 +69,6 @@ repeat: | |||
69 | goto repeat; | 69 | goto repeat; |
70 | } | 70 | } |
71 | out: | 71 | out: |
72 | mark_page_accessed(page); | ||
73 | return page; | 72 | return page; |
74 | } | 73 | } |
75 | 74 | ||
@@ -137,13 +136,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) | |||
137 | if (!page) | 136 | if (!page) |
138 | continue; | 137 | continue; |
139 | if (PageUptodate(page)) { | 138 | if (PageUptodate(page)) { |
140 | mark_page_accessed(page); | ||
141 | f2fs_put_page(page, 1); | 139 | f2fs_put_page(page, 1); |
142 | continue; | 140 | continue; |
143 | } | 141 | } |
144 | 142 | ||
145 | f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); | 143 | f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); |
146 | mark_page_accessed(page); | ||
147 | f2fs_put_page(page, 0); | 144 | f2fs_put_page(page, 0); |
148 | } | 145 | } |
149 | out: | 146 | out: |
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a161e955c4c8..57caa6eaf47b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c | |||
@@ -967,7 +967,6 @@ repeat: | |||
967 | goto repeat; | 967 | goto repeat; |
968 | } | 968 | } |
969 | got_it: | 969 | got_it: |
970 | mark_page_accessed(page); | ||
971 | return page; | 970 | return page; |
972 | } | 971 | } |
973 | 972 | ||
@@ -1022,7 +1021,6 @@ page_hit: | |||
1022 | f2fs_put_page(page, 1); | 1021 | f2fs_put_page(page, 1); |
1023 | return ERR_PTR(-EIO); | 1022 | return ERR_PTR(-EIO); |
1024 | } | 1023 | } |
1025 | mark_page_accessed(page); | ||
1026 | return page; | 1024 | return page; |
1027 | } | 1025 | } |
1028 | 1026 | ||
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f7cff367db7f..56cce7fdd39e 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c | |||
@@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache, | |||
280 | spin_unlock(&fscache_fsdef_index.lock); | 280 | spin_unlock(&fscache_fsdef_index.lock); |
281 | up_write(&fscache_addremove_sem); | 281 | up_write(&fscache_addremove_sem); |
282 | 282 | ||
283 | printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", | 283 | pr_notice("Cache \"%s\" added (type %s)\n", |
284 | cache->tag->name, cache->ops->name); | 284 | cache->tag->name, cache->ops->name); |
285 | kobject_uevent(cache->kobj, KOBJ_ADD); | 285 | kobject_uevent(cache->kobj, KOBJ_ADD); |
286 | 286 | ||
287 | _leave(" = 0 [%s]", cache->identifier); | 287 | _leave(" = 0 [%s]", cache->identifier); |
288 | return 0; | 288 | return 0; |
289 | 289 | ||
290 | tag_in_use: | 290 | tag_in_use: |
291 | printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); | 291 | pr_err("Cache tag '%s' already in use\n", tagname); |
292 | __fscache_release_cache_tag(tag); | 292 | __fscache_release_cache_tag(tag); |
293 | _leave(" = -EXIST"); | 293 | _leave(" = -EXIST"); |
294 | return -EEXIST; | 294 | return -EEXIST; |
@@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache); | |||
317 | void fscache_io_error(struct fscache_cache *cache) | 317 | void fscache_io_error(struct fscache_cache *cache) |
318 | { | 318 | { |
319 | if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) | 319 | if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) |
320 | printk(KERN_ERR "FS-Cache:" | 320 | pr_err("Cache '%s' stopped due to I/O error\n", |
321 | " Cache '%s' stopped due to I/O error\n", | ||
322 | cache->ops->name); | 321 | cache->ops->name); |
323 | } | 322 | } |
324 | EXPORT_SYMBOL(fscache_io_error); | 323 | EXPORT_SYMBOL(fscache_io_error); |
@@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache) | |||
369 | 368 | ||
370 | _enter(""); | 369 | _enter(""); |
371 | 370 | ||
372 | printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", | 371 | pr_notice("Withdrawing cache \"%s\"\n", |
373 | cache->tag->name); | 372 | cache->tag->name); |
374 | 373 | ||
375 | /* make the cache unavailable for cookie acquisition */ | 374 | /* make the cache unavailable for cookie acquisition */ |
376 | if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) | 375 | if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) |
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 29d7feb62cf7..aec01be91b0a 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c | |||
@@ -519,7 +519,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) | |||
519 | ASSERTCMP(atomic_read(&cookie->n_active), >, 0); | 519 | ASSERTCMP(atomic_read(&cookie->n_active), >, 0); |
520 | 520 | ||
521 | if (atomic_read(&cookie->n_children) != 0) { | 521 | if (atomic_read(&cookie->n_children) != 0) { |
522 | printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", | 522 | pr_err("Cookie '%s' still has children\n", |
523 | cookie->def->name); | 523 | cookie->def->name); |
524 | BUG(); | 524 | BUG(); |
525 | } | 525 | } |
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index bad496748a59..7d637e2335fd 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c | |||
@@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v) | |||
31 | 31 | ||
32 | switch ((unsigned long) v) { | 32 | switch ((unsigned long) v) { |
33 | case 1: | 33 | case 1: |
34 | seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " | 34 | seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); |
35 | " RETRV DLY RETRIEVLS\n"); | ||
36 | return 0; | 35 | return 0; |
37 | case 2: | 36 | case 2: |
38 | seq_puts(m, "===== ===== ========= ========= =========" | 37 | seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); |
39 | " ========= =========\n"); | ||
40 | return 0; | 38 | return 0; |
41 | default: | 39 | default: |
42 | index = (unsigned long) v - 3; | 40 | index = (unsigned long) v - 3; |
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4226f6680b06..bc6c08fcfddd 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h | |||
@@ -22,6 +22,12 @@ | |||
22 | * | 22 | * |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #ifdef pr_fmt | ||
26 | #undef pr_fmt | ||
27 | #endif | ||
28 | |||
29 | #define pr_fmt(fmt) "FS-Cache: " fmt | ||
30 | |||
25 | #include <linux/fscache-cache.h> | 31 | #include <linux/fscache-cache.h> |
26 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
27 | 33 | ||
@@ -413,8 +419,8 @@ do { \ | |||
413 | #define ASSERT(X) \ | 419 | #define ASSERT(X) \ |
414 | do { \ | 420 | do { \ |
415 | if (unlikely(!(X))) { \ | 421 | if (unlikely(!(X))) { \ |
416 | printk(KERN_ERR "\n"); \ | 422 | pr_err("\n"); \ |
417 | printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ | 423 | pr_err("Assertion failed\n"); \ |
418 | BUG(); \ | 424 | BUG(); \ |
419 | } \ | 425 | } \ |
420 | } while (0) | 426 | } while (0) |
@@ -422,9 +428,9 @@ do { \ | |||
422 | #define ASSERTCMP(X, OP, Y) \ | 428 | #define ASSERTCMP(X, OP, Y) \ |
423 | do { \ | 429 | do { \ |
424 | if (unlikely(!((X) OP (Y)))) { \ | 430 | if (unlikely(!((X) OP (Y)))) { \ |
425 | printk(KERN_ERR "\n"); \ | 431 | pr_err("\n"); \ |
426 | printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ | 432 | pr_err("Assertion failed\n"); \ |
427 | printk(KERN_ERR "%lx " #OP " %lx is false\n", \ | 433 | pr_err("%lx " #OP " %lx is false\n", \ |
428 | (unsigned long)(X), (unsigned long)(Y)); \ | 434 | (unsigned long)(X), (unsigned long)(Y)); \ |
429 | BUG(); \ | 435 | BUG(); \ |
430 | } \ | 436 | } \ |
@@ -433,8 +439,8 @@ do { \ | |||
433 | #define ASSERTIF(C, X) \ | 439 | #define ASSERTIF(C, X) \ |
434 | do { \ | 440 | do { \ |
435 | if (unlikely((C) && !(X))) { \ | 441 | if (unlikely((C) && !(X))) { \ |
436 | printk(KERN_ERR "\n"); \ | 442 | pr_err("\n"); \ |
437 | printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ | 443 | pr_err("Assertion failed\n"); \ |
438 | BUG(); \ | 444 | BUG(); \ |
439 | } \ | 445 | } \ |
440 | } while (0) | 446 | } while (0) |
@@ -442,9 +448,9 @@ do { \ | |||
442 | #define ASSERTIFCMP(C, X, OP, Y) \ | 448 | #define ASSERTIFCMP(C, X, OP, Y) \ |
443 | do { \ | 449 | do { \ |
444 | if (unlikely((C) && !((X) OP (Y)))) { \ | 450 | if (unlikely((C) && !((X) OP (Y)))) { \ |
445 | printk(KERN_ERR "\n"); \ | 451 | pr_err("\n"); \ |
446 | printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ | 452 | pr_err("Assertion failed\n"); \ |
447 | printk(KERN_ERR "%lx " #OP " %lx is false\n", \ | 453 | pr_err("%lx " #OP " %lx is false\n", \ |
448 | (unsigned long)(X), (unsigned long)(Y)); \ | 454 | (unsigned long)(X), (unsigned long)(Y)); \ |
449 | BUG(); \ | 455 | BUG(); \ |
450 | } \ | 456 | } \ |
diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7c27907e650c..acd4bf1fc277 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c | |||
@@ -146,8 +146,7 @@ static int __init fscache_init(void) | |||
146 | 0, | 146 | 0, |
147 | fscache_cookie_init_once); | 147 | fscache_cookie_init_once); |
148 | if (!fscache_cookie_jar) { | 148 | if (!fscache_cookie_jar) { |
149 | printk(KERN_NOTICE | 149 | pr_notice("Failed to allocate a cookie jar\n"); |
150 | "FS-Cache: Failed to allocate a cookie jar\n"); | ||
151 | ret = -ENOMEM; | 150 | ret = -ENOMEM; |
152 | goto error_cookie_jar; | 151 | goto error_cookie_jar; |
153 | } | 152 | } |
@@ -156,7 +155,7 @@ static int __init fscache_init(void) | |||
156 | if (!fscache_root) | 155 | if (!fscache_root) |
157 | goto error_kobj; | 156 | goto error_kobj; |
158 | 157 | ||
159 | printk(KERN_NOTICE "FS-Cache: Loaded\n"); | 158 | pr_notice("Loaded\n"); |
160 | return 0; | 159 | return 0; |
161 | 160 | ||
162 | error_kobj: | 161 | error_kobj: |
@@ -192,7 +191,7 @@ static void __exit fscache_exit(void) | |||
192 | fscache_proc_cleanup(); | 191 | fscache_proc_cleanup(); |
193 | destroy_workqueue(fscache_op_wq); | 192 | destroy_workqueue(fscache_op_wq); |
194 | destroy_workqueue(fscache_object_wq); | 193 | destroy_workqueue(fscache_object_wq); |
195 | printk(KERN_NOTICE "FS-Cache: Unloaded\n"); | 194 | pr_notice("Unloaded\n"); |
196 | } | 195 | } |
197 | 196 | ||
198 | module_exit(fscache_exit); | 197 | module_exit(fscache_exit); |
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 989f39401547..6d941f56faf4 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c | |||
@@ -65,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) | |||
65 | list_add(&netfs->link, &fscache_netfs_list); | 65 | list_add(&netfs->link, &fscache_netfs_list); |
66 | ret = 0; | 66 | ret = 0; |
67 | 67 | ||
68 | printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", | 68 | pr_notice("Netfs '%s' registered for caching\n", netfs->name); |
69 | netfs->name); | ||
70 | 69 | ||
71 | already_registered: | 70 | already_registered: |
72 | up_write(&fscache_addremove_sem); | 71 | up_write(&fscache_addremove_sem); |
@@ -97,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) | |||
97 | 96 | ||
98 | up_write(&fscache_addremove_sem); | 97 | up_write(&fscache_addremove_sem); |
99 | 98 | ||
100 | printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", | 99 | pr_notice("Netfs '%s' unregistered from caching\n", |
101 | netfs->name); | 100 | netfs->name); |
102 | 101 | ||
103 | _leave(""); | 102 | _leave(""); |
104 | } | 103 | } |
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index b5ebc2d7d80d..b8179ca6bf9d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c | |||
@@ -285,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
285 | fscache_unuse_cookie(obj); | 285 | fscache_unuse_cookie(obj); |
286 | 286 | ||
287 | if (keylen > 0 || auxlen > 0) { | 287 | if (keylen > 0 || auxlen > 0) { |
288 | seq_printf(m, " "); | 288 | seq_puts(m, " "); |
289 | for (p = buf; keylen > 0; keylen--) | 289 | for (p = buf; keylen > 0; keylen--) |
290 | seq_printf(m, "%02x", *p++); | 290 | seq_printf(m, "%02x", *p++); |
291 | if (auxlen > 0) { | 291 | if (auxlen > 0) { |
292 | if (config & FSCACHE_OBJLIST_CONFIG_KEY) | 292 | if (config & FSCACHE_OBJLIST_CONFIG_KEY) |
293 | seq_printf(m, ", "); | 293 | seq_puts(m, ", "); |
294 | for (; auxlen > 0; auxlen--) | 294 | for (; auxlen > 0; auxlen--) |
295 | seq_printf(m, "%02x", *p++); | 295 | seq_printf(m, "%02x", *p++); |
296 | } | 296 | } |
297 | } | 297 | } |
298 | 298 | ||
299 | seq_printf(m, "\n"); | 299 | seq_puts(m, "\n"); |
300 | } else { | 300 | } else { |
301 | seq_printf(m, "<no_netfs>\n"); | 301 | seq_puts(m, "<no_netfs>\n"); |
302 | } | 302 | } |
303 | return 0; | 303 | return 0; |
304 | } | 304 | } |
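The object-list change is a straightforward checkpatch cleanup: seq_printf() with a constant string and no format arguments is better written as seq_puts(), which skips format parsing. A short sketch of the rule with an illustrative seq_file callback:

    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
        unsigned char key = 0xab;

        seq_puts(m, "KEY ");            /* constant string: no formatting needed */
        seq_printf(m, "%02x", key);     /* formatting still uses seq_printf() */
        seq_puts(m, "\n");
        return 0;
    }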
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 318071aca217..e7b87a0e5185 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c | |||
@@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) | |||
51 | _debug("queue for caller's attention"); | 51 | _debug("queue for caller's attention"); |
52 | break; | 52 | break; |
53 | default: | 53 | default: |
54 | printk(KERN_ERR "FS-Cache: Unexpected op type %lx", | 54 | pr_err("Unexpected op type %lx", op->flags); |
55 | op->flags); | ||
56 | BUG(); | 55 | BUG(); |
57 | break; | 56 | break; |
58 | } | 57 | } |
diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7f5c658af755..ed70714503fa 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c | |||
@@ -1108,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) | |||
1108 | static bool once_only; | 1108 | static bool once_only; |
1109 | if (!once_only) { | 1109 | if (!once_only) { |
1110 | once_only = true; | 1110 | once_only = true; |
1111 | printk(KERN_WARNING "FS-Cache:" | 1111 | pr_warn("Cookie type %s marked page %lx multiple times\n", |
1112 | " Cookie type %s marked page %lx" | 1112 | cookie->def->name, page->index); |
1113 | " multiple times\n", | ||
1114 | cookie->def->name, page->index); | ||
1115 | } | 1113 | } |
1116 | } | 1114 | } |
1117 | 1115 | ||
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce373e4..098f97bdcf1b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -1614,7 +1614,7 @@ out_finish: | |||
1614 | 1614 | ||
1615 | static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) | 1615 | static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) |
1616 | { | 1616 | { |
1617 | release_pages(req->pages, req->num_pages, 0); | 1617 | release_pages(req->pages, req->num_pages, false); |
1618 | } | 1618 | } |
1619 | 1619 | ||
1620 | static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | 1620 | static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f680d2c44e97..903cbc9cd6bd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1089,8 +1089,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
1089 | tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); | 1089 | tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); |
1090 | flush_dcache_page(page); | 1090 | flush_dcache_page(page); |
1091 | 1091 | ||
1092 | mark_page_accessed(page); | ||
1093 | |||
1094 | if (!tmp) { | 1092 | if (!tmp) { |
1095 | unlock_page(page); | 1093 | unlock_page(page); |
1096 | page_cache_release(page); | 1094 | page_cache_release(page); |
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a49b037da81..492123cda64a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -577,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, | |||
577 | p = kmap_atomic(page); | 577 | p = kmap_atomic(page); |
578 | memcpy(buf + copied, p + offset, amt); | 578 | memcpy(buf + copied, p + offset, amt); |
579 | kunmap_atomic(p); | 579 | kunmap_atomic(p); |
580 | mark_page_accessed(page); | ||
581 | page_cache_release(page); | 580 | page_cache_release(page); |
582 | copied += amt; | 581 | copied += amt; |
583 | index++; | 582 | index++; |
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 2cf09b63a6b4..b984a6e190bc 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c | |||
@@ -136,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) | |||
136 | yield(); | 136 | yield(); |
137 | } | 137 | } |
138 | } else { | 138 | } else { |
139 | page = find_lock_page(mapping, index); | 139 | page = find_get_page_flags(mapping, index, |
140 | FGP_LOCK|FGP_ACCESSED); | ||
140 | if (!page) | 141 | if (!page) |
141 | return NULL; | 142 | return NULL; |
142 | } | 143 | } |
@@ -153,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) | |||
153 | map_bh(bh, sdp->sd_vfs, blkno); | 154 | map_bh(bh, sdp->sd_vfs, blkno); |
154 | 155 | ||
155 | unlock_page(page); | 156 | unlock_page(page); |
156 | mark_page_accessed(page); | ||
157 | page_cache_release(page); | 157 | page_cache_release(page); |
158 | 158 | ||
159 | return bh; | 159 | return bh; |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e19d4c0cacae..1e2872b25343 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -6,6 +6,8 @@ | |||
6 | * Copyright (C) 2002 Linus Torvalds. | 6 | * Copyright (C) 2002 Linus Torvalds. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
10 | |||
9 | #include <linux/module.h> | 11 | #include <linux/module.h> |
10 | #include <linux/thread_info.h> | 12 | #include <linux/thread_info.h> |
11 | #include <asm/current.h> | 13 | #include <asm/current.h> |
@@ -475,7 +477,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, | |||
475 | * annotation because huge_pmd_share() does an allocation under | 477 | * annotation because huge_pmd_share() does an allocation under |
476 | * i_mmap_mutex. | 478 | * i_mmap_mutex. |
477 | */ | 479 | */ |
478 | struct lock_class_key hugetlbfs_i_mmap_mutex_key; | 480 | static struct lock_class_key hugetlbfs_i_mmap_mutex_key; |
479 | 481 | ||
480 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, | 482 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, |
481 | struct inode *dir, | 483 | struct inode *dir, |
@@ -823,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) | |||
823 | ps = memparse(args[0].from, &rest); | 825 | ps = memparse(args[0].from, &rest); |
824 | pconfig->hstate = size_to_hstate(ps); | 826 | pconfig->hstate = size_to_hstate(ps); |
825 | if (!pconfig->hstate) { | 827 | if (!pconfig->hstate) { |
826 | printk(KERN_ERR | 828 | pr_err("Unsupported page size %lu MB\n", |
827 | "hugetlbfs: Unsupported page size %lu MB\n", | ||
828 | ps >> 20); | 829 | ps >> 20); |
829 | return -EINVAL; | 830 | return -EINVAL; |
830 | } | 831 | } |
@@ -832,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) | |||
832 | } | 833 | } |
833 | 834 | ||
834 | default: | 835 | default: |
835 | printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", | 836 | pr_err("Bad mount option: \"%s\"\n", p); |
836 | p); | ||
837 | return -EINVAL; | 837 | return -EINVAL; |
838 | break; | 838 | break; |
839 | } | 839 | } |
@@ -853,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) | |||
853 | return 0; | 853 | return 0; |
854 | 854 | ||
855 | bad_val: | 855 | bad_val: |
856 | printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", | 856 | pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); |
857 | args[0].from, p); | ||
858 | return -EINVAL; | 857 | return -EINVAL; |
859 | } | 858 | } |
860 | 859 | ||
@@ -902,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) | |||
902 | goto out_free; | 901 | goto out_free; |
903 | return 0; | 902 | return 0; |
904 | out_free: | 903 | out_free: |
905 | if (sbinfo->spool) | 904 | kfree(sbinfo->spool); |
906 | kfree(sbinfo->spool); | ||
907 | kfree(sbinfo); | 905 | kfree(sbinfo); |
908 | return -ENOMEM; | 906 | return -ENOMEM; |
909 | } | 907 | } |
@@ -939,7 +937,7 @@ static int get_hstate_idx(int page_size_log) | |||
939 | return h - hstates; | 937 | return h - hstates; |
940 | } | 938 | } |
941 | 939 | ||
942 | static struct dentry_operations anon_ops = { | 940 | static const struct dentry_operations anon_ops = { |
943 | .d_dname = simple_dname | 941 | .d_dname = simple_dname |
944 | }; | 942 | }; |
945 | 943 | ||
@@ -970,8 +968,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, | |||
970 | *user = current_user(); | 968 | *user = current_user(); |
971 | if (user_shm_lock(size, *user)) { | 969 | if (user_shm_lock(size, *user)) { |
972 | task_lock(current); | 970 | task_lock(current); |
973 | printk_once(KERN_WARNING | 971 | pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", |
974 | "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", | ||
975 | current->comm, current->pid); | 972 | current->comm, current->pid); |
976 | task_unlock(current); | 973 | task_unlock(current); |
977 | } else { | 974 | } else { |
@@ -1031,7 +1028,7 @@ static int __init init_hugetlbfs_fs(void) | |||
1031 | int i; | 1028 | int i; |
1032 | 1029 | ||
1033 | if (!hugepages_supported()) { | 1030 | if (!hugepages_supported()) { |
1034 | pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); | 1031 | pr_info("disabling because there are no supported hugepage sizes\n"); |
1035 | return -ENOTSUPP; | 1032 | return -ENOTSUPP; |
1036 | } | 1033 | } |
1037 | 1034 | ||
@@ -1060,7 +1057,7 @@ static int __init init_hugetlbfs_fs(void) | |||
1060 | buf); | 1057 | buf); |
1061 | 1058 | ||
1062 | if (IS_ERR(hugetlbfs_vfsmount[i])) { | 1059 | if (IS_ERR(hugetlbfs_vfsmount[i])) { |
1063 | pr_err("hugetlb: Cannot mount internal hugetlbfs for " | 1060 | pr_err("Cannot mount internal hugetlbfs for " |
1064 | "page size %uK", ps_kb); | 1061 | "page size %uK", ps_kb); |
1065 | error = PTR_ERR(hugetlbfs_vfsmount[i]); | 1062 | error = PTR_ERR(hugetlbfs_vfsmount[i]); |
1066 | hugetlbfs_vfsmount[i] = NULL; | 1063 | hugetlbfs_vfsmount[i] = NULL; |
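Besides the pr_fmt/pr_err conversion, the hugetlbfs hunk drops a redundant NULL check in the fill_super error path: kfree(NULL) is defined to be a no-op, so the guard only adds a branch. A minimal sketch of the simplified cleanup, with the structure shortened for illustration:

    #include <linux/errno.h>
    #include <linux/slab.h>

    struct example_sbinfo {
        void *spool;
    };

    static int example_cleanup(struct example_sbinfo *sbinfo)
    {
        /* kfree(NULL) is a no-op, so no "if (sbinfo->spool)" guard is needed */
        kfree(sbinfo->spool);
        kfree(sbinfo);
        return -ENOMEM;
    }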
diff --git a/fs/libfs.c b/fs/libfs.c index a1844244246f..88e3e00e2eca 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * Library for filesystems writers. | 3 | * Library for filesystems writers. |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include <linux/blkdev.h> | ||
6 | #include <linux/export.h> | 7 | #include <linux/export.h> |
7 | #include <linux/pagemap.h> | 8 | #include <linux/pagemap.h> |
8 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
@@ -923,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
923 | EXPORT_SYMBOL_GPL(generic_fh_to_parent); | 924 | EXPORT_SYMBOL_GPL(generic_fh_to_parent); |
924 | 925 | ||
925 | /** | 926 | /** |
926 | * generic_file_fsync - generic fsync implementation for simple filesystems | 927 | * __generic_file_fsync - generic fsync implementation for simple filesystems |
928 | * | ||
927 | * @file: file to synchronize | 929 | * @file: file to synchronize |
930 | * @start: start offset in bytes | ||
931 | * @end: end offset in bytes (inclusive) | ||
928 | * @datasync: only synchronize essential metadata if true | 932 | * @datasync: only synchronize essential metadata if true |
929 | * | 933 | * |
930 | * This is a generic implementation of the fsync method for simple | 934 | * This is a generic implementation of the fsync method for simple |
931 | * filesystems which track all non-inode metadata in the buffers list | 935 | * filesystems which track all non-inode metadata in the buffers list |
932 | * hanging off the address_space structure. | 936 | * hanging off the address_space structure. |
933 | */ | 937 | */ |
934 | int generic_file_fsync(struct file *file, loff_t start, loff_t end, | 938 | int __generic_file_fsync(struct file *file, loff_t start, loff_t end, |
935 | int datasync) | 939 | int datasync) |
936 | { | 940 | { |
937 | struct inode *inode = file->f_mapping->host; | 941 | struct inode *inode = file->f_mapping->host; |
938 | int err; | 942 | int err; |
@@ -952,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, | |||
952 | err = sync_inode_metadata(inode, 1); | 956 | err = sync_inode_metadata(inode, 1); |
953 | if (ret == 0) | 957 | if (ret == 0) |
954 | ret = err; | 958 | ret = err; |
959 | |||
955 | out: | 960 | out: |
956 | mutex_unlock(&inode->i_mutex); | 961 | mutex_unlock(&inode->i_mutex); |
957 | return ret; | 962 | return ret; |
958 | } | 963 | } |
964 | EXPORT_SYMBOL(__generic_file_fsync); | ||
965 | |||
966 | /** | ||
967 | * generic_file_fsync - generic fsync implementation for simple filesystems | ||
968 | * with flush | ||
969 | * @file: file to synchronize | ||
970 | * @start: start offset in bytes | ||
971 | * @end: end offset in bytes (inclusive) | ||
972 | * @datasync: only synchronize essential metadata if true | ||
973 | * | ||
974 | */ | ||
975 | |||
976 | int generic_file_fsync(struct file *file, loff_t start, loff_t end, | ||
977 | int datasync) | ||
978 | { | ||
979 | struct inode *inode = file->f_mapping->host; | ||
980 | int err; | ||
981 | |||
982 | err = __generic_file_fsync(file, start, end, datasync); | ||
983 | if (err) | ||
984 | return err; | ||
985 | return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); | ||
986 | } | ||
959 | EXPORT_SYMBOL(generic_file_fsync); | 987 | EXPORT_SYMBOL(generic_file_fsync); |
960 | 988 | ||
961 | /** | 989 | /** |
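The libfs change splits the old helper in two: __generic_file_fsync() does the data and metadata writeback as before, while generic_file_fsync() now also issues a disk cache flush so the data is durable on devices with volatile write caches; filesystems that handle flushing themselves can keep calling the double-underscore variant. A hedged sketch of an fsync method built on the new helper (mirroring what the new generic_file_fsync() does internally; the method name is illustrative):

    #include <linux/blkdev.h>
    #include <linux/fs.h>

    static int example_fsync(struct file *file, loff_t start, loff_t end,
                             int datasync)
    {
        struct inode *inode = file->f_mapping->host;
        int err;

        /* write back data and inode metadata, without the cache flush */
        err = __generic_file_fsync(file, start, end, datasync);
        if (err)
            return err;

        /* flush the device write cache explicitly */
        return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
    }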
diff --git a/fs/mpage.c b/fs/mpage.c index 4979ffa60aaa..5f9ed622274f 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -48,23 +48,7 @@ static void mpage_end_io(struct bio *bio, int err) | |||
48 | 48 | ||
49 | bio_for_each_segment_all(bv, bio, i) { | 49 | bio_for_each_segment_all(bv, bio, i) { |
50 | struct page *page = bv->bv_page; | 50 | struct page *page = bv->bv_page; |
51 | 51 | page_endio(page, bio_data_dir(bio), err); | |
52 | if (bio_data_dir(bio) == READ) { | ||
53 | if (!err) { | ||
54 | SetPageUptodate(page); | ||
55 | } else { | ||
56 | ClearPageUptodate(page); | ||
57 | SetPageError(page); | ||
58 | } | ||
59 | unlock_page(page); | ||
60 | } else { /* bio_data_dir(bio) == WRITE */ | ||
61 | if (err) { | ||
62 | SetPageError(page); | ||
63 | if (page->mapping) | ||
64 | set_bit(AS_EIO, &page->mapping->flags); | ||
65 | } | ||
66 | end_page_writeback(page); | ||
67 | } | ||
68 | } | 52 | } |
69 | 53 | ||
70 | bio_put(bio); | 54 | bio_put(bio); |
@@ -285,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, | |||
285 | 269 | ||
286 | alloc_new: | 270 | alloc_new: |
287 | if (bio == NULL) { | 271 | if (bio == NULL) { |
272 | if (first_hole == blocks_per_page) { | ||
273 | if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), | ||
274 | page)) | ||
275 | goto out; | ||
276 | } | ||
288 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), | 277 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), |
289 | min_t(int, nr_pages, bio_get_nr_vecs(bdev)), | 278 | min_t(int, nr_pages, bio_get_nr_vecs(bdev)), |
290 | GFP_KERNEL); | 279 | GFP_KERNEL); |
@@ -439,6 +428,35 @@ struct mpage_data { | |||
439 | unsigned use_writepage; | 428 | unsigned use_writepage; |
440 | }; | 429 | }; |
441 | 430 | ||
431 | /* | ||
432 | * We have our BIO, so we can now mark the buffers clean. Make | ||
433 | * sure to only clean buffers which we know we'll be writing. | ||
434 | */ | ||
435 | static void clean_buffers(struct page *page, unsigned first_unmapped) | ||
436 | { | ||
437 | unsigned buffer_counter = 0; | ||
438 | struct buffer_head *bh, *head; | ||
439 | if (!page_has_buffers(page)) | ||
440 | return; | ||
441 | head = page_buffers(page); | ||
442 | bh = head; | ||
443 | |||
444 | do { | ||
445 | if (buffer_counter++ == first_unmapped) | ||
446 | break; | ||
447 | clear_buffer_dirty(bh); | ||
448 | bh = bh->b_this_page; | ||
449 | } while (bh != head); | ||
450 | |||
451 | /* | ||
452 | * we cannot drop the bh if the page is not uptodate or a concurrent | ||
453 | * readpage would fail to serialize with the bh and it would read from | ||
454 | * disk before we reach the platter. | ||
455 | */ | ||
456 | if (buffer_heads_over_limit && PageUptodate(page)) | ||
457 | try_to_free_buffers(page); | ||
458 | } | ||
459 | |||
442 | static int __mpage_writepage(struct page *page, struct writeback_control *wbc, | 460 | static int __mpage_writepage(struct page *page, struct writeback_control *wbc, |
443 | void *data) | 461 | void *data) |
444 | { | 462 | { |
@@ -574,6 +592,13 @@ page_is_mapped: | |||
574 | 592 | ||
575 | alloc_new: | 593 | alloc_new: |
576 | if (bio == NULL) { | 594 | if (bio == NULL) { |
595 | if (first_unmapped == blocks_per_page) { | ||
596 | if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), | ||
597 | page, wbc)) { | ||
598 | clean_buffers(page, first_unmapped); | ||
599 | goto out; | ||
600 | } | ||
601 | } | ||
577 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), | 602 | bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), |
578 | bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); | 603 | bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); |
579 | if (bio == NULL) | 604 | if (bio == NULL) |
@@ -591,30 +616,7 @@ alloc_new: | |||
591 | goto alloc_new; | 616 | goto alloc_new; |
592 | } | 617 | } |
593 | 618 | ||
594 | /* | 619 | clean_buffers(page, first_unmapped); |
595 | * OK, we have our BIO, so we can now mark the buffers clean. Make | ||
596 | * sure to only clean buffers which we know we'll be writing. | ||
597 | */ | ||
598 | if (page_has_buffers(page)) { | ||
599 | struct buffer_head *head = page_buffers(page); | ||
600 | struct buffer_head *bh = head; | ||
601 | unsigned buffer_counter = 0; | ||
602 | |||
603 | do { | ||
604 | if (buffer_counter++ == first_unmapped) | ||
605 | break; | ||
606 | clear_buffer_dirty(bh); | ||
607 | bh = bh->b_this_page; | ||
608 | } while (bh != head); | ||
609 | |||
610 | /* | ||
611 | * we cannot drop the bh if the page is not uptodate | ||
612 | * or a concurrent readpage would fail to serialize with the bh | ||
613 | * and it would read from disk before we reach the platter. | ||
614 | */ | ||
615 | if (buffer_heads_over_limit && PageUptodate(page)) | ||
616 | try_to_free_buffers(page); | ||
617 | } | ||
618 | 620 | ||
619 | BUG_ON(PageWriteback(page)); | 621 | BUG_ON(PageWriteback(page)); |
620 | set_page_writeback(page); | 622 | set_page_writeback(page); |
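Two things happen in the mpage hunks: the open-coded buffer-cleaning loop moves into a clean_buffers() helper, and pages that are fully mapped to contiguous blocks (first_hole == blocks_per_page on read, first_unmapped == blocks_per_page on write) are first offered to the block driver via bdev_read_page()/bdev_write_page(), so drivers that implement the ->rw_page hook can complete the page without a bio; on any failure the code falls back to the usual mpage_alloc() path. A minimal sketch of that try-then-fall-back shape; the function and its arguments are placeholders:

    #include <linux/blkdev.h>

    static void example_readpage(struct block_device *bdev, sector_t sector,
                                 struct page *page, bool fully_mapped)
    {
        /* bdev_read_page() returns 0 only when the driver's ->rw_page hook
         * completed the page itself; anything else means "use the bio path" */
        if (fully_mapped && !bdev_read_page(bdev, sector, page))
            return;

        /* otherwise continue with the existing bio submission, exactly as the
         * hunk above does when it skips its "goto out" */
    }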
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 03ffde1f44d6..344889cd120e 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c | |||
@@ -53,15 +53,14 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts | |||
53 | return -EINVAL; | 53 | return -EINVAL; |
54 | } | 54 | } |
55 | if (opts->has_arg & OPT_INT) { | 55 | if (opts->has_arg & OPT_INT) { |
56 | char* v; | 56 | int rc = kstrtoul(val, 0, value); |
57 | 57 | ||
58 | *value = simple_strtoul(val, &v, 0); | 58 | if (rc) { |
59 | if (!*v) { | 59 | pr_info("%s: invalid numeric value in %s=%s\n", |
60 | return opts->val; | 60 | caller, token, val); |
61 | return rc; | ||
61 | } | 62 | } |
62 | pr_info("%s: invalid numeric value in %s=%s\n", | 63 | return opts->val; |
63 | caller, token, val); | ||
64 | return -EDOM; | ||
65 | } | 64 | } |
66 | if (opts->has_arg & OPT_STRING) { | 65 | if (opts->has_arg & OPT_STRING) { |
67 | return opts->val; | 66 | return opts->val; |
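The ncpfs change swaps simple_strtoul(), which silently stops at the first non-digit, for kstrtoul(), which parses the whole string and returns an error on overflow or trailing garbage. A small sketch of the difference; the helper name and message are illustrative:

    #include <linux/kernel.h>
    #include <linux/printk.h>

    static int parse_option(const char *val, unsigned long *value)
    {
        /* base 0 accepts decimal, 0x... hex and 0... octal, like the old code */
        int rc = kstrtoul(val, 0, value);

        if (rc)     /* -EINVAL on junk such as "42abc", -ERANGE on overflow */
            pr_info("invalid numeric value '%s'\n", val);
        return rc;
    }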
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 732648b270dc..3fdc8a3e1134 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c | |||
@@ -25,6 +25,19 @@ | |||
25 | #define FANOTIFY_DEFAULT_MAX_MARKS 8192 | 25 | #define FANOTIFY_DEFAULT_MAX_MARKS 8192 |
26 | #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 | 26 | #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 |
27 | 27 | ||
28 | /* | ||
29 | * All flags that may be specified in parameter event_f_flags of fanotify_init. | ||
30 | * | ||
31 | * Internal and external open flags are stored together in field f_flags of | ||
32 | * struct file. Only external open flags shall be allowed in event_f_flags. | ||
33 | * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be | ||
34 | * excluded. | ||
35 | */ | ||
36 | #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ | ||
37 | O_ACCMODE | O_APPEND | O_NONBLOCK | \ | ||
38 | __O_SYNC | O_DSYNC | O_CLOEXEC | \ | ||
39 | O_LARGEFILE | O_NOATIME ) | ||
40 | |||
28 | extern const struct fsnotify_ops fanotify_fsnotify_ops; | 41 | extern const struct fsnotify_ops fanotify_fsnotify_ops; |
29 | 42 | ||
30 | static struct kmem_cache *fanotify_mark_cache __read_mostly; | 43 | static struct kmem_cache *fanotify_mark_cache __read_mostly; |
@@ -669,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) | |||
669 | if (flags & ~FAN_ALL_INIT_FLAGS) | 682 | if (flags & ~FAN_ALL_INIT_FLAGS) |
670 | return -EINVAL; | 683 | return -EINVAL; |
671 | 684 | ||
685 | if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) | ||
686 | return -EINVAL; | ||
687 | |||
688 | switch (event_f_flags & O_ACCMODE) { | ||
689 | case O_RDONLY: | ||
690 | case O_RDWR: | ||
691 | case O_WRONLY: | ||
692 | break; | ||
693 | default: | ||
694 | return -EINVAL; | ||
695 | } | ||
696 | |||
672 | user = get_current_user(); | 697 | user = get_current_user(); |
673 | if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { | 698 | if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { |
674 | free_uid(user); | 699 | free_uid(user); |
@@ -776,7 +801,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, | |||
776 | case FAN_MARK_REMOVE: | 801 | case FAN_MARK_REMOVE: |
777 | if (!mask) | 802 | if (!mask) |
778 | return -EINVAL; | 803 | return -EINVAL; |
804 | break; | ||
779 | case FAN_MARK_FLUSH: | 805 | case FAN_MARK_FLUSH: |
806 | if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) | ||
807 | return -EINVAL; | ||
780 | break; | 808 | break; |
781 | default: | 809 | default: |
782 | return -EINVAL; | 810 | return -EINVAL; |
@@ -813,6 +841,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, | |||
813 | group->priority == FS_PRIO_0) | 841 | group->priority == FS_PRIO_0) |
814 | goto fput_and_out; | 842 | goto fput_and_out; |
815 | 843 | ||
844 | if (flags & FAN_MARK_FLUSH) { | ||
845 | ret = 0; | ||
846 | if (flags & FAN_MARK_MOUNT) | ||
847 | fsnotify_clear_vfsmount_marks_by_group(group); | ||
848 | else | ||
849 | fsnotify_clear_inode_marks_by_group(group); | ||
850 | goto fput_and_out; | ||
851 | } | ||
852 | |||
816 | ret = fanotify_find_path(dfd, pathname, &path, flags); | 853 | ret = fanotify_find_path(dfd, pathname, &path, flags); |
817 | if (ret) | 854 | if (ret) |
818 | goto fput_and_out; | 855 | goto fput_and_out; |
@@ -824,7 +861,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, | |||
824 | mnt = path.mnt; | 861 | mnt = path.mnt; |
825 | 862 | ||
826 | /* create/update an inode mark */ | 863 | /* create/update an inode mark */ |
827 | switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { | 864 | switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { |
828 | case FAN_MARK_ADD: | 865 | case FAN_MARK_ADD: |
829 | if (flags & FAN_MARK_MOUNT) | 866 | if (flags & FAN_MARK_MOUNT) |
830 | ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); | 867 | ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); |
@@ -837,12 +874,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, | |||
837 | else | 874 | else |
838 | ret = fanotify_remove_inode_mark(group, inode, mask, flags); | 875 | ret = fanotify_remove_inode_mark(group, inode, mask, flags); |
839 | break; | 876 | break; |
840 | case FAN_MARK_FLUSH: | ||
841 | if (flags & FAN_MARK_MOUNT) | ||
842 | fsnotify_clear_vfsmount_marks_by_group(group); | ||
843 | else | ||
844 | fsnotify_clear_inode_marks_by_group(group); | ||
845 | break; | ||
846 | default: | 877 | default: |
847 | ret = -EINVAL; | 878 | ret = -EINVAL; |
848 | } | 879 | } |
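The fanotify changes tighten fanotify_init() argument checking (only external open flags and a sane O_ACCMODE value are accepted in event_f_flags) and pull FAN_MARK_FLUSH handling out ahead of the path lookup, so flushing a group's marks no longer needs a valid pathname. A hedged user-space sketch of calls that stay within the new rules; it assumes a Linux system with fanotify and CAP_SYS_ADMIN:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/fanotify.h>

    int main(void)
    {
        /* O_RDONLY | O_CLOEXEC is inside the whitelisted event_f_flags set;
         * kernel-internal bits now get -EINVAL */
        int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC,
                               O_RDONLY | O_CLOEXEC);
        if (fd < 0) {
            perror("fanotify_init");
            return 1;
        }

        /* FAN_MARK_FLUSH is handled before the path lookup, so no pathname needed */
        if (fanotify_mark(fd, FAN_MARK_FLUSH, 0, AT_FDCWD, NULL) < 0)
            perror("fanotify_mark");
        return 0;
    }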
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 923fe4a5f503..d90deaa08e78 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c | |||
@@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, | |||
340 | static int fsnotify_mark_destroy(void *ignored) | 340 | static int fsnotify_mark_destroy(void *ignored) |
341 | { | 341 | { |
342 | struct fsnotify_mark *mark, *next; | 342 | struct fsnotify_mark *mark, *next; |
343 | LIST_HEAD(private_destroy_list); | 343 | struct list_head private_destroy_list; |
344 | 344 | ||
345 | for (;;) { | 345 | for (;;) { |
346 | spin_lock(&destroy_lock); | 346 | spin_lock(&destroy_lock); |
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a27e3fecefaf..250ed5b20c8f 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c | |||
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) | |||
1748 | if (page) { | 1748 | if (page) { |
1749 | set_page_dirty(page); | 1749 | set_page_dirty(page); |
1750 | unlock_page(page); | 1750 | unlock_page(page); |
1751 | mark_page_accessed(page); | ||
1752 | page_cache_release(page); | 1751 | page_cache_release(page); |
1753 | } | 1752 | } |
1754 | ntfs_debug("Done."); | 1753 | ntfs_debug("Done."); |
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index ee4144ce5d7c..f82498c35e78 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c | |||
@@ -58,7 +58,7 @@ typedef enum { | |||
58 | /** | 58 | /** |
59 | * ntfs_compression_buffer - one buffer for the decompression engine | 59 | * ntfs_compression_buffer - one buffer for the decompression engine |
60 | */ | 60 | */ |
61 | static u8 *ntfs_compression_buffer = NULL; | 61 | static u8 *ntfs_compression_buffer; |
62 | 62 | ||
63 | /** | 63 | /** |
64 | * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer | 64 | * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index db9bd8a31725..86ddab916b66 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, | |||
2060 | } | 2060 | } |
2061 | do { | 2061 | do { |
2062 | unlock_page(pages[--do_pages]); | 2062 | unlock_page(pages[--do_pages]); |
2063 | mark_page_accessed(pages[do_pages]); | ||
2064 | page_cache_release(pages[do_pages]); | 2063 | page_cache_release(pages[do_pages]); |
2065 | } while (do_pages); | 2064 | } while (do_pages); |
2066 | if (unlikely(status)) | 2065 | if (unlikely(status)) |
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 9de2491f2926..6c3296e546c3 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
@@ -50,8 +50,8 @@ | |||
50 | static unsigned long ntfs_nr_compression_users; | 50 | static unsigned long ntfs_nr_compression_users; |
51 | 51 | ||
52 | /* A global default upcase table and a corresponding reference count. */ | 52 | /* A global default upcase table and a corresponding reference count. */ |
53 | static ntfschar *default_upcase = NULL; | 53 | static ntfschar *default_upcase; |
54 | static unsigned long ntfs_nr_upcase_users = 0; | 54 | static unsigned long ntfs_nr_upcase_users; |
55 | 55 | ||
56 | /* Error constants/strings used in inode.c::ntfs_show_options(). */ | 56 | /* Error constants/strings used in inode.c::ntfs_show_options(). */ |
57 | typedef enum { | 57 | typedef enum { |
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 79a89184cb5e..1927170a35ce 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c | |||
@@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = { | |||
56 | }; | 56 | }; |
57 | 57 | ||
58 | /* Storage for the sysctls header. */ | 58 | /* Storage for the sysctls header. */ |
59 | static struct ctl_table_header *sysctls_root_table = NULL; | 59 | static struct ctl_table_header *sysctls_root_table; |
60 | 60 | ||
61 | /** | 61 | /** |
62 | * ntfs_sysctl - add or remove the debug sysctl | 62 | * ntfs_sysctl - add or remove the debug sysctl |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b4deb5f750d9..9d8fcf2f3b94 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -6046,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) | |||
6046 | void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, | 6046 | void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, |
6047 | int cancel) | 6047 | int cancel) |
6048 | { | 6048 | { |
6049 | if (osb->osb_tl_inode) { | 6049 | if (osb->osb_tl_inode && |
6050 | atomic_read(&osb->osb_tl_disable) == 0) { | ||
6050 | /* We want to push off log flushes while truncates are | 6051 | /* We want to push off log flushes while truncates are |
6051 | * still running. */ | 6052 | * still running. */ |
6052 | if (cancel) | 6053 | if (cancel) |
@@ -6223,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) | |||
6223 | int status; | 6224 | int status; |
6224 | struct inode *tl_inode = osb->osb_tl_inode; | 6225 | struct inode *tl_inode = osb->osb_tl_inode; |
6225 | 6226 | ||
6227 | atomic_set(&osb->osb_tl_disable, 1); | ||
6228 | |||
6226 | if (tl_inode) { | 6229 | if (tl_inode) { |
6227 | cancel_delayed_work(&osb->osb_truncate_log_wq); | 6230 | cancel_delayed_work(&osb->osb_truncate_log_wq); |
6228 | flush_workqueue(ocfs2_wq); | 6231 | flush_workqueue(ocfs2_wq); |
@@ -6254,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) | |||
6254 | * until we're sure all is well. */ | 6257 | * until we're sure all is well. */ |
6255 | INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, | 6258 | INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, |
6256 | ocfs2_truncate_log_worker); | 6259 | ocfs2_truncate_log_worker); |
6260 | atomic_set(&osb->osb_tl_disable, 0); | ||
6257 | osb->osb_tl_bh = tl_bh; | 6261 | osb->osb_tl_bh = tl_bh; |
6258 | osb->osb_tl_inode = tl_inode; | 6262 | osb->osb_tl_inode = tl_inode; |
6259 | 6263 | ||
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c6b90e670389..a68e07a9bd46 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT; | |||
108 | static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; | 108 | static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; |
109 | 109 | ||
110 | /* XXX someday we'll need better accounting */ | 110 | /* XXX someday we'll need better accounting */ |
111 | static struct socket *o2net_listen_sock = NULL; | 111 | static struct socket *o2net_listen_sock; |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * listen work is only queued by the listening socket callbacks on the | 114 | * listen work is only queued by the listening socket callbacks on the |
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e0517762fcc0..a106b3f2b22a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) | |||
108 | struct dlm_recovery_ctxt | 108 | struct dlm_recovery_ctxt |
109 | { | 109 | { |
110 | struct list_head resources; | 110 | struct list_head resources; |
111 | struct list_head received; | ||
112 | struct list_head node_data; | 111 | struct list_head node_data; |
113 | u8 new_master; | 112 | u8 new_master; |
114 | u8 dead_node; | 113 | u8 dead_node; |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e33cd7a3c582..18f13c2e4a10 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) | |||
338 | 338 | ||
339 | #ifdef CONFIG_DEBUG_FS | 339 | #ifdef CONFIG_DEBUG_FS |
340 | 340 | ||
341 | static struct dentry *dlm_debugfs_root = NULL; | 341 | static struct dentry *dlm_debugfs_root; |
342 | 342 | ||
343 | #define DLM_DEBUGFS_DIR "o2dlm" | 343 | #define DLM_DEBUGFS_DIR "o2dlm" |
344 | #define DLM_DEBUGFS_DLM_STATE "dlm_state" | 344 | #define DLM_DEBUGFS_DLM_STATE "dlm_state" |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c973690dc0bc..39efc5057a36 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -959,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, | |||
959 | * domain. Set him in the map and clean up our | 959 | * domain. Set him in the map and clean up our |
960 | * leftover join state. */ | 960 | * leftover join state. */ |
961 | BUG_ON(dlm->joining_node != assert->node_idx); | 961 | BUG_ON(dlm->joining_node != assert->node_idx); |
962 | |||
963 | if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { | ||
964 | mlog(0, "dlm recovery is ongoing, disallow join\n"); | ||
965 | spin_unlock(&dlm->spinlock); | ||
966 | spin_unlock(&dlm_domain_lock); | ||
967 | return -EAGAIN; | ||
968 | } | ||
969 | |||
962 | set_bit(assert->node_idx, dlm->domain_map); | 970 | set_bit(assert->node_idx, dlm->domain_map); |
963 | clear_bit(assert->node_idx, dlm->exit_domain_map); | 971 | clear_bit(assert->node_idx, dlm->exit_domain_map); |
964 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 972 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); |
@@ -1517,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, | |||
1517 | unsigned int node) | 1525 | unsigned int node) |
1518 | { | 1526 | { |
1519 | int status; | 1527 | int status; |
1528 | int ret; | ||
1520 | struct dlm_assert_joined assert_msg; | 1529 | struct dlm_assert_joined assert_msg; |
1521 | 1530 | ||
1522 | mlog(0, "Sending join assert to node %u\n", node); | 1531 | mlog(0, "Sending join assert to node %u\n", node); |
@@ -1528,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, | |||
1528 | 1537 | ||
1529 | status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | 1538 | status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, |
1530 | &assert_msg, sizeof(assert_msg), node, | 1539 | &assert_msg, sizeof(assert_msg), node, |
1531 | NULL); | 1540 | &ret); |
1532 | if (status < 0) | 1541 | if (status < 0) |
1533 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " | 1542 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " |
1534 | "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | 1543 | "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, |
1535 | node); | 1544 | node); |
1545 | else | ||
1546 | status = ret; | ||
1536 | 1547 | ||
1537 | return status; | 1548 | return status; |
1538 | } | 1549 | } |
@@ -2023,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
2023 | INIT_LIST_HEAD(&dlm->list); | 2034 | INIT_LIST_HEAD(&dlm->list); |
2024 | INIT_LIST_HEAD(&dlm->dirty_list); | 2035 | INIT_LIST_HEAD(&dlm->dirty_list); |
2025 | INIT_LIST_HEAD(&dlm->reco.resources); | 2036 | INIT_LIST_HEAD(&dlm->reco.resources); |
2026 | INIT_LIST_HEAD(&dlm->reco.received); | ||
2027 | INIT_LIST_HEAD(&dlm->reco.node_data); | 2037 | INIT_LIST_HEAD(&dlm->reco.node_data); |
2028 | INIT_LIST_HEAD(&dlm->purge_list); | 2038 | INIT_LIST_HEAD(&dlm->purge_list); |
2029 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | 2039 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); |
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5d32f7511f74..66c2a491f68d 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c | |||
@@ -52,7 +52,7 @@ | |||
52 | #define MLOG_MASK_PREFIX ML_DLM | 52 | #define MLOG_MASK_PREFIX ML_DLM |
53 | #include "cluster/masklog.h" | 53 | #include "cluster/masklog.h" |
54 | 54 | ||
55 | static struct kmem_cache *dlm_lock_cache = NULL; | 55 | static struct kmem_cache *dlm_lock_cache; |
56 | 56 | ||
57 | static DEFINE_SPINLOCK(dlm_cookie_lock); | 57 | static DEFINE_SPINLOCK(dlm_cookie_lock); |
58 | static u64 dlm_next_cookie = 1; | 58 | static u64 dlm_next_cookie = 1; |
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ee1f88419cb0..3087a21d32f9 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | |||
82 | return 1; | 82 | return 1; |
83 | } | 83 | } |
84 | 84 | ||
85 | static struct kmem_cache *dlm_lockres_cache = NULL; | 85 | static struct kmem_cache *dlm_lockres_cache; |
86 | static struct kmem_cache *dlm_lockname_cache = NULL; | 86 | static struct kmem_cache *dlm_lockname_cache; |
87 | static struct kmem_cache *dlm_mle_cache = NULL; | 87 | static struct kmem_cache *dlm_mle_cache; |
88 | 88 | ||
89 | static void dlm_mle_release(struct kref *kref); | 89 | static void dlm_mle_release(struct kref *kref); |
90 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | 90 | static void dlm_init_mle(struct dlm_master_list_entry *mle, |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fe29f7978f81..5de019437ea5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -1986,7 +1986,15 @@ skip_lvb: | |||
1986 | } | 1986 | } |
1987 | if (!bad) { | 1987 | if (!bad) { |
1988 | dlm_lock_get(newlock); | 1988 | dlm_lock_get(newlock); |
1989 | list_add_tail(&newlock->list, queue); | 1989 | if (mres->flags & DLM_MRES_RECOVERY && |
1990 | ml->list == DLM_CONVERTING_LIST && | ||
1991 | newlock->ml.type > | ||
1992 | newlock->ml.convert_type) { | ||
1993 | /* newlock is doing downconvert, add it to the | ||
1994 | * head of converting list */ | ||
1995 | list_add(&newlock->list, queue); | ||
1996 | } else | ||
1997 | list_add_tail(&newlock->list, queue); | ||
1990 | mlog(0, "%s:%.*s: added lock for node %u, " | 1998 | mlog(0, "%s:%.*s: added lock for node %u, " |
1991 | "setting refmap bit\n", dlm->name, | 1999 | "setting refmap bit\n", dlm->name, |
1992 | res->lockname.len, res->lockname.name, ml->node); | 2000 | res->lockname.len, res->lockname.name, ml->node); |
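The recovery hunk above queues a lock that is downconverting at the head of the converting list rather than appending it at the tail. For reference, the two generic <linux/list.h> helpers involved differ only in insertion point; a tiny sketch with a hypothetical queue_lock() helper, not ocfs2 code:

	#include <linux/list.h>

	static void queue_lock(struct list_head *queue, struct list_head *entry,
			       bool downconverting)
	{
		if (downconverting)
			list_add(entry, queue);		/* insert at the head */
		else
			list_add_tail(entry, queue);	/* append at the tail */
	}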
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6bd690b5a061..52cfe99ae056 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
2544 | * refreshed, so we do it here. Of course, making sense of | 2544 | * refreshed, so we do it here. Of course, making sense of |
2545 | * everything is up to the caller :) */ | 2545 | * everything is up to the caller :) */ |
2546 | status = ocfs2_should_refresh_lock_res(lockres); | 2546 | status = ocfs2_should_refresh_lock_res(lockres); |
2547 | if (status < 0) { | ||
2548 | ocfs2_cluster_unlock(osb, lockres, level); | ||
2549 | mlog_errno(status); | ||
2550 | goto bail; | ||
2551 | } | ||
2552 | if (status) { | 2547 | if (status) { |
2553 | status = ocfs2_refresh_slot_info(osb); | 2548 | status = ocfs2_refresh_slot_info(osb); |
2554 | 2549 | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8970dcf74de5..8eb6e5732d3b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -828,7 +828,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
828 | /* | 828 | /* |
829 | * fs-writeback will release the dirty pages without page lock | 829 | * fs-writeback will release the dirty pages without page lock |
830 | * whose offset are over inode size, the release happens at | 830 | * whose offset are over inode size, the release happens at |
831 | * block_write_full_page_endio(). | 831 | * block_write_full_page(). |
832 | */ | 832 | */ |
833 | i_size_write(inode, abs_to); | 833 | i_size_write(inode, abs_to); |
834 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 834 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 490229f43731..6f66b3751ace 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -143,8 +143,8 @@ bail: | |||
143 | return status; | 143 | return status; |
144 | } | 144 | } |
145 | 145 | ||
146 | int ocfs2_info_handle_blocksize(struct inode *inode, | 146 | static int ocfs2_info_handle_blocksize(struct inode *inode, |
147 | struct ocfs2_info_request __user *req) | 147 | struct ocfs2_info_request __user *req) |
148 | { | 148 | { |
149 | int status = -EFAULT; | 149 | int status = -EFAULT; |
150 | struct ocfs2_info_blocksize oib; | 150 | struct ocfs2_info_blocksize oib; |
@@ -167,8 +167,8 @@ bail: | |||
167 | return status; | 167 | return status; |
168 | } | 168 | } |
169 | 169 | ||
170 | int ocfs2_info_handle_clustersize(struct inode *inode, | 170 | static int ocfs2_info_handle_clustersize(struct inode *inode, |
171 | struct ocfs2_info_request __user *req) | 171 | struct ocfs2_info_request __user *req) |
172 | { | 172 | { |
173 | int status = -EFAULT; | 173 | int status = -EFAULT; |
174 | struct ocfs2_info_clustersize oic; | 174 | struct ocfs2_info_clustersize oic; |
@@ -192,8 +192,8 @@ bail: | |||
192 | return status; | 192 | return status; |
193 | } | 193 | } |
194 | 194 | ||
195 | int ocfs2_info_handle_maxslots(struct inode *inode, | 195 | static int ocfs2_info_handle_maxslots(struct inode *inode, |
196 | struct ocfs2_info_request __user *req) | 196 | struct ocfs2_info_request __user *req) |
197 | { | 197 | { |
198 | int status = -EFAULT; | 198 | int status = -EFAULT; |
199 | struct ocfs2_info_maxslots oim; | 199 | struct ocfs2_info_maxslots oim; |
@@ -217,8 +217,8 @@ bail: | |||
217 | return status; | 217 | return status; |
218 | } | 218 | } |
219 | 219 | ||
220 | int ocfs2_info_handle_label(struct inode *inode, | 220 | static int ocfs2_info_handle_label(struct inode *inode, |
221 | struct ocfs2_info_request __user *req) | 221 | struct ocfs2_info_request __user *req) |
222 | { | 222 | { |
223 | int status = -EFAULT; | 223 | int status = -EFAULT; |
224 | struct ocfs2_info_label oil; | 224 | struct ocfs2_info_label oil; |
@@ -242,8 +242,8 @@ bail: | |||
242 | return status; | 242 | return status; |
243 | } | 243 | } |
244 | 244 | ||
245 | int ocfs2_info_handle_uuid(struct inode *inode, | 245 | static int ocfs2_info_handle_uuid(struct inode *inode, |
246 | struct ocfs2_info_request __user *req) | 246 | struct ocfs2_info_request __user *req) |
247 | { | 247 | { |
248 | int status = -EFAULT; | 248 | int status = -EFAULT; |
249 | struct ocfs2_info_uuid oiu; | 249 | struct ocfs2_info_uuid oiu; |
@@ -267,8 +267,8 @@ bail: | |||
267 | return status; | 267 | return status; |
268 | } | 268 | } |
269 | 269 | ||
270 | int ocfs2_info_handle_fs_features(struct inode *inode, | 270 | static int ocfs2_info_handle_fs_features(struct inode *inode, |
271 | struct ocfs2_info_request __user *req) | 271 | struct ocfs2_info_request __user *req) |
272 | { | 272 | { |
273 | int status = -EFAULT; | 273 | int status = -EFAULT; |
274 | struct ocfs2_info_fs_features oif; | 274 | struct ocfs2_info_fs_features oif; |
@@ -294,8 +294,8 @@ bail: | |||
294 | return status; | 294 | return status; |
295 | } | 295 | } |
296 | 296 | ||
297 | int ocfs2_info_handle_journal_size(struct inode *inode, | 297 | static int ocfs2_info_handle_journal_size(struct inode *inode, |
298 | struct ocfs2_info_request __user *req) | 298 | struct ocfs2_info_request __user *req) |
299 | { | 299 | { |
300 | int status = -EFAULT; | 300 | int status = -EFAULT; |
301 | struct ocfs2_info_journal_size oij; | 301 | struct ocfs2_info_journal_size oij; |
@@ -319,9 +319,10 @@ bail: | |||
319 | return status; | 319 | return status; |
320 | } | 320 | } |
321 | 321 | ||
322 | int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, | 322 | static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, |
323 | struct inode *inode_alloc, u64 blkno, | 323 | struct inode *inode_alloc, u64 blkno, |
324 | struct ocfs2_info_freeinode *fi, u32 slot) | 324 | struct ocfs2_info_freeinode *fi, |
325 | u32 slot) | ||
325 | { | 326 | { |
326 | int status = 0, unlock = 0; | 327 | int status = 0, unlock = 0; |
327 | 328 | ||
@@ -366,8 +367,8 @@ bail: | |||
366 | return status; | 367 | return status; |
367 | } | 368 | } |
368 | 369 | ||
369 | int ocfs2_info_handle_freeinode(struct inode *inode, | 370 | static int ocfs2_info_handle_freeinode(struct inode *inode, |
370 | struct ocfs2_info_request __user *req) | 371 | struct ocfs2_info_request __user *req) |
371 | { | 372 | { |
372 | u32 i; | 373 | u32 i; |
373 | u64 blkno = -1; | 374 | u64 blkno = -1; |
@@ -462,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, | |||
462 | stats->ffs_free_chunks_real++; | 463 | stats->ffs_free_chunks_real++; |
463 | } | 464 | } |
464 | 465 | ||
465 | void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, | 466 | static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, |
466 | unsigned int chunksize) | 467 | unsigned int chunksize) |
467 | { | 468 | { |
468 | o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); | 469 | o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); |
469 | o2ffg_update_stats(&(ffg->iff_ffs), chunksize); | 470 | o2ffg_update_stats(&(ffg->iff_ffs), chunksize); |
470 | } | 471 | } |
471 | 472 | ||
472 | int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, | 473 | static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, |
473 | struct inode *gb_inode, | 474 | struct inode *gb_inode, |
474 | struct ocfs2_dinode *gb_dinode, | 475 | struct ocfs2_dinode *gb_dinode, |
475 | struct ocfs2_chain_rec *rec, | 476 | struct ocfs2_chain_rec *rec, |
476 | struct ocfs2_info_freefrag *ffg, | 477 | struct ocfs2_info_freefrag *ffg, |
477 | u32 chunks_in_group) | 478 | u32 chunks_in_group) |
478 | { | 479 | { |
479 | int status = 0, used; | 480 | int status = 0, used; |
480 | u64 blkno; | 481 | u64 blkno; |
@@ -572,9 +573,9 @@ bail: | |||
572 | return status; | 573 | return status; |
573 | } | 574 | } |
574 | 575 | ||
575 | int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, | 576 | static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, |
576 | struct inode *gb_inode, u64 blkno, | 577 | struct inode *gb_inode, u64 blkno, |
577 | struct ocfs2_info_freefrag *ffg) | 578 | struct ocfs2_info_freefrag *ffg) |
578 | { | 579 | { |
579 | u32 chunks_in_group; | 580 | u32 chunks_in_group; |
580 | int status = 0, unlock = 0, i; | 581 | int status = 0, unlock = 0, i; |
@@ -652,8 +653,8 @@ bail: | |||
652 | return status; | 653 | return status; |
653 | } | 654 | } |
654 | 655 | ||
655 | int ocfs2_info_handle_freefrag(struct inode *inode, | 656 | static int ocfs2_info_handle_freefrag(struct inode *inode, |
656 | struct ocfs2_info_request __user *req) | 657 | struct ocfs2_info_request __user *req) |
657 | { | 658 | { |
658 | u64 blkno = -1; | 659 | u64 blkno = -1; |
659 | char namebuf[40]; | 660 | char namebuf[40]; |
@@ -723,8 +724,8 @@ out_err: | |||
723 | return status; | 724 | return status; |
724 | } | 725 | } |
725 | 726 | ||
726 | int ocfs2_info_handle_unknown(struct inode *inode, | 727 | static int ocfs2_info_handle_unknown(struct inode *inode, |
727 | struct ocfs2_info_request __user *req) | 728 | struct ocfs2_info_request __user *req) |
728 | { | 729 | { |
729 | int status = -EFAULT; | 730 | int status = -EFAULT; |
730 | struct ocfs2_info_request oir; | 731 | struct ocfs2_info_request oir; |
@@ -752,8 +753,8 @@ bail: | |||
752 | * - distinguish different requests. | 753 | * - distinguish different requests. |
753 | * - validate size of different requests. | 754 | * - validate size of different requests. |
754 | */ | 755 | */ |
755 | int ocfs2_info_handle_request(struct inode *inode, | 756 | static int ocfs2_info_handle_request(struct inode *inode, |
756 | struct ocfs2_info_request __user *req) | 757 | struct ocfs2_info_request __user *req) |
757 | { | 758 | { |
758 | int status = -EFAULT; | 759 | int status = -EFAULT; |
759 | struct ocfs2_info_request oir; | 760 | struct ocfs2_info_request oir; |
@@ -811,8 +812,8 @@ bail: | |||
811 | return status; | 812 | return status; |
812 | } | 813 | } |
813 | 814 | ||
814 | int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, | 815 | static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, |
815 | u64 *req_addr, int compat_flag) | 816 | u64 *req_addr, int compat_flag) |
816 | { | 817 | { |
817 | int status = -EFAULT; | 818 | int status = -EFAULT; |
818 | u64 __user *bp = NULL; | 819 | u64 __user *bp = NULL; |
@@ -849,8 +850,8 @@ bail: | |||
849 | * a better backward&forward compatibility, since a small piece of | 850 | * a better backward&forward compatibility, since a small piece of |
850 | * request will be less likely to be broken if disk layout get changed. | 851 | * request will be less likely to be broken if disk layout get changed. |
851 | */ | 852 | */ |
852 | int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, | 853 | static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, |
853 | int compat_flag) | 854 | int compat_flag) |
854 | { | 855 | { |
855 | int i, status = 0; | 856 | int i, status = 0; |
856 | u64 req_addr; | 857 | u64 req_addr; |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 03ea9314fecd..4b0c68849b36 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
31 | #include <linux/time.h> | 31 | #include <linux/time.h> |
32 | #include <linux/random.h> | 32 | #include <linux/random.h> |
33 | #include <linux/delay.h> | ||
33 | 34 | ||
34 | #include <cluster/masklog.h> | 35 | #include <cluster/masklog.h> |
35 | 36 | ||
@@ -2185,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg) | |||
2185 | || kthread_should_stop()); | 2186 | || kthread_should_stop()); |
2186 | 2187 | ||
2187 | status = ocfs2_commit_cache(osb); | 2188 | status = ocfs2_commit_cache(osb); |
2188 | if (status < 0) | 2189 | if (status < 0) { |
2189 | mlog_errno(status); | 2190 | static unsigned long abort_warn_time; |
2191 | |||
2192 | /* Warn about this once per minute */ | ||
2193 | if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) | ||
2194 | mlog(ML_ERROR, "status = %d, journal is " | ||
2195 | "already aborted.\n", status); | ||
2196 | /* | ||
2197 | * After ocfs2_commit_cache() fails, j_num_trans has a | ||
2198 | * non-zero value. Sleep here to avoid a busy-wait | ||
2199 | * loop. | ||
2200 | */ | ||
2201 | msleep_interruptible(1000); | ||
2202 | } | ||
2190 | 2203 | ||
2191 | if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ | 2204 | if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ |
2192 | mlog(ML_KTHREAD, | 2205 | mlog(ML_KTHREAD, |
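The journal.c hunk above turns a tight error loop into a rate-limited warning plus a one-second back-off once ocfs2_commit_cache() starts failing. The same pattern in isolation, assuming printk_timed_ratelimit() keeps its usual behaviour of returning true at most once per interval (sketch only, hypothetical helper name):

	#include <linux/delay.h>
	#include <linux/jiffies.h>
	#include <linux/printk.h>

	static void warn_and_back_off(int status)
	{
		static unsigned long last_warned;

		/* Emit at most one warning per interval (60*HZ, as in the hunk above). */
		if (printk_timed_ratelimit(&last_warned, 60 * HZ))
			pr_err("commit failed: %d\n", status);

		/* Sleep so the caller's retry loop does not busy-wait. */
		msleep_interruptible(1000);
	}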
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8d64a97a9d5e..bbec539230fd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -422,6 +422,7 @@ struct ocfs2_super | |||
422 | struct inode *osb_tl_inode; | 422 | struct inode *osb_tl_inode; |
423 | struct buffer_head *osb_tl_bh; | 423 | struct buffer_head *osb_tl_bh; |
424 | struct delayed_work osb_truncate_log_wq; | 424 | struct delayed_work osb_truncate_log_wq; |
425 | atomic_t osb_tl_disable; | ||
425 | /* | 426 | /* |
426 | * How many clusters in our truncate log. | 427 | * How many clusters in our truncate log. |
427 | * It must be protected by osb_tl_inode->i_mutex. | 428 | * It must be protected by osb_tl_inode->i_mutex. |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6ba4bcbc4796..714e53b9cc66 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -1408,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size) | |||
1408 | { | 1408 | { |
1409 | struct ocfs2_refcount_rec *l = a, *r = b, tmp; | 1409 | struct ocfs2_refcount_rec *l = a, *r = b, tmp; |
1410 | 1410 | ||
1411 | tmp = *(struct ocfs2_refcount_rec *)l; | 1411 | tmp = *l; |
1412 | *(struct ocfs2_refcount_rec *)l = | 1412 | *l = *r; |
1413 | *(struct ocfs2_refcount_rec *)r; | 1413 | *r = tmp; |
1414 | *(struct ocfs2_refcount_rec *)r = tmp; | ||
1415 | } | 1414 | } |
1416 | 1415 | ||
1417 | /* | 1416 | /* |
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 822ebc10f281..d5da6f624142 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c | |||
@@ -53,8 +53,6 @@ | |||
53 | */ | 53 | */ |
54 | static u16 ocfs2_calc_new_backup_super(struct inode *inode, | 54 | static u16 ocfs2_calc_new_backup_super(struct inode *inode, |
55 | struct ocfs2_group_desc *gd, | 55 | struct ocfs2_group_desc *gd, |
56 | int new_clusters, | ||
57 | u32 first_new_cluster, | ||
58 | u16 cl_cpg, | 56 | u16 cl_cpg, |
59 | int set) | 57 | int set) |
60 | { | 58 | { |
@@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, | |||
127 | OCFS2_FEATURE_COMPAT_BACKUP_SB)) { | 125 | OCFS2_FEATURE_COMPAT_BACKUP_SB)) { |
128 | backups = ocfs2_calc_new_backup_super(bm_inode, | 126 | backups = ocfs2_calc_new_backup_super(bm_inode, |
129 | group, | 127 | group, |
130 | new_clusters, | ||
131 | first_new_cluster, | ||
132 | cl_cpg, 1); | 128 | cl_cpg, 1); |
133 | le16_add_cpu(&group->bg_free_bits_count, -1 * backups); | 129 | le16_add_cpu(&group->bg_free_bits_count, -1 * backups); |
134 | } | 130 | } |
@@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, | |||
157 | 153 | ||
158 | spin_lock(&OCFS2_I(bm_inode)->ip_lock); | 154 | spin_lock(&OCFS2_I(bm_inode)->ip_lock); |
159 | OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 155 | OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
160 | le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); | 156 | le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); |
161 | spin_unlock(&OCFS2_I(bm_inode)->ip_lock); | 157 | spin_unlock(&OCFS2_I(bm_inode)->ip_lock); |
162 | i_size_write(bm_inode, le64_to_cpu(fe->i_size)); | 158 | i_size_write(bm_inode, le64_to_cpu(fe->i_size)); |
163 | 159 | ||
@@ -167,8 +163,6 @@ out_rollback: | |||
167 | if (ret < 0) { | 163 | if (ret < 0) { |
168 | ocfs2_calc_new_backup_super(bm_inode, | 164 | ocfs2_calc_new_backup_super(bm_inode, |
169 | group, | 165 | group, |
170 | new_clusters, | ||
171 | first_new_cluster, | ||
172 | cl_cpg, 0); | 166 | cl_cpg, 0); |
173 | le16_add_cpu(&group->bg_free_bits_count, backups); | 167 | le16_add_cpu(&group->bg_free_bits_count, backups); |
174 | le16_add_cpu(&group->bg_bits, -1 * num_bits); | 168 | le16_add_cpu(&group->bg_bits, -1 * num_bits); |
@@ -569,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) | |||
569 | 563 | ||
570 | spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); | 564 | spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); |
571 | OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 565 | OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
572 | le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); | 566 | le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); |
573 | spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); | 567 | spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); |
574 | i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); | 568 | i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); |
575 | 569 | ||
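Both resize.c hunks above cast the 32-bit cluster count to u64 before shifting by s_clustersize_bits; without the cast the shift is evaluated in 32 bits and the high bits are already gone by the time le64_add_cpu() sees the value. A minimal illustration of the difference, using kernel u32/u64 types (generic C, not ocfs2 code):

	u32 clusters = 0x00200000;		/* 2^21 clusters of 4 KiB = 8 GiB */
	unsigned int shift = 12;

	u64 wrong = clusters << shift;		/* 32-bit shift: wraps to 0 here */
	u64 right = (u64)clusters << shift;	/* widened first: 0x200000000 */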
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 83f1a665ae97..5d965e83bd43 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c | |||
@@ -709,7 +709,7 @@ static struct ctl_table ocfs2_root_table[] = { | |||
709 | { } | 709 | { } |
710 | }; | 710 | }; |
711 | 711 | ||
712 | static struct ctl_table_header *ocfs2_table_header = NULL; | 712 | static struct ctl_table_header *ocfs2_table_header; |
713 | 713 | ||
714 | 714 | ||
715 | /* | 715 | /* |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a7cdd56f4c79..c7a89cea5c5d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -75,7 +75,7 @@ | |||
75 | 75 | ||
76 | #include "buffer_head_io.h" | 76 | #include "buffer_head_io.h" |
77 | 77 | ||
78 | static struct kmem_cache *ocfs2_inode_cachep = NULL; | 78 | static struct kmem_cache *ocfs2_inode_cachep; |
79 | struct kmem_cache *ocfs2_dquot_cachep; | 79 | struct kmem_cache *ocfs2_dquot_cachep; |
80 | struct kmem_cache *ocfs2_qf_chunk_cachep; | 80 | struct kmem_cache *ocfs2_qf_chunk_cachep; |
81 | 81 | ||
@@ -85,7 +85,7 @@ struct kmem_cache *ocfs2_qf_chunk_cachep; | |||
85 | * workqueue and schedule on our own. */ | 85 | * workqueue and schedule on our own. */ |
86 | struct workqueue_struct *ocfs2_wq = NULL; | 86 | struct workqueue_struct *ocfs2_wq = NULL; |
87 | 87 | ||
88 | static struct dentry *ocfs2_debugfs_root = NULL; | 88 | static struct dentry *ocfs2_debugfs_root; |
89 | 89 | ||
90 | MODULE_AUTHOR("Oracle"); | 90 | MODULE_AUTHOR("Oracle"); |
91 | MODULE_LICENSE("GPL"); | 91 | MODULE_LICENSE("GPL"); |
@@ -2292,8 +2292,8 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2292 | goto bail; | 2292 | goto bail; |
2293 | } | 2293 | } |
2294 | 2294 | ||
2295 | strncpy(osb->vol_label, di->id2.i_super.s_label, 63); | 2295 | strlcpy(osb->vol_label, di->id2.i_super.s_label, |
2296 | osb->vol_label[63] = '\0'; | 2296 | OCFS2_MAX_VOL_LABEL_LEN); |
2297 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); | 2297 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); |
2298 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); | 2298 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); |
2299 | osb->first_cluster_group_blkno = | 2299 | osb->first_cluster_group_blkno = |
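The super.c hunk above replaces the strncpy()-plus-manual-terminator pair with a single strlcpy() bounded by the label buffer size; the kernel's strlcpy() always NUL-terminates (truncating if necessary), so the explicit vol_label[63] = '\0' becomes redundant. The general shape of the two idioms (sketch, arbitrary 64-byte buffer):

	char label[64];
	const char *src = "a possibly very long volume label";

	/* old idiom: strncpy() does not guarantee termination, so do it by hand */
	strncpy(label, src, sizeof(label) - 1);
	label[sizeof(label) - 1] = '\0';

	/* new idiom: one call, always NUL-terminated, truncates overlong sources */
	strlcpy(label, src, sizeof(label));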
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33d346f..82e17b076ce7 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c | |||
@@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item { | |||
67 | sector_t c_block; | 67 | sector_t c_block; |
68 | }; | 68 | }; |
69 | 69 | ||
70 | static struct kmem_cache *ocfs2_uptodate_cachep = NULL; | 70 | static struct kmem_cache *ocfs2_uptodate_cachep; |
71 | 71 | ||
72 | u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) | 72 | u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) |
73 | { | 73 | { |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 442177b1119a..2101ce46a5d2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -737,9 +737,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
737 | ptent = pte_file_clear_soft_dirty(ptent); | 737 | ptent = pte_file_clear_soft_dirty(ptent); |
738 | } | 738 | } |
739 | 739 | ||
740 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
741 | vma->vm_flags &= ~VM_SOFTDIRTY; | ||
742 | |||
743 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 740 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
744 | #endif | 741 | #endif |
745 | } | 742 | } |
@@ -807,8 +804,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
807 | 804 | ||
808 | if (type == CLEAR_REFS_SOFT_DIRTY) { | 805 | if (type == CLEAR_REFS_SOFT_DIRTY) { |
809 | soft_dirty_cleared = true; | 806 | soft_dirty_cleared = true; |
810 | pr_warn_once("The pagemap bits 55-60 has changed their meaning! " | 807 | pr_warn_once("The pagemap bits 55-60 has changed their meaning!" |
811 | "See the linux/Documentation/vm/pagemap.txt for details.\n"); | 808 | " See the linux/Documentation/vm/pagemap.txt for " |
809 | "details.\n"); | ||
812 | } | 810 | } |
813 | 811 | ||
814 | task = get_proc_task(file_inode(file)); | 812 | task = get_proc_task(file_inode(file)); |
@@ -839,11 +837,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
839 | * | 837 | * |
840 | * Writing 3 to /proc/pid/clear_refs only affects file | 838 | * Writing 3 to /proc/pid/clear_refs only affects file |
841 | * mapped pages. | 839 | * mapped pages. |
840 | * | ||
841 | * Writing 4 to /proc/pid/clear_refs affects all pages. | ||
842 | */ | 842 | */ |
843 | if (type == CLEAR_REFS_ANON && vma->vm_file) | 843 | if (type == CLEAR_REFS_ANON && vma->vm_file) |
844 | continue; | 844 | continue; |
845 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) | 845 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) |
846 | continue; | 846 | continue; |
847 | if (type == CLEAR_REFS_SOFT_DIRTY) { | ||
848 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
849 | vma->vm_flags &= ~VM_SOFTDIRTY; | ||
850 | } | ||
847 | walk_page_range(vma->vm_start, vma->vm_end, | 851 | walk_page_range(vma->vm_start, vma->vm_end, |
848 | &clear_refs_walk); | 852 | &clear_refs_walk); |
849 | } | 853 | } |
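The task_mmu.c hunks above move the VM_SOFTDIRTY clearing from the per-PTE helper into the per-VMA loop and document that writing 4 to /proc/pid/clear_refs affects all pages. A small userspace sketch of that interface, assuming the soft-dirty semantics described in Documentation/vm/pagemap.txt (not part of the patch):

	#include <stdio.h>
	#include <sys/types.h>

	/* Write "4" to /proc/<pid>/clear_refs to clear the soft-dirty state for
	 * every mapping of the task, as the added comment describes. */
	static int clear_soft_dirty(pid_t pid)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fputs("4", f);
		fclose(f);
		return 0;
	}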
diff --git a/fs/readdir.c b/fs/readdir.c index 5b53d995cae6..33fd92208cb7 100644 --- a/fs/readdir.c +++ b/fs/readdir.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/stat.h> | 13 | #include <linux/stat.h> |
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/fsnotify.h> | ||
16 | #include <linux/dirent.h> | 17 | #include <linux/dirent.h> |
17 | #include <linux/security.h> | 18 | #include <linux/security.h> |
18 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
@@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) | |||
40 | ctx->pos = file->f_pos; | 41 | ctx->pos = file->f_pos; |
41 | res = file->f_op->iterate(file, ctx); | 42 | res = file->f_op->iterate(file, ctx); |
42 | file->f_pos = ctx->pos; | 43 | file->f_pos = ctx->pos; |
44 | fsnotify_access(file); | ||
43 | file_accessed(file); | 45 | file_accessed(file); |
44 | } | 46 | } |
45 | mutex_unlock(&inode->i_mutex); | 47 | mutex_unlock(&inode->i_mutex); |
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9e1bb79f7e6f..887d6d270080 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h | |||
@@ -25,7 +25,7 @@ | |||
25 | 25 | ||
26 | #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) | 26 | #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) |
27 | 27 | ||
28 | #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) | 28 | #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) |
29 | 29 | ||
30 | /* block.c */ | 30 | /* block.c */ |
31 | extern int squashfs_read_data(struct super_block *, u64, int, u64 *, | 31 | extern int squashfs_read_data(struct super_block *, u64, int, u64 *, |
diff --git a/fs/super.c b/fs/super.c index 48377f7463c0..d20d5b11dedf 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, | |||
112 | 112 | ||
113 | sb = container_of(shrink, struct super_block, s_shrink); | 113 | sb = container_of(shrink, struct super_block, s_shrink); |
114 | 114 | ||
115 | if (!grab_super_passive(sb)) | 115 | /* |
116 | return 0; | 116 | * Don't call grab_super_passive as it is a potential |
117 | 117 | * scalability bottleneck. The counts could get updated | |
118 | * between super_cache_count and super_cache_scan anyway. | ||
119 | * Call to super_cache_count with shrinker_rwsem held | ||
120 | * ensures the safety of call to list_lru_count_node() and | ||
121 | * s_op->nr_cached_objects(). | ||
122 | */ | ||
118 | if (sb->s_op && sb->s_op->nr_cached_objects) | 123 | if (sb->s_op && sb->s_op->nr_cached_objects) |
119 | total_objects = sb->s_op->nr_cached_objects(sb, | 124 | total_objects = sb->s_op->nr_cached_objects(sb, |
120 | sc->nid); | 125 | sc->nid); |
@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink, | |||
125 | sc->nid); | 130 | sc->nid); |
126 | 131 | ||
127 | total_objects = vfs_pressure_ratio(total_objects); | 132 | total_objects = vfs_pressure_ratio(total_objects); |
128 | drop_super(sb); | ||
129 | return total_objects; | 133 | return total_objects; |
130 | } | 134 | } |
131 | 135 | ||
@@ -276,10 +280,8 @@ void deactivate_locked_super(struct super_block *s) | |||
276 | struct file_system_type *fs = s->s_type; | 280 | struct file_system_type *fs = s->s_type; |
277 | if (atomic_dec_and_test(&s->s_active)) { | 281 | if (atomic_dec_and_test(&s->s_active)) { |
278 | cleancache_invalidate_fs(s); | 282 | cleancache_invalidate_fs(s); |
279 | fs->kill_sb(s); | ||
280 | |||
281 | /* caches are now gone, we can safely kill the shrinker now */ | ||
282 | unregister_shrinker(&s->s_shrink); | 283 | unregister_shrinker(&s->s_shrink); |
284 | fs->kill_sb(s); | ||
283 | 285 | ||
284 | put_filesystem(fs); | 286 | put_filesystem(fs); |
285 | put_super(s); | 287 | put_super(s); |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a8015a7a55bb..53b2acc38213 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) | |||
233 | # define pte_accessible(mm, pte) ((void)(pte), 1) | 233 | # define pte_accessible(mm, pte) ((void)(pte), 1) |
234 | #endif | 234 | #endif |
235 | 235 | ||
236 | #ifndef pte_present_nonuma | ||
237 | #define pte_present_nonuma(pte) pte_present(pte) | ||
238 | #endif | ||
239 | |||
236 | #ifndef flush_tlb_fix_spurious_fault | 240 | #ifndef flush_tlb_fix_spurious_fault |
237 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) | 241 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) |
238 | #endif | 242 | #endif |
@@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd) | |||
670 | static inline int pte_numa(pte_t pte) | 674 | static inline int pte_numa(pte_t pte) |
671 | { | 675 | { |
672 | return (pte_flags(pte) & | 676 | return (pte_flags(pte) & |
673 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | 677 | (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; |
674 | } | 678 | } |
675 | #endif | 679 | #endif |
676 | 680 | ||
@@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte) | |||
678 | static inline int pmd_numa(pmd_t pmd) | 682 | static inline int pmd_numa(pmd_t pmd) |
679 | { | 683 | { |
680 | return (pmd_flags(pmd) & | 684 | return (pmd_flags(pmd) & |
681 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | 685 | (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; |
682 | } | 686 | } |
683 | #endif | 687 | #endif |
684 | 688 | ||
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c6f836afa1b..3cd426e971db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -1588,6 +1588,7 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g) | |||
1588 | struct block_device_operations { | 1588 | struct block_device_operations { |
1589 | int (*open) (struct block_device *, fmode_t); | 1589 | int (*open) (struct block_device *, fmode_t); |
1590 | void (*release) (struct gendisk *, fmode_t); | 1590 | void (*release) (struct gendisk *, fmode_t); |
1591 | int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); | ||
1591 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1592 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
1592 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1593 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
1593 | int (*direct_access) (struct block_device *, sector_t, | 1594 | int (*direct_access) (struct block_device *, sector_t, |
@@ -1606,7 +1607,13 @@ struct block_device_operations { | |||
1606 | 1607 | ||
1607 | extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, | 1608 | extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, |
1608 | unsigned long); | 1609 | unsigned long); |
1610 | extern int bdev_read_page(struct block_device *, sector_t, struct page *); | ||
1611 | extern int bdev_write_page(struct block_device *, sector_t, struct page *, | ||
1612 | struct writeback_control *); | ||
1609 | #else /* CONFIG_BLOCK */ | 1613 | #else /* CONFIG_BLOCK */ |
1614 | |||
1615 | struct block_device; | ||
1616 | |||
1610 | /* | 1617 | /* |
1611 | * stubs for when the block layer is configured out | 1618 | * stubs for when the block layer is configured out |
1612 | */ | 1619 | */ |
@@ -1642,6 +1649,12 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) | |||
1642 | return false; | 1649 | return false; |
1643 | } | 1650 | } |
1644 | 1651 | ||
1652 | static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | ||
1653 | sector_t *error_sector) | ||
1654 | { | ||
1655 | return 0; | ||
1656 | } | ||
1657 | |||
1645 | #endif /* CONFIG_BLOCK */ | 1658 | #endif /* CONFIG_BLOCK */ |
1646 | 1659 | ||
1647 | #endif | 1660 | #endif |
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index db51fe4fe317..4e2bd4c95b66 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -58,9 +58,9 @@ extern void free_bootmem_late(unsigned long physaddr, unsigned long size); | |||
58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, | 58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, |
59 | * the architecture-specific code should honor this). | 59 | * the architecture-specific code should honor this). |
60 | * | 60 | * |
61 | * If flags is 0, then the return value is always 0 (success). If | 61 | * If flags is BOOTMEM_DEFAULT, then the return value is always 0 (success). |
62 | * flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the | 62 | * If flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the memory |
63 | * memory already was reserved. | 63 | * already was reserved. |
64 | */ | 64 | */ |
65 | #define BOOTMEM_DEFAULT 0 | 65 | #define BOOTMEM_DEFAULT 0 |
66 | #define BOOTMEM_EXCLUSIVE (1<<0) | 66 | #define BOOTMEM_EXCLUSIVE (1<<0) |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7cbf837a279c..324329ceea1e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -207,8 +207,6 @@ void block_invalidatepage(struct page *page, unsigned int offset, | |||
207 | unsigned int length); | 207 | unsigned int length); |
208 | int block_write_full_page(struct page *page, get_block_t *get_block, | 208 | int block_write_full_page(struct page *page, get_block_t *get_block, |
209 | struct writeback_control *wbc); | 209 | struct writeback_control *wbc); |
210 | int block_write_full_page_endio(struct page *page, get_block_t *get_block, | ||
211 | struct writeback_control *wbc, bh_end_io_t *handler); | ||
212 | int block_read_full_page(struct page*, get_block_t*); | 210 | int block_read_full_page(struct page*, get_block_t*); |
213 | int block_is_partially_uptodate(struct page *page, unsigned long from, | 211 | int block_is_partially_uptodate(struct page *page, unsigned long from, |
214 | unsigned long count); | 212 | unsigned long count); |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7e1c76e3cd68..01e3132820da 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | |||
22 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 22 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 24 | int order, gfp_t gfp_mask, nodemask_t *mask, |
25 | bool sync, bool *contended); | 25 | enum migrate_mode mode, bool *contended); |
26 | extern void compact_pgdat(pg_data_t *pgdat, int order); | 26 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
27 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 27 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
28 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 28 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) | |||
91 | #else | 91 | #else |
92 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | 92 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, |
93 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 93 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
94 | bool sync, bool *contended) | 94 | enum migrate_mode mode, bool *contended) |
95 | { | 95 | { |
96 | return COMPACT_CONTINUE; | 96 | return COMPACT_CONTINUE; |
97 | } | 97 | } |
diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ee7239ea1583..64fdfe1cfcf0 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h | |||
@@ -323,9 +323,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); | |||
323 | #endif | 323 | #endif |
324 | #ifndef __compiletime_error | 324 | #ifndef __compiletime_error |
325 | # define __compiletime_error(message) | 325 | # define __compiletime_error(message) |
326 | # define __compiletime_error_fallback(condition) \ | 326 | /* |
327 | * Sparse complains of variable sized arrays due to the temporary variable in | ||
328 | * __compiletime_assert. Unfortunately we can't just expand it out to make | ||
329 | * sparse see a constant array size without breaking compiletime_assert on old | ||
330 | * versions of GCC (e.g. 4.2.4), so hide the array from sparse altogether. | ||
331 | */ | ||
332 | # ifndef __CHECKER__ | ||
333 | # define __compiletime_error_fallback(condition) \ | ||
327 | do { ((void)sizeof(char[1 - 2 * condition])); } while (0) | 334 | do { ((void)sizeof(char[1 - 2 * condition])); } while (0) |
328 | #else | 335 | # endif |
336 | #endif | ||
337 | #ifndef __compiletime_error_fallback | ||
329 | # define __compiletime_error_fallback(condition) do { } while (0) | 338 | # define __compiletime_error_fallback(condition) do { } while (0) |
330 | #endif | 339 | #endif |
331 | 340 | ||
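The compiler.h hunk above hides __compiletime_error_fallback() from sparse because its negative-array-size trick confuses the checker. The trick itself: take sizeof of a char array whose length is 1 when the condition is false and negative when it is true, so a failing condition becomes a compile error. A standalone sketch of the same idea (a hypothetical macro, not the kernel's BUILD_BUG_ON):

	/* Expands to a valid char[1] when cond is false, and to an invalid
	 * negative-size array (a compile error) when cond is true. */
	#define ASSERT_AT_COMPILE_TIME(cond) \
		((void)sizeof(char[1 - 2 * !!(cond)]))

	/* Example use: reject a structure that grew past 64 bytes. */
	/* ASSERT_AT_COMPILE_TIME(sizeof(struct my_struct) > 64); */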
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index b19d3dc2e651..ade2390ffe92 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -12,10 +12,31 @@ | |||
12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/jump_label.h> | ||
15 | 16 | ||
16 | #ifdef CONFIG_CPUSETS | 17 | #ifdef CONFIG_CPUSETS |
17 | 18 | ||
18 | extern int number_of_cpusets; /* How many cpusets are defined in system? */ | 19 | extern struct static_key cpusets_enabled_key; |
20 | static inline bool cpusets_enabled(void) | ||
21 | { | ||
22 | return static_key_false(&cpusets_enabled_key); | ||
23 | } | ||
24 | |||
25 | static inline int nr_cpusets(void) | ||
26 | { | ||
27 | /* jump label reference count + the top-level cpuset */ | ||
28 | return static_key_count(&cpusets_enabled_key) + 1; | ||
29 | } | ||
30 | |||
31 | static inline void cpuset_inc(void) | ||
32 | { | ||
33 | static_key_slow_inc(&cpusets_enabled_key); | ||
34 | } | ||
35 | |||
36 | static inline void cpuset_dec(void) | ||
37 | { | ||
38 | static_key_slow_dec(&cpusets_enabled_key); | ||
39 | } | ||
19 | 40 | ||
20 | extern int cpuset_init(void); | 41 | extern int cpuset_init(void); |
21 | extern void cpuset_init_smp(void); | 42 | extern void cpuset_init_smp(void); |
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); | |||
32 | 53 | ||
33 | static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 54 | static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) |
34 | { | 55 | { |
35 | return number_of_cpusets <= 1 || | 56 | return nr_cpusets() <= 1 || |
36 | __cpuset_node_allowed_softwall(node, gfp_mask); | 57 | __cpuset_node_allowed_softwall(node, gfp_mask); |
37 | } | 58 | } |
38 | 59 | ||
39 | static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | 60 | static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) |
40 | { | 61 | { |
41 | return number_of_cpusets <= 1 || | 62 | return nr_cpusets() <= 1 || |
42 | __cpuset_node_allowed_hardwall(node, gfp_mask); | 63 | __cpuset_node_allowed_hardwall(node, gfp_mask); |
43 | } | 64 | } |
44 | 65 | ||
@@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) | |||
124 | 145 | ||
125 | #else /* !CONFIG_CPUSETS */ | 146 | #else /* !CONFIG_CPUSETS */ |
126 | 147 | ||
148 | static inline bool cpusets_enabled(void) { return false; } | ||
149 | |||
127 | static inline int cpuset_init(void) { return 0; } | 150 | static inline int cpuset_init(void) { return 0; } |
128 | static inline void cpuset_init_smp(void) {} | 151 | static inline void cpuset_init_smp(void) {} |
129 | 152 | ||
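The cpuset.h hunk above replaces the number_of_cpusets counter with a jump-label based static key, so the hot-path cpusets_enabled() check costs a patched no-op while no extra cpusets exist. The general pattern, using the same <linux/jump_label.h> API the hunk relies on (sketch with a hypothetical my_feature key):

	#include <linux/jump_label.h>

	static struct static_key my_feature_key = STATIC_KEY_INIT_FALSE;

	static inline bool my_feature_enabled(void)
	{
		/* compiles to a no-op branch until the key is switched on */
		return static_key_false(&my_feature_key);
	}

	/* slow path: flip the key when the feature is created or destroyed */
	static void my_feature_create(void)  { static_key_slow_inc(&my_feature_key); }
	static void my_feature_destroy(void) { static_key_slow_dec(&my_feature_key); }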
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3b28f937d959..772eab5d524a 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h | |||
@@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma) | |||
88 | void dma_contiguous_reserve(phys_addr_t addr_limit); | 88 | void dma_contiguous_reserve(phys_addr_t addr_limit); |
89 | 89 | ||
90 | int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, | 90 | int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, |
91 | phys_addr_t limit, struct cma **res_cma); | 91 | phys_addr_t limit, struct cma **res_cma, |
92 | bool fixed); | ||
92 | 93 | ||
93 | /** | 94 | /** |
94 | * dma_declare_contiguous() - reserve area for contiguous memory handling | 95 | * dma_declare_contiguous() - reserve area for contiguous memory handling |
@@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, | |||
108 | { | 109 | { |
109 | struct cma *cma; | 110 | struct cma *cma; |
110 | int ret; | 111 | int ret; |
111 | ret = dma_contiguous_reserve_area(size, base, limit, &cma); | 112 | ret = dma_contiguous_reserve_area(size, base, limit, &cma, true); |
112 | if (ret == 0) | 113 | if (ret == 0) |
113 | dev_set_cma_area(dev, cma); | 114 | dev_set_cma_area(dev, cma); |
114 | 115 | ||
@@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { } | |||
136 | static inline void dma_contiguous_reserve(phys_addr_t limit) { } | 137 | static inline void dma_contiguous_reserve(phys_addr_t limit) { } |
137 | 138 | ||
138 | static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, | 139 | static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, |
139 | phys_addr_t limit, struct cma **res_cma) { | 140 | phys_addr_t limit, struct cma **res_cma, |
141 | bool fixed) | ||
142 | { | ||
140 | return -ENOSYS; | 143 | return -ENOSYS; |
141 | } | 144 | } |
142 | 145 | ||
diff --git a/include/linux/fs.h b/include/linux/fs.h index 878031227c57..c3f46e499dd0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2590,6 +2590,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, | |||
2590 | extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, | 2590 | extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, |
2591 | const void __user *from, size_t count); | 2591 | const void __user *from, size_t count); |
2592 | 2592 | ||
2593 | extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); | ||
2593 | extern int generic_file_fsync(struct file *, loff_t, loff_t, int); | 2594 | extern int generic_file_fsync(struct file *, loff_t, loff_t, int); |
2594 | 2595 | ||
2595 | extern int generic_check_addressable(unsigned, u64); | 2596 | extern int generic_check_addressable(unsigned, u64); |
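The fs.h hunk above exports __generic_file_fsync(), the variant of generic_file_fsync() that writes back data and the inode but leaves the block-device cache flush to the caller; combined with the blkdev_issue_flush() stub added for !CONFIG_BLOCK earlier, a filesystem can decide on its own when to issue the flush. A hedged sketch of such a ->fsync method (hypothetical myfs, not taken from the patch):

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	static int myfs_fsync(struct file *file, loff_t start, loff_t end,
			      int datasync)
	{
		struct inode *inode = file->f_mapping->host;
		int err;

		/* data and inode writeback, but no device cache flush yet */
		err = __generic_file_fsync(file, start, end, datasync);
		if (err)
			return err;

		/* the filesystem issues (or deliberately skips) the flush itself */
		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
	}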
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc7d01a..6eb1fb37de9a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -6,7 +6,6 @@ | |||
6 | #include <linux/stddef.h> | 6 | #include <linux/stddef.h> |
7 | #include <linux/linkage.h> | 7 | #include <linux/linkage.h> |
8 | #include <linux/topology.h> | 8 | #include <linux/topology.h> |
9 | #include <linux/mmdebug.h> | ||
10 | 9 | ||
11 | struct vm_area_struct; | 10 | struct vm_area_struct; |
12 | 11 | ||
@@ -31,7 +30,6 @@ struct vm_area_struct; | |||
31 | #define ___GFP_HARDWALL 0x20000u | 30 | #define ___GFP_HARDWALL 0x20000u |
32 | #define ___GFP_THISNODE 0x40000u | 31 | #define ___GFP_THISNODE 0x40000u |
33 | #define ___GFP_RECLAIMABLE 0x80000u | 32 | #define ___GFP_RECLAIMABLE 0x80000u |
34 | #define ___GFP_KMEMCG 0x100000u | ||
35 | #define ___GFP_NOTRACK 0x200000u | 33 | #define ___GFP_NOTRACK 0x200000u |
36 | #define ___GFP_NO_KSWAPD 0x400000u | 34 | #define ___GFP_NO_KSWAPD 0x400000u |
37 | #define ___GFP_OTHER_NODE 0x800000u | 35 | #define ___GFP_OTHER_NODE 0x800000u |
@@ -91,7 +89,6 @@ struct vm_area_struct; | |||
91 | 89 | ||
92 | #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) | 90 | #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) |
93 | #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ | 91 | #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ |
94 | #define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ | ||
95 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ | 92 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ |
96 | 93 | ||
97 | /* | 94 | /* |
@@ -353,6 +350,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, | |||
353 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ | 350 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ |
354 | alloc_pages_vma(gfp_mask, 0, vma, addr, node) | 351 | alloc_pages_vma(gfp_mask, 0, vma, addr, node) |
355 | 352 | ||
353 | extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); | ||
354 | extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, | ||
355 | unsigned int order); | ||
356 | |||
356 | extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); | 357 | extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); |
357 | extern unsigned long get_zeroed_page(gfp_t gfp_mask); | 358 | extern unsigned long get_zeroed_page(gfp_t gfp_mask); |
358 | 359 | ||
@@ -369,11 +370,11 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); | |||
369 | 370 | ||
370 | extern void __free_pages(struct page *page, unsigned int order); | 371 | extern void __free_pages(struct page *page, unsigned int order); |
371 | extern void free_pages(unsigned long addr, unsigned int order); | 372 | extern void free_pages(unsigned long addr, unsigned int order); |
372 | extern void free_hot_cold_page(struct page *page, int cold); | 373 | extern void free_hot_cold_page(struct page *page, bool cold); |
373 | extern void free_hot_cold_page_list(struct list_head *list, int cold); | 374 | extern void free_hot_cold_page_list(struct list_head *list, bool cold); |
374 | 375 | ||
375 | extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); | 376 | extern void __free_kmem_pages(struct page *page, unsigned int order); |
376 | extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); | 377 | extern void free_kmem_pages(unsigned long addr, unsigned int order); |
377 | 378 | ||
378 | #define __free_page(page) __free_pages((page), 0) | 379 | #define __free_page(page) __free_pages((page), 0) |
379 | #define free_page(addr) free_pages((addr), 0) | 380 | #define free_page(addr) free_pages((addr), 0) |
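The gfp.h hunk above drops the __GFP_KMEMCG flag and introduces dedicated entry points, alloc_kmem_pages()/free_kmem_pages(), so kmem accounting can run symmetrically at allocation and free time instead of hanging off a flag the free path never sees. Below is a compilable userspace sketch of that "accounted allocator as its own API" idea; the charge counter and every name in it are invented for illustration and are not kernel code.

/*
 * Toy accounted allocator: charging happens in the alloc wrapper and is
 * undone in the matching free wrapper, mirroring the shape of the new
 * alloc_kmem_pages()/free_kmem_pages() pair (names here are invented).
 */
#include <stdio.h>
#include <stdlib.h>

static long charged_bytes;			/* stand-in for memcg accounting */

static void *alloc_accounted(size_t size)
{
	void *p = malloc(size);

	if (p)
		charged_bytes += size;		/* charge on the alloc path */
	return p;
}

static void free_accounted(void *p, size_t size)
{
	if (!p)
		return;
	charged_bytes -= size;			/* uncharge on the matching free */
	free(p);
}

int main(void)
{
	void *p = alloc_accounted(4096);

	printf("charged: %ld\n", charged_bytes);	/* 4096 */
	free_accounted(p, 4096);
	printf("charged: %ld\n", charged_bytes);	/* 0 */
	return 0;
}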
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b65166de1d9d..255cd5cc0754 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h) | |||
343 | return h->order + PAGE_SHIFT; | 343 | return h->order + PAGE_SHIFT; |
344 | } | 344 | } |
345 | 345 | ||
346 | static inline bool hstate_is_gigantic(struct hstate *h) | ||
347 | { | ||
348 | return huge_page_order(h) >= MAX_ORDER; | ||
349 | } | ||
350 | |||
346 | static inline unsigned int pages_per_huge_page(struct hstate *h) | 351 | static inline unsigned int pages_per_huge_page(struct hstate *h) |
347 | { | 352 | { |
348 | return 1 << h->order; | 353 | return 1 << h->order; |
@@ -392,15 +397,13 @@ static inline pgoff_t basepage_index(struct page *page) | |||
392 | 397 | ||
393 | extern void dissolve_free_huge_pages(unsigned long start_pfn, | 398 | extern void dissolve_free_huge_pages(unsigned long start_pfn, |
394 | unsigned long end_pfn); | 399 | unsigned long end_pfn); |
395 | int pmd_huge_support(void); | 400 | static inline int hugepage_migration_supported(struct hstate *h) |
396 | /* | ||
397 | * Currently hugepage migration is enabled only for pmd-based hugepage. | ||
398 | * This function will be updated when hugepage migration is more widely | ||
399 | * supported. | ||
400 | */ | ||
401 | static inline int hugepage_migration_support(struct hstate *h) | ||
402 | { | 401 | { |
403 | return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); | 402 | #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION |
403 | return huge_page_shift(h) == PMD_SHIFT; | ||
404 | #else | ||
405 | return 0; | ||
406 | #endif | ||
404 | } | 407 | } |
405 | 408 | ||
406 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | 409 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, |
@@ -450,8 +453,7 @@ static inline pgoff_t basepage_index(struct page *page) | |||
450 | return page->index; | 453 | return page->index; |
451 | } | 454 | } |
452 | #define dissolve_free_huge_pages(s, e) do {} while (0) | 455 | #define dissolve_free_huge_pages(s, e) do {} while (0) |
453 | #define pmd_huge_support() 0 | 456 | #define hugepage_migration_supported(h) 0 |
454 | #define hugepage_migration_support(h) 0 | ||
455 | 457 | ||
456 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | 458 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, |
457 | struct mm_struct *mm, pte_t *pte) | 459 | struct mm_struct *mm, pte_t *pte) |
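The new hstate_is_gigantic() helper above classifies a huge page size as "gigantic" when its order is at least MAX_ORDER, i.e. when it is too large to come out of the buddy allocator. A standalone sketch of that order arithmetic follows; MAX_ORDER = 11 and PAGE_SHIFT = 12 are assumed as the common x86-64 defaults, and huge_page_order()/is_gigantic() here are illustrative stand-ins, not the kernel functions.

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed 4 KiB base pages */
#define MAX_ORDER	11		/* assumed buddy allocator limit */

static unsigned int huge_page_order(unsigned long size)
{
	unsigned int order = 0;

	while ((1UL << (order + PAGE_SHIFT)) < size)
		order++;
	return order;
}

static int is_gigantic(unsigned long size)
{
	return huge_page_order(size) >= MAX_ORDER;
}

int main(void)
{
	unsigned long sizes[] = { 2UL << 20, 1UL << 30 };	/* 2 MiB, 1 GiB */

	for (int i = 0; i < 2; i++)
		printf("%lu KiB: order %u, gigantic: %d\n",
		       sizes[i] >> 10, huge_page_order(sizes[i]),
		       is_gigantic(sizes[i]));
	return 0;
}

Under these assumptions a 2 MiB page has order 9 (not gigantic) while a 1 GiB page has order 18, which exceeds MAX_ORDER and so needs a path other than the buddy allocator.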
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 5c1dfb2a9e73..784304b222b3 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h | |||
@@ -69,6 +69,10 @@ struct static_key { | |||
69 | 69 | ||
70 | # include <asm/jump_label.h> | 70 | # include <asm/jump_label.h> |
71 | # define HAVE_JUMP_LABEL | 71 | # define HAVE_JUMP_LABEL |
72 | #else | ||
73 | struct static_key { | ||
74 | atomic_t enabled; | ||
75 | }; | ||
72 | #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ | 76 | #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ |
73 | 77 | ||
74 | enum jump_label_type { | 78 | enum jump_label_type { |
@@ -79,6 +83,12 @@ enum jump_label_type { | |||
79 | struct module; | 83 | struct module; |
80 | 84 | ||
81 | #include <linux/atomic.h> | 85 | #include <linux/atomic.h> |
86 | |||
87 | static inline int static_key_count(struct static_key *key) | ||
88 | { | ||
89 | return atomic_read(&key->enabled); | ||
90 | } | ||
91 | |||
82 | #ifdef HAVE_JUMP_LABEL | 92 | #ifdef HAVE_JUMP_LABEL |
83 | 93 | ||
84 | #define JUMP_LABEL_TYPE_FALSE_BRANCH 0UL | 94 | #define JUMP_LABEL_TYPE_FALSE_BRANCH 0UL |
@@ -134,10 +144,6 @@ extern void jump_label_apply_nops(struct module *mod); | |||
134 | 144 | ||
135 | #else /* !HAVE_JUMP_LABEL */ | 145 | #else /* !HAVE_JUMP_LABEL */ |
136 | 146 | ||
137 | struct static_key { | ||
138 | atomic_t enabled; | ||
139 | }; | ||
140 | |||
141 | static __always_inline void jump_label_init(void) | 147 | static __always_inline void jump_label_init(void) |
142 | { | 148 | { |
143 | static_key_initialized = true; | 149 | static_key_initialized = true; |
@@ -145,14 +151,14 @@ static __always_inline void jump_label_init(void) | |||
145 | 151 | ||
146 | static __always_inline bool static_key_false(struct static_key *key) | 152 | static __always_inline bool static_key_false(struct static_key *key) |
147 | { | 153 | { |
148 | if (unlikely(atomic_read(&key->enabled) > 0)) | 154 | if (unlikely(static_key_count(key) > 0)) |
149 | return true; | 155 | return true; |
150 | return false; | 156 | return false; |
151 | } | 157 | } |
152 | 158 | ||
153 | static __always_inline bool static_key_true(struct static_key *key) | 159 | static __always_inline bool static_key_true(struct static_key *key) |
154 | { | 160 | { |
155 | if (likely(atomic_read(&key->enabled) > 0)) | 161 | if (likely(static_key_count(key) > 0)) |
156 | return true; | 162 | return true; |
157 | return false; | 163 | return false; |
158 | } | 164 | } |
@@ -194,7 +200,7 @@ static inline int jump_label_apply_nops(struct module *mod) | |||
194 | 200 | ||
195 | static inline bool static_key_enabled(struct static_key *key) | 201 | static inline bool static_key_enabled(struct static_key *key) |
196 | { | 202 | { |
197 | return (atomic_read(&key->enabled) > 0); | 203 | return static_key_count(key) > 0; |
198 | } | 204 | } |
199 | 205 | ||
200 | #endif /* _LINUX_JUMP_LABEL_H */ | 206 | #endif /* _LINUX_JUMP_LABEL_H */ |
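The jump_label.h hunk above moves the fallback struct static_key definition ahead of the HAVE_JUMP_LABEL split and adds static_key_count(), so all three readers (static_key_false/true/enabled) share one atomic_read() wrapper. A minimal userspace sketch of that fallback path follows; C11 atomics stand in for the kernel's atomic_t, and static_key_slow_inc() here is a simplified stand-in for the real enable path.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct static_key {
	atomic_int enabled;
};

static inline int static_key_count(struct static_key *key)
{
	return atomic_load(&key->enabled);	/* the new common helper */
}

static inline bool static_key_false(struct static_key *key)
{
	return static_key_count(key) > 0;	/* branch expected to be off */
}

static void static_key_slow_inc(struct static_key *key)
{
	atomic_fetch_add(&key->enabled, 1);	/* simplified enable */
}

int main(void)
{
	struct static_key key;

	atomic_init(&key.enabled, 0);
	printf("before: %d\n", static_key_false(&key));	/* 0 */
	static_key_slow_inc(&key);
	printf("after:  %d\n", static_key_false(&key));	/* 1 */
	return 0;
}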
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 73dc382e72d8..b660e05b63d4 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -272,6 +272,8 @@ static inline bool memblock_bottom_up(void) { return false; } | |||
272 | #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) | 272 | #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) |
273 | #define MEMBLOCK_ALLOC_ACCESSIBLE 0 | 273 | #define MEMBLOCK_ALLOC_ACCESSIBLE 0 |
274 | 274 | ||
275 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | ||
276 | phys_addr_t start, phys_addr_t end); | ||
275 | phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, | 277 | phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, |
276 | phys_addr_t max_addr); | 278 | phys_addr_t max_addr); |
277 | phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, | 279 | phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b569b8be5c5a..eb65d29516ca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -492,13 +492,9 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); | |||
492 | 492 | ||
493 | int memcg_cache_id(struct mem_cgroup *memcg); | 493 | int memcg_cache_id(struct mem_cgroup *memcg); |
494 | 494 | ||
495 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
496 | struct kmem_cache *root_cache); | ||
497 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 495 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
498 | struct kmem_cache *root_cache); | 496 | struct kmem_cache *root_cache); |
499 | void memcg_free_cache_params(struct kmem_cache *s); | 497 | void memcg_free_cache_params(struct kmem_cache *s); |
500 | void memcg_register_cache(struct kmem_cache *s); | ||
501 | void memcg_unregister_cache(struct kmem_cache *s); | ||
502 | 498 | ||
503 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups); | 499 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups); |
504 | void memcg_update_array_size(int num_groups); | 500 | void memcg_update_array_size(int num_groups); |
@@ -506,8 +502,10 @@ void memcg_update_array_size(int num_groups); | |||
506 | struct kmem_cache * | 502 | struct kmem_cache * |
507 | __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); | 503 | __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); |
508 | 504 | ||
509 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep); | 505 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); |
510 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); | 506 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); |
507 | |||
508 | int __memcg_cleanup_cache_params(struct kmem_cache *s); | ||
511 | 509 | ||
512 | /** | 510 | /** |
513 | * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. | 511 | * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. |
@@ -534,7 +532,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) | |||
534 | * res_counter_charge_nofail, but we hope those allocations are rare, | 532 | * res_counter_charge_nofail, but we hope those allocations are rare, |
535 | * and won't be worth the trouble. | 533 | * and won't be worth the trouble. |
536 | */ | 534 | */ |
537 | if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) | 535 | if (gfp & __GFP_NOFAIL) |
538 | return true; | 536 | return true; |
539 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) | 537 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) |
540 | return true; | 538 | return true; |
@@ -583,17 +581,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) | |||
583 | * @cachep: the original global kmem cache | 581 | * @cachep: the original global kmem cache |
584 | * @gfp: allocation flags. | 582 | * @gfp: allocation flags. |
585 | * | 583 | * |
586 | * This function assumes that the task allocating, which determines the memcg | 584 | * All memory allocated from a per-memcg cache is charged to the owner memcg. |
587 | * in the page allocator, belongs to the same cgroup throughout the whole | ||
588 | * process. Misacounting can happen if the task calls memcg_kmem_get_cache() | ||
589 | * while belonging to a cgroup, and later on changes. This is considered | ||
590 | * acceptable, and should only happen upon task migration. | ||
591 | * | ||
592 | * Before the cache is created by the memcg core, there is also a possible | ||
593 | * imbalance: the task belongs to a memcg, but the cache being allocated from | ||
594 | * is the global cache, since the child cache is not yet guaranteed to be | ||
595 | * ready. This case is also fine, since in this case the GFP_KMEMCG will not be | ||
596 | * passed and the page allocator will not attempt any cgroup accounting. | ||
597 | */ | 585 | */ |
598 | static __always_inline struct kmem_cache * | 586 | static __always_inline struct kmem_cache * |
599 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | 587 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) |
@@ -648,14 +636,6 @@ static inline void memcg_free_cache_params(struct kmem_cache *s) | |||
648 | { | 636 | { |
649 | } | 637 | } |
650 | 638 | ||
651 | static inline void memcg_register_cache(struct kmem_cache *s) | ||
652 | { | ||
653 | } | ||
654 | |||
655 | static inline void memcg_unregister_cache(struct kmem_cache *s) | ||
656 | { | ||
657 | } | ||
658 | |||
659 | static inline struct kmem_cache * | 639 | static inline struct kmem_cache * |
660 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | 640 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) |
661 | { | 641 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4ca3d951fe91..010d125bffbf 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -187,14 +187,8 @@ extern void put_page_bootmem(struct page *page); | |||
187 | extern void get_page_bootmem(unsigned long ingo, struct page *page, | 187 | extern void get_page_bootmem(unsigned long ingo, struct page *page, |
188 | unsigned long type); | 188 | unsigned long type); |
189 | 189 | ||
190 | /* | 190 | void get_online_mems(void); |
191 | * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug | 191 | void put_online_mems(void); |
192 | * notifier will be called under this. 2) offline/online/add/remove memory | ||
193 | * will not run simultaneously. | ||
194 | */ | ||
195 | |||
196 | void lock_memory_hotplug(void); | ||
197 | void unlock_memory_hotplug(void); | ||
198 | 192 | ||
199 | #else /* ! CONFIG_MEMORY_HOTPLUG */ | 193 | #else /* ! CONFIG_MEMORY_HOTPLUG */ |
200 | /* | 194 | /* |
@@ -232,8 +226,8 @@ static inline int try_online_node(int nid) | |||
232 | return 0; | 226 | return 0; |
233 | } | 227 | } |
234 | 228 | ||
235 | static inline void lock_memory_hotplug(void) {} | 229 | static inline void get_online_mems(void) {} |
236 | static inline void unlock_memory_hotplug(void) {} | 230 | static inline void put_online_mems(void) {} |
237 | 231 | ||
238 | #endif /* ! CONFIG_MEMORY_HOTPLUG */ | 232 | #endif /* ! CONFIG_MEMORY_HOTPLUG */ |
239 | 233 | ||
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c1b968da0ca..f230a978e6ba 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -175,6 +175,12 @@ static inline int vma_migratable(struct vm_area_struct *vma) | |||
175 | { | 175 | { |
176 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 176 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
177 | return 0; | 177 | return 0; |
178 | |||
179 | #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION | ||
180 | if (vma->vm_flags & VM_HUGETLB) | ||
181 | return 0; | ||
182 | #endif | ||
183 | |||
178 | /* | 184 | /* |
179 | * Migration allocates pages in the highest zone. If we cannot | 185 | * Migration allocates pages in the highest zone. If we cannot |
180 | * do so then migration (at least from node to node) is not | 186 | * do so then migration (at least from node to node) is not |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 84a31ad0b791..a2901c414664 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -5,7 +5,9 @@ | |||
5 | #include <linux/mempolicy.h> | 5 | #include <linux/mempolicy.h> |
6 | #include <linux/migrate_mode.h> | 6 | #include <linux/migrate_mode.h> |
7 | 7 | ||
8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); | 8 | typedef struct page *new_page_t(struct page *page, unsigned long private, |
9 | int **reason); | ||
10 | typedef void free_page_t(struct page *page, unsigned long private); | ||
9 | 11 | ||
10 | /* | 12 | /* |
11 | * Return values from addresss_space_operations.migratepage(): | 13 | * Return values from addresss_space_operations.migratepage(): |
@@ -38,7 +40,7 @@ enum migrate_reason { | |||
38 | extern void putback_movable_pages(struct list_head *l); | 40 | extern void putback_movable_pages(struct list_head *l); |
39 | extern int migrate_page(struct address_space *, | 41 | extern int migrate_page(struct address_space *, |
40 | struct page *, struct page *, enum migrate_mode); | 42 | struct page *, struct page *, enum migrate_mode); |
41 | extern int migrate_pages(struct list_head *l, new_page_t x, | 43 | extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, |
42 | unsigned long private, enum migrate_mode mode, int reason); | 44 | unsigned long private, enum migrate_mode mode, int reason); |
43 | 45 | ||
44 | extern int migrate_prep(void); | 46 | extern int migrate_prep(void); |
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping, | |||
56 | #else | 58 | #else |
57 | 59 | ||
58 | static inline void putback_movable_pages(struct list_head *l) {} | 60 | static inline void putback_movable_pages(struct list_head *l) {} |
59 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 61 | static inline int migrate_pages(struct list_head *l, new_page_t new, |
60 | unsigned long private, enum migrate_mode mode, int reason) | 62 | free_page_t free, unsigned long private, enum migrate_mode mode, |
63 | int reason) | ||
61 | { return -ENOSYS; } | 64 | { return -ENOSYS; } |
62 | 65 | ||
63 | static inline int migrate_prep(void) { return -ENOSYS; } | 66 | static inline int migrate_prep(void) { return -ENOSYS; } |
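The migrate.h change above gives migrate_pages() a matched pair of callbacks: new_page_t allocates a target page and the new free_page_t hands back a target that ended up unused. The toy below shows the same callback-pair shape in plain userspace C; the buffer functions, the "odd items fail" rule, and all names are invented for the demo.

#include <stdio.h>
#include <stdlib.h>

typedef void *new_buf_t(unsigned long private);
typedef void  free_buf_t(void *buf, unsigned long private);

static void *get_new_buf(unsigned long private)
{
	return malloc(private);			/* allocate a target */
}

static void put_buf(void *buf, unsigned long private)
{
	(void)private;
	free(buf);				/* return an unused target */
}

/*
 * "Migrate" n items; odd-numbered items are pretended to fail, and their
 * freshly allocated targets go back through the free callback -- exactly
 * the case the new free_page_t argument exists to cover.
 */
static int migrate_items(int n, new_buf_t *get_new, free_buf_t *put,
			 unsigned long private)
{
	int failed = 0;

	for (int i = 0; i < n; i++) {
		void *target = get_new(private);

		if (!target)
			return -1;
		if (i & 1) {
			put(target, private);	/* migration failed: undo */
			failed++;
			continue;
		}
		/* success path: drop the target here only because this
		 * toy has no real consumer for the "migrated" data. */
		free(target);
	}
	return failed;
}

int main(void)
{
	printf("failed migrations: %d\n",
	       migrate_items(4, get_new_buf, put_buf, 64));	/* 2 */
	return 0;
}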
diff --git a/include/linux/mm.h b/include/linux/mm.h index d6777060449f..368600628d14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -407,20 +407,25 @@ static inline void compound_unlock_irqrestore(struct page *page, | |||
407 | #endif | 407 | #endif |
408 | } | 408 | } |
409 | 409 | ||
410 | static inline struct page *compound_head_by_tail(struct page *tail) | ||
411 | { | ||
412 | struct page *head = tail->first_page; | ||
413 | |||
414 | /* | ||
415 | * page->first_page may be a dangling pointer to an old | ||
416 | * compound page, so recheck that it is still a tail | ||
417 | * page before returning. | ||
418 | */ | ||
419 | smp_rmb(); | ||
420 | if (likely(PageTail(tail))) | ||
421 | return head; | ||
422 | return tail; | ||
423 | } | ||
424 | |||
410 | static inline struct page *compound_head(struct page *page) | 425 | static inline struct page *compound_head(struct page *page) |
411 | { | 426 | { |
412 | if (unlikely(PageTail(page))) { | 427 | if (unlikely(PageTail(page))) |
413 | struct page *head = page->first_page; | 428 | return compound_head_by_tail(page); |
414 | |||
415 | /* | ||
416 | * page->first_page may be a dangling pointer to an old | ||
417 | * compound page, so recheck that it is still a tail | ||
418 | * page before returning. | ||
419 | */ | ||
420 | smp_rmb(); | ||
421 | if (likely(PageTail(page))) | ||
422 | return head; | ||
423 | } | ||
424 | return page; | 429 | return page; |
425 | } | 430 | } |
426 | 431 | ||
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..de1627232af0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -406,7 +406,7 @@ struct mm_struct { | |||
406 | spinlock_t ioctx_lock; | 406 | spinlock_t ioctx_lock; |
407 | struct kioctx_table __rcu *ioctx_table; | 407 | struct kioctx_table __rcu *ioctx_table; |
408 | #endif | 408 | #endif |
409 | #ifdef CONFIG_MM_OWNER | 409 | #ifdef CONFIG_MEMCG |
410 | /* | 410 | /* |
411 | * "owner" points to a task that is regarded as the canonical | 411 | * "owner" points to a task that is regarded as the canonical |
412 | * user/owner of this mm. All of the following must be true in | 412 | * user/owner of this mm. All of the following must be true in |
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2d57efa64cc1..edd82a105220 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef LINUX_MM_DEBUG_H | 1 | #ifndef LINUX_MM_DEBUG_H |
2 | #define LINUX_MM_DEBUG_H 1 | 2 | #define LINUX_MM_DEBUG_H 1 |
3 | 3 | ||
4 | #include <linux/stringify.h> | ||
5 | |||
4 | struct page; | 6 | struct page; |
5 | 7 | ||
6 | extern void dump_page(struct page *page, const char *reason); | 8 | extern void dump_page(struct page *page, const char *reason); |
@@ -9,11 +11,20 @@ extern void dump_page_badflags(struct page *page, const char *reason, | |||
9 | 11 | ||
10 | #ifdef CONFIG_DEBUG_VM | 12 | #ifdef CONFIG_DEBUG_VM |
11 | #define VM_BUG_ON(cond) BUG_ON(cond) | 13 | #define VM_BUG_ON(cond) BUG_ON(cond) |
12 | #define VM_BUG_ON_PAGE(cond, page) \ | 14 | #define VM_BUG_ON_PAGE(cond, page) \ |
13 | do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0) | 15 | do { \ |
16 | if (unlikely(cond)) { \ | ||
17 | dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\ | ||
18 | BUG(); \ | ||
19 | } \ | ||
20 | } while (0) | ||
21 | #define VM_WARN_ON(cond) WARN_ON(cond) | ||
22 | #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) | ||
14 | #else | 23 | #else |
15 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) | 24 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) |
16 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) | 25 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) |
26 | #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) | ||
27 | #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) | ||
17 | #endif | 28 | #endif |
18 | 29 | ||
19 | #ifdef CONFIG_DEBUG_VIRTUAL | 30 | #ifdef CONFIG_DEBUG_VIRTUAL |
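The VM_BUG_ON_PAGE() rework above embeds the failing condition in the dump_page() reason string via __stringify(), so the report names the exact check that fired. The standalone sketch below shows that two-level stringification trick; my_stringify() and MY_CHECK() are illustrative stand-ins, not the kernel macros.

#include <stdio.h>
#include <stdlib.h>

#define __my_stringify_1(x)	#x
#define my_stringify(x)		__my_stringify_1(x)	/* expand, then stringify */

#define MY_CHECK(cond)							\
	do {								\
		if (cond) {						\
			fprintf(stderr,					\
				"MY_CHECK(" my_stringify(cond) ") failed\n"); \
			abort();					\
		}							\
	} while (0)

int main(void)
{
	int refcount = 0;

	MY_CHECK(refcount < 0);		/* passes silently */
	MY_CHECK(refcount == 0);	/* prints the condition text, then aborts */
	return 0;
}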
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fac5509c18f0..6cbd1b6c3d20 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -75,9 +75,18 @@ enum { | |||
75 | 75 | ||
76 | extern int page_group_by_mobility_disabled; | 76 | extern int page_group_by_mobility_disabled; |
77 | 77 | ||
78 | static inline int get_pageblock_migratetype(struct page *page) | 78 | #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) |
79 | #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) | ||
80 | |||
81 | #define get_pageblock_migratetype(page) \ | ||
82 | get_pfnblock_flags_mask(page, page_to_pfn(page), \ | ||
83 | PB_migrate_end, MIGRATETYPE_MASK) | ||
84 | |||
85 | static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) | ||
79 | { | 86 | { |
80 | return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); | 87 | BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); |
88 | return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, | ||
89 | MIGRATETYPE_MASK); | ||
81 | } | 90 | } |
82 | 91 | ||
83 | struct free_area { | 92 | struct free_area { |
@@ -360,9 +369,10 @@ struct zone { | |||
360 | /* Set to true when the PG_migrate_skip bits should be cleared */ | 369 | /* Set to true when the PG_migrate_skip bits should be cleared */ |
361 | bool compact_blockskip_flush; | 370 | bool compact_blockskip_flush; |
362 | 371 | ||
363 | /* pfns where compaction scanners should start */ | 372 | /* pfn where compaction free scanner should start */ |
364 | unsigned long compact_cached_free_pfn; | 373 | unsigned long compact_cached_free_pfn; |
365 | unsigned long compact_cached_migrate_pfn; | 374 | /* pfn where async and sync compaction migration scanner should start */ |
375 | unsigned long compact_cached_migrate_pfn[2]; | ||
366 | #endif | 376 | #endif |
367 | #ifdef CONFIG_MEMORY_HOTPLUG | 377 | #ifdef CONFIG_MEMORY_HOTPLUG |
368 | /* see spanned/present_pages for more description */ | 378 | /* see spanned/present_pages for more description */ |
@@ -481,9 +491,8 @@ struct zone { | |||
481 | * give them a chance of being in the same cacheline. | 491 | * give them a chance of being in the same cacheline. |
482 | * | 492 | * |
483 | * Write access to present_pages at runtime should be protected by | 493 | * Write access to present_pages at runtime should be protected by |
484 | * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't | 494 | * mem_hotplug_begin/end(). Any reader who can't tolerant drift of |
485 | * tolerant drift of present_pages should hold memory hotplug lock to | 495 | * present_pages should get_online_mems() to get a stable value. |
486 | * get a stable value. | ||
487 | * | 496 | * |
488 | * Read access to managed_pages should be safe because it's unsigned | 497 | * Read access to managed_pages should be safe because it's unsigned |
489 | * long. Write access to zone->managed_pages and totalram_pages are | 498 | * long. Write access to zone->managed_pages and totalram_pages are |
@@ -763,10 +772,10 @@ typedef struct pglist_data { | |||
763 | unsigned long node_spanned_pages; /* total size of physical page | 772 | unsigned long node_spanned_pages; /* total size of physical page |
764 | range, including holes */ | 773 | range, including holes */ |
765 | int node_id; | 774 | int node_id; |
766 | nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */ | ||
767 | wait_queue_head_t kswapd_wait; | 775 | wait_queue_head_t kswapd_wait; |
768 | wait_queue_head_t pfmemalloc_wait; | 776 | wait_queue_head_t pfmemalloc_wait; |
769 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 777 | struct task_struct *kswapd; /* Protected by |
778 | mem_hotplug_begin/end() */ | ||
770 | int kswapd_max_order; | 779 | int kswapd_max_order; |
771 | enum zone_type classzone_idx; | 780 | enum zone_type classzone_idx; |
772 | #ifdef CONFIG_NUMA_BALANCING | 781 | #ifdef CONFIG_NUMA_BALANCING |
@@ -808,10 +817,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) | |||
808 | extern struct mutex zonelists_mutex; | 817 | extern struct mutex zonelists_mutex; |
809 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); | 818 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); |
810 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); | 819 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); |
811 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 820 | bool zone_watermark_ok(struct zone *z, unsigned int order, |
812 | int classzone_idx, int alloc_flags); | 821 | unsigned long mark, int classzone_idx, int alloc_flags); |
813 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 822 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
814 | int classzone_idx, int alloc_flags); | 823 | unsigned long mark, int classzone_idx, int alloc_flags); |
815 | enum memmap_context { | 824 | enum memmap_context { |
816 | MEMMAP_EARLY, | 825 | MEMMAP_EARLY, |
817 | MEMMAP_HOTPLUG, | 826 | MEMMAP_HOTPLUG, |
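The get_pageblock_migratetype() rework above reads the three migratetype bits straight out of the pageblock bitmap word using a precomputed mask, MIGRATETYPE_MASK = (1UL << 3) - 1 = 0x7. The standalone sketch below shows that mask-and-shift extraction and the matching store; the word layout and helper names are invented for the demo, and the real set_pfnblock_flags_mask() additionally has to update the shared bitmap word atomically.

#include <stdio.h>

#define PB_MIGRATE_BITS		3
#define MIGRATETYPE_MASK	((1UL << PB_MIGRATE_BITS) - 1)

static unsigned long get_flags_mask(unsigned long word, unsigned int bitidx,
				    unsigned long mask)
{
	return (word >> bitidx) & mask;
}

static unsigned long set_flags_mask(unsigned long word, unsigned long flags,
				    unsigned int bitidx, unsigned long mask)
{
	word &= ~(mask << bitidx);		/* clear the old field */
	return word | ((flags & mask) << bitidx);
}

int main(void)
{
	unsigned long word = 0;

	/* store migratetype 5 for the block whose bits start at bit 12 */
	word = set_flags_mask(word, 5, 12, MIGRATETYPE_MASK);
	printf("migratetype = %lu\n",
	       get_flags_mask(word, 12, MIGRATETYPE_MASK));	/* 5 */
	return 0;
}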
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1fe1a761047..2093eb72785e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -198,6 +198,7 @@ struct page; /* forward declaration */ | |||
198 | TESTPAGEFLAG(Locked, locked) | 198 | TESTPAGEFLAG(Locked, locked) |
199 | PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) | 199 | PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) |
200 | PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) | 200 | PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) |
201 | __SETPAGEFLAG(Referenced, referenced) | ||
201 | PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) | 202 | PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) |
202 | PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) | 203 | PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) |
203 | PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) | 204 | PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) |
@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ | |||
208 | PAGEFLAG(SavePinned, savepinned); /* Xen */ | 209 | PAGEFLAG(SavePinned, savepinned); /* Xen */ |
209 | PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) | 210 | PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) |
210 | PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) | 211 | PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) |
212 | __SETPAGEFLAG(SwapBacked, swapbacked) | ||
211 | 213 | ||
212 | __PAGEFLAG(SlobFree, slob_free) | 214 | __PAGEFLAG(SlobFree, slob_free) |
213 | 215 | ||
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 2ee8cd2466b5..2baeee12f48e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h | |||
@@ -30,9 +30,12 @@ enum pageblock_bits { | |||
30 | PB_migrate, | 30 | PB_migrate, |
31 | PB_migrate_end = PB_migrate + 3 - 1, | 31 | PB_migrate_end = PB_migrate + 3 - 1, |
32 | /* 3 bits required for migrate types */ | 32 | /* 3 bits required for migrate types */ |
33 | #ifdef CONFIG_COMPACTION | ||
34 | PB_migrate_skip,/* If set the block is skipped by compaction */ | 33 | PB_migrate_skip,/* If set the block is skipped by compaction */ |
35 | #endif /* CONFIG_COMPACTION */ | 34 | |
35 | /* | ||
36 | * Assume the bits will always align on a word. If this assumption | ||
37 | * changes then get/set pageblock needs updating. | ||
38 | */ | ||
36 | NR_PAGEBLOCK_BITS | 39 | NR_PAGEBLOCK_BITS |
37 | }; | 40 | }; |
38 | 41 | ||
@@ -62,11 +65,26 @@ extern int pageblock_order; | |||
62 | /* Forward declaration */ | 65 | /* Forward declaration */ |
63 | struct page; | 66 | struct page; |
64 | 67 | ||
68 | unsigned long get_pfnblock_flags_mask(struct page *page, | ||
69 | unsigned long pfn, | ||
70 | unsigned long end_bitidx, | ||
71 | unsigned long mask); | ||
72 | |||
73 | void set_pfnblock_flags_mask(struct page *page, | ||
74 | unsigned long flags, | ||
75 | unsigned long pfn, | ||
76 | unsigned long end_bitidx, | ||
77 | unsigned long mask); | ||
78 | |||
65 | /* Declarations for getting and setting flags. See mm/page_alloc.c */ | 79 | /* Declarations for getting and setting flags. See mm/page_alloc.c */ |
66 | unsigned long get_pageblock_flags_group(struct page *page, | 80 | #define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \ |
67 | int start_bitidx, int end_bitidx); | 81 | get_pfnblock_flags_mask(page, page_to_pfn(page), \ |
68 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 82 | end_bitidx, \ |
69 | int start_bitidx, int end_bitidx); | 83 | (1 << (end_bitidx - start_bitidx + 1)) - 1) |
84 | #define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \ | ||
85 | set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \ | ||
86 | end_bitidx, \ | ||
87 | (1 << (end_bitidx - start_bitidx + 1)) - 1) | ||
70 | 88 | ||
71 | #ifdef CONFIG_COMPACTION | 89 | #ifdef CONFIG_COMPACTION |
72 | #define get_pageblock_skip(page) \ | 90 | #define get_pageblock_skip(page) \ |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 45598f1e9aa3..0a97b583ee8d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -110,7 +110,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) | |||
110 | 110 | ||
111 | #define page_cache_get(page) get_page(page) | 111 | #define page_cache_get(page) get_page(page) |
112 | #define page_cache_release(page) put_page(page) | 112 | #define page_cache_release(page) put_page(page) |
113 | void release_pages(struct page **pages, int nr, int cold); | 113 | void release_pages(struct page **pages, int nr, bool cold); |
114 | 114 | ||
115 | /* | 115 | /* |
116 | * speculatively take a reference to a page. | 116 | * speculatively take a reference to a page. |
@@ -259,12 +259,109 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, | |||
259 | pgoff_t page_cache_prev_hole(struct address_space *mapping, | 259 | pgoff_t page_cache_prev_hole(struct address_space *mapping, |
260 | pgoff_t index, unsigned long max_scan); | 260 | pgoff_t index, unsigned long max_scan); |
261 | 261 | ||
262 | #define FGP_ACCESSED 0x00000001 | ||
263 | #define FGP_LOCK 0x00000002 | ||
264 | #define FGP_CREAT 0x00000004 | ||
265 | #define FGP_WRITE 0x00000008 | ||
266 | #define FGP_NOFS 0x00000010 | ||
267 | #define FGP_NOWAIT 0x00000020 | ||
268 | |||
269 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, | ||
270 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask); | ||
271 | |||
272 | /** | ||
273 | * find_get_page - find and get a page reference | ||
274 | * @mapping: the address_space to search | ||
275 | * @offset: the page index | ||
276 | * | ||
277 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
278 | * page cache page, it is returned with an increased refcount. | ||
279 | * | ||
280 | * Otherwise, %NULL is returned. | ||
281 | */ | ||
282 | static inline struct page *find_get_page(struct address_space *mapping, | ||
283 | pgoff_t offset) | ||
284 | { | ||
285 | return pagecache_get_page(mapping, offset, 0, 0, 0); | ||
286 | } | ||
287 | |||
288 | static inline struct page *find_get_page_flags(struct address_space *mapping, | ||
289 | pgoff_t offset, int fgp_flags) | ||
290 | { | ||
291 | return pagecache_get_page(mapping, offset, fgp_flags, 0, 0); | ||
292 | } | ||
293 | |||
294 | /** | ||
295 | * find_lock_page - locate, pin and lock a pagecache page | ||
296 | * pagecache_get_page - find and get a page reference | ||
297 | * @mapping: the address_space to search | ||
298 | * @offset: the page index | ||
299 | * | ||
300 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
301 | * page cache page, it is returned locked and with an increased | ||
302 | * refcount. | ||
303 | * | ||
304 | * Otherwise, %NULL is returned. | ||
305 | * | ||
306 | * find_lock_page() may sleep. | ||
307 | */ | ||
308 | static inline struct page *find_lock_page(struct address_space *mapping, | ||
309 | pgoff_t offset) | ||
310 | { | ||
311 | return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0); | ||
312 | } | ||
313 | |||
314 | /** | ||
315 | * find_or_create_page - locate or add a pagecache page | ||
316 | * @mapping: the page's address_space | ||
317 | * @index: the page's index into the mapping | ||
318 | * @gfp_mask: page allocation mode | ||
319 | * | ||
320 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
321 | * page cache page, it is returned locked and with an increased | ||
322 | * refcount. | ||
323 | * | ||
324 | * If the page is not present, a new page is allocated using @gfp_mask | ||
325 | * and added to the page cache and the VM's LRU list. The page is | ||
326 | * returned locked and with an increased refcount. | ||
327 | * | ||
328 | * On memory exhaustion, %NULL is returned. | ||
329 | * | ||
330 | * find_or_create_page() may sleep, even if @gfp_flags specifies an | ||
331 | * atomic allocation! | ||
332 | */ | ||
333 | static inline struct page *find_or_create_page(struct address_space *mapping, | ||
334 | pgoff_t offset, gfp_t gfp_mask) | ||
335 | { | ||
336 | return pagecache_get_page(mapping, offset, | ||
337 | FGP_LOCK|FGP_ACCESSED|FGP_CREAT, | ||
338 | gfp_mask, gfp_mask & GFP_RECLAIM_MASK); | ||
339 | } | ||
340 | |||
341 | /** | ||
342 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
343 | * @mapping: target address_space | ||
344 | * @index: the page index | ||
345 | * | ||
346 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
347 | * This is intended for speculative data generators, where the data can | ||
348 | * be regenerated if the page couldn't be grabbed. This routine should | ||
349 | * be safe to call while holding the lock for another page. | ||
350 | * | ||
351 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
352 | * and deadlock against the caller's locked page. | ||
353 | */ | ||
354 | static inline struct page *grab_cache_page_nowait(struct address_space *mapping, | ||
355 | pgoff_t index) | ||
356 | { | ||
357 | return pagecache_get_page(mapping, index, | ||
358 | FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, | ||
359 | mapping_gfp_mask(mapping), | ||
360 | GFP_NOFS); | ||
361 | } | ||
362 | |||
262 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); | 363 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); |
263 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset); | ||
264 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); | 364 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); |
265 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset); | ||
266 | struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, | ||
267 | gfp_t gfp_mask); | ||
268 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, | 365 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, |
269 | unsigned int nr_entries, struct page **entries, | 366 | unsigned int nr_entries, struct page **entries, |
270 | pgoff_t *indices); | 367 | pgoff_t *indices); |
@@ -287,8 +384,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, | |||
287 | return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); | 384 | return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); |
288 | } | 385 | } |
289 | 386 | ||
290 | extern struct page * grab_cache_page_nowait(struct address_space *mapping, | ||
291 | pgoff_t index); | ||
292 | extern struct page * read_cache_page(struct address_space *mapping, | 387 | extern struct page * read_cache_page(struct address_space *mapping, |
293 | pgoff_t index, filler_t *filler, void *data); | 388 | pgoff_t index, filler_t *filler, void *data); |
294 | extern struct page * read_cache_page_gfp(struct address_space *mapping, | 389 | extern struct page * read_cache_page_gfp(struct address_space *mapping, |
@@ -425,6 +520,8 @@ static inline void wait_on_page_writeback(struct page *page) | |||
425 | extern void end_page_writeback(struct page *page); | 520 | extern void end_page_writeback(struct page *page); |
426 | void wait_for_stable_page(struct page *page); | 521 | void wait_for_stable_page(struct page *page); |
427 | 522 | ||
523 | void page_endio(struct page *page, int rw, int err); | ||
524 | |||
428 | /* | 525 | /* |
429 | * Add an arbitrary waiter to a page's wait queue | 526 | * Add an arbitrary waiter to a page's wait queue |
430 | */ | 527 | */ |
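The pagemap.h hunk above folds find_get_page(), find_lock_page(), find_or_create_page() and grab_cache_page_nowait() into one core routine, pagecache_get_page(), steered by FGP_* flag bits; the kerneldoc comments in the diff describe what each wrapper still promises. The compilable toy below has the same shape, one flags-driven lookup core plus thin wrappers that differ only in the bits they pass; the fixed-array "cache", the flag values and every name are invented for the demo.

#include <stdio.h>
#include <string.h>

#define FGP_LOCK	0x01
#define FGP_CREAT	0x02
#define FGP_NOWAIT	0x04

struct entry {
	int  present;
	int  locked;
	char data[16];
};

static struct entry cache[8];

static struct entry *cache_get_entry(unsigned int idx, int fgp_flags)
{
	struct entry *e;

	if (idx >= 8)
		return NULL;
	e = &cache[idx];

	if (!e->present) {
		if (!(fgp_flags & FGP_CREAT))
			return NULL;
		memset(e, 0, sizeof(*e));
		e->present = 1;			/* populate on demand */
	}
	if (fgp_flags & FGP_LOCK) {
		if (e->locked && (fgp_flags & FGP_NOWAIT))
			return NULL;		/* would have to wait */
		e->locked = 1;
	}
	return e;
}

/* wrappers differ only in their flag bits, mirroring the header's style */
static struct entry *find_entry(unsigned int idx)
{
	return cache_get_entry(idx, 0);
}

static struct entry *find_or_create_entry(unsigned int idx)
{
	return cache_get_entry(idx, FGP_LOCK | FGP_CREAT);
}

int main(void)
{
	printf("lookup miss:  %p\n", (void *)find_entry(3));
	printf("create+lock:  %p\n", (void *)find_or_create_entry(3));
	printf("lookup hit:   %p\n", (void *)find_entry(3));
	return 0;
}

Keeping one core function means a new behavior (another FGP_* bit) extends every caller path at once instead of requiring yet another exported variant.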
diff --git a/include/linux/plist.h b/include/linux/plist.h index aa0fb390bd29..8b6c970cff6c 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h | |||
@@ -98,6 +98,13 @@ struct plist_node { | |||
98 | } | 98 | } |
99 | 99 | ||
100 | /** | 100 | /** |
101 | * PLIST_HEAD - declare and init plist_head | ||
102 | * @head: name for struct plist_head variable | ||
103 | */ | ||
104 | #define PLIST_HEAD(head) \ | ||
105 | struct plist_head head = PLIST_HEAD_INIT(head) | ||
106 | |||
107 | /** | ||
101 | * PLIST_NODE_INIT - static struct plist_node initializer | 108 | * PLIST_NODE_INIT - static struct plist_node initializer |
102 | * @node: struct plist_node variable name | 109 | * @node: struct plist_node variable name |
103 | * @__prio: initial node priority | 110 | * @__prio: initial node priority |
@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) | |||
134 | extern void plist_add(struct plist_node *node, struct plist_head *head); | 141 | extern void plist_add(struct plist_node *node, struct plist_head *head); |
135 | extern void plist_del(struct plist_node *node, struct plist_head *head); | 142 | extern void plist_del(struct plist_node *node, struct plist_head *head); |
136 | 143 | ||
144 | extern void plist_requeue(struct plist_node *node, struct plist_head *head); | ||
145 | |||
137 | /** | 146 | /** |
138 | * plist_for_each - iterate over the plist | 147 | * plist_for_each - iterate over the plist |
139 | * @pos: the type * to use as a loop counter | 148 | * @pos: the type * to use as a loop counter |
@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | |||
143 | list_for_each_entry(pos, &(head)->node_list, node_list) | 152 | list_for_each_entry(pos, &(head)->node_list, node_list) |
144 | 153 | ||
145 | /** | 154 | /** |
155 | * plist_for_each_continue - continue iteration over the plist | ||
156 | * @pos: the type * to use as a loop cursor | ||
157 | * @head: the head for your list | ||
158 | * | ||
159 | * Continue to iterate over plist, continuing after the current position. | ||
160 | */ | ||
161 | #define plist_for_each_continue(pos, head) \ | ||
162 | list_for_each_entry_continue(pos, &(head)->node_list, node_list) | ||
163 | |||
164 | /** | ||
146 | * plist_for_each_safe - iterate safely over a plist of given type | 165 | * plist_for_each_safe - iterate safely over a plist of given type |
147 | * @pos: the type * to use as a loop counter | 166 | * @pos: the type * to use as a loop counter |
148 | * @n: another type * to use as temporary storage | 167 | * @n: another type * to use as temporary storage |
@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | |||
163 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) | 182 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) |
164 | 183 | ||
165 | /** | 184 | /** |
185 | * plist_for_each_entry_continue - continue iteration over list of given type | ||
186 | * @pos: the type * to use as a loop cursor | ||
187 | * @head: the head for your list | ||
188 | * @m: the name of the list_struct within the struct | ||
189 | * | ||
190 | * Continue to iterate over list of given type, continuing after | ||
191 | * the current position. | ||
192 | */ | ||
193 | #define plist_for_each_entry_continue(pos, head, m) \ | ||
194 | list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) | ||
195 | |||
196 | /** | ||
166 | * plist_for_each_entry_safe - iterate safely over list of given type | 197 | * plist_for_each_entry_safe - iterate safely over list of given type |
167 | * @pos: the type * to use as a loop counter | 198 | * @pos: the type * to use as a loop counter |
168 | * @n: another type * to use as temporary storage | 199 | * @n: another type * to use as temporary storage |
@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node) | |||
229 | #endif | 260 | #endif |
230 | 261 | ||
231 | /** | 262 | /** |
263 | * plist_next - get the next entry in list | ||
264 | * @pos: the type * to cursor | ||
265 | */ | ||
266 | #define plist_next(pos) \ | ||
267 | list_next_entry(pos, node_list) | ||
268 | |||
269 | /** | ||
270 | * plist_prev - get the prev entry in list | ||
271 | * @pos: the type * to cursor | ||
272 | */ | ||
273 | #define plist_prev(pos) \ | ||
274 | list_prev_entry(pos, node_list) | ||
275 | |||
276 | /** | ||
232 | * plist_first - return the first node (and thus, highest priority) | 277 | * plist_first - return the first node (and thus, highest priority) |
233 | * @head: the &struct plist_head pointer | 278 | * @head: the &struct plist_head pointer |
234 | * | 279 | * |
diff --git a/include/linux/printk.h b/include/linux/printk.h index 8752f7595b27..319ff7e53efb 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h | |||
@@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer) | |||
30 | return buffer; | 30 | return buffer; |
31 | } | 31 | } |
32 | 32 | ||
33 | /* printk's without a loglevel use this.. */ | ||
34 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | ||
35 | |||
36 | /* We show everything that is MORE important than this.. */ | ||
37 | #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ | ||
38 | #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ | ||
39 | #define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ | ||
40 | #define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ | ||
41 | #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ | ||
42 | #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ | ||
43 | |||
33 | extern int console_printk[]; | 44 | extern int console_printk[]; |
34 | 45 | ||
35 | #define console_loglevel (console_printk[0]) | 46 | #define console_loglevel (console_printk[0]) |
@@ -39,13 +50,13 @@ extern int console_printk[]; | |||
39 | 50 | ||
40 | static inline void console_silent(void) | 51 | static inline void console_silent(void) |
41 | { | 52 | { |
42 | console_loglevel = 0; | 53 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; |
43 | } | 54 | } |
44 | 55 | ||
45 | static inline void console_verbose(void) | 56 | static inline void console_verbose(void) |
46 | { | 57 | { |
47 | if (console_loglevel) | 58 | if (console_loglevel) |
48 | console_loglevel = 15; | 59 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
49 | } | 60 | } |
50 | 61 | ||
51 | struct va_format { | 62 | struct va_format { |
@@ -128,9 +139,9 @@ asmlinkage __printf(1, 2) __cold | |||
128 | int printk(const char *fmt, ...); | 139 | int printk(const char *fmt, ...); |
129 | 140 | ||
130 | /* | 141 | /* |
131 | * Special printk facility for scheduler use only, _DO_NOT_USE_ ! | 142 | * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! |
132 | */ | 143 | */ |
133 | __printf(1, 2) __cold int printk_sched(const char *fmt, ...); | 144 | __printf(1, 2) __cold int printk_deferred(const char *fmt, ...); |
134 | 145 | ||
135 | /* | 146 | /* |
136 | * Please don't use printk_ratelimit(), because it shares ratelimiting state | 147 | * Please don't use printk_ratelimit(), because it shares ratelimiting state |
@@ -165,7 +176,7 @@ int printk(const char *s, ...) | |||
165 | return 0; | 176 | return 0; |
166 | } | 177 | } |
167 | static inline __printf(1, 2) __cold | 178 | static inline __printf(1, 2) __cold |
168 | int printk_sched(const char *s, ...) | 179 | int printk_deferred(const char *s, ...) |
169 | { | 180 | { |
170 | return 0; | 181 | return 0; |
171 | } | 182 | } |
@@ -210,6 +221,12 @@ extern asmlinkage void dump_stack(void) __cold; | |||
210 | #define pr_fmt(fmt) fmt | 221 | #define pr_fmt(fmt) fmt |
211 | #endif | 222 | #endif |
212 | 223 | ||
224 | /* | ||
225 | * These can be used to print at the various log levels. | ||
226 | * All of these will print unconditionally, although note that pr_debug() | ||
227 | * and other debug macros are compiled out unless either DEBUG is defined | ||
228 | * or CONFIG_DYNAMIC_DEBUG is set. | ||
229 | */ | ||
213 | #define pr_emerg(fmt, ...) \ | 230 | #define pr_emerg(fmt, ...) \ |
214 | printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) | 231 | printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) |
215 | #define pr_alert(fmt, ...) \ | 232 | #define pr_alert(fmt, ...) \ |
@@ -266,9 +283,20 @@ extern asmlinkage void dump_stack(void) __cold; | |||
266 | printk(fmt, ##__VA_ARGS__); \ | 283 | printk(fmt, ##__VA_ARGS__); \ |
267 | } \ | 284 | } \ |
268 | }) | 285 | }) |
286 | #define printk_deferred_once(fmt, ...) \ | ||
287 | ({ \ | ||
288 | static bool __print_once __read_mostly; \ | ||
289 | \ | ||
290 | if (!__print_once) { \ | ||
291 | __print_once = true; \ | ||
292 | printk_deferred(fmt, ##__VA_ARGS__); \ | ||
293 | } \ | ||
294 | }) | ||
269 | #else | 295 | #else |
270 | #define printk_once(fmt, ...) \ | 296 | #define printk_once(fmt, ...) \ |
271 | no_printk(fmt, ##__VA_ARGS__) | 297 | no_printk(fmt, ##__VA_ARGS__) |
298 | #define printk_deferred_once(fmt, ...) \ | ||
299 | no_printk(fmt, ##__VA_ARGS__) | ||
272 | #endif | 300 | #endif |
273 | 301 | ||
274 | #define pr_emerg_once(fmt, ...) \ | 302 | #define pr_emerg_once(fmt, ...) \ |
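The printk.h hunk above names the console loglevels, renames printk_sched() to printk_deferred(), and adds printk_deferred_once(), which reuses the printk_once() pattern: a function-local static flag lets the body run a single time per expansion site. The sketch below shows that "once" pattern on its own; log_once() is an invented name, and the statement-expression and ##__VA_ARGS__ forms are GNU C extensions, just as in the kernel header.

#include <stdbool.h>
#include <stdio.h>

#define log_once(fmt, ...)				\
({							\
	static bool __print_once;			\
							\
	if (!__print_once) {				\
		__print_once = true;			\
		printf(fmt, ##__VA_ARGS__);		\
	}						\
})

int main(void)
{
	for (int i = 0; i < 3; i++)
		log_once("iteration %d\n", i);	/* prints only for i == 0 */
	return 0;
}

Each textual use of the macro gets its own static flag, so different call sites rate-limit independently.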
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 608e60a74c3c..9d117f61d976 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h | |||
@@ -44,6 +44,10 @@ extern int remove_proc_subtree(const char *, struct proc_dir_entry *); | |||
44 | 44 | ||
45 | #else /* CONFIG_PROC_FS */ | 45 | #else /* CONFIG_PROC_FS */ |
46 | 46 | ||
47 | static inline void proc_root_init(void) | ||
48 | { | ||
49 | } | ||
50 | |||
47 | static inline void proc_flush_task(struct task_struct *task) | 51 | static inline void proc_flush_task(struct task_struct *task) |
48 | { | 52 | { |
49 | } | 53 | } |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b66c2110cb1f..be574506e6a9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -72,10 +72,9 @@ struct anon_vma_chain { | |||
72 | }; | 72 | }; |
73 | 73 | ||
74 | enum ttu_flags { | 74 | enum ttu_flags { |
75 | TTU_UNMAP = 0, /* unmap mode */ | 75 | TTU_UNMAP = 1, /* unmap mode */ |
76 | TTU_MIGRATION = 1, /* migration mode */ | 76 | TTU_MIGRATION = 2, /* migration mode */ |
77 | TTU_MUNLOCK = 2, /* munlock mode */ | 77 | TTU_MUNLOCK = 4, /* munlock mode */ |
78 | TTU_ACTION_MASK = 0xff, | ||
79 | 78 | ||
80 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ | 79 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ |
81 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ | 80 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ |
@@ -183,14 +182,10 @@ static inline void page_dup_rmap(struct page *page) | |||
183 | */ | 182 | */ |
184 | int page_referenced(struct page *, int is_locked, | 183 | int page_referenced(struct page *, int is_locked, |
185 | struct mem_cgroup *memcg, unsigned long *vm_flags); | 184 | struct mem_cgroup *memcg, unsigned long *vm_flags); |
186 | int page_referenced_one(struct page *, struct vm_area_struct *, | ||
187 | unsigned long address, void *arg); | ||
188 | 185 | ||
189 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) | 186 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) |
190 | 187 | ||
191 | int try_to_unmap(struct page *, enum ttu_flags flags); | 188 | int try_to_unmap(struct page *, enum ttu_flags flags); |
192 | int try_to_unmap_one(struct page *, struct vm_area_struct *, | ||
193 | unsigned long address, void *arg); | ||
194 | 189 | ||
195 | /* | 190 | /* |
196 | * Called from mm/filemap_xip.c to unmap empty zero page | 191 | * Called from mm/filemap_xip.c to unmap empty zero page |
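The ttu_flags renumbering above (0/1/2 becoming 1/2/4) turns the unmap modes into real bit flags, so a mode can be tested with a plain bitwise AND even when high modifier bits such as TTU_IGNORE_MLOCK are OR-ed in, and the always-zero TTU_UNMAP value no longer defeats such tests. A tiny standalone illustration follows; the flag values are copied from the new header, while is_migration() is an invented helper.

#include <stdio.h>

enum ttu_flags {
	TTU_UNMAP	 = 1,		/* unmap mode */
	TTU_MIGRATION	 = 2,		/* migration mode */
	TTU_MUNLOCK	 = 4,		/* munlock mode */
	TTU_IGNORE_MLOCK = (1 << 8),	/* modifier bit */
};

static int is_migration(enum ttu_flags flags)
{
	return !!(flags & TTU_MIGRATION);	/* works despite extra bits */
}

int main(void)
{
	enum ttu_flags flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;

	/* With the old value TTU_UNMAP == 0, (flags & TTU_UNMAP) could
	 * never be true; with distinct bits every mode is testable. */
	printf("migration? %d\n", is_migration(flags));		/* 1 */
	printf("unmap?     %d\n", !!(flags & TTU_UNMAP));	/* 0 */
	return 0;
}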
diff --git a/include/linux/sched.h b/include/linux/sched.h index 70f67e4e6156..8fcd0e6098d9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -137,12 +137,6 @@ struct filename; | |||
137 | #define VMACACHE_MASK (VMACACHE_SIZE - 1) | 137 | #define VMACACHE_MASK (VMACACHE_SIZE - 1) |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * List of flags we want to share for kernel threads, | ||
141 | * if only because they are not used by them anyway. | ||
142 | */ | ||
143 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) | ||
144 | |||
145 | /* | ||
146 | * These are the constant used to fake the fixed-point load-average | 140 | * These are the constant used to fake the fixed-point load-average |
147 | * counting. Some notes: | 141 | * counting. Some notes: |
148 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives | 142 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives |
@@ -745,7 +739,6 @@ static inline int signal_group_exit(const struct signal_struct *sig) | |||
745 | struct user_struct { | 739 | struct user_struct { |
746 | atomic_t __count; /* reference count */ | 740 | atomic_t __count; /* reference count */ |
747 | atomic_t processes; /* How many processes does this user have? */ | 741 | atomic_t processes; /* How many processes does this user have? */ |
748 | atomic_t files; /* How many open files does this user have? */ | ||
749 | atomic_t sigpending; /* How many pending signals does this user have? */ | 742 | atomic_t sigpending; /* How many pending signals does this user have? */ |
750 | #ifdef CONFIG_INOTIFY_USER | 743 | #ifdef CONFIG_INOTIFY_USER |
751 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ | 744 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ |
@@ -2967,7 +2960,7 @@ static inline void inc_syscw(struct task_struct *tsk) | |||
2967 | #define TASK_SIZE_OF(tsk) TASK_SIZE | 2960 | #define TASK_SIZE_OF(tsk) TASK_SIZE |
2968 | #endif | 2961 | #endif |
2969 | 2962 | ||
2970 | #ifdef CONFIG_MM_OWNER | 2963 | #ifdef CONFIG_MEMCG |
2971 | extern void mm_update_next_owner(struct mm_struct *mm); | 2964 | extern void mm_update_next_owner(struct mm_struct *mm); |
2972 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); | 2965 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); |
2973 | #else | 2966 | #else |
@@ -2978,7 +2971,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) | |||
2978 | static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 2971 | static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
2979 | { | 2972 | { |
2980 | } | 2973 | } |
2981 | #endif /* CONFIG_MM_OWNER */ | 2974 | #endif /* CONFIG_MEMCG */ |
2982 | 2975 | ||
2983 | static inline unsigned long task_rlimit(const struct task_struct *tsk, | 2976 | static inline unsigned long task_rlimit(const struct task_struct *tsk, |
2984 | unsigned int limit) | 2977 | unsigned int limit) |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 8045a554cafb..596a0e007c62 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
@@ -25,6 +25,10 @@ enum { sysctl_hung_task_timeout_secs = 0 }; | |||
25 | * Because the kernel adds some informative sections to a image of program at | 25 | * Because the kernel adds some informative sections to a image of program at |
26 | * generating coredump, we need some margin. The number of extra sections is | 26 | * generating coredump, we need some margin. The number of extra sections is |
27 | * 1-3 now and depends on arch. We use "5" as safe margin, here. | 27 | * 1-3 now and depends on arch. We use "5" as safe margin, here. |
28 | * | ||
29 | * ELF extended numbering allows more than 65535 sections, so 16-bit bound is | ||
30 | * not a hard limit any more. Although some userspace tools can be surprised by | ||
31 | * that. | ||
28 | */ | 32 | */ |
29 | #define MAPCOUNT_ELF_CORE_MARGIN (5) | 33 | #define MAPCOUNT_ELF_CORE_MARGIN (5) |
30 | #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) | 34 | #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) |
diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe62387..1d9abb7d22a0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -116,7 +116,9 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, | |||
116 | unsigned long, | 116 | unsigned long, |
117 | void (*)(void *)); | 117 | void (*)(void *)); |
118 | #ifdef CONFIG_MEMCG_KMEM | 118 | #ifdef CONFIG_MEMCG_KMEM |
119 | void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); | 119 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, |
120 | struct kmem_cache *, | ||
121 | const char *); | ||
120 | #endif | 122 | #endif |
121 | void kmem_cache_destroy(struct kmem_cache *); | 123 | void kmem_cache_destroy(struct kmem_cache *); |
122 | int kmem_cache_shrink(struct kmem_cache *); | 124 | int kmem_cache_shrink(struct kmem_cache *); |
@@ -369,16 +371,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
369 | #include <linux/slub_def.h> | 371 | #include <linux/slub_def.h> |
370 | #endif | 372 | #endif |
371 | 373 | ||
372 | static __always_inline void * | 374 | extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); |
373 | kmalloc_order(size_t size, gfp_t flags, unsigned int order) | ||
374 | { | ||
375 | void *ret; | ||
376 | |||
377 | flags |= (__GFP_COMP | __GFP_KMEMCG); | ||
378 | ret = (void *) __get_free_pages(flags, order); | ||
379 | kmemleak_alloc(ret, size, 1, flags); | ||
380 | return ret; | ||
381 | } | ||
382 | 375 | ||
383 | #ifdef CONFIG_TRACING | 376 | #ifdef CONFIG_TRACING |
384 | extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); | 377 | extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); |
@@ -533,10 +526,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
533 | * @memcg: pointer to the memcg this cache belongs to | 526 | * @memcg: pointer to the memcg this cache belongs to |
534 | * @list: list_head for the list of all caches in this memcg | 527 | * @list: list_head for the list of all caches in this memcg |
535 | * @root_cache: pointer to the global, root cache, this cache was derived from | 528 | * @root_cache: pointer to the global, root cache, this cache was derived from |
536 | * @dead: set to true after the memcg dies; the cache may still be around. | ||
537 | * @nr_pages: number of pages that belongs to this cache. | 529 | * @nr_pages: number of pages that belongs to this cache. |
538 | * @destroy: worker to be called whenever we are ready, or believe we may be | ||
539 | * ready, to destroy this cache. | ||
540 | */ | 530 | */ |
541 | struct memcg_cache_params { | 531 | struct memcg_cache_params { |
542 | bool is_root_cache; | 532 | bool is_root_cache; |
@@ -549,9 +539,7 @@ struct memcg_cache_params { | |||
549 | struct mem_cgroup *memcg; | 539 | struct mem_cgroup *memcg; |
550 | struct list_head list; | 540 | struct list_head list; |
551 | struct kmem_cache *root_cache; | 541 | struct kmem_cache *root_cache; |
552 | bool dead; | ||
553 | atomic_t nr_pages; | 542 | atomic_t nr_pages; |
554 | struct work_struct destroy; | ||
555 | }; | 543 | }; |
556 | }; | 544 | }; |
557 | }; | 545 | }; |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 350711560753..4bdbee80eede 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -166,10 +166,10 @@ enum { | |||
166 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX | 166 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX |
167 | 167 | ||
168 | /* | 168 | /* |
169 | * Ratio between the present memory in the zone and the "gap" that | 169 | * Ratio between zone->managed_pages and the "gap" above the per-zone |
170 | * we're allowing kswapd to shrink in addition to the per-zone high | 170 | * "high_wmark". While balancing nodes, we allow kswapd to shrink zones that |
171 | * wmark, even for zones that already have the high wmark satisfied, | 171 | * do not meet the (high_wmark + gap) watermark, even those that already meet the |
172 | * in order to provide better per-zone lru behavior. We are ok to | 172 | * high_wmark, in order to provide better per-zone lru behavior. We are ok to |
173 | * spend not more than 1% of the memory for this zone balancing "gap". | 173 | * spend not more than 1% of the memory for this zone balancing "gap". |
174 | */ | 174 | */ |
175 | #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 | 175 | #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 |
@@ -214,8 +214,9 @@ struct percpu_cluster { | |||
214 | struct swap_info_struct { | 214 | struct swap_info_struct { |
215 | unsigned long flags; /* SWP_USED etc: see above */ | 215 | unsigned long flags; /* SWP_USED etc: see above */ |
216 | signed short prio; /* swap priority of this type */ | 216 | signed short prio; /* swap priority of this type */ |
217 | struct plist_node list; /* entry in swap_active_head */ | ||
218 | struct plist_node avail_list; /* entry in swap_avail_head */ | ||
217 | signed char type; /* strange name for an index */ | 219 | signed char type; /* strange name for an index */ |
218 | signed char next; /* next type on the swap list */ | ||
219 | unsigned int max; /* extent of the swap_map */ | 220 | unsigned int max; /* extent of the swap_map */ |
220 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | 221 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
221 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ | 222 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ |
@@ -255,11 +256,6 @@ struct swap_info_struct { | |||
255 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ | 256 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ |
256 | }; | 257 | }; |
257 | 258 | ||
258 | struct swap_list_t { | ||
259 | int head; /* head of priority-ordered swapfile list */ | ||
260 | int next; /* swapfile to be used next */ | ||
261 | }; | ||
262 | |||
263 | /* linux/mm/workingset.c */ | 259 | /* linux/mm/workingset.c */ |
264 | void *workingset_eviction(struct address_space *mapping, struct page *page); | 260 | void *workingset_eviction(struct address_space *mapping, struct page *page); |
265 | bool workingset_refault(void *shadow); | 261 | bool workingset_refault(void *shadow); |
@@ -308,12 +304,14 @@ extern unsigned long nr_free_pagecache_pages(void); | |||
308 | 304 | ||
309 | 305 | ||
310 | /* linux/mm/swap.c */ | 306 | /* linux/mm/swap.c */ |
311 | extern void __lru_cache_add(struct page *); | ||
312 | extern void lru_cache_add(struct page *); | 307 | extern void lru_cache_add(struct page *); |
308 | extern void lru_cache_add_anon(struct page *page); | ||
309 | extern void lru_cache_add_file(struct page *page); | ||
313 | extern void lru_add_page_tail(struct page *page, struct page *page_tail, | 310 | extern void lru_add_page_tail(struct page *page, struct page *page_tail, |
314 | struct lruvec *lruvec, struct list_head *head); | 311 | struct lruvec *lruvec, struct list_head *head); |
315 | extern void activate_page(struct page *); | 312 | extern void activate_page(struct page *); |
316 | extern void mark_page_accessed(struct page *); | 313 | extern void mark_page_accessed(struct page *); |
314 | extern void init_page_accessed(struct page *page); | ||
317 | extern void lru_add_drain(void); | 315 | extern void lru_add_drain(void); |
318 | extern void lru_add_drain_cpu(int cpu); | 316 | extern void lru_add_drain_cpu(int cpu); |
319 | extern void lru_add_drain_all(void); | 317 | extern void lru_add_drain_all(void); |
@@ -323,22 +321,6 @@ extern void swap_setup(void); | |||
323 | 321 | ||
324 | extern void add_page_to_unevictable_list(struct page *page); | 322 | extern void add_page_to_unevictable_list(struct page *page); |
325 | 323 | ||
326 | /** | ||
327 | * lru_cache_add: add a page to the page lists | ||
328 | * @page: the page to add | ||
329 | */ | ||
330 | static inline void lru_cache_add_anon(struct page *page) | ||
331 | { | ||
332 | ClearPageActive(page); | ||
333 | __lru_cache_add(page); | ||
334 | } | ||
335 | |||
336 | static inline void lru_cache_add_file(struct page *page) | ||
337 | { | ||
338 | ClearPageActive(page); | ||
339 | __lru_cache_add(page); | ||
340 | } | ||
341 | |||
342 | /* linux/mm/vmscan.c */ | 324 | /* linux/mm/vmscan.c */ |
343 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 325 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
344 | gfp_t gfp_mask, nodemask_t *mask); | 326 | gfp_t gfp_mask, nodemask_t *mask); |
@@ -496,7 +478,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
496 | #define free_page_and_swap_cache(page) \ | 478 | #define free_page_and_swap_cache(page) \ |
497 | page_cache_release(page) | 479 | page_cache_release(page) |
498 | #define free_pages_and_swap_cache(pages, nr) \ | 480 | #define free_pages_and_swap_cache(pages, nr) \ |
499 | release_pages((pages), (nr), 0); | 481 | release_pages((pages), (nr), false); |
500 | 482 | ||
501 | static inline void show_swap_cache_info(void) | 483 | static inline void show_swap_cache_info(void) |
502 | { | 484 | { |
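
lru_cache_add_anon() and lru_cache_add_file() stop being trivial inline wrappers around the (now private) __lru_cache_add(); only extern declarations remain in this header. A sketch of the out-of-line shape they presumably take in mm/swap.c, keeping the old ClearPageActive() behaviour (assumption -- the mm/swap.c side is not shown in this hunk):

	/* sketch: out-of-line replacement for the removed inline wrapper */
	void lru_cache_add_file(struct page *page)
	{
		if (PageActive(page))
			ClearPageActive(page);
		__lru_cache_add(page);
	}
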
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e282624e8c10..388293a91e8c 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h | |||
@@ -6,7 +6,7 @@ | |||
6 | * want to expose them to the dozens of source files that include swap.h | 6 | * want to expose them to the dozens of source files that include swap.h |
7 | */ | 7 | */ |
8 | extern spinlock_t swap_lock; | 8 | extern spinlock_t swap_lock; |
9 | extern struct swap_list_t swap_list; | 9 | extern struct plist_head swap_active_head; |
10 | extern struct swap_info_struct *swap_info[]; | 10 | extern struct swap_info_struct *swap_info[]; |
11 | extern int try_to_unuse(unsigned int, bool, unsigned long); | 11 | extern int try_to_unuse(unsigned int, bool, unsigned long); |
12 | 12 | ||
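
The index-based swap_list_t goes away: swap_active_head is a plist_head, and each swap_info_struct is linked into it through its new 'list' member (with a second node, avail_list, for devices that still have free slots). Walking the active devices then becomes an ordinary plist iteration, roughly (sketch, not taken from this patch; assumes swap_lock is held):

	static void dump_active_swap(void)
	{
		struct swap_info_struct *si;

		assert_spin_locked(&swap_lock);
		plist_for_each_entry(si, &swap_active_head, list)
			pr_info("swap type %d, prio %d, %u slots\n",
				si->type, si->prio, si->max);
	}
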
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c0f75261a728..6adfb7bfbf44 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h | |||
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) | |||
54 | /* check whether a pte points to a swap entry */ | 54 | /* check whether a pte points to a swap entry */ |
55 | static inline int is_swap_pte(pte_t pte) | 55 | static inline int is_swap_pte(pte_t pte) |
56 | { | 56 | { |
57 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); | 57 | return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); |
58 | } | 58 | } |
59 | #endif | 59 | #endif |
60 | 60 | ||
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a5ffd32642fd..e7a018eaf3a2 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h | |||
@@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { } | |||
116 | #endif | 116 | #endif |
117 | 117 | ||
118 | extern void swiotlb_print_info(void); | 118 | extern void swiotlb_print_info(void); |
119 | extern int is_swiotlb_buffer(phys_addr_t paddr); | ||
120 | |||
119 | #endif /* __LINUX_SWIOTLB_H */ | 121 | #endif /* __LINUX_SWIOTLB_H */ |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a4a0588c5397..b0881a0ed322 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -711,7 +711,7 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3, | |||
711 | 711 | ||
712 | asmlinkage long sys_ioprio_set(int which, int who, int ioprio); | 712 | asmlinkage long sys_ioprio_set(int which, int who, int ioprio); |
713 | asmlinkage long sys_ioprio_get(int which, int who); | 713 | asmlinkage long sys_ioprio_get(int which, int who); |
714 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | 714 | asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask, |
715 | unsigned long maxnode); | 715 | unsigned long maxnode); |
716 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | 716 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, |
717 | const unsigned long __user *from, | 717 | const unsigned long __user *from, |
@@ -723,7 +723,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
723 | int flags); | 723 | int flags); |
724 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | 724 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, |
725 | unsigned long mode, | 725 | unsigned long mode, |
726 | unsigned long __user *nmask, | 726 | const unsigned long __user *nmask, |
727 | unsigned long maxnode, | 727 | unsigned long maxnode, |
728 | unsigned flags); | 728 | unsigned flags); |
729 | asmlinkage long sys_get_mempolicy(int __user *policy, | 729 | asmlinkage long sys_get_mempolicy(int __user *policy, |
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec94fda3..ff307b548ed3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h | |||
@@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm); | |||
61 | # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) | 61 | # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) |
62 | #endif | 62 | #endif |
63 | 63 | ||
64 | #define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) | ||
65 | |||
66 | /* | 64 | /* |
67 | * flag set/clear/test wrappers | 65 | * flag set/clear/test wrappers |
68 | * - pass TIF_xxxx constants to these functions | 66 | * - pass TIF_xxxx constants to these functions |
diff --git a/include/linux/topology.h b/include/linux/topology.h index 973671ff9e7d..dda6ee521e74 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -58,7 +58,8 @@ int arch_update_cpu_topology(void); | |||
58 | /* | 58 | /* |
59 | * If the distance between nodes in a system is larger than RECLAIM_DISTANCE | 59 | * If the distance between nodes in a system is larger than RECLAIM_DISTANCE |
60 | * (in whatever arch specific measurement units returned by node_distance()) | 60 | * (in whatever arch specific measurement units returned by node_distance()) |
61 | * then switch on zone reclaim on boot. | 61 | * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() |
62 | * on nodes within this distance. | ||
62 | */ | 63 | */ |
63 | #define RECLAIM_DISTANCE 30 | 64 | #define RECLAIM_DISTANCE 30 |
64 | #endif | 65 | #endif |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 486c3972c0be..ced92345c963 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -80,6 +80,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
80 | NR_TLB_LOCAL_FLUSH_ALL, | 80 | NR_TLB_LOCAL_FLUSH_ALL, |
81 | NR_TLB_LOCAL_FLUSH_ONE, | 81 | NR_TLB_LOCAL_FLUSH_ONE, |
82 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | 82 | #endif /* CONFIG_DEBUG_TLBFLUSH */ |
83 | #ifdef CONFIG_DEBUG_VM_VMACACHE | ||
84 | VMACACHE_FIND_CALLS, | ||
85 | VMACACHE_FIND_HITS, | ||
86 | #endif | ||
83 | NR_VM_EVENT_ITEMS | 87 | NR_VM_EVENT_ITEMS |
84 | }; | 88 | }; |
85 | 89 | ||
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 45c9cd1daf7a..82e7db7f7100 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -95,6 +95,12 @@ static inline void vm_events_fold_cpu(int cpu) | |||
95 | #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) | 95 | #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) |
96 | #endif | 96 | #endif |
97 | 97 | ||
98 | #ifdef CONFIG_DEBUG_VM_VMACACHE | ||
99 | #define count_vm_vmacache_event(x) count_vm_event(x) | ||
100 | #else | ||
101 | #define count_vm_vmacache_event(x) do {} while (0) | ||
102 | #endif | ||
103 | |||
98 | #define __count_zone_vm_events(item, zone, delta) \ | 104 | #define __count_zone_vm_events(item, zone, delta) \ |
99 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ | 105 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ |
100 | zone_idx(zone), delta) | 106 | zone_idx(zone), delta) |
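
count_vm_vmacache_event() compiles to nothing unless CONFIG_DEBUG_VM_VMACACHE is set, so the per-thread VMA-cache lookup path can be instrumented without cost in production builds. A hedged sketch of how a lookup routine would bump the two new counters (find_in_thread_cache() is a hypothetical stand-in for the real cache probe, which is not part of this hunk):

	struct vm_area_struct *lookup(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma;

		count_vm_vmacache_event(VMACACHE_FIND_CALLS);
		vma = find_in_thread_cache(mm, addr);	/* hypothetical helper */
		if (vma)
			count_vm_vmacache_event(VMACACHE_FIND_HITS);
		return vma;
	}
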
diff --git a/include/linux/zbud.h b/include/linux/zbud.h index 2571a5cfa5fc..13af0d450bf6 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h | |||
@@ -11,7 +11,7 @@ struct zbud_ops { | |||
11 | 11 | ||
12 | struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); | 12 | struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); |
13 | void zbud_destroy_pool(struct zbud_pool *pool); | 13 | void zbud_destroy_pool(struct zbud_pool *pool); |
14 | int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | 14 | int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, |
15 | unsigned long *handle); | 15 | unsigned long *handle); |
16 | void zbud_free(struct zbud_pool *pool, unsigned long handle); | 16 | void zbud_free(struct zbud_pool *pool, unsigned long handle); |
17 | int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); | 17 | int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); |
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 06f544ef2f6f..c6814b917bdf 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #define _TRACE_COMPACTION_H | 5 | #define _TRACE_COMPACTION_H |
6 | 6 | ||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/list.h> | ||
8 | #include <linux/tracepoint.h> | 9 | #include <linux/tracepoint.h> |
9 | #include <trace/events/gfpflags.h> | 10 | #include <trace/events/gfpflags.h> |
10 | 11 | ||
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, | |||
47 | 48 | ||
48 | TRACE_EVENT(mm_compaction_migratepages, | 49 | TRACE_EVENT(mm_compaction_migratepages, |
49 | 50 | ||
50 | TP_PROTO(unsigned long nr_migrated, | 51 | TP_PROTO(unsigned long nr_all, |
51 | unsigned long nr_failed), | 52 | int migrate_rc, |
53 | struct list_head *migratepages), | ||
52 | 54 | ||
53 | TP_ARGS(nr_migrated, nr_failed), | 55 | TP_ARGS(nr_all, migrate_rc, migratepages), |
54 | 56 | ||
55 | TP_STRUCT__entry( | 57 | TP_STRUCT__entry( |
56 | __field(unsigned long, nr_migrated) | 58 | __field(unsigned long, nr_migrated) |
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages, | |||
58 | ), | 60 | ), |
59 | 61 | ||
60 | TP_fast_assign( | 62 | TP_fast_assign( |
61 | __entry->nr_migrated = nr_migrated; | 63 | unsigned long nr_failed = 0; |
64 | struct list_head *page_lru; | ||
65 | |||
66 | /* | ||
67 | * migrate_pages() returns either a non-negative number | ||
68 | * with the number of pages that failed migration, or an | ||
69 | * error code, in which case we need to count the remaining | ||
70 | * pages manually | ||
71 | */ | ||
72 | if (migrate_rc >= 0) | ||
73 | nr_failed = migrate_rc; | ||
74 | else | ||
75 | list_for_each(page_lru, migratepages) | ||
76 | nr_failed++; | ||
77 | |||
78 | __entry->nr_migrated = nr_all - nr_failed; | ||
62 | __entry->nr_failed = nr_failed; | 79 | __entry->nr_failed = nr_failed; |
63 | ), | 80 | ), |
64 | 81 | ||
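
The event now takes the raw migrate_pages() return value plus the migratepages list, so callers no longer pre-compute the failure count, and negative error returns are counted correctly by walking whatever is left on the list. A call site would now look roughly like the sketch below; compaction_alloc/compaction_free and cc->mode follow the mm/compaction.c of this era and are illustrative only:

	static int migrate_and_trace(struct compact_control *cc)
	{
		unsigned long nr_all = cc->nr_migratepages;
		int err;

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				    compaction_free, (unsigned long)cc,
				    cc->mode, MR_COMPACTION);

		/* hand over the raw result; the event derives nr_failed itself */
		trace_mm_compaction_migratepages(nr_all, err, &cc->migratepages);
		return err;
	}
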
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1557f2..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h | |||
@@ -34,7 +34,6 @@ | |||
34 | {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ | 34 | {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ |
35 | {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ | 35 | {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ |
36 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ | 36 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ |
37 | {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ | ||
38 | {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ | 37 | {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ |
39 | {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ | 38 | {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ |
40 | {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ | 39 | {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ |
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 132a985aba8b..69590b6ffc09 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h | |||
@@ -191,6 +191,7 @@ TRACE_EVENT(mm_shrink_slab_start, | |||
191 | TP_STRUCT__entry( | 191 | TP_STRUCT__entry( |
192 | __field(struct shrinker *, shr) | 192 | __field(struct shrinker *, shr) |
193 | __field(void *, shrink) | 193 | __field(void *, shrink) |
194 | __field(int, nid) | ||
194 | __field(long, nr_objects_to_shrink) | 195 | __field(long, nr_objects_to_shrink) |
195 | __field(gfp_t, gfp_flags) | 196 | __field(gfp_t, gfp_flags) |
196 | __field(unsigned long, pgs_scanned) | 197 | __field(unsigned long, pgs_scanned) |
@@ -203,6 +204,7 @@ TRACE_EVENT(mm_shrink_slab_start, | |||
203 | TP_fast_assign( | 204 | TP_fast_assign( |
204 | __entry->shr = shr; | 205 | __entry->shr = shr; |
205 | __entry->shrink = shr->scan_objects; | 206 | __entry->shrink = shr->scan_objects; |
207 | __entry->nid = sc->nid; | ||
206 | __entry->nr_objects_to_shrink = nr_objects_to_shrink; | 208 | __entry->nr_objects_to_shrink = nr_objects_to_shrink; |
207 | __entry->gfp_flags = sc->gfp_mask; | 209 | __entry->gfp_flags = sc->gfp_mask; |
208 | __entry->pgs_scanned = pgs_scanned; | 210 | __entry->pgs_scanned = pgs_scanned; |
@@ -212,9 +214,10 @@ TRACE_EVENT(mm_shrink_slab_start, | |||
212 | __entry->total_scan = total_scan; | 214 | __entry->total_scan = total_scan; |
213 | ), | 215 | ), |
214 | 216 | ||
215 | TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", | 217 | TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", |
216 | __entry->shrink, | 218 | __entry->shrink, |
217 | __entry->shr, | 219 | __entry->shr, |
220 | __entry->nid, | ||
218 | __entry->nr_objects_to_shrink, | 221 | __entry->nr_objects_to_shrink, |
219 | show_gfp_flags(__entry->gfp_flags), | 222 | show_gfp_flags(__entry->gfp_flags), |
220 | __entry->pgs_scanned, | 223 | __entry->pgs_scanned, |
@@ -225,13 +228,15 @@ TRACE_EVENT(mm_shrink_slab_start, | |||
225 | ); | 228 | ); |
226 | 229 | ||
227 | TRACE_EVENT(mm_shrink_slab_end, | 230 | TRACE_EVENT(mm_shrink_slab_end, |
228 | TP_PROTO(struct shrinker *shr, int shrinker_retval, | 231 | TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, |
229 | long unused_scan_cnt, long new_scan_cnt), | 232 | long unused_scan_cnt, long new_scan_cnt, long total_scan), |
230 | 233 | ||
231 | TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), | 234 | TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, |
235 | total_scan), | ||
232 | 236 | ||
233 | TP_STRUCT__entry( | 237 | TP_STRUCT__entry( |
234 | __field(struct shrinker *, shr) | 238 | __field(struct shrinker *, shr) |
239 | __field(int, nid) | ||
235 | __field(void *, shrink) | 240 | __field(void *, shrink) |
236 | __field(long, unused_scan) | 241 | __field(long, unused_scan) |
237 | __field(long, new_scan) | 242 | __field(long, new_scan) |
@@ -241,16 +246,18 @@ TRACE_EVENT(mm_shrink_slab_end, | |||
241 | 246 | ||
242 | TP_fast_assign( | 247 | TP_fast_assign( |
243 | __entry->shr = shr; | 248 | __entry->shr = shr; |
249 | __entry->nid = nid; | ||
244 | __entry->shrink = shr->scan_objects; | 250 | __entry->shrink = shr->scan_objects; |
245 | __entry->unused_scan = unused_scan_cnt; | 251 | __entry->unused_scan = unused_scan_cnt; |
246 | __entry->new_scan = new_scan_cnt; | 252 | __entry->new_scan = new_scan_cnt; |
247 | __entry->retval = shrinker_retval; | 253 | __entry->retval = shrinker_retval; |
248 | __entry->total_scan = new_scan_cnt - unused_scan_cnt; | 254 | __entry->total_scan = total_scan; |
249 | ), | 255 | ), |
250 | 256 | ||
251 | TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", | 257 | TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", |
252 | __entry->shrink, | 258 | __entry->shrink, |
253 | __entry->shr, | 259 | __entry->shr, |
260 | __entry->nid, | ||
254 | __entry->unused_scan, | 261 | __entry->unused_scan, |
255 | __entry->new_scan, | 262 | __entry->new_scan, |
256 | __entry->total_scan, | 263 | __entry->total_scan, |
diff --git a/init/Kconfig b/init/Kconfig index 9d3585bb2a7a..9d76b99af1b9 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -261,6 +261,16 @@ config POSIX_MQUEUE_SYSCTL | |||
261 | depends on SYSCTL | 261 | depends on SYSCTL |
262 | default y | 262 | default y |
263 | 263 | ||
264 | config CROSS_MEMORY_ATTACH | ||
265 | bool "Enable process_vm_readv/writev syscalls" | ||
266 | depends on MMU | ||
267 | default y | ||
268 | help | ||
269 | Enabling this option adds the system calls process_vm_readv and | ||
270 | process_vm_writev which allow a process with the correct privileges | ||
271 | to directly read from or write to another process's address space. | ||
272 | See the man page for more details. | ||
273 | |||
264 | config FHANDLE | 274 | config FHANDLE |
265 | bool "open by fhandle syscalls" | 275 | bool "open by fhandle syscalls" |
266 | select EXPORTFS | 276 | select EXPORTFS |
@@ -933,7 +943,6 @@ config RESOURCE_COUNTERS | |||
933 | config MEMCG | 943 | config MEMCG |
934 | bool "Memory Resource Controller for Control Groups" | 944 | bool "Memory Resource Controller for Control Groups" |
935 | depends on RESOURCE_COUNTERS | 945 | depends on RESOURCE_COUNTERS |
936 | select MM_OWNER | ||
937 | select EVENTFD | 946 | select EVENTFD |
938 | help | 947 | help |
939 | Provides a memory resource controller that manages both anonymous | 948 | Provides a memory resource controller that manages both anonymous |
@@ -951,9 +960,6 @@ config MEMCG | |||
951 | disable memory resource controller and you can avoid overheads. | 960 | disable memory resource controller and you can avoid overheads. |
952 | (and lose benefits of memory resource controller) | 961 | (and lose benefits of memory resource controller) |
953 | 962 | ||
954 | This config option also selects MM_OWNER config option, which | ||
955 | could in turn add some fork/exit overhead. | ||
956 | |||
957 | config MEMCG_SWAP | 963 | config MEMCG_SWAP |
958 | bool "Memory Resource Controller Swap Extension" | 964 | bool "Memory Resource Controller Swap Extension" |
959 | depends on MEMCG && SWAP | 965 | depends on MEMCG && SWAP |
@@ -996,6 +1002,12 @@ config MEMCG_KMEM | |||
996 | the kmem extension can use it to guarantee that no group of processes | 1002 | the kmem extension can use it to guarantee that no group of processes |
997 | will ever exhaust kernel resources alone. | 1003 | will ever exhaust kernel resources alone. |
998 | 1004 | ||
1005 | WARNING: Current implementation lacks reclaim support. That means | ||
1006 | allocation attempts will fail when close to the limit even if there | ||
1007 | are plenty of kmem available for reclaim. That makes this option | ||
1008 | unusable in real life, so DO NOT SELECT IT except for development | ||
1009 | purposes. | ||
1010 | |||
999 | config CGROUP_HUGETLB | 1011 | config CGROUP_HUGETLB |
1000 | bool "HugeTLB Resource Controller for Control Groups" | 1012 | bool "HugeTLB Resource Controller for Control Groups" |
1001 | depends on RESOURCE_COUNTERS && HUGETLB_PAGE | 1013 | depends on RESOURCE_COUNTERS && HUGETLB_PAGE |
@@ -1173,9 +1185,6 @@ config SCHED_AUTOGROUP | |||
1173 | desktop applications. Task group autogeneration is currently based | 1185 | desktop applications. Task group autogeneration is currently based |
1174 | upon task session. | 1186 | upon task session. |
1175 | 1187 | ||
1176 | config MM_OWNER | ||
1177 | bool | ||
1178 | |||
1179 | config SYSFS_DEPRECATED | 1188 | config SYSFS_DEPRECATED |
1180 | bool "Enable deprecated sysfs features to support old userspace tools" | 1189 | bool "Enable deprecated sysfs features to support old userspace tools" |
1181 | depends on SYSFS | 1190 | depends on SYSFS |
@@ -1304,6 +1313,16 @@ config UID16 | |||
1304 | help | 1313 | help |
1305 | This enables the legacy 16-bit UID syscall wrappers. | 1314 | This enables the legacy 16-bit UID syscall wrappers. |
1306 | 1315 | ||
1316 | config SGETMASK_SYSCALL | ||
1317 | bool "sgetmask/ssetmask syscalls support" if EXPERT | ||
1318 | def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH | ||
1319 | ---help--- | ||
1320 | sys_sgetmask and sys_ssetmask are obsolete system calls | ||
1321 | no longer supported in libc but still enabled by default in some | ||
1322 | architectures. | ||
1323 | |||
1324 | If unsure, leave the default option here. | ||
1325 | |||
1307 | config SYSFS_SYSCALL | 1326 | config SYSFS_SYSCALL |
1308 | bool "Sysfs syscall support" if EXPERT | 1327 | bool "Sysfs syscall support" if EXPERT |
1309 | default y | 1328 | default y |
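
CROSS_MEMORY_ATTACH only controls whether the two syscalls are built; their userspace interface is the usual iovec pair, subject to the same permission checks as a ptrace attach. A minimal userspace sketch (read_remote() is just an illustrative wrapper):

	#define _GNU_SOURCE
	#include <sys/uio.h>

	static ssize_t read_remote(pid_t pid, void *dst,
				   const void *remote_addr, size_t len)
	{
		struct iovec local  = { .iov_base = dst, .iov_len = len };
		struct iovec remote = { .iov_base = (void *)remote_addr,
					.iov_len = len };

		return process_vm_readv(pid, &local, 1, &remote, 1, 0);
	}
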
diff --git a/init/main.c b/init/main.c index 48655ceb66f4..17d47bcdf573 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -77,6 +77,7 @@ | |||
77 | #include <linux/sched_clock.h> | 77 | #include <linux/sched_clock.h> |
78 | #include <linux/context_tracking.h> | 78 | #include <linux/context_tracking.h> |
79 | #include <linux/random.h> | 79 | #include <linux/random.h> |
80 | #include <linux/list.h> | ||
80 | 81 | ||
81 | #include <asm/io.h> | 82 | #include <asm/io.h> |
82 | #include <asm/bugs.h> | 83 | #include <asm/bugs.h> |
@@ -203,13 +204,13 @@ EXPORT_SYMBOL(loops_per_jiffy); | |||
203 | 204 | ||
204 | static int __init debug_kernel(char *str) | 205 | static int __init debug_kernel(char *str) |
205 | { | 206 | { |
206 | console_loglevel = 10; | 207 | console_loglevel = CONSOLE_LOGLEVEL_DEBUG; |
207 | return 0; | 208 | return 0; |
208 | } | 209 | } |
209 | 210 | ||
210 | static int __init quiet_kernel(char *str) | 211 | static int __init quiet_kernel(char *str) |
211 | { | 212 | { |
212 | console_loglevel = 4; | 213 | console_loglevel = CONSOLE_LOGLEVEL_QUIET; |
213 | return 0; | 214 | return 0; |
214 | } | 215 | } |
215 | 216 | ||
@@ -379,7 +380,7 @@ static noinline void __init_refok rest_init(void) | |||
379 | * the init task will end up wanting to create kthreads, which, if | 380 | * the init task will end up wanting to create kthreads, which, if |
380 | * we schedule it before we create kthreadd, will OOPS. | 381 | * we schedule it before we create kthreadd, will OOPS. |
381 | */ | 382 | */ |
382 | kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); | 383 | kernel_thread(kernel_init, NULL, CLONE_FS); |
383 | numa_default_policy(); | 384 | numa_default_policy(); |
384 | pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); | 385 | pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); |
385 | rcu_read_lock(); | 386 | rcu_read_lock(); |
@@ -507,7 +508,6 @@ asmlinkage __visible void __init start_kernel(void) | |||
507 | page_address_init(); | 508 | page_address_init(); |
508 | pr_notice("%s", linux_banner); | 509 | pr_notice("%s", linux_banner); |
509 | setup_arch(&command_line); | 510 | setup_arch(&command_line); |
510 | mm_init_owner(&init_mm, &init_task); | ||
511 | mm_init_cpumask(&init_mm); | 511 | mm_init_cpumask(&init_mm); |
512 | setup_command_line(command_line); | 512 | setup_command_line(command_line); |
513 | setup_nr_cpu_ids(); | 513 | setup_nr_cpu_ids(); |
@@ -629,9 +629,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
629 | signals_init(); | 629 | signals_init(); |
630 | /* rootfs populating might need page-writeback */ | 630 | /* rootfs populating might need page-writeback */ |
631 | page_writeback_init(); | 631 | page_writeback_init(); |
632 | #ifdef CONFIG_PROC_FS | ||
633 | proc_root_init(); | 632 | proc_root_init(); |
634 | #endif | ||
635 | cgroup_init(); | 633 | cgroup_init(); |
636 | cpuset_init(); | 634 | cpuset_init(); |
637 | taskstats_init_early(); | 635 | taskstats_init_early(); |
@@ -666,19 +664,83 @@ static void __init do_ctors(void) | |||
666 | bool initcall_debug; | 664 | bool initcall_debug; |
667 | core_param(initcall_debug, initcall_debug, bool, 0644); | 665 | core_param(initcall_debug, initcall_debug, bool, 0644); |
668 | 666 | ||
667 | #ifdef CONFIG_KALLSYMS | ||
668 | struct blacklist_entry { | ||
669 | struct list_head next; | ||
670 | char *buf; | ||
671 | }; | ||
672 | |||
673 | static __initdata_or_module LIST_HEAD(blacklisted_initcalls); | ||
674 | |||
675 | static int __init initcall_blacklist(char *str) | ||
676 | { | ||
677 | char *str_entry; | ||
678 | struct blacklist_entry *entry; | ||
679 | |||
680 | /* str argument is a comma-separated list of functions */ | ||
681 | do { | ||
682 | str_entry = strsep(&str, ","); | ||
683 | if (str_entry) { | ||
684 | pr_debug("blacklisting initcall %s\n", str_entry); | ||
685 | entry = alloc_bootmem(sizeof(*entry)); | ||
686 | entry->buf = alloc_bootmem(strlen(str_entry) + 1); | ||
687 | strcpy(entry->buf, str_entry); | ||
688 | list_add(&entry->next, &blacklisted_initcalls); | ||
689 | } | ||
690 | } while (str_entry); | ||
691 | |||
692 | return 0; | ||
693 | } | ||
694 | |||
695 | static bool __init_or_module initcall_blacklisted(initcall_t fn) | ||
696 | { | ||
697 | struct list_head *tmp; | ||
698 | struct blacklist_entry *entry; | ||
699 | char *fn_name; | ||
700 | |||
701 | fn_name = kasprintf(GFP_KERNEL, "%pf", fn); | ||
702 | if (!fn_name) | ||
703 | return false; | ||
704 | |||
705 | list_for_each(tmp, &blacklisted_initcalls) { | ||
706 | entry = list_entry(tmp, struct blacklist_entry, next); | ||
707 | if (!strcmp(fn_name, entry->buf)) { | ||
708 | pr_debug("initcall %s blacklisted\n", fn_name); | ||
709 | kfree(fn_name); | ||
710 | return true; | ||
711 | } | ||
712 | } | ||
713 | |||
714 | kfree(fn_name); | ||
715 | return false; | ||
716 | } | ||
717 | #else | ||
718 | static int __init initcall_blacklist(char *str) | ||
719 | { | ||
720 | pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); | ||
721 | return 0; | ||
722 | } | ||
723 | |||
724 | static bool __init_or_module initcall_blacklisted(initcall_t fn) | ||
725 | { | ||
726 | return false; | ||
727 | } | ||
728 | #endif | ||
729 | __setup("initcall_blacklist=", initcall_blacklist); | ||
730 | |||
669 | static int __init_or_module do_one_initcall_debug(initcall_t fn) | 731 | static int __init_or_module do_one_initcall_debug(initcall_t fn) |
670 | { | 732 | { |
671 | ktime_t calltime, delta, rettime; | 733 | ktime_t calltime, delta, rettime; |
672 | unsigned long long duration; | 734 | unsigned long long duration; |
673 | int ret; | 735 | int ret; |
674 | 736 | ||
675 | pr_debug("calling %pF @ %i\n", fn, task_pid_nr(current)); | 737 | printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); |
676 | calltime = ktime_get(); | 738 | calltime = ktime_get(); |
677 | ret = fn(); | 739 | ret = fn(); |
678 | rettime = ktime_get(); | 740 | rettime = ktime_get(); |
679 | delta = ktime_sub(rettime, calltime); | 741 | delta = ktime_sub(rettime, calltime); |
680 | duration = (unsigned long long) ktime_to_ns(delta) >> 10; | 742 | duration = (unsigned long long) ktime_to_ns(delta) >> 10; |
681 | pr_debug("initcall %pF returned %d after %lld usecs\n", | 743 | printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", |
682 | fn, ret, duration); | 744 | fn, ret, duration); |
683 | 745 | ||
684 | return ret; | 746 | return ret; |
@@ -690,6 +752,9 @@ int __init_or_module do_one_initcall(initcall_t fn) | |||
690 | int ret; | 752 | int ret; |
691 | char msgbuf[64]; | 753 | char msgbuf[64]; |
692 | 754 | ||
755 | if (initcall_blacklisted(fn)) | ||
756 | return -EPERM; | ||
757 | |||
693 | if (initcall_debug) | 758 | if (initcall_debug) |
694 | ret = do_one_initcall_debug(fn); | 759 | ret = do_one_initcall_debug(fn); |
695 | else | 760 | else |
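
Taken together, this adds a boot-time knob: any initcall whose %pf-formatted symbol name appears in the comma-separated initcall_blacklist= list is skipped, with do_one_initcall() returning -EPERM for it. For example, booting with something like

	initcall_blacklist=foo_driver_init,bar_subsys_init

(hypothetical initcall names) would keep those two functions from running; since the match is done on symbol names, the parameter only has an effect when CONFIG_KALLSYMS is enabled.
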
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c4..1323360d90e3 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c | |||
@@ -19,8 +19,8 @@ | |||
19 | 19 | ||
20 | static void backtrace_test_normal(void) | 20 | static void backtrace_test_normal(void) |
21 | { | 21 | { |
22 | printk("Testing a backtrace from process context.\n"); | 22 | pr_info("Testing a backtrace from process context.\n"); |
23 | printk("The following trace is a kernel self test and not a bug!\n"); | 23 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
24 | 24 | ||
25 | dump_stack(); | 25 | dump_stack(); |
26 | } | 26 | } |
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); | |||
37 | 37 | ||
38 | static void backtrace_test_irq(void) | 38 | static void backtrace_test_irq(void) |
39 | { | 39 | { |
40 | printk("Testing a backtrace from irq context.\n"); | 40 | pr_info("Testing a backtrace from irq context.\n"); |
41 | printk("The following trace is a kernel self test and not a bug!\n"); | 41 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
42 | 42 | ||
43 | init_completion(&backtrace_work); | 43 | init_completion(&backtrace_work); |
44 | tasklet_schedule(&backtrace_tasklet); | 44 | tasklet_schedule(&backtrace_tasklet); |
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void) | |||
51 | struct stack_trace trace; | 51 | struct stack_trace trace; |
52 | unsigned long entries[8]; | 52 | unsigned long entries[8]; |
53 | 53 | ||
54 | printk("Testing a saved backtrace.\n"); | 54 | pr_info("Testing a saved backtrace.\n"); |
55 | printk("The following trace is a kernel self test and not a bug!\n"); | 55 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
56 | 56 | ||
57 | trace.nr_entries = 0; | 57 | trace.nr_entries = 0; |
58 | trace.max_entries = ARRAY_SIZE(entries); | 58 | trace.max_entries = ARRAY_SIZE(entries); |
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void) | |||
65 | #else | 65 | #else |
66 | static void backtrace_test_saved(void) | 66 | static void backtrace_test_saved(void) |
67 | { | 67 | { |
68 | printk("Saved backtrace test skipped.\n"); | 68 | pr_info("Saved backtrace test skipped.\n"); |
69 | } | 69 | } |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | static int backtrace_regression_test(void) | 72 | static int backtrace_regression_test(void) |
73 | { | 73 | { |
74 | printk("====[ backtrace testing ]===========\n"); | 74 | pr_info("====[ backtrace testing ]===========\n"); |
75 | 75 | ||
76 | backtrace_test_normal(); | 76 | backtrace_test_normal(); |
77 | backtrace_test_irq(); | 77 | backtrace_test_irq(); |
78 | backtrace_test_saved(); | 78 | backtrace_test_saved(); |
79 | 79 | ||
80 | printk("====[ end of backtrace testing ]====\n"); | 80 | pr_info("====[ end of backtrace testing ]====\n"); |
81 | return 0; | 81 | return 0; |
82 | } | 82 | } |
83 | 83 | ||
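
The printk()-to-pr_info() conversions here (and in kernel/cpu.c below) follow the series-wide idiom: give every message an explicit level and, where a file wants a prefix, define pr_fmt() before the first include so all pr_*() calls pick it up. A generic sketch of that idiom (not a hunk from this merge):

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void report_cpu(int cpu, int err)
	{
		if (err)
			pr_warn("CPU%d failed: %d\n", cpu, err);
		else
			pr_info("CPU%d is up\n", cpu);	/* printk(KERN_INFO pr_fmt("CPU%d is up\n"), cpu) */
	}
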
diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df0c322..84b2bbf443e7 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -24,7 +24,6 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; | 26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; |
27 | |||
28 | EXPORT_SYMBOL(__cap_empty_set); | 27 | EXPORT_SYMBOL(__cap_empty_set); |
29 | 28 | ||
30 | int file_caps_enabled = 1; | 29 | int file_caps_enabled = 1; |
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) | |||
189 | * | 188 | * |
190 | * An alternative would be to return an error here | 189 | * An alternative would be to return an error here |
191 | * (-ERANGE), but that causes legacy applications to | 190 | * (-ERANGE), but that causes legacy applications to |
192 | * unexpectidly fail; the capget/modify/capset aborts | 191 | * unexpectedly fail; the capget/modify/capset aborts |
193 | * before modification is attempted and the application | 192 | * before modification is attempted and the application |
194 | * fails. | 193 | * fails. |
195 | */ | 194 | */ |
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); | |||
395 | * This does not set PF_SUPERPRIV because the caller may not | 394 | * This does not set PF_SUPERPRIV because the caller may not |
396 | * actually be privileged. | 395 | * actually be privileged. |
397 | */ | 396 | */ |
398 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) | 397 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, |
398 | int cap) | ||
399 | { | 399 | { |
400 | if (WARN_ON_ONCE(!cap_valid(cap))) | 400 | if (WARN_ON_ONCE(!cap_valid(cap))) |
401 | return false; | 401 | return false; |
diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp | |||
157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | 157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) |
158 | { | 158 | { |
159 | if (COMPAT_USE_64BIT_TIME) | 159 | if (COMPAT_USE_64BIT_TIME) |
160 | return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; | 160 | return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; |
161 | else | 161 | else |
162 | return __compat_get_timeval(tv, utv); | 162 | return __compat_get_timeval(tv, utv); |
163 | } | 163 | } |
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); | |||
166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) | 166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) |
167 | { | 167 | { |
168 | if (COMPAT_USE_64BIT_TIME) | 168 | if (COMPAT_USE_64BIT_TIME) |
169 | return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; | 169 | return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; |
170 | else | 170 | else |
171 | return __compat_put_timeval(tv, utv); | 171 | return __compat_put_timeval(tv, utv); |
172 | } | 172 | } |
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); | |||
175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) | 175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) |
176 | { | 176 | { |
177 | if (COMPAT_USE_64BIT_TIME) | 177 | if (COMPAT_USE_64BIT_TIME) |
178 | return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; | 178 | return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; |
179 | else | 179 | else |
180 | return __compat_get_timespec(ts, uts); | 180 | return __compat_get_timespec(ts, uts); |
181 | } | 181 | } |
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); | |||
184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) | 184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) |
185 | { | 185 | { |
186 | if (COMPAT_USE_64BIT_TIME) | 186 | if (COMPAT_USE_64BIT_TIME) |
187 | return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; | 187 | return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; |
188 | else | 188 | else |
189 | return __compat_put_timespec(ts, uts); | 189 | return __compat_put_timespec(ts, uts); |
190 | } | 190 | } |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 247979a1b815..acf791c55b71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu) | |||
283 | task_cputime(p, &utime, &stime); | 283 | task_cputime(p, &utime, &stime); |
284 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 284 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
285 | (utime || stime)) | 285 | (utime || stime)) |
286 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 286 | pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", |
287 | "(state = %ld, flags = %x)\n", | ||
288 | p->comm, task_pid_nr(p), cpu, | 287 | p->comm, task_pid_nr(p), cpu, |
289 | p->state, p->flags); | 288 | p->state, p->flags); |
290 | } | 289 | } |
@@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
336 | if (err) { | 335 | if (err) { |
337 | nr_calls--; | 336 | nr_calls--; |
338 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 337 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
339 | printk("%s: attempt to take down CPU %u failed\n", | 338 | pr_warn("%s: attempt to take down CPU %u failed\n", |
340 | __func__, cpu); | 339 | __func__, cpu); |
341 | goto out_release; | 340 | goto out_release; |
342 | } | 341 | } |
343 | 342 | ||
@@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
444 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 443 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
445 | if (ret) { | 444 | if (ret) { |
446 | nr_calls--; | 445 | nr_calls--; |
447 | printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", | 446 | pr_warn("%s: attempt to bring up CPU %u failed\n", |
448 | __func__, cpu); | 447 | __func__, cpu); |
449 | goto out_notify; | 448 | goto out_notify; |
450 | } | 449 | } |
451 | 450 | ||
@@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu) | |||
475 | int err = 0; | 474 | int err = 0; |
476 | 475 | ||
477 | if (!cpu_possible(cpu)) { | 476 | if (!cpu_possible(cpu)) { |
478 | printk(KERN_ERR "can't online cpu %d because it is not " | 477 | pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", |
479 | "configured as may-hotadd at boot time\n", cpu); | 478 | cpu); |
480 | #if defined(CONFIG_IA64) | 479 | #if defined(CONFIG_IA64) |
481 | printk(KERN_ERR "please check additional_cpus= boot " | 480 | pr_err("please check additional_cpus= boot parameter\n"); |
482 | "parameter\n"); | ||
483 | #endif | 481 | #endif |
484 | return -EINVAL; | 482 | return -EINVAL; |
485 | } | 483 | } |
@@ -518,7 +516,7 @@ int disable_nonboot_cpus(void) | |||
518 | */ | 516 | */ |
519 | cpumask_clear(frozen_cpus); | 517 | cpumask_clear(frozen_cpus); |
520 | 518 | ||
521 | printk("Disabling non-boot CPUs ...\n"); | 519 | pr_info("Disabling non-boot CPUs ...\n"); |
522 | for_each_online_cpu(cpu) { | 520 | for_each_online_cpu(cpu) { |
523 | if (cpu == first_cpu) | 521 | if (cpu == first_cpu) |
524 | continue; | 522 | continue; |
@@ -526,8 +524,7 @@ int disable_nonboot_cpus(void) | |||
526 | if (!error) | 524 | if (!error) |
527 | cpumask_set_cpu(cpu, frozen_cpus); | 525 | cpumask_set_cpu(cpu, frozen_cpus); |
528 | else { | 526 | else { |
529 | printk(KERN_ERR "Error taking CPU%d down: %d\n", | 527 | pr_err("Error taking CPU%d down: %d\n", cpu, error); |
530 | cpu, error); | ||
531 | break; | 528 | break; |
532 | } | 529 | } |
533 | } | 530 | } |
@@ -537,7 +534,7 @@ int disable_nonboot_cpus(void) | |||
537 | /* Make sure the CPUs won't be enabled by someone else */ | 534 | /* Make sure the CPUs won't be enabled by someone else */ |
538 | cpu_hotplug_disabled = 1; | 535 | cpu_hotplug_disabled = 1; |
539 | } else { | 536 | } else { |
540 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 537 | pr_err("Non-boot CPUs are not disabled\n"); |
541 | } | 538 | } |
542 | cpu_maps_update_done(); | 539 | cpu_maps_update_done(); |
543 | return error; | 540 | return error; |
@@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void) | |||
561 | if (cpumask_empty(frozen_cpus)) | 558 | if (cpumask_empty(frozen_cpus)) |
562 | goto out; | 559 | goto out; |
563 | 560 | ||
564 | printk(KERN_INFO "Enabling non-boot CPUs ...\n"); | 561 | pr_info("Enabling non-boot CPUs ...\n"); |
565 | 562 | ||
566 | arch_enable_nonboot_cpus_begin(); | 563 | arch_enable_nonboot_cpus_begin(); |
567 | 564 | ||
568 | for_each_cpu(cpu, frozen_cpus) { | 565 | for_each_cpu(cpu, frozen_cpus) { |
569 | error = _cpu_up(cpu, 1); | 566 | error = _cpu_up(cpu, 1); |
570 | if (!error) { | 567 | if (!error) { |
571 | printk(KERN_INFO "CPU%d is up\n", cpu); | 568 | pr_info("CPU%d is up\n", cpu); |
572 | continue; | 569 | continue; |
573 | } | 570 | } |
574 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); | 571 | pr_warn("Error taking CPU%d up: %d\n", cpu, error); |
575 | } | 572 | } |
576 | 573 | ||
577 | arch_enable_nonboot_cpus_end(); | 574 | arch_enable_nonboot_cpus_end(); |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..130017843899 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -61,12 +61,7 @@ | |||
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | #include <linux/wait.h> | 62 | #include <linux/wait.h> |
63 | 63 | ||
64 | /* | 64 | struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; |
65 | * Tracks how many cpusets are currently defined in system. | ||
66 | * When there is only one cpuset (the root cpuset) we can | ||
67 | * short circuit some hooks. | ||
68 | */ | ||
69 | int number_of_cpusets __read_mostly; | ||
70 | 65 | ||
71 | /* See "Frequency meter" comments, below. */ | 66 | /* See "Frequency meter" comments, below. */ |
72 | 67 | ||
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
611 | goto done; | 606 | goto done; |
612 | } | 607 | } |
613 | 608 | ||
614 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 609 | csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); |
615 | if (!csa) | 610 | if (!csa) |
616 | goto done; | 611 | goto done; |
617 | csn = 0; | 612 | csn = 0; |
@@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
1888 | if (is_spread_slab(parent)) | 1883 | if (is_spread_slab(parent)) |
1889 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1884 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1890 | 1885 | ||
1891 | number_of_cpusets++; | 1886 | cpuset_inc(); |
1892 | 1887 | ||
1893 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1888 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1894 | goto out_unlock; | 1889 | goto out_unlock; |
@@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) | |||
1939 | if (is_sched_load_balance(cs)) | 1934 | if (is_sched_load_balance(cs)) |
1940 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1935 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
1941 | 1936 | ||
1942 | number_of_cpusets--; | 1937 | cpuset_dec(); |
1943 | clear_bit(CS_ONLINE, &cs->flags); | 1938 | clear_bit(CS_ONLINE, &cs->flags); |
1944 | 1939 | ||
1945 | mutex_unlock(&cpuset_mutex); | 1940 | mutex_unlock(&cpuset_mutex); |
@@ -1992,7 +1987,6 @@ int __init cpuset_init(void) | |||
1992 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | 1987 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) |
1993 | BUG(); | 1988 | BUG(); |
1994 | 1989 | ||
1995 | number_of_cpusets = 1; | ||
1996 | return 0; | 1990 | return 0; |
1997 | } | 1991 | } |
1998 | 1992 | ||
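
cpusets_enabled_key is a jump-label key, so the hot-path "are there any non-root cpusets?" checks in the allocator become statically patched branches instead of a global counter read. The cpuset_inc()/cpuset_dec()/nr_cpusets() helpers used above are presumably thin wrappers in include/linux/cpuset.h along these lines (assumed; the header side is not in this hunk):

	extern struct static_key cpusets_enabled_key;

	static inline bool cpusets_enabled(void)
	{
		return static_key_false(&cpusets_enabled_key);
	}

	static inline int nr_cpusets(void)
	{
		/* jump-label reference count + the ever-present root cpuset */
		return atomic_read(&cpusets_enabled_key.enabled) + 1;
	}

	static inline void cpuset_inc(void)
	{
		static_key_slow_inc(&cpusets_enabled_key);
	}

	static inline void cpuset_dec(void)
	{
		static_key_slow_dec(&cpusets_enabled_key);
	}
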
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
@@ -21,7 +21,7 @@ | |||
21 | static void kdb_show_stack(struct task_struct *p, void *addr) | 21 | static void kdb_show_stack(struct task_struct *p, void *addr) |
22 | { | 22 | { |
23 | int old_lvl = console_loglevel; | 23 | int old_lvl = console_loglevel; |
24 | console_loglevel = 15; | 24 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
25 | kdb_trap_printk++; | 25 | kdb_trap_printk++; |
26 | kdb_set_current_task(p); | 26 | kdb_set_current_task(p); |
27 | if (addr) { | 27 | if (addr) { |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -710,7 +710,7 @@ kdb_printit: | |||
710 | } | 710 | } |
711 | if (logging) { | 711 | if (logging) { |
712 | saved_loglevel = console_loglevel; | 712 | saved_loglevel = console_loglevel; |
713 | console_loglevel = 0; | 713 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; |
714 | printk(KERN_INFO "%s", kdb_buffer); | 714 | printk(KERN_INFO "%s", kdb_buffer); |
715 | } | 715 | } |
716 | 716 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) | |||
1091 | static void kdb_dumpregs(struct pt_regs *regs) | 1091 | static void kdb_dumpregs(struct pt_regs *regs) |
1092 | { | 1092 | { |
1093 | int old_lvl = console_loglevel; | 1093 | int old_lvl = console_loglevel; |
1094 | console_loglevel = 15; | 1094 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
1095 | kdb_trap_printk++; | 1095 | kdb_trap_printk++; |
1096 | show_regs(regs); | 1096 | show_regs(regs); |
1097 | kdb_trap_printk--; | 1097 | kdb_trap_printk--; |
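
The bare loglevel numbers used by kdb here (and by init/main.c earlier) are replaced by symbolic names that this series adds to <linux/printk.h>. For orientation, the mapping is roughly:

	#define CONSOLE_LOGLEVEL_SILENT      0   /* nothing at all */
	#define CONSOLE_LOGLEVEL_MIN         1   /* minimum we let people use */
	#define CONSOLE_LOGLEVEL_QUIET       4   /* booted with "quiet" */
	#define CONSOLE_LOGLEVEL_DEFAULT     7   /* anything more serious than KERN_DEBUG */
	#define CONSOLE_LOGLEVEL_DEBUG      10   /* issue debug messages */
	#define CONSOLE_LOGLEVEL_MOTORMOUTH 15   /* everything, used for kdb register/stack dumps */
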
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { | |||
37 | struct exec_domain default_exec_domain = { | 37 | struct exec_domain default_exec_domain = { |
38 | .name = "Linux", /* name */ | 38 | .name = "Linux", /* name */ |
39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ |
40 | .pers_low = 0, /* PER_LINUX personality. */ | 40 | .pers_low = 0, /* PER_LINUX personality. */ |
41 | .pers_high = 0, /* PER_LINUX personality. */ | 41 | .pers_high = 0, /* PER_LINUX personality. */ |
42 | .signal_map = ident_map, /* Identity map signals. */ | 42 | .signal_map = ident_map, /* Identity map signals. */ |
43 | .signal_invmap = ident_map, /* - both ways. */ | 43 | .signal_invmap = ident_map, /* - both ways. */ |
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) | |||
83 | ep = &default_exec_domain; | 83 | ep = &default_exec_domain; |
84 | out: | 84 | out: |
85 | read_unlock(&exec_domains_lock); | 85 | read_unlock(&exec_domains_lock); |
86 | return (ep); | 86 | return ep; |
87 | } | 87 | } |
88 | 88 | ||
89 | int | 89 | int |
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) | |||
110 | 110 | ||
111 | out: | 111 | out: |
112 | write_unlock(&exec_domains_lock); | 112 | write_unlock(&exec_domains_lock); |
113 | return (err); | 113 | return err; |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(register_exec_domain); | ||
115 | 116 | ||
116 | int | 117 | int |
117 | unregister_exec_domain(struct exec_domain *ep) | 118 | unregister_exec_domain(struct exec_domain *ep) |
@@ -133,6 +134,7 @@ unregister: | |||
133 | write_unlock(&exec_domains_lock); | 134 | write_unlock(&exec_domains_lock); |
134 | return 0; | 135 | return 0; |
135 | } | 136 | } |
137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
136 | 138 | ||
137 | int __set_personality(unsigned int personality) | 139 | int __set_personality(unsigned int personality) |
138 | { | 140 | { |
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) | |||
144 | 146 | ||
145 | return 0; | 147 | return 0; |
146 | } | 148 | } |
149 | EXPORT_SYMBOL(__set_personality); | ||
147 | 150 | ||
148 | #ifdef CONFIG_PROC_FS | 151 | #ifdef CONFIG_PROC_FS |
149 | static int execdomains_proc_show(struct seq_file *m, void *v) | 152 | static int execdomains_proc_show(struct seq_file *m, void *v) |
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) | |||
188 | 191 | ||
189 | return old; | 192 | return old; |
190 | } | 193 | } |
191 | |||
192 | |||
193 | EXPORT_SYMBOL(register_exec_domain); | ||
194 | EXPORT_SYMBOL(unregister_exec_domain); | ||
195 | EXPORT_SYMBOL(__set_personality); | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..750c2e594617 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -352,7 +352,7 @@ int disallow_signal(int sig) | |||
352 | 352 | ||
353 | EXPORT_SYMBOL(disallow_signal); | 353 | EXPORT_SYMBOL(disallow_signal); |
354 | 354 | ||
355 | #ifdef CONFIG_MM_OWNER | 355 | #ifdef CONFIG_MEMCG |
356 | /* | 356 | /* |
357 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 357 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
358 | */ | 358 | */ |
@@ -395,14 +395,18 @@ retry: | |||
395 | } | 395 | } |
396 | 396 | ||
397 | /* | 397 | /* |
398 | * Search through everything else. We should not get | 398 | * Search through everything else, we should not get here often. |
399 | * here often | ||
400 | */ | 399 | */ |
401 | do_each_thread(g, c) { | 400 | for_each_process(g) { |
402 | if (c->mm == mm) | 401 | if (g->flags & PF_KTHREAD) |
403 | goto assign_new_owner; | 402 | continue; |
404 | } while_each_thread(g, c); | 403 | for_each_thread(g, c) { |
405 | 404 | if (c->mm == mm) | |
405 | goto assign_new_owner; | ||
406 | if (c->mm) | ||
407 | break; | ||
408 | } | ||
409 | } | ||
406 | read_unlock(&tasklist_lock); | 410 | read_unlock(&tasklist_lock); |
407 | /* | 411 | /* |
408 | * We found no owner yet mm_users > 1: this implies that we are | 412 | * We found no owner yet mm_users > 1: this implies that we are |
@@ -434,7 +438,7 @@ assign_new_owner: | |||
434 | task_unlock(c); | 438 | task_unlock(c); |
435 | put_task_struct(c); | 439 | put_task_struct(c); |
436 | } | 440 | } |
437 | #endif /* CONFIG_MM_OWNER */ | 441 | #endif /* CONFIG_MEMCG */ |
438 | 442 | ||
439 | /* | 443 | /* |
440 | * Turn us into a lazy TLB process if we | 444 | * Turn us into a lazy TLB process if we |
diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..0d53eb0dfb6f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) | |||
150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
151 | int node) | 151 | int node) |
152 | { | 152 | { |
153 | struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, | 153 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, |
154 | THREAD_SIZE_ORDER); | 154 | THREAD_SIZE_ORDER); |
155 | 155 | ||
156 | return page ? page_address(page) : NULL; | 156 | return page ? page_address(page) : NULL; |
157 | } | 157 | } |
158 | 158 | ||
159 | static inline void free_thread_info(struct thread_info *ti) | 159 | static inline void free_thread_info(struct thread_info *ti) |
160 | { | 160 | { |
161 | free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 161 | free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
162 | } | 162 | } |
163 | # else | 163 | # else |
164 | static struct kmem_cache *thread_info_cache; | 164 | static struct kmem_cache *thread_info_cache; |
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1099 | #endif | 1099 | #endif |
1100 | } | 1100 | } |
1101 | 1101 | ||
1102 | #ifdef CONFIG_MM_OWNER | 1102 | #ifdef CONFIG_MEMCG |
1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
1104 | { | 1104 | { |
1105 | mm->owner = p; | 1105 | mm->owner = p; |
1106 | } | 1106 | } |
1107 | #endif /* CONFIG_MM_OWNER */ | 1107 | #endif /* CONFIG_MEMCG */ |
1108 | 1108 | ||
1109 | /* | 1109 | /* |
1110 | * Initialize POSIX timer handling for a single task. | 1110 | * Initialize POSIX timer handling for a single task. |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = | |||
52 | 52 | ||
53 | static int __init hung_task_panic_setup(char *str) | 53 | static int __init hung_task_panic_setup(char *str) |
54 | { | 54 | { |
55 | sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); | 55 | int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); |
56 | 56 | ||
57 | if (rc) | ||
58 | return rc; | ||
57 | return 1; | 59 | return 1; |
58 | } | 60 | } |
59 | __setup("hung_task_panic=", hung_task_panic_setup); | 61 | __setup("hung_task_panic=", hung_task_panic_setup); |
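The simple_strtoul() to kstrtouint() conversion above is the same pattern applied throughout this series: parse with a checked kstrto<type>() helper and propagate the error instead of silently accepting malformed input. A hedged sketch of the idiom (example_param and example_threshold are made-up names):

#include <linux/kernel.h>
#include <linux/init.h>

static unsigned int example_threshold;

static int __init example_param_setup(char *str)
{
	int rc = kstrtouint(str, 0, &example_threshold);

	if (rc)
		return rc;	/* reject malformed values instead of guessing */
	return 1;		/* 1 tells the boot code the option was handled */
}
__setup("example_param=", example_param_setup);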
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
262 | * kthread_stop() has been called). The return value should be zero | 262 | * kthread_stop() has been called). The return value should be zero |
263 | * or a negative error number; it will be passed to kthread_stop(). | 263 | * or a negative error number; it will be passed to kthread_stop(). |
264 | * | 264 | * |
265 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 265 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). |
266 | */ | 266 | */ |
267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
268 | void *data, int node, | 268 | void *data, int node, |
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
298 | * that thread. | 298 | * that thread. |
299 | */ | 299 | */ |
300 | if (xchg(&create->done, NULL)) | 300 | if (xchg(&create->done, NULL)) |
301 | return ERR_PTR(-ENOMEM); | 301 | return ERR_PTR(-EINTR); |
302 | /* | 302 | /* |
303 | * kthreadd (or new kernel thread) will call complete() | 303 | * kthreadd (or new kernel thread) will call complete() |
304 | * shortly. | 304 | * shortly. |
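With the hunk above, kthread_create_on_node() (and the kthread_create() wrapper built on it) can also return ERR_PTR(-EINTR), e.g. when the requesting task is interrupted while waiting for kthreadd, instead of reporting that case as -ENOMEM. Existing IS_ERR() checks already cover it; a hedged caller sketch (my_worker, my_thread_fn and start_worker are illustrative names):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop())
		msleep(100);		/* placeholder work loop */
	return 0;
}

static int start_worker(void)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "my_worker");
	if (IS_ERR(tsk))		/* may now be -EINTR as well as -ENOMEM */
		return PTR_ERR(tsk);
	wake_up_process(tsk);
	return 0;
}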
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) | |||
88 | } | 88 | } |
89 | 89 | ||
90 | static void __sched | 90 | static void __sched |
91 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | 91 | account_global_scheduler_latency(struct task_struct *tsk, |
92 | struct latency_record *lat) | ||
92 | { | 93 | { |
93 | int firstnonnull = MAXLR + 1; | 94 | int firstnonnull = MAXLR + 1; |
94 | int i; | 95 | int i; |
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) | |||
255 | break; | 256 | break; |
256 | seq_printf(m, " %ps", (void *)bt); | 257 | seq_printf(m, " %ps", (void *)bt); |
257 | } | 258 | } |
258 | seq_printf(m, "\n"); | 259 | seq_puts(m, "\n"); |
259 | } | 260 | } |
260 | } | 261 | } |
261 | return 0; | 262 | return 0; |
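The seq_printf() to seq_puts() switch above follows the usual checkpatch advice that a constant string with no conversion specifiers should be emitted with seq_puts(). A minimal sketch; example_show is a made-up name:

#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "latency %d usec", 42);	/* conversion needed: seq_printf() */
	seq_puts(m, "\n");			/* constant string: seq_puts() */
	return 0;
}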
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 221229cf0190..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -54,20 +54,16 @@ | |||
54 | #include "console_cmdline.h" | 54 | #include "console_cmdline.h" |
55 | #include "braille.h" | 55 | #include "braille.h" |
56 | 56 | ||
57 | /* printk's without a loglevel use this.. */ | ||
58 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | ||
59 | |||
60 | /* We show everything that is MORE important than this.. */ | ||
61 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | ||
62 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ | ||
63 | |||
64 | int console_printk[4] = { | 57 | int console_printk[4] = { |
65 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ | 58 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ |
66 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 59 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ |
67 | MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ | 60 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ |
68 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 61 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
69 | }; | 62 | }; |
70 | 63 | ||
64 | /* Deferred messages from sched code are marked by this special level */ | ||
65 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
66 | |||
71 | /* | 67 | /* |
72 | * Low level drivers may need that to know if they can schedule in | 68 | * Low level drivers may need that to know if they can schedule in |
73 | * their unblank() callback or not. So let's export it. | 69 | * their unblank() callback or not. So let's export it. |
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = { | |||
91 | #endif | 87 | #endif |
92 | 88 | ||
93 | /* | 89 | /* |
90 | * Helper macros to handle lockdep when locking/unlocking console_sem. We use | ||
91 | * macros instead of functions so that _RET_IP_ contains useful information. | ||
92 | */ | ||
93 | #define down_console_sem() do { \ | ||
94 | down(&console_sem);\ | ||
95 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ | ||
96 | } while (0) | ||
97 | |||
98 | static int __down_trylock_console_sem(unsigned long ip) | ||
99 | { | ||
100 | if (down_trylock(&console_sem)) | ||
101 | return 1; | ||
102 | mutex_acquire(&console_lock_dep_map, 0, 1, ip); | ||
103 | return 0; | ||
104 | } | ||
105 | #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) | ||
106 | |||
107 | #define up_console_sem() do { \ | ||
108 | mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ | ||
109 | up(&console_sem);\ | ||
110 | } while (0) | ||
111 | |||
112 | /* | ||
94 | * This is used for debugging the mess that is the VT code by | 113 | * This is used for debugging the mess that is the VT code by |
95 | * keeping track if we have the console semaphore held. It's | 114 | * keeping track if we have the console semaphore held. It's |
96 | * definitely not the perfect debug tool (we don't know if _WE_ | 115 | * definitely not the perfect debug tool (we don't know if _WE_ |
@@ -206,8 +225,9 @@ struct printk_log { | |||
206 | }; | 225 | }; |
207 | 226 | ||
208 | /* | 227 | /* |
209 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | 228 | * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken |
210 | * used in interesting ways to provide interlocking in console_unlock(); | 229 | * within the scheduler's rq lock. It must be released before calling |
230 | * console_unlock() or anything else that might wake up a process. | ||
211 | */ | 231 | */ |
212 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | 232 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
213 | 233 | ||
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | |||
250 | static char *log_buf = __log_buf; | 270 | static char *log_buf = __log_buf; |
251 | static u32 log_buf_len = __LOG_BUF_LEN; | 271 | static u32 log_buf_len = __LOG_BUF_LEN; |
252 | 272 | ||
253 | /* cpu currently holding logbuf_lock */ | ||
254 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
255 | |||
256 | /* human readable text of the record */ | 273 | /* human readable text of the record */ |
257 | static char *log_text(const struct printk_log *msg) | 274 | static char *log_text(const struct printk_log *msg) |
258 | { | 275 | { |
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx) | |||
297 | return idx + msg->len; | 314 | return idx + msg->len; |
298 | } | 315 | } |
299 | 316 | ||
300 | /* insert record into the buffer, discard old ones, update heads */ | 317 | /* |
301 | static void log_store(int facility, int level, | 318 | * Check whether there is enough free space for the given message. |
302 | enum log_flags flags, u64 ts_nsec, | 319 | * |
303 | const char *dict, u16 dict_len, | 320 | * The same values of first_idx and next_idx mean that the buffer |
304 | const char *text, u16 text_len) | 321 | * is either empty or full. |
322 | * | ||
323 | * If the buffer is empty, we must respect the position of the indexes. | ||
324 | * They cannot be reset to the beginning of the buffer. | ||
325 | */ | ||
326 | static int logbuf_has_space(u32 msg_size, bool empty) | ||
305 | { | 327 | { |
306 | struct printk_log *msg; | 328 | u32 free; |
307 | u32 size, pad_len; | ||
308 | 329 | ||
309 | /* number of '\0' padding bytes to next message */ | 330 | if (log_next_idx > log_first_idx || empty) |
310 | size = sizeof(struct printk_log) + text_len + dict_len; | 331 | free = max(log_buf_len - log_next_idx, log_first_idx); |
311 | pad_len = (-size) & (LOG_ALIGN - 1); | 332 | else |
312 | size += pad_len; | 333 | free = log_first_idx - log_next_idx; |
313 | 334 | ||
335 | /* | ||
336 | * We also need space for an empty header that signals wrapping | ||
337 | * of the buffer. | ||
338 | */ | ||
339 | return free >= msg_size + sizeof(struct printk_log); | ||
340 | } | ||
341 | |||
342 | static int log_make_free_space(u32 msg_size) | ||
343 | { | ||
314 | while (log_first_seq < log_next_seq) { | 344 | while (log_first_seq < log_next_seq) { |
315 | u32 free; | 345 | if (logbuf_has_space(msg_size, false)) |
346 | return 0; | ||
347 | /* drop old messages until we have enough continuous space */ | ||
348 | log_first_idx = log_next(log_first_idx); | ||
349 | log_first_seq++; | ||
350 | } | ||
316 | 351 | ||
317 | if (log_next_idx > log_first_idx) | 352 | /* sequence numbers are equal, so the log buffer is empty */ |
318 | free = max(log_buf_len - log_next_idx, log_first_idx); | 353 | if (logbuf_has_space(msg_size, true)) |
319 | else | 354 | return 0; |
320 | free = log_first_idx - log_next_idx; | ||
321 | 355 | ||
322 | if (free >= size + sizeof(struct printk_log)) | 356 | return -ENOMEM; |
323 | break; | 357 | } |
324 | 358 | ||
325 | /* drop old messages until we have enough contiuous space */ | 359 | /* compute the message size including the padding bytes */ |
326 | log_first_idx = log_next(log_first_idx); | 360 | static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) |
327 | log_first_seq++; | 361 | { |
362 | u32 size; | ||
363 | |||
364 | size = sizeof(struct printk_log) + text_len + dict_len; | ||
365 | *pad_len = (-size) & (LOG_ALIGN - 1); | ||
366 | size += *pad_len; | ||
367 | |||
368 | return size; | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * Define how much of the log buffer we could take at maximum. The value | ||
373 | * must be greater than two. Note that only half of the buffer is available | ||
374 | * when the index points to the middle. | ||
375 | */ | ||
376 | #define MAX_LOG_TAKE_PART 4 | ||
377 | static const char trunc_msg[] = "<truncated>"; | ||
378 | |||
379 | static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, | ||
380 | u16 *dict_len, u32 *pad_len) | ||
381 | { | ||
382 | /* | ||
383 | * The message should not take the whole buffer. Otherwise, it might | ||
384 | * get removed too soon. | ||
385 | */ | ||
386 | u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; | ||
387 | if (*text_len > max_text_len) | ||
388 | *text_len = max_text_len; | ||
389 | /* enable the warning message */ | ||
390 | *trunc_msg_len = strlen(trunc_msg); | ||
391 | /* disable the "dict" completely */ | ||
392 | *dict_len = 0; | ||
393 | /* compute the size again, count also the warning message */ | ||
394 | return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); | ||
395 | } | ||
396 | |||
397 | /* insert record into the buffer, discard old ones, update heads */ | ||
398 | static int log_store(int facility, int level, | ||
399 | enum log_flags flags, u64 ts_nsec, | ||
400 | const char *dict, u16 dict_len, | ||
401 | const char *text, u16 text_len) | ||
402 | { | ||
403 | struct printk_log *msg; | ||
404 | u32 size, pad_len; | ||
405 | u16 trunc_msg_len = 0; | ||
406 | |||
407 | /* number of '\0' padding bytes to next message */ | ||
408 | size = msg_used_size(text_len, dict_len, &pad_len); | ||
409 | |||
410 | if (log_make_free_space(size)) { | ||
411 | /* truncate the message if it is too long for empty buffer */ | ||
412 | size = truncate_msg(&text_len, &trunc_msg_len, | ||
413 | &dict_len, &pad_len); | ||
414 | /* survive when the log buffer is too small for trunc_msg */ | ||
415 | if (log_make_free_space(size)) | ||
416 | return 0; | ||
328 | } | 417 | } |
329 | 418 | ||
330 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { | 419 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { |
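The new logbuf_has_space()/log_make_free_space() pair centralizes the ring-buffer space accounting that log_store() used to open-code, and truncate_msg() lets an over-long message be stored in shortened form instead of being dropped. The core free-space rule, as a stand-alone sketch (contiguous_free is an illustrative name):

#include <linux/kernel.h>	/* max() */
#include <linux/types.h>

static u32 contiguous_free(u32 first_idx, u32 next_idx, u32 buf_len, bool empty)
{
	/*
	 * Writer ahead of (or level with, when empty) the reader: a record
	 * must be contiguous, so only the larger of the two gaps (tail of
	 * the buffer vs. its beginning) is usable for one message.
	 */
	if (next_idx > first_idx || empty)
		return max(buf_len - next_idx, first_idx);
	/* writer has wrapped: only the gap up to the reader is free */
	return first_idx - next_idx;
}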
@@ -341,6 +430,10 @@ static void log_store(int facility, int level, | |||
341 | msg = (struct printk_log *)(log_buf + log_next_idx); | 430 | msg = (struct printk_log *)(log_buf + log_next_idx); |
342 | memcpy(log_text(msg), text, text_len); | 431 | memcpy(log_text(msg), text, text_len); |
343 | msg->text_len = text_len; | 432 | msg->text_len = text_len; |
433 | if (trunc_msg_len) { | ||
434 | memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); | ||
435 | msg->text_len += trunc_msg_len; | ||
436 | } | ||
344 | memcpy(log_dict(msg), dict, dict_len); | 437 | memcpy(log_dict(msg), dict, dict_len); |
345 | msg->dict_len = dict_len; | 438 | msg->dict_len = dict_len; |
346 | msg->facility = facility; | 439 | msg->facility = facility; |
@@ -356,6 +449,8 @@ static void log_store(int facility, int level, | |||
356 | /* insert message */ | 449 | /* insert message */ |
357 | log_next_idx += msg->len; | 450 | log_next_idx += msg->len; |
358 | log_next_seq++; | 451 | log_next_seq++; |
452 | |||
453 | return msg->text_len; | ||
359 | } | 454 | } |
360 | 455 | ||
361 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 456 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT |
@@ -1303,7 +1398,10 @@ static void zap_locks(void) | |||
1303 | sema_init(&console_sem, 1); | 1398 | sema_init(&console_sem, 1); |
1304 | } | 1399 | } |
1305 | 1400 | ||
1306 | /* Check if we have any console registered that can be called early in boot. */ | 1401 | /* |
1402 | * Check if we have any console that is capable of printing while cpu is | ||
1403 | * booting or shutting down. Requires console_sem. | ||
1404 | */ | ||
1307 | static int have_callable_console(void) | 1405 | static int have_callable_console(void) |
1308 | { | 1406 | { |
1309 | struct console *con; | 1407 | struct console *con; |
@@ -1318,10 +1416,9 @@ static int have_callable_console(void) | |||
1318 | /* | 1416 | /* |
1319 | * Can we actually use the console at this time on this cpu? | 1417 | * Can we actually use the console at this time on this cpu? |
1320 | * | 1418 | * |
1321 | * Console drivers may assume that per-cpu resources have | 1419 | * Console drivers may assume that per-cpu resources have been allocated. So |
1322 | * been allocated. So unless they're explicitly marked as | 1420 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't |
1323 | * being able to cope (CON_ANYTIME) don't call them until | 1421 | * call them until this CPU is officially up. |
1324 | * this CPU is officially up. | ||
1325 | */ | 1422 | */ |
1326 | static inline int can_use_console(unsigned int cpu) | 1423 | static inline int can_use_console(unsigned int cpu) |
1327 | { | 1424 | { |
@@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu) | |||
1333 | * messages from a 'printk'. Return true (and with the | 1430 | * messages from a 'printk'. Return true (and with the |
1334 | * console_lock held, and 'console_locked' set) if it | 1431 | * console_lock held, and 'console_locked' set) if it |
1335 | * is successful, false otherwise. | 1432 | * is successful, false otherwise. |
1336 | * | ||
1337 | * This gets called with the 'logbuf_lock' spinlock held and | ||
1338 | * interrupts disabled. It should return with 'lockbuf_lock' | ||
1339 | * released but interrupts still disabled. | ||
1340 | */ | 1433 | */ |
1341 | static int console_trylock_for_printk(unsigned int cpu) | 1434 | static int console_trylock_for_printk(void) |
1342 | __releases(&logbuf_lock) | ||
1343 | { | 1435 | { |
1344 | int retval = 0, wake = 0; | 1436 | unsigned int cpu = smp_processor_id(); |
1345 | |||
1346 | if (console_trylock()) { | ||
1347 | retval = 1; | ||
1348 | 1437 | ||
1349 | /* | 1438 | if (!console_trylock()) |
1350 | * If we can't use the console, we need to release | 1439 | return 0; |
1351 | * the console semaphore by hand to avoid flushing | 1440 | /* |
1352 | * the buffer. We need to hold the console semaphore | 1441 | * If we can't use the console, we need to release the console |
1353 | * in order to do this test safely. | 1442 | * semaphore by hand to avoid flushing the buffer. We need to hold the |
1354 | */ | 1443 | * console semaphore in order to do this test safely. |
1355 | if (!can_use_console(cpu)) { | 1444 | */ |
1356 | console_locked = 0; | 1445 | if (!can_use_console(cpu)) { |
1357 | wake = 1; | 1446 | console_locked = 0; |
1358 | retval = 0; | 1447 | up_console_sem(); |
1359 | } | 1448 | return 0; |
1360 | } | 1449 | } |
1361 | logbuf_cpu = UINT_MAX; | 1450 | return 1; |
1362 | raw_spin_unlock(&logbuf_lock); | ||
1363 | if (wake) | ||
1364 | up(&console_sem); | ||
1365 | return retval; | ||
1366 | } | 1451 | } |
1367 | 1452 | ||
1368 | int printk_delay_msec __read_mostly; | 1453 | int printk_delay_msec __read_mostly; |
@@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1490 | static int recursion_bug; | 1575 | static int recursion_bug; |
1491 | static char textbuf[LOG_LINE_MAX]; | 1576 | static char textbuf[LOG_LINE_MAX]; |
1492 | char *text = textbuf; | 1577 | char *text = textbuf; |
1493 | size_t text_len; | 1578 | size_t text_len = 0; |
1494 | enum log_flags lflags = 0; | 1579 | enum log_flags lflags = 0; |
1495 | unsigned long flags; | 1580 | unsigned long flags; |
1496 | int this_cpu; | 1581 | int this_cpu; |
1497 | int printed_len = 0; | 1582 | int printed_len = 0; |
1583 | bool in_sched = false; | ||
1584 | /* cpu currently holding logbuf_lock in this function */ | ||
1585 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
1586 | |||
1587 | if (level == SCHED_MESSAGE_LOGLEVEL) { | ||
1588 | level = -1; | ||
1589 | in_sched = true; | ||
1590 | } | ||
1498 | 1591 | ||
1499 | boot_delay_msec(level); | 1592 | boot_delay_msec(level); |
1500 | printk_delay(); | 1593 | printk_delay(); |
@@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1516 | */ | 1609 | */ |
1517 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1610 | if (!oops_in_progress && !lockdep_recursing(current)) { |
1518 | recursion_bug = 1; | 1611 | recursion_bug = 1; |
1519 | goto out_restore_irqs; | 1612 | local_irq_restore(flags); |
1613 | return 0; | ||
1520 | } | 1614 | } |
1521 | zap_locks(); | 1615 | zap_locks(); |
1522 | } | 1616 | } |
@@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1530 | "BUG: recent printk recursion!"; | 1624 | "BUG: recent printk recursion!"; |
1531 | 1625 | ||
1532 | recursion_bug = 0; | 1626 | recursion_bug = 0; |
1533 | printed_len += strlen(recursion_msg); | 1627 | text_len = strlen(recursion_msg); |
1534 | /* emit KERN_CRIT message */ | 1628 | /* emit KERN_CRIT message */ |
1535 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | 1629 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
1536 | NULL, 0, recursion_msg, printed_len); | 1630 | NULL, 0, recursion_msg, text_len); |
1537 | } | 1631 | } |
1538 | 1632 | ||
1539 | /* | 1633 | /* |
1540 | * The printf needs to come first; we need the syslog | 1634 | * The printf needs to come first; we need the syslog |
1541 | * prefix which might be passed-in as a parameter. | 1635 | * prefix which might be passed-in as a parameter. |
1542 | */ | 1636 | */ |
1543 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | 1637 | if (in_sched) |
1638 | text_len = scnprintf(text, sizeof(textbuf), | ||
1639 | KERN_WARNING "[sched_delayed] "); | ||
1640 | |||
1641 | text_len += vscnprintf(text + text_len, | ||
1642 | sizeof(textbuf) - text_len, fmt, args); | ||
1544 | 1643 | ||
1545 | /* mark and strip a trailing newline */ | 1644 | /* mark and strip a trailing newline */ |
1546 | if (text_len && text[text_len-1] == '\n') { | 1645 | if (text_len && text[text_len-1] == '\n') { |
@@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1586 | cont_flush(LOG_NEWLINE); | 1685 | cont_flush(LOG_NEWLINE); |
1587 | 1686 | ||
1588 | /* buffer line if possible, otherwise store it right away */ | 1687 | /* buffer line if possible, otherwise store it right away */ |
1589 | if (!cont_add(facility, level, text, text_len)) | 1688 | if (cont_add(facility, level, text, text_len)) |
1590 | log_store(facility, level, lflags | LOG_CONT, 0, | 1689 | printed_len += text_len; |
1591 | dict, dictlen, text, text_len); | 1690 | else |
1691 | printed_len += log_store(facility, level, | ||
1692 | lflags | LOG_CONT, 0, | ||
1693 | dict, dictlen, text, text_len); | ||
1592 | } else { | 1694 | } else { |
1593 | bool stored = false; | 1695 | bool stored = false; |
1594 | 1696 | ||
@@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1607 | cont_flush(LOG_NEWLINE); | 1709 | cont_flush(LOG_NEWLINE); |
1608 | } | 1710 | } |
1609 | 1711 | ||
1610 | if (!stored) | 1712 | if (stored) |
1611 | log_store(facility, level, lflags, 0, | 1713 | printed_len += text_len; |
1612 | dict, dictlen, text, text_len); | 1714 | else |
1715 | printed_len += log_store(facility, level, lflags, 0, | ||
1716 | dict, dictlen, text, text_len); | ||
1613 | } | 1717 | } |
1614 | printed_len += text_len; | ||
1615 | 1718 | ||
1719 | logbuf_cpu = UINT_MAX; | ||
1720 | raw_spin_unlock(&logbuf_lock); | ||
1721 | lockdep_on(); | ||
1722 | local_irq_restore(flags); | ||
1723 | |||
1724 | /* If called from the scheduler, we can not call up(). */ | ||
1725 | if (in_sched) | ||
1726 | return printed_len; | ||
1727 | |||
1728 | /* | ||
1729 | * Disable preemption to avoid being preempted while holding | ||
1730 | * console_sem which would prevent anyone from printing to console | ||
1731 | */ | ||
1732 | preempt_disable(); | ||
1616 | /* | 1733 | /* |
1617 | * Try to acquire and then immediately release the console semaphore. | 1734 | * Try to acquire and then immediately release the console semaphore. |
1618 | * The release will print out buffers and wake up /dev/kmsg and syslog() | 1735 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
1619 | * users. | 1736 | * users. |
1620 | * | ||
1621 | * The console_trylock_for_printk() function will release 'logbuf_lock' | ||
1622 | * regardless of whether it actually gets the console semaphore or not. | ||
1623 | */ | 1737 | */ |
1624 | if (console_trylock_for_printk(this_cpu)) | 1738 | if (console_trylock_for_printk()) |
1625 | console_unlock(); | 1739 | console_unlock(); |
1626 | 1740 | preempt_enable(); | |
1627 | lockdep_on(); | ||
1628 | out_restore_irqs: | ||
1629 | local_irq_restore(flags); | ||
1630 | 1741 | ||
1631 | return printed_len; | 1742 | return printed_len; |
1632 | } | 1743 | } |
@@ -1882,16 +1993,14 @@ void suspend_console(void) | |||
1882 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1993 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
1883 | console_lock(); | 1994 | console_lock(); |
1884 | console_suspended = 1; | 1995 | console_suspended = 1; |
1885 | up(&console_sem); | 1996 | up_console_sem(); |
1886 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
1887 | } | 1997 | } |
1888 | 1998 | ||
1889 | void resume_console(void) | 1999 | void resume_console(void) |
1890 | { | 2000 | { |
1891 | if (!console_suspend_enabled) | 2001 | if (!console_suspend_enabled) |
1892 | return; | 2002 | return; |
1893 | down(&console_sem); | 2003 | down_console_sem(); |
1894 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1895 | console_suspended = 0; | 2004 | console_suspended = 0; |
1896 | console_unlock(); | 2005 | console_unlock(); |
1897 | } | 2006 | } |
@@ -1933,12 +2042,11 @@ void console_lock(void) | |||
1933 | { | 2042 | { |
1934 | might_sleep(); | 2043 | might_sleep(); |
1935 | 2044 | ||
1936 | down(&console_sem); | 2045 | down_console_sem(); |
1937 | if (console_suspended) | 2046 | if (console_suspended) |
1938 | return; | 2047 | return; |
1939 | console_locked = 1; | 2048 | console_locked = 1; |
1940 | console_may_schedule = 1; | 2049 | console_may_schedule = 1; |
1941 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1942 | } | 2050 | } |
1943 | EXPORT_SYMBOL(console_lock); | 2051 | EXPORT_SYMBOL(console_lock); |
1944 | 2052 | ||
@@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock); | |||
1952 | */ | 2060 | */ |
1953 | int console_trylock(void) | 2061 | int console_trylock(void) |
1954 | { | 2062 | { |
1955 | if (down_trylock(&console_sem)) | 2063 | if (down_trylock_console_sem()) |
1956 | return 0; | 2064 | return 0; |
1957 | if (console_suspended) { | 2065 | if (console_suspended) { |
1958 | up(&console_sem); | 2066 | up_console_sem(); |
1959 | return 0; | 2067 | return 0; |
1960 | } | 2068 | } |
1961 | console_locked = 1; | 2069 | console_locked = 1; |
1962 | console_may_schedule = 0; | 2070 | console_may_schedule = 0; |
1963 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1964 | return 1; | 2071 | return 1; |
1965 | } | 2072 | } |
1966 | EXPORT_SYMBOL(console_trylock); | 2073 | EXPORT_SYMBOL(console_trylock); |
@@ -2022,7 +2129,7 @@ void console_unlock(void) | |||
2022 | bool retry; | 2129 | bool retry; |
2023 | 2130 | ||
2024 | if (console_suspended) { | 2131 | if (console_suspended) { |
2025 | up(&console_sem); | 2132 | up_console_sem(); |
2026 | return; | 2133 | return; |
2027 | } | 2134 | } |
2028 | 2135 | ||
@@ -2043,10 +2150,15 @@ again: | |||
2043 | } | 2150 | } |
2044 | 2151 | ||
2045 | if (console_seq < log_first_seq) { | 2152 | if (console_seq < log_first_seq) { |
2153 | len = sprintf(text, "** %u printk messages dropped ** ", | ||
2154 | (unsigned)(log_first_seq - console_seq)); | ||
2155 | |||
2046 | /* messages are gone, move to first one */ | 2156 | /* messages are gone, move to first one */ |
2047 | console_seq = log_first_seq; | 2157 | console_seq = log_first_seq; |
2048 | console_idx = log_first_idx; | 2158 | console_idx = log_first_idx; |
2049 | console_prev = 0; | 2159 | console_prev = 0; |
2160 | } else { | ||
2161 | len = 0; | ||
2050 | } | 2162 | } |
2051 | skip: | 2163 | skip: |
2052 | if (console_seq == log_next_seq) | 2164 | if (console_seq == log_next_seq) |
@@ -2071,8 +2183,8 @@ skip: | |||
2071 | } | 2183 | } |
2072 | 2184 | ||
2073 | level = msg->level; | 2185 | level = msg->level; |
2074 | len = msg_print_text(msg, console_prev, false, | 2186 | len += msg_print_text(msg, console_prev, false, |
2075 | text, sizeof(text)); | 2187 | text + len, sizeof(text) - len); |
2076 | console_idx = log_next(console_idx); | 2188 | console_idx = log_next(console_idx); |
2077 | console_seq++; | 2189 | console_seq++; |
2078 | console_prev = msg->flags; | 2190 | console_prev = msg->flags; |
@@ -2084,7 +2196,6 @@ skip: | |||
2084 | local_irq_restore(flags); | 2196 | local_irq_restore(flags); |
2085 | } | 2197 | } |
2086 | console_locked = 0; | 2198 | console_locked = 0; |
2087 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2088 | 2199 | ||
2089 | /* Release the exclusive_console once it is used */ | 2200 | /* Release the exclusive_console once it is used */ |
2090 | if (unlikely(exclusive_console)) | 2201 | if (unlikely(exclusive_console)) |
@@ -2092,7 +2203,7 @@ skip: | |||
2092 | 2203 | ||
2093 | raw_spin_unlock(&logbuf_lock); | 2204 | raw_spin_unlock(&logbuf_lock); |
2094 | 2205 | ||
2095 | up(&console_sem); | 2206 | up_console_sem(); |
2096 | 2207 | ||
2097 | /* | 2208 | /* |
2098 | * Someone could have filled up the buffer again, so re-check if there's | 2209 | * Someone could have filled up the buffer again, so re-check if there's |
@@ -2137,7 +2248,7 @@ void console_unblank(void) | |||
2137 | * oops_in_progress is set to 1.. | 2248 | * oops_in_progress is set to 1.. |
2138 | */ | 2249 | */ |
2139 | if (oops_in_progress) { | 2250 | if (oops_in_progress) { |
2140 | if (down_trylock(&console_sem) != 0) | 2251 | if (down_trylock_console_sem() != 0) |
2141 | return; | 2252 | return; |
2142 | } else | 2253 | } else |
2143 | console_lock(); | 2254 | console_lock(); |
@@ -2438,21 +2549,19 @@ late_initcall(printk_late_init); | |||
2438 | /* | 2549 | /* |
2439 | * Delayed printk version, for scheduler-internal messages: | 2550 | * Delayed printk version, for scheduler-internal messages: |
2440 | */ | 2551 | */ |
2441 | #define PRINTK_BUF_SIZE 512 | ||
2442 | |||
2443 | #define PRINTK_PENDING_WAKEUP 0x01 | 2552 | #define PRINTK_PENDING_WAKEUP 0x01 |
2444 | #define PRINTK_PENDING_SCHED 0x02 | 2553 | #define PRINTK_PENDING_OUTPUT 0x02 |
2445 | 2554 | ||
2446 | static DEFINE_PER_CPU(int, printk_pending); | 2555 | static DEFINE_PER_CPU(int, printk_pending); |
2447 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
2448 | 2556 | ||
2449 | static void wake_up_klogd_work_func(struct irq_work *irq_work) | 2557 | static void wake_up_klogd_work_func(struct irq_work *irq_work) |
2450 | { | 2558 | { |
2451 | int pending = __this_cpu_xchg(printk_pending, 0); | 2559 | int pending = __this_cpu_xchg(printk_pending, 0); |
2452 | 2560 | ||
2453 | if (pending & PRINTK_PENDING_SCHED) { | 2561 | if (pending & PRINTK_PENDING_OUTPUT) { |
2454 | char *buf = __get_cpu_var(printk_sched_buf); | 2562 | /* If trylock fails, someone else is doing the printing */ |
2455 | pr_warn("[sched_delayed] %s", buf); | 2563 | if (console_trylock()) |
2564 | console_unlock(); | ||
2456 | } | 2565 | } |
2457 | 2566 | ||
2458 | if (pending & PRINTK_PENDING_WAKEUP) | 2567 | if (pending & PRINTK_PENDING_WAKEUP) |
@@ -2474,23 +2583,19 @@ void wake_up_klogd(void) | |||
2474 | preempt_enable(); | 2583 | preempt_enable(); |
2475 | } | 2584 | } |
2476 | 2585 | ||
2477 | int printk_sched(const char *fmt, ...) | 2586 | int printk_deferred(const char *fmt, ...) |
2478 | { | 2587 | { |
2479 | unsigned long flags; | ||
2480 | va_list args; | 2588 | va_list args; |
2481 | char *buf; | ||
2482 | int r; | 2589 | int r; |
2483 | 2590 | ||
2484 | local_irq_save(flags); | 2591 | preempt_disable(); |
2485 | buf = __get_cpu_var(printk_sched_buf); | ||
2486 | |||
2487 | va_start(args, fmt); | 2592 | va_start(args, fmt); |
2488 | r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); | 2593 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); |
2489 | va_end(args); | 2594 | va_end(args); |
2490 | 2595 | ||
2491 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | 2596 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
2492 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2597 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); |
2493 | local_irq_restore(flags); | 2598 | preempt_enable(); |
2494 | 2599 | ||
2495 | return r; | 2600 | return r; |
2496 | } | 2601 | } |
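Taken together, the printk changes above mean a context that must not take console_sem (for example code running under the scheduler's rq lock) can log through printk_deferred(): the message is stored in the ordinary log buffer, tagged "[sched_delayed]", and the console is flushed later from irq_work via wake_up_klogd_work_func(). A hedged usage sketch (report_throttle is an illustrative caller, not part of this patch):

#include <linux/printk.h>

static void report_throttle(int cpu)
{
	/* safe under rq->lock: nothing here touches the console directly */
	printk_deferred(KERN_WARNING "cpu%d: runtime exceeded, throttling\n", cpu);
	printk_deferred_once(KERN_INFO "throttling engaged at least once\n");
}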
diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) | |||
388 | break; | 388 | break; |
389 | 389 | ||
390 | case 's': | 390 | case 's': |
391 | if (isdigit(*(str+1))) | 391 | { |
392 | reboot_cpu = simple_strtoul(str+1, NULL, 0); | 392 | int rc; |
393 | else if (str[1] == 'm' && str[2] == 'p' && | 393 | |
394 | isdigit(*(str+3))) | 394 | if (isdigit(*(str+1))) { |
395 | reboot_cpu = simple_strtoul(str+3, NULL, 0); | 395 | rc = kstrtoint(str+1, 0, &reboot_cpu); |
396 | else | 396 | if (rc) |
397 | return rc; | ||
398 | } else if (str[1] == 'm' && str[2] == 'p' && | ||
399 | isdigit(*(str+3))) { | ||
400 | rc = kstrtoint(str+3, 0, &reboot_cpu); | ||
401 | if (rc) | ||
402 | return rc; | ||
403 | } else | ||
397 | reboot_mode = REBOOT_SOFT; | 404 | reboot_mode = REBOOT_SOFT; |
398 | break; | 405 | break; |
399 | 406 | } | |
400 | case 'g': | 407 | case 'g': |
401 | reboot_mode = REBOOT_GPIO; | 408 | reboot_mode = REBOOT_GPIO; |
402 | break; | 409 | break; |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
186 | 186 | ||
187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
188 | if (*buf == '-') { | 188 | if (*buf == '-') { |
189 | res = simple_strtoull(buf + 1, &end, 10); | 189 | int rc = kstrtoull(buf + 1, 10, &res); |
190 | if (res != 1 || *end != '\0') | 190 | |
191 | if (rc) | ||
192 | return rc; | ||
193 | if (res != 1) | ||
191 | return -EINVAL; | 194 | return -EINVAL; |
192 | *resp = RES_COUNTER_MAX; | 195 | *resp = RES_COUNTER_MAX; |
193 | return 0; | 196 | return 0; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 913c6d6cc2c1..caf03e89a068 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1367,7 +1367,7 @@ out: | |||
1367 | * leave kernel. | 1367 | * leave kernel. |
1368 | */ | 1368 | */ |
1369 | if (p->mm && printk_ratelimit()) { | 1369 | if (p->mm && printk_ratelimit()) { |
1370 | printk_sched("process %d (%s) no longer affine to cpu%d\n", | 1370 | printk_deferred("process %d (%s) no longer affine to cpu%d\n", |
1371 | task_pid_nr(p), p->comm, cpu); | 1371 | task_pid_nr(p), p->comm, cpu); |
1372 | } | 1372 | } |
1373 | } | 1373 | } |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..e1574fca03b5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
348 | * entity. | 348 | * entity. |
349 | */ | 349 | */ |
350 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | 350 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { |
351 | static bool lag_once = false; | 351 | printk_deferred_once("sched: DL replenish lagged to much\n"); |
352 | |||
353 | if (!lag_once) { | ||
354 | lag_once = true; | ||
355 | printk_sched("sched: DL replenish lagged to much\n"); | ||
356 | } | ||
357 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 352 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
358 | dl_se->runtime = pi_se->dl_runtime; | 353 | dl_se->runtime = pi_se->dl_runtime; |
359 | } | 354 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..b3512f1afce9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
890 | * but accrue some time due to boosting. | 890 | * but accrue some time due to boosting. |
891 | */ | 891 | */ |
892 | if (likely(rt_b->rt_runtime)) { | 892 | if (likely(rt_b->rt_runtime)) { |
893 | static bool once = false; | ||
894 | |||
895 | rt_rq->rt_throttled = 1; | 893 | rt_rq->rt_throttled = 1; |
896 | 894 | printk_deferred_once("sched: RT throttling activated\n"); | |
897 | if (!once) { | ||
898 | once = true; | ||
899 | printk_sched("sched: RT throttling activated\n"); | ||
900 | } | ||
901 | } else { | 895 | } else { |
902 | /* | 896 | /* |
903 | * In case we did anyway, make it go away, | 897 | * In case we did anyway, make it go away, |
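printk_deferred_once() absorbs the per-call-site "static bool once" bookkeeping deleted from deadline.c and rt.c above. Roughly the open-coded equivalent, as a sketch (warn_throttled is a made-up name):

#include <linux/printk.h>
#include <linux/types.h>

static void warn_throttled(void)
{
	static bool warned;	/* the state the _once() macro keeps for you */

	if (!warned) {
		warned = true;
		printk_deferred(KERN_WARNING "sched: RT throttling activated\n");
	}
}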
diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..6e600aaa2af4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, | |||
3496 | } | 3496 | } |
3497 | #endif | 3497 | #endif |
3498 | 3498 | ||
3499 | #ifdef __ARCH_WANT_SYS_SGETMASK | 3499 | #ifdef CONFIG_SGETMASK_SYSCALL |
3500 | 3500 | ||
3501 | /* | 3501 | /* |
3502 | * For backwards compatibility. Functionality superseded by sigprocmask. | 3502 | * For backwards compatibility. Functionality superseded by sigprocmask. |
@@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3517 | 3517 | ||
3518 | return old; | 3518 | return old; |
3519 | } | 3519 | } |
3520 | #endif /* __ARCH_WANT_SGETMASK */ | 3520 | #endif /* CONFIG_SGETMASK_SYSCALL */ |
3521 | 3521 | ||
3522 | #ifdef __ARCH_WANT_SYS_SIGNAL | 3522 | #ifdef __ARCH_WANT_SYS_SIGNAL |
3523 | /* | 3523 | /* |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
307 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
308 | * @fn: function to execute | 308 | * @fn: function to execute |
309 | * @arg: argument to @fn | 309 | * @arg: argument to @fn |
310 | * @work_buf: pointer to cpu_stop_work structure | ||
310 | * | 311 | * |
311 | * Similar to stop_one_cpu() but doesn't wait for completion. The | 312 | * Similar to stop_one_cpu() but doesn't wait for completion. The |
312 | * caller is responsible for ensuring @work_buf is currently unused | 313 | * caller is responsible for ensuring @work_buf is currently unused |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); | |||
135 | cond_syscall(sys_setresuid16); | 135 | cond_syscall(sys_setresuid16); |
136 | cond_syscall(sys_setreuid16); | 136 | cond_syscall(sys_setreuid16); |
137 | cond_syscall(sys_setuid16); | 137 | cond_syscall(sys_setuid16); |
138 | cond_syscall(sys_sgetmask); | ||
139 | cond_syscall(sys_ssetmask); | ||
138 | cond_syscall(sys_vm86old); | 140 | cond_syscall(sys_vm86old); |
139 | cond_syscall(sys_vm86); | 141 | cond_syscall(sys_vm86); |
140 | cond_syscall(sys_ipc); | 142 | cond_syscall(sys_ipc); |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c8780cdaf852..33db43a39515 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
786 | time_status |= STA_PPSERROR; | 786 | time_status |= STA_PPSERROR; |
787 | pps_errcnt++; | 787 | pps_errcnt++; |
788 | pps_dec_freq_interval(); | 788 | pps_dec_freq_interval(); |
789 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | 789 | printk_deferred(KERN_ERR |
790 | freq_norm.sec); | 790 | "hardpps: PPSERROR: interval too long - %ld s\n", |
791 | freq_norm.sec); | ||
791 | return 0; | 792 | return 0; |
792 | } | 793 | } |
793 | 794 | ||
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
800 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | 801 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); |
801 | pps_freq = ftemp; | 802 | pps_freq = ftemp; |
802 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | 803 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { |
803 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | 804 | printk_deferred(KERN_WARNING |
805 | "hardpps: PPSWANDER: change=%ld\n", delta); | ||
804 | time_status |= STA_PPSWANDER; | 806 | time_status |= STA_PPSWANDER; |
805 | pps_stbcnt++; | 807 | pps_stbcnt++; |
806 | pps_dec_freq_interval(); | 808 | pps_dec_freq_interval(); |
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) | |||
844 | * the time offset is updated. | 846 | * the time offset is updated. |
845 | */ | 847 | */ |
846 | if (jitter > (pps_jitter << PPS_POPCORN)) { | 848 | if (jitter > (pps_jitter << PPS_POPCORN)) { |
847 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | 849 | printk_deferred(KERN_WARNING |
848 | jitter, (pps_jitter << PPS_POPCORN)); | 850 | "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", |
851 | jitter, (pps_jitter << PPS_POPCORN)); | ||
849 | time_status |= STA_PPSJITTER; | 852 | time_status |= STA_PPSJITTER; |
850 | pps_jitcnt++; | 853 | pps_jitcnt++; |
851 | } else if (time_status & STA_PPSTIME) { | 854 | } else if (time_status & STA_PPSTIME) { |
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
902 | time_status |= STA_PPSJITTER; | 905 | time_status |= STA_PPSJITTER; |
903 | /* restart the frequency calibration interval */ | 906 | /* restart the frequency calibration interval */ |
904 | pps_fbase = *raw_ts; | 907 | pps_fbase = *raw_ts; |
905 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 908 | printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); |
906 | return; | 909 | return; |
907 | } | 910 | } |
908 | 911 | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
852 | struct timespec *delta) | 852 | struct timespec *delta) |
853 | { | 853 | { |
854 | if (!timespec_valid_strict(delta)) { | 854 | if (!timespec_valid_strict(delta)) { |
855 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 855 | printk_deferred(KERN_WARNING |
856 | "sleep delta value!\n"); | 856 | "__timekeeping_inject_sleeptime: Invalid " |
857 | "sleep delta value!\n"); | ||
857 | return; | 858 | return; |
858 | } | 859 | } |
859 | tk_xtime_add(tk, delta); | 860 | tk_xtime_add(tk, delta); |
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1157 | 1158 | ||
1158 | if (unlikely(tk->clock->maxadj && | 1159 | if (unlikely(tk->clock->maxadj && |
1159 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { | 1160 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
1160 | printk_once(KERN_WARNING | 1161 | printk_deferred_once(KERN_WARNING |
1161 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1162 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
1162 | tk->clock->name, (long)tk->mult + adj, | 1163 | tk->clock->name, (long)tk->mult + adj, |
1163 | (long)tk->clock->mult + tk->clock->maxadj); | 1164 | (long)tk->clock->mult + tk->clock->maxadj); |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6620e5837ce2..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
239 | * tracepoint_probe_register - Connect a probe to a tracepoint | 239 | * tracepoint_probe_register - Connect a probe to a tracepoint |
240 | * @tp: tracepoint | 240 | * @tp: tracepoint |
241 | * @probe: probe handler | 241 | * @probe: probe handler |
242 | * @data: tracepoint data | ||
242 | * | 243 | * |
243 | * Returns 0 if ok, error value on error. | 244 | * Returns 0 if ok, error value on error. |
244 | * Note: if @tp is within a module, the caller is responsible for | 245 | * Note: if @tp is within a module, the caller is responsible for |
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); | |||
264 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint | 265 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint |
265 | * @tp: tracepoint | 266 | * @tp: tracepoint |
266 | * @probe: probe function pointer | 267 | * @probe: probe function pointer |
268 | * @data: tracepoint data | ||
267 | * | 269 | * |
268 | * Returns 0 if ok, error value on error. | 270 | * Returns 0 if ok, error value on error. |
269 | */ | 271 | */ |
diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); | |||
87 | struct user_struct root_user = { | 87 | struct user_struct root_user = { |
88 | .__count = ATOMIC_INIT(1), | 88 | .__count = ATOMIC_INIT(1), |
89 | .processes = ATOMIC_INIT(1), | 89 | .processes = ATOMIC_INIT(1), |
90 | .files = ATOMIC_INIT(0), | ||
91 | .sigpending = ATOMIC_INIT(0), | 90 | .sigpending = ATOMIC_INIT(0), |
92 | .locked_shm = 0, | 91 | .locked_shm = 0, |
93 | .uid = GLOBAL_ROOT_UID, | 92 | .uid = GLOBAL_ROOT_UID, |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..6fbe811c7ad1 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -51,7 +51,7 @@ static int proc_do_uts_string(ctl_table *table, int write, | |||
51 | int r; | 51 | int r; |
52 | memcpy(&uts_table, table, sizeof(uts_table)); | 52 | memcpy(&uts_table, table, sizeof(uts_table)); |
53 | uts_table.data = get_uts(table, write); | 53 | uts_table.data = get_uts(table, write); |
54 | r = proc_dostring(&uts_table,write,buffer,lenp, ppos); | 54 | r = proc_dostring(&uts_table, write, buffer, lenp, ppos); |
55 | put_uts(table, write, uts_table.data); | 55 | put_uts(table, write, uts_table.data); |
56 | 56 | ||
57 | if (write) | 57 | if (write) |
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) | |||
135 | return 0; | 135 | return 0; |
136 | } | 136 | } |
137 | 137 | ||
138 | __initcall(utsname_sysctl_init); | 138 | device_initcall(utsname_sysctl_init); |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 99c8bfee1b00..ccca32264748 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -501,6 +501,16 @@ config DEBUG_VM | |||
501 | 501 | ||
502 | If unsure, say N. | 502 | If unsure, say N. |
503 | 503 | ||
504 | config DEBUG_VM_VMACACHE | ||
505 | bool "Debug VMA caching" | ||
506 | depends on DEBUG_VM | ||
507 | help | ||
508 | Enable this to turn on VMA caching debug information. Doing so | ||
509 | can cause significant overhead, so only enable it in non-production | ||
510 | environments. | ||
511 | |||
512 | If unsure, say N. | ||
513 | |||
504 | config DEBUG_VM_RB | 514 | config DEBUG_VM_RB |
505 | bool "Debug VM red-black trees" | 515 | bool "Debug VM red-black trees" |
506 | depends on DEBUG_VM | 516 | depends on DEBUG_VM |
@@ -823,11 +833,6 @@ config DEBUG_RT_MUTEXES | |||
823 | This allows rt mutex semantics violations and rt mutex related | 833 | This allows rt mutex semantics violations and rt mutex related |
824 | deadlocks (lockups) to be detected and reported automatically. | 834 | deadlocks (lockups) to be detected and reported automatically. |
825 | 835 | ||
826 | config DEBUG_PI_LIST | ||
827 | bool | ||
828 | default y | ||
829 | depends on DEBUG_RT_MUTEXES | ||
830 | |||
831 | config RT_MUTEX_TESTER | 836 | config RT_MUTEX_TESTER |
832 | bool "Built-in scriptable tester for rt-mutexes" | 837 | bool "Built-in scriptable tester for rt-mutexes" |
833 | depends on DEBUG_KERNEL && RT_MUTEXES | 838 | depends on DEBUG_KERNEL && RT_MUTEXES |
@@ -1053,6 +1058,16 @@ config DEBUG_LIST | |||
1053 | 1058 | ||
1054 | If unsure, say N. | 1059 | If unsure, say N. |
1055 | 1060 | ||
1061 | config DEBUG_PI_LIST | ||
1062 | bool "Debug priority linked list manipulation" | ||
1063 | depends on DEBUG_KERNEL | ||
1064 | help | ||
1065 | Enable this to turn on extended checks in the priority-ordered | ||
1066 | linked-list (plist) walking routines. This checks the entire | ||
1067 | list multiple times during each manipulation. | ||
1068 | |||
1069 | If unsure, say N. | ||
1070 | |||
1056 | config DEBUG_SG | 1071 | config DEBUG_SG |
1057 | bool "Debug SG table operations" | 1072 | bool "Debug SG table operations" |
1058 | depends on DEBUG_KERNEL | 1073 | depends on DEBUG_KERNEL |
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 11b9b01fda6b..1a000bb050f9 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c | |||
@@ -140,7 +140,7 @@ error: | |||
140 | * @decoder: The decoder definition (produced by asn1_compiler) | 140 | * @decoder: The decoder definition (produced by asn1_compiler) |
141 | * @context: The caller's context (to be passed to the action functions) | 141 | * @context: The caller's context (to be passed to the action functions) |
142 | * @data: The encoded data | 142 | * @data: The encoded data |
143 | * @datasize: The size of the encoded data | 143 | * @datalen: The size of the encoded data |
144 | * | 144 | * |
145 | * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern | 145 | * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern |
146 | * produced by asn1_compiler. Action functions are called on marked tags to | 146 | * produced by asn1_compiler. Action functions are called on marked tags to |
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 00bca223d1e1..0211d30d8c39 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c | |||
@@ -8,6 +8,9 @@ | |||
8 | * the Free Software Foundation; either version 2 of the License, or | 8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. | 9 | * (at your option) any later version. |
10 | */ | 10 | */ |
11 | |||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
13 | |||
11 | #include <linux/init.h> | 14 | #include <linux/init.h> |
12 | #include <linux/bug.h> | 15 | #include <linux/bug.h> |
13 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
@@ -146,18 +149,18 @@ static __init int test_atomic64(void) | |||
146 | BUG_ON(v.counter != r); | 149 | BUG_ON(v.counter != r); |
147 | 150 | ||
148 | #ifdef CONFIG_X86 | 151 | #ifdef CONFIG_X86 |
149 | printk(KERN_INFO "atomic64 test passed for %s platform %s CX8 and %s SSE\n", | 152 | pr_info("passed for %s platform %s CX8 and %s SSE\n", |
150 | #ifdef CONFIG_X86_64 | 153 | #ifdef CONFIG_X86_64 |
151 | "x86-64", | 154 | "x86-64", |
152 | #elif defined(CONFIG_X86_CMPXCHG64) | 155 | #elif defined(CONFIG_X86_CMPXCHG64) |
153 | "i586+", | 156 | "i586+", |
154 | #else | 157 | #else |
155 | "i386+", | 158 | "i386+", |
156 | #endif | 159 | #endif |
157 | boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without", | 160 | boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without", |
158 | boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without"); | 161 | boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without"); |
159 | #else | 162 | #else |
160 | printk(KERN_INFO "atomic64 test passed\n"); | 163 | pr_info("passed\n"); |
161 | #endif | 164 | #endif |
162 | 165 | ||
163 | return 0; | 166 | return 0; |
diff --git a/lib/btree.c b/lib/btree.c index f9a484676cb6..4264871ea1a0 100644 --- a/lib/btree.c +++ b/lib/btree.c | |||
@@ -198,6 +198,7 @@ EXPORT_SYMBOL_GPL(btree_init); | |||
198 | 198 | ||
199 | void btree_destroy(struct btree_head *head) | 199 | void btree_destroy(struct btree_head *head) |
200 | { | 200 | { |
201 | mempool_free(head->node, head->mempool); | ||
201 | mempool_destroy(head->mempool); | 202 | mempool_destroy(head->mempool); |
202 | head->mempool = NULL; | 203 | head->mempool = NULL; |
203 | } | 204 | } |
diff --git a/lib/bug.c b/lib/bug.c --- a/lib/bug.c +++ b/lib/bug.c | |||
@@ -37,6 +37,9 @@ | |||
37 | 37 | ||
38 | Jeremy Fitzhardinge <jeremy@goop.org> 2006 | 38 | Jeremy Fitzhardinge <jeremy@goop.org> 2006 |
39 | */ | 39 | */ |
40 | |||
41 | #define pr_fmt(fmt) fmt | ||
42 | |||
40 | #include <linux/list.h> | 43 | #include <linux/list.h> |
41 | #include <linux/module.h> | 44 | #include <linux/module.h> |
42 | #include <linux/kernel.h> | 45 | #include <linux/kernel.h> |
@@ -153,15 +156,13 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) | |||
153 | 156 | ||
154 | if (warning) { | 157 | if (warning) { |
155 | /* this is a WARN_ON rather than BUG/BUG_ON */ | 158 | /* this is a WARN_ON rather than BUG/BUG_ON */ |
156 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | 159 | pr_warn("------------[ cut here ]------------\n"); |
157 | 160 | ||
158 | if (file) | 161 | if (file) |
159 | printk(KERN_WARNING "WARNING: at %s:%u\n", | 162 | pr_warn("WARNING: at %s:%u\n", file, line); |
160 | file, line); | ||
161 | else | 163 | else |
162 | printk(KERN_WARNING "WARNING: at %p " | 164 | pr_warn("WARNING: at %p [verbose debug info unavailable]\n", |
163 | "[verbose debug info unavailable]\n", | 165 | (void *)bugaddr); |
164 | (void *)bugaddr); | ||
165 | 166 | ||
166 | print_modules(); | 167 | print_modules(); |
167 | show_regs(regs); | 168 | show_regs(regs); |
@@ -174,12 +175,10 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) | |||
174 | printk(KERN_DEFAULT "------------[ cut here ]------------\n"); | 175 | printk(KERN_DEFAULT "------------[ cut here ]------------\n"); |
175 | 176 | ||
176 | if (file) | 177 | if (file) |
177 | printk(KERN_CRIT "kernel BUG at %s:%u!\n", | 178 | pr_crit("kernel BUG at %s:%u!\n", file, line); |
178 | file, line); | ||
179 | else | 179 | else |
180 | printk(KERN_CRIT "Kernel BUG at %p " | 180 | pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n", |
181 | "[verbose debug info unavailable]\n", | 181 | (void *)bugaddr); |
182 | (void *)bugaddr); | ||
183 | 182 | ||
184 | return BUG_TRAP_TYPE_BUG; | 183 | return BUG_TRAP_TYPE_BUG; |
185 | } | 184 | } |
diff --git a/lib/crc32.c b/lib/crc32.c index 70f00ca5ef1e..21a7b2135af6 100644 --- a/lib/crc32.c +++ b/lib/crc32.c | |||
@@ -33,13 +33,13 @@ | |||
33 | #include "crc32defs.h" | 33 | #include "crc32defs.h" |
34 | 34 | ||
35 | #if CRC_LE_BITS > 8 | 35 | #if CRC_LE_BITS > 8 |
36 | # define tole(x) ((__force u32) __constant_cpu_to_le32(x)) | 36 | # define tole(x) ((__force u32) cpu_to_le32(x)) |
37 | #else | 37 | #else |
38 | # define tole(x) (x) | 38 | # define tole(x) (x) |
39 | #endif | 39 | #endif |
40 | 40 | ||
41 | #if CRC_BE_BITS > 8 | 41 | #if CRC_BE_BITS > 8 |
42 | # define tobe(x) ((__force u32) __constant_cpu_to_be32(x)) | 42 | # define tobe(x) ((__force u32) cpu_to_be32(x)) |
43 | #else | 43 | #else |
44 | # define tobe(x) (x) | 44 | # define tobe(x) (x) |
45 | #endif | 45 | #endif |
diff --git a/lib/debugobjects.c b/lib/debugobjects.c index e0731c3db706..547f7f923dbc 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c | |||
@@ -7,6 +7,9 @@ | |||
7 | * | 7 | * |
8 | * For licencing details see kernel-base/COPYING | 8 | * For licencing details see kernel-base/COPYING |
9 | */ | 9 | */ |
10 | |||
11 | #define pr_fmt(fmt) "ODEBUG: " fmt | ||
12 | |||
10 | #include <linux/debugobjects.h> | 13 | #include <linux/debugobjects.h> |
11 | #include <linux/interrupt.h> | 14 | #include <linux/interrupt.h> |
12 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
@@ -218,7 +221,7 @@ static void debug_objects_oom(void) | |||
218 | unsigned long flags; | 221 | unsigned long flags; |
219 | int i; | 222 | int i; |
220 | 223 | ||
221 | printk(KERN_WARNING "ODEBUG: Out of memory. ODEBUG disabled\n"); | 224 | pr_warn("Out of memory. ODEBUG disabled\n"); |
222 | 225 | ||
223 | for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { | 226 | for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { |
224 | raw_spin_lock_irqsave(&db->lock, flags); | 227 | raw_spin_lock_irqsave(&db->lock, flags); |
@@ -292,11 +295,9 @@ static void debug_object_is_on_stack(void *addr, int onstack) | |||
292 | 295 | ||
293 | limit++; | 296 | limit++; |
294 | if (is_on_stack) | 297 | if (is_on_stack) |
295 | printk(KERN_WARNING | 298 | pr_warn("object is on stack, but not annotated\n"); |
296 | "ODEBUG: object is on stack, but not annotated\n"); | ||
297 | else | 299 | else |
298 | printk(KERN_WARNING | 300 | pr_warn("object is not on stack, but annotated\n"); |
299 | "ODEBUG: object is not on stack, but annotated\n"); | ||
300 | WARN_ON(1); | 301 | WARN_ON(1); |
301 | } | 302 | } |
302 | 303 | ||
@@ -985,7 +986,7 @@ static void __init debug_objects_selftest(void) | |||
985 | if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings)) | 986 | if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings)) |
986 | goto out; | 987 | goto out; |
987 | #endif | 988 | #endif |
988 | printk(KERN_INFO "ODEBUG: selftest passed\n"); | 989 | pr_info("selftest passed\n"); |
989 | 990 | ||
990 | out: | 991 | out: |
991 | debug_objects_fixups = oldfixups; | 992 | debug_objects_fixups = oldfixups; |
@@ -1060,8 +1061,8 @@ static int __init debug_objects_replace_static_objects(void) | |||
1060 | } | 1061 | } |
1061 | local_irq_enable(); | 1062 | local_irq_enable(); |
1062 | 1063 | ||
1063 | printk(KERN_DEBUG "ODEBUG: %d of %d active objects replaced\n", cnt, | 1064 | pr_debug("%d of %d active objects replaced\n", |
1064 | obj_pool_used); | 1065 | cnt, obj_pool_used); |
1065 | return 0; | 1066 | return 0; |
1066 | free: | 1067 | free: |
1067 | hlist_for_each_entry_safe(obj, tmp, &objects, node) { | 1068 | hlist_for_each_entry_safe(obj, tmp, &objects, node) { |
@@ -1090,7 +1091,7 @@ void __init debug_objects_mem_init(void) | |||
1090 | debug_objects_enabled = 0; | 1091 | debug_objects_enabled = 0; |
1091 | if (obj_cache) | 1092 | if (obj_cache) |
1092 | kmem_cache_destroy(obj_cache); | 1093 | kmem_cache_destroy(obj_cache); |
1093 | printk(KERN_WARNING "ODEBUG: out of memory.\n"); | 1094 | pr_warn("out of memory.\n"); |
1094 | } else | 1095 | } else |
1095 | debug_objects_selftest(); | 1096 | debug_objects_selftest(); |
1096 | } | 1097 | } |
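The pr_fmt() definitions added to atomic64_test.c and debugobjects.c above keep each file's message prefix in one place, so every pr_*() call picks it up automatically and strings like "ODEBUG: " disappear from the individual call sites (bug.c instead defines a pass-through pr_fmt() so its pr_warn()/pr_crit() output stays unprefixed). Sketch of the convention ("mydrv: " is an illustrative prefix):

#define pr_fmt(fmt) "mydrv: " fmt	/* must be defined before the includes */

#include <linux/kernel.h>
#include <linux/printk.h>

static void announce(void)
{
	pr_info("selftest passed\n");	/* logs "mydrv: selftest passed" */
}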
diff --git a/lib/digsig.c b/lib/digsig.c index 8793aeda30ca..ae05ea393fc8 100644 --- a/lib/digsig.c +++ b/lib/digsig.c | |||
@@ -175,10 +175,11 @@ err1: | |||
175 | * digsig_verify() - digital signature verification with public key | 175 | * digsig_verify() - digital signature verification with public key |
176 | * @keyring: keyring to search key in | 176 | * @keyring: keyring to search key in |
177 | * @sig: digital signature | 177 | * @sig: digital signature |
178 | * @sigen: length of the signature | 178 | * @siglen: length of the signature |
179 | * @data: data | 179 | * @data: data |
180 | * @datalen: length of the data | 180 | * @datalen: length of the data |
181 | * @return: 0 on success, -EINVAL otherwise | 181 | * |
182 | * Returns 0 on success, -EINVAL otherwise | ||
182 | * | 183 | * |
183 | * Verifies data integrity against digital signature. | 184 | * Verifies data integrity against digital signature. |
184 | * Currently only RSA is supported. | 185 | * Currently only RSA is supported. |
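The digsig hunk is a kernel-doc repair: scripts/kernel-doc matches @names against the actual parameter names, so @sigen never documented siglen, and the return value belongs in a free-form "Returns ..." line (or a Return: section), not a bogus @return parameter. A minimal template of the expected shape, for a hypothetical function:

/**
 * frob_widget() - frobnicate a widget
 * @w:     widget to operate on
 * @flags: FROB_* behaviour flags
 *
 * Return: 0 on success, negative errno on failure.
 */
int frob_widget(struct widget *w, unsigned int flags);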
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 244f5480c898..b3131f5cf8a2 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c | |||
@@ -62,10 +62,7 @@ EXPORT_SYMBOL(crc32c); | |||
62 | static int __init libcrc32c_mod_init(void) | 62 | static int __init libcrc32c_mod_init(void) |
63 | { | 63 | { |
64 | tfm = crypto_alloc_shash("crc32c", 0, 0); | 64 | tfm = crypto_alloc_shash("crc32c", 0, 0); |
65 | if (IS_ERR(tfm)) | 65 | return PTR_ERR_OR_ZERO(tfm); |
66 | return PTR_ERR(tfm); | ||
67 | |||
68 | return 0; | ||
69 | } | 66 | } |
70 | 67 | ||
71 | static void __exit libcrc32c_mod_fini(void) | 68 | static void __exit libcrc32c_mod_fini(void) |
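PTR_ERR_OR_ZERO() from <linux/err.h> is simply the IS_ERR()/PTR_ERR()/return 0 dance folded into one call, which is why libcrc32c_mod_init() collapses to a single return statement. The open-coded equivalent, shown only for comparison with a hypothetical handle:

#include <linux/err.h>

static int example_init(void *handle)
{
	/* Equivalent to: return PTR_ERR_OR_ZERO(handle); */
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	return 0;
}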
diff --git a/lib/nlattr.c b/lib/nlattr.c index fc6754720ced..0c5778752aec 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c | |||
@@ -136,6 +136,7 @@ int nla_validate(const struct nlattr *head, int len, int maxtype, | |||
136 | errout: | 136 | errout: |
137 | return err; | 137 | return err; |
138 | } | 138 | } |
139 | EXPORT_SYMBOL(nla_validate); | ||
139 | 140 | ||
140 | /** | 141 | /** |
141 | * nla_policy_len - Determine the max. length of a policy | 142 |
@@ -162,6 +163,7 @@ nla_policy_len(const struct nla_policy *p, int n) | |||
162 | 163 | ||
163 | return len; | 164 | return len; |
164 | } | 165 | } |
166 | EXPORT_SYMBOL(nla_policy_len); | ||
165 | 167 | ||
166 | /** | 168 | /** |
167 | * nla_parse - Parse a stream of attributes into a tb buffer | 169 | * nla_parse - Parse a stream of attributes into a tb buffer |
@@ -208,6 +210,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, | |||
208 | errout: | 210 | errout: |
209 | return err; | 211 | return err; |
210 | } | 212 | } |
213 | EXPORT_SYMBOL(nla_parse); | ||
211 | 214 | ||
212 | /** | 215 | /** |
213 | * nla_find - Find a specific attribute in a stream of attributes | 216 | * nla_find - Find a specific attribute in a stream of attributes |
@@ -228,6 +231,7 @@ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) | |||
228 | 231 | ||
229 | return NULL; | 232 | return NULL; |
230 | } | 233 | } |
234 | EXPORT_SYMBOL(nla_find); | ||
231 | 235 | ||
232 | /** | 236 | /** |
233 | * nla_strlcpy - Copy string attribute payload into a sized buffer | 237 | * nla_strlcpy - Copy string attribute payload into a sized buffer |
@@ -258,6 +262,7 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) | |||
258 | 262 | ||
259 | return srclen; | 263 | return srclen; |
260 | } | 264 | } |
265 | EXPORT_SYMBOL(nla_strlcpy); | ||
261 | 266 | ||
262 | /** | 267 | /** |
263 | * nla_memcpy - Copy a netlink attribute into another memory area | 268 | * nla_memcpy - Copy a netlink attribute into another memory area |
@@ -278,6 +283,7 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) | |||
278 | 283 | ||
279 | return minlen; | 284 | return minlen; |
280 | } | 285 | } |
286 | EXPORT_SYMBOL(nla_memcpy); | ||
281 | 287 | ||
282 | /** | 288 | /** |
283 | * nla_memcmp - Compare an attribute with sized memory area | 289 | * nla_memcmp - Compare an attribute with sized memory area |
@@ -295,6 +301,7 @@ int nla_memcmp(const struct nlattr *nla, const void *data, | |||
295 | 301 | ||
296 | return d; | 302 | return d; |
297 | } | 303 | } |
304 | EXPORT_SYMBOL(nla_memcmp); | ||
298 | 305 | ||
299 | /** | 306 | /** |
300 | * nla_strcmp - Compare a string attribute against a string | 307 | * nla_strcmp - Compare a string attribute against a string |
@@ -317,6 +324,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str) | |||
317 | 324 | ||
318 | return d; | 325 | return d; |
319 | } | 326 | } |
327 | EXPORT_SYMBOL(nla_strcmp); | ||
320 | 328 | ||
321 | #ifdef CONFIG_NET | 329 | #ifdef CONFIG_NET |
322 | /** | 330 | /** |
@@ -502,12 +510,3 @@ int nla_append(struct sk_buff *skb, int attrlen, const void *data) | |||
502 | } | 510 | } |
503 | EXPORT_SYMBOL(nla_append); | 511 | EXPORT_SYMBOL(nla_append); |
504 | #endif | 512 | #endif |
505 | |||
506 | EXPORT_SYMBOL(nla_validate); | ||
507 | EXPORT_SYMBOL(nla_policy_len); | ||
508 | EXPORT_SYMBOL(nla_parse); | ||
509 | EXPORT_SYMBOL(nla_find); | ||
510 | EXPORT_SYMBOL(nla_strlcpy); | ||
511 | EXPORT_SYMBOL(nla_memcpy); | ||
512 | EXPORT_SYMBOL(nla_memcmp); | ||
513 | EXPORT_SYMBOL(nla_strcmp); | ||
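The nlattr churn is purely about EXPORT_SYMBOL() placement: checkpatch and the coding-style document want the export immediately after the closing brace of the function it applies to, not batched at the bottom of the file. The preferred layout, sketched with a hypothetical helper:

#include <linux/export.h>
#include <net/netlink.h>

int nla_example_payload_len(const struct nlattr *nla)
{
	return nla_len(nla);
}
EXPORT_SYMBOL(nla_example_payload_len);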
diff --git a/lib/plist.c b/lib/plist.c index 1ebc95f7a46f..d408e774b746 100644 --- a/lib/plist.c +++ b/lib/plist.c | |||
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) | |||
134 | plist_check_head(head); | 134 | plist_check_head(head); |
135 | } | 135 | } |
136 | 136 | ||
137 | /** | ||
138 | * plist_requeue - Requeue @node at end of same-prio entries. | ||
139 | * | ||
140 | * This is essentially an optimized plist_del() followed by | ||
141 | * plist_add(). It moves an entry already in the plist to | ||
142 | * after any other same-priority entries. | ||
143 | * | ||
144 | * @node: &struct plist_node pointer - entry to be moved | ||
145 | * @head: &struct plist_head pointer - list head | ||
146 | */ | ||
147 | void plist_requeue(struct plist_node *node, struct plist_head *head) | ||
148 | { | ||
149 | struct plist_node *iter; | ||
150 | struct list_head *node_next = &head->node_list; | ||
151 | |||
152 | plist_check_head(head); | ||
153 | BUG_ON(plist_head_empty(head)); | ||
154 | BUG_ON(plist_node_empty(node)); | ||
155 | |||
156 | if (node == plist_last(head)) | ||
157 | return; | ||
158 | |||
159 | iter = plist_next(node); | ||
160 | |||
161 | if (node->prio != iter->prio) | ||
162 | return; | ||
163 | |||
164 | plist_del(node, head); | ||
165 | |||
166 | plist_for_each_continue(iter, head) { | ||
167 | if (node->prio != iter->prio) { | ||
168 | node_next = &iter->node_list; | ||
169 | break; | ||
170 | } | ||
171 | } | ||
172 | list_add_tail(&node->node_list, node_next); | ||
173 | |||
174 | plist_check_head(head); | ||
175 | } | ||
176 | |||
137 | #ifdef CONFIG_DEBUG_PI_LIST | 177 | #ifdef CONFIG_DEBUG_PI_LIST |
138 | #include <linux/sched.h> | 178 | #include <linux/sched.h> |
139 | #include <linux/module.h> | 179 | #include <linux/module.h> |
@@ -170,12 +210,20 @@ static void __init plist_test_check(int nr_expect) | |||
170 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); | 210 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); |
171 | } | 211 | } |
172 | 212 | ||
213 | static void __init plist_test_requeue(struct plist_node *node) | ||
214 | { | ||
215 | plist_requeue(node, &test_head); | ||
216 | |||
217 | if (node != plist_last(&test_head)) | ||
218 | BUG_ON(node->prio == plist_next(node)->prio); | ||
219 | } | ||
220 | |||
173 | static int __init plist_test(void) | 221 | static int __init plist_test(void) |
174 | { | 222 | { |
175 | int nr_expect = 0, i, loop; | 223 | int nr_expect = 0, i, loop; |
176 | unsigned int r = local_clock(); | 224 | unsigned int r = local_clock(); |
177 | 225 | ||
178 | pr_debug("start plist test\n"); | 226 | printk(KERN_DEBUG "start plist test\n"); |
179 | plist_head_init(&test_head); | 227 | plist_head_init(&test_head); |
180 | for (i = 0; i < ARRAY_SIZE(test_node); i++) | 228 | for (i = 0; i < ARRAY_SIZE(test_node); i++) |
181 | plist_node_init(test_node + i, 0); | 229 | plist_node_init(test_node + i, 0); |
@@ -193,6 +241,10 @@ static int __init plist_test(void) | |||
193 | nr_expect--; | 241 | nr_expect--; |
194 | } | 242 | } |
195 | plist_test_check(nr_expect); | 243 | plist_test_check(nr_expect); |
244 | if (!plist_node_empty(test_node + i)) { | ||
245 | plist_test_requeue(test_node + i); | ||
246 | plist_test_check(nr_expect); | ||
247 | } | ||
196 | } | 248 | } |
197 | 249 | ||
198 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { | 250 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { |
@@ -203,7 +255,7 @@ static int __init plist_test(void) | |||
203 | plist_test_check(nr_expect); | 255 | plist_test_check(nr_expect); |
204 | } | 256 | } |
205 | 257 | ||
206 | pr_debug("end plist test\n"); | 258 | printk(KERN_DEBUG "end plist test\n"); |
207 | return 0; | 259 | return 0; |
208 | } | 260 | } |
209 | 261 | ||
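plist_requeue() gives callers a cheap round-robin within one priority level: the node stays on the list but is moved behind the other entries that share its ->prio (plist keeps entries sorted by ascending ->prio). A minimal usage sketch with a hypothetical work-item structure, not taken from this patch:

#include <linux/plist.h>

struct work_item {
	struct plist_node node;
	/* ... payload ... */
};

static void service_one(struct plist_head *head)
{
	struct work_item *w;

	if (plist_head_empty(head))
		return;

	/* The first entry has the numerically lowest ->prio. */
	w = plist_first_entry(head, struct work_item, node);
	/* ... process w ... */

	/* Let the next entry of the same ->prio go first next time. */
	plist_requeue(&w->node, head);
}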
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9599aa72d7a0..d64815651e90 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -194,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) | |||
194 | * succeed in getting a node here (and never reach | 194 | * succeed in getting a node here (and never reach |
195 | * kmem_cache_alloc) | 195 | * kmem_cache_alloc) |
196 | */ | 196 | */ |
197 | rtp = &__get_cpu_var(radix_tree_preloads); | 197 | rtp = this_cpu_ptr(&radix_tree_preloads); |
198 | if (rtp->nr) { | 198 | if (rtp->nr) { |
199 | ret = rtp->nodes[rtp->nr - 1]; | 199 | ret = rtp->nodes[rtp->nr - 1]; |
200 | rtp->nodes[rtp->nr - 1] = NULL; | 200 | rtp->nodes[rtp->nr - 1] = NULL; |
@@ -250,14 +250,14 @@ static int __radix_tree_preload(gfp_t gfp_mask) | |||
250 | int ret = -ENOMEM; | 250 | int ret = -ENOMEM; |
251 | 251 | ||
252 | preempt_disable(); | 252 | preempt_disable(); |
253 | rtp = &__get_cpu_var(radix_tree_preloads); | 253 | rtp = this_cpu_ptr(&radix_tree_preloads); |
254 | while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { | 254 | while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { |
255 | preempt_enable(); | 255 | preempt_enable(); |
256 | node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); | 256 | node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); |
257 | if (node == NULL) | 257 | if (node == NULL) |
258 | goto out; | 258 | goto out; |
259 | preempt_disable(); | 259 | preempt_disable(); |
260 | rtp = &__get_cpu_var(radix_tree_preloads); | 260 | rtp = this_cpu_ptr(&radix_tree_preloads); |
261 | if (rtp->nr < ARRAY_SIZE(rtp->nodes)) | 261 | if (rtp->nr < ARRAY_SIZE(rtp->nodes)) |
262 | rtp->nodes[rtp->nr++] = node; | 262 | rtp->nodes[rtp->nr++] = node; |
263 | else | 263 | else |
@@ -1296,7 +1296,6 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1296 | /** | 1296 | /** |
1297 | * __radix_tree_delete_node - try to free node after clearing a slot | 1297 | * __radix_tree_delete_node - try to free node after clearing a slot |
1298 | * @root: radix tree root | 1298 | * @root: radix tree root |
1299 | * @index: index key | ||
1300 | * @node: node containing @index | 1299 | * @node: node containing @index |
1301 | * | 1300 | * |
1302 | * After clearing the slot at @index in @node from radix tree | 1301 | * After clearing the slot at @index in @node from radix tree |
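The radix-tree hunks are part of the tree-wide __get_cpu_var() removal: taking the address of a per-CPU variable is now spelled this_cpu_ptr(&var) instead of &__get_cpu_var(var). A generic before/after sketch with a hypothetical per-CPU counter:

#include <linux/percpu.h>

struct my_stats {
	unsigned long events;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void bump_event_count(void)
{
	struct my_stats *s;

	preempt_disable();
	s = this_cpu_ptr(&my_stats);	/* was: s = &__get_cpu_var(my_stats); */
	s->events++;
	preempt_enable();
}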
diff --git a/lib/string.c b/lib/string.c index e0c20eb362f0..992bf30af759 100644 --- a/lib/string.c +++ b/lib/string.c | |||
@@ -107,7 +107,7 @@ EXPORT_SYMBOL(strcpy); | |||
107 | 107 | ||
108 | #ifndef __HAVE_ARCH_STRNCPY | 108 | #ifndef __HAVE_ARCH_STRNCPY |
109 | /** | 109 | /** |
110 | * strncpy - Copy a length-limited, %NUL-terminated string | 110 | * strncpy - Copy a length-limited, C-string |
111 | * @dest: Where to copy the string to | 111 | * @dest: Where to copy the string to |
112 | * @src: Where to copy the string from | 112 | * @src: Where to copy the string from |
113 | * @count: The maximum number of bytes to copy | 113 | * @count: The maximum number of bytes to copy |
@@ -136,7 +136,7 @@ EXPORT_SYMBOL(strncpy); | |||
136 | 136 | ||
137 | #ifndef __HAVE_ARCH_STRLCPY | 137 | #ifndef __HAVE_ARCH_STRLCPY |
138 | /** | 138 | /** |
139 | * strlcpy - Copy a %NUL terminated string into a sized buffer | 139 | * strlcpy - Copy a C-string into a sized buffer |
140 | * @dest: Where to copy the string to | 140 | * @dest: Where to copy the string to |
141 | * @src: Where to copy the string from | 141 | * @src: Where to copy the string from |
142 | * @size: size of destination buffer | 142 | * @size: size of destination buffer |
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(strcat); | |||
182 | 182 | ||
183 | #ifndef __HAVE_ARCH_STRNCAT | 183 | #ifndef __HAVE_ARCH_STRNCAT |
184 | /** | 184 | /** |
185 | * strncat - Append a length-limited, %NUL-terminated string to another | 185 | * strncat - Append a length-limited, C-string to another |
186 | * @dest: The string to be appended to | 186 | * @dest: The string to be appended to |
187 | * @src: The string to append to it | 187 | * @src: The string to append to it |
188 | * @count: The maximum numbers of bytes to copy | 188 | * @count: The maximum numbers of bytes to copy |
@@ -211,7 +211,7 @@ EXPORT_SYMBOL(strncat); | |||
211 | 211 | ||
212 | #ifndef __HAVE_ARCH_STRLCAT | 212 | #ifndef __HAVE_ARCH_STRLCAT |
213 | /** | 213 | /** |
214 | * strlcat - Append a length-limited, %NUL-terminated string to another | 214 | * strlcat - Append a length-limited, C-string to another |
215 | * @dest: The string to be appended to | 215 | * @dest: The string to be appended to |
216 | * @src: The string to append to it | 216 | * @src: The string to append to it |
217 | * @count: The size of the destination buffer. | 217 | * @count: The size of the destination buffer. |
diff --git a/lib/swiotlb.c b/lib/swiotlb.c index b604b831f4d1..649d097853a1 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c | |||
@@ -374,7 +374,7 @@ void __init swiotlb_free(void) | |||
374 | io_tlb_nslabs = 0; | 374 | io_tlb_nslabs = 0; |
375 | } | 375 | } |
376 | 376 | ||
377 | static int is_swiotlb_buffer(phys_addr_t paddr) | 377 | int is_swiotlb_buffer(phys_addr_t paddr) |
378 | { | 378 | { |
379 | return paddr >= io_tlb_start && paddr < io_tlb_end; | 379 | return paddr >= io_tlb_start && paddr < io_tlb_end; |
380 | } | 380 | } |
diff --git a/lib/textsearch.c b/lib/textsearch.c index e0cc0146ae62..0c7e9ab2d88f 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c | |||
@@ -159,6 +159,7 @@ errout: | |||
159 | spin_unlock(&ts_mod_lock); | 159 | spin_unlock(&ts_mod_lock); |
160 | return err; | 160 | return err; |
161 | } | 161 | } |
162 | EXPORT_SYMBOL(textsearch_register); | ||
162 | 163 | ||
163 | /** | 164 | /** |
164 | * textsearch_unregister - unregister a textsearch module | 165 | * textsearch_unregister - unregister a textsearch module |
@@ -190,6 +191,7 @@ out: | |||
190 | spin_unlock(&ts_mod_lock); | 191 | spin_unlock(&ts_mod_lock); |
191 | return err; | 192 | return err; |
192 | } | 193 | } |
194 | EXPORT_SYMBOL(textsearch_unregister); | ||
193 | 195 | ||
194 | struct ts_linear_state | 196 | struct ts_linear_state |
195 | { | 197 | { |
@@ -236,6 +238,7 @@ unsigned int textsearch_find_continuous(struct ts_config *conf, | |||
236 | 238 | ||
237 | return textsearch_find(conf, state); | 239 | return textsearch_find(conf, state); |
238 | } | 240 | } |
241 | EXPORT_SYMBOL(textsearch_find_continuous); | ||
239 | 242 | ||
240 | /** | 243 | /** |
241 | * textsearch_prepare - Prepare a search | 244 | * textsearch_prepare - Prepare a search |
@@ -298,6 +301,7 @@ errout: | |||
298 | 301 | ||
299 | return ERR_PTR(err); | 302 | return ERR_PTR(err); |
300 | } | 303 | } |
304 | EXPORT_SYMBOL(textsearch_prepare); | ||
301 | 305 | ||
302 | /** | 306 | /** |
303 | * textsearch_destroy - destroy a search configuration | 307 | * textsearch_destroy - destroy a search configuration |
@@ -316,9 +320,4 @@ void textsearch_destroy(struct ts_config *conf) | |||
316 | 320 | ||
317 | kfree(conf); | 321 | kfree(conf); |
318 | } | 322 | } |
319 | |||
320 | EXPORT_SYMBOL(textsearch_register); | ||
321 | EXPORT_SYMBOL(textsearch_unregister); | ||
322 | EXPORT_SYMBOL(textsearch_prepare); | ||
323 | EXPORT_SYMBOL(textsearch_find_continuous); | ||
324 | EXPORT_SYMBOL(textsearch_destroy); | 323 | EXPORT_SYMBOL(textsearch_destroy); |
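The functions whose exports move here form the public textsearch API, so callers see no change. For orientation, a minimal and entirely illustrative user of that API looks roughly like this; the algorithm name, pattern and buffer are made up:

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/textsearch.h>

static bool buf_contains_token(const void *buf, unsigned int len)
{
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	conf = textsearch_prepare("kmp", "token", 5, GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return false;

	pos = textsearch_find_continuous(conf, &state, buf, len);
	textsearch_destroy(conf);

	return pos != UINT_MAX;	/* UINT_MAX means "no match" */
}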
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0648291cdafe..6fe2c84eb055 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c | |||
@@ -2347,7 +2347,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) | |||
2347 | break; | 2347 | break; |
2348 | 2348 | ||
2349 | base = 10; | 2349 | base = 10; |
2350 | is_sign = 0; | 2350 | is_sign = false; |
2351 | 2351 | ||
2352 | switch (*fmt++) { | 2352 | switch (*fmt++) { |
2353 | case 'c': | 2353 | case 'c': |
@@ -2386,7 +2386,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) | |||
2386 | case 'i': | 2386 | case 'i': |
2387 | base = 0; | 2387 | base = 0; |
2388 | case 'd': | 2388 | case 'd': |
2389 | is_sign = 1; | 2389 | is_sign = true; |
2390 | case 'u': | 2390 | case 'u': |
2391 | break; | 2391 | break; |
2392 | case '%': | 2392 | case '%': |
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 08837db52d94..12d2d777f36b 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig | |||
@@ -9,33 +9,33 @@ config XZ_DEC | |||
9 | if XZ_DEC | 9 | if XZ_DEC |
10 | 10 | ||
11 | config XZ_DEC_X86 | 11 | config XZ_DEC_X86 |
12 | bool "x86 BCJ filter decoder" | 12 | bool "x86 BCJ filter decoder" if EXPERT |
13 | default y if X86 | 13 | default y |
14 | select XZ_DEC_BCJ | 14 | select XZ_DEC_BCJ |
15 | 15 | ||
16 | config XZ_DEC_POWERPC | 16 | config XZ_DEC_POWERPC |
17 | bool "PowerPC BCJ filter decoder" | 17 | bool "PowerPC BCJ filter decoder" if EXPERT |
18 | default y if PPC | 18 | default y |
19 | select XZ_DEC_BCJ | 19 | select XZ_DEC_BCJ |
20 | 20 | ||
21 | config XZ_DEC_IA64 | 21 | config XZ_DEC_IA64 |
22 | bool "IA-64 BCJ filter decoder" | 22 | bool "IA-64 BCJ filter decoder" if EXPERT |
23 | default y if IA64 | 23 | default y |
24 | select XZ_DEC_BCJ | 24 | select XZ_DEC_BCJ |
25 | 25 | ||
26 | config XZ_DEC_ARM | 26 | config XZ_DEC_ARM |
27 | bool "ARM BCJ filter decoder" | 27 | bool "ARM BCJ filter decoder" if EXPERT |
28 | default y if ARM | 28 | default y |
29 | select XZ_DEC_BCJ | 29 | select XZ_DEC_BCJ |
30 | 30 | ||
31 | config XZ_DEC_ARMTHUMB | 31 | config XZ_DEC_ARMTHUMB |
32 | bool "ARM-Thumb BCJ filter decoder" | 32 | bool "ARM-Thumb BCJ filter decoder" if EXPERT |
33 | default y if (ARM && ARM_THUMB) | 33 | default y |
34 | select XZ_DEC_BCJ | 34 | select XZ_DEC_BCJ |
35 | 35 | ||
36 | config XZ_DEC_SPARC | 36 | config XZ_DEC_SPARC |
37 | bool "SPARC BCJ filter decoder" | 37 | bool "SPARC BCJ filter decoder" if EXPERT |
38 | default y if SPARC | 38 | default y |
39 | select XZ_DEC_BCJ | 39 | select XZ_DEC_BCJ |
40 | 40 | ||
41 | endif | 41 | endif |
diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index a6cdc969ea42..08c3c8049998 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c | |||
@@ -1043,6 +1043,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, | |||
1043 | 1043 | ||
1044 | s->lzma2.sequence = SEQ_LZMA_PREPARE; | 1044 | s->lzma2.sequence = SEQ_LZMA_PREPARE; |
1045 | 1045 | ||
1046 | /* Fall through */ | ||
1047 | |||
1046 | case SEQ_LZMA_PREPARE: | 1048 | case SEQ_LZMA_PREPARE: |
1047 | if (s->lzma2.compressed < RC_INIT_BYTES) | 1049 | if (s->lzma2.compressed < RC_INIT_BYTES) |
1048 | return XZ_DATA_ERROR; | 1050 | return XZ_DATA_ERROR; |
@@ -1053,6 +1055,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, | |||
1053 | s->lzma2.compressed -= RC_INIT_BYTES; | 1055 | s->lzma2.compressed -= RC_INIT_BYTES; |
1054 | s->lzma2.sequence = SEQ_LZMA_RUN; | 1056 | s->lzma2.sequence = SEQ_LZMA_RUN; |
1055 | 1057 | ||
1058 | /* Fall through */ | ||
1059 | |||
1056 | case SEQ_LZMA_RUN: | 1060 | case SEQ_LZMA_RUN: |
1057 | /* | 1061 | /* |
1058 | * Set dictionary limit to indicate how much we want | 1062 | * Set dictionary limit to indicate how much we want |
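The two added comments only mark the intentional fallthrough between the SEQ_* states explicitly, which is what checkpatch and fallthrough warnings look for. The general shape of the idiom, with a hypothetical two-state machine:

enum step { STEP_PREPARE, STEP_RUN };

static void advance(enum step s)
{
	switch (s) {
	case STEP_PREPARE:
		/* ... one-time setup ... */
		/* Fall through - continue straight into the run path */
	case STEP_RUN:
		/* ... per-call work ... */
		break;
	}
}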
diff --git a/mm/Kconfig b/mm/Kconfig index 28cec518f4d4..3e9977a9d657 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -267,6 +267,9 @@ config MIGRATION | |||
267 | pages as migration can relocate pages to satisfy a huge page | 267 | pages as migration can relocate pages to satisfy a huge page |
268 | allocation instead of reclaiming. | 268 | allocation instead of reclaiming. |
269 | 269 | ||
270 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | ||
271 | boolean | ||
272 | |||
270 | config PHYS_ADDR_T_64BIT | 273 | config PHYS_ADDR_T_64BIT |
271 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 274 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
272 | 275 | ||
@@ -433,16 +436,6 @@ choice | |||
433 | benefit. | 436 | benefit. |
434 | endchoice | 437 | endchoice |
435 | 438 | ||
436 | config CROSS_MEMORY_ATTACH | ||
437 | bool "Cross Memory Support" | ||
438 | depends on MMU | ||
439 | default y | ||
440 | help | ||
441 | Enabling this option adds the system calls process_vm_readv and | ||
442 | process_vm_writev which allow a process with the correct privileges | ||
443 | to directly read from or write to to another process's address space. | ||
444 | See the man page for more details. | ||
445 | |||
446 | # | 439 | # |
447 | # UP and nommu archs use km based percpu allocator | 440 | # UP and nommu archs use km based percpu allocator |
448 | # | 441 | # |
@@ -558,7 +551,7 @@ config MEM_SOFT_DIRTY | |||
558 | See Documentation/vm/soft-dirty.txt for more details. | 551 | See Documentation/vm/soft-dirty.txt for more details. |
559 | 552 | ||
560 | config ZSMALLOC | 553 | config ZSMALLOC |
561 | bool "Memory allocator for compressed pages" | 554 | tristate "Memory allocator for compressed pages" |
562 | depends on MMU | 555 | depends on MMU |
563 | default n | 556 | default n |
564 | help | 557 | help |
diff --git a/mm/Makefile b/mm/Makefile index 0173940407f6..4064f3ec145e 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 627dc2e4320f..21bf292b642a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) | |||
89 | unsigned long end_pfn = zone_end_pfn(zone); | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
90 | unsigned long pfn; | 90 | unsigned long pfn; |
91 | 91 | ||
92 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn[0] = start_pfn; |
93 | zone->compact_cached_migrate_pfn[1] = start_pfn; | ||
93 | zone->compact_cached_free_pfn = end_pfn; | 94 | zone->compact_cached_free_pfn = end_pfn; |
94 | zone->compact_blockskip_flush = false; | 95 | zone->compact_blockskip_flush = false; |
95 | 96 | ||
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
131 | */ | 132 | */ |
132 | static void update_pageblock_skip(struct compact_control *cc, | 133 | static void update_pageblock_skip(struct compact_control *cc, |
133 | struct page *page, unsigned long nr_isolated, | 134 | struct page *page, unsigned long nr_isolated, |
134 | bool migrate_scanner) | 135 | bool set_unsuitable, bool migrate_scanner) |
135 | { | 136 | { |
136 | struct zone *zone = cc->zone; | 137 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | ||
137 | 139 | ||
138 | if (cc->ignore_skip_hint) | 140 | if (cc->ignore_skip_hint) |
139 | return; | 141 | return; |
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
141 | if (!page) | 143 | if (!page) |
142 | return; | 144 | return; |
143 | 145 | ||
144 | if (!nr_isolated) { | 146 | if (nr_isolated) |
145 | unsigned long pfn = page_to_pfn(page); | 147 | return; |
148 | |||
149 | /* | ||
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
146 | set_pageblock_skip(page); | 154 | set_pageblock_skip(page); |
147 | 155 | ||
148 | /* Update where compaction should restart */ | 156 | pfn = page_to_pfn(page); |
149 | if (migrate_scanner) { | 157 | |
150 | if (!cc->finished_update_migrate && | 158 | /* Update where async and sync compaction should restart */ |
151 | pfn > zone->compact_cached_migrate_pfn) | 159 | if (migrate_scanner) { |
152 | zone->compact_cached_migrate_pfn = pfn; | 160 | if (cc->finished_update_migrate) |
153 | } else { | 161 | return; |
154 | if (!cc->finished_update_free && | 162 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
155 | pfn < zone->compact_cached_free_pfn) | 163 | zone->compact_cached_migrate_pfn[0] = pfn; |
156 | zone->compact_cached_free_pfn = pfn; | 164 | if (cc->mode != MIGRATE_ASYNC && |
157 | } | 165 | pfn > zone->compact_cached_migrate_pfn[1]) |
166 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
167 | } else { | ||
168 | if (cc->finished_update_free) | ||
169 | return; | ||
170 | if (pfn < zone->compact_cached_free_pfn) | ||
171 | zone->compact_cached_free_pfn = pfn; | ||
158 | } | 172 | } |
159 | } | 173 | } |
160 | #else | 174 | #else |
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
166 | 180 | ||
167 | static void update_pageblock_skip(struct compact_control *cc, | 181 | static void update_pageblock_skip(struct compact_control *cc, |
168 | struct page *page, unsigned long nr_isolated, | 182 | struct page *page, unsigned long nr_isolated, |
169 | bool migrate_scanner) | 183 | bool set_unsuitable, bool migrate_scanner) |
170 | { | 184 | { |
171 | } | 185 | } |
172 | #endif /* CONFIG_COMPACTION */ | 186 | #endif /* CONFIG_COMPACTION */ |
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
195 | } | 209 | } |
196 | 210 | ||
197 | /* async aborts if taking too long or contended */ | 211 | /* async aborts if taking too long or contended */ |
198 | if (!cc->sync) { | 212 | if (cc->mode == MIGRATE_ASYNC) { |
199 | cc->contended = true; | 213 | cc->contended = true; |
200 | return false; | 214 | return false; |
201 | } | 215 | } |
@@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
208 | return true; | 222 | return true; |
209 | } | 223 | } |
210 | 224 | ||
211 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | 225 | /* |
212 | unsigned long *flags, struct compact_control *cc) | 226 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | ||
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | ||
229 | * is used where no lock is concerned. | ||
230 | * | ||
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | ||
232 | * Returns true when async compaction should abort. | ||
233 | */ | ||
234 | static inline bool compact_should_abort(struct compact_control *cc) | ||
213 | { | 235 | { |
214 | return compact_checklock_irqsave(lock, flags, false, cc); | 236 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | ||
238 | if (cc->mode == MIGRATE_ASYNC) { | ||
239 | cc->contended = true; | ||
240 | return true; | ||
241 | } | ||
242 | |||
243 | cond_resched(); | ||
244 | } | ||
245 | |||
246 | return false; | ||
215 | } | 247 | } |
216 | 248 | ||
217 | /* Returns true if the page is within a block suitable for migration to */ | 249 | /* Returns true if the page is within a block suitable for migration to */ |
@@ -329,7 +361,8 @@ isolate_fail: | |||
329 | 361 | ||
330 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 362 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
331 | if (blockpfn == end_pfn) | 363 | if (blockpfn == end_pfn) |
332 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 364 | update_pageblock_skip(cc, valid_page, total_isolated, true, |
365 | false); | ||
333 | 366 | ||
334 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 367 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
335 | if (total_isolated) | 368 | if (total_isolated) |
@@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
464 | unsigned long flags; | 497 | unsigned long flags; |
465 | bool locked = false; | 498 | bool locked = false; |
466 | struct page *page = NULL, *valid_page = NULL; | 499 | struct page *page = NULL, *valid_page = NULL; |
467 | bool skipped_async_unsuitable = false; | 500 | bool set_unsuitable = true; |
468 | const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | | 501 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? |
502 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
469 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | 503 | (unevictable ? ISOLATE_UNEVICTABLE : 0); |
470 | 504 | ||
471 | /* | 505 | /* |
@@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
475 | */ | 509 | */ |
476 | while (unlikely(too_many_isolated(zone))) { | 510 | while (unlikely(too_many_isolated(zone))) { |
477 | /* async migration should just abort */ | 511 | /* async migration should just abort */ |
478 | if (!cc->sync) | 512 | if (cc->mode == MIGRATE_ASYNC) |
479 | return 0; | 513 | return 0; |
480 | 514 | ||
481 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 515 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
484 | return 0; | 518 | return 0; |
485 | } | 519 | } |
486 | 520 | ||
521 | if (compact_should_abort(cc)) | ||
522 | return 0; | ||
523 | |||
487 | /* Time to isolate some pages for migration */ | 524 | /* Time to isolate some pages for migration */ |
488 | cond_resched(); | ||
489 | for (; low_pfn < end_pfn; low_pfn++) { | 525 | for (; low_pfn < end_pfn; low_pfn++) { |
490 | /* give a chance to irqs before checking need_resched() */ | 526 | /* give a chance to irqs before checking need_resched() */ |
491 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | 527 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
@@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
540 | * the minimum amount of work satisfies the allocation | 576 | * the minimum amount of work satisfies the allocation |
541 | */ | 577 | */ |
542 | mt = get_pageblock_migratetype(page); | 578 | mt = get_pageblock_migratetype(page); |
543 | if (!cc->sync && !migrate_async_suitable(mt)) { | 579 | if (cc->mode == MIGRATE_ASYNC && |
544 | cc->finished_update_migrate = true; | 580 | !migrate_async_suitable(mt)) { |
545 | skipped_async_unsuitable = true; | 581 | set_unsuitable = false; |
546 | goto next_pageblock; | 582 | goto next_pageblock; |
547 | } | 583 | } |
548 | } | 584 | } |
@@ -646,11 +682,10 @@ next_pageblock: | |||
646 | /* | 682 | /* |
647 | * Update the pageblock-skip information and cached scanner pfn, | 683 | * Update the pageblock-skip information and cached scanner pfn, |
648 | * if the whole pageblock was scanned without isolating any page. | 684 | * if the whole pageblock was scanned without isolating any page. |
649 | * This is not done when pageblock was skipped due to being unsuitable | ||
650 | * for async compaction, so that eventual sync compaction can try. | ||
651 | */ | 685 | */ |
652 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | 686 | if (low_pfn == end_pfn) |
653 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 687 | update_pageblock_skip(cc, valid_page, nr_isolated, |
688 | set_unsuitable, true); | ||
654 | 689 | ||
655 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 690 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
656 | 691 | ||
@@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone, | |||
671 | struct compact_control *cc) | 706 | struct compact_control *cc) |
672 | { | 707 | { |
673 | struct page *page; | 708 | struct page *page; |
674 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn; | 709 | unsigned long block_start_pfn; /* start of current pageblock */ |
710 | unsigned long block_end_pfn; /* end of current pageblock */ | ||
711 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | ||
675 | int nr_freepages = cc->nr_freepages; | 712 | int nr_freepages = cc->nr_freepages; |
676 | struct list_head *freelist = &cc->freepages; | 713 | struct list_head *freelist = &cc->freepages; |
677 | 714 | ||
@@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone, | |||
679 | * Initialise the free scanner. The starting point is where we last | 716 | * Initialise the free scanner. The starting point is where we last |
680 | * successfully isolated from, zone-cached value, or the end of the | 717 | * successfully isolated from, zone-cached value, or the end of the |
681 | * zone when isolating for the first time. We need this aligned to | 718 | * zone when isolating for the first time. We need this aligned to |
682 | * the pageblock boundary, because we do pfn -= pageblock_nr_pages | 719 | * the pageblock boundary, because we do |
683 | * in the for loop. | 720 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
721 | * For ending point, take care when isolating in last pageblock of a | ||
722 | * zone which ends in the middle of a pageblock. | ||
684 | * The low boundary is the end of the pageblock the migration scanner | 723 | * The low boundary is the end of the pageblock the migration scanner |
685 | * is using. | 724 | * is using. |
686 | */ | 725 | */ |
687 | pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 726 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
727 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | ||
728 | zone_end_pfn(zone)); | ||
688 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); | 729 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
689 | 730 | ||
690 | /* | 731 | /* |
691 | * Take care that if the migration scanner is at the end of the zone | ||
692 | * that the free scanner does not accidentally move to the next zone | ||
693 | * in the next isolation cycle. | ||
694 | */ | ||
695 | high_pfn = min(low_pfn, pfn); | ||
696 | |||
697 | z_end_pfn = zone_end_pfn(zone); | ||
698 | |||
699 | /* | ||
700 | * Isolate free pages until enough are available to migrate the | 732 | * Isolate free pages until enough are available to migrate the |
701 | * pages on cc->migratepages. We stop searching if the migrate | 733 | * pages on cc->migratepages. We stop searching if the migrate |
702 | * and free page scanners meet or enough free pages are isolated. | 734 | * and free page scanners meet or enough free pages are isolated. |
703 | */ | 735 | */ |
704 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 736 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
705 | pfn -= pageblock_nr_pages) { | 737 | block_end_pfn = block_start_pfn, |
738 | block_start_pfn -= pageblock_nr_pages) { | ||
706 | unsigned long isolated; | 739 | unsigned long isolated; |
707 | unsigned long end_pfn; | ||
708 | 740 | ||
709 | /* | 741 | /* |
710 | * This can iterate a massively long zone without finding any | 742 | * This can iterate a massively long zone without finding any |
711 | * suitable migration targets, so periodically check if we need | 743 | * suitable migration targets, so periodically check if we need |
712 | * to schedule. | 744 | * to schedule, or even abort async compaction. |
713 | */ | 745 | */ |
714 | cond_resched(); | 746 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) |
747 | && compact_should_abort(cc)) | ||
748 | break; | ||
715 | 749 | ||
716 | if (!pfn_valid(pfn)) | 750 | if (!pfn_valid(block_start_pfn)) |
717 | continue; | 751 | continue; |
718 | 752 | ||
719 | /* | 753 | /* |
@@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone, | |||
723 | * i.e. it's possible that all pages within a zones range of | 757 | * i.e. it's possible that all pages within a zones range of |
724 | * pages do not belong to a single zone. | 758 | * pages do not belong to a single zone. |
725 | */ | 759 | */ |
726 | page = pfn_to_page(pfn); | 760 | page = pfn_to_page(block_start_pfn); |
727 | if (page_zone(page) != zone) | 761 | if (page_zone(page) != zone) |
728 | continue; | 762 | continue; |
729 | 763 | ||
@@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone, | |||
736 | continue; | 770 | continue; |
737 | 771 | ||
738 | /* Found a block suitable for isolating free pages from */ | 772 | /* Found a block suitable for isolating free pages from */ |
739 | isolated = 0; | 773 | cc->free_pfn = block_start_pfn; |
774 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
775 | block_end_pfn, freelist, false); | ||
776 | nr_freepages += isolated; | ||
740 | 777 | ||
741 | /* | 778 | /* |
742 | * Take care when isolating in last pageblock of a zone which | 779 | * Set a flag that we successfully isolated in this pageblock. |
743 | * ends in the middle of a pageblock. | 780 | * In the next loop iteration, zone->compact_cached_free_pfn |
781 | * will not be updated and thus it will effectively contain the | ||
782 | * highest pageblock we isolated pages from. | ||
744 | */ | 783 | */ |
745 | end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); | 784 | if (isolated) |
746 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 785 | cc->finished_update_free = true; |
747 | freelist, false); | ||
748 | nr_freepages += isolated; | ||
749 | 786 | ||
750 | /* | 787 | /* |
751 | * Record the highest PFN we isolated pages from. When next | 788 | * isolate_freepages_block() might have aborted due to async |
752 | * looking for free pages, the search will restart here as | 789 | * compaction being contended |
753 | * page migration may have returned some pages to the allocator | ||
754 | */ | 790 | */ |
755 | if (isolated) { | 791 | if (cc->contended) |
756 | cc->finished_update_free = true; | 792 | break; |
757 | high_pfn = max(high_pfn, pfn); | ||
758 | } | ||
759 | } | 793 | } |
760 | 794 | ||
761 | /* split_free_page does not map the pages */ | 795 | /* split_free_page does not map the pages */ |
@@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone, | |||
765 | * If we crossed the migrate scanner, we want to keep it that way | 799 | * If we crossed the migrate scanner, we want to keep it that way |
766 | * so that compact_finished() may detect this | 800 | * so that compact_finished() may detect this |
767 | */ | 801 | */ |
768 | if (pfn < low_pfn) | 802 | if (block_start_pfn < low_pfn) |
769 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | 803 | cc->free_pfn = cc->migrate_pfn; |
770 | else | 804 | |
771 | cc->free_pfn = high_pfn; | ||
772 | cc->nr_freepages = nr_freepages; | 805 | cc->nr_freepages = nr_freepages; |
773 | } | 806 | } |
774 | 807 | ||
@@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
783 | struct compact_control *cc = (struct compact_control *)data; | 816 | struct compact_control *cc = (struct compact_control *)data; |
784 | struct page *freepage; | 817 | struct page *freepage; |
785 | 818 | ||
786 | /* Isolate free pages if necessary */ | 819 | /* |
820 | * Isolate free pages if necessary, and if we are not aborting due to | ||
821 | * contention. | ||
822 | */ | ||
787 | if (list_empty(&cc->freepages)) { | 823 | if (list_empty(&cc->freepages)) { |
788 | isolate_freepages(cc->zone, cc); | 824 | if (!cc->contended) |
825 | isolate_freepages(cc->zone, cc); | ||
789 | 826 | ||
790 | if (list_empty(&cc->freepages)) | 827 | if (list_empty(&cc->freepages)) |
791 | return NULL; | 828 | return NULL; |
@@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
799 | } | 836 | } |
800 | 837 | ||
801 | /* | 838 | /* |
802 | * We cannot control nr_migratepages and nr_freepages fully when migration is | 839 | * This is a migrate-callback that "frees" freepages back to the isolated |
803 | * running as migrate_pages() has no knowledge of compact_control. When | 840 | * freelist. All pages on the freelist are from the same zone, so there is no |
804 | * migration is complete, we count the number of pages on the lists by hand. | 841 | * special handling needed for NUMA. |
805 | */ | 842 | */ |
806 | static void update_nr_listpages(struct compact_control *cc) | 843 | static void compaction_free(struct page *page, unsigned long data) |
807 | { | 844 | { |
808 | int nr_migratepages = 0; | 845 | struct compact_control *cc = (struct compact_control *)data; |
809 | int nr_freepages = 0; | ||
810 | struct page *page; | ||
811 | |||
812 | list_for_each_entry(page, &cc->migratepages, lru) | ||
813 | nr_migratepages++; | ||
814 | list_for_each_entry(page, &cc->freepages, lru) | ||
815 | nr_freepages++; | ||
816 | 846 | ||
817 | cc->nr_migratepages = nr_migratepages; | 847 | list_add(&page->lru, &cc->freepages); |
818 | cc->nr_freepages = nr_freepages; | 848 | cc->nr_freepages++; |
819 | } | 849 | } |
820 | 850 | ||
821 | /* possible outcome of isolate_migratepages */ | 851 | /* possible outcome of isolate_migratepages */ |
@@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone, | |||
862 | unsigned int order; | 892 | unsigned int order; |
863 | unsigned long watermark; | 893 | unsigned long watermark; |
864 | 894 | ||
865 | if (fatal_signal_pending(current)) | 895 | if (cc->contended || fatal_signal_pending(current)) |
866 | return COMPACT_PARTIAL; | 896 | return COMPACT_PARTIAL; |
867 | 897 | ||
868 | /* Compaction run completes if the migrate and free scanner meet */ | 898 | /* Compaction run completes if the migrate and free scanner meet */ |
869 | if (cc->free_pfn <= cc->migrate_pfn) { | 899 | if (cc->free_pfn <= cc->migrate_pfn) { |
870 | /* Let the next compaction start anew. */ | 900 | /* Let the next compaction start anew. */ |
871 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | 901 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; |
902 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; | ||
872 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | 903 | zone->compact_cached_free_pfn = zone_end_pfn(zone); |
873 | 904 | ||
874 | /* | 905 | /* |
@@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
968 | int ret; | 999 | int ret; |
969 | unsigned long start_pfn = zone->zone_start_pfn; | 1000 | unsigned long start_pfn = zone->zone_start_pfn; |
970 | unsigned long end_pfn = zone_end_pfn(zone); | 1001 | unsigned long end_pfn = zone_end_pfn(zone); |
1002 | const bool sync = cc->mode != MIGRATE_ASYNC; | ||
971 | 1003 | ||
972 | ret = compaction_suitable(zone, cc->order); | 1004 | ret = compaction_suitable(zone, cc->order); |
973 | switch (ret) { | 1005 | switch (ret) { |
@@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
993 | * information on where the scanners should start but check that it | 1025 | * information on where the scanners should start but check that it |
994 | * is initialised by ensuring the values are within zone boundaries. | 1026 | * is initialised by ensuring the values are within zone boundaries. |
995 | */ | 1027 | */ |
996 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; | 1028 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; |
997 | cc->free_pfn = zone->compact_cached_free_pfn; | 1029 | cc->free_pfn = zone->compact_cached_free_pfn; |
998 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { | 1030 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
999 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); | 1031 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
@@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1001 | } | 1033 | } |
1002 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | 1034 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { |
1003 | cc->migrate_pfn = start_pfn; | 1035 | cc->migrate_pfn = start_pfn; |
1004 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 1036 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1037 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | ||
1005 | } | 1038 | } |
1006 | 1039 | ||
1007 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1040 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
@@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1009 | migrate_prep_local(); | 1042 | migrate_prep_local(); |
1010 | 1043 | ||
1011 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1044 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
1012 | unsigned long nr_migrate, nr_remaining; | ||
1013 | int err; | 1045 | int err; |
1014 | 1046 | ||
1015 | switch (isolate_migratepages(zone, cc)) { | 1047 | switch (isolate_migratepages(zone, cc)) { |
@@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1024 | ; | 1056 | ; |
1025 | } | 1057 | } |
1026 | 1058 | ||
1027 | nr_migrate = cc->nr_migratepages; | 1059 | if (!cc->nr_migratepages) |
1060 | continue; | ||
1061 | |||
1028 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1062 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1029 | (unsigned long)cc, | 1063 | compaction_free, (unsigned long)cc, cc->mode, |
1030 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | ||
1031 | MR_COMPACTION); | 1064 | MR_COMPACTION); |
1032 | update_nr_listpages(cc); | ||
1033 | nr_remaining = cc->nr_migratepages; | ||
1034 | 1065 | ||
1035 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1066 | trace_mm_compaction_migratepages(cc->nr_migratepages, err, |
1036 | nr_remaining); | 1067 | &cc->migratepages); |
1037 | 1068 | ||
1038 | /* Release isolated pages not migrated */ | 1069 | /* All pages were either migrated or will be released */ |
1070 | cc->nr_migratepages = 0; | ||
1039 | if (err) { | 1071 | if (err) { |
1040 | putback_movable_pages(&cc->migratepages); | 1072 | putback_movable_pages(&cc->migratepages); |
1041 | cc->nr_migratepages = 0; | ||
1042 | /* | 1073 | /* |
1043 | * migrate_pages() may return -ENOMEM when scanners meet | 1074 | * migrate_pages() may return -ENOMEM when scanners meet |
1044 | * and we want compact_finished() to detect it | 1075 | * and we want compact_finished() to detect it |
@@ -1060,9 +1091,8 @@ out: | |||
1060 | return ret; | 1091 | return ret; |
1061 | } | 1092 | } |
1062 | 1093 | ||
1063 | static unsigned long compact_zone_order(struct zone *zone, | 1094 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1064 | int order, gfp_t gfp_mask, | 1095 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) |
1065 | bool sync, bool *contended) | ||
1066 | { | 1096 | { |
1067 | unsigned long ret; | 1097 | unsigned long ret; |
1068 | struct compact_control cc = { | 1098 | struct compact_control cc = { |
@@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1071 | .order = order, | 1101 | .order = order, |
1072 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1102 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1073 | .zone = zone, | 1103 | .zone = zone, |
1074 | .sync = sync, | 1104 | .mode = mode, |
1075 | }; | 1105 | }; |
1076 | INIT_LIST_HEAD(&cc.freepages); | 1106 | INIT_LIST_HEAD(&cc.freepages); |
1077 | INIT_LIST_HEAD(&cc.migratepages); | 1107 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; | |||
1093 | * @order: The order of the current allocation | 1123 | * @order: The order of the current allocation |
1094 | * @gfp_mask: The GFP mask of the current allocation | 1124 | * @gfp_mask: The GFP mask of the current allocation |
1095 | * @nodemask: The allowed nodes to allocate from | 1125 | * @nodemask: The allowed nodes to allocate from |
1096 | * @sync: Whether migration is synchronous or not | 1126 | * @mode: The migration mode for async, sync light, or sync migration |
1097 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1127 | * @contended: Return value that is true if compaction was aborted due to lock contention |
1098 | * @page: Optionally capture a free page of the requested order during compaction | 1128 | * @page: Optionally capture a free page of the requested order during compaction |
1099 | * | 1129 | * |
@@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; | |||
1101 | */ | 1131 | */ |
1102 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1132 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1103 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1133 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1104 | bool sync, bool *contended) | 1134 | enum migrate_mode mode, bool *contended) |
1105 | { | 1135 | { |
1106 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1136 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1107 | int may_enter_fs = gfp_mask & __GFP_FS; | 1137 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1126 | nodemask) { | 1156 | nodemask) { |
1127 | int status; | 1157 | int status; |
1128 | 1158 | ||
1129 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1159 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1130 | contended); | 1160 | contended); |
1131 | rc = max(status, rc); | 1161 | rc = max(status, rc); |
1132 | 1162 | ||
@@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1165 | if (zone_watermark_ok(zone, cc->order, | 1195 | if (zone_watermark_ok(zone, cc->order, |
1166 | low_wmark_pages(zone), 0, 0)) | 1196 | low_wmark_pages(zone), 0, 0)) |
1167 | compaction_defer_reset(zone, cc->order, false); | 1197 | compaction_defer_reset(zone, cc->order, false); |
1168 | /* Currently async compaction is never deferred. */ | ||
1169 | else if (cc->sync) | ||
1170 | defer_compaction(zone, cc->order); | ||
1171 | } | 1198 | } |
1172 | 1199 | ||
1173 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1200 | VM_BUG_ON(!list_empty(&cc->freepages)); |
@@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) | |||
1179 | { | 1206 | { |
1180 | struct compact_control cc = { | 1207 | struct compact_control cc = { |
1181 | .order = order, | 1208 | .order = order, |
1182 | .sync = false, | 1209 | .mode = MIGRATE_ASYNC, |
1183 | }; | 1210 | }; |
1184 | 1211 | ||
1185 | if (!order) | 1212 | if (!order) |
@@ -1192,7 +1219,7 @@ static void compact_node(int nid) | |||
1192 | { | 1219 | { |
1193 | struct compact_control cc = { | 1220 | struct compact_control cc = { |
1194 | .order = -1, | 1221 | .order = -1, |
1195 | .sync = true, | 1222 | .mode = MIGRATE_SYNC, |
1196 | .ignore_skip_hint = true, | 1223 | .ignore_skip_hint = true, |
1197 | }; | 1224 | }; |
1198 | 1225 | ||
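The compaction rework above replaces the old bool sync flag with enum migrate_mode (MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC), so callers and the per-mode cached scanner positions can tell "light" sync apart from full sync instead of collapsing everything into yes/no. A toy helper, not from this patch, showing the kind of distinction the bool could not express:

#include <linux/migrate_mode.h>
#include <linux/types.h>

/* Hypothetical policy check: only full sync migration may wait on writeback. */
static bool may_wait_on_writeback(enum migrate_mode mode)
{
	return mode == MIGRATE_SYNC;	/* not MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT */
}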
diff --git a/mm/dmapool.c b/mm/dmapool.c index 8058fcd7ae91..306baa594f95 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
170 | retval->boundary = boundary; | 170 | retval->boundary = boundary; |
171 | retval->allocation = allocation; | 171 | retval->allocation = allocation; |
172 | 172 | ||
173 | if (dev) { | 173 | INIT_LIST_HEAD(&retval->pools); |
174 | int ret; | ||
175 | 174 | ||
176 | mutex_lock(&pools_lock); | 175 | mutex_lock(&pools_lock); |
177 | if (list_empty(&dev->dma_pools)) | 176 | if (list_empty(&dev->dma_pools) && |
178 | ret = device_create_file(dev, &dev_attr_pools); | 177 | device_create_file(dev, &dev_attr_pools)) { |
179 | else | 178 | kfree(retval); |
180 | ret = 0; | 179 | return NULL; |
181 | /* note: not currently insisting "name" be unique */ | ||
182 | if (!ret) | ||
183 | list_add(&retval->pools, &dev->dma_pools); | ||
184 | else { | ||
185 | kfree(retval); | ||
186 | retval = NULL; | ||
187 | } | ||
188 | mutex_unlock(&pools_lock); | ||
189 | } else | 180 | } else |
190 | INIT_LIST_HEAD(&retval->pools); | 181 | list_add(&retval->pools, &dev->dma_pools); |
182 | mutex_unlock(&pools_lock); | ||
191 | 183 | ||
192 | return retval; | 184 | return retval; |
193 | } | 185 | } |
@@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
508 | { | 500 | { |
509 | struct device *dev = pool->dev; | 501 | struct device *dev = pool->dev; |
510 | 502 | ||
511 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 503 | WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); |
512 | dma_pool_destroy(pool); | ||
513 | } | 504 | } |
514 | EXPORT_SYMBOL(dmam_pool_destroy); | 505 | EXPORT_SYMBOL(dmam_pool_destroy); |
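dmam_pool_destroy() shrinks because devres_release() differs from devres_destroy() in exactly one respect: it also invokes the registered release callback (here dmam_pool_release(), which in turn calls dma_pool_destroy()), so the separate destroy call becomes redundant. The same pattern for a hypothetical managed resource:

#include <linux/device.h>
#include <linux/kernel.h>

/*
 * devres_destroy(): unlink and free the devres entry only; the caller must
 * still tear the resource down itself.
 * devres_release(): unlink the entry, call its release function, then free
 * it - the whole managed teardown in one step.
 */
static void example_managed_free(struct device *dev, void *res,
				 dr_release_t release, dr_match_t match)
{
	WARN_ON(devres_release(dev, release, match, res));
}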
diff --git a/mm/filemap.c b/mm/filemap.c index 021056c324e6..7fadf1c62838 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -753,8 +753,17 @@ EXPORT_SYMBOL(unlock_page); | |||
753 | */ | 753 | */ |
754 | void end_page_writeback(struct page *page) | 754 | void end_page_writeback(struct page *page) |
755 | { | 755 | { |
756 | if (TestClearPageReclaim(page)) | 756 | /* |
757 | * TestClearPageReclaim could be used here but it is an atomic | ||
758 | * operation and overkill in this particular case. Failing to | ||
759 | * shuffle a page marked for immediate reclaim is too mild to | ||
760 | * justify taking an atomic operation penalty at the end of | ||
761 | * every page writeback. | ||
762 | */ | ||
763 | if (PageReclaim(page)) { | ||
764 | ClearPageReclaim(page); | ||
757 | rotate_reclaimable_page(page); | 765 | rotate_reclaimable_page(page); |
766 | } | ||
758 | 767 | ||
759 | if (!test_clear_page_writeback(page)) | 768 | if (!test_clear_page_writeback(page)) |
760 | BUG(); | 769 | BUG(); |
@@ -764,6 +773,31 @@ void end_page_writeback(struct page *page) | |||
764 | } | 773 | } |
765 | EXPORT_SYMBOL(end_page_writeback); | 774 | EXPORT_SYMBOL(end_page_writeback); |
766 | 775 | ||
776 | /* | ||
777 | * After completing I/O on a page, call this routine to update the page | ||
778 | * flags appropriately | ||
779 | */ | ||
780 | void page_endio(struct page *page, int rw, int err) | ||
781 | { | ||
782 | if (rw == READ) { | ||
783 | if (!err) { | ||
784 | SetPageUptodate(page); | ||
785 | } else { | ||
786 | ClearPageUptodate(page); | ||
787 | SetPageError(page); | ||
788 | } | ||
789 | unlock_page(page); | ||
790 | } else { /* rw == WRITE */ | ||
791 | if (err) { | ||
792 | SetPageError(page); | ||
793 | if (page->mapping) | ||
794 | mapping_set_error(page->mapping, err); | ||
795 | } | ||
796 | end_page_writeback(page); | ||
797 | } | ||
798 | } | ||
799 | EXPORT_SYMBOL_GPL(page_endio); | ||
800 | |||
767 | /** | 801 | /** |
768 | * __lock_page - get a lock on the page, assuming we need to sleep to get it | 802 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
769 | * @page: the page to lock | 803 | * @page: the page to lock |
@@ -957,26 +991,6 @@ out: | |||
957 | EXPORT_SYMBOL(find_get_entry); | 991 | EXPORT_SYMBOL(find_get_entry); |
958 | 992 | ||
959 | /** | 993 | /** |
960 | * find_get_page - find and get a page reference | ||
961 | * @mapping: the address_space to search | ||
962 | * @offset: the page index | ||
963 | * | ||
964 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
965 | * page cache page, it is returned with an increased refcount. | ||
966 | * | ||
967 | * Otherwise, %NULL is returned. | ||
968 | */ | ||
969 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) | ||
970 | { | ||
971 | struct page *page = find_get_entry(mapping, offset); | ||
972 | |||
973 | if (radix_tree_exceptional_entry(page)) | ||
974 | page = NULL; | ||
975 | return page; | ||
976 | } | ||
977 | EXPORT_SYMBOL(find_get_page); | ||
978 | |||
979 | /** | ||
980 | * find_lock_entry - locate, pin and lock a page cache entry | 994 | * find_lock_entry - locate, pin and lock a page cache entry |
981 | * @mapping: the address_space to search | 995 | * @mapping: the address_space to search |
982 | * @offset: the page cache index | 996 | * @offset: the page cache index |
@@ -1013,66 +1027,84 @@ repeat: | |||
1013 | EXPORT_SYMBOL(find_lock_entry); | 1027 | EXPORT_SYMBOL(find_lock_entry); |
1014 | 1028 | ||
1015 | /** | 1029 | /** |
1016 | * find_lock_page - locate, pin and lock a pagecache page | 1030 | * pagecache_get_page - find and get a page reference |
1017 | * @mapping: the address_space to search | 1031 | * @mapping: the address_space to search |
1018 | * @offset: the page index | 1032 | * @offset: the page index |
1033 | * @fgp_flags: FGP flags | ||
1034 | * @gfp_mask: gfp mask to use if a page is to be allocated | ||
1019 | * | 1035 | * |
1020 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1036 | * Looks up the page cache slot at @mapping & @offset. |
1021 | * page cache page, it is returned locked and with an increased | ||
1022 | * refcount. | ||
1023 | * | ||
1024 | * Otherwise, %NULL is returned. | ||
1025 | * | ||
1026 | * find_lock_page() may sleep. | ||
1027 | */ | ||
1028 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | ||
1029 | { | ||
1030 | struct page *page = find_lock_entry(mapping, offset); | ||
1031 | |||
1032 | if (radix_tree_exceptional_entry(page)) | ||
1033 | page = NULL; | ||
1034 | return page; | ||
1035 | } | ||
1036 | EXPORT_SYMBOL(find_lock_page); | ||
1037 | |||
1038 | /** | ||
1039 | * find_or_create_page - locate or add a pagecache page | ||
1040 | * @mapping: the page's address_space | ||
1041 | * @index: the page's index into the mapping | ||
1042 | * @gfp_mask: page allocation mode | ||
1043 | * | 1037 | * |
1044 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1038 | * FGP flags modify how the page is returned |
1045 | * page cache page, it is returned locked and with an increased | ||
1046 | * refcount. | ||
1047 | * | 1039 | * |
1048 | * If the page is not present, a new page is allocated using @gfp_mask | 1040 | * FGP_ACCESSED: the page will be marked accessed |
1049 | * and added to the page cache and the VM's LRU list. The page is | 1041 | * FGP_LOCK: Page is returned locked |
1050 | * returned locked and with an increased refcount. | 1042 | * FGP_CREAT: If page is not present then a new page is allocated using |
1043 | * @gfp_mask and added to the page cache and the VM's LRU | ||
1044 | * list. The page is returned locked and with an increased | ||
1045 | * refcount. Otherwise, %NULL is returned. | ||
1051 | * | 1046 | * |
1052 | * On memory exhaustion, %NULL is returned. | 1047 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
1048 | * if the GFP flags specified for FGP_CREAT are atomic. | ||
1053 | * | 1049 | * |
1054 | * find_or_create_page() may sleep, even if @gfp_flags specifies an | 1050 | * If there is a page cache page, it is returned with an increased refcount. |
1055 | * atomic allocation! | ||
1056 | */ | 1051 | */ |
1057 | struct page *find_or_create_page(struct address_space *mapping, | 1052 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
1058 | pgoff_t index, gfp_t gfp_mask) | 1053 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) |
1059 | { | 1054 | { |
1060 | struct page *page; | 1055 | struct page *page; |
1061 | int err; | 1056 | |
1062 | repeat: | 1057 | repeat: |
1063 | page = find_lock_page(mapping, index); | 1058 | page = find_get_entry(mapping, offset); |
1064 | if (!page) { | 1059 | if (radix_tree_exceptional_entry(page)) |
1065 | page = __page_cache_alloc(gfp_mask); | 1060 | page = NULL; |
1061 | if (!page) | ||
1062 | goto no_page; | ||
1063 | |||
1064 | if (fgp_flags & FGP_LOCK) { | ||
1065 | if (fgp_flags & FGP_NOWAIT) { | ||
1066 | if (!trylock_page(page)) { | ||
1067 | page_cache_release(page); | ||
1068 | return NULL; | ||
1069 | } | ||
1070 | } else { | ||
1071 | lock_page(page); | ||
1072 | } | ||
1073 | |||
1074 | /* Has the page been truncated? */ | ||
1075 | if (unlikely(page->mapping != mapping)) { | ||
1076 | unlock_page(page); | ||
1077 | page_cache_release(page); | ||
1078 | goto repeat; | ||
1079 | } | ||
1080 | VM_BUG_ON_PAGE(page->index != offset, page); | ||
1081 | } | ||
1082 | |||
1083 | if (page && (fgp_flags & FGP_ACCESSED)) | ||
1084 | mark_page_accessed(page); | ||
1085 | |||
1086 | no_page: | ||
1087 | if (!page && (fgp_flags & FGP_CREAT)) { | ||
1088 | int err; | ||
1089 | if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) | ||
1090 | cache_gfp_mask |= __GFP_WRITE; | ||
1091 | if (fgp_flags & FGP_NOFS) { | ||
1092 | cache_gfp_mask &= ~__GFP_FS; | ||
1093 | radix_gfp_mask &= ~__GFP_FS; | ||
1094 | } | ||
1095 | |||
1096 | page = __page_cache_alloc(cache_gfp_mask); | ||
1066 | if (!page) | 1097 | if (!page) |
1067 | return NULL; | 1098 | return NULL; |
1068 | /* | 1099 | |
1069 | * We want a regular kernel memory (not highmem or DMA etc) | 1100 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) |
1070 | * allocation for the radix tree nodes, but we need to honour | 1101 | fgp_flags |= FGP_LOCK; |
1071 | * the context-specific requirements the caller has asked for. | 1102 | |
1072 | * GFP_RECLAIM_MASK collects those requirements. | 1103 | /* Init accessed so avoid atomic mark_page_accessed later */ |
1073 | */ | 1104 | if (fgp_flags & FGP_ACCESSED) |
1074 | err = add_to_page_cache_lru(page, mapping, index, | 1105 | init_page_accessed(page); |
1075 | (gfp_mask & GFP_RECLAIM_MASK)); | 1106 | |
1107 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); | ||
1076 | if (unlikely(err)) { | 1108 | if (unlikely(err)) { |
1077 | page_cache_release(page); | 1109 | page_cache_release(page); |
1078 | page = NULL; | 1110 | page = NULL; |
@@ -1080,9 +1112,10 @@ repeat: | |||
1080 | goto repeat; | 1112 | goto repeat; |
1081 | } | 1113 | } |
1082 | } | 1114 | } |
1115 | |||
1083 | return page; | 1116 | return page; |
1084 | } | 1117 | } |
1085 | EXPORT_SYMBOL(find_or_create_page); | 1118 | EXPORT_SYMBOL(pagecache_get_page); |
1086 | 1119 | ||
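The find_get_page() and find_or_create_page() lookups removed above can be expressed as thin wrappers around pagecache_get_page(); the sketch below (with example_ prefixes to mark it as illustrative) mirrors the inline wrappers this series adds to include/linux/pagemap.h, masking the radix-tree allocation with GFP_RECLAIM_MASK just as the removed filemap.c code did (the macro lives in mm/internal.h).

#include <linux/pagemap.h>

static inline struct page *example_find_get_page(struct address_space *mapping,
						 pgoff_t offset)
{
	/* No FGP flags: plain lookup, NULL if nothing is cached */
	return pagecache_get_page(mapping, offset, 0, 0, 0);
}

static inline struct page *example_find_or_create_page(struct address_space *mapping,
							pgoff_t offset, gfp_t gfp_mask)
{
	/* Locked, marked accessed, allocated on miss - the old semantics */
	return pagecache_get_page(mapping, offset,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				  gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
}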
1087 | /** | 1120 | /** |
1088 | * find_get_entries - gang pagecache lookup | 1121 | * find_get_entries - gang pagecache lookup |
@@ -1379,39 +1412,6 @@ repeat: | |||
1379 | } | 1412 | } |
1380 | EXPORT_SYMBOL(find_get_pages_tag); | 1413 | EXPORT_SYMBOL(find_get_pages_tag); |
1381 | 1414 | ||
1382 | /** | ||
1383 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
1384 | * @mapping: target address_space | ||
1385 | * @index: the page index | ||
1386 | * | ||
1387 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
1388 | * This is intended for speculative data generators, where the data can | ||
1389 | * be regenerated if the page couldn't be grabbed. This routine should | ||
1390 | * be safe to call while holding the lock for another page. | ||
1391 | * | ||
1392 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
1393 | * and deadlock against the caller's locked page. | ||
1394 | */ | ||
1395 | struct page * | ||
1396 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | ||
1397 | { | ||
1398 | struct page *page = find_get_page(mapping, index); | ||
1399 | |||
1400 | if (page) { | ||
1401 | if (trylock_page(page)) | ||
1402 | return page; | ||
1403 | page_cache_release(page); | ||
1404 | return NULL; | ||
1405 | } | ||
1406 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
1407 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { | ||
1408 | page_cache_release(page); | ||
1409 | page = NULL; | ||
1410 | } | ||
1411 | return page; | ||
1412 | } | ||
1413 | EXPORT_SYMBOL(grab_cache_page_nowait); | ||
1414 | |||
1415 | /* | 1415 | /* |
1416 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | 1416 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
1417 | * a _large_ part of the i/o request. Imagine the worst scenario: | 1417 | * a _large_ part of the i/o request. Imagine the worst scenario: |
@@ -2381,7 +2381,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, | |||
2381 | { | 2381 | { |
2382 | const struct address_space_operations *aops = mapping->a_ops; | 2382 | const struct address_space_operations *aops = mapping->a_ops; |
2383 | 2383 | ||
2384 | mark_page_accessed(page); | ||
2385 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); | 2384 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); |
2386 | } | 2385 | } |
2387 | EXPORT_SYMBOL(pagecache_write_end); | 2386 | EXPORT_SYMBOL(pagecache_write_end); |
@@ -2463,34 +2462,18 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
2463 | struct page *grab_cache_page_write_begin(struct address_space *mapping, | 2462 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2464 | pgoff_t index, unsigned flags) | 2463 | pgoff_t index, unsigned flags) |
2465 | { | 2464 | { |
2466 | int status; | ||
2467 | gfp_t gfp_mask; | ||
2468 | struct page *page; | 2465 | struct page *page; |
2469 | gfp_t gfp_notmask = 0; | 2466 | int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; |
2470 | 2467 | ||
2471 | gfp_mask = mapping_gfp_mask(mapping); | ||
2472 | if (mapping_cap_account_dirty(mapping)) | ||
2473 | gfp_mask |= __GFP_WRITE; | ||
2474 | if (flags & AOP_FLAG_NOFS) | 2468 | if (flags & AOP_FLAG_NOFS) |
2475 | gfp_notmask = __GFP_FS; | 2469 | fgp_flags |= FGP_NOFS; |
2476 | repeat: | 2470 | |
2477 | page = find_lock_page(mapping, index); | 2471 | page = pagecache_get_page(mapping, index, fgp_flags, |
2472 | mapping_gfp_mask(mapping), | ||
2473 | GFP_KERNEL); | ||
2478 | if (page) | 2474 | if (page) |
2479 | goto found; | 2475 | wait_for_stable_page(page); |
2480 | 2476 | ||
2481 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); | ||
2482 | if (!page) | ||
2483 | return NULL; | ||
2484 | status = add_to_page_cache_lru(page, mapping, index, | ||
2485 | GFP_KERNEL & ~gfp_notmask); | ||
2486 | if (unlikely(status)) { | ||
2487 | page_cache_release(page); | ||
2488 | if (status == -EEXIST) | ||
2489 | goto repeat; | ||
2490 | return NULL; | ||
2491 | } | ||
2492 | found: | ||
2493 | wait_for_stable_page(page); | ||
2494 | return page; | 2477 | return page; |
2495 | } | 2478 | } |
2496 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2479 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
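A minimal ->write_begin built on the reworked helper might look like the sketch below, condensed from the pattern simple_write_begin() in fs/libfs.c follows; example_write_begin is a hypothetical name and partial-page zeroing is omitted.

#include <linux/fs.h>
#include <linux/pagemap.h>

static int example_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	/* Returns a locked, accessed, writeback-stable page or NULL */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	*pagep = page;
	return 0;
}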
@@ -2539,7 +2522,7 @@ again: | |||
2539 | 2522 | ||
2540 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | 2523 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, |
2541 | &page, &fsdata); | 2524 | &page, &fsdata); |
2542 | if (unlikely(status)) | 2525 | if (unlikely(status < 0)) |
2543 | break; | 2526 | break; |
2544 | 2527 | ||
2545 | if (mapping_writably_mapped(mapping)) | 2528 | if (mapping_writably_mapped(mapping)) |
@@ -2548,7 +2531,6 @@ again: | |||
2548 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2531 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2549 | flush_dcache_page(page); | 2532 | flush_dcache_page(page); |
2550 | 2533 | ||
2551 | mark_page_accessed(page); | ||
2552 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2534 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
2553 | page, fsdata); | 2535 | page, fsdata); |
2554 | if (unlikely(status < 0)) | 2536 | if (unlikely(status < 0)) |
diff --git a/mm/fremap.c b/mm/fremap.c index 34feba60a17e..2c5646f11f41 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
82 | 82 | ||
83 | ptfile = pgoff_to_pte(pgoff); | 83 | ptfile = pgoff_to_pte(pgoff); |
84 | 84 | ||
85 | if (!pte_none(*pte)) { | 85 | if (!pte_none(*pte)) |
86 | if (pte_present(*pte) && pte_soft_dirty(*pte)) | ||
87 | pte_file_mksoft_dirty(ptfile); | ||
88 | zap_pte(mm, vma, addr, pte); | 86 | zap_pte(mm, vma, addr, pte); |
89 | } | ||
90 | 87 | ||
91 | set_pte_at(mm, addr, pte, ptfile); | 88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); |
92 | /* | 89 | /* |
93 | * We don't need to run update_mmu_cache() here because the "file pte" | 90 | * We don't need to run update_mmu_cache() here because the "file pte" |
94 | * being installed by install_file_pte() is not a real pte - it's a | 91 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); | |||
327 | 327 | ||
328 | static unsigned long __frontswap_curr_pages(void) | 328 | static unsigned long __frontswap_curr_pages(void) |
329 | { | 329 | { |
330 | int type; | ||
331 | unsigned long totalpages = 0; | 330 | unsigned long totalpages = 0; |
332 | struct swap_info_struct *si = NULL; | 331 | struct swap_info_struct *si = NULL; |
333 | 332 | ||
334 | assert_spin_locked(&swap_lock); | 333 | assert_spin_locked(&swap_lock); |
335 | for (type = swap_list.head; type >= 0; type = si->next) { | 334 | plist_for_each_entry(si, &swap_active_head, list) |
336 | si = swap_info[type]; | ||
337 | totalpages += atomic_read(&si->frontswap_pages); | 335 | totalpages += atomic_read(&si->frontswap_pages); |
338 | } | ||
339 | return totalpages; | 336 | return totalpages; |
340 | } | 337 | } |
341 | 338 | ||
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
347 | int si_frontswap_pages; | 344 | int si_frontswap_pages; |
348 | unsigned long total_pages_to_unuse = total; | 345 | unsigned long total_pages_to_unuse = total; |
349 | unsigned long pages = 0, pages_to_unuse = 0; | 346 | unsigned long pages = 0, pages_to_unuse = 0; |
350 | int type; | ||
351 | 347 | ||
352 | assert_spin_locked(&swap_lock); | 348 | assert_spin_locked(&swap_lock); |
353 | for (type = swap_list.head; type >= 0; type = si->next) { | 349 | plist_for_each_entry(si, &swap_active_head, list) { |
354 | si = swap_info[type]; | ||
355 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | 350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
356 | if (total_pages_to_unuse < si_frontswap_pages) { | 351 | if (total_pages_to_unuse < si_frontswap_pages) { |
357 | pages = pages_to_unuse = total_pages_to_unuse; | 352 | pages = pages_to_unuse = total_pages_to_unuse; |
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
366 | } | 361 | } |
367 | vm_unacct_memory(pages); | 362 | vm_unacct_memory(pages); |
368 | *unused = pages_to_unuse; | 363 | *unused = pages_to_unuse; |
369 | *swapid = type; | 364 | *swapid = si->type; |
370 | ret = 0; | 365 | ret = 0; |
371 | break; | 366 | break; |
372 | } | 367 | } |
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
413 | /* | 408 | /* |
414 | * we don't want to hold swap_lock while doing a very | 409 | * we don't want to hold swap_lock while doing a very |
415 | * lengthy try_to_unuse, but swap_list may change | 410 | * lengthy try_to_unuse, but swap_list may change |
416 | * so restart scan from swap_list.head each time | 411 | * so restart scan from swap_active_head each time |
417 | */ | 412 | */ |
418 | spin_lock(&swap_lock); | 413 | spin_lock(&swap_lock); |
419 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 000000000000..cc5a9e7adea7 --- /dev/null +++ b/mm/gup.c | |||
@@ -0,0 +1,662 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/err.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | |||
6 | #include <linux/hugetlb.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/pagemap.h> | ||
9 | #include <linux/rmap.h> | ||
10 | #include <linux/swap.h> | ||
11 | #include <linux/swapops.h> | ||
12 | |||
13 | #include "internal.h" | ||
14 | |||
15 | static struct page *no_page_table(struct vm_area_struct *vma, | ||
16 | unsigned int flags) | ||
17 | { | ||
18 | /* | ||
19 | * When core dumping an enormous anonymous area that nobody | ||
20 | * has touched so far, we don't want to allocate unnecessary pages or | ||
21 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
22 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
23 | * But we can only make this optimization where a hole would surely | ||
24 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
25 | */ | ||
26 | if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) | ||
27 | return ERR_PTR(-EFAULT); | ||
28 | return NULL; | ||
29 | } | ||
30 | |||
31 | static struct page *follow_page_pte(struct vm_area_struct *vma, | ||
32 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
33 | { | ||
34 | struct mm_struct *mm = vma->vm_mm; | ||
35 | struct page *page; | ||
36 | spinlock_t *ptl; | ||
37 | pte_t *ptep, pte; | ||
38 | |||
39 | retry: | ||
40 | if (unlikely(pmd_bad(*pmd))) | ||
41 | return no_page_table(vma, flags); | ||
42 | |||
43 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
44 | pte = *ptep; | ||
45 | if (!pte_present(pte)) { | ||
46 | swp_entry_t entry; | ||
47 | /* | ||
48 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
49 | * even while it is being migrated, so for that case we | ||
50 | * need migration_entry_wait(). | ||
51 | */ | ||
52 | if (likely(!(flags & FOLL_MIGRATION))) | ||
53 | goto no_page; | ||
54 | if (pte_none(pte) || pte_file(pte)) | ||
55 | goto no_page; | ||
56 | entry = pte_to_swp_entry(pte); | ||
57 | if (!is_migration_entry(entry)) | ||
58 | goto no_page; | ||
59 | pte_unmap_unlock(ptep, ptl); | ||
60 | migration_entry_wait(mm, pmd, address); | ||
61 | goto retry; | ||
62 | } | ||
63 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
64 | goto no_page; | ||
65 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | ||
66 | pte_unmap_unlock(ptep, ptl); | ||
67 | return NULL; | ||
68 | } | ||
69 | |||
70 | page = vm_normal_page(vma, address, pte); | ||
71 | if (unlikely(!page)) { | ||
72 | if ((flags & FOLL_DUMP) || | ||
73 | !is_zero_pfn(pte_pfn(pte))) | ||
74 | goto bad_page; | ||
75 | page = pte_page(pte); | ||
76 | } | ||
77 | |||
78 | if (flags & FOLL_GET) | ||
79 | get_page_foll(page); | ||
80 | if (flags & FOLL_TOUCH) { | ||
81 | if ((flags & FOLL_WRITE) && | ||
82 | !pte_dirty(pte) && !PageDirty(page)) | ||
83 | set_page_dirty(page); | ||
84 | /* | ||
85 | * pte_mkyoung() would be more correct here, but atomic care | ||
86 | * is needed to avoid losing the dirty bit: it is easier to use | ||
87 | * mark_page_accessed(). | ||
88 | */ | ||
89 | mark_page_accessed(page); | ||
90 | } | ||
91 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
92 | /* | ||
93 | * The preliminary mapping check is mainly to avoid the | ||
94 | * pointless overhead of lock_page on the ZERO_PAGE | ||
95 | * which might bounce very badly if there is contention. | ||
96 | * | ||
97 | * If the page is already locked, we don't need to | ||
98 | * handle it now - vmscan will handle it later if and | ||
99 | * when it attempts to reclaim the page. | ||
100 | */ | ||
101 | if (page->mapping && trylock_page(page)) { | ||
102 | lru_add_drain(); /* push cached pages to LRU */ | ||
103 | /* | ||
104 | * Because we lock page here, and migration is | ||
105 | * blocked by the pte's page reference, and we | ||
106 | * know the page is still mapped, we don't even | ||
107 | * need to check for file-cache page truncation. | ||
108 | */ | ||
109 | mlock_vma_page(page); | ||
110 | unlock_page(page); | ||
111 | } | ||
112 | } | ||
113 | pte_unmap_unlock(ptep, ptl); | ||
114 | return page; | ||
115 | bad_page: | ||
116 | pte_unmap_unlock(ptep, ptl); | ||
117 | return ERR_PTR(-EFAULT); | ||
118 | |||
119 | no_page: | ||
120 | pte_unmap_unlock(ptep, ptl); | ||
121 | if (!pte_none(pte)) | ||
122 | return NULL; | ||
123 | return no_page_table(vma, flags); | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
128 | * @vma: vm_area_struct mapping @address | ||
129 | * @address: virtual address to look up | ||
130 | * @flags: flags modifying lookup behaviour | ||
131 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
132 | * | ||
133 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
134 | * | ||
135 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
136 | * an error pointer if there is a mapping to something not represented | ||
137 | * by a page descriptor (see also vm_normal_page()). | ||
138 | */ | ||
139 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
140 | unsigned long address, unsigned int flags, | ||
141 | unsigned int *page_mask) | ||
142 | { | ||
143 | pgd_t *pgd; | ||
144 | pud_t *pud; | ||
145 | pmd_t *pmd; | ||
146 | spinlock_t *ptl; | ||
147 | struct page *page; | ||
148 | struct mm_struct *mm = vma->vm_mm; | ||
149 | |||
150 | *page_mask = 0; | ||
151 | |||
152 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
153 | if (!IS_ERR(page)) { | ||
154 | BUG_ON(flags & FOLL_GET); | ||
155 | return page; | ||
156 | } | ||
157 | |||
158 | pgd = pgd_offset(mm, address); | ||
159 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
160 | return no_page_table(vma, flags); | ||
161 | |||
162 | pud = pud_offset(pgd, address); | ||
163 | if (pud_none(*pud)) | ||
164 | return no_page_table(vma, flags); | ||
165 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
166 | if (flags & FOLL_GET) | ||
167 | return NULL; | ||
168 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
169 | return page; | ||
170 | } | ||
171 | if (unlikely(pud_bad(*pud))) | ||
172 | return no_page_table(vma, flags); | ||
173 | |||
174 | pmd = pmd_offset(pud, address); | ||
175 | if (pmd_none(*pmd)) | ||
176 | return no_page_table(vma, flags); | ||
177 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
178 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
179 | if (flags & FOLL_GET) { | ||
180 | /* | ||
181 | * Refcount on tail pages are not well-defined and | ||
182 | * shouldn't be taken. The caller should handle a NULL | ||
183 | * return when trying to follow tail pages. | ||
184 | */ | ||
185 | if (PageHead(page)) | ||
186 | get_page(page); | ||
187 | else | ||
188 | page = NULL; | ||
189 | } | ||
190 | return page; | ||
191 | } | ||
192 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
193 | return no_page_table(vma, flags); | ||
194 | if (pmd_trans_huge(*pmd)) { | ||
195 | if (flags & FOLL_SPLIT) { | ||
196 | split_huge_page_pmd(vma, address, pmd); | ||
197 | return follow_page_pte(vma, address, pmd, flags); | ||
198 | } | ||
199 | ptl = pmd_lock(mm, pmd); | ||
200 | if (likely(pmd_trans_huge(*pmd))) { | ||
201 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
202 | spin_unlock(ptl); | ||
203 | wait_split_huge_page(vma->anon_vma, pmd); | ||
204 | } else { | ||
205 | page = follow_trans_huge_pmd(vma, address, | ||
206 | pmd, flags); | ||
207 | spin_unlock(ptl); | ||
208 | *page_mask = HPAGE_PMD_NR - 1; | ||
209 | return page; | ||
210 | } | ||
211 | } else | ||
212 | spin_unlock(ptl); | ||
213 | } | ||
214 | return follow_page_pte(vma, address, pmd, flags); | ||
215 | } | ||
216 | |||
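Internal callers normally reach follow_page_mask() through the follow_page() wrapper in <linux/mm.h>; a hedged sketch of the usual pattern follows (example_peek_mapped_page is illustrative, not part of the patch, and assumes mmap_sem is held by the caller).

#include <linux/mm.h>
#include <linux/err.h>

static struct page *example_peek_mapped_page(struct vm_area_struct *vma,
					     unsigned long addr)
{
	struct page *page;

	/* FOLL_GET takes a reference; errors and holes both mean "no page" */
	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return NULL;

	return page;	/* caller must put_page() when finished */
}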
217 | static int get_gate_page(struct mm_struct *mm, unsigned long address, | ||
218 | unsigned int gup_flags, struct vm_area_struct **vma, | ||
219 | struct page **page) | ||
220 | { | ||
221 | pgd_t *pgd; | ||
222 | pud_t *pud; | ||
223 | pmd_t *pmd; | ||
224 | pte_t *pte; | ||
225 | int ret = -EFAULT; | ||
226 | |||
227 | /* user gate pages are read-only */ | ||
228 | if (gup_flags & FOLL_WRITE) | ||
229 | return -EFAULT; | ||
230 | if (address > TASK_SIZE) | ||
231 | pgd = pgd_offset_k(address); | ||
232 | else | ||
233 | pgd = pgd_offset_gate(mm, address); | ||
234 | BUG_ON(pgd_none(*pgd)); | ||
235 | pud = pud_offset(pgd, address); | ||
236 | BUG_ON(pud_none(*pud)); | ||
237 | pmd = pmd_offset(pud, address); | ||
238 | if (pmd_none(*pmd)) | ||
239 | return -EFAULT; | ||
240 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
241 | pte = pte_offset_map(pmd, address); | ||
242 | if (pte_none(*pte)) | ||
243 | goto unmap; | ||
244 | *vma = get_gate_vma(mm); | ||
245 | if (!page) | ||
246 | goto out; | ||
247 | *page = vm_normal_page(*vma, address, *pte); | ||
248 | if (!*page) { | ||
249 | if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) | ||
250 | goto unmap; | ||
251 | *page = pte_page(*pte); | ||
252 | } | ||
253 | get_page(*page); | ||
254 | out: | ||
255 | ret = 0; | ||
256 | unmap: | ||
257 | pte_unmap(pte); | ||
258 | return ret; | ||
259 | } | ||
260 | |||
261 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | ||
262 | unsigned long address, unsigned int *flags, int *nonblocking) | ||
263 | { | ||
264 | struct mm_struct *mm = vma->vm_mm; | ||
265 | unsigned int fault_flags = 0; | ||
266 | int ret; | ||
267 | |||
268 | /* For mlock, just skip the stack guard page. */ | ||
269 | if ((*flags & FOLL_MLOCK) && | ||
270 | (stack_guard_page_start(vma, address) || | ||
271 | stack_guard_page_end(vma, address + PAGE_SIZE))) | ||
272 | return -ENOENT; | ||
273 | if (*flags & FOLL_WRITE) | ||
274 | fault_flags |= FAULT_FLAG_WRITE; | ||
275 | if (nonblocking) | ||
276 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
277 | if (*flags & FOLL_NOWAIT) | ||
278 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; | ||
279 | |||
280 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
281 | if (ret & VM_FAULT_ERROR) { | ||
282 | if (ret & VM_FAULT_OOM) | ||
283 | return -ENOMEM; | ||
284 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
285 | return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; | ||
286 | if (ret & VM_FAULT_SIGBUS) | ||
287 | return -EFAULT; | ||
288 | BUG(); | ||
289 | } | ||
290 | |||
291 | if (tsk) { | ||
292 | if (ret & VM_FAULT_MAJOR) | ||
293 | tsk->maj_flt++; | ||
294 | else | ||
295 | tsk->min_flt++; | ||
296 | } | ||
297 | |||
298 | if (ret & VM_FAULT_RETRY) { | ||
299 | if (nonblocking) | ||
300 | *nonblocking = 0; | ||
301 | return -EBUSY; | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when | ||
306 | * necessary, even if maybe_mkwrite decided not to set pte_write. We | ||
307 | * can thus safely do subsequent page lookups as if they were reads. | ||
308 | * But only do so when looping for pte_write is futile: in some cases | ||
309 | * userspace may also be wanting to write to the gotten user page, | ||
310 | * which a read fault here might prevent (a readonly page might get | ||
311 | * reCOWed by userspace write). | ||
312 | */ | ||
313 | if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) | ||
314 | *flags &= ~FOLL_WRITE; | ||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) | ||
319 | { | ||
320 | vm_flags_t vm_flags = vma->vm_flags; | ||
321 | |||
322 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
323 | return -EFAULT; | ||
324 | |||
325 | if (gup_flags & FOLL_WRITE) { | ||
326 | if (!(vm_flags & VM_WRITE)) { | ||
327 | if (!(gup_flags & FOLL_FORCE)) | ||
328 | return -EFAULT; | ||
329 | /* | ||
330 | * We used to let the write,force case do COW in a | ||
331 | * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could | ||
332 | * set a breakpoint in a read-only mapping of an | ||
333 | * executable, without corrupting the file (yet only | ||
334 | * when that file had been opened for writing!). | ||
335 | * Anon pages in shared mappings are surprising: now | ||
336 | * just reject it. | ||
337 | */ | ||
338 | if (!is_cow_mapping(vm_flags)) { | ||
339 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
340 | return -EFAULT; | ||
341 | } | ||
342 | } | ||
343 | } else if (!(vm_flags & VM_READ)) { | ||
344 | if (!(gup_flags & FOLL_FORCE)) | ||
345 | return -EFAULT; | ||
346 | /* | ||
347 | * Is there actually any vma we can reach here which does not | ||
348 | * have VM_MAYREAD set? | ||
349 | */ | ||
350 | if (!(vm_flags & VM_MAYREAD)) | ||
351 | return -EFAULT; | ||
352 | } | ||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * __get_user_pages() - pin user pages in memory | ||
358 | * @tsk: task_struct of target task | ||
359 | * @mm: mm_struct of target mm | ||
360 | * @start: starting user address | ||
361 | * @nr_pages: number of pages from start to pin | ||
362 | * @gup_flags: flags modifying pin behaviour | ||
363 | * @pages: array that receives pointers to the pages pinned. | ||
364 | * Should be at least nr_pages long. Or NULL, if caller | ||
365 | * only intends to ensure the pages are faulted in. | ||
366 | * @vmas: array of pointers to vmas corresponding to each page. | ||
367 | * Or NULL if the caller does not require them. | ||
368 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
369 | * | ||
370 | * Returns number of pages pinned. This may be fewer than the number | ||
371 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
372 | * were pinned, returns -errno. Each page returned must be released | ||
373 | * with a put_page() call when it is finished with. vmas will only | ||
374 | * remain valid while mmap_sem is held. | ||
375 | * | ||
376 | * Must be called with mmap_sem held for read or write. | ||
377 | * | ||
378 | * __get_user_pages walks a process's page tables and takes a reference to | ||
379 | * each struct page that each user address corresponds to at a given | ||
380 | * instant. That is, it takes the page that would be accessed if a user | ||
381 | * thread accesses the given user virtual address at that instant. | ||
382 | * | ||
383 | * This does not guarantee that the page exists in the user mappings when | ||
384 | * __get_user_pages returns, and there may even be a completely different | ||
385 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
386 | * and subsequently re-faulted). However, it does guarantee that the page | ||
387 | * won't be freed completely. And mostly callers simply care that the page | ||
388 | * contains data that was valid *at some point in time*. Typically, an IO | ||
389 | * or similar operation cannot guarantee anything stronger anyway because | ||
390 | * locks can't be held over the syscall boundary. | ||
391 | * | ||
392 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
393 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
394 | * appropriate) must be called after the page is finished with, and | ||
395 | * before put_page is called. | ||
396 | * | ||
397 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
398 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
399 | * *@nonblocking will be set to 0. | ||
400 | * | ||
401 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
402 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
403 | * you need some special @gup_flags. | ||
404 | */ | ||
405 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
406 | unsigned long start, unsigned long nr_pages, | ||
407 | unsigned int gup_flags, struct page **pages, | ||
408 | struct vm_area_struct **vmas, int *nonblocking) | ||
409 | { | ||
410 | long i = 0; | ||
411 | unsigned int page_mask; | ||
412 | struct vm_area_struct *vma = NULL; | ||
413 | |||
414 | if (!nr_pages) | ||
415 | return 0; | ||
416 | |||
417 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
418 | |||
419 | /* | ||
420 | * If FOLL_FORCE is set then do not force a full fault as the hinting | ||
421 | * fault information is unrelated to the reference behaviour of a task | ||
422 | * using the address space | ||
423 | */ | ||
424 | if (!(gup_flags & FOLL_FORCE)) | ||
425 | gup_flags |= FOLL_NUMA; | ||
426 | |||
427 | do { | ||
428 | struct page *page; | ||
429 | unsigned int foll_flags = gup_flags; | ||
430 | unsigned int page_increm; | ||
431 | |||
432 | /* first iteration or cross vma bound */ | ||
433 | if (!vma || start >= vma->vm_end) { | ||
434 | vma = find_extend_vma(mm, start); | ||
435 | if (!vma && in_gate_area(mm, start)) { | ||
436 | int ret; | ||
437 | ret = get_gate_page(mm, start & PAGE_MASK, | ||
438 | gup_flags, &vma, | ||
439 | pages ? &pages[i] : NULL); | ||
440 | if (ret) | ||
441 | return i ? : ret; | ||
442 | page_mask = 0; | ||
443 | goto next_page; | ||
444 | } | ||
445 | |||
446 | if (!vma || check_vma_flags(vma, gup_flags)) | ||
447 | return i ? : -EFAULT; | ||
448 | if (is_vm_hugetlb_page(vma)) { | ||
449 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
450 | &start, &nr_pages, i, | ||
451 | gup_flags); | ||
452 | continue; | ||
453 | } | ||
454 | } | ||
455 | retry: | ||
456 | /* | ||
457 | * If we have a pending SIGKILL, don't keep faulting pages and | ||
458 | * potentially allocating memory. | ||
459 | */ | ||
460 | if (unlikely(fatal_signal_pending(current))) | ||
461 | return i ? i : -ERESTARTSYS; | ||
462 | cond_resched(); | ||
463 | page = follow_page_mask(vma, start, foll_flags, &page_mask); | ||
464 | if (!page) { | ||
465 | int ret; | ||
466 | ret = faultin_page(tsk, vma, start, &foll_flags, | ||
467 | nonblocking); | ||
468 | switch (ret) { | ||
469 | case 0: | ||
470 | goto retry; | ||
471 | case -EFAULT: | ||
472 | case -ENOMEM: | ||
473 | case -EHWPOISON: | ||
474 | return i ? i : ret; | ||
475 | case -EBUSY: | ||
476 | return i; | ||
477 | case -ENOENT: | ||
478 | goto next_page; | ||
479 | } | ||
480 | BUG(); | ||
481 | } | ||
482 | if (IS_ERR(page)) | ||
483 | return i ? i : PTR_ERR(page); | ||
484 | if (pages) { | ||
485 | pages[i] = page; | ||
486 | flush_anon_page(vma, page, start); | ||
487 | flush_dcache_page(page); | ||
488 | page_mask = 0; | ||
489 | } | ||
490 | next_page: | ||
491 | if (vmas) { | ||
492 | vmas[i] = vma; | ||
493 | page_mask = 0; | ||
494 | } | ||
495 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
496 | if (page_increm > nr_pages) | ||
497 | page_increm = nr_pages; | ||
498 | i += page_increm; | ||
499 | start += page_increm * PAGE_SIZE; | ||
500 | nr_pages -= page_increm; | ||
501 | } while (nr_pages); | ||
502 | return i; | ||
503 | } | ||
504 | EXPORT_SYMBOL(__get_user_pages); | ||
505 | |||
506 | /* | ||
507 | * fixup_user_fault() - manually resolve a user page fault | ||
508 | * @tsk: the task_struct to use for page fault accounting, or | ||
509 | * NULL if faults are not to be recorded. | ||
510 | * @mm: mm_struct of target mm | ||
511 | * @address: user address | ||
512 | * @fault_flags: flags to pass down to handle_mm_fault() | ||
513 | * | ||
514 | * This is meant to be called in the specific scenario where for locking reasons | ||
515 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
516 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
517 | * trying again. | ||
518 | * | ||
519 | * Typically this is meant to be used by the futex code. | ||
520 | * | ||
521 | * The main difference with get_user_pages() is that this function will | ||
522 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
523 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
524 | * handle_mm_fault() only guarantees to update these in the struct page. | ||
525 | * | ||
526 | * This is important for some architectures where those bits also gate the | ||
527 | * access permission to the page because they are maintained in software. On | ||
528 | * such architectures, gup() will not be enough to make a subsequent access | ||
529 | * succeed. | ||
530 | * | ||
531 | * This should be called with the mmap_sem held for read. | ||
532 | */ | ||
533 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
534 | unsigned long address, unsigned int fault_flags) | ||
535 | { | ||
536 | struct vm_area_struct *vma; | ||
537 | vm_flags_t vm_flags; | ||
538 | int ret; | ||
539 | |||
540 | vma = find_extend_vma(mm, address); | ||
541 | if (!vma || address < vma->vm_start) | ||
542 | return -EFAULT; | ||
543 | |||
544 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
545 | if (!(vm_flags & vma->vm_flags)) | ||
546 | return -EFAULT; | ||
547 | |||
548 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
549 | if (ret & VM_FAULT_ERROR) { | ||
550 | if (ret & VM_FAULT_OOM) | ||
551 | return -ENOMEM; | ||
552 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
553 | return -EHWPOISON; | ||
554 | if (ret & VM_FAULT_SIGBUS) | ||
555 | return -EFAULT; | ||
556 | BUG(); | ||
557 | } | ||
558 | if (tsk) { | ||
559 | if (ret & VM_FAULT_MAJOR) | ||
560 | tsk->maj_flt++; | ||
561 | else | ||
562 | tsk->min_flt++; | ||
563 | } | ||
564 | return 0; | ||
565 | } | ||
566 | |||
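The futex code is the canonical caller of fixup_user_fault(); a hedged sketch of that retry pattern, mirroring fault_in_user_writeable() in kernel/futex.c:

#include <linux/sched.h>
#include <linux/mm.h>

static int example_fault_in_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	/* Resolve the fault that a pagefault_disable()d access just hit */
	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}

The caller then retries the original atomic user access, which should now succeed.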
567 | /* | ||
568 | * get_user_pages() - pin user pages in memory | ||
569 | * @tsk: the task_struct to use for page fault accounting, or | ||
570 | * NULL if faults are not to be recorded. | ||
571 | * @mm: mm_struct of target mm | ||
572 | * @start: starting user address | ||
573 | * @nr_pages: number of pages from start to pin | ||
574 | * @write: whether pages will be written to by the caller | ||
575 | * @force: whether to force access even when user mapping is currently | ||
576 | * protected (but never forces write access to shared mapping). | ||
577 | * @pages: array that receives pointers to the pages pinned. | ||
578 | * Should be at least nr_pages long. Or NULL, if caller | ||
579 | * only intends to ensure the pages are faulted in. | ||
580 | * @vmas: array of pointers to vmas corresponding to each page. | ||
581 | * Or NULL if the caller does not require them. | ||
582 | * | ||
583 | * Returns number of pages pinned. This may be fewer than the number | ||
584 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
585 | * were pinned, returns -errno. Each page returned must be released | ||
586 | * with a put_page() call when it is finished with. vmas will only | ||
587 | * remain valid while mmap_sem is held. | ||
588 | * | ||
589 | * Must be called with mmap_sem held for read or write. | ||
590 | * | ||
591 | * get_user_pages walks a process's page tables and takes a reference to | ||
592 | * each struct page that each user address corresponds to at a given | ||
593 | * instant. That is, it takes the page that would be accessed if a user | ||
594 | * thread accesses the given user virtual address at that instant. | ||
595 | * | ||
596 | * This does not guarantee that the page exists in the user mappings when | ||
597 | * get_user_pages returns, and there may even be a completely different | ||
598 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
599 | * and subsequently re-faulted). However, it does guarantee that the page | ||
600 | * won't be freed completely. And mostly callers simply care that the page | ||
601 | * contains data that was valid *at some point in time*. Typically, an IO | ||
602 | * or similar operation cannot guarantee anything stronger anyway because | ||
603 | * locks can't be held over the syscall boundary. | ||
604 | * | ||
605 | * If write=0, the page must not be written to. If the page is written to, | ||
606 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
607 | * after the page is finished with, and before put_page is called. | ||
608 | * | ||
609 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
610 | * handle on the memory by some means other than accesses via the user virtual | ||
611 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
612 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
613 | * use the correct cache flushing APIs. | ||
614 | * | ||
615 | * See also get_user_pages_fast, for performance critical applications. | ||
616 | */ | ||
617 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
618 | unsigned long start, unsigned long nr_pages, int write, | ||
619 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
620 | { | ||
621 | int flags = FOLL_TOUCH; | ||
622 | |||
623 | if (pages) | ||
624 | flags |= FOLL_GET; | ||
625 | if (write) | ||
626 | flags |= FOLL_WRITE; | ||
627 | if (force) | ||
628 | flags |= FOLL_FORCE; | ||
629 | |||
630 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
631 | NULL); | ||
632 | } | ||
633 | EXPORT_SYMBOL(get_user_pages); | ||
634 | |||
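A hedged sketch of the usual pin/access/release cycle with the get_user_pages() signature documented above; example_pin_user_buffer and its parameters are illustrative only.

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int example_pin_user_buffer(unsigned long start, int nr_pages,
				   struct page **pages)
{
	int i, pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, start, nr_pages,
				1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);
	if (pinned < 0)
		return pinned;

	/* ... DMA to, or kmap access of, pages[0..pinned-1] goes here ... */

	for (i = 0; i < pinned; i++) {
		set_page_dirty_lock(pages[i]);	/* the pages were written to */
		put_page(pages[i]);
	}
	return pinned;
}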
635 | /** | ||
636 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
637 | * @addr: user address | ||
638 | * | ||
639 | * Returns struct page pointer of user page pinned for dump, | ||
640 | * to be freed afterwards by page_cache_release() or put_page(). | ||
641 | * | ||
642 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
643 | * the corefile, to preserve alignment with its headers; and also returns | ||
644 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
645 | * allowing a hole to be left in the corefile to save diskspace. | ||
646 | * | ||
647 | * Called without mmap_sem, but after all other threads have been killed. | ||
648 | */ | ||
649 | #ifdef CONFIG_ELF_CORE | ||
650 | struct page *get_dump_page(unsigned long addr) | ||
651 | { | ||
652 | struct vm_area_struct *vma; | ||
653 | struct page *page; | ||
654 | |||
655 | if (__get_user_pages(current, current->mm, addr, 1, | ||
656 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
657 | NULL) < 1) | ||
658 | return NULL; | ||
659 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
660 | return page; | ||
661 | } | ||
662 | #endif /* CONFIG_ELF_CORE */ | ||
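For reference, a hedged sketch of how the ELF core dumper consumes get_dump_page(), condensed from the dump loop in fs/binfmt_elf.c; example_dump_range is a made-up name, while dump_emit()/dump_skip() are the 3.16-era coredump helpers.

#include <linux/coredump.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

static int example_dump_range(struct coredump_params *cprm,
			      unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);

		if (page) {
			void *kaddr = kmap(page);
			int stop = !dump_emit(cprm, kaddr, PAGE_SIZE);

			kunmap(page);
			page_cache_release(page);
			if (stop)
				return 0;
		} else if (!dump_skip(cprm, PAGE_SIZE)) {
			/* NULL means "leave a hole" in the core file */
			return 0;
		}
	}
	return 1;
}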
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d199d2d91946..e60837dc785c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -5,6 +5,8 @@ | |||
5 | * the COPYING file in the top-level directory. | 5 | * the COPYING file in the top-level directory. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
9 | |||
8 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
10 | #include <linux/highmem.h> | 12 | #include <linux/highmem.h> |
@@ -151,8 +153,7 @@ static int start_khugepaged(void) | |||
151 | khugepaged_thread = kthread_run(khugepaged, NULL, | 153 | khugepaged_thread = kthread_run(khugepaged, NULL, |
152 | "khugepaged"); | 154 | "khugepaged"); |
153 | if (unlikely(IS_ERR(khugepaged_thread))) { | 155 | if (unlikely(IS_ERR(khugepaged_thread))) { |
154 | printk(KERN_ERR | 156 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
155 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
156 | err = PTR_ERR(khugepaged_thread); | 157 | err = PTR_ERR(khugepaged_thread); |
157 | khugepaged_thread = NULL; | 158 | khugepaged_thread = NULL; |
158 | } | 159 | } |
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | |||
584 | 585 | ||
585 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 586 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
586 | if (unlikely(!*hugepage_kobj)) { | 587 | if (unlikely(!*hugepage_kobj)) { |
587 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); | 588 | pr_err("failed to create transparent hugepage kobject\n"); |
588 | return -ENOMEM; | 589 | return -ENOMEM; |
589 | } | 590 | } |
590 | 591 | ||
591 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 592 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
592 | if (err) { | 593 | if (err) { |
593 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 594 | pr_err("failed to register transparent hugepage group\n"); |
594 | goto delete_obj; | 595 | goto delete_obj; |
595 | } | 596 | } |
596 | 597 | ||
597 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 598 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
598 | if (err) { | 599 | if (err) { |
599 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 600 | pr_err("failed to register transparent hugepage group\n"); |
600 | goto remove_hp_group; | 601 | goto remove_hp_group; |
601 | } | 602 | } |
602 | 603 | ||
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) | |||
689 | } | 690 | } |
690 | out: | 691 | out: |
691 | if (!ret) | 692 | if (!ret) |
692 | printk(KERN_WARNING | 693 | pr_warn("transparent_hugepage= cannot parse, ignored\n"); |
693 | "transparent_hugepage= cannot parse, ignored\n"); | ||
694 | return ret; | 694 | return ret; |
695 | } | 695 | } |
696 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 696 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
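The pr_fmt() definition added at the top of the file is what lets the "hugepage:" string prefixes above be dropped: every pr_err()/pr_warn() in huge_memory.c now gains a KBUILD_MODNAME ("huge_memory: ") prefix automatically. A minimal sketch of the mechanism, with a made-up function name and module:

/* Must be defined before any include that pulls in <linux/printk.h> */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void example_report(void)
{
	/* For a file built as example.o this emits: "example: allocation failed" */
	pr_err("allocation failed\n");
}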
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page, | |||
1830 | * the newly established pmd of the child later during the | 1830 | * the newly established pmd of the child later during the |
1831 | * walk, to be able to set it as pmd_trans_splitting too. | 1831 | * walk, to be able to set it as pmd_trans_splitting too. |
1832 | */ | 1832 | */ |
1833 | if (mapcount != page_mapcount(page)) | 1833 | if (mapcount != page_mapcount(page)) { |
1834 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | 1834 | pr_err("mapcount %d page_mapcount %d\n", |
1835 | mapcount, page_mapcount(page)); | 1835 | mapcount, page_mapcount(page)); |
1836 | BUG_ON(mapcount != page_mapcount(page)); | 1836 | BUG(); |
1837 | } | ||
1837 | 1838 | ||
1838 | __split_huge_page_refcount(page, list); | 1839 | __split_huge_page_refcount(page, list); |
1839 | 1840 | ||
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page, | |||
1844 | BUG_ON(is_vma_temporary_stack(vma)); | 1845 | BUG_ON(is_vma_temporary_stack(vma)); |
1845 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1846 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1846 | } | 1847 | } |
1847 | if (mapcount != mapcount2) | 1848 | if (mapcount != mapcount2) { |
1848 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | 1849 | pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", |
1849 | mapcount, mapcount2, page_mapcount(page)); | 1850 | mapcount, mapcount2, page_mapcount(page)); |
1850 | BUG_ON(mapcount != mapcount2); | 1851 | BUG(); |
1852 | } | ||
1851 | } | 1853 | } |
1852 | 1854 | ||
1853 | /* | 1855 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c82290b9c1fc..226910cb7c9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
544 | /* Movability of hugepages depends on migration support. */ | 544 | /* Movability of hugepages depends on migration support. */ |
545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | 545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) |
546 | { | 546 | { |
547 | if (hugepages_treat_as_movable || hugepage_migration_support(h)) | 547 | if (hugepages_treat_as_movable || hugepage_migration_supported(h)) |
548 | return GFP_HIGHUSER_MOVABLE; | 548 | return GFP_HIGHUSER_MOVABLE; |
549 | else | 549 | else |
550 | return GFP_HIGHUSER; | 550 | return GFP_HIGHUSER; |
@@ -607,25 +607,242 @@ err: | |||
607 | return NULL; | 607 | return NULL; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
612 | * We may have allocated or freed a huge page based on a different | ||
613 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
614 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
615 | * node for alloc or free. | ||
616 | */ | ||
617 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
618 | { | ||
619 | nid = next_node(nid, *nodes_allowed); | ||
620 | if (nid == MAX_NUMNODES) | ||
621 | nid = first_node(*nodes_allowed); | ||
622 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
623 | |||
624 | return nid; | ||
625 | } | ||
626 | |||
627 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
628 | { | ||
629 | if (!node_isset(nid, *nodes_allowed)) | ||
630 | nid = next_node_allowed(nid, nodes_allowed); | ||
631 | return nid; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * returns the previously saved node ["this node"] from which to | ||
636 | * allocate a persistent huge page for the pool and advance the | ||
637 | * next node from which to allocate, handling wrap at end of node | ||
638 | * mask. | ||
639 | */ | ||
640 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
641 | nodemask_t *nodes_allowed) | ||
642 | { | ||
643 | int nid; | ||
644 | |||
645 | VM_BUG_ON(!nodes_allowed); | ||
646 | |||
647 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
648 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
649 | |||
650 | return nid; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * helper for free_pool_huge_page() - return the previously saved | ||
655 | * node ["this node"] from which to free a huge page. Advance the | ||
656 | * next node id whether or not we find a free huge page to free so | ||
657 | * that the next attempt to free addresses the next node. | ||
658 | */ | ||
659 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
660 | { | ||
661 | int nid; | ||
662 | |||
663 | VM_BUG_ON(!nodes_allowed); | ||
664 | |||
665 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
666 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
667 | |||
668 | return nid; | ||
669 | } | ||
670 | |||
671 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
672 | for (nr_nodes = nodes_weight(*mask); \ | ||
673 | nr_nodes > 0 && \ | ||
674 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
675 | nr_nodes--) | ||
676 | |||
677 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
678 | for (nr_nodes = nodes_weight(*mask); \ | ||
679 | nr_nodes > 0 && \ | ||
680 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
681 | nr_nodes--) | ||
682 | |||
683 | #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) | ||
684 | static void destroy_compound_gigantic_page(struct page *page, | ||
685 | unsigned long order) | ||
686 | { | ||
687 | int i; | ||
688 | int nr_pages = 1 << order; | ||
689 | struct page *p = page + 1; | ||
690 | |||
691 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
692 | __ClearPageTail(p); | ||
693 | set_page_refcounted(p); | ||
694 | p->first_page = NULL; | ||
695 | } | ||
696 | |||
697 | set_compound_order(page, 0); | ||
698 | __ClearPageHead(page); | ||
699 | } | ||
700 | |||
701 | static void free_gigantic_page(struct page *page, unsigned order) | ||
702 | { | ||
703 | free_contig_range(page_to_pfn(page), 1 << order); | ||
704 | } | ||
705 | |||
706 | static int __alloc_gigantic_page(unsigned long start_pfn, | ||
707 | unsigned long nr_pages) | ||
708 | { | ||
709 | unsigned long end_pfn = start_pfn + nr_pages; | ||
710 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | ||
711 | } | ||
712 | |||
713 | static bool pfn_range_valid_gigantic(unsigned long start_pfn, | ||
714 | unsigned long nr_pages) | ||
715 | { | ||
716 | unsigned long i, end_pfn = start_pfn + nr_pages; | ||
717 | struct page *page; | ||
718 | |||
719 | for (i = start_pfn; i < end_pfn; i++) { | ||
720 | if (!pfn_valid(i)) | ||
721 | return false; | ||
722 | |||
723 | page = pfn_to_page(i); | ||
724 | |||
725 | if (PageReserved(page)) | ||
726 | return false; | ||
727 | |||
728 | if (page_count(page) > 0) | ||
729 | return false; | ||
730 | |||
731 | if (PageHuge(page)) | ||
732 | return false; | ||
733 | } | ||
734 | |||
735 | return true; | ||
736 | } | ||
737 | |||
738 | static bool zone_spans_last_pfn(const struct zone *zone, | ||
739 | unsigned long start_pfn, unsigned long nr_pages) | ||
740 | { | ||
741 | unsigned long last_pfn = start_pfn + nr_pages - 1; | ||
742 | return zone_spans_pfn(zone, last_pfn); | ||
743 | } | ||
744 | |||
745 | static struct page *alloc_gigantic_page(int nid, unsigned order) | ||
746 | { | ||
747 | unsigned long nr_pages = 1 << order; | ||
748 | unsigned long ret, pfn, flags; | ||
749 | struct zone *z; | ||
750 | |||
751 | z = NODE_DATA(nid)->node_zones; | ||
752 | for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { | ||
753 | spin_lock_irqsave(&z->lock, flags); | ||
754 | |||
755 | pfn = ALIGN(z->zone_start_pfn, nr_pages); | ||
756 | while (zone_spans_last_pfn(z, pfn, nr_pages)) { | ||
757 | if (pfn_range_valid_gigantic(pfn, nr_pages)) { | ||
758 | /* | ||
759 | * We release the zone lock here because | ||
760 | * alloc_contig_range() will also lock the zone | ||
761 | * at some point. If there's an allocation | ||
762 | * spinning on this lock, it may win the race | ||
763 | * and cause alloc_contig_range() to fail... | ||
764 | */ | ||
765 | spin_unlock_irqrestore(&z->lock, flags); | ||
766 | ret = __alloc_gigantic_page(pfn, nr_pages); | ||
767 | if (!ret) | ||
768 | return pfn_to_page(pfn); | ||
769 | spin_lock_irqsave(&z->lock, flags); | ||
770 | } | ||
771 | pfn += nr_pages; | ||
772 | } | ||
773 | |||
774 | spin_unlock_irqrestore(&z->lock, flags); | ||
775 | } | ||
776 | |||
777 | return NULL; | ||
778 | } | ||
779 | |||
780 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); | ||
781 | static void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
782 | |||
783 | static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) | ||
784 | { | ||
785 | struct page *page; | ||
786 | |||
787 | page = alloc_gigantic_page(nid, huge_page_order(h)); | ||
788 | if (page) { | ||
789 | prep_compound_gigantic_page(page, huge_page_order(h)); | ||
790 | prep_new_huge_page(h, page, nid); | ||
791 | } | ||
792 | |||
793 | return page; | ||
794 | } | ||
795 | |||
796 | static int alloc_fresh_gigantic_page(struct hstate *h, | ||
797 | nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | struct page *page = NULL; | ||
800 | int nr_nodes, node; | ||
801 | |||
802 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | ||
803 | page = alloc_fresh_gigantic_page_node(h, node); | ||
804 | if (page) | ||
805 | return 1; | ||
806 | } | ||
807 | |||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | static inline bool gigantic_page_supported(void) { return true; } | ||
812 | #else | ||
813 | static inline bool gigantic_page_supported(void) { return false; } | ||
814 | static inline void free_gigantic_page(struct page *page, unsigned order) { } | ||
815 | static inline void destroy_compound_gigantic_page(struct page *page, | ||
816 | unsigned long order) { } | ||
817 | static inline int alloc_fresh_gigantic_page(struct hstate *h, | ||
818 | nodemask_t *nodes_allowed) { return 0; } | ||
819 | #endif | ||
820 | |||
610 | static void update_and_free_page(struct hstate *h, struct page *page) | 821 | static void update_and_free_page(struct hstate *h, struct page *page) |
611 | { | 822 | { |
612 | int i; | 823 | int i; |
613 | 824 | ||
614 | VM_BUG_ON(h->order >= MAX_ORDER); | 825 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
826 | return; | ||
615 | 827 | ||
616 | h->nr_huge_pages--; | 828 | h->nr_huge_pages--; |
617 | h->nr_huge_pages_node[page_to_nid(page)]--; | 829 | h->nr_huge_pages_node[page_to_nid(page)]--; |
618 | for (i = 0; i < pages_per_huge_page(h); i++) { | 830 | for (i = 0; i < pages_per_huge_page(h); i++) { |
619 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | | 831 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
620 | 1 << PG_referenced | 1 << PG_dirty | | 832 | 1 << PG_referenced | 1 << PG_dirty | |
621 | 1 << PG_active | 1 << PG_reserved | | 833 | 1 << PG_active | 1 << PG_private | |
622 | 1 << PG_private | 1 << PG_writeback); | 834 | 1 << PG_writeback); |
623 | } | 835 | } |
624 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); | 836 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); |
625 | set_compound_page_dtor(page, NULL); | 837 | set_compound_page_dtor(page, NULL); |
626 | set_page_refcounted(page); | 838 | set_page_refcounted(page); |
627 | arch_release_hugepage(page); | 839 | if (hstate_is_gigantic(h)) { |
628 | __free_pages(page, huge_page_order(h)); | 840 | destroy_compound_gigantic_page(page, huge_page_order(h)); |
841 | free_gigantic_page(page, huge_page_order(h)); | ||
842 | } else { | ||
843 | arch_release_hugepage(page); | ||
844 | __free_pages(page, huge_page_order(h)); | ||
845 | } | ||
629 | } | 846 | } |
630 | 847 | ||
631 | struct hstate *size_to_hstate(unsigned long size) | 848 | struct hstate *size_to_hstate(unsigned long size) |
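Note: this hunk (and the rest of the file) replaces open-coded "h->order >= MAX_ORDER" tests with hstate_is_gigantic(). The helper itself is not shown here; presumably it is the trivial wrapper added to include/linux/hugetlb.h by the same series, along the lines of:

static inline bool hstate_is_gigantic(struct hstate *h)
{
	return huge_page_order(h) >= MAX_ORDER;
}

With gigantic_page_supported() true, update_and_free_page() can now also tear a gigantic page down at run time via destroy_compound_gigantic_page() and free_gigantic_page() instead of bailing out early.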
@@ -664,7 +881,7 @@ static void free_huge_page(struct page *page) | |||
664 | if (restore_reserve) | 881 | if (restore_reserve) |
665 | h->resv_huge_pages++; | 882 | h->resv_huge_pages++; |
666 | 883 | ||
667 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 884 | if (h->surplus_huge_pages_node[nid]) { |
668 | /* remove the page from active list */ | 885 | /* remove the page from active list */ |
669 | list_del(&page->lru); | 886 | list_del(&page->lru); |
670 | update_and_free_page(h, page); | 887 | update_and_free_page(h, page); |
@@ -690,8 +907,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
690 | put_page(page); /* free it into the hugepage allocator */ | 907 | put_page(page); /* free it into the hugepage allocator */ |
691 | } | 908 | } |
692 | 909 | ||
693 | static void __init prep_compound_gigantic_page(struct page *page, | 910 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) |
694 | unsigned long order) | ||
695 | { | 911 | { |
696 | int i; | 912 | int i; |
697 | int nr_pages = 1 << order; | 913 | int nr_pages = 1 << order; |
@@ -769,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
769 | { | 985 | { |
770 | struct page *page; | 986 | struct page *page; |
771 | 987 | ||
772 | if (h->order >= MAX_ORDER) | ||
773 | return NULL; | ||
774 | |||
775 | page = alloc_pages_exact_node(nid, | 988 | page = alloc_pages_exact_node(nid, |
776 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | 989 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
777 | __GFP_REPEAT|__GFP_NOWARN, | 990 | __GFP_REPEAT|__GFP_NOWARN, |
@@ -787,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
787 | return page; | 1000 | return page; |
788 | } | 1001 | } |
789 | 1002 | ||
790 | /* | ||
791 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
792 | * We may have allocated or freed a huge page based on a different | ||
793 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
794 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
795 | * node for alloc or free. | ||
796 | */ | ||
797 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | nid = next_node(nid, *nodes_allowed); | ||
800 | if (nid == MAX_NUMNODES) | ||
801 | nid = first_node(*nodes_allowed); | ||
802 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
803 | |||
804 | return nid; | ||
805 | } | ||
806 | |||
807 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
808 | { | ||
809 | if (!node_isset(nid, *nodes_allowed)) | ||
810 | nid = next_node_allowed(nid, nodes_allowed); | ||
811 | return nid; | ||
812 | } | ||
813 | |||
814 | /* | ||
815 | * returns the previously saved node ["this node"] from which to | ||
816 | * allocate a persistent huge page for the pool and advance the | ||
817 | * next node from which to allocate, handling wrap at end of node | ||
818 | * mask. | ||
819 | */ | ||
820 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
821 | nodemask_t *nodes_allowed) | ||
822 | { | ||
823 | int nid; | ||
824 | |||
825 | VM_BUG_ON(!nodes_allowed); | ||
826 | |||
827 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
828 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
829 | |||
830 | return nid; | ||
831 | } | ||
832 | |||
833 | /* | ||
834 | * helper for free_pool_huge_page() - return the previously saved | ||
835 | * node ["this node"] from which to free a huge page. Advance the | ||
836 | * next node id whether or not we find a free huge page to free so | ||
837 | * that the next attempt to free addresses the next node. | ||
838 | */ | ||
839 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
840 | { | ||
841 | int nid; | ||
842 | |||
843 | VM_BUG_ON(!nodes_allowed); | ||
844 | |||
845 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
846 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
847 | |||
848 | return nid; | ||
849 | } | ||
850 | |||
851 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
852 | for (nr_nodes = nodes_weight(*mask); \ | ||
853 | nr_nodes > 0 && \ | ||
854 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
855 | nr_nodes--) | ||
856 | |||
857 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
858 | for (nr_nodes = nodes_weight(*mask); \ | ||
859 | nr_nodes > 0 && \ | ||
860 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
861 | nr_nodes--) | ||
862 | |||
863 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | 1003 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
864 | { | 1004 | { |
865 | struct page *page; | 1005 | struct page *page; |
@@ -963,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
963 | struct page *page; | 1103 | struct page *page; |
964 | unsigned int r_nid; | 1104 | unsigned int r_nid; |
965 | 1105 | ||
966 | if (h->order >= MAX_ORDER) | 1106 | if (hstate_is_gigantic(h)) |
967 | return NULL; | 1107 | return NULL; |
968 | 1108 | ||
969 | /* | 1109 | /* |
@@ -1156,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1156 | h->resv_huge_pages -= unused_resv_pages; | 1296 | h->resv_huge_pages -= unused_resv_pages; |
1157 | 1297 | ||
1158 | /* Cannot return gigantic pages currently */ | 1298 | /* Cannot return gigantic pages currently */ |
1159 | if (h->order >= MAX_ORDER) | 1299 | if (hstate_is_gigantic(h)) |
1160 | return; | 1300 | return; |
1161 | 1301 | ||
1162 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 1302 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
@@ -1246,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1246 | return ERR_PTR(-ENOSPC); | 1386 | return ERR_PTR(-ENOSPC); |
1247 | 1387 | ||
1248 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1388 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
1249 | if (ret) { | 1389 | if (ret) |
1250 | if (chg || avoid_reserve) | 1390 | goto out_subpool_put; |
1251 | hugepage_subpool_put_pages(spool, 1); | 1391 | |
1252 | return ERR_PTR(-ENOSPC); | ||
1253 | } | ||
1254 | spin_lock(&hugetlb_lock); | 1392 | spin_lock(&hugetlb_lock); |
1255 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); | 1393 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); |
1256 | if (!page) { | 1394 | if (!page) { |
1257 | spin_unlock(&hugetlb_lock); | 1395 | spin_unlock(&hugetlb_lock); |
1258 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1396 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1259 | if (!page) { | 1397 | if (!page) |
1260 | hugetlb_cgroup_uncharge_cgroup(idx, | 1398 | goto out_uncharge_cgroup; |
1261 | pages_per_huge_page(h), | 1399 | |
1262 | h_cg); | ||
1263 | if (chg || avoid_reserve) | ||
1264 | hugepage_subpool_put_pages(spool, 1); | ||
1265 | return ERR_PTR(-ENOSPC); | ||
1266 | } | ||
1267 | spin_lock(&hugetlb_lock); | 1400 | spin_lock(&hugetlb_lock); |
1268 | list_move(&page->lru, &h->hugepage_activelist); | 1401 | list_move(&page->lru, &h->hugepage_activelist); |
1269 | /* Fall through */ | 1402 | /* Fall through */ |
@@ -1275,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1275 | 1408 | ||
1276 | vma_commit_reservation(h, vma, addr); | 1409 | vma_commit_reservation(h, vma, addr); |
1277 | return page; | 1410 | return page; |
1411 | |||
1412 | out_uncharge_cgroup: | ||
1413 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); | ||
1414 | out_subpool_put: | ||
1415 | if (chg || avoid_reserve) | ||
1416 | hugepage_subpool_put_pages(spool, 1); | ||
1417 | return ERR_PTR(-ENOSPC); | ||
1278 | } | 1418 | } |
1279 | 1419 | ||
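The alloc_huge_page() hunk above folds the duplicated failure handling into the usual kernel unwind idiom: error labels at the end of the function, jumped to in reverse order of what was acquired, so each resource is released exactly once. In generic form (the helper names here are placeholders, not the real ones):

	err = take_first();
	if (err)
		goto out;
	err = take_second();
	if (err)
		goto out_put_first;
	return 0;

out_put_first:
	undo_first();
out:
	return err;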
1280 | /* | 1420 | /* |
@@ -1356,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void) | |||
1356 | * fix confusing memory reports from free(1) and another | 1496 | * fix confusing memory reports from free(1) and another |
1357 | * side-effects, like CommitLimit going negative. | 1497 | * side-effects, like CommitLimit going negative. |
1358 | */ | 1498 | */ |
1359 | if (h->order > (MAX_ORDER - 1)) | 1499 | if (hstate_is_gigantic(h)) |
1360 | adjust_managed_page_count(page, 1 << h->order); | 1500 | adjust_managed_page_count(page, 1 << h->order); |
1361 | } | 1501 | } |
1362 | } | 1502 | } |
@@ -1366,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1366 | unsigned long i; | 1506 | unsigned long i; |
1367 | 1507 | ||
1368 | for (i = 0; i < h->max_huge_pages; ++i) { | 1508 | for (i = 0; i < h->max_huge_pages; ++i) { |
1369 | if (h->order >= MAX_ORDER) { | 1509 | if (hstate_is_gigantic(h)) { |
1370 | if (!alloc_bootmem_huge_page(h)) | 1510 | if (!alloc_bootmem_huge_page(h)) |
1371 | break; | 1511 | break; |
1372 | } else if (!alloc_fresh_huge_page(h, | 1512 | } else if (!alloc_fresh_huge_page(h, |
@@ -1382,7 +1522,7 @@ static void __init hugetlb_init_hstates(void) | |||
1382 | 1522 | ||
1383 | for_each_hstate(h) { | 1523 | for_each_hstate(h) { |
1384 | /* oversize hugepages were init'ed in early boot */ | 1524 | /* oversize hugepages were init'ed in early boot */ |
1385 | if (h->order < MAX_ORDER) | 1525 | if (!hstate_is_gigantic(h)) |
1386 | hugetlb_hstate_alloc_pages(h); | 1526 | hugetlb_hstate_alloc_pages(h); |
1387 | } | 1527 | } |
1388 | } | 1528 | } |
@@ -1416,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, | |||
1416 | { | 1556 | { |
1417 | int i; | 1557 | int i; |
1418 | 1558 | ||
1419 | if (h->order >= MAX_ORDER) | 1559 | if (hstate_is_gigantic(h)) |
1420 | return; | 1560 | return; |
1421 | 1561 | ||
1422 | for_each_node_mask(i, *nodes_allowed) { | 1562 | for_each_node_mask(i, *nodes_allowed) { |
@@ -1479,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1479 | { | 1619 | { |
1480 | unsigned long min_count, ret; | 1620 | unsigned long min_count, ret; |
1481 | 1621 | ||
1482 | if (h->order >= MAX_ORDER) | 1622 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
1483 | return h->max_huge_pages; | 1623 | return h->max_huge_pages; |
1484 | 1624 | ||
1485 | /* | 1625 | /* |
@@ -1506,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1506 | * and reducing the surplus. | 1646 | * and reducing the surplus. |
1507 | */ | 1647 | */ |
1508 | spin_unlock(&hugetlb_lock); | 1648 | spin_unlock(&hugetlb_lock); |
1509 | ret = alloc_fresh_huge_page(h, nodes_allowed); | 1649 | if (hstate_is_gigantic(h)) |
1650 | ret = alloc_fresh_gigantic_page(h, nodes_allowed); | ||
1651 | else | ||
1652 | ret = alloc_fresh_huge_page(h, nodes_allowed); | ||
1510 | spin_lock(&hugetlb_lock); | 1653 | spin_lock(&hugetlb_lock); |
1511 | if (!ret) | 1654 | if (!ret) |
1512 | goto out; | 1655 | goto out; |
@@ -1606,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1606 | goto out; | 1749 | goto out; |
1607 | 1750 | ||
1608 | h = kobj_to_hstate(kobj, &nid); | 1751 | h = kobj_to_hstate(kobj, &nid); |
1609 | if (h->order >= MAX_ORDER) { | 1752 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { |
1610 | err = -EINVAL; | 1753 | err = -EINVAL; |
1611 | goto out; | 1754 | goto out; |
1612 | } | 1755 | } |
@@ -1689,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1689 | unsigned long input; | 1832 | unsigned long input; |
1690 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1833 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1691 | 1834 | ||
1692 | if (h->order >= MAX_ORDER) | 1835 | if (hstate_is_gigantic(h)) |
1693 | return -EINVAL; | 1836 | return -EINVAL; |
1694 | 1837 | ||
1695 | err = kstrtoul(buf, 10, &input); | 1838 | err = kstrtoul(buf, 10, &input); |
@@ -2113,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2113 | 2256 | ||
2114 | tmp = h->max_huge_pages; | 2257 | tmp = h->max_huge_pages; |
2115 | 2258 | ||
2116 | if (write && h->order >= MAX_ORDER) | 2259 | if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) |
2117 | return -EINVAL; | 2260 | return -EINVAL; |
2118 | 2261 | ||
2119 | table->data = &tmp; | 2262 | table->data = &tmp; |
@@ -2169,7 +2312,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
2169 | 2312 | ||
2170 | tmp = h->nr_overcommit_huge_pages; | 2313 | tmp = h->nr_overcommit_huge_pages; |
2171 | 2314 | ||
2172 | if (write && h->order >= MAX_ORDER) | 2315 | if (write && hstate_is_gigantic(h)) |
2173 | return -EINVAL; | 2316 | return -EINVAL; |
2174 | 2317 | ||
2175 | table->data = &tmp; | 2318 | table->data = &tmp; |
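Taken together, the mm/hugetlb.c changes above let gigantic pages (1 GiB on x86-64) be allocated and freed at run time through the existing nr_hugepages interfaces, rather than only from bootmem, on configurations that provide the new CMA-based allocator. Assuming a 1 GiB page has been reserved (for example by writing 1 to /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages), a process maps it the same way as before; a minimal userspace sketch:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)	/* log2(1 GiB) encoded in the flags */
#endif

int main(void)
{
	size_t len = 1UL << 30;			/* one gigantic page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* no gigantic page reserved, or no arch support */
		return 1;
	}
	memset(p, 0, len);			/* touch it so the page is faulted in */
	munmap(p, len);
	return 0;
}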
diff --git a/mm/internal.h b/mm/internal.h index 07b67361a40a..7f22a11fcc66 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -134,7 +134,7 @@ struct compact_control { | |||
134 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 134 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
135 | unsigned long free_pfn; /* isolate_freepages search base */ | 135 | unsigned long free_pfn; /* isolate_freepages search base */ |
136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
137 | bool sync; /* Synchronous migration */ | 137 | enum migrate_mode mode; /* Async or sync migration mode */ |
138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
139 | bool finished_update_free; /* True when the zone cached pfns are | 139 | bool finished_update_free; /* True when the zone cached pfns are |
140 | * no longer being updated | 140 | * no longer being updated |
@@ -144,7 +144,10 @@ struct compact_control { | |||
144 | int order; /* order a direct compactor needs */ | 144 | int order; /* order a direct compactor needs */ |
145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
146 | struct zone *zone; | 146 | struct zone *zone; |
147 | bool contended; /* True if a lock was contended */ | 147 | bool contended; /* True if a lock was contended, or |
148 | * need_resched() true during async | ||
149 | * compaction | ||
150 | */ | ||
148 | }; | 151 | }; |
149 | 152 | ||
150 | unsigned long | 153 | unsigned long |
@@ -169,6 +172,11 @@ static inline unsigned long page_order(struct page *page) | |||
169 | return page_private(page); | 172 | return page_private(page); |
170 | } | 173 | } |
171 | 174 | ||
175 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
176 | { | ||
177 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
178 | } | ||
179 | |||
172 | /* mm/util.c */ | 180 | /* mm/util.c */ |
173 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | 181 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, |
174 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 182 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
@@ -184,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
184 | } | 192 | } |
185 | 193 | ||
186 | /* | 194 | /* |
187 | * Called only in fault path, to determine if a new page is being | ||
188 | * mapped into a LOCKED vma. If it is, mark page as mlocked. | ||
189 | */ | ||
190 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | ||
191 | struct page *page) | ||
192 | { | ||
193 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
194 | |||
195 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | ||
196 | return 0; | ||
197 | |||
198 | if (!TestSetPageMlocked(page)) { | ||
199 | mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
200 | hpage_nr_pages(page)); | ||
201 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
202 | } | ||
203 | return 1; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * must be called with vma's mmap_sem held for read or write, and page locked. | 195 | * must be called with vma's mmap_sem held for read or write, and page locked. |
208 | */ | 196 | */ |
209 | extern void mlock_vma_page(struct page *page); | 197 | extern void mlock_vma_page(struct page *page); |
@@ -245,10 +233,6 @@ extern unsigned long vma_address(struct page *page, | |||
245 | struct vm_area_struct *vma); | 233 | struct vm_area_struct *vma); |
246 | #endif | 234 | #endif |
247 | #else /* !CONFIG_MMU */ | 235 | #else /* !CONFIG_MMU */ |
248 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) | ||
249 | { | ||
250 | return 0; | ||
251 | } | ||
252 | static inline void clear_page_mlock(struct page *page) { } | 236 | static inline void clear_page_mlock(struct page *page) { } |
253 | static inline void mlock_vma_page(struct page *page) { } | 237 | static inline void mlock_vma_page(struct page *page) { } |
254 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 238 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
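Among the mm/internal.h changes, is_cow_mapping() becomes a shared helper here: a mapping is copy-on-write exactly when it may be written (VM_MAYWRITE) but is not shared (no VM_SHARED), i.e. a writable MAP_PRIVATE mapping. A self-contained illustration of the predicate, with flag values copied from the mm headers of this era (illustration only, not kernel code):

#include <assert.h>
#include <stdbool.h>

#define VM_SHARED	0x00000008UL
#define VM_MAYWRITE	0x00000020UL

static bool is_cow_mapping(unsigned long flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
	assert(is_cow_mapping(VM_MAYWRITE));			/* MAP_PRIVATE, writable */
	assert(!is_cow_mapping(VM_SHARED | VM_MAYWRITE));	/* MAP_SHARED mapping */
	assert(!is_cow_mapping(0));				/* read-only private */
	return 0;
}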
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d2fcdfeff7f..736ade31d1dc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void) | |||
1300 | /* | 1300 | /* |
1301 | * Struct page scanning for each node. | 1301 | * Struct page scanning for each node. |
1302 | */ | 1302 | */ |
1303 | lock_memory_hotplug(); | 1303 | get_online_mems(); |
1304 | for_each_online_node(i) { | 1304 | for_each_online_node(i) { |
1305 | unsigned long start_pfn = node_start_pfn(i); | 1305 | unsigned long start_pfn = node_start_pfn(i); |
1306 | unsigned long end_pfn = node_end_pfn(i); | 1306 | unsigned long end_pfn = node_end_pfn(i); |
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void) | |||
1318 | scan_block(page, page + 1, NULL, 1); | 1318 | scan_block(page, page + 1, NULL, 1); |
1319 | } | 1319 | } |
1320 | } | 1320 | } |
1321 | unlock_memory_hotplug(); | 1321 | put_online_mems(); |
1322 | 1322 | ||
1323 | /* | 1323 | /* |
1324 | * Scanning the task stacks (may introduce false negatives). | 1324 | * Scanning the task stacks (may introduce false negatives). |
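The kmemleak hunks swap the global memory-hotplug mutex for the get_online_mems()/put_online_mems() pair introduced elsewhere in this merge, which pins hotplug state with a reference count instead of excluding it outright. The calling pattern is simply (kernel-side fragment, for illustration):

	get_online_mems();
	for_each_online_node(nid) {
		/* node spans and their struct pages cannot be hot-removed here */
	}
	put_online_mems();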
diff --git a/mm/memblock.c b/mm/memblock.c index a810ba923cdd..0aa0d2b07624 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -1033,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
1033 | } | 1033 | } |
1034 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1034 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
1035 | 1035 | ||
1036 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | 1036 | static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, |
1037 | phys_addr_t align, phys_addr_t max_addr, | 1037 | phys_addr_t align, phys_addr_t start, |
1038 | int nid) | 1038 | phys_addr_t end, int nid) |
1039 | { | 1039 | { |
1040 | phys_addr_t found; | 1040 | phys_addr_t found; |
1041 | 1041 | ||
1042 | if (!align) | 1042 | if (!align) |
1043 | align = SMP_CACHE_BYTES; | 1043 | align = SMP_CACHE_BYTES; |
1044 | 1044 | ||
1045 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); | 1045 | found = memblock_find_in_range_node(size, align, start, end, nid); |
1046 | if (found && !memblock_reserve(found, size)) | 1046 | if (found && !memblock_reserve(found, size)) |
1047 | return found; | 1047 | return found; |
1048 | 1048 | ||
1049 | return 0; | 1049 | return 0; |
1050 | } | 1050 | } |
1051 | 1051 | ||
1052 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | ||
1053 | phys_addr_t start, phys_addr_t end) | ||
1054 | { | ||
1055 | return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); | ||
1056 | } | ||
1057 | |||
1058 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | ||
1059 | phys_addr_t align, phys_addr_t max_addr, | ||
1060 | int nid) | ||
1061 | { | ||
1062 | return memblock_alloc_range_nid(size, align, 0, max_addr, nid); | ||
1063 | } | ||
1064 | |||
1052 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) | 1065 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) |
1053 | { | 1066 | { |
1054 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); | 1067 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); |
@@ -1389,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, | |||
1389 | if (mid == -1) | 1402 | if (mid == -1) |
1390 | return -1; | 1403 | return -1; |
1391 | 1404 | ||
1392 | *start_pfn = type->regions[mid].base >> PAGE_SHIFT; | 1405 | *start_pfn = PFN_DOWN(type->regions[mid].base); |
1393 | *end_pfn = (type->regions[mid].base + type->regions[mid].size) | 1406 | *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); |
1394 | >> PAGE_SHIFT; | ||
1395 | 1407 | ||
1396 | return type->regions[mid].nid; | 1408 | return type->regions[mid].nid; |
1397 | } | 1409 | } |
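The memblock change splits out memblock_alloc_range_nid() and exports memblock_alloc_range(size, align, start, end), which reserves a block inside an explicit physical window and returns 0 on failure. A hypothetical early-boot caller might look like this (sketch only; the 1 GiB-below-4 GiB window is just an illustration):

	phys_addr_t base;

	base = memblock_alloc_range(1ULL << 30,		/* size:  1 GiB */
				    1ULL << 30,		/* align: 1 GiB */
				    0,			/* window start */
				    4ULL << 30);	/* window end: 4 GiB */
	if (!base)
		pr_warn("no contiguous 1 GiB block below 4 GiB\n");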
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5177c6d4a2dd..a500cb0594c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly; | |||
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
81 | static int really_do_swap_account __initdata = 1; | 81 | static int really_do_swap_account __initdata = 1; |
82 | #else | 82 | #else |
83 | static int really_do_swap_account __initdata = 0; | 83 | static int really_do_swap_account __initdata; |
84 | #endif | 84 | #endif |
85 | 85 | ||
86 | #else | 86 | #else |
@@ -357,10 +357,9 @@ struct mem_cgroup { | |||
357 | struct cg_proto tcp_mem; | 357 | struct cg_proto tcp_mem; |
358 | #endif | 358 | #endif |
359 | #if defined(CONFIG_MEMCG_KMEM) | 359 | #if defined(CONFIG_MEMCG_KMEM) |
360 | /* analogous to slab_common's slab_caches list. per-memcg */ | 360 | /* analogous to slab_common's slab_caches list, but per-memcg; |
361 | * protected by memcg_slab_mutex */ | ||
361 | struct list_head memcg_slab_caches; | 362 | struct list_head memcg_slab_caches; |
362 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
363 | struct mutex slab_caches_mutex; | ||
364 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 363 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
365 | int kmemcg_id; | 364 | int kmemcg_id; |
366 | #endif | 365 | #endif |
@@ -1595,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1595 | } | 1594 | } |
1596 | 1595 | ||
1597 | /* | 1596 | /* |
1598 | * 2 routines for checking "mem" is under move_account() or not. | 1597 | * A routine for checking "mem" is under move_account() or not. |
1599 | * | 1598 | * |
1600 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This | 1599 | * Checking a cgroup is mc.from or mc.to or under hierarchy of |
1601 | * is used for avoiding races in accounting. If true, | 1600 | * moving cgroups. This is for waiting at high-memory pressure |
1602 | * pc->mem_cgroup may be overwritten. | 1601 | * caused by "move". |
1603 | * | ||
1604 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | ||
1605 | * under hierarchy of moving cgroups. This is for | ||
1606 | * waiting at hith-memory prressure caused by "move". | ||
1607 | */ | 1602 | */ |
1608 | |||
1609 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) | ||
1610 | { | ||
1611 | VM_BUG_ON(!rcu_read_lock_held()); | ||
1612 | return atomic_read(&memcg->moving_account) > 0; | ||
1613 | } | ||
1614 | |||
1615 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1603 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1616 | { | 1604 | { |
1617 | struct mem_cgroup *from; | 1605 | struct mem_cgroup *from; |
@@ -1654,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1654 | * Take this lock when | 1642 | * Take this lock when |
1655 | * - a code tries to modify page's memcg while it's USED. | 1643 | * - a code tries to modify page's memcg while it's USED. |
1656 | * - a code tries to modify page state accounting in a memcg. | 1644 | * - a code tries to modify page state accounting in a memcg. |
1657 | * see mem_cgroup_stolen(), too. | ||
1658 | */ | 1645 | */ |
1659 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | 1646 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, |
1660 | unsigned long *flags) | 1647 | unsigned long *flags) |
@@ -2289,12 +2276,11 @@ cleanup: | |||
2289 | } | 2276 | } |
2290 | 2277 | ||
2291 | /* | 2278 | /* |
2292 | * Currently used to update mapped file statistics, but the routine can be | 2279 | * Used to update mapped file or writeback or other statistics. |
2293 | * generalized to update other statistics as well. | ||
2294 | * | 2280 | * |
2295 | * Notes: Race condition | 2281 | * Notes: Race condition |
2296 | * | 2282 | * |
2297 | * We usually use page_cgroup_lock() for accessing page_cgroup member but | 2283 | * We usually use lock_page_cgroup() for accessing page_cgroup member but |
2298 | * it tends to be costly. But considering some conditions, we doesn't need | 2284 | * it tends to be costly. But considering some conditions, we doesn't need |
2299 | * to do so _always_. | 2285 | * to do so _always_. |
2300 | * | 2286 | * |
@@ -2308,8 +2294,8 @@ cleanup: | |||
2308 | * by flags. | 2294 | * by flags. |
2309 | * | 2295 | * |
2310 | * Considering "move", this is an only case we see a race. To make the race | 2296 | * Considering "move", this is an only case we see a race. To make the race |
2311 | * small, we check mm->moving_account and detect there are possibility of race | 2297 | * small, we check memcg->moving_account and detect there are possibility |
2312 | * If there is, we take a lock. | 2298 | * of race or not. If there is, we take a lock. |
2313 | */ | 2299 | */ |
2314 | 2300 | ||
2315 | void __mem_cgroup_begin_update_page_stat(struct page *page, | 2301 | void __mem_cgroup_begin_update_page_stat(struct page *page, |
@@ -2327,9 +2313,10 @@ again: | |||
2327 | * If this memory cgroup is not under account moving, we don't | 2313 | * If this memory cgroup is not under account moving, we don't |
2328 | * need to take move_lock_mem_cgroup(). Because we already hold | 2314 | * need to take move_lock_mem_cgroup(). Because we already hold |
2329 | * rcu_read_lock(), any calls to move_account will be delayed until | 2315 | * rcu_read_lock(), any calls to move_account will be delayed until |
2330 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 2316 | * rcu_read_unlock(). |
2331 | */ | 2317 | */ |
2332 | if (!mem_cgroup_stolen(memcg)) | 2318 | VM_BUG_ON(!rcu_read_lock_held()); |
2319 | if (atomic_read(&memcg->moving_account) <= 0) | ||
2333 | return; | 2320 | return; |
2334 | 2321 | ||
2335 | move_lock_mem_cgroup(memcg, flags); | 2322 | move_lock_mem_cgroup(memcg, flags); |
@@ -2437,7 +2424,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) | |||
2437 | */ | 2424 | */ |
2438 | static void drain_local_stock(struct work_struct *dummy) | 2425 | static void drain_local_stock(struct work_struct *dummy) |
2439 | { | 2426 | { |
2440 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 2427 | struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); |
2441 | drain_stock(stock); | 2428 | drain_stock(stock); |
2442 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2429 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2443 | } | 2430 | } |
@@ -2684,7 +2671,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, | |||
2684 | * free their memory. | 2671 | * free their memory. |
2685 | */ | 2672 | */ |
2686 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 2673 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2687 | fatal_signal_pending(current))) | 2674 | fatal_signal_pending(current) || |
2675 | current->flags & PF_EXITING)) | ||
2688 | goto bypass; | 2676 | goto bypass; |
2689 | 2677 | ||
2690 | if (unlikely(task_in_memcg_oom(current))) | 2678 | if (unlikely(task_in_memcg_oom(current))) |
@@ -2912,6 +2900,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2912 | static DEFINE_MUTEX(set_limit_mutex); | 2900 | static DEFINE_MUTEX(set_limit_mutex); |
2913 | 2901 | ||
2914 | #ifdef CONFIG_MEMCG_KMEM | 2902 | #ifdef CONFIG_MEMCG_KMEM |
2903 | /* | ||
2904 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | ||
2905 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
2906 | */ | ||
2907 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
2908 | |||
2915 | static DEFINE_MUTEX(activate_kmem_mutex); | 2909 | static DEFINE_MUTEX(activate_kmem_mutex); |
2916 | 2910 | ||
2917 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 2911 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
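The per-memcg slab_caches_mutex is replaced by this single global memcg_slab_mutex, which now serializes creation, destruction and traversal of all per-memcg caches. Every walker follows the same shape (kernel-side fragment mirroring mem_cgroup_slabinfo_read() below; visit() is a placeholder):

	struct memcg_cache_params *params;

	mutex_lock(&memcg_slab_mutex);
	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
		visit(memcg_params_to_cache(params));
	mutex_unlock(&memcg_slab_mutex);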
@@ -2944,10 +2938,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
2944 | 2938 | ||
2945 | print_slabinfo_header(m); | 2939 | print_slabinfo_header(m); |
2946 | 2940 | ||
2947 | mutex_lock(&memcg->slab_caches_mutex); | 2941 | mutex_lock(&memcg_slab_mutex); |
2948 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | 2942 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) |
2949 | cache_show(memcg_params_to_cache(params), m); | 2943 | cache_show(memcg_params_to_cache(params), m); |
2950 | mutex_unlock(&memcg->slab_caches_mutex); | 2944 | mutex_unlock(&memcg_slab_mutex); |
2951 | 2945 | ||
2952 | return 0; | 2946 | return 0; |
2953 | } | 2947 | } |
@@ -3049,8 +3043,6 @@ void memcg_update_array_size(int num) | |||
3049 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | 3043 | memcg_limited_groups_array_size = memcg_caches_array_size(num); |
3050 | } | 3044 | } |
3051 | 3045 | ||
3052 | static void kmem_cache_destroy_work_func(struct work_struct *w); | ||
3053 | |||
3054 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | 3046 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) |
3055 | { | 3047 | { |
3056 | struct memcg_cache_params *cur_params = s->memcg_params; | 3048 | struct memcg_cache_params *cur_params = s->memcg_params; |
@@ -3103,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3103 | return 0; | 3095 | return 0; |
3104 | } | 3096 | } |
3105 | 3097 | ||
3106 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
3107 | struct kmem_cache *root_cache) | ||
3108 | { | ||
3109 | static char *buf = NULL; | ||
3110 | |||
3111 | /* | ||
3112 | * We need a mutex here to protect the shared buffer. Since this is | ||
3113 | * expected to be called only on cache creation, we can employ the | ||
3114 | * slab_mutex for that purpose. | ||
3115 | */ | ||
3116 | lockdep_assert_held(&slab_mutex); | ||
3117 | |||
3118 | if (!buf) { | ||
3119 | buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3120 | if (!buf) | ||
3121 | return NULL; | ||
3122 | } | ||
3123 | |||
3124 | cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); | ||
3125 | return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | ||
3126 | memcg_cache_id(memcg), buf); | ||
3127 | } | ||
3128 | |||
3129 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 3098 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3130 | struct kmem_cache *root_cache) | 3099 | struct kmem_cache *root_cache) |
3131 | { | 3100 | { |
@@ -3147,8 +3116,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3147 | if (memcg) { | 3116 | if (memcg) { |
3148 | s->memcg_params->memcg = memcg; | 3117 | s->memcg_params->memcg = memcg; |
3149 | s->memcg_params->root_cache = root_cache; | 3118 | s->memcg_params->root_cache = root_cache; |
3150 | INIT_WORK(&s->memcg_params->destroy, | ||
3151 | kmem_cache_destroy_work_func); | ||
3152 | css_get(&memcg->css); | 3119 | css_get(&memcg->css); |
3153 | } else | 3120 | } else |
3154 | s->memcg_params->is_root_cache = true; | 3121 | s->memcg_params->is_root_cache = true; |
@@ -3165,24 +3132,37 @@ void memcg_free_cache_params(struct kmem_cache *s) | |||
3165 | kfree(s->memcg_params); | 3132 | kfree(s->memcg_params); |
3166 | } | 3133 | } |
3167 | 3134 | ||
3168 | void memcg_register_cache(struct kmem_cache *s) | 3135 | static void memcg_register_cache(struct mem_cgroup *memcg, |
3136 | struct kmem_cache *root_cache) | ||
3169 | { | 3137 | { |
3170 | struct kmem_cache *root; | 3138 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by |
3171 | struct mem_cgroup *memcg; | 3139 | memcg_slab_mutex */ |
3140 | struct kmem_cache *cachep; | ||
3172 | int id; | 3141 | int id; |
3173 | 3142 | ||
3174 | if (is_root_cache(s)) | 3143 | lockdep_assert_held(&memcg_slab_mutex); |
3144 | |||
3145 | id = memcg_cache_id(memcg); | ||
3146 | |||
3147 | /* | ||
3148 | * Since per-memcg caches are created asynchronously on first | ||
3149 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
3150 | * create the same cache, but only one of them may succeed. | ||
3151 | */ | ||
3152 | if (cache_from_memcg_idx(root_cache, id)) | ||
3175 | return; | 3153 | return; |
3176 | 3154 | ||
3155 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
3156 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
3177 | /* | 3157 | /* |
3178 | * Holding the slab_mutex assures nobody will touch the memcg_caches | 3158 | * If we could not create a memcg cache, do not complain, because |
3179 | * array while we are modifying it. | 3159 | * that's not critical at all as we can always proceed with the root |
3160 | * cache. | ||
3180 | */ | 3161 | */ |
3181 | lockdep_assert_held(&slab_mutex); | 3162 | if (!cachep) |
3163 | return; | ||
3182 | 3164 | ||
3183 | root = s->memcg_params->root_cache; | 3165 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
3184 | memcg = s->memcg_params->memcg; | ||
3185 | id = memcg_cache_id(memcg); | ||
3186 | 3166 | ||
3187 | /* | 3167 | /* |
3188 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 3168 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
@@ -3191,49 +3171,30 @@ void memcg_register_cache(struct kmem_cache *s) | |||
3191 | */ | 3171 | */ |
3192 | smp_wmb(); | 3172 | smp_wmb(); |
3193 | 3173 | ||
3194 | /* | 3174 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); |
3195 | * Initialize the pointer to this cache in its parent's memcg_params | 3175 | root_cache->memcg_params->memcg_caches[id] = cachep; |
3196 | * before adding it to the memcg_slab_caches list, otherwise we can | ||
3197 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3198 | */ | ||
3199 | VM_BUG_ON(root->memcg_params->memcg_caches[id]); | ||
3200 | root->memcg_params->memcg_caches[id] = s; | ||
3201 | |||
3202 | mutex_lock(&memcg->slab_caches_mutex); | ||
3203 | list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); | ||
3204 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3205 | } | 3176 | } |
3206 | 3177 | ||
3207 | void memcg_unregister_cache(struct kmem_cache *s) | 3178 | static void memcg_unregister_cache(struct kmem_cache *cachep) |
3208 | { | 3179 | { |
3209 | struct kmem_cache *root; | 3180 | struct kmem_cache *root_cache; |
3210 | struct mem_cgroup *memcg; | 3181 | struct mem_cgroup *memcg; |
3211 | int id; | 3182 | int id; |
3212 | 3183 | ||
3213 | if (is_root_cache(s)) | 3184 | lockdep_assert_held(&memcg_slab_mutex); |
3214 | return; | ||
3215 | 3185 | ||
3216 | /* | 3186 | BUG_ON(is_root_cache(cachep)); |
3217 | * Holding the slab_mutex assures nobody will touch the memcg_caches | ||
3218 | * array while we are modifying it. | ||
3219 | */ | ||
3220 | lockdep_assert_held(&slab_mutex); | ||
3221 | 3187 | ||
3222 | root = s->memcg_params->root_cache; | 3188 | root_cache = cachep->memcg_params->root_cache; |
3223 | memcg = s->memcg_params->memcg; | 3189 | memcg = cachep->memcg_params->memcg; |
3224 | id = memcg_cache_id(memcg); | 3190 | id = memcg_cache_id(memcg); |
3225 | 3191 | ||
3226 | mutex_lock(&memcg->slab_caches_mutex); | 3192 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); |
3227 | list_del(&s->memcg_params->list); | 3193 | root_cache->memcg_params->memcg_caches[id] = NULL; |
3228 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3229 | 3194 | ||
3230 | /* | 3195 | list_del(&cachep->memcg_params->list); |
3231 | * Clear the pointer to this cache in its parent's memcg_params only | 3196 | |
3232 | * after removing it from the memcg_slab_caches list, otherwise we can | 3197 | kmem_cache_destroy(cachep); |
3233 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3234 | */ | ||
3235 | VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); | ||
3236 | root->memcg_params->memcg_caches[id] = NULL; | ||
3237 | } | 3198 | } |
3238 | 3199 | ||
3239 | /* | 3200 | /* |
@@ -3267,144 +3228,61 @@ static inline void memcg_resume_kmem_account(void) | |||
3267 | current->memcg_kmem_skip_account--; | 3228 | current->memcg_kmem_skip_account--; |
3268 | } | 3229 | } |
3269 | 3230 | ||
3270 | static void kmem_cache_destroy_work_func(struct work_struct *w) | 3231 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
3271 | { | ||
3272 | struct kmem_cache *cachep; | ||
3273 | struct memcg_cache_params *p; | ||
3274 | |||
3275 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3276 | |||
3277 | cachep = memcg_params_to_cache(p); | ||
3278 | |||
3279 | /* | ||
3280 | * If we get down to 0 after shrink, we could delete right away. | ||
3281 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3282 | * in that case. If we proceed deleting, we'll get a dangling | ||
3283 | * reference, and removing the object from the workqueue in that case | ||
3284 | * is unnecessary complication. We are not a fast path. | ||
3285 | * | ||
3286 | * Note that this case is fundamentally different from racing with | ||
3287 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3288 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3289 | * into the queue, but doing so from inside the worker racing to | ||
3290 | * destroy it. | ||
3291 | * | ||
3292 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3293 | * again | ||
3294 | */ | ||
3295 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) | ||
3296 | kmem_cache_shrink(cachep); | ||
3297 | else | ||
3298 | kmem_cache_destroy(cachep); | ||
3299 | } | ||
3300 | |||
3301 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3302 | { | ||
3303 | if (!cachep->memcg_params->dead) | ||
3304 | return; | ||
3305 | |||
3306 | /* | ||
3307 | * There are many ways in which we can get here. | ||
3308 | * | ||
3309 | * We can get to a memory-pressure situation while the delayed work is | ||
3310 | * still pending to run. The vmscan shrinkers can then release all | ||
3311 | * cache memory and get us to destruction. If this is the case, we'll | ||
3312 | * be executed twice, which is a bug (the second time will execute over | ||
3313 | * bogus data). In this case, cancelling the work should be fine. | ||
3314 | * | ||
3315 | * But we can also get here from the worker itself, if | ||
3316 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3317 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3318 | * cancel the work (the worker runs with an internal lock held, which | ||
3319 | * is the same lock we would hold for cancel_work_sync().) | ||
3320 | * | ||
3321 | * Since we can't possibly know who got us here, just refrain from | ||
3322 | * running if there is already work pending | ||
3323 | */ | ||
3324 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3325 | return; | ||
3326 | /* | ||
3327 | * We have to defer the actual destroying to a workqueue, because | ||
3328 | * we might currently be in a context that cannot sleep. | ||
3329 | */ | ||
3330 | schedule_work(&cachep->memcg_params->destroy); | ||
3331 | } | ||
3332 | |||
3333 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3334 | { | 3232 | { |
3335 | struct kmem_cache *c; | 3233 | struct kmem_cache *c; |
3336 | int i, failed = 0; | 3234 | int i, failed = 0; |
3337 | 3235 | ||
3338 | /* | 3236 | mutex_lock(&memcg_slab_mutex); |
3339 | * If the cache is being destroyed, we trust that there is no one else | ||
3340 | * requesting objects from it. Even if there are, the sanity checks in | ||
3341 | * kmem_cache_destroy should caught this ill-case. | ||
3342 | * | ||
3343 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3344 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3345 | * we'll take the activate_kmem_mutex to protect ourselves against | ||
3346 | * this. | ||
3347 | */ | ||
3348 | mutex_lock(&activate_kmem_mutex); | ||
3349 | for_each_memcg_cache_index(i) { | 3237 | for_each_memcg_cache_index(i) { |
3350 | c = cache_from_memcg_idx(s, i); | 3238 | c = cache_from_memcg_idx(s, i); |
3351 | if (!c) | 3239 | if (!c) |
3352 | continue; | 3240 | continue; |
3353 | 3241 | ||
3354 | /* | 3242 | memcg_unregister_cache(c); |
3355 | * We will now manually delete the caches, so to avoid races | ||
3356 | * we need to cancel all pending destruction workers and | ||
3357 | * proceed with destruction ourselves. | ||
3358 | * | ||
3359 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3360 | * and that could spawn the workers again: it is likely that | ||
3361 | * the cache still have active pages until this very moment. | ||
3362 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3363 | * | ||
3364 | * But that will not execute at all if the "dead" flag is not | ||
3365 | * set, so flip it down to guarantee we are in control. | ||
3366 | */ | ||
3367 | c->memcg_params->dead = false; | ||
3368 | cancel_work_sync(&c->memcg_params->destroy); | ||
3369 | kmem_cache_destroy(c); | ||
3370 | 3243 | ||
3371 | if (cache_from_memcg_idx(s, i)) | 3244 | if (cache_from_memcg_idx(s, i)) |
3372 | failed++; | 3245 | failed++; |
3373 | } | 3246 | } |
3374 | mutex_unlock(&activate_kmem_mutex); | 3247 | mutex_unlock(&memcg_slab_mutex); |
3375 | return failed; | 3248 | return failed; |
3376 | } | 3249 | } |
3377 | 3250 | ||
3378 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3251 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3379 | { | 3252 | { |
3380 | struct kmem_cache *cachep; | 3253 | struct kmem_cache *cachep; |
3381 | struct memcg_cache_params *params; | 3254 | struct memcg_cache_params *params, *tmp; |
3382 | 3255 | ||
3383 | if (!memcg_kmem_is_active(memcg)) | 3256 | if (!memcg_kmem_is_active(memcg)) |
3384 | return; | 3257 | return; |
3385 | 3258 | ||
3386 | mutex_lock(&memcg->slab_caches_mutex); | 3259 | mutex_lock(&memcg_slab_mutex); |
3387 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | 3260 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
3388 | cachep = memcg_params_to_cache(params); | 3261 | cachep = memcg_params_to_cache(params); |
3389 | cachep->memcg_params->dead = true; | 3262 | kmem_cache_shrink(cachep); |
3390 | schedule_work(&cachep->memcg_params->destroy); | 3263 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) |
3264 | memcg_unregister_cache(cachep); | ||
3391 | } | 3265 | } |
3392 | mutex_unlock(&memcg->slab_caches_mutex); | 3266 | mutex_unlock(&memcg_slab_mutex); |
3393 | } | 3267 | } |
3394 | 3268 | ||
3395 | struct create_work { | 3269 | struct memcg_register_cache_work { |
3396 | struct mem_cgroup *memcg; | 3270 | struct mem_cgroup *memcg; |
3397 | struct kmem_cache *cachep; | 3271 | struct kmem_cache *cachep; |
3398 | struct work_struct work; | 3272 | struct work_struct work; |
3399 | }; | 3273 | }; |
3400 | 3274 | ||
3401 | static void memcg_create_cache_work_func(struct work_struct *w) | 3275 | static void memcg_register_cache_func(struct work_struct *w) |
3402 | { | 3276 | { |
3403 | struct create_work *cw = container_of(w, struct create_work, work); | 3277 | struct memcg_register_cache_work *cw = |
3278 | container_of(w, struct memcg_register_cache_work, work); | ||
3404 | struct mem_cgroup *memcg = cw->memcg; | 3279 | struct mem_cgroup *memcg = cw->memcg; |
3405 | struct kmem_cache *cachep = cw->cachep; | 3280 | struct kmem_cache *cachep = cw->cachep; |
3406 | 3281 | ||
3407 | kmem_cache_create_memcg(memcg, cachep); | 3282 | mutex_lock(&memcg_slab_mutex); |
3283 | memcg_register_cache(memcg, cachep); | ||
3284 | mutex_unlock(&memcg_slab_mutex); | ||
3285 | |||
3408 | css_put(&memcg->css); | 3286 | css_put(&memcg->css); |
3409 | kfree(cw); | 3287 | kfree(cw); |
3410 | } | 3288 | } |
@@ -3412,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3412 | /* | 3290 | /* |
3413 | * Enqueue the creation of a per-memcg kmem_cache. | 3291 | * Enqueue the creation of a per-memcg kmem_cache. |
3414 | */ | 3292 | */ |
3415 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3293 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3416 | struct kmem_cache *cachep) | 3294 | struct kmem_cache *cachep) |
3417 | { | 3295 | { |
3418 | struct create_work *cw; | 3296 | struct memcg_register_cache_work *cw; |
3419 | 3297 | ||
3420 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | 3298 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
3421 | if (cw == NULL) { | 3299 | if (cw == NULL) { |
3422 | css_put(&memcg->css); | 3300 | css_put(&memcg->css); |
3423 | return; | 3301 | return; |
@@ -3426,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3426 | cw->memcg = memcg; | 3304 | cw->memcg = memcg; |
3427 | cw->cachep = cachep; | 3305 | cw->cachep = cachep; |
3428 | 3306 | ||
3429 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | 3307 | INIT_WORK(&cw->work, memcg_register_cache_func); |
3430 | schedule_work(&cw->work); | 3308 | schedule_work(&cw->work); |
3431 | } | 3309 | } |
3432 | 3310 | ||
3433 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3311 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3434 | struct kmem_cache *cachep) | 3312 | struct kmem_cache *cachep) |
3435 | { | 3313 | { |
3436 | /* | 3314 | /* |
3437 | * We need to stop accounting when we kmalloc, because if the | 3315 | * We need to stop accounting when we kmalloc, because if the |
3438 | * corresponding kmalloc cache is not yet created, the first allocation | 3316 | * corresponding kmalloc cache is not yet created, the first allocation |
3439 | * in __memcg_create_cache_enqueue will recurse. | 3317 | * in __memcg_schedule_register_cache will recurse. |
3440 | * | 3318 | * |
3441 | * However, it is better to enclose the whole function. Depending on | 3319 | * However, it is better to enclose the whole function. Depending on |
3442 | * the debugging options enabled, INIT_WORK(), for instance, can | 3320 | * the debugging options enabled, INIT_WORK(), for instance, can |
@@ -3445,9 +3323,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3445 | * the safest choice is to do it like this, wrapping the whole function. | 3323 | * the safest choice is to do it like this, wrapping the whole function. |
3446 | */ | 3324 | */ |
3447 | memcg_stop_kmem_account(); | 3325 | memcg_stop_kmem_account(); |
3448 | __memcg_create_cache_enqueue(memcg, cachep); | 3326 | __memcg_schedule_register_cache(memcg, cachep); |
3449 | memcg_resume_kmem_account(); | 3327 | memcg_resume_kmem_account(); |
3450 | } | 3328 | } |
3329 | |||
3330 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
3331 | { | ||
3332 | int res; | ||
3333 | |||
3334 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, | ||
3335 | PAGE_SIZE << order); | ||
3336 | if (!res) | ||
3337 | atomic_add(1 << order, &cachep->memcg_params->nr_pages); | ||
3338 | return res; | ||
3339 | } | ||
3340 | |||
3341 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
3342 | { | ||
3343 | memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); | ||
3344 | atomic_sub(1 << order, &cachep->memcg_params->nr_pages); | ||
3345 | } | ||
3346 | |||
3451 | /* | 3347 | /* |
3452 | * Return the kmem_cache we're supposed to use for a slab allocation. | 3348 | * Return the kmem_cache we're supposed to use for a slab allocation. |
3453 | * We try to use the current memcg's version of the cache. | 3349 | * We try to use the current memcg's version of the cache. |
@@ -3498,22 +3394,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3498 | * | 3394 | * |
3499 | * However, there are some clashes that can arrive from locking. | 3395 | * However, there are some clashes that can arrive from locking. |
3500 | * For instance, because we acquire the slab_mutex while doing | 3396 | * For instance, because we acquire the slab_mutex while doing |
3501 | * kmem_cache_dup, this means no further allocation could happen | 3397 | * memcg_create_kmem_cache, this means no further allocation |
3502 | * with the slab_mutex held. | 3398 | * could happen with the slab_mutex held. So it's better to |
3503 | * | 3399 | * defer everything. |
3504 | * Also, because cache creation issue get_online_cpus(), this | ||
3505 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3506 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3507 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3508 | * better to defer everything. | ||
3509 | */ | 3400 | */ |
3510 | memcg_create_cache_enqueue(memcg, cachep); | 3401 | memcg_schedule_register_cache(memcg, cachep); |
3511 | return cachep; | 3402 | return cachep; |
3512 | out: | 3403 | out: |
3513 | rcu_read_unlock(); | 3404 | rcu_read_unlock(); |
3514 | return cachep; | 3405 | return cachep; |
3515 | } | 3406 | } |
3516 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3517 | 3407 | ||
3518 | /* | 3408 | /* |
3519 | * We need to verify if the allocation against current->mm->owner's memcg is | 3409 | * We need to verify if the allocation against current->mm->owner's memcg is |
@@ -3540,11 +3430,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3540 | /* | 3430 | /* |
3541 | * Disabling accounting is only relevant for some specific memcg | 3431 | * Disabling accounting is only relevant for some specific memcg |
3542 | * internal allocations. Therefore we would initially not have such | 3432 | * internal allocations. Therefore we would initially not have such |
3543 | * check here, since direct calls to the page allocator that are marked | 3433 | * check here, since direct calls to the page allocator that are |
3544 | * with GFP_KMEMCG only happen outside memcg core. We are mostly | 3434 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen |
3545 | * concerned with cache allocations, and by having this test at | 3435 | * outside memcg core. We are mostly concerned with cache allocations, |
3546 | * memcg_kmem_get_cache, we are already able to relay the allocation to | 3436 | * and by having this test at memcg_kmem_get_cache, we are already able |
3547 | * the root cache and bypass the memcg cache altogether. | 3437 | * to relay the allocation to the root cache and bypass the memcg cache |
3438 | * altogether. | ||
3548 | * | 3439 | * |
3549 | * There is one exception, though: the SLUB allocator does not create | 3440 | * There is one exception, though: the SLUB allocator does not create |
3550 | * large order caches, but rather service large kmallocs directly from | 3441 | * large order caches, but rather service large kmallocs directly from |
@@ -3631,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
3631 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3522 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
3632 | } | 3523 | } |
3633 | #else | 3524 | #else |
3634 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3525 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3635 | { | 3526 | { |
3636 | } | 3527 | } |
3637 | #endif /* CONFIG_MEMCG_KMEM */ | 3528 | #endif /* CONFIG_MEMCG_KMEM */ |
@@ -4784,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
4784 | if (mem_cgroup_move_parent(page, pc, memcg)) { | 4675 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
4785 | /* found lock contention or "pc" is obsolete. */ | 4676 | /* found lock contention or "pc" is obsolete. */ |
4786 | busy = page; | 4677 | busy = page; |
4787 | cond_resched(); | ||
4788 | } else | 4678 | } else |
4789 | busy = NULL; | 4679 | busy = NULL; |
4680 | cond_resched(); | ||
4790 | } while (!list_empty(list)); | 4681 | } while (!list_empty(list)); |
4791 | } | 4682 | } |
4792 | 4683 | ||
@@ -5062,13 +4953,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
5062 | * Make sure we have enough space for this cgroup in each root cache's | 4953 | * Make sure we have enough space for this cgroup in each root cache's |
5063 | * memcg_params. | 4954 | * memcg_params. |
5064 | */ | 4955 | */ |
4956 | mutex_lock(&memcg_slab_mutex); | ||
5065 | err = memcg_update_all_caches(memcg_id + 1); | 4957 | err = memcg_update_all_caches(memcg_id + 1); |
4958 | mutex_unlock(&memcg_slab_mutex); | ||
5066 | if (err) | 4959 | if (err) |
5067 | goto out_rmid; | 4960 | goto out_rmid; |
5068 | 4961 | ||
5069 | memcg->kmemcg_id = memcg_id; | 4962 | memcg->kmemcg_id = memcg_id; |
5070 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4963 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
5071 | mutex_init(&memcg->slab_caches_mutex); | ||
5072 | 4964 | ||
5073 | /* | 4965 | /* |
5074 | * We couldn't have accounted to this cgroup, because it hasn't got the | 4966 | * We couldn't have accounted to this cgroup, because it hasn't got the |
@@ -5443,22 +5335,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |||
5443 | struct cftype *cft, u64 val) | 5335 | struct cftype *cft, u64 val) |
5444 | { | 5336 | { |
5445 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5337 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5446 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5447 | 5338 | ||
5448 | if (val > 100 || !parent) | 5339 | if (val > 100) |
5449 | return -EINVAL; | 5340 | return -EINVAL; |
5450 | 5341 | ||
5451 | mutex_lock(&memcg_create_mutex); | 5342 | if (css_parent(css)) |
5452 | 5343 | memcg->swappiness = val; | |
5453 | /* If under hierarchy, only empty-root can set this value */ | 5344 | else |
5454 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | 5345 | vm_swappiness = val; |
5455 | mutex_unlock(&memcg_create_mutex); | ||
5456 | return -EINVAL; | ||
5457 | } | ||
5458 | |||
5459 | memcg->swappiness = val; | ||
5460 | |||
5461 | mutex_unlock(&memcg_create_mutex); | ||
5462 | 5346 | ||
5463 | return 0; | 5347 | return 0; |
5464 | } | 5348 | } |
@@ -5790,22 +5674,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | |||
5790 | struct cftype *cft, u64 val) | 5674 | struct cftype *cft, u64 val) |
5791 | { | 5675 | { |
5792 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5676 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5793 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5794 | 5677 | ||
5795 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 5678 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
5796 | if (!parent || !((val == 0) || (val == 1))) | 5679 | if (!css_parent(css) || !((val == 0) || (val == 1))) |
5797 | return -EINVAL; | 5680 | return -EINVAL; |
5798 | 5681 | ||
5799 | mutex_lock(&memcg_create_mutex); | ||
5800 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
5801 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | ||
5802 | mutex_unlock(&memcg_create_mutex); | ||
5803 | return -EINVAL; | ||
5804 | } | ||
5805 | memcg->oom_kill_disable = val; | 5682 | memcg->oom_kill_disable = val; |
5806 | if (!val) | 5683 | if (!val) |
5807 | memcg_oom_recover(memcg); | 5684 | memcg_oom_recover(memcg); |
5808 | mutex_unlock(&memcg_create_mutex); | 5685 | |
5809 | return 0; | 5686 | return 0; |
5810 | } | 5687 | } |
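The two write handlers above (memory.swappiness and memory.oom_control in a v1 memory cgroup) drop the memcg_create_mutex dance: a non-root group can now change either knob even when it already has children, only the root group is still rejected for oom_control, and a swappiness write on the root falls through to the global vm_swappiness. From user space nothing changes syntactically; a small sketch, assuming a v1 memory controller mounted at the conventional path and a hypothetical child group "mygroup":

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* assumption: v1 memory cgroup at the usual mount point, child group "mygroup" */
    #define GRP "/sys/fs/cgroup/memory/mygroup/"

    static int write_knob(const char *file, const char *val)
    {
            char path[256];
            int fd, ok;

            snprintf(path, sizeof(path), GRP "%s", file);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            ok = write(fd, val, strlen(val)) == (ssize_t)strlen(val) ? 0 : -1;
            close(fd);
            return ok;
    }

    int main(void)
    {
            write_knob("memory.swappiness", "10");  /* now accepted even with child groups */
            write_knob("memory.oom_control", "1");  /* sets oom_kill_disable for this memcg */
            return 0;
    }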
5811 | 5688 | ||
@@ -6491,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6491 | css_for_each_descendant_post(iter, css) | 6368 | css_for_each_descendant_post(iter, css) |
6492 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | 6369 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); |
6493 | 6370 | ||
6494 | mem_cgroup_destroy_all_caches(memcg); | 6371 | memcg_unregister_all_caches(memcg); |
6495 | vmpressure_cleanup(&memcg->vmpressure); | 6372 | vmpressure_cleanup(&memcg->vmpressure); |
6496 | } | 6373 | } |
6497 | 6374 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9ccef39a9de2..cd8989c1027e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | |||
204 | #endif | 204 | #endif |
205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; |
206 | 206 | ||
207 | if ((flags & MF_ACTION_REQUIRED) && t == current) { | 207 | if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { |
208 | si.si_code = BUS_MCEERR_AR; | 208 | si.si_code = BUS_MCEERR_AR; |
209 | ret = force_sig_info(SIGBUS, &si, t); | 209 | ret = force_sig_info(SIGBUS, &si, current); |
210 | } else { | 210 | } else { |
211 | /* | 211 | /* |
212 | * Don't use force here, it's convenient if the signal | 212 | * Don't use force here, it's convenient if the signal |
@@ -380,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, | |||
380 | } | 380 | } |
381 | } | 381 | } |
382 | 382 | ||
383 | static int task_early_kill(struct task_struct *tsk) | 383 | /* |
384 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) | ||
385 | * on behalf of the thread group. Return task_struct of the (first found) | ||
386 | * dedicated thread if found, and return NULL otherwise. | ||
387 | * | ||
388 | * We already hold read_lock(&tasklist_lock) in the caller, so we don't | ||
389 | * have to call rcu_read_lock/unlock() in this function. | ||
390 | */ | ||
391 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) | ||
384 | { | 392 | { |
393 | struct task_struct *t; | ||
394 | |||
395 | for_each_thread(tsk, t) | ||
396 | if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) | ||
397 | return t; | ||
398 | return NULL; | ||
399 | } | ||
400 | |||
401 | /* | ||
402 | * Determine whether a given process is "early kill" process which expects | ||
403 | * to be signaled when some page under the process is hwpoisoned. | ||
404 | * Return task_struct of the dedicated thread (main thread unless explicitly | ||
405 | * specified) if the process is "early kill," and otherwise returns NULL. | ||
406 | */ | ||
407 | static struct task_struct *task_early_kill(struct task_struct *tsk, | ||
408 | int force_early) | ||
409 | { | ||
410 | struct task_struct *t; | ||
385 | if (!tsk->mm) | 411 | if (!tsk->mm) |
386 | return 0; | 412 | return NULL; |
387 | if (tsk->flags & PF_MCE_PROCESS) | 413 | if (force_early) |
388 | return !!(tsk->flags & PF_MCE_EARLY); | 414 | return tsk; |
389 | return sysctl_memory_failure_early_kill; | 415 | t = find_early_kill_thread(tsk); |
416 | if (t) | ||
417 | return t; | ||
418 | if (sysctl_memory_failure_early_kill) | ||
419 | return tsk; | ||
420 | return NULL; | ||
390 | } | 421 | } |
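The comments above define the "dedicated thread" contract: a thread with both PF_MCE_PROCESS and PF_MCE_EARLY set receives the advisory SIGBUS (BUS_MCEERR_AO) on behalf of the whole thread group. Those flags are what prctl(PR_MCE_KILL) sets, so a process that wants a single recovery thread might do something like the following user-space sketch (the handler body is only a placeholder):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <string.h>
    #include <sys/prctl.h>
    #include <unistd.h>

    static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            /* BUS_MCEERR_AO: a mapped page went bad but was not yet consumed;
             * si->si_addr says which one, so the application can isolate or
             * rebuild the affected data here */
            (void)sig; (void)si; (void)ctx;
    }

    static void *mce_recovery_thread(void *arg)     /* e.g. started via pthread_create() */
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = sigbus_handler;
            sa.sa_flags = SA_SIGINFO;
            sigaction(SIGBUS, &sa, NULL);

            /* opt this thread in to early kill: the kernel records the request as
             * PF_MCE_PROCESS | PF_MCE_EARLY, which find_early_kill_thread() above
             * looks for */
            prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);

            for (;;)
                    pause();
            return arg;
    }

Note the force_early path: when the error is action-required (MF_ACTION_REQUIRED), task_early_kill() above returns the task unconditionally, so the prctl() opt-in only matters for the advisory, asynchronous case.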
391 | 422 | ||
392 | /* | 423 | /* |
393 | * Collect processes when the error hit an anonymous page. | 424 | * Collect processes when the error hit an anonymous page. |
394 | */ | 425 | */ |
395 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | 426 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
396 | struct to_kill **tkc) | 427 | struct to_kill **tkc, int force_early) |
397 | { | 428 | { |
398 | struct vm_area_struct *vma; | 429 | struct vm_area_struct *vma; |
399 | struct task_struct *tsk; | 430 | struct task_struct *tsk; |
@@ -408,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
408 | read_lock(&tasklist_lock); | 439 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 440 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 441 | struct anon_vma_chain *vmac; |
442 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
411 | 443 | ||
412 | if (!task_early_kill(tsk)) | 444 | if (!t) |
413 | continue; | 445 | continue; |
414 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, | 446 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
415 | pgoff, pgoff) { | 447 | pgoff, pgoff) { |
416 | vma = vmac->vma; | 448 | vma = vmac->vma; |
417 | if (!page_mapped_in_vma(page, vma)) | 449 | if (!page_mapped_in_vma(page, vma)) |
418 | continue; | 450 | continue; |
419 | if (vma->vm_mm == tsk->mm) | 451 | if (vma->vm_mm == t->mm) |
420 | add_to_kill(tsk, page, vma, to_kill, tkc); | 452 | add_to_kill(t, page, vma, to_kill, tkc); |
421 | } | 453 | } |
422 | } | 454 | } |
423 | read_unlock(&tasklist_lock); | 455 | read_unlock(&tasklist_lock); |
@@ -428,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
428 | * Collect processes when the error hit a file mapped page. | 460 | * Collect processes when the error hit a file mapped page. |
429 | */ | 461 | */ |
430 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | 462 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
431 | struct to_kill **tkc) | 463 | struct to_kill **tkc, int force_early) |
432 | { | 464 | { |
433 | struct vm_area_struct *vma; | 465 | struct vm_area_struct *vma; |
434 | struct task_struct *tsk; | 466 | struct task_struct *tsk; |
@@ -438,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
438 | read_lock(&tasklist_lock); | 470 | read_lock(&tasklist_lock); |
439 | for_each_process(tsk) { | 471 | for_each_process(tsk) { |
440 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 472 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
473 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
441 | 474 | ||
442 | if (!task_early_kill(tsk)) | 475 | if (!t) |
443 | continue; | 476 | continue; |
444 | |||
445 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, | 477 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 478 | pgoff) { |
447 | /* | 479 | /* |
@@ -451,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
451 | * Assume applications who requested early kill want | 483 | * Assume applications who requested early kill want |
452 | * to be informed of all such data corruptions. | 484 | * to be informed of all such data corruptions. |
453 | */ | 485 | */ |
454 | if (vma->vm_mm == tsk->mm) | 486 | if (vma->vm_mm == t->mm) |
455 | add_to_kill(tsk, page, vma, to_kill, tkc); | 487 | add_to_kill(t, page, vma, to_kill, tkc); |
456 | } | 488 | } |
457 | } | 489 | } |
458 | read_unlock(&tasklist_lock); | 490 | read_unlock(&tasklist_lock); |
@@ -465,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
465 | * First preallocate one tokill structure outside the spin locks, | 497 | * First preallocate one tokill structure outside the spin locks, |
466 | * so that we can kill at least one process reasonably reliable. | 498 | * so that we can kill at least one process reasonably reliable. |
467 | */ | 499 | */ |
468 | static void collect_procs(struct page *page, struct list_head *tokill) | 500 | static void collect_procs(struct page *page, struct list_head *tokill, |
501 | int force_early) | ||
469 | { | 502 | { |
470 | struct to_kill *tk; | 503 | struct to_kill *tk; |
471 | 504 | ||
@@ -476,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
476 | if (!tk) | 509 | if (!tk) |
477 | return; | 510 | return; |
478 | if (PageAnon(page)) | 511 | if (PageAnon(page)) |
479 | collect_procs_anon(page, tokill, &tk); | 512 | collect_procs_anon(page, tokill, &tk, force_early); |
480 | else | 513 | else |
481 | collect_procs_file(page, tokill, &tk); | 514 | collect_procs_file(page, tokill, &tk, force_early); |
482 | kfree(tk); | 515 | kfree(tk); |
483 | } | 516 | } |
484 | 517 | ||
@@ -963,7 +996,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
963 | * there's nothing that can be done. | 996 | * there's nothing that can be done. |
964 | */ | 997 | */ |
965 | if (kill) | 998 | if (kill) |
966 | collect_procs(ppage, &tokill); | 999 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); |
967 | 1000 | ||
968 | ret = try_to_unmap(ppage, ttu); | 1001 | ret = try_to_unmap(ppage, ttu); |
969 | if (ret != SWAP_SUCCESS) | 1002 | if (ret != SWAP_SUCCESS) |
@@ -1132,11 +1165,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1132 | } | 1165 | } |
1133 | } | 1166 | } |
1134 | 1167 | ||
1135 | /* | ||
1136 | * Lock the page and wait for writeback to finish. | ||
1137 | * It's very difficult to mess with pages currently under IO | ||
1138 | * and in many cases impossible, so we just avoid it here. | ||
1139 | */ | ||
1140 | lock_page(hpage); | 1168 | lock_page(hpage); |
1141 | 1169 | ||
1142 | /* | 1170 | /* |
@@ -1186,6 +1214,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1186 | if (PageHuge(p)) | 1214 | if (PageHuge(p)) |
1187 | set_page_hwpoison_huge_page(hpage); | 1215 | set_page_hwpoison_huge_page(hpage); |
1188 | 1216 | ||
1217 | /* | ||
1218 | * It's very difficult to mess with pages currently under IO | ||
1219 | * and in many cases impossible, so we just avoid it here. | ||
1220 | */ | ||
1189 | wait_on_page_writeback(p); | 1221 | wait_on_page_writeback(p); |
1190 | 1222 | ||
1191 | /* | 1223 | /* |
@@ -1298,7 +1330,7 @@ static void memory_failure_work_func(struct work_struct *work) | |||
1298 | unsigned long proc_flags; | 1330 | unsigned long proc_flags; |
1299 | int gotten; | 1331 | int gotten; |
1300 | 1332 | ||
1301 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | 1333 | mf_cpu = this_cpu_ptr(&memory_failure_cpu); |
1302 | for (;;) { | 1334 | for (;;) { |
1303 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1335 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1304 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | 1336 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
@@ -1503,7 +1535,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1503 | 1535 | ||
1504 | /* Keep page count to indicate a given hugepage is isolated. */ | 1536 | /* Keep page count to indicate a given hugepage is isolated. */ |
1505 | list_move(&hpage->lru, &pagelist); | 1537 | list_move(&hpage->lru, &pagelist); |
1506 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1538 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1507 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1539 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1508 | if (ret) { | 1540 | if (ret) { |
1509 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1541 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
@@ -1584,7 +1616,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1584 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1585 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
1586 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
1587 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1588 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1589 | if (ret) { | 1621 | if (ret) { |
1590 | if (!list_empty(&pagelist)) { | 1622 | if (!list_empty(&pagelist)) { |
@@ -1664,11 +1696,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1664 | } | 1696 | } |
1665 | } | 1697 | } |
1666 | 1698 | ||
1667 | /* | 1699 | get_online_mems(); |
1668 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1669 | * This is a big hammer, a better would be nicer. | ||
1670 | */ | ||
1671 | lock_memory_hotplug(); | ||
1672 | 1700 | ||
1673 | /* | 1701 | /* |
1674 | * Isolate the page, so that it doesn't get reallocated if it | 1702 | * Isolate the page, so that it doesn't get reallocated if it |
@@ -1679,7 +1707,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1679 | set_migratetype_isolate(page, true); | 1707 | set_migratetype_isolate(page, true); |
1680 | 1708 | ||
1681 | ret = get_any_page(page, pfn, flags); | 1709 | ret = get_any_page(page, pfn, flags); |
1682 | unlock_memory_hotplug(); | 1710 | put_online_mems(); |
1683 | if (ret > 0) { /* for in-use pages */ | 1711 | if (ret > 0) { /* for in-use pages */ |
1684 | if (PageHuge(page)) | 1712 | if (PageHuge(page)) |
1685 | ret = soft_offline_huge_page(page, flags); | 1713 | ret = soft_offline_huge_page(page, flags); |
diff --git a/mm/memory.c b/mm/memory.c index e302ae1dcce0..d67fd9fcf1f2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
699 | } | 699 | } |
700 | 700 | ||
701 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
702 | { | ||
703 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
704 | } | ||
705 | |||
706 | /* | 701 | /* |
707 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 702 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
708 | * | 703 | * |
@@ -756,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
756 | unsigned long pfn = pte_pfn(pte); | 751 | unsigned long pfn = pte_pfn(pte); |
757 | 752 | ||
758 | if (HAVE_PTE_SPECIAL) { | 753 | if (HAVE_PTE_SPECIAL) { |
759 | if (likely(!pte_special(pte))) | 754 | if (likely(!pte_special(pte) || pte_numa(pte))) |
760 | goto check_pfn; | 755 | goto check_pfn; |
761 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 756 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
762 | return NULL; | 757 | return NULL; |
@@ -782,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
782 | } | 777 | } |
783 | } | 778 | } |
784 | 779 | ||
785 | if (is_zero_pfn(pfn)) | ||
786 | return NULL; | ||
787 | check_pfn: | 780 | check_pfn: |
788 | if (unlikely(pfn > highest_memmap_pfn)) { | 781 | if (unlikely(pfn > highest_memmap_pfn)) { |
789 | print_bad_pte(vma, addr, pte, NULL); | 782 | print_bad_pte(vma, addr, pte, NULL); |
790 | return NULL; | 783 | return NULL; |
791 | } | 784 | } |
792 | 785 | ||
786 | if (is_zero_pfn(pfn)) | ||
787 | return NULL; | ||
788 | |||
793 | /* | 789 | /* |
794 | * NOTE! We still have PageReserved() pages in the page tables. | 790 | * NOTE! We still have PageReserved() pages in the page tables. |
795 | * eg. VDSO mappings can cause them to exist. | 791 | * eg. VDSO mappings can cause them to exist. |
@@ -1457,646 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1457 | } | 1453 | } |
1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1454 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
1459 | 1455 | ||
1460 | /** | ||
1461 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
1462 | * @vma: vm_area_struct mapping @address | ||
1463 | * @address: virtual address to look up | ||
1464 | * @flags: flags modifying lookup behaviour | ||
1465 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
1466 | * | ||
1467 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
1468 | * | ||
1469 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
1470 | * an error pointer if there is a mapping to something not represented | ||
1471 | * by a page descriptor (see also vm_normal_page()). | ||
1472 | */ | ||
1473 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
1474 | unsigned long address, unsigned int flags, | ||
1475 | unsigned int *page_mask) | ||
1476 | { | ||
1477 | pgd_t *pgd; | ||
1478 | pud_t *pud; | ||
1479 | pmd_t *pmd; | ||
1480 | pte_t *ptep, pte; | ||
1481 | spinlock_t *ptl; | ||
1482 | struct page *page; | ||
1483 | struct mm_struct *mm = vma->vm_mm; | ||
1484 | |||
1485 | *page_mask = 0; | ||
1486 | |||
1487 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
1488 | if (!IS_ERR(page)) { | ||
1489 | BUG_ON(flags & FOLL_GET); | ||
1490 | goto out; | ||
1491 | } | ||
1492 | |||
1493 | page = NULL; | ||
1494 | pgd = pgd_offset(mm, address); | ||
1495 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
1496 | goto no_page_table; | ||
1497 | |||
1498 | pud = pud_offset(pgd, address); | ||
1499 | if (pud_none(*pud)) | ||
1500 | goto no_page_table; | ||
1501 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
1502 | if (flags & FOLL_GET) | ||
1503 | goto out; | ||
1504 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
1505 | goto out; | ||
1506 | } | ||
1507 | if (unlikely(pud_bad(*pud))) | ||
1508 | goto no_page_table; | ||
1509 | |||
1510 | pmd = pmd_offset(pud, address); | ||
1511 | if (pmd_none(*pmd)) | ||
1512 | goto no_page_table; | ||
1513 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
1514 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
1515 | if (flags & FOLL_GET) { | ||
1516 | /* | ||
1517 | * Refcount on tail pages are not well-defined and | ||
1518 | * shouldn't be taken. The caller should handle a NULL | ||
1519 | * return when trying to follow tail pages. | ||
1520 | */ | ||
1521 | if (PageHead(page)) | ||
1522 | get_page(page); | ||
1523 | else { | ||
1524 | page = NULL; | ||
1525 | goto out; | ||
1526 | } | ||
1527 | } | ||
1528 | goto out; | ||
1529 | } | ||
1530 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1531 | goto no_page_table; | ||
1532 | if (pmd_trans_huge(*pmd)) { | ||
1533 | if (flags & FOLL_SPLIT) { | ||
1534 | split_huge_page_pmd(vma, address, pmd); | ||
1535 | goto split_fallthrough; | ||
1536 | } | ||
1537 | ptl = pmd_lock(mm, pmd); | ||
1538 | if (likely(pmd_trans_huge(*pmd))) { | ||
1539 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1540 | spin_unlock(ptl); | ||
1541 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1542 | } else { | ||
1543 | page = follow_trans_huge_pmd(vma, address, | ||
1544 | pmd, flags); | ||
1545 | spin_unlock(ptl); | ||
1546 | *page_mask = HPAGE_PMD_NR - 1; | ||
1547 | goto out; | ||
1548 | } | ||
1549 | } else | ||
1550 | spin_unlock(ptl); | ||
1551 | /* fall through */ | ||
1552 | } | ||
1553 | split_fallthrough: | ||
1554 | if (unlikely(pmd_bad(*pmd))) | ||
1555 | goto no_page_table; | ||
1556 | |||
1557 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1558 | |||
1559 | pte = *ptep; | ||
1560 | if (!pte_present(pte)) { | ||
1561 | swp_entry_t entry; | ||
1562 | /* | ||
1563 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
1564 | * even while it is being migrated, so for that case we | ||
1565 | * need migration_entry_wait(). | ||
1566 | */ | ||
1567 | if (likely(!(flags & FOLL_MIGRATION))) | ||
1568 | goto no_page; | ||
1569 | if (pte_none(pte) || pte_file(pte)) | ||
1570 | goto no_page; | ||
1571 | entry = pte_to_swp_entry(pte); | ||
1572 | if (!is_migration_entry(entry)) | ||
1573 | goto no_page; | ||
1574 | pte_unmap_unlock(ptep, ptl); | ||
1575 | migration_entry_wait(mm, pmd, address); | ||
1576 | goto split_fallthrough; | ||
1577 | } | ||
1578 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1579 | goto no_page; | ||
1580 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | ||
1581 | goto unlock; | ||
1582 | |||
1583 | page = vm_normal_page(vma, address, pte); | ||
1584 | if (unlikely(!page)) { | ||
1585 | if ((flags & FOLL_DUMP) || | ||
1586 | !is_zero_pfn(pte_pfn(pte))) | ||
1587 | goto bad_page; | ||
1588 | page = pte_page(pte); | ||
1589 | } | ||
1590 | |||
1591 | if (flags & FOLL_GET) | ||
1592 | get_page_foll(page); | ||
1593 | if (flags & FOLL_TOUCH) { | ||
1594 | if ((flags & FOLL_WRITE) && | ||
1595 | !pte_dirty(pte) && !PageDirty(page)) | ||
1596 | set_page_dirty(page); | ||
1597 | /* | ||
1598 | * pte_mkyoung() would be more correct here, but atomic care | ||
1599 | * is needed to avoid losing the dirty bit: it is easier to use | ||
1600 | * mark_page_accessed(). | ||
1601 | */ | ||
1602 | mark_page_accessed(page); | ||
1603 | } | ||
1604 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1605 | /* | ||
1606 | * The preliminary mapping check is mainly to avoid the | ||
1607 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1608 | * which might bounce very badly if there is contention. | ||
1609 | * | ||
1610 | * If the page is already locked, we don't need to | ||
1611 | * handle it now - vmscan will handle it later if and | ||
1612 | * when it attempts to reclaim the page. | ||
1613 | */ | ||
1614 | if (page->mapping && trylock_page(page)) { | ||
1615 | lru_add_drain(); /* push cached pages to LRU */ | ||
1616 | /* | ||
1617 | * Because we lock page here, and migration is | ||
1618 | * blocked by the pte's page reference, and we | ||
1619 | * know the page is still mapped, we don't even | ||
1620 | * need to check for file-cache page truncation. | ||
1621 | */ | ||
1622 | mlock_vma_page(page); | ||
1623 | unlock_page(page); | ||
1624 | } | ||
1625 | } | ||
1626 | unlock: | ||
1627 | pte_unmap_unlock(ptep, ptl); | ||
1628 | out: | ||
1629 | return page; | ||
1630 | |||
1631 | bad_page: | ||
1632 | pte_unmap_unlock(ptep, ptl); | ||
1633 | return ERR_PTR(-EFAULT); | ||
1634 | |||
1635 | no_page: | ||
1636 | pte_unmap_unlock(ptep, ptl); | ||
1637 | if (!pte_none(pte)) | ||
1638 | return page; | ||
1639 | |||
1640 | no_page_table: | ||
1641 | /* | ||
1642 | * When core dumping an enormous anonymous area that nobody | ||
1643 | * has touched so far, we don't want to allocate unnecessary pages or | ||
1644 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
1645 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
1646 | * But we can only make this optimization where a hole would surely | ||
1647 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
1648 | */ | ||
1649 | if ((flags & FOLL_DUMP) && | ||
1650 | (!vma->vm_ops || !vma->vm_ops->fault)) | ||
1651 | return ERR_PTR(-EFAULT); | ||
1652 | return page; | ||
1653 | } | ||
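This block removes follow_page_mask() and the rest of the get_user_pages() machinery from mm/memory.c (the code moves out of this file elsewhere in the series), but the contract in the comment is unchanged: the caller holds mmap_sem and gets back a struct page, NULL, or an ERR_PTR for special mappings. In-kernel callers normally go through the follow_page() wrapper; a minimal sketch, assuming the usual inline wrapper in <linux/mm.h>:

    #include <linux/err.h>
    #include <linux/mm.h>

    /* sketch: does 'addr' inside 'vma' currently have a page behind it?
     * The caller must hold down_read(&vma->vm_mm->mmap_sem). */
    static bool user_addr_is_populated(struct vm_area_struct *vma,
                                       unsigned long addr)
    {
            struct page *page;

            page = follow_page(vma, addr, FOLL_GET);   /* FOLL_GET pins the page */
            if (IS_ERR_OR_NULL(page))
                    return false;                      /* not present, or not a normal page */
            put_page(page);                            /* drop the reference again */
            return true;
    }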
1654 | |||
1655 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
1656 | { | ||
1657 | return stack_guard_page_start(vma, addr) || | ||
1658 | stack_guard_page_end(vma, addr+PAGE_SIZE); | ||
1659 | } | ||
1660 | |||
1661 | /** | ||
1662 | * __get_user_pages() - pin user pages in memory | ||
1663 | * @tsk: task_struct of target task | ||
1664 | * @mm: mm_struct of target mm | ||
1665 | * @start: starting user address | ||
1666 | * @nr_pages: number of pages from start to pin | ||
1667 | * @gup_flags: flags modifying pin behaviour | ||
1668 | * @pages: array that receives pointers to the pages pinned. | ||
1669 | * Should be at least nr_pages long. Or NULL, if caller | ||
1670 | * only intends to ensure the pages are faulted in. | ||
1671 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1672 | * Or NULL if the caller does not require them. | ||
1673 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
1674 | * | ||
1675 | * Returns number of pages pinned. This may be fewer than the number | ||
1676 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
1677 | * were pinned, returns -errno. Each page returned must be released | ||
1678 | * with a put_page() call when it is finished with. vmas will only | ||
1679 | * remain valid while mmap_sem is held. | ||
1680 | * | ||
1681 | * Must be called with mmap_sem held for read or write. | ||
1682 | * | ||
1683 | * __get_user_pages walks a process's page tables and takes a reference to | ||
1684 | * each struct page that each user address corresponds to at a given | ||
1685 | * instant. That is, it takes the page that would be accessed if a user | ||
1686 | * thread accesses the given user virtual address at that instant. | ||
1687 | * | ||
1688 | * This does not guarantee that the page exists in the user mappings when | ||
1689 | * __get_user_pages returns, and there may even be a completely different | ||
1690 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1691 | * and subsequently re faulted). However it does guarantee that the page | ||
1692 | * won't be freed completely. And mostly callers simply care that the page | ||
1693 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1694 | * or similar operation cannot guarantee anything stronger anyway because | ||
1695 | * locks can't be held over the syscall boundary. | ||
1696 | * | ||
1697 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
1698 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
1699 | * appropriate) must be called after the page is finished with, and | ||
1700 | * before put_page is called. | ||
1701 | * | ||
1702 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
1703 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
1704 | * *@nonblocking will be set to 0. | ||
1705 | * | ||
1706 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
1707 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
1708 | * you need some special @gup_flags. | ||
1709 | */ | ||
1710 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
1711 | unsigned long start, unsigned long nr_pages, | ||
1712 | unsigned int gup_flags, struct page **pages, | ||
1713 | struct vm_area_struct **vmas, int *nonblocking) | ||
1714 | { | ||
1715 | long i; | ||
1716 | unsigned long vm_flags; | ||
1717 | unsigned int page_mask; | ||
1718 | |||
1719 | if (!nr_pages) | ||
1720 | return 0; | ||
1721 | |||
1722 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
1723 | |||
1724 | /* | ||
1725 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1726 | * would be called on PROT_NONE ranges. We must never invoke | ||
1727 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1728 | * page faults would unprotect the PROT_NONE ranges if | ||
1729 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1730 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1731 | * FOLL_FORCE is set. | ||
1732 | */ | ||
1733 | if (!(gup_flags & FOLL_FORCE)) | ||
1734 | gup_flags |= FOLL_NUMA; | ||
1735 | |||
1736 | i = 0; | ||
1737 | |||
1738 | do { | ||
1739 | struct vm_area_struct *vma; | ||
1740 | |||
1741 | vma = find_extend_vma(mm, start); | ||
1742 | if (!vma && in_gate_area(mm, start)) { | ||
1743 | unsigned long pg = start & PAGE_MASK; | ||
1744 | pgd_t *pgd; | ||
1745 | pud_t *pud; | ||
1746 | pmd_t *pmd; | ||
1747 | pte_t *pte; | ||
1748 | |||
1749 | /* user gate pages are read-only */ | ||
1750 | if (gup_flags & FOLL_WRITE) | ||
1751 | goto efault; | ||
1752 | if (pg > TASK_SIZE) | ||
1753 | pgd = pgd_offset_k(pg); | ||
1754 | else | ||
1755 | pgd = pgd_offset_gate(mm, pg); | ||
1756 | BUG_ON(pgd_none(*pgd)); | ||
1757 | pud = pud_offset(pgd, pg); | ||
1758 | BUG_ON(pud_none(*pud)); | ||
1759 | pmd = pmd_offset(pud, pg); | ||
1760 | if (pmd_none(*pmd)) | ||
1761 | goto efault; | ||
1762 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1763 | pte = pte_offset_map(pmd, pg); | ||
1764 | if (pte_none(*pte)) { | ||
1765 | pte_unmap(pte); | ||
1766 | goto efault; | ||
1767 | } | ||
1768 | vma = get_gate_vma(mm); | ||
1769 | if (pages) { | ||
1770 | struct page *page; | ||
1771 | |||
1772 | page = vm_normal_page(vma, start, *pte); | ||
1773 | if (!page) { | ||
1774 | if (!(gup_flags & FOLL_DUMP) && | ||
1775 | is_zero_pfn(pte_pfn(*pte))) | ||
1776 | page = pte_page(*pte); | ||
1777 | else { | ||
1778 | pte_unmap(pte); | ||
1779 | goto efault; | ||
1780 | } | ||
1781 | } | ||
1782 | pages[i] = page; | ||
1783 | get_page(page); | ||
1784 | } | ||
1785 | pte_unmap(pte); | ||
1786 | page_mask = 0; | ||
1787 | goto next_page; | ||
1788 | } | ||
1789 | |||
1790 | if (!vma) | ||
1791 | goto efault; | ||
1792 | vm_flags = vma->vm_flags; | ||
1793 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
1794 | goto efault; | ||
1795 | |||
1796 | if (gup_flags & FOLL_WRITE) { | ||
1797 | if (!(vm_flags & VM_WRITE)) { | ||
1798 | if (!(gup_flags & FOLL_FORCE)) | ||
1799 | goto efault; | ||
1800 | /* | ||
1801 | * We used to let the write,force case do COW | ||
1802 | * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so | ||
1803 | * ptrace could set a breakpoint in a read-only | ||
1804 | * mapping of an executable, without corrupting | ||
1805 | * the file (yet only when that file had been | ||
1806 | * opened for writing!). Anon pages in shared | ||
1807 | * mappings are surprising: now just reject it. | ||
1808 | */ | ||
1809 | if (!is_cow_mapping(vm_flags)) { | ||
1810 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
1811 | goto efault; | ||
1812 | } | ||
1813 | } | ||
1814 | } else { | ||
1815 | if (!(vm_flags & VM_READ)) { | ||
1816 | if (!(gup_flags & FOLL_FORCE)) | ||
1817 | goto efault; | ||
1818 | /* | ||
1819 | * Is there actually any vma we can reach here | ||
1820 | * which does not have VM_MAYREAD set? | ||
1821 | */ | ||
1822 | if (!(vm_flags & VM_MAYREAD)) | ||
1823 | goto efault; | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | if (is_vm_hugetlb_page(vma)) { | ||
1828 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
1829 | &start, &nr_pages, i, gup_flags); | ||
1830 | continue; | ||
1831 | } | ||
1832 | |||
1833 | do { | ||
1834 | struct page *page; | ||
1835 | unsigned int foll_flags = gup_flags; | ||
1836 | unsigned int page_increm; | ||
1837 | |||
1838 | /* | ||
1839 | * If we have a pending SIGKILL, don't keep faulting | ||
1840 | * pages and potentially allocating memory. | ||
1841 | */ | ||
1842 | if (unlikely(fatal_signal_pending(current))) | ||
1843 | return i ? i : -ERESTARTSYS; | ||
1844 | |||
1845 | cond_resched(); | ||
1846 | while (!(page = follow_page_mask(vma, start, | ||
1847 | foll_flags, &page_mask))) { | ||
1848 | int ret; | ||
1849 | unsigned int fault_flags = 0; | ||
1850 | |||
1851 | /* For mlock, just skip the stack guard page. */ | ||
1852 | if (foll_flags & FOLL_MLOCK) { | ||
1853 | if (stack_guard_page(vma, start)) | ||
1854 | goto next_page; | ||
1855 | } | ||
1856 | if (foll_flags & FOLL_WRITE) | ||
1857 | fault_flags |= FAULT_FLAG_WRITE; | ||
1858 | if (nonblocking) | ||
1859 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1860 | if (foll_flags & FOLL_NOWAIT) | ||
1861 | fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); | ||
1862 | |||
1863 | ret = handle_mm_fault(mm, vma, start, | ||
1864 | fault_flags); | ||
1865 | |||
1866 | if (ret & VM_FAULT_ERROR) { | ||
1867 | if (ret & VM_FAULT_OOM) | ||
1868 | return i ? i : -ENOMEM; | ||
1869 | if (ret & (VM_FAULT_HWPOISON | | ||
1870 | VM_FAULT_HWPOISON_LARGE)) { | ||
1871 | if (i) | ||
1872 | return i; | ||
1873 | else if (gup_flags & FOLL_HWPOISON) | ||
1874 | return -EHWPOISON; | ||
1875 | else | ||
1876 | return -EFAULT; | ||
1877 | } | ||
1878 | if (ret & VM_FAULT_SIGBUS) | ||
1879 | goto efault; | ||
1880 | BUG(); | ||
1881 | } | ||
1882 | |||
1883 | if (tsk) { | ||
1884 | if (ret & VM_FAULT_MAJOR) | ||
1885 | tsk->maj_flt++; | ||
1886 | else | ||
1887 | tsk->min_flt++; | ||
1888 | } | ||
1889 | |||
1890 | if (ret & VM_FAULT_RETRY) { | ||
1891 | if (nonblocking) | ||
1892 | *nonblocking = 0; | ||
1893 | return i; | ||
1894 | } | ||
1895 | |||
1896 | /* | ||
1897 | * The VM_FAULT_WRITE bit tells us that | ||
1898 | * do_wp_page has broken COW when necessary, | ||
1899 | * even if maybe_mkwrite decided not to set | ||
1900 | * pte_write. We can thus safely do subsequent | ||
1901 | * page lookups as if they were reads. But only | ||
1902 | * do so when looping for pte_write is futile: | ||
1903 | * in some cases userspace may also be wanting | ||
1904 | * to write to the gotten user page, which a | ||
1905 | * read fault here might prevent (a readonly | ||
1906 | * page might get reCOWed by userspace write). | ||
1907 | */ | ||
1908 | if ((ret & VM_FAULT_WRITE) && | ||
1909 | !(vma->vm_flags & VM_WRITE)) | ||
1910 | foll_flags &= ~FOLL_WRITE; | ||
1911 | |||
1912 | cond_resched(); | ||
1913 | } | ||
1914 | if (IS_ERR(page)) | ||
1915 | return i ? i : PTR_ERR(page); | ||
1916 | if (pages) { | ||
1917 | pages[i] = page; | ||
1918 | |||
1919 | flush_anon_page(vma, page, start); | ||
1920 | flush_dcache_page(page); | ||
1921 | page_mask = 0; | ||
1922 | } | ||
1923 | next_page: | ||
1924 | if (vmas) { | ||
1925 | vmas[i] = vma; | ||
1926 | page_mask = 0; | ||
1927 | } | ||
1928 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
1929 | if (page_increm > nr_pages) | ||
1930 | page_increm = nr_pages; | ||
1931 | i += page_increm; | ||
1932 | start += page_increm * PAGE_SIZE; | ||
1933 | nr_pages -= page_increm; | ||
1934 | } while (nr_pages && start < vma->vm_end); | ||
1935 | } while (nr_pages); | ||
1936 | return i; | ||
1937 | efault: | ||
1938 | return i ? : -EFAULT; | ||
1939 | } | ||
1940 | EXPORT_SYMBOL(__get_user_pages); | ||
1941 | |||
1942 | /* | ||
1943 | * fixup_user_fault() - manually resolve a user page fault | ||
1944 | * @tsk: the task_struct to use for page fault accounting, or | ||
1945 | * NULL if faults are not to be recorded. | ||
1946 | * @mm: mm_struct of target mm | ||
1947 | * @address: user address | ||
1948 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
1949 | * | ||
1950 | * This is meant to be called in the specific scenario where for locking reasons | ||
1951 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1952 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
1953 | * trying again. | ||
1954 | * | ||
1955 | * Typically this is meant to be used by the futex code. | ||
1956 | * | ||
1957 | * The main difference with get_user_pages() is that this function will | ||
1958 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1959 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1960 | * handle_mm_fault() only guarantees to update these in the struct page. | ||
1961 | * | ||
1962 | * This is important for some architectures where those bits also gate the | ||
1963 | * access permission to the page because they are maintained in software. On | ||
1964 | * such architectures, gup() will not be enough to make a subsequent access | ||
1965 | * succeed. | ||
1966 | * | ||
1967 | * This should be called with the mm_sem held for read. | ||
1968 | */ | ||
1969 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1970 | unsigned long address, unsigned int fault_flags) | ||
1971 | { | ||
1972 | struct vm_area_struct *vma; | ||
1973 | vm_flags_t vm_flags; | ||
1974 | int ret; | ||
1975 | |||
1976 | vma = find_extend_vma(mm, address); | ||
1977 | if (!vma || address < vma->vm_start) | ||
1978 | return -EFAULT; | ||
1979 | |||
1980 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
1981 | if (!(vm_flags & vma->vm_flags)) | ||
1982 | return -EFAULT; | ||
1983 | |||
1984 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1985 | if (ret & VM_FAULT_ERROR) { | ||
1986 | if (ret & VM_FAULT_OOM) | ||
1987 | return -ENOMEM; | ||
1988 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1989 | return -EHWPOISON; | ||
1990 | if (ret & VM_FAULT_SIGBUS) | ||
1991 | return -EFAULT; | ||
1992 | BUG(); | ||
1993 | } | ||
1994 | if (tsk) { | ||
1995 | if (ret & VM_FAULT_MAJOR) | ||
1996 | tsk->maj_flt++; | ||
1997 | else | ||
1998 | tsk->min_flt++; | ||
1999 | } | ||
2000 | return 0; | ||
2001 | } | ||
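The kernel-doc above spells out the intended pattern: an access made under pagefault_disable() returns -EFAULT, the caller leaves the atomic section, resolves the fault explicitly, and retries. A hedged sketch of that helper, modelled on the futex code (the function name is illustrative):

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* make the word at 'uaddr' writable, faulting it in if necessary */
    static int fault_in_writeable(u32 __user *uaddr)
    {
            struct mm_struct *mm = current->mm;
            int ret;

            down_read(&mm->mmap_sem);
            ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
                                   FAULT_FLAG_WRITE);
            up_read(&mm->mmap_sem);

            return ret < 0 ? ret : 0;
    }

A caller would invoke this only after, say, an atomic compare-and-exchange on the user address under pagefault_disable() has failed, then loop back and retry the access.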
2002 | |||
2003 | /* | ||
2004 | * get_user_pages() - pin user pages in memory | ||
2005 | * @tsk: the task_struct to use for page fault accounting, or | ||
2006 | * NULL if faults are not to be recorded. | ||
2007 | * @mm: mm_struct of target mm | ||
2008 | * @start: starting user address | ||
2009 | * @nr_pages: number of pages from start to pin | ||
2010 | * @write: whether pages will be written to by the caller | ||
2011 | * @force: whether to force access even when user mapping is currently | ||
2012 | * protected (but never forces write access to shared mapping). | ||
2013 | * @pages: array that receives pointers to the pages pinned. | ||
2014 | * Should be at least nr_pages long. Or NULL, if caller | ||
2015 | * only intends to ensure the pages are faulted in. | ||
2016 | * @vmas: array of pointers to vmas corresponding to each page. | ||
2017 | * Or NULL if the caller does not require them. | ||
2018 | * | ||
2019 | * Returns number of pages pinned. This may be fewer than the number | ||
2020 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
2021 | * were pinned, returns -errno. Each page returned must be released | ||
2022 | * with a put_page() call when it is finished with. vmas will only | ||
2023 | * remain valid while mmap_sem is held. | ||
2024 | * | ||
2025 | * Must be called with mmap_sem held for read or write. | ||
2026 | * | ||
2027 | * get_user_pages walks a process's page tables and takes a reference to | ||
2028 | * each struct page that each user address corresponds to at a given | ||
2029 | * instant. That is, it takes the page that would be accessed if a user | ||
2030 | * thread accesses the given user virtual address at that instant. | ||
2031 | * | ||
2032 | * This does not guarantee that the page exists in the user mappings when | ||
2033 | * get_user_pages returns, and there may even be a completely different | ||
2034 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
2035 | * and subsequently re faulted). However it does guarantee that the page | ||
2036 | * won't be freed completely. And mostly callers simply care that the page | ||
2037 | * contains data that was valid *at some point in time*. Typically, an IO | ||
2038 | * or similar operation cannot guarantee anything stronger anyway because | ||
2039 | * locks can't be held over the syscall boundary. | ||
2040 | * | ||
2041 | * If write=0, the page must not be written to. If the page is written to, | ||
2042 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
2043 | * after the page is finished with, and before put_page is called. | ||
2044 | * | ||
2045 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
2046 | * handle on the memory by some means other than accesses via the user virtual | ||
2047 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
2048 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
2049 | * use the correct cache flushing APIs. | ||
2050 | * | ||
2051 | * See also get_user_pages_fast, for performance critical applications. | ||
2052 | */ | ||
2053 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
2054 | unsigned long start, unsigned long nr_pages, int write, | ||
2055 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
2056 | { | ||
2057 | int flags = FOLL_TOUCH; | ||
2058 | |||
2059 | if (pages) | ||
2060 | flags |= FOLL_GET; | ||
2061 | if (write) | ||
2062 | flags |= FOLL_WRITE; | ||
2063 | if (force) | ||
2064 | flags |= FOLL_FORCE; | ||
2065 | |||
2066 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
2067 | NULL); | ||
2068 | } | ||
2069 | EXPORT_SYMBOL(get_user_pages); | ||
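Although the implementation leaves this file, the 3.15-era calling convention shown above is unchanged: take mmap_sem for read, pin, use, mark dirty, release. A minimal sketch of pinning a single user page for kernel access (names are illustrative, error handling trimmed):

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>

    /* pin the page containing 'uaddr' for writing; returns 0 or -errno */
    static int pin_one_user_page(unsigned long uaddr, struct page **pagep)
    {
            struct mm_struct *mm = current->mm;
            long got;

            down_read(&mm->mmap_sem);
            got = get_user_pages(current, mm, uaddr & PAGE_MASK, 1,
                                 1 /* write */, 0 /* force */, pagep, NULL);
            up_read(&mm->mmap_sem);

            return got == 1 ? 0 : (got < 0 ? (int)got : -EFAULT);
    }

    /* once the kernel has written through the pinned page: */
    static void release_one_user_page(struct page *page)
    {
            set_page_dirty_lock(page);   /* per the comment above: dirty before put */
            put_page(page);              /* drops the reference taken via FOLL_GET */
    }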
2070 | |||
2071 | /** | ||
2072 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
2073 | * @addr: user address | ||
2074 | * | ||
2075 | * Returns struct page pointer of user page pinned for dump, | ||
2076 | * to be freed afterwards by page_cache_release() or put_page(). | ||
2077 | * | ||
2078 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
2079 | * the corefile, to preserve alignment with its headers; and also returns | ||
2080 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
2081 | * allowing a hole to be left in the corefile to save diskspace. | ||
2082 | * | ||
2083 | * Called without mmap_sem, but after all other threads have been killed. | ||
2084 | */ | ||
2085 | #ifdef CONFIG_ELF_CORE | ||
2086 | struct page *get_dump_page(unsigned long addr) | ||
2087 | { | ||
2088 | struct vm_area_struct *vma; | ||
2089 | struct page *page; | ||
2090 | |||
2091 | if (__get_user_pages(current, current->mm, addr, 1, | ||
2092 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
2093 | NULL) < 1) | ||
2094 | return NULL; | ||
2095 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
2096 | return page; | ||
2097 | } | ||
2098 | #endif /* CONFIG_ELF_CORE */ | ||
2099 | |||
2100 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1456 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
2101 | spinlock_t **ptl) | 1457 | spinlock_t **ptl) |
2102 | { | 1458 | { |
@@ -3402,65 +2758,76 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
3402 | update_mmu_cache(vma, address, pte); | 2758 | update_mmu_cache(vma, address, pte); |
3403 | } | 2759 | } |
3404 | 2760 | ||
3405 | #define FAULT_AROUND_ORDER 4 | 2761 | static unsigned long fault_around_bytes = 65536; |
2762 | |||
2763 | /* | ||
2764 | * fault_around_pages() and fault_around_mask() round down fault_around_bytes | ||
2765 | * to nearest page order. It's what do_fault_around() expects to see. | ||
2766 | */ | ||
2767 | static inline unsigned long fault_around_pages(void) | ||
2768 | { | ||
2769 | return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; | ||
2770 | } | ||
2771 | |||
2772 | static inline unsigned long fault_around_mask(void) | ||
2773 | { | ||
2774 | return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; | ||
2775 | } | ||
3406 | 2776 | ||
3407 | #ifdef CONFIG_DEBUG_FS | ||
3408 | static unsigned int fault_around_order = FAULT_AROUND_ORDER; | ||
3409 | 2777 | ||
3410 | static int fault_around_order_get(void *data, u64 *val) | 2778 | #ifdef CONFIG_DEBUG_FS |
2779 | static int fault_around_bytes_get(void *data, u64 *val) | ||
3411 | { | 2780 | { |
3412 | *val = fault_around_order; | 2781 | *val = fault_around_bytes; |
3413 | return 0; | 2782 | return 0; |
3414 | } | 2783 | } |
3415 | 2784 | ||
3416 | static int fault_around_order_set(void *data, u64 val) | 2785 | static int fault_around_bytes_set(void *data, u64 val) |
3417 | { | 2786 | { |
3418 | BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); | 2787 | if (val / PAGE_SIZE > PTRS_PER_PTE) |
3419 | if (1UL << val > PTRS_PER_PTE) | ||
3420 | return -EINVAL; | 2788 | return -EINVAL; |
3421 | fault_around_order = val; | 2789 | fault_around_bytes = val; |
3422 | return 0; | 2790 | return 0; |
3423 | } | 2791 | } |
3424 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, | 2792 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, |
3425 | fault_around_order_get, fault_around_order_set, "%llu\n"); | 2793 | fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); |
3426 | 2794 | ||
3427 | static int __init fault_around_debugfs(void) | 2795 | static int __init fault_around_debugfs(void) |
3428 | { | 2796 | { |
3429 | void *ret; | 2797 | void *ret; |
3430 | 2798 | ||
3431 | ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, | 2799 | ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, |
3432 | &fault_around_order_fops); | 2800 | &fault_around_bytes_fops); |
3433 | if (!ret) | 2801 | if (!ret) |
3434 | pr_warn("Failed to create fault_around_order in debugfs"); | 2802 | pr_warn("Failed to create fault_around_bytes in debugfs"); |
3435 | return 0; | 2803 | return 0; |
3436 | } | 2804 | } |
3437 | late_initcall(fault_around_debugfs); | 2805 | late_initcall(fault_around_debugfs); |
3438 | |||
3439 | static inline unsigned long fault_around_pages(void) | ||
3440 | { | ||
3441 | return 1UL << fault_around_order; | ||
3442 | } | ||
3443 | |||
3444 | static inline unsigned long fault_around_mask(void) | ||
3445 | { | ||
3446 | return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); | ||
3447 | } | ||
3448 | #else | ||
3449 | static inline unsigned long fault_around_pages(void) | ||
3450 | { | ||
3451 | unsigned long nr_pages; | ||
3452 | |||
3453 | nr_pages = 1UL << FAULT_AROUND_ORDER; | ||
3454 | BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); | ||
3455 | return nr_pages; | ||
3456 | } | ||
3457 | |||
3458 | static inline unsigned long fault_around_mask(void) | ||
3459 | { | ||
3460 | return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); | ||
3461 | } | ||
3462 | #endif | 2806 | #endif |
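The replacement knob is byte-based: fault_around_pages() and fault_around_mask() round fault_around_bytes down to a power of two, and the debugfs setter rejects anything larger than one page table's worth (2 MiB with 4 KiB pages, where PTRS_PER_PTE is 512). A quick user-space check of the arithmetic, assuming 4 KiB pages and the new 65536-byte default:

    #include <stdio.h>

    /* stand-ins for the kernel definitions, assuming 4 KiB pages */
    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    static unsigned long rounddown_pow_of_two(unsigned long n)
    {
            while (n & (n - 1))
                    n &= n - 1;             /* clear low bits until one remains */
            return n;
    }

    int main(void)
    {
            unsigned long fault_around_bytes = 65536;      /* the new default */
            unsigned long bytes = rounddown_pow_of_two(fault_around_bytes);
            unsigned long addr  = 0x7f0000012345UL;        /* an arbitrary fault address */

            printf("fault_around_pages() = %lu\n", bytes / PAGE_SIZE);        /* 16 */
            printf("fault_around_mask()  = %#lx\n", ~(bytes - 1) & PAGE_MASK);
            printf("window starts at       %#lx\n", addr & ~(bytes - 1) & PAGE_MASK);
            return 0;
    }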
3463 | 2807 | ||
2808 | /* | ||
2809 | * do_fault_around() tries to map few pages around the fault address. The hope | ||
2810 | * is that the pages will be needed soon and this will lower the number of | ||
2811 | * faults to handle. | ||
2812 | * | ||
2813 | * It uses vm_ops->map_pages() to map the pages, which skips the page if it's | ||
2814 | * not ready to be mapped: not up-to-date, locked, etc. | ||
2815 | * | ||
2816 | * This function is called with the page table lock taken. In the split ptlock | ||
2817 | * case the page table lock only protects only those entries which belong to | ||
2818 | * the page table corresponding to the fault address. | ||
2819 | * | ||
2820 | * This function doesn't cross the VMA boundaries, in order to call map_pages() | ||
2821 | * only once. | ||
2822 | * | ||
2823 | * fault_around_pages() defines how many pages we'll try to map. | ||
2824 | * do_fault_around() expects it to return a power of two less than or equal to | ||
2825 | * PTRS_PER_PTE. | ||
2826 | * | ||
2827 | * The virtual address of the area that we map is naturally aligned to the | ||
2828 | * fault_around_pages() value (and therefore to page order). This way it's | ||
2829 | * easier to guarantee that we don't cross page table boundaries. | ||
2830 | */ | ||
3464 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2831 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, |
3465 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | 2832 | pte_t *pte, pgoff_t pgoff, unsigned int flags) |
3466 | { | 2833 | { |
@@ -3476,7 +2843,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
3476 | 2843 | ||
3477 | /* | 2844 | /* |
3478 | * max_pgoff is either end of page table or end of vma | 2845 | * max_pgoff is either end of page table or end of vma |
3479 | * or fault_around_pages() from pgoff, depending what is neast. | 2846 | * or fault_around_pages() from pgoff, depending what is nearest. |
3480 | */ | 2847 | */ |
3481 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2848 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
3482 | PTRS_PER_PTE - 1; | 2849 | PTRS_PER_PTE - 1; |
@@ -3515,7 +2882,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3515 | * if page by the offset is not ready to be mapped (cold cache or | 2882 | * if page by the offset is not ready to be mapped (cold cache or |
3516 | * something). | 2883 | * something). |
3517 | */ | 2884 | */ |
3518 | if (vma->vm_ops->map_pages) { | 2885 | if (vma->vm_ops->map_pages && fault_around_pages() > 1) { |
3519 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2886 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
3520 | do_fault_around(vma, address, pte, pgoff, flags); | 2887 | do_fault_around(vma, address, pte, pgoff, flags); |
3521 | if (!pte_same(*pte, orig_pte)) | 2888 | if (!pte_same(*pte, orig_pte)) |
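With the knob expressed in bytes it can be tuned, or effectively switched off, at run time through the fault_around_bytes debugfs file created above; the new fault_around_pages() > 1 check in do_read_fault() means that with 4 KiB pages writing 4096 disables fault-around entirely. A small user-space sketch, assuming debugfs is mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *val = "16384";   /* rounded down to a power of two by the kernel anyway */
            int fd = open("/sys/kernel/debug/fault_around_bytes", O_WRONLY);

            if (fd < 0) {
                    perror("fault_around_bytes");
                    return 1;
            }
            if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
                    perror("write");
            close(fd);
            return 0;
    }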
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606f..469bbf505f85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -46,19 +46,84 @@ | |||
46 | static void generic_online_page(struct page *page); | 46 | static void generic_online_page(struct page *page); |
47 | 47 | ||
48 | static online_page_callback_t online_page_callback = generic_online_page; | 48 | static online_page_callback_t online_page_callback = generic_online_page; |
49 | static DEFINE_MUTEX(online_page_callback_lock); | ||
49 | 50 | ||
50 | DEFINE_MUTEX(mem_hotplug_mutex); | 51 | /* The same as the cpu_hotplug lock, but for memory hotplug. */ |
52 | static struct { | ||
53 | struct task_struct *active_writer; | ||
54 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
55 | /* | ||
56 | * Also blocks the new readers during | ||
57 | * an ongoing mem hotplug operation. | ||
58 | */ | ||
59 | int refcount; | ||
60 | |||
61 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
62 | struct lockdep_map dep_map; | ||
63 | #endif | ||
64 | } mem_hotplug = { | ||
65 | .active_writer = NULL, | ||
66 | .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), | ||
67 | .refcount = 0, | ||
68 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
69 | .dep_map = {.name = "mem_hotplug.lock" }, | ||
70 | #endif | ||
71 | }; | ||
72 | |||
73 | /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ | ||
74 | #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) | ||
75 | #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) | ||
76 | #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) | ||
77 | |||
78 | void get_online_mems(void) | ||
79 | { | ||
80 | might_sleep(); | ||
81 | if (mem_hotplug.active_writer == current) | ||
82 | return; | ||
83 | memhp_lock_acquire_read(); | ||
84 | mutex_lock(&mem_hotplug.lock); | ||
85 | mem_hotplug.refcount++; | ||
86 | mutex_unlock(&mem_hotplug.lock); | ||
87 | |||
88 | } | ||
51 | 89 | ||
52 | void lock_memory_hotplug(void) | 90 | void put_online_mems(void) |
53 | { | 91 | { |
54 | mutex_lock(&mem_hotplug_mutex); | 92 | if (mem_hotplug.active_writer == current) |
93 | return; | ||
94 | mutex_lock(&mem_hotplug.lock); | ||
95 | |||
96 | if (WARN_ON(!mem_hotplug.refcount)) | ||
97 | mem_hotplug.refcount++; /* try to fix things up */ | ||
98 | |||
99 | if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) | ||
100 | wake_up_process(mem_hotplug.active_writer); | ||
101 | mutex_unlock(&mem_hotplug.lock); | ||
102 | memhp_lock_release(); | ||
103 | |||
55 | } | 104 | } |
56 | 105 | ||
57 | void unlock_memory_hotplug(void) | 106 | static void mem_hotplug_begin(void) |
58 | { | 107 | { |
59 | mutex_unlock(&mem_hotplug_mutex); | 108 | mem_hotplug.active_writer = current; |
109 | |||
110 | memhp_lock_acquire(); | ||
111 | for (;;) { | ||
112 | mutex_lock(&mem_hotplug.lock); | ||
113 | if (likely(!mem_hotplug.refcount)) | ||
114 | break; | ||
115 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
116 | mutex_unlock(&mem_hotplug.lock); | ||
117 | schedule(); | ||
118 | } | ||
60 | } | 119 | } |
61 | 120 | ||
121 | static void mem_hotplug_done(void) | ||
122 | { | ||
123 | mem_hotplug.active_writer = NULL; | ||
124 | mutex_unlock(&mem_hotplug.lock); | ||
125 | memhp_lock_release(); | ||
126 | } | ||
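The structure above is the cpu_hotplug lock pattern transplanted to memory: readers nest by bumping a refcount under mem_hotplug.lock, and the single writer parks itself until that refcount drains to zero. Reader-side use is the get/put pair that replaces lock_memory_hotplug() throughout this series; a hedged sketch, assuming the declarations end up in <linux/memory_hotplug.h>:

    #include <linux/memory_hotplug.h>
    #include <linux/mmzone.h>

    /* reader side: keep zones/sections from being hot-added or removed
     * while we walk them */
    static unsigned long snapshot_present_pages(void)
    {
            struct zone *zone;
            unsigned long present = 0;

            get_online_mems();
            for_each_populated_zone(zone)
                    present += zone->present_pages;
            put_online_mems();

            return present;
    }

The writer side (mem_hotplug_begin()/mem_hotplug_done()) stays private to memory_hotplug.c and simply brackets online_pages(), add_memory() and friends, as the later hunks show.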
62 | 127 | ||
63 | /* add this memory to iomem resource */ | 128 | /* add this memory to iomem resource */ |
64 | static struct resource *register_memory_resource(u64 start, u64 size) | 129 | static struct resource *register_memory_resource(u64 start, u64 size) |
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) | |||
727 | { | 792 | { |
728 | int rc = -EINVAL; | 793 | int rc = -EINVAL; |
729 | 794 | ||
730 | lock_memory_hotplug(); | 795 | get_online_mems(); |
796 | mutex_lock(&online_page_callback_lock); | ||
731 | 797 | ||
732 | if (online_page_callback == generic_online_page) { | 798 | if (online_page_callback == generic_online_page) { |
733 | online_page_callback = callback; | 799 | online_page_callback = callback; |
734 | rc = 0; | 800 | rc = 0; |
735 | } | 801 | } |
736 | 802 | ||
737 | unlock_memory_hotplug(); | 803 | mutex_unlock(&online_page_callback_lock); |
804 | put_online_mems(); | ||
738 | 805 | ||
739 | return rc; | 806 | return rc; |
740 | } | 807 | } |
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) | |||
744 | { | 811 | { |
745 | int rc = -EINVAL; | 812 | int rc = -EINVAL; |
746 | 813 | ||
747 | lock_memory_hotplug(); | 814 | get_online_mems(); |
815 | mutex_lock(&online_page_callback_lock); | ||
748 | 816 | ||
749 | if (online_page_callback == callback) { | 817 | if (online_page_callback == callback) { |
750 | online_page_callback = generic_online_page; | 818 | online_page_callback = generic_online_page; |
751 | rc = 0; | 819 | rc = 0; |
752 | } | 820 | } |
753 | 821 | ||
754 | unlock_memory_hotplug(); | 822 | mutex_unlock(&online_page_callback_lock); |
823 | put_online_mems(); | ||
755 | 824 | ||
756 | return rc; | 825 | return rc; |
757 | } | 826 | } |
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
899 | int ret; | 968 | int ret; |
900 | struct memory_notify arg; | 969 | struct memory_notify arg; |
901 | 970 | ||
902 | lock_memory_hotplug(); | 971 | mem_hotplug_begin(); |
903 | /* | 972 | /* |
904 | * This doesn't need a lock to do pfn_to_page(). | 973 | * This doesn't need a lock to do pfn_to_page(). |
905 | * The section can't be removed here because of the | 974 | * The section can't be removed here because of the |
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
907 | */ | 976 | */ |
908 | zone = page_zone(pfn_to_page(pfn)); | 977 | zone = page_zone(pfn_to_page(pfn)); |
909 | 978 | ||
979 | ret = -EINVAL; | ||
910 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | 980 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && |
911 | !can_online_high_movable(zone)) { | 981 | !can_online_high_movable(zone)) |
912 | unlock_memory_hotplug(); | 982 | goto out; |
913 | return -EINVAL; | ||
914 | } | ||
915 | 983 | ||
916 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | 984 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { |
917 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | 985 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
918 | unlock_memory_hotplug(); | 986 | goto out; |
919 | return -EINVAL; | ||
920 | } | ||
921 | } | 987 | } |
922 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | 988 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { |
923 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | 989 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
924 | unlock_memory_hotplug(); | 990 | goto out; |
925 | return -EINVAL; | ||
926 | } | ||
927 | } | 991 | } |
928 | 992 | ||
929 | /* Previous code may changed the zone of the pfn range */ | 993 | /* Previous code may changed the zone of the pfn range */ |
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
939 | ret = notifier_to_errno(ret); | 1003 | ret = notifier_to_errno(ret); |
940 | if (ret) { | 1004 | if (ret) { |
941 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1005 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
942 | unlock_memory_hotplug(); | 1006 | goto out; |
943 | return ret; | ||
944 | } | 1007 | } |
945 | /* | 1008 | /* |
946 | * If this zone is not populated, then it is not in zonelist. | 1009 | * If this zone is not populated, then it is not in zonelist. |
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
964 | (((unsigned long long) pfn + nr_pages) | 1027 | (((unsigned long long) pfn + nr_pages) |
965 | << PAGE_SHIFT) - 1); | 1028 | << PAGE_SHIFT) - 1); |
966 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1029 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
967 | unlock_memory_hotplug(); | 1030 | goto out; |
968 | return ret; | ||
969 | } | 1031 | } |
970 | 1032 | ||
971 | zone->present_pages += onlined_pages; | 1033 | zone->present_pages += onlined_pages; |
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
995 | 1057 | ||
996 | if (onlined_pages) | 1058 | if (onlined_pages) |
997 | memory_notify(MEM_ONLINE, &arg); | 1059 | memory_notify(MEM_ONLINE, &arg); |
998 | unlock_memory_hotplug(); | 1060 | out: |
999 | 1061 | mem_hotplug_done(); | |
1000 | return 0; | 1062 | return ret; |
1001 | } | 1063 | } |
1002 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 1064 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
1003 | 1065 | ||
@@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1007 | struct pglist_data *pgdat; | 1069 | struct pglist_data *pgdat; |
1008 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | 1070 | unsigned long zones_size[MAX_NR_ZONES] = {0}; |
1009 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1071 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
1010 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1072 | unsigned long start_pfn = PFN_DOWN(start); |
1011 | 1073 | ||
1012 | pgdat = NODE_DATA(nid); | 1074 | pgdat = NODE_DATA(nid); |
1013 | if (!pgdat) { | 1075 | if (!pgdat) { |
@@ -1055,7 +1117,7 @@ int try_online_node(int nid) | |||
1055 | if (node_online(nid)) | 1117 | if (node_online(nid)) |
1056 | return 0; | 1118 | return 0; |
1057 | 1119 | ||
1058 | lock_memory_hotplug(); | 1120 | mem_hotplug_begin(); |
1059 | pgdat = hotadd_new_pgdat(nid, 0); | 1121 | pgdat = hotadd_new_pgdat(nid, 0); |
1060 | if (!pgdat) { | 1122 | if (!pgdat) { |
1061 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); | 1123 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); |
@@ -1073,13 +1135,13 @@ int try_online_node(int nid) | |||
1073 | } | 1135 | } |
1074 | 1136 | ||
1075 | out: | 1137 | out: |
1076 | unlock_memory_hotplug(); | 1138 | mem_hotplug_done(); |
1077 | return ret; | 1139 | return ret; |
1078 | } | 1140 | } |
1079 | 1141 | ||
1080 | static int check_hotplug_memory_range(u64 start, u64 size) | 1142 | static int check_hotplug_memory_range(u64 start, u64 size) |
1081 | { | 1143 | { |
1082 | u64 start_pfn = start >> PAGE_SHIFT; | 1144 | u64 start_pfn = PFN_DOWN(start); |
1083 | u64 nr_pages = size >> PAGE_SHIFT; | 1145 | u64 nr_pages = size >> PAGE_SHIFT; |
1084 | 1146 | ||
1085 | /* Memory range must be aligned with section */ | 1147 | /* Memory range must be aligned with section */ |
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1117 | new_pgdat = !p; | 1179 | new_pgdat = !p; |
1118 | } | 1180 | } |
1119 | 1181 | ||
1120 | lock_memory_hotplug(); | 1182 | mem_hotplug_begin(); |
1121 | 1183 | ||
1122 | new_node = !node_online(nid); | 1184 | new_node = !node_online(nid); |
1123 | if (new_node) { | 1185 | if (new_node) { |
@@ -1158,7 +1220,7 @@ error: | |||
1158 | release_memory_resource(res); | 1220 | release_memory_resource(res); |
1159 | 1221 | ||
1160 | out: | 1222 | out: |
1161 | unlock_memory_hotplug(); | 1223 | mem_hotplug_done(); |
1162 | return ret; | 1224 | return ret; |
1163 | } | 1225 | } |
1164 | EXPORT_SYMBOL_GPL(add_memory); | 1226 | EXPORT_SYMBOL_GPL(add_memory); |
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1332 | * alloc_migrate_target should be improooooved!! | 1394 | * alloc_migrate_target should be improooooved!! |
1333 | * migrate_pages returns # of failed pages. | 1395 | * migrate_pages returns # of failed pages. |
1334 | */ | 1396 | */ |
1335 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1397 | ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, |
1336 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1398 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1337 | if (ret) | 1399 | if (ret) |
1338 | putback_movable_pages(&source); | 1400 | putback_movable_pages(&source); |
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1565 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 1627 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
1566 | return -EINVAL; | 1628 | return -EINVAL; |
1567 | 1629 | ||
1568 | lock_memory_hotplug(); | 1630 | mem_hotplug_begin(); |
1569 | 1631 | ||
1570 | zone = page_zone(pfn_to_page(start_pfn)); | 1632 | zone = page_zone(pfn_to_page(start_pfn)); |
1571 | node = zone_to_nid(zone); | 1633 | node = zone_to_nid(zone); |
@@ -1672,7 +1734,7 @@ repeat: | |||
1672 | writeback_set_ratelimit(); | 1734 | writeback_set_ratelimit(); |
1673 | 1735 | ||
1674 | memory_notify(MEM_OFFLINE, &arg); | 1736 | memory_notify(MEM_OFFLINE, &arg); |
1675 | unlock_memory_hotplug(); | 1737 | mem_hotplug_done(); |
1676 | return 0; | 1738 | return 0; |
1677 | 1739 | ||
1678 | failed_removal: | 1740 | failed_removal: |
@@ -1684,7 +1746,7 @@ failed_removal: | |||
1684 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1746 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
1685 | 1747 | ||
1686 | out: | 1748 | out: |
1687 | unlock_memory_hotplug(); | 1749 | mem_hotplug_done(); |
1688 | return ret; | 1750 | return ret; |
1689 | } | 1751 | } |
1690 | 1752 | ||
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1888 | 1950 | ||
1889 | BUG_ON(check_hotplug_memory_range(start, size)); | 1951 | BUG_ON(check_hotplug_memory_range(start, size)); |
1890 | 1952 | ||
1891 | lock_memory_hotplug(); | 1953 | mem_hotplug_begin(); |
1892 | 1954 | ||
1893 | /* | 1955 | /* |
1894 | * All memory blocks must be offlined before removing memory. Check | 1956 | * All memory blocks must be offlined before removing memory. Check |
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1897 | */ | 1959 | */ |
1898 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | 1960 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, |
1899 | check_memblock_offlined_cb); | 1961 | check_memblock_offlined_cb); |
1900 | if (ret) { | 1962 | if (ret) |
1901 | unlock_memory_hotplug(); | ||
1902 | BUG(); | 1963 | BUG(); |
1903 | } | ||
1904 | 1964 | ||
1905 | /* remove memmap entry */ | 1965 | /* remove memmap entry */ |
1906 | firmware_map_remove(start, start + size, "System RAM"); | 1966 | firmware_map_remove(start, start + size, "System RAM"); |
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1909 | 1969 | ||
1910 | try_offline_node(nid); | 1970 | try_offline_node(nid); |
1911 | 1971 | ||
1912 | unlock_memory_hotplug(); | 1972 | mem_hotplug_done(); |
1913 | } | 1973 | } |
1914 | EXPORT_SYMBOL_GPL(remove_memory); | 1974 | EXPORT_SYMBOL_GPL(remove_memory); |
1915 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 1975 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
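
Across mm/memory_hotplug.c this series replaces the lock_memory_hotplug()/unlock_memory_hotplug() pair with mem_hotplug_begin()/mem_hotplug_done() and, in online_pages(), folds the early-return error paths into a single out: label with ret defaulting to -EINVAL. A minimal sketch of the resulting shape (the function name and the work in the middle are placeholders, not code from the patch):

static int example_hotplug_op(unsigned long pfn, unsigned long nr_pages)
{
        int ret = -EINVAL;              /* default error, overwritten on success */

        mem_hotplug_begin();            /* was: lock_memory_hotplug() */

        if (!nr_pages)                  /* every failure path now jumps to out: */
                goto out;

        /* ... the actual onlining/offlining work would go here ... */
        ret = 0;
out:
        mem_hotplug_done();             /* was: unlock_memory_hotplug() */
        return ret;
}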
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472933ea..16bc9fa42998 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1029 | 1029 | ||
1030 | if (!list_empty(&pagelist)) { | 1030 | if (!list_empty(&pagelist)) { |
1031 | err = migrate_pages(&pagelist, new_node_page, dest, | 1031 | err = migrate_pages(&pagelist, new_node_page, NULL, dest, |
1032 | MIGRATE_SYNC, MR_SYSCALL); | 1032 | MIGRATE_SYNC, MR_SYSCALL); |
1033 | if (err) | 1033 | if (err) |
1034 | putback_movable_pages(&pagelist); | 1034 | putback_movable_pages(&pagelist); |
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1277 | if (!list_empty(&pagelist)) { | 1277 | if (!list_empty(&pagelist)) { |
1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1280 | (unsigned long)vma, | 1280 | NULL, (unsigned long)vma, |
1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1282 | if (nr_failed) | 1282 | if (nr_failed) |
1283 | putback_movable_pages(&pagelist); | 1283 | putback_movable_pages(&pagelist); |
@@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |||
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | 1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, |
1365 | unsigned long, mode, unsigned long __user *, nmask, | 1365 | unsigned long, mode, const unsigned long __user *, nmask, |
1366 | unsigned long, maxnode, unsigned, flags) | 1366 | unsigned long, maxnode, unsigned, flags) |
1367 | { | 1367 | { |
1368 | nodemask_t nodes; | 1368 | nodemask_t nodes; |
@@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | |||
1383 | } | 1383 | } |
1384 | 1384 | ||
1385 | /* Set the process memory policy */ | 1385 | /* Set the process memory policy */ |
1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, | 1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, |
1387 | unsigned long, maxnode) | 1387 | unsigned long, maxnode) |
1388 | { | 1388 | { |
1389 | int err; | 1389 | int err; |
@@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | |||
1606 | 1606 | ||
1607 | /* | 1607 | /* |
1608 | * get_vma_policy(@task, @vma, @addr) | 1608 | * get_vma_policy(@task, @vma, @addr) |
1609 | * @task - task for fallback if vma policy == default | 1609 | * @task: task for fallback if vma policy == default |
1610 | * @vma - virtual memory area whose policy is sought | 1610 | * @vma: virtual memory area whose policy is sought |
1611 | * @addr - address in @vma for shared policy lookup | 1611 | * @addr: address in @vma for shared policy lookup |
1612 | * | 1612 | * |
1613 | * Returns effective policy for a VMA at specified address. | 1613 | * Returns effective policy for a VMA at specified address. |
1614 | * Falls back to @task or system default policy, as necessary. | 1614 | * Falls back to @task or system default policy, as necessary. |
@@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp) | |||
1854 | #ifdef CONFIG_HUGETLBFS | 1854 | #ifdef CONFIG_HUGETLBFS |
1855 | /* | 1855 | /* |
1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
1857 | * @vma = virtual memory area whose policy is sought | 1857 | * @vma: virtual memory area whose policy is sought |
1858 | * @addr = address in @vma for shared policy lookup and interleave policy | 1858 | * @addr: address in @vma for shared policy lookup and interleave policy |
1859 | * @gfp_flags = for requested zone | 1859 | * @gfp_flags: for requested zone |
1860 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1860 | * @mpol: pointer to mempolicy pointer for reference counted mempolicy |
1861 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1861 | * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask |
1862 | * | 1862 | * |
1863 | * Returns a zonelist suitable for a huge page allocation and a pointer | 1863 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1864 | * to the struct mempolicy for conditional unref after allocation. | 1864 | * to the struct mempolicy for conditional unref after allocation. |
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n) | |||
2270 | /** | 2270 | /** |
2271 | * mpol_misplaced - check whether current page node is valid in policy | 2271 | * mpol_misplaced - check whether current page node is valid in policy |
2272 | * | 2272 | * |
2273 | * @page - page to be checked | 2273 | * @page: page to be checked |
2274 | * @vma - vm area where page mapped | 2274 | * @vma: vm area where page mapped |
2275 | * @addr - virtual address where page mapped | 2275 | * @addr: virtual address where page mapped |
2276 | * | 2276 | * |
2277 | * Lookup current policy node id for vma,addr and "compare to" page's | 2277 | * Lookup current policy node id for vma,addr and "compare to" page's |
2278 | * node id. | 2278 | * node id. |
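
The mbind() and set_mempolicy() syscall declarations above now take the node mask as const unsigned long __user *, reflecting that the kernel only reads it, and the kernel-doc parameter markers are fixed from "-"/"=" to ":". Userspace callers are unaffected; a small, runnable usage example via syscall(2) (the MPOL_BIND value matches the uapi <linux/mempolicy.h> header; it fails on non-NUMA kernels):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_BIND 2                     /* from uapi <linux/mempolicy.h> */

int main(void)
{
        unsigned long nodemask = 1UL;   /* bit 0 set: bind to NUMA node 0 */

        /* maxnode tells the kernel how many bits of the mask to read */
        if (syscall(SYS_set_mempolicy, MPOL_BIND, &nodemask,
                    8 * sizeof(nodemask) + 1) != 0) {
                perror("set_mempolicy");
                return 1;
        }
        return 0;
}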
diff --git a/mm/mempool.c b/mm/mempool.c index 905434f18c97..455d468c3a5d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize); | |||
192 | * returns NULL. Note that due to preallocation, this function | 192 | * returns NULL. Note that due to preallocation, this function |
193 | * *never* fails when called from process contexts. (it might | 193 | * *never* fails when called from process contexts. (it might |
194 | * fail if called from an IRQ context.) | 194 | * fail if called from an IRQ context.) |
195 | * Note: using __GFP_ZERO is not supported. | ||
195 | */ | 196 | */ |
196 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | 197 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) |
197 | { | 198 | { |
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | |||
200 | wait_queue_t wait; | 201 | wait_queue_t wait; |
201 | gfp_t gfp_temp; | 202 | gfp_t gfp_temp; |
202 | 203 | ||
204 | VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); | ||
203 | might_sleep_if(gfp_mask & __GFP_WAIT); | 205 | might_sleep_if(gfp_mask & __GFP_WAIT); |
204 | 206 | ||
205 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | 207 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ |
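
The mempool hunk documents, and now warns about at runtime, that mempool_alloc() does not support __GFP_ZERO: an element taken from the pool's preallocated reserve never passes through the underlying allocator, so the flag could not be honoured on that path. A hedged sketch of caller-side zeroing instead, using the stock kmalloc-backed pool helper (pool and element sizes are made up for illustration):

#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/string.h>

#define EXAMPLE_ELEM_SZ 128             /* illustrative element size */

static mempool_t *example_pool;         /* created once at init time */

static int __init example_pool_init(void)
{
        /* 16 preallocated elements, backed by kmalloc()/kfree() */
        example_pool = mempool_create_kmalloc_pool(16, EXAMPLE_ELEM_SZ);
        return example_pool ? 0 : -ENOMEM;
}

static void *example_alloc_zeroed(gfp_t gfp)
{
        /* Never pass __GFP_ZERO down; zero the element explicitly. */
        void *p = mempool_alloc(example_pool, gfp & ~__GFP_ZERO);

        if (p)
                memset(p, 0, EXAMPLE_ELEM_SZ);
        return p;
}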
diff --git a/mm/migrate.c b/mm/migrate.c index bed48809e5d0..63f0cd559999 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -938,8 +938,9 @@ out: | |||
938 | * Obtain the lock on page, remove all ptes and migrate the page | 938 | * Obtain the lock on page, remove all ptes and migrate the page |
939 | * to the newly allocated page in newpage. | 939 | * to the newly allocated page in newpage. |
940 | */ | 940 | */ |
941 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 941 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, |
942 | struct page *page, int force, enum migrate_mode mode) | 942 | unsigned long private, struct page *page, int force, |
943 | enum migrate_mode mode) | ||
943 | { | 944 | { |
944 | int rc = 0; | 945 | int rc = 0; |
945 | int *result = NULL; | 946 | int *result = NULL; |
@@ -983,11 +984,17 @@ out: | |||
983 | page_is_file_cache(page)); | 984 | page_is_file_cache(page)); |
984 | putback_lru_page(page); | 985 | putback_lru_page(page); |
985 | } | 986 | } |
987 | |||
986 | /* | 988 | /* |
987 | * Move the new page to the LRU. If migration was not successful | 989 | * If migration was not successful and there's a freeing callback, use |
988 | * then this will free the page. | 990 | * it. Otherwise, putback_lru_page() will drop the reference grabbed |
991 | * during isolation. | ||
989 | */ | 992 | */ |
990 | putback_lru_page(newpage); | 993 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) |
994 | put_new_page(newpage, private); | ||
995 | else | ||
996 | putback_lru_page(newpage); | ||
997 | |||
991 | if (result) { | 998 | if (result) { |
992 | if (rc) | 999 | if (rc) |
993 | *result = rc; | 1000 | *result = rc; |
@@ -1016,8 +1023,9 @@ out: | |||
1016 | * will wait in the page fault for migration to complete. | 1023 | * will wait in the page fault for migration to complete. |
1017 | */ | 1024 | */ |
1018 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 1025 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
1019 | unsigned long private, struct page *hpage, | 1026 | free_page_t put_new_page, unsigned long private, |
1020 | int force, enum migrate_mode mode) | 1027 | struct page *hpage, int force, |
1028 | enum migrate_mode mode) | ||
1021 | { | 1029 | { |
1022 | int rc = 0; | 1030 | int rc = 0; |
1023 | int *result = NULL; | 1031 | int *result = NULL; |
@@ -1031,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1031 | * tables or check whether the hugepage is pmd-based or not before | 1039 | * tables or check whether the hugepage is pmd-based or not before |
1032 | * kicking migration. | 1040 | * kicking migration. |
1033 | */ | 1041 | */ |
1034 | if (!hugepage_migration_support(page_hstate(hpage))) { | 1042 | if (!hugepage_migration_supported(page_hstate(hpage))) { |
1035 | putback_active_hugepage(hpage); | 1043 | putback_active_hugepage(hpage); |
1036 | return -ENOSYS; | 1044 | return -ENOSYS; |
1037 | } | 1045 | } |
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1056 | if (!page_mapped(hpage)) | 1064 | if (!page_mapped(hpage)) |
1057 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 1065 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
1058 | 1066 | ||
1059 | if (rc) | 1067 | if (rc != MIGRATEPAGE_SUCCESS) |
1060 | remove_migration_ptes(hpage, hpage); | 1068 | remove_migration_ptes(hpage, hpage); |
1061 | 1069 | ||
1062 | if (anon_vma) | 1070 | if (anon_vma) |
1063 | put_anon_vma(anon_vma); | 1071 | put_anon_vma(anon_vma); |
1064 | 1072 | ||
1065 | if (!rc) | 1073 | if (rc == MIGRATEPAGE_SUCCESS) |
1066 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1074 | hugetlb_cgroup_migrate(hpage, new_hpage); |
1067 | 1075 | ||
1068 | unlock_page(hpage); | 1076 | unlock_page(hpage); |
1069 | out: | 1077 | out: |
1070 | if (rc != -EAGAIN) | 1078 | if (rc != -EAGAIN) |
1071 | putback_active_hugepage(hpage); | 1079 | putback_active_hugepage(hpage); |
1072 | put_page(new_hpage); | 1080 | |
1081 | /* | ||
1082 | * If migration was not successful and there's a freeing callback, use | ||
1083 | * it. Otherwise, put_page() will drop the reference grabbed during | ||
1084 | * isolation. | ||
1085 | */ | ||
1086 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) | ||
1087 | put_new_page(new_hpage, private); | ||
1088 | else | ||
1089 | put_page(new_hpage); | ||
1090 | |||
1073 | if (result) { | 1091 | if (result) { |
1074 | if (rc) | 1092 | if (rc) |
1075 | *result = rc; | 1093 | *result = rc; |
@@ -1086,6 +1104,8 @@ out: | |||
1086 | * @from: The list of pages to be migrated. | 1104 | * @from: The list of pages to be migrated. |
1087 | * @get_new_page: The function used to allocate free pages to be used | 1105 | * @get_new_page: The function used to allocate free pages to be used |
1088 | * as the target of the page migration. | 1106 | * as the target of the page migration. |
1107 | * @put_new_page: The function used to free target pages if migration | ||
1108 | * fails, or NULL if no special handling is necessary. | ||
1089 | * @private: Private data to be passed on to get_new_page() | 1109 | * @private: Private data to be passed on to get_new_page() |
1090 | * @mode: The migration mode that specifies the constraints for | 1110 | * @mode: The migration mode that specifies the constraints for |
1091 | * page migration, if any. | 1111 | * page migration, if any. |
@@ -1099,7 +1119,8 @@ out: | |||
1099 | * Returns the number of pages that were not migrated, or an error code. | 1119 | * Returns the number of pages that were not migrated, or an error code. |
1100 | */ | 1120 | */ |
1101 | int migrate_pages(struct list_head *from, new_page_t get_new_page, | 1121 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1102 | unsigned long private, enum migrate_mode mode, int reason) | 1122 | free_page_t put_new_page, unsigned long private, |
1123 | enum migrate_mode mode, int reason) | ||
1103 | { | 1124 | { |
1104 | int retry = 1; | 1125 | int retry = 1; |
1105 | int nr_failed = 0; | 1126 | int nr_failed = 0; |
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1121 | 1142 | ||
1122 | if (PageHuge(page)) | 1143 | if (PageHuge(page)) |
1123 | rc = unmap_and_move_huge_page(get_new_page, | 1144 | rc = unmap_and_move_huge_page(get_new_page, |
1124 | private, page, pass > 2, mode); | 1145 | put_new_page, private, page, |
1146 | pass > 2, mode); | ||
1125 | else | 1147 | else |
1126 | rc = unmap_and_move(get_new_page, private, | 1148 | rc = unmap_and_move(get_new_page, put_new_page, |
1127 | page, pass > 2, mode); | 1149 | private, page, pass > 2, mode); |
1128 | 1150 | ||
1129 | switch(rc) { | 1151 | switch(rc) { |
1130 | case -ENOMEM: | 1152 | case -ENOMEM: |
@@ -1273,7 +1295,7 @@ set_status: | |||
1273 | 1295 | ||
1274 | err = 0; | 1296 | err = 0; |
1275 | if (!list_empty(&pagelist)) { | 1297 | if (!list_empty(&pagelist)) { |
1276 | err = migrate_pages(&pagelist, new_page_node, | 1298 | err = migrate_pages(&pagelist, new_page_node, NULL, |
1277 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); | 1299 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1278 | if (err) | 1300 | if (err) |
1279 | putback_movable_pages(&pagelist); | 1301 | putback_movable_pages(&pagelist); |
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1729 | 1751 | ||
1730 | list_add(&page->lru, &migratepages); | 1752 | list_add(&page->lru, &migratepages); |
1731 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1753 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1732 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1754 | NULL, node, MIGRATE_ASYNC, |
1755 | MR_NUMA_MISPLACED); | ||
1733 | if (nr_remaining) { | 1756 | if (nr_remaining) { |
1734 | if (!list_empty(&migratepages)) { | 1757 | if (!list_empty(&migratepages)) { |
1735 | list_del(&page->lru); | 1758 | list_del(&page->lru); |
@@ -1852,7 +1875,7 @@ fail_putback: | |||
1852 | * guarantee the copy is visible before the pagetable update. | 1875 | * guarantee the copy is visible before the pagetable update. |
1853 | */ | 1876 | */ |
1854 | flush_cache_range(vma, mmun_start, mmun_end); | 1877 | flush_cache_range(vma, mmun_start, mmun_end); |
1855 | page_add_new_anon_rmap(new_page, vma, mmun_start); | 1878 | page_add_anon_rmap(new_page, vma, mmun_start); |
1856 | pmdp_clear_flush(vma, mmun_start, pmd); | 1879 | pmdp_clear_flush(vma, mmun_start, pmd); |
1857 | set_pmd_at(mm, mmun_start, pmd, entry); | 1880 | set_pmd_at(mm, mmun_start, pmd, entry); |
1858 | flush_tlb_range(vma, mmun_start, mmun_end); | 1881 | flush_tlb_range(vma, mmun_start, mmun_end); |
@@ -1877,6 +1900,10 @@ fail_putback: | |||
1877 | spin_unlock(ptl); | 1900 | spin_unlock(ptl); |
1878 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1901 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1879 | 1902 | ||
1903 | /* Take an "isolate" reference and put new page on the LRU. */ | ||
1904 | get_page(new_page); | ||
1905 | putback_lru_page(new_page); | ||
1906 | |||
1880 | unlock_page(new_page); | 1907 | unlock_page(new_page); |
1881 | unlock_page(page); | 1908 | unlock_page(page); |
1882 | put_page(page); /* Drop the rmap reference */ | 1909 | put_page(page); /* Drop the rmap reference */ |
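
The mm/migrate.c changes above thread a new free_page_t callback (put_new_page) through migrate_pages(), unmap_and_move() and unmap_and_move_huge_page(); it is invoked only when migration into the freshly allocated target fails, so a caller that allocates targets from a private pool can take the unused pages back instead of having putback_lru_page()/put_page() release them to the buddy allocator. Existing callers simply pass NULL, as the mempolicy, memory-hotplug and NUMA-balancing hunks show. A hedged sketch of a caller supplying both callbacks (the allocation policy and the reason code are illustrative only):

static struct page *example_alloc_target(struct page *page,
                                         unsigned long private, int **result)
{
        /* A real user (e.g. compaction) would hand out a page from a
         * private free list keyed by 'private'; plain allocation here. */
        return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static void example_free_target(struct page *page, unsigned long private)
{
        /* Reached only when migration into 'page' did not succeed. */
        __free_page(page);
}

static int example_migrate(struct list_head *pages)
{
        int ret = migrate_pages(pages, example_alloc_target,
                                example_free_target, 0,
                                MIGRATE_SYNC, MR_MEMORY_HOTPLUG);

        if (ret)                        /* unmigrated pages go back to the LRU */
                putback_movable_pages(pages);
        return ret;
}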
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
640 | { | 640 | { |
641 | struct address_space *mapping = NULL; | 641 | struct address_space *mapping = NULL; |
642 | 642 | ||
643 | if (vma->vm_file) | 643 | if (vma->vm_file) { |
644 | mapping = vma->vm_file->f_mapping; | 644 | mapping = vma->vm_file->f_mapping; |
645 | |||
646 | if (mapping) | ||
647 | mutex_lock(&mapping->i_mmap_mutex); | 645 | mutex_lock(&mapping->i_mmap_mutex); |
646 | } | ||
648 | 647 | ||
649 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 648 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
650 | __vma_link_file(vma); | 649 | __vma_link_file(vma); |
@@ -2965,9 +2964,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2965 | struct vm_area_struct *vma = _install_special_mapping(mm, | 2964 | struct vm_area_struct *vma = _install_special_mapping(mm, |
2966 | addr, len, vm_flags, pages); | 2965 | addr, len, vm_flags, pages); |
2967 | 2966 | ||
2968 | if (IS_ERR(vma)) | 2967 | return PTR_ERR_OR_ZERO(vma); |
2969 | return PTR_ERR(vma); | ||
2970 | return 0; | ||
2971 | } | 2968 | } |
2972 | 2969 | ||
2973 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2970 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/msync.c b/mm/msync.c index 632df4527c01..a5c673669ca6 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
58 | vma = find_vma(mm, start); | 58 | vma = find_vma(mm, start); |
59 | for (;;) { | 59 | for (;;) { |
60 | struct file *file; | 60 | struct file *file; |
61 | loff_t fstart, fend; | ||
61 | 62 | ||
62 | /* Still start < end. */ | 63 | /* Still start < end. */ |
63 | error = -ENOMEM; | 64 | error = -ENOMEM; |
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
77 | goto out_unlock; | 78 | goto out_unlock; |
78 | } | 79 | } |
79 | file = vma->vm_file; | 80 | file = vma->vm_file; |
81 | fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
82 | fend = fstart + (min(end, vma->vm_end) - start) - 1; | ||
80 | start = vma->vm_end; | 83 | start = vma->vm_end; |
81 | if ((flags & MS_SYNC) && file && | 84 | if ((flags & MS_SYNC) && file && |
82 | (vma->vm_flags & VM_SHARED)) { | 85 | (vma->vm_flags & VM_SHARED)) { |
83 | get_file(file); | 86 | get_file(file); |
84 | up_read(&mm->mmap_sem); | 87 | up_read(&mm->mmap_sem); |
85 | error = vfs_fsync(file, 0); | 88 | if (vma->vm_flags & VM_NONLINEAR) |
89 | error = vfs_fsync(file, 1); | ||
90 | else | ||
91 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
86 | fput(file); | 92 | fput(file); |
87 | if (error || start >= end) | 93 | if (error || start >= end) |
88 | goto out; | 94 | goto out; |
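
For MS_SYNC on a linear shared mapping, msync() now computes the byte range backing the VMA (fstart/fend above) and calls vfs_fsync_range() on just that range, in datasync mode, instead of fsync()ing the whole file; VM_NONLINEAR mappings still fall back to a whole-file vfs_fsync(). Userspace is unaffected. A small, runnable usage example (the file path is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        char *map;
        int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0600);

        if (fd < 0 || ftruncate(fd, 4096) != 0)
                return 1;

        map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        memcpy(map, "hello", 5);

        /* Only the range backing this mapping needs to reach disk. */
        if (msync(map, 4096, MS_SYNC) != 0)
                perror("msync");

        munmap(map, 4096);
        close(fd);
        return 0;
}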
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a4317da60532..533fa60c9ac1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; | |||
156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | 156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * Work out the current dirty-memory clamping and background writeout | ||
160 | * thresholds. | ||
161 | * | ||
162 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
163 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
164 | * pages. It is better to clamp down on writers than to start swapping, and | ||
165 | * performing lots of scanning. | ||
166 | * | ||
167 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
168 | * | ||
169 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
170 | * excessive. | ||
171 | * | ||
172 | * We make sure that the background writeout level is below the adjusted | ||
173 | * clamping level. | ||
174 | */ | ||
175 | |||
176 | /* | ||
177 | * In a memory zone, there is a certain amount of pages we consider | 159 | * In a memory zone, there is a certain amount of pages we consider |
178 | * available for the page cache, which is essentially the number of | 160 | * available for the page cache, which is essentially the number of |
179 | * free and reclaimable pages, minus some zone reserves to protect | 161 | * free and reclaimable pages, minus some zone reserves to protect |
@@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1623 | * 1000+ tasks, all of them start dirtying pages at exactly the same | 1605 | * 1000+ tasks, all of them start dirtying pages at exactly the same |
1624 | * time, hence all honoured too large initial task->nr_dirtied_pause. | 1606 | * time, hence all honoured too large initial task->nr_dirtied_pause. |
1625 | */ | 1607 | */ |
1626 | p = &__get_cpu_var(bdp_ratelimits); | 1608 | p = this_cpu_ptr(&bdp_ratelimits); |
1627 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1609 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1628 | *p = 0; | 1610 | *p = 0; |
1629 | else if (unlikely(*p >= ratelimit_pages)) { | 1611 | else if (unlikely(*p >= ratelimit_pages)) { |
@@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1635 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping | 1617 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping |
1636 | * the dirty throttling and livelock other long-run dirtiers. | 1618 | * the dirty throttling and livelock other long-run dirtiers. |
1637 | */ | 1619 | */ |
1638 | p = &__get_cpu_var(dirty_throttle_leaks); | 1620 | p = this_cpu_ptr(&dirty_throttle_leaks); |
1639 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1621 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1640 | unsigned long nr_pages_dirtied; | 1622 | unsigned long nr_pages_dirtied; |
1641 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1623 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
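
The two page-writeback hunks are part of the tree-wide move away from __get_cpu_var(): this_cpu_ptr(&var) returns the same pointer to the executing CPU's instance of a per-CPU variable, it is simply the accessor the per-CPU API has settled on. A minimal sketch of the pattern (the counter is invented for illustration; the real ratelimiting path already runs with preemption disabled):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, example_events);

static void example_count_event(void)
{
        int *p;

        preempt_disable();                      /* stay on one CPU while touching it */
        p = this_cpu_ptr(&example_events);      /* was: &__get_cpu_var(example_events) */
        (*p)++;
        preempt_enable();
}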
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..a59bdb653958 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
261 | } while (zone_span_seqretry(zone, seq)); | 261 | } while (zone_span_seqretry(zone, seq)); |
262 | 262 | ||
263 | if (ret) | 263 | if (ret) |
264 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | 264 | pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
265 | pfn, start_pfn, start_pfn + sp); | 265 | pfn, zone_to_nid(zone), zone->name, |
266 | start_pfn, start_pfn + sp); | ||
266 | 267 | ||
267 | return ret; | 268 | return ret; |
268 | } | 269 | } |
@@ -408,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
408 | return bad; | 409 | return bad; |
409 | } | 410 | } |
410 | 411 | ||
411 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 412 | static inline void prep_zero_page(struct page *page, unsigned int order, |
413 | gfp_t gfp_flags) | ||
412 | { | 414 | { |
413 | int i; | 415 | int i; |
414 | 416 | ||
@@ -452,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } | |||
452 | static inline void clear_page_guard_flag(struct page *page) { } | 454 | static inline void clear_page_guard_flag(struct page *page) { } |
453 | #endif | 455 | #endif |
454 | 456 | ||
455 | static inline void set_page_order(struct page *page, int order) | 457 | static inline void set_page_order(struct page *page, unsigned int order) |
456 | { | 458 | { |
457 | set_page_private(page, order); | 459 | set_page_private(page, order); |
458 | __SetPageBuddy(page); | 460 | __SetPageBuddy(page); |
@@ -503,21 +505,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
503 | * For recording page's order, we use page_private(page). | 505 | * For recording page's order, we use page_private(page). |
504 | */ | 506 | */ |
505 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 507 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
506 | int order) | 508 | unsigned int order) |
507 | { | 509 | { |
508 | if (!pfn_valid_within(page_to_pfn(buddy))) | 510 | if (!pfn_valid_within(page_to_pfn(buddy))) |
509 | return 0; | 511 | return 0; |
510 | 512 | ||
511 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
512 | return 0; | ||
513 | |||
514 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 513 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
515 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 514 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
515 | |||
516 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
517 | return 0; | ||
518 | |||
516 | return 1; | 519 | return 1; |
517 | } | 520 | } |
518 | 521 | ||
519 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 522 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
520 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 523 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
524 | |||
525 | /* | ||
526 | * zone check is done late to avoid uselessly | ||
527 | * calculating zone/node ids for pages that could | ||
528 | * never merge. | ||
529 | */ | ||
530 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
531 | return 0; | ||
532 | |||
521 | return 1; | 533 | return 1; |
522 | } | 534 | } |
523 | return 0; | 535 | return 0; |
@@ -549,6 +561,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
549 | */ | 561 | */ |
550 | 562 | ||
551 | static inline void __free_one_page(struct page *page, | 563 | static inline void __free_one_page(struct page *page, |
564 | unsigned long pfn, | ||
552 | struct zone *zone, unsigned int order, | 565 | struct zone *zone, unsigned int order, |
553 | int migratetype) | 566 | int migratetype) |
554 | { | 567 | { |
@@ -565,7 +578,7 @@ static inline void __free_one_page(struct page *page, | |||
565 | 578 | ||
566 | VM_BUG_ON(migratetype == -1); | 579 | VM_BUG_ON(migratetype == -1); |
567 | 580 | ||
568 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 581 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
569 | 582 | ||
570 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 583 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
571 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 584 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
@@ -700,7 +713,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
700 | list_del(&page->lru); | 713 | list_del(&page->lru); |
701 | mt = get_freepage_migratetype(page); | 714 | mt = get_freepage_migratetype(page); |
702 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 715 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
703 | __free_one_page(page, zone, 0, mt); | 716 | __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
704 | trace_mm_page_pcpu_drain(page, 0, mt); | 717 | trace_mm_page_pcpu_drain(page, 0, mt); |
705 | if (likely(!is_migrate_isolate_page(page))) { | 718 | if (likely(!is_migrate_isolate_page(page))) { |
706 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 719 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
@@ -712,13 +725,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
712 | spin_unlock(&zone->lock); | 725 | spin_unlock(&zone->lock); |
713 | } | 726 | } |
714 | 727 | ||
715 | static void free_one_page(struct zone *zone, struct page *page, int order, | 728 | static void free_one_page(struct zone *zone, |
729 | struct page *page, unsigned long pfn, | ||
730 | unsigned int order, | ||
716 | int migratetype) | 731 | int migratetype) |
717 | { | 732 | { |
718 | spin_lock(&zone->lock); | 733 | spin_lock(&zone->lock); |
719 | zone->pages_scanned = 0; | 734 | zone->pages_scanned = 0; |
720 | 735 | ||
721 | __free_one_page(page, zone, order, migratetype); | 736 | __free_one_page(page, pfn, zone, order, migratetype); |
722 | if (unlikely(!is_migrate_isolate(migratetype))) | 737 | if (unlikely(!is_migrate_isolate(migratetype))) |
723 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 738 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
724 | spin_unlock(&zone->lock); | 739 | spin_unlock(&zone->lock); |
@@ -755,15 +770,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
755 | { | 770 | { |
756 | unsigned long flags; | 771 | unsigned long flags; |
757 | int migratetype; | 772 | int migratetype; |
773 | unsigned long pfn = page_to_pfn(page); | ||
758 | 774 | ||
759 | if (!free_pages_prepare(page, order)) | 775 | if (!free_pages_prepare(page, order)) |
760 | return; | 776 | return; |
761 | 777 | ||
778 | migratetype = get_pfnblock_migratetype(page, pfn); | ||
762 | local_irq_save(flags); | 779 | local_irq_save(flags); |
763 | __count_vm_events(PGFREE, 1 << order); | 780 | __count_vm_events(PGFREE, 1 << order); |
764 | migratetype = get_pageblock_migratetype(page); | ||
765 | set_freepage_migratetype(page, migratetype); | 781 | set_freepage_migratetype(page, migratetype); |
766 | free_one_page(page_zone(page), page, order, migratetype); | 782 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
767 | local_irq_restore(flags); | 783 | local_irq_restore(flags); |
768 | } | 784 | } |
769 | 785 | ||
@@ -882,7 +898,7 @@ static inline int check_new_page(struct page *page) | |||
882 | return 0; | 898 | return 0; |
883 | } | 899 | } |
884 | 900 | ||
885 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 901 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) |
886 | { | 902 | { |
887 | int i; | 903 | int i; |
888 | 904 | ||
@@ -931,6 +947,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
931 | rmv_page_order(page); | 947 | rmv_page_order(page); |
932 | area->nr_free--; | 948 | area->nr_free--; |
933 | expand(zone, page, order, current_order, area, migratetype); | 949 | expand(zone, page, order, current_order, area, migratetype); |
950 | set_freepage_migratetype(page, migratetype); | ||
934 | return page; | 951 | return page; |
935 | } | 952 | } |
936 | 953 | ||
@@ -1057,7 +1074,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1057 | 1074 | ||
1058 | /* | 1075 | /* |
1059 | * When borrowing from MIGRATE_CMA, we need to release the excess | 1076 | * When borrowing from MIGRATE_CMA, we need to release the excess |
1060 | * buddy pages to CMA itself. | 1077 | * buddy pages to CMA itself. We also ensure the freepage_migratetype |
1078 | * is set to CMA so it is returned to the correct freelist in case | ||
1079 | * the page ends up being not actually allocated from the pcp lists. | ||
1061 | */ | 1080 | */ |
1062 | if (is_migrate_cma(fallback_type)) | 1081 | if (is_migrate_cma(fallback_type)) |
1063 | return fallback_type; | 1082 | return fallback_type; |
@@ -1090,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1090 | 1109 | ||
1091 | /* Remove an element from the buddy allocator from the fallback list */ | 1110 | /* Remove an element from the buddy allocator from the fallback list */ |
1092 | static inline struct page * | 1111 | static inline struct page * |
1093 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 1112 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) |
1094 | { | 1113 | { |
1095 | struct free_area *area; | 1114 | struct free_area *area; |
1096 | int current_order; | 1115 | unsigned int current_order; |
1097 | struct page *page; | 1116 | struct page *page; |
1098 | int migratetype, new_type, i; | 1117 | int migratetype, new_type, i; |
1099 | 1118 | ||
1100 | /* Find the largest possible block of pages in the other list */ | 1119 | /* Find the largest possible block of pages in the other list */ |
1101 | for (current_order = MAX_ORDER-1; current_order >= order; | 1120 | for (current_order = MAX_ORDER-1; |
1102 | --current_order) { | 1121 | current_order >= order && current_order <= MAX_ORDER-1; |
1122 | --current_order) { | ||
1103 | for (i = 0;; i++) { | 1123 | for (i = 0;; i++) { |
1104 | migratetype = fallbacks[start_migratetype][i]; | 1124 | migratetype = fallbacks[start_migratetype][i]; |
1105 | 1125 | ||
@@ -1125,6 +1145,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1125 | 1145 | ||
1126 | expand(zone, page, order, current_order, area, | 1146 | expand(zone, page, order, current_order, area, |
1127 | new_type); | 1147 | new_type); |
1148 | /* The freepage_migratetype may differ from pageblock's | ||
1149 | * migratetype depending on the decisions in | ||
1150 | * try_to_steal_freepages. This is OK as long as it does | ||
1151 | * not differ for MIGRATE_CMA type. | ||
1152 | */ | ||
1153 | set_freepage_migratetype(page, new_type); | ||
1128 | 1154 | ||
1129 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1155 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1130 | start_migratetype, migratetype, new_type); | 1156 | start_migratetype, migratetype, new_type); |
@@ -1173,9 +1199,9 @@ retry_reserve: | |||
1173 | */ | 1199 | */ |
1174 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1200 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1175 | unsigned long count, struct list_head *list, | 1201 | unsigned long count, struct list_head *list, |
1176 | int migratetype, int cold) | 1202 | int migratetype, bool cold) |
1177 | { | 1203 | { |
1178 | int mt = migratetype, i; | 1204 | int i; |
1179 | 1205 | ||
1180 | spin_lock(&zone->lock); | 1206 | spin_lock(&zone->lock); |
1181 | for (i = 0; i < count; ++i) { | 1207 | for (i = 0; i < count; ++i) { |
@@ -1192,18 +1218,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1192 | * merge IO requests if the physical pages are ordered | 1218 | * merge IO requests if the physical pages are ordered |
1193 | * properly. | 1219 | * properly. |
1194 | */ | 1220 | */ |
1195 | if (likely(cold == 0)) | 1221 | if (likely(!cold)) |
1196 | list_add(&page->lru, list); | 1222 | list_add(&page->lru, list); |
1197 | else | 1223 | else |
1198 | list_add_tail(&page->lru, list); | 1224 | list_add_tail(&page->lru, list); |
1199 | if (IS_ENABLED(CONFIG_CMA)) { | ||
1200 | mt = get_pageblock_migratetype(page); | ||
1201 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) | ||
1202 | mt = migratetype; | ||
1203 | } | ||
1204 | set_freepage_migratetype(page, mt); | ||
1205 | list = &page->lru; | 1225 | list = &page->lru; |
1206 | if (is_migrate_cma(mt)) | 1226 | if (is_migrate_cma(get_freepage_migratetype(page))) |
1207 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 1227 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
1208 | -(1 << order)); | 1228 | -(1 << order)); |
1209 | } | 1229 | } |
@@ -1327,7 +1347,7 @@ void mark_free_pages(struct zone *zone) | |||
1327 | { | 1347 | { |
1328 | unsigned long pfn, max_zone_pfn; | 1348 | unsigned long pfn, max_zone_pfn; |
1329 | unsigned long flags; | 1349 | unsigned long flags; |
1330 | int order, t; | 1350 | unsigned int order, t; |
1331 | struct list_head *curr; | 1351 | struct list_head *curr; |
1332 | 1352 | ||
1333 | if (zone_is_empty(zone)) | 1353 | if (zone_is_empty(zone)) |
@@ -1359,19 +1379,20 @@ void mark_free_pages(struct zone *zone) | |||
1359 | 1379 | ||
1360 | /* | 1380 | /* |
1361 | * Free a 0-order page | 1381 | * Free a 0-order page |
1362 | * cold == 1 ? free a cold page : free a hot page | 1382 | * cold == true ? free a cold page : free a hot page |
1363 | */ | 1383 | */ |
1364 | void free_hot_cold_page(struct page *page, int cold) | 1384 | void free_hot_cold_page(struct page *page, bool cold) |
1365 | { | 1385 | { |
1366 | struct zone *zone = page_zone(page); | 1386 | struct zone *zone = page_zone(page); |
1367 | struct per_cpu_pages *pcp; | 1387 | struct per_cpu_pages *pcp; |
1368 | unsigned long flags; | 1388 | unsigned long flags; |
1389 | unsigned long pfn = page_to_pfn(page); | ||
1369 | int migratetype; | 1390 | int migratetype; |
1370 | 1391 | ||
1371 | if (!free_pages_prepare(page, 0)) | 1392 | if (!free_pages_prepare(page, 0)) |
1372 | return; | 1393 | return; |
1373 | 1394 | ||
1374 | migratetype = get_pageblock_migratetype(page); | 1395 | migratetype = get_pfnblock_migratetype(page, pfn); |
1375 | set_freepage_migratetype(page, migratetype); | 1396 | set_freepage_migratetype(page, migratetype); |
1376 | local_irq_save(flags); | 1397 | local_irq_save(flags); |
1377 | __count_vm_event(PGFREE); | 1398 | __count_vm_event(PGFREE); |
@@ -1385,17 +1406,17 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1385 | */ | 1406 | */ |
1386 | if (migratetype >= MIGRATE_PCPTYPES) { | 1407 | if (migratetype >= MIGRATE_PCPTYPES) { |
1387 | if (unlikely(is_migrate_isolate(migratetype))) { | 1408 | if (unlikely(is_migrate_isolate(migratetype))) { |
1388 | free_one_page(zone, page, 0, migratetype); | 1409 | free_one_page(zone, page, pfn, 0, migratetype); |
1389 | goto out; | 1410 | goto out; |
1390 | } | 1411 | } |
1391 | migratetype = MIGRATE_MOVABLE; | 1412 | migratetype = MIGRATE_MOVABLE; |
1392 | } | 1413 | } |
1393 | 1414 | ||
1394 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1415 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1395 | if (cold) | 1416 | if (!cold) |
1396 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1397 | else | ||
1398 | list_add(&page->lru, &pcp->lists[migratetype]); | 1417 | list_add(&page->lru, &pcp->lists[migratetype]); |
1418 | else | ||
1419 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1399 | pcp->count++; | 1420 | pcp->count++; |
1400 | if (pcp->count >= pcp->high) { | 1421 | if (pcp->count >= pcp->high) { |
1401 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1422 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
@@ -1410,7 +1431,7 @@ out: | |||
1410 | /* | 1431 | /* |
1411 | * Free a list of 0-order pages | 1432 | * Free a list of 0-order pages |
1412 | */ | 1433 | */ |
1413 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1434 | void free_hot_cold_page_list(struct list_head *list, bool cold) |
1414 | { | 1435 | { |
1415 | struct page *page, *next; | 1436 | struct page *page, *next; |
1416 | 1437 | ||
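
free_hot_cold_page() and free_hot_cold_page_list() now take a bool instead of an int for the cold flag, and buffered_rmqueue() derives it directly from __GFP_COLD. Semantically nothing moves: pages freed "hot" (cold == false) go to the head of the per-CPU list and are handed out again first, while "cold" pages go to the tail. A hedged sketch of the two calls after the conversion (function and variable names are placeholders):

/* Sketch only: order-0 freeing after the int -> bool conversion. */
static void example_release_pages(struct page *warm_page,
                                  struct list_head *cold_pages)
{
        /* Recently written page: keep it cache-hot at the list head. */
        free_hot_cold_page(warm_page, false);

        /* Pages whose contents no longer matter: cold end of the list. */
        free_hot_cold_page_list(cold_pages, true);
}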
@@ -1522,12 +1543,12 @@ int split_free_page(struct page *page) | |||
1522 | */ | 1543 | */ |
1523 | static inline | 1544 | static inline |
1524 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1545 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1525 | struct zone *zone, int order, gfp_t gfp_flags, | 1546 | struct zone *zone, unsigned int order, |
1526 | int migratetype) | 1547 | gfp_t gfp_flags, int migratetype) |
1527 | { | 1548 | { |
1528 | unsigned long flags; | 1549 | unsigned long flags; |
1529 | struct page *page; | 1550 | struct page *page; |
1530 | int cold = !!(gfp_flags & __GFP_COLD); | 1551 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1531 | 1552 | ||
1532 | again: | 1553 | again: |
1533 | if (likely(order == 0)) { | 1554 | if (likely(order == 0)) { |
@@ -1572,7 +1593,7 @@ again: | |||
1572 | if (!page) | 1593 | if (!page) |
1573 | goto failed; | 1594 | goto failed; |
1574 | __mod_zone_freepage_state(zone, -(1 << order), | 1595 | __mod_zone_freepage_state(zone, -(1 << order), |
1575 | get_pageblock_migratetype(page)); | 1596 | get_freepage_migratetype(page)); |
1576 | } | 1597 | } |
1577 | 1598 | ||
1578 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1599 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
@@ -1672,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1672 | * Return true if free pages are above 'mark'. This takes into account the order | 1693 | * Return true if free pages are above 'mark'. This takes into account the order |
1673 | * of the allocation. | 1694 | * of the allocation. |
1674 | */ | 1695 | */ |
1675 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1696 | static bool __zone_watermark_ok(struct zone *z, unsigned int order, |
1676 | int classzone_idx, int alloc_flags, long free_pages) | 1697 | unsigned long mark, int classzone_idx, int alloc_flags, |
1698 | long free_pages) | ||
1677 | { | 1699 | { |
1678 | /* free_pages my go negative - that's OK */ | 1700 | /* free_pages my go negative - that's OK */ |
1679 | long min = mark; | 1701 | long min = mark; |
@@ -1707,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1707 | return true; | 1729 | return true; |
1708 | } | 1730 | } |
1709 | 1731 | ||
1710 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1732 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
1711 | int classzone_idx, int alloc_flags) | 1733 | int classzone_idx, int alloc_flags) |
1712 | { | 1734 | { |
1713 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1735 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1714 | zone_page_state(z, NR_FREE_PAGES)); | 1736 | zone_page_state(z, NR_FREE_PAGES)); |
1715 | } | 1737 | } |
1716 | 1738 | ||
1717 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1739 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
1718 | int classzone_idx, int alloc_flags) | 1740 | unsigned long mark, int classzone_idx, int alloc_flags) |
1719 | { | 1741 | { |
1720 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1742 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1721 | 1743 | ||
@@ -1850,18 +1872,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) | |||
1850 | 1872 | ||
1851 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 1873 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
1852 | { | 1874 | { |
1853 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | 1875 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < |
1854 | } | 1876 | RECLAIM_DISTANCE; |
1855 | |||
1856 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1857 | { | ||
1858 | int i; | ||
1859 | |||
1860 | for_each_node_state(i, N_MEMORY) | ||
1861 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | ||
1862 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1863 | else | ||
1864 | zone_reclaim_mode = 1; | ||
1865 | } | 1877 | } |
1866 | 1878 | ||
1867 | #else /* CONFIG_NUMA */ | 1879 | #else /* CONFIG_NUMA */ |
@@ -1895,9 +1907,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | |||
1895 | return true; | 1907 | return true; |
1896 | } | 1908 | } |
1897 | 1909 | ||
1898 | static inline void init_zone_allows_reclaim(int nid) | ||
1899 | { | ||
1900 | } | ||
1901 | #endif /* CONFIG_NUMA */ | 1910 | #endif /* CONFIG_NUMA */ |
1902 | 1911 | ||
1903 | /* | 1912 | /* |
@@ -1907,17 +1916,17 @@ static inline void init_zone_allows_reclaim(int nid) | |||
1907 | static struct page * | 1916 | static struct page * |
1908 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1917 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1909 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1918 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1910 | struct zone *preferred_zone, int migratetype) | 1919 | struct zone *preferred_zone, int classzone_idx, int migratetype) |
1911 | { | 1920 | { |
1912 | struct zoneref *z; | 1921 | struct zoneref *z; |
1913 | struct page *page = NULL; | 1922 | struct page *page = NULL; |
1914 | int classzone_idx; | ||
1915 | struct zone *zone; | 1923 | struct zone *zone; |
1916 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1924 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1917 | int zlc_active = 0; /* set if using zonelist_cache */ | 1925 | int zlc_active = 0; /* set if using zonelist_cache */ |
1918 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1926 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1927 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && | ||
1928 | (gfp_mask & __GFP_WRITE); | ||
1919 | 1929 | ||
1920 | classzone_idx = zone_idx(preferred_zone); | ||
1921 | zonelist_scan: | 1930 | zonelist_scan: |
1922 | /* | 1931 | /* |
1923 | * Scan zonelist, looking for a zone with enough free. | 1932 | * Scan zonelist, looking for a zone with enough free. |
@@ -1930,12 +1939,10 @@ zonelist_scan: | |||
1930 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 1939 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1931 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1932 | continue; | 1941 | continue; |
1933 | if ((alloc_flags & ALLOC_CPUSET) && | 1942 | if (cpusets_enabled() && |
1943 | (alloc_flags & ALLOC_CPUSET) && | ||
1934 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1944 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1935 | continue; | 1945 | continue; |
1936 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1937 | if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) | ||
1938 | goto try_this_zone; | ||
1939 | /* | 1946 | /* |
1940 | * Distribute pages in proportion to the individual | 1947 | * Distribute pages in proportion to the individual |
1941 | * zone size to ensure fair page aging. The zone a | 1948 | * zone size to ensure fair page aging. The zone a |
@@ -1974,15 +1981,19 @@ zonelist_scan: | |||
1974 | * will require awareness of zones in the | 1981 | * will require awareness of zones in the |
1975 | * dirty-throttling and the flusher threads. | 1982 | * dirty-throttling and the flusher threads. |
1976 | */ | 1983 | */ |
1977 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1984 | if (consider_zone_dirty && !zone_dirty_ok(zone)) |
1978 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1985 | continue; |
1979 | goto this_zone_full; | ||
1980 | 1986 | ||
1981 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1987 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1982 | if (!zone_watermark_ok(zone, order, mark, | 1988 | if (!zone_watermark_ok(zone, order, mark, |
1983 | classzone_idx, alloc_flags)) { | 1989 | classzone_idx, alloc_flags)) { |
1984 | int ret; | 1990 | int ret; |
1985 | 1991 | ||
1992 | /* Checked here to keep the fast path fast */ | ||
1993 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1994 | if (alloc_flags & ALLOC_NO_WATERMARKS) | ||
1995 | goto try_this_zone; | ||
1996 | |||
1986 | if (IS_ENABLED(CONFIG_NUMA) && | 1997 | if (IS_ENABLED(CONFIG_NUMA) && |
1987 | !did_zlc_setup && nr_online_nodes > 1) { | 1998 | !did_zlc_setup && nr_online_nodes > 1) { |
1988 | /* | 1999 | /* |
@@ -2044,7 +2055,7 @@ try_this_zone: | |||
2044 | if (page) | 2055 | if (page) |
2045 | break; | 2056 | break; |
2046 | this_zone_full: | 2057 | this_zone_full: |
2047 | if (IS_ENABLED(CONFIG_NUMA)) | 2058 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2048 | zlc_mark_zone_full(zonelist, z); | 2059 | zlc_mark_zone_full(zonelist, z); |
2049 | } | 2060 | } |
2050 | 2061 | ||
@@ -2173,7 +2184,7 @@ static inline struct page * | |||
2173 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2184 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2174 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2185 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2175 | nodemask_t *nodemask, struct zone *preferred_zone, | 2186 | nodemask_t *nodemask, struct zone *preferred_zone, |
2176 | int migratetype) | 2187 | int classzone_idx, int migratetype) |
2177 | { | 2188 | { |
2178 | struct page *page; | 2189 | struct page *page; |
2179 | 2190 | ||
@@ -2191,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2191 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2202 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2192 | order, zonelist, high_zoneidx, | 2203 | order, zonelist, high_zoneidx, |
2193 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2204 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2194 | preferred_zone, migratetype); | 2205 | preferred_zone, classzone_idx, migratetype); |
2195 | if (page) | 2206 | if (page) |
2196 | goto out; | 2207 | goto out; |
2197 | 2208 | ||
@@ -2226,7 +2237,7 @@ static struct page * | |||
2226 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2237 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2227 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2238 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2228 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2239 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2229 | int migratetype, bool sync_migration, | 2240 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2230 | bool *contended_compaction, bool *deferred_compaction, | 2241 | bool *contended_compaction, bool *deferred_compaction, |
2231 | unsigned long *did_some_progress) | 2242 | unsigned long *did_some_progress) |
2232 | { | 2243 | { |
@@ -2240,7 +2251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2240 | 2251 | ||
2241 | current->flags |= PF_MEMALLOC; | 2252 | current->flags |= PF_MEMALLOC; |
2242 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2253 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2243 | nodemask, sync_migration, | 2254 | nodemask, mode, |
2244 | contended_compaction); | 2255 | contended_compaction); |
2245 | current->flags &= ~PF_MEMALLOC; | 2256 | current->flags &= ~PF_MEMALLOC; |
2246 | 2257 | ||
@@ -2254,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2254 | page = get_page_from_freelist(gfp_mask, nodemask, | 2265 | page = get_page_from_freelist(gfp_mask, nodemask, |
2255 | order, zonelist, high_zoneidx, | 2266 | order, zonelist, high_zoneidx, |
2256 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2267 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2257 | preferred_zone, migratetype); | 2268 | preferred_zone, classzone_idx, migratetype); |
2258 | if (page) { | 2269 | if (page) { |
2259 | preferred_zone->compact_blockskip_flush = false; | 2270 | preferred_zone->compact_blockskip_flush = false; |
2260 | compaction_defer_reset(preferred_zone, order, true); | 2271 | compaction_defer_reset(preferred_zone, order, true); |
@@ -2273,7 +2284,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2273 | * As async compaction considers a subset of pageblocks, only | 2284 | * As async compaction considers a subset of pageblocks, only |
2274 | * defer if the failure was a sync compaction failure. | 2285 | * defer if the failure was a sync compaction failure. |
2275 | */ | 2286 | */ |
2276 | if (sync_migration) | 2287 | if (mode != MIGRATE_ASYNC) |
2277 | defer_compaction(preferred_zone, order); | 2288 | defer_compaction(preferred_zone, order); |
2278 | 2289 | ||
2279 | cond_resched(); | 2290 | cond_resched(); |
@@ -2286,9 +2297,9 @@ static inline struct page * | |||
2286 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2297 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2287 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2298 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2288 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2299 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2289 | int migratetype, bool sync_migration, | 2300 | int classzone_idx, int migratetype, |
2290 | bool *contended_compaction, bool *deferred_compaction, | 2301 | enum migrate_mode mode, bool *contended_compaction, |
2291 | unsigned long *did_some_progress) | 2302 | bool *deferred_compaction, unsigned long *did_some_progress) |
2292 | { | 2303 | { |
2293 | return NULL; | 2304 | return NULL; |
2294 | } | 2305 | } |
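These hunks replace the `bool sync_migration` argument with an `enum migrate_mode`, so the compaction path can distinguish asynchronous, light-synchronous, and fully synchronous behaviour instead of a two-way flag. A minimal standalone C sketch of the same refactor; the enumerator names follow the kernel's include/linux/migrate_mode.h, while `compact()` and `main()` are purely illustrative:

```c
#include <stdio.h>

/* Three-way mode replacing the old "bool sync" parameter. */
enum migrate_mode {
	MIGRATE_ASYNC,		/* never block on page locks or writeback */
	MIGRATE_SYNC_LIGHT,	/* may block on most locks, but not writeback */
	MIGRATE_SYNC,		/* may block on everything; the most expensive */
};

/* Illustrative consumer: the old "if (sync)" test becomes a mode check. */
static void compact(enum migrate_mode mode)
{
	if (mode != MIGRATE_ASYNC)
		printf("failure may defer further compaction\n");
	else
		printf("async pass: give up early rather than block\n");
}

int main(void)
{
	compact(MIGRATE_ASYNC);		/* first attempt in the slowpath */
	compact(MIGRATE_SYNC_LIGHT);	/* later retries after direct reclaim */
	return 0;
}
```
The three-valued mode is what lets the hunk above write `if (mode != MIGRATE_ASYNC)` where the old code tested the boolean.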
@@ -2327,7 +2338,7 @@ static inline struct page * | |||
2327 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2338 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2328 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2339 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2329 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2340 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2330 | int migratetype, unsigned long *did_some_progress) | 2341 | int classzone_idx, int migratetype, unsigned long *did_some_progress) |
2331 | { | 2342 | { |
2332 | struct page *page = NULL; | 2343 | struct page *page = NULL; |
2333 | bool drained = false; | 2344 | bool drained = false; |
@@ -2345,7 +2356,8 @@ retry: | |||
2345 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2356 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2346 | zonelist, high_zoneidx, | 2357 | zonelist, high_zoneidx, |
2347 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2358 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2348 | preferred_zone, migratetype); | 2359 | preferred_zone, classzone_idx, |
2360 | migratetype); | ||
2349 | 2361 | ||
2350 | /* | 2362 | /* |
2351 | * If an allocation failed after direct reclaim, it could be because | 2363 | * If an allocation failed after direct reclaim, it could be because |
@@ -2368,14 +2380,14 @@ static inline struct page * | |||
2368 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2380 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2369 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2381 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2370 | nodemask_t *nodemask, struct zone *preferred_zone, | 2382 | nodemask_t *nodemask, struct zone *preferred_zone, |
2371 | int migratetype) | 2383 | int classzone_idx, int migratetype) |
2372 | { | 2384 | { |
2373 | struct page *page; | 2385 | struct page *page; |
2374 | 2386 | ||
2375 | do { | 2387 | do { |
2376 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2388 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2377 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2389 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2378 | preferred_zone, migratetype); | 2390 | preferred_zone, classzone_idx, migratetype); |
2379 | 2391 | ||
2380 | if (!page && gfp_mask & __GFP_NOFAIL) | 2392 | if (!page && gfp_mask & __GFP_NOFAIL) |
2381 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2393 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
@@ -2476,14 +2488,14 @@ static inline struct page * | |||
2476 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2488 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2477 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2489 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2478 | nodemask_t *nodemask, struct zone *preferred_zone, | 2490 | nodemask_t *nodemask, struct zone *preferred_zone, |
2479 | int migratetype) | 2491 | int classzone_idx, int migratetype) |
2480 | { | 2492 | { |
2481 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2493 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2482 | struct page *page = NULL; | 2494 | struct page *page = NULL; |
2483 | int alloc_flags; | 2495 | int alloc_flags; |
2484 | unsigned long pages_reclaimed = 0; | 2496 | unsigned long pages_reclaimed = 0; |
2485 | unsigned long did_some_progress; | 2497 | unsigned long did_some_progress; |
2486 | bool sync_migration = false; | 2498 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2487 | bool deferred_compaction = false; | 2499 | bool deferred_compaction = false; |
2488 | bool contended_compaction = false; | 2500 | bool contended_compaction = false; |
2489 | 2501 | ||
@@ -2525,15 +2537,18 @@ restart: | |||
2525 | * Find the true preferred zone if the allocation is unconstrained by | 2537 | * Find the true preferred zone if the allocation is unconstrained by |
2526 | * cpusets. | 2538 | * cpusets. |
2527 | */ | 2539 | */ |
2528 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2540 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { |
2529 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2541 | struct zoneref *preferred_zoneref; |
2530 | &preferred_zone); | 2542 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2543 | NULL, &preferred_zone); | ||
2544 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2545 | } | ||
2531 | 2546 | ||
2532 | rebalance: | 2547 | rebalance: |
2533 | /* This is the last chance, in general, before the goto nopage. */ | 2548 | /* This is the last chance, in general, before the goto nopage. */ |
2534 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2549 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2535 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2550 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2536 | preferred_zone, migratetype); | 2551 | preferred_zone, classzone_idx, migratetype); |
2537 | if (page) | 2552 | if (page) |
2538 | goto got_pg; | 2553 | goto got_pg; |
2539 | 2554 | ||
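The hunk above derives `classzone_idx` once from the preferred zoneref (`zonelist_zone_idx(preferred_zoneref)`) and threads it through every helper, rather than having each watermark check rediscover the index on the hot path. A hedged userspace sketch of the general pattern of hoisting a derived value out of the per-zone loop; `struct zone`, `watermark_ok()` and the numbers are illustrative, not the kernel API:

```c
#include <stdio.h>
#include <stddef.h>

struct zone {
	const char *name;
	long free_pages;
	long watermark[3];	/* one threshold per class zone index */
};

/* The check takes the pre-computed class zone index as a parameter
 * instead of re-deriving it from the zonelist on every call. */
static int watermark_ok(const struct zone *z, size_t classzone_idx)
{
	return z->free_pages > z->watermark[classzone_idx];
}

int main(void)
{
	struct zone zones[] = {
		{ "DMA32",  900, { 100, 200, 400 } },
		{ "Normal", 150, { 100, 200, 400 } },
	};
	size_t classzone_idx = 1;	/* computed once per allocation attempt */

	for (size_t i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		printf("%s: %s\n", zones[i].name,
		       watermark_ok(&zones[i], classzone_idx)
				? "ok" : "below watermark");
	return 0;
}
```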
@@ -2548,7 +2563,7 @@ rebalance: | |||
2548 | 2563 | ||
2549 | page = __alloc_pages_high_priority(gfp_mask, order, | 2564 | page = __alloc_pages_high_priority(gfp_mask, order, |
2550 | zonelist, high_zoneidx, nodemask, | 2565 | zonelist, high_zoneidx, nodemask, |
2551 | preferred_zone, migratetype); | 2566 | preferred_zone, classzone_idx, migratetype); |
2552 | if (page) { | 2567 | if (page) { |
2553 | goto got_pg; | 2568 | goto got_pg; |
2554 | } | 2569 | } |
@@ -2577,17 +2592,23 @@ rebalance: | |||
2577 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2592 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2578 | * attempts after direct reclaim are synchronous | 2593 | * attempts after direct reclaim are synchronous |
2579 | */ | 2594 | */ |
2580 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2595 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2581 | zonelist, high_zoneidx, | 2596 | high_zoneidx, nodemask, alloc_flags, |
2582 | nodemask, | 2597 | preferred_zone, |
2583 | alloc_flags, preferred_zone, | 2598 | classzone_idx, migratetype, |
2584 | migratetype, sync_migration, | 2599 | migration_mode, &contended_compaction, |
2585 | &contended_compaction, | ||
2586 | &deferred_compaction, | 2600 | &deferred_compaction, |
2587 | &did_some_progress); | 2601 | &did_some_progress); |
2588 | if (page) | 2602 | if (page) |
2589 | goto got_pg; | 2603 | goto got_pg; |
2590 | sync_migration = true; | 2604 | |
2605 | /* | ||
2606 | * It can become very expensive to allocate transparent hugepages at | ||
2607 | * fault, so use asynchronous memory compaction for THP unless it is | ||
2608 | * khugepaged trying to collapse. | ||
2609 | */ | ||
2610 | if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) | ||
2611 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
2591 | 2612 | ||
2592 | /* | 2613 | /* |
2593 | * If compaction is deferred for high-order allocations, it is because | 2614 | * If compaction is deferred for high-order allocations, it is because |
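The new comment and test keep transparent-hugepage faults on asynchronous compaction only; everything else, including khugepaged (a kernel thread, hence PF_KTHREAD), is allowed to escalate to MIGRATE_SYNC_LIGHT for the retry. A tiny sketch of that gate with stand-in flag values; the real __GFP_NO_KSWAPD and PF_KTHREAD bits live in gfp.h and sched.h and are not these numbers:

```c
#include <stdio.h>

#define GFP_NO_KSWAPD	0x1u	/* placeholder: set by THP fault allocations */
#define PF_KTHREAD	0x2u	/* placeholder: set for kernel threads */

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT };

/* Escalate to sync-light unless this is a THP fault from a user task. */
static enum migrate_mode next_mode(unsigned int gfp, unsigned int task_flags)
{
	if (!(gfp & GFP_NO_KSWAPD) || (task_flags & PF_KTHREAD))
		return MIGRATE_SYNC_LIGHT;
	return MIGRATE_ASYNC;
}

int main(void)
{
	printf("user THP fault:    %d\n", next_mode(GFP_NO_KSWAPD, 0));
	printf("khugepaged:        %d\n", next_mode(GFP_NO_KSWAPD, PF_KTHREAD));
	printf("other high-order:  %d\n", next_mode(0, 0));
	return 0;
}
```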
@@ -2604,7 +2625,8 @@ rebalance: | |||
2604 | zonelist, high_zoneidx, | 2625 | zonelist, high_zoneidx, |
2605 | nodemask, | 2626 | nodemask, |
2606 | alloc_flags, preferred_zone, | 2627 | alloc_flags, preferred_zone, |
2607 | migratetype, &did_some_progress); | 2628 | classzone_idx, migratetype, |
2629 | &did_some_progress); | ||
2608 | if (page) | 2630 | if (page) |
2609 | goto got_pg; | 2631 | goto got_pg; |
2610 | 2632 | ||
@@ -2623,7 +2645,7 @@ rebalance: | |||
2623 | page = __alloc_pages_may_oom(gfp_mask, order, | 2645 | page = __alloc_pages_may_oom(gfp_mask, order, |
2624 | zonelist, high_zoneidx, | 2646 | zonelist, high_zoneidx, |
2625 | nodemask, preferred_zone, | 2647 | nodemask, preferred_zone, |
2626 | migratetype); | 2648 | classzone_idx, migratetype); |
2627 | if (page) | 2649 | if (page) |
2628 | goto got_pg; | 2650 | goto got_pg; |
2629 | 2651 | ||
@@ -2662,12 +2684,11 @@ rebalance: | |||
2662 | * direct reclaim and reclaim/compaction depends on compaction | 2684 | * direct reclaim and reclaim/compaction depends on compaction |
2663 | * being called after reclaim so call directly if necessary | 2685 | * being called after reclaim so call directly if necessary |
2664 | */ | 2686 | */ |
2665 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2687 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2666 | zonelist, high_zoneidx, | 2688 | high_zoneidx, nodemask, alloc_flags, |
2667 | nodemask, | 2689 | preferred_zone, |
2668 | alloc_flags, preferred_zone, | 2690 | classzone_idx, migratetype, |
2669 | migratetype, sync_migration, | 2691 | migration_mode, &contended_compaction, |
2670 | &contended_compaction, | ||
2671 | &deferred_compaction, | 2692 | &deferred_compaction, |
2672 | &did_some_progress); | 2693 | &did_some_progress); |
2673 | if (page) | 2694 | if (page) |
@@ -2693,11 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2693 | { | 2714 | { |
2694 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2715 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2695 | struct zone *preferred_zone; | 2716 | struct zone *preferred_zone; |
2717 | struct zoneref *preferred_zoneref; | ||
2696 | struct page *page = NULL; | 2718 | struct page *page = NULL; |
2697 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2719 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2698 | unsigned int cpuset_mems_cookie; | 2720 | unsigned int cpuset_mems_cookie; |
2699 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2721 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2700 | struct mem_cgroup *memcg = NULL; | 2722 | int classzone_idx; |
2701 | 2723 | ||
2702 | gfp_mask &= gfp_allowed_mask; | 2724 | gfp_mask &= gfp_allowed_mask; |
2703 | 2725 | ||
@@ -2716,22 +2738,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2716 | if (unlikely(!zonelist->_zonerefs->zone)) | 2738 | if (unlikely(!zonelist->_zonerefs->zone)) |
2717 | return NULL; | 2739 | return NULL; |
2718 | 2740 | ||
2719 | /* | ||
2720 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2721 | * verified in the (always inline) callee | ||
2722 | */ | ||
2723 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2724 | return NULL; | ||
2725 | |||
2726 | retry_cpuset: | 2741 | retry_cpuset: |
2727 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2742 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2728 | 2743 | ||
2729 | /* The preferred zone is used for statistics later */ | 2744 | /* The preferred zone is used for statistics later */ |
2730 | first_zones_zonelist(zonelist, high_zoneidx, | 2745 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2731 | nodemask ? : &cpuset_current_mems_allowed, | 2746 | nodemask ? : &cpuset_current_mems_allowed, |
2732 | &preferred_zone); | 2747 | &preferred_zone); |
2733 | if (!preferred_zone) | 2748 | if (!preferred_zone) |
2734 | goto out; | 2749 | goto out; |
2750 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2735 | 2751 | ||
2736 | #ifdef CONFIG_CMA | 2752 | #ifdef CONFIG_CMA |
2737 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2753 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
@@ -2741,7 +2757,7 @@ retry: | |||
2741 | /* First allocation attempt */ | 2757 | /* First allocation attempt */ |
2742 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2758 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2743 | zonelist, high_zoneidx, alloc_flags, | 2759 | zonelist, high_zoneidx, alloc_flags, |
2744 | preferred_zone, migratetype); | 2760 | preferred_zone, classzone_idx, migratetype); |
2745 | if (unlikely(!page)) { | 2761 | if (unlikely(!page)) { |
2746 | /* | 2762 | /* |
2747 | * The first pass makes sure allocations are spread | 2763 | * The first pass makes sure allocations are spread |
@@ -2767,7 +2783,7 @@ retry: | |||
2767 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2783 | gfp_mask = memalloc_noio_flags(gfp_mask); |
2768 | page = __alloc_pages_slowpath(gfp_mask, order, | 2784 | page = __alloc_pages_slowpath(gfp_mask, order, |
2769 | zonelist, high_zoneidx, nodemask, | 2785 | zonelist, high_zoneidx, nodemask, |
2770 | preferred_zone, migratetype); | 2786 | preferred_zone, classzone_idx, migratetype); |
2771 | } | 2787 | } |
2772 | 2788 | ||
2773 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2789 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
@@ -2782,8 +2798,6 @@ out: | |||
2782 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2798 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2783 | goto retry_cpuset; | 2799 | goto retry_cpuset; |
2784 | 2800 | ||
2785 | memcg_kmem_commit_charge(page, memcg, order); | ||
2786 | |||
2787 | return page; | 2801 | return page; |
2788 | } | 2802 | } |
2789 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2803 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
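The retry_cpuset loop kept in this function follows a seqcount-style pattern: sample a cookie with read_mems_allowed_begin(), run the allocation locklessly, and retry if read_mems_allowed_retry() observes that the cpuset's memory mask changed underneath. A self-contained sketch of that retry idiom using C11 atomics; it is single-threaded and the toy counter stands in for the task's mems_allowed sequence count:

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint mems_seq;	/* bumped by a (hypothetical) cpuset update */

static unsigned int read_begin(void)       { return atomic_load(&mems_seq); }
static int read_retry(unsigned int cookie) { return atomic_load(&mems_seq) != cookie; }

int main(void)
{
	int attempts = 0;
	unsigned int cookie;

	do {
		cookie = read_begin();
		attempts++;
		/* ... lockless allocation work would go here ... */
		if (attempts == 1)	/* simulate one concurrent mask update */
			atomic_fetch_add(&mems_seq, 1);
	} while (read_retry(cookie));

	printf("allocation succeeded after %d attempt(s)\n", attempts);
	return 0;
}
```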
@@ -2818,7 +2832,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
2818 | { | 2832 | { |
2819 | if (put_page_testzero(page)) { | 2833 | if (put_page_testzero(page)) { |
2820 | if (order == 0) | 2834 | if (order == 0) |
2821 | free_hot_cold_page(page, 0); | 2835 | free_hot_cold_page(page, false); |
2822 | else | 2836 | else |
2823 | __free_pages_ok(page, order); | 2837 | __free_pages_ok(page, order); |
2824 | } | 2838 | } |
@@ -2837,27 +2851,51 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2837 | EXPORT_SYMBOL(free_pages); | 2851 | EXPORT_SYMBOL(free_pages); |
2838 | 2852 | ||
2839 | /* | 2853 | /* |
2840 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | 2854 | * alloc_kmem_pages charges newly allocated pages to the kmem resource counter |
2841 | * pages allocated with __GFP_KMEMCG. | 2855 | * of the current memory cgroup. |
2842 | * | ||
2843 | * Those pages are accounted to a particular memcg, embedded in the | ||
2844 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2845 | * for that information only to find out that it is NULL for users who have no | ||
2846 | * interest in that whatsoever, we provide these functions. | ||
2847 | * | 2856 | * |
2848 | * The caller knows better which flags it relies on. | 2857 | * It should be used when the caller would like to use kmalloc, but since the |
2858 | * allocation is large, it has to fall back to the page allocator. | ||
2849 | */ | 2859 | */ |
2850 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | 2860 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) |
2861 | { | ||
2862 | struct page *page; | ||
2863 | struct mem_cgroup *memcg = NULL; | ||
2864 | |||
2865 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2866 | return NULL; | ||
2867 | page = alloc_pages(gfp_mask, order); | ||
2868 | memcg_kmem_commit_charge(page, memcg, order); | ||
2869 | return page; | ||
2870 | } | ||
2871 | |||
2872 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | ||
2873 | { | ||
2874 | struct page *page; | ||
2875 | struct mem_cgroup *memcg = NULL; | ||
2876 | |||
2877 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2878 | return NULL; | ||
2879 | page = alloc_pages_node(nid, gfp_mask, order); | ||
2880 | memcg_kmem_commit_charge(page, memcg, order); | ||
2881 | return page; | ||
2882 | } | ||
2883 | |||
2884 | /* | ||
2885 | * __free_kmem_pages and free_kmem_pages will free pages allocated with | ||
2886 | * alloc_kmem_pages. | ||
2887 | */ | ||
2888 | void __free_kmem_pages(struct page *page, unsigned int order) | ||
2851 | { | 2889 | { |
2852 | memcg_kmem_uncharge_pages(page, order); | 2890 | memcg_kmem_uncharge_pages(page, order); |
2853 | __free_pages(page, order); | 2891 | __free_pages(page, order); |
2854 | } | 2892 | } |
2855 | 2893 | ||
2856 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | 2894 | void free_kmem_pages(unsigned long addr, unsigned int order) |
2857 | { | 2895 | { |
2858 | if (addr != 0) { | 2896 | if (addr != 0) { |
2859 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 2897 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
2860 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | 2898 | __free_kmem_pages(virt_to_page((void *)addr), order); |
2861 | } | 2899 | } |
2862 | } | 2900 | } |
2863 | 2901 | ||
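The new alloc_kmem_pages()/free_kmem_pages() pair folds the memcg kmem accounting that used to sit in __alloc_pages_nodemask() into dedicated wrappers: charge the cgroup, allocate, commit the charge to the page, and uncharge again on free. A hedged userspace analogue of the same reserve/act/release wrapper shape; the plain counter stands in for the cgroup's kmem counter and the error handling is simplified:

```c
#include <stdio.h>
#include <stdlib.h>

static long kmem_charged;	/* stand-in for the cgroup kmem counter */

static void *alloc_accounted(size_t size)
{
	void *p;

	kmem_charged += (long)size;	/* charge before allocating */
	p = malloc(size);
	if (!p)
		kmem_charged -= (long)size;	/* roll back on failure */
	return p;
}

static void free_accounted(void *p, size_t size)
{
	if (!p)
		return;
	kmem_charged -= (long)size;	/* uncharge, then release */
	free(p);
}

int main(void)
{
	void *buf = alloc_accounted(4096);

	printf("charged after alloc: %ld bytes\n", kmem_charged);
	free_accounted(buf, 4096);
	printf("charged after free:  %ld bytes\n", kmem_charged);
	return 0;
}
```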
@@ -4095,7 +4133,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4095 | 4133 | ||
4096 | static void __meminit zone_init_free_lists(struct zone *zone) | 4134 | static void __meminit zone_init_free_lists(struct zone *zone) |
4097 | { | 4135 | { |
4098 | int order, t; | 4136 | unsigned int order, t; |
4099 | for_each_migratetype_order(order, t) { | 4137 | for_each_migratetype_order(order, t) { |
4100 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 4138 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
4101 | zone->free_area[order].nr_free = 0; | 4139 | zone->free_area[order].nr_free = 0; |
@@ -4349,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, | |||
4349 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 4387 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
4350 | /* | 4388 | /* |
4351 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 4389 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
4352 | * Architectures may implement their own version but if add_active_range() | ||
4353 | * was used and there are no special requirements, this is a convenient | ||
4354 | * alternative | ||
4355 | */ | 4390 | */ |
4356 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 4391 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
4357 | { | 4392 | { |
@@ -4406,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4406 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4441 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4407 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid | 4442 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4408 | * | 4443 | * |
4409 | * If an architecture guarantees that all ranges registered with | 4444 | * If an architecture guarantees that all ranges registered contain no holes |
4410 | * add_active_ranges() contain no holes and may be freed, this | 4445 | and may be freed, this function may be used instead of calling |
4411 | * this function may be used instead of calling memblock_free_early_nid() | 4446 | * memblock_free_early_nid() manually. |
4412 | * manually. | ||
4413 | */ | 4447 | */ |
4414 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4448 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4415 | { | 4449 | { |
@@ -4431,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4431 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 4465 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
4432 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 4466 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
4433 | * | 4467 | * |
4434 | * If an architecture guarantees that all ranges registered with | 4468 | * If an architecture guarantees that all ranges registered contain no holes and may |
4435 | * add_active_ranges() contain no holes and may be freed, this | 4469 | * be freed, this function may be used instead of calling memory_present() manually. |
4436 | * function may be used instead of calling memory_present() manually. | ||
4437 | */ | 4470 | */ |
4438 | void __init sparse_memory_present_with_active_regions(int nid) | 4471 | void __init sparse_memory_present_with_active_regions(int nid) |
4439 | { | 4472 | { |
@@ -4451,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
4451 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 4484 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
4452 | * | 4485 | * |
4453 | * It returns the start and end page frame of a node based on information | 4486 | * It returns the start and end page frame of a node based on information |
4454 | * provided by an arch calling add_active_range(). If called for a node | 4487 | * provided by memblock_set_node(). If called for a node |
4455 | * with no available memory, a warning is printed and the start and end | 4488 | * with no available memory, a warning is printed and the start and end |
4456 | * PFNs will be 0. | 4489 | * PFNs will be 0. |
4457 | */ | 4490 | */ |
@@ -4921,8 +4954,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4921 | 4954 | ||
4922 | pgdat->node_id = nid; | 4955 | pgdat->node_id = nid; |
4923 | pgdat->node_start_pfn = node_start_pfn; | 4956 | pgdat->node_start_pfn = node_start_pfn; |
4924 | if (node_state(nid, N_MEMORY)) | ||
4925 | init_zone_allows_reclaim(nid); | ||
4926 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4957 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4927 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4958 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4928 | #endif | 4959 | #endif |
@@ -5030,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
5030 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 5061 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
5031 | * | 5062 | * |
5032 | * It returns the minimum PFN based on information provided via | 5063 | * It returns the minimum PFN based on information provided via |
5033 | * add_active_range(). | 5064 | * memblock_set_node(). |
5034 | */ | 5065 | */ |
5035 | unsigned long __init find_min_pfn_with_active_regions(void) | 5066 | unsigned long __init find_min_pfn_with_active_regions(void) |
5036 | { | 5067 | { |
@@ -5251,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
5251 | * @max_zone_pfn: an array of max PFNs for each zone | 5282 | * @max_zone_pfn: an array of max PFNs for each zone |
5252 | * | 5283 | * |
5253 | * This will call free_area_init_node() for each active node in the system. | 5284 | * This will call free_area_init_node() for each active node in the system. |
5254 | * Using the page ranges provided by add_active_range(), the size of each | 5285 | * Using the page ranges provided by memblock_set_node(), the size of each |
5255 | * zone in each node and their holes is calculated. If the maximum PFN | 5286 | * zone in each node and their holes is calculated. If the maximum PFN |
5256 | * between two adjacent zones match, it is assumed that the zone is empty. | 5287 | * between two adjacent zones match, it is assumed that the zone is empty. |
5257 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 5288 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
@@ -6009,53 +6040,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
6009 | * @end_bitidx: The last bit of interest | 6040 | * @end_bitidx: The last bit of interest |
6010 | * returns pageblock_bits flags | 6041 | * returns pageblock_bits flags |
6011 | */ | 6042 | */ |
6012 | unsigned long get_pageblock_flags_group(struct page *page, | 6043 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
6013 | int start_bitidx, int end_bitidx) | 6044 | unsigned long end_bitidx, |
6045 | unsigned long mask) | ||
6014 | { | 6046 | { |
6015 | struct zone *zone; | 6047 | struct zone *zone; |
6016 | unsigned long *bitmap; | 6048 | unsigned long *bitmap; |
6017 | unsigned long pfn, bitidx; | 6049 | unsigned long bitidx, word_bitidx; |
6018 | unsigned long flags = 0; | 6050 | unsigned long word; |
6019 | unsigned long value = 1; | ||
6020 | 6051 | ||
6021 | zone = page_zone(page); | 6052 | zone = page_zone(page); |
6022 | pfn = page_to_pfn(page); | ||
6023 | bitmap = get_pageblock_bitmap(zone, pfn); | 6053 | bitmap = get_pageblock_bitmap(zone, pfn); |
6024 | bitidx = pfn_to_bitidx(zone, pfn); | 6054 | bitidx = pfn_to_bitidx(zone, pfn); |
6055 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6056 | bitidx &= (BITS_PER_LONG-1); | ||
6025 | 6057 | ||
6026 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6058 | word = bitmap[word_bitidx]; |
6027 | if (test_bit(bitidx + start_bitidx, bitmap)) | 6059 | bitidx += end_bitidx; |
6028 | flags |= value; | 6060 | return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; |
6029 | |||
6030 | return flags; | ||
6031 | } | 6061 | } |
6032 | 6062 | ||
6033 | /** | 6063 | /** |
6034 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 6064 | * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
6035 | * @page: The page within the block of interest | 6065 | * @page: The page within the block of interest |
6036 | * @start_bitidx: The first bit of interest | 6066 | * @start_bitidx: The first bit of interest |
6037 | * @end_bitidx: The last bit of interest | 6067 | * @end_bitidx: The last bit of interest |
6038 | * @flags: The flags to set | 6068 | * @flags: The flags to set |
6039 | */ | 6069 | */ |
6040 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 6070 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
6041 | int start_bitidx, int end_bitidx) | 6071 | unsigned long pfn, |
6072 | unsigned long end_bitidx, | ||
6073 | unsigned long mask) | ||
6042 | { | 6074 | { |
6043 | struct zone *zone; | 6075 | struct zone *zone; |
6044 | unsigned long *bitmap; | 6076 | unsigned long *bitmap; |
6045 | unsigned long pfn, bitidx; | 6077 | unsigned long bitidx, word_bitidx; |
6046 | unsigned long value = 1; | 6078 | unsigned long old_word, word; |
6079 | |||
6080 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); | ||
6047 | 6081 | ||
6048 | zone = page_zone(page); | 6082 | zone = page_zone(page); |
6049 | pfn = page_to_pfn(page); | ||
6050 | bitmap = get_pageblock_bitmap(zone, pfn); | 6083 | bitmap = get_pageblock_bitmap(zone, pfn); |
6051 | bitidx = pfn_to_bitidx(zone, pfn); | 6084 | bitidx = pfn_to_bitidx(zone, pfn); |
6085 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6086 | bitidx &= (BITS_PER_LONG-1); | ||
6087 | |||
6052 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); | 6088 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); |
6053 | 6089 | ||
6054 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6090 | bitidx += end_bitidx; |
6055 | if (flags & value) | 6091 | mask <<= (BITS_PER_LONG - bitidx - 1); |
6056 | __set_bit(bitidx + start_bitidx, bitmap); | 6092 | flags <<= (BITS_PER_LONG - bitidx - 1); |
6057 | else | 6093 | |
6058 | __clear_bit(bitidx + start_bitidx, bitmap); | 6094 | word = ACCESS_ONCE(bitmap[word_bitidx]); |
6095 | for (;;) { | ||
6096 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | ||
6097 | if (word == old_word) | ||
6098 | break; | ||
6099 | word = old_word; | ||
6100 | } | ||
6059 | } | 6101 | } |
6060 | 6102 | ||
6061 | /* | 6103 | /* |
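get_pfnblock_flags_mask() and set_pfnblock_flags_mask() now work on a whole bitmap word at a time: read the word, shift and mask out the 4 bits that belong to the pageblock, and update them with a cmpxchg loop so concurrent updates to neighbouring pageblocks stored in the same word are not lost. A standalone C11 sketch of that read-modify-write; the 4-bits-per-block layout mirrors NR_PAGEBLOCK_BITS, but the single-word bitmap and the helper names are illustrative:

```c
#include <stdatomic.h>
#include <stdio.h>

#define BITS_PER_BLOCK	4UL	/* like NR_PAGEBLOCK_BITS */
#define BLOCK_MASK	((1UL << BITS_PER_BLOCK) - 1)

/* One bitmap word holding the flags of several pageblocks. */
static _Atomic unsigned long pageblock_word;

static unsigned long get_block_flags(unsigned int block)
{
	unsigned long word = atomic_load(&pageblock_word);
	unsigned int shift = (block * BITS_PER_BLOCK) % (8 * sizeof(unsigned long));

	return (word >> shift) & BLOCK_MASK;
}

static void set_block_flags(unsigned int block, unsigned long flags)
{
	unsigned int shift = (block * BITS_PER_BLOCK) % (8 * sizeof(unsigned long));
	unsigned long mask = BLOCK_MASK << shift;
	unsigned long word = atomic_load(&pageblock_word);
	unsigned long updated;

	/* Retry until no other updater changed the word in between. */
	do {
		updated = (word & ~mask) | ((flags & BLOCK_MASK) << shift);
	} while (!atomic_compare_exchange_weak(&pageblock_word, &word, updated));
}

int main(void)
{
	set_block_flags(0, 0x3);	/* e.g. a migratetype value */
	set_block_flags(1, 0x5);
	printf("block0=%lx block1=%lx\n", get_block_flags(0), get_block_flags(1));
	return 0;
}
```
Reading a whole word and failing the compare-exchange on interference is what lets the kernel drop the per-bit test_bit()/__set_bit() loop shown on the removed side of the hunk.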
@@ -6215,7 +6257,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6215 | cc->nr_migratepages -= nr_reclaimed; | 6257 | cc->nr_migratepages -= nr_reclaimed; |
6216 | 6258 | ||
6217 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, | 6259 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
6218 | 0, MIGRATE_SYNC, MR_CMA); | 6260 | NULL, 0, cc->mode, MR_CMA); |
6219 | } | 6261 | } |
6220 | if (ret < 0) { | 6262 | if (ret < 0) { |
6221 | putback_movable_pages(&cc->migratepages); | 6263 | putback_movable_pages(&cc->migratepages); |
@@ -6254,7 +6296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6254 | .nr_migratepages = 0, | 6296 | .nr_migratepages = 0, |
6255 | .order = -1, | 6297 | .order = -1, |
6256 | .zone = page_zone(pfn_to_page(start)), | 6298 | .zone = page_zone(pfn_to_page(start)), |
6257 | .sync = true, | 6299 | .mode = MIGRATE_SYNC, |
6258 | .ignore_skip_hint = true, | 6300 | .ignore_skip_hint = true, |
6259 | }; | 6301 | }; |
6260 | INIT_LIST_HEAD(&cc.migratepages); | 6302 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -6409,7 +6451,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6409 | { | 6451 | { |
6410 | struct page *page; | 6452 | struct page *page; |
6411 | struct zone *zone; | 6453 | struct zone *zone; |
6412 | int order, i; | 6454 | unsigned int order, i; |
6413 | unsigned long pfn; | 6455 | unsigned long pfn; |
6414 | unsigned long flags; | 6456 | unsigned long flags; |
6415 | /* find the first valid pfn */ | 6457 | /* find the first valid pfn */ |
@@ -6461,7 +6503,7 @@ bool is_free_buddy_page(struct page *page) | |||
6461 | struct zone *zone = page_zone(page); | 6503 | struct zone *zone = page_zone(page); |
6462 | unsigned long pfn = page_to_pfn(page); | 6504 | unsigned long pfn = page_to_pfn(page); |
6463 | unsigned long flags; | 6505 | unsigned long flags; |
6464 | int order; | 6506 | unsigned int order; |
6465 | 6507 | ||
6466 | spin_lock_irqsave(&zone->lock, flags); | 6508 | spin_lock_irqsave(&zone->lock, flags); |
6467 | for (order = 0; order < MAX_ORDER; order++) { | 6509 | for (order = 0; order < MAX_ORDER; order++) { |
diff --git a/mm/page_io.c b/mm/page_io.c
index 7c59ef681381..58b50d2901fe 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -248,11 +248,16 @@ out: | |||
248 | return ret; | 248 | return ret; |
249 | } | 249 | } |
250 | 250 | ||
251 | static sector_t swap_page_sector(struct page *page) | ||
252 | { | ||
253 | return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); | ||
254 | } | ||
255 | |||
251 | int __swap_writepage(struct page *page, struct writeback_control *wbc, | 256 | int __swap_writepage(struct page *page, struct writeback_control *wbc, |
252 | void (*end_write_func)(struct bio *, int)) | 257 | void (*end_write_func)(struct bio *, int)) |
253 | { | 258 | { |
254 | struct bio *bio; | 259 | struct bio *bio; |
255 | int ret = 0, rw = WRITE; | 260 | int ret, rw = WRITE; |
256 | struct swap_info_struct *sis = page_swap_info(page); | 261 | struct swap_info_struct *sis = page_swap_info(page); |
257 | 262 | ||
258 | if (sis->flags & SWP_FILE) { | 263 | if (sis->flags & SWP_FILE) { |
@@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
297 | return ret; | 302 | return ret; |
298 | } | 303 | } |
299 | 304 | ||
305 | ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); | ||
306 | if (!ret) { | ||
307 | count_vm_event(PSWPOUT); | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | ret = 0; | ||
300 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); | 312 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); |
301 | if (bio == NULL) { | 313 | if (bio == NULL) { |
302 | set_page_dirty(page); | 314 | set_page_dirty(page); |
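swap_page_sector() maps a page's offset in the swap area to a 512-byte sector number by shifting the page index left by PAGE_CACHE_SHIFT - 9 (a 4 KiB page spans eight sectors), and __swap_writepage() now tries the synchronous bdev_write_page() path first, only building a bio when the block driver does not provide it. A quick arithmetic sketch of the sector mapping, assuming the common 4 KiB page size:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed: 4 KiB pages */
#define SECTOR_SHIFT	9	/* 512-byte block-layer sectors */

static uint64_t swap_page_sector(uint64_t page_index)
{
	return page_index << (PAGE_SHIFT - SECTOR_SHIFT);	/* 8 sectors/page */
}

int main(void)
{
	for (uint64_t idx = 0; idx < 4; idx++)
		printf("swap page %llu -> sector %llu\n",
		       (unsigned long long)idx,
		       (unsigned long long)swap_page_sector(idx));
	return 0;
}
```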
@@ -338,6 +350,13 @@ int swap_readpage(struct page *page) | |||
338 | return ret; | 350 | return ret; |
339 | } | 351 | } |
340 | 352 | ||
353 | ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); | ||
354 | if (!ret) { | ||
355 | count_vm_event(PSWPIN); | ||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | ret = 0; | ||
341 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 360 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
342 | if (bio == NULL) { | 361 | if (bio == NULL) { |
343 | unlock_page(page); | 362 | unlock_page(page); |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | might_sleep(); | ||
106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 107 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock_write(anon_vma); | 108 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock_write(anon_vma); | 109 | anon_vma_unlock_write(anon_vma); |
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
426 | * above cannot corrupt). | 427 | * above cannot corrupt). |
427 | */ | 428 | */ |
428 | if (!page_mapped(page)) { | 429 | if (!page_mapped(page)) { |
430 | rcu_read_unlock(); | ||
429 | put_anon_vma(anon_vma); | 431 | put_anon_vma(anon_vma); |
430 | anon_vma = NULL; | 432 | return NULL; |
431 | } | 433 | } |
432 | out: | 434 | out: |
433 | rcu_read_unlock(); | 435 | rcu_read_unlock(); |
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
477 | } | 479 | } |
478 | 480 | ||
479 | if (!page_mapped(page)) { | 481 | if (!page_mapped(page)) { |
482 | rcu_read_unlock(); | ||
480 | put_anon_vma(anon_vma); | 483 | put_anon_vma(anon_vma); |
481 | anon_vma = NULL; | 484 | return NULL; |
482 | goto out; | ||
483 | } | 485 | } |
484 | 486 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 487 | /* we pinned the anon_vma, its safe to sleep */ |
@@ -669,7 +671,7 @@ struct page_referenced_arg { | |||
669 | /* | 671 | /* |
670 | * arg: page_referenced_arg will be passed | 672 | * arg: page_referenced_arg will be passed |
671 | */ | 673 | */ |
672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 674 | static int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
673 | unsigned long address, void *arg) | 675 | unsigned long address, void *arg) |
674 | { | 676 | { |
675 | struct mm_struct *mm = vma->vm_mm; | 677 | struct mm_struct *mm = vma->vm_mm; |
@@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, | |||
986 | { | 988 | { |
987 | int first = atomic_inc_and_test(&page->_mapcount); | 989 | int first = atomic_inc_and_test(&page->_mapcount); |
988 | if (first) { | 990 | if (first) { |
991 | /* | ||
992 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
993 | * these counters are not modified in interrupt context, and | ||
994 | * pte lock(a spinlock) is held, which implies preemption | ||
995 | * disabled. | ||
996 | */ | ||
989 | if (PageTransHuge(page)) | 997 | if (PageTransHuge(page)) |
990 | __inc_zone_page_state(page, | 998 | __inc_zone_page_state(page, |
991 | NR_ANON_TRANSPARENT_HUGEPAGES); | 999 | NR_ANON_TRANSPARENT_HUGEPAGES); |
@@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, | |||
1024 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | 1032 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, |
1025 | hpage_nr_pages(page)); | 1033 | hpage_nr_pages(page)); |
1026 | __page_set_anon_rmap(page, vma, address, 1); | 1034 | __page_set_anon_rmap(page, vma, address, 1); |
1027 | if (!mlocked_vma_newpage(vma, page)) { | 1035 | |
1036 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
1037 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { | ||
1028 | SetPageActive(page); | 1038 | SetPageActive(page); |
1029 | lru_cache_add(page); | 1039 | lru_cache_add(page); |
1030 | } else | 1040 | return; |
1031 | add_page_to_unevictable_list(page); | 1041 | } |
1042 | |||
1043 | if (!TestSetPageMlocked(page)) { | ||
1044 | /* | ||
1045 | * We use the irq-unsafe __mod_zone_page_stat because this | ||
1046 | * counter is not modified from interrupt context, and the pte | ||
1047 | * lock is held(spinlock), which implies preemption disabled. | ||
1048 | */ | ||
1049 | __mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
1050 | hpage_nr_pages(page)); | ||
1051 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
1052 | } | ||
1053 | add_page_to_unevictable_list(page); | ||
1032 | } | 1054 | } |
1033 | 1055 | ||
1034 | /** | 1056 | /** |
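The rewritten page_add_new_anon_rmap() open-codes the mlocked-new-page path: TestSetPageMlocked() atomically sets the flag and reports its previous value, so NR_MLOCK and the UNEVICTABLE_PGMLOCKED event are bumped exactly once per page even if two paths race to mlock it. A hedged userspace sketch of counting only on the 0-to-1 transition; the atomic int stands in for the page flag and the plain counter for the zone statistic:

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_int page_mlocked;	/* stand-in for PG_mlocked */
static long nr_mlock;		/* stand-in for the NR_MLOCK counter */

static void mlock_new_page(void)
{
	/* Only the caller that flips 0 -> 1 accounts the page. */
	if (atomic_exchange(&page_mlocked, 1) == 0)
		nr_mlock++;
}

int main(void)
{
	mlock_new_page();
	mlock_new_page();	/* second call sees the flag already set */
	printf("NR_MLOCK = %ld (counted once)\n", nr_mlock);
	return 0;
}
```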
@@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page) | |||
1077 | /* | 1099 | /* |
1078 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1100 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
1079 | * and not charged by memcg for now. | 1101 | * and not charged by memcg for now. |
1102 | * | ||
1103 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
1104 | * these counters are not modified in interrupt context, and | ||
1106 | * pte lock(a spinlock) is held, which implies preemption disabled. | ||
1080 | */ | 1107 | */ |
1081 | if (unlikely(PageHuge(page))) | 1108 | if (unlikely(PageHuge(page))) |
1082 | goto out; | 1109 | goto out; |
@@ -1112,7 +1139,7 @@ out: | |||
1112 | /* | 1139 | /* |
1113 | * @arg: enum ttu_flags will be passed to this argument | 1140 | * @arg: enum ttu_flags will be passed to this argument |
1114 | */ | 1141 | */ |
1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1142 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1116 | unsigned long address, void *arg) | 1143 | unsigned long address, void *arg) |
1117 | { | 1144 | { |
1118 | struct mm_struct *mm = vma->vm_mm; | 1145 | struct mm_struct *mm = vma->vm_mm; |
@@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1135 | if (vma->vm_flags & VM_LOCKED) | 1162 | if (vma->vm_flags & VM_LOCKED) |
1136 | goto out_mlock; | 1163 | goto out_mlock; |
1137 | 1164 | ||
1138 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | 1165 | if (flags & TTU_MUNLOCK) |
1139 | goto out_unmap; | 1166 | goto out_unmap; |
1140 | } | 1167 | } |
1141 | if (!(flags & TTU_IGNORE_ACCESS)) { | 1168 | if (!(flags & TTU_IGNORE_ACCESS)) { |
@@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1203 | * pte. do_swap_page() will wait until the migration | 1230 | * pte. do_swap_page() will wait until the migration |
1204 | * pte is removed and then restart fault handling. | 1231 | * pte is removed and then restart fault handling. |
1205 | */ | 1232 | */ |
1206 | BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); | 1233 | BUG_ON(!(flags & TTU_MIGRATION)); |
1207 | entry = make_migration_entry(page, pte_write(pteval)); | 1234 | entry = make_migration_entry(page, pte_write(pteval)); |
1208 | } | 1235 | } |
1209 | swp_pte = swp_entry_to_pte(entry); | 1236 | swp_pte = swp_entry_to_pte(entry); |
@@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1212 | set_pte_at(mm, address, pte, swp_pte); | 1239 | set_pte_at(mm, address, pte, swp_pte); |
1213 | BUG_ON(pte_file(*pte)); | 1240 | BUG_ON(pte_file(*pte)); |
1214 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1241 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1215 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1242 | (flags & TTU_MIGRATION)) { |
1216 | /* Establish migration entry for a file page */ | 1243 | /* Establish migration entry for a file page */ |
1217 | swp_entry_t entry; | 1244 | swp_entry_t entry; |
1218 | entry = make_migration_entry(page, pte_write(pteval)); | 1245 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1225 | 1252 | ||
1226 | out_unmap: | 1253 | out_unmap: |
1227 | pte_unmap_unlock(pte, ptl); | 1254 | pte_unmap_unlock(pte, ptl); |
1228 | if (ret != SWAP_FAIL) | 1255 | if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) |
1229 | mmu_notifier_invalidate_page(mm, address); | 1256 | mmu_notifier_invalidate_page(mm, address); |
1230 | out: | 1257 | out: |
1231 | return ret; | 1258 | return ret; |
@@ -1359,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1359 | if (page->index != linear_page_index(vma, address)) { | 1386 | if (page->index != linear_page_index(vma, address)) { |
1360 | pte_t ptfile = pgoff_to_pte(page->index); | 1387 | pte_t ptfile = pgoff_to_pte(page->index); |
1361 | if (pte_soft_dirty(pteval)) | 1388 | if (pte_soft_dirty(pteval)) |
1362 | pte_file_mksoft_dirty(ptfile); | 1389 | ptfile = pte_file_mksoft_dirty(ptfile); |
1363 | set_pte_at(mm, address, pte, ptfile); | 1390 | set_pte_at(mm, address, pte, ptfile); |
1364 | } | 1391 | } |
1365 | 1392 | ||
@@ -1512,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1512 | * locking requirements of exec(), migration skips | 1539 | * locking requirements of exec(), migration skips |
1513 | * temporary VMAs until after exec() completes. | 1540 | * temporary VMAs until after exec() completes. |
1514 | */ | 1541 | */ |
1515 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | 1542 | if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) |
1516 | rwc.invalid_vma = invalid_migration_vma; | 1543 | rwc.invalid_vma = invalid_migration_vma; |
1517 | 1544 | ||
1518 | ret = rmap_walk(page, &rwc); | 1545 | ret = rmap_walk(page, &rwc); |
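Several of the hunks above replace `TTU_ACTION(flags) == TTU_X` with a plain bit test `flags & TTU_X`: once the unmap actions are independent bits rather than values extracted from a masked field, whole-value equality would stop matching as soon as an action is combined with other flags. A small illustration with stand-in values (the real ttu_flags constants are defined in rmap.h and differ from these numbers):

```c
#include <stdio.h>

#define TTU_MIGRATION		0x1u	/* stand-in flag values */
#define TTU_MUNLOCK		0x2u
#define TTU_IGNORE_ACCESS	0x4u

int main(void)
{
	unsigned int flags = TTU_MIGRATION | TTU_IGNORE_ACCESS;

	/* Bit test: still true when other flags ride along. */
	printf("flags & TTU_MIGRATION  -> %d\n", !!(flags & TTU_MIGRATION));

	/* Equality on the whole word: false once flags are combined. */
	printf("flags == TTU_MIGRATION -> %d\n", flags == TTU_MIGRATION);
	return 0;
}
```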
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02111c6..5402481c28d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1132,7 +1132,7 @@ repeat: | |||
1132 | goto decused; | 1132 | goto decused; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | SetPageSwapBacked(page); | 1135 | __SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1136 | __set_page_locked(page); |
1137 | error = mem_cgroup_charge_file(page, current->mm, | 1137 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1138 | gfp & GFP_RECLAIM_MASK); |
@@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1372 | loff_t pos, unsigned len, unsigned flags, | 1372 | loff_t pos, unsigned len, unsigned flags, |
1373 | struct page **pagep, void **fsdata) | 1373 | struct page **pagep, void **fsdata) |
1374 | { | 1374 | { |
1375 | int ret; | ||
1375 | struct inode *inode = mapping->host; | 1376 | struct inode *inode = mapping->host; |
1376 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1377 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1377 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1378 | ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1379 | if (ret == 0 && *pagep) | ||
1380 | init_page_accessed(*pagep); | ||
1381 | return ret; | ||
1378 | } | 1382 | } |
1379 | 1383 | ||
1380 | static int | 1384 | static int |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1621,10 +1621,16 @@ __initcall(cpucache_init); | |||
1621 | static noinline void | 1621 | static noinline void |
1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | 1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) |
1623 | { | 1623 | { |
1624 | #if DEBUG | ||
1624 | struct kmem_cache_node *n; | 1625 | struct kmem_cache_node *n; |
1625 | struct page *page; | 1626 | struct page *page; |
1626 | unsigned long flags; | 1627 | unsigned long flags; |
1627 | int node; | 1628 | int node; |
1629 | static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
1630 | DEFAULT_RATELIMIT_BURST); | ||
1631 | |||
1632 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) | ||
1633 | return; | ||
1628 | 1634 | ||
1629 | printk(KERN_WARNING | 1635 | printk(KERN_WARNING |
1630 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1636 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
@@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1662 | node, active_slabs, num_slabs, active_objs, num_objs, | 1668 | node, active_slabs, num_slabs, active_objs, num_objs, |
1663 | free_objects); | 1669 | free_objects); |
1664 | } | 1670 | } |
1671 | #endif | ||
1665 | } | 1672 | } |
1666 | 1673 | ||
1667 | /* | 1674 | /* |
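slab_out_of_memory() is now compiled out entirely without DEBUG and, when it is built, throttled with DEFINE_RATELIMIT_STATE()/__ratelimit() so a burst of failing allocations cannot flood the log. A minimal userspace analogue of an interval/burst ratelimiter in the same spirit; the tick-based clock and the parameters are toy values, and the kernel's __ratelimit() additionally reports how many messages it suppressed:

```c
#include <stdio.h>

struct ratelimit {
	long interval;	/* window length, in ticks */
	int burst;	/* messages allowed per window */
	long begin;	/* start of the current window */
	int printed;	/* messages emitted in this window */
};

static int ratelimit_ok(struct ratelimit *rs, long now)
{
	if (now - rs->begin >= rs->interval) {	/* new window: reset budget */
		rs->begin = now;
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return 0;			/* suppress this message */
	rs->printed++;
	return 1;
}

int main(void)
{
	struct ratelimit rs = { .interval = 5, .burst = 2 };

	for (long tick = 0; tick < 12; tick++)
		if (ratelimit_ok(&rs, tick))
			printf("tick %ld: OOM warning emitted\n", tick);
	return 0;
}
```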
@@ -1681,10 +1688,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1681 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1688 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1682 | flags |= __GFP_RECLAIMABLE; | 1689 | flags |= __GFP_RECLAIMABLE; |
1683 | 1690 | ||
1691 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | ||
1692 | return NULL; | ||
1693 | |||
1684 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1694 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1685 | if (!page) { | 1695 | if (!page) { |
1686 | if (!(flags & __GFP_NOWARN) && printk_ratelimit()) | 1696 | memcg_uncharge_slab(cachep, cachep->gfporder); |
1687 | slab_out_of_memory(cachep, flags, nodeid); | 1697 | slab_out_of_memory(cachep, flags, nodeid); |
1688 | return NULL; | 1698 | return NULL; |
1689 | } | 1699 | } |
1690 | 1700 | ||
@@ -1702,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1702 | __SetPageSlab(page); | 1712 | __SetPageSlab(page); |
1703 | if (page->pfmemalloc) | 1713 | if (page->pfmemalloc) |
1704 | SetPageSlabPfmemalloc(page); | 1714 | SetPageSlabPfmemalloc(page); |
1705 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1706 | 1715 | ||
1707 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1716 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1708 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1717 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1738,10 +1747,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | |||
1738 | page_mapcount_reset(page); | 1747 | page_mapcount_reset(page); |
1739 | page->mapping = NULL; | 1748 | page->mapping = NULL; |
1740 | 1749 | ||
1741 | memcg_release_pages(cachep, cachep->gfporder); | ||
1742 | if (current->reclaim_state) | 1750 | if (current->reclaim_state) |
1743 | current->reclaim_state->reclaimed_slab += nr_freed; | 1751 | current->reclaim_state->reclaimed_slab += nr_freed; |
1744 | __free_memcg_kmem_pages(page, cachep->gfporder); | 1752 | __free_pages(page, cachep->gfporder); |
1753 | memcg_uncharge_slab(cachep, cachep->gfporder); | ||
1745 | } | 1754 | } |
1746 | 1755 | ||
1747 | static void kmem_rcu_free(struct rcu_head *head) | 1756 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2469,8 +2478,7 @@ out: | |||
2469 | return nr_freed; | 2478 | return nr_freed; |
2470 | } | 2479 | } |
2471 | 2480 | ||
2472 | /* Called with slab_mutex held to protect against cpu hotplug */ | 2481 | int __kmem_cache_shrink(struct kmem_cache *cachep) |
2473 | static int __cache_shrink(struct kmem_cache *cachep) | ||
2474 | { | 2482 | { |
2475 | int ret = 0, i = 0; | 2483 | int ret = 0, i = 0; |
2476 | struct kmem_cache_node *n; | 2484 | struct kmem_cache_node *n; |
@@ -2491,32 +2499,11 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2491 | return (ret ? 1 : 0); | 2499 | return (ret ? 1 : 0); |
2492 | } | 2500 | } |
2493 | 2501 | ||
2494 | /** | ||
2495 | * kmem_cache_shrink - Shrink a cache. | ||
2496 | * @cachep: The cache to shrink. | ||
2497 | * | ||
2498 | * Releases as many slabs as possible for a cache. | ||
2499 | * To help debugging, a zero exit status indicates all slabs were released. | ||
2500 | */ | ||
2501 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
2502 | { | ||
2503 | int ret; | ||
2504 | BUG_ON(!cachep || in_interrupt()); | ||
2505 | |||
2506 | get_online_cpus(); | ||
2507 | mutex_lock(&slab_mutex); | ||
2508 | ret = __cache_shrink(cachep); | ||
2509 | mutex_unlock(&slab_mutex); | ||
2510 | put_online_cpus(); | ||
2511 | return ret; | ||
2512 | } | ||
2513 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
2514 | |||
2515 | int __kmem_cache_shutdown(struct kmem_cache *cachep) | 2502 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2516 | { | 2503 | { |
2517 | int i; | 2504 | int i; |
2518 | struct kmem_cache_node *n; | 2505 | struct kmem_cache_node *n; |
2519 | int rc = __cache_shrink(cachep); | 2506 | int rc = __kmem_cache_shrink(cachep); |
2520 | 2507 | ||
2521 | if (rc) | 2508 | if (rc) |
2522 | return rc; | 2509 | return rc; |
diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
92 | 92 | ||
93 | int __kmem_cache_shutdown(struct kmem_cache *); | 93 | int __kmem_cache_shutdown(struct kmem_cache *); |
94 | int __kmem_cache_shrink(struct kmem_cache *); | ||
94 | void slab_kmem_cache_release(struct kmem_cache *); | 95 | void slab_kmem_cache_release(struct kmem_cache *); |
95 | 96 | ||
96 | struct seq_file; | 97 | struct seq_file; |
@@ -120,21 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
120 | return !s->memcg_params || s->memcg_params->is_root_cache; | 121 | return !s->memcg_params || s->memcg_params->is_root_cache; |
121 | } | 122 | } |
122 | 123 | ||
123 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
124 | { | ||
125 | if (!is_root_cache(s)) | ||
126 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
127 | } | ||
128 | |||
129 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
130 | { | ||
131 | if (is_root_cache(s)) | ||
132 | return; | ||
133 | |||
134 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
135 | mem_cgroup_destroy_cache(s); | ||
136 | } | ||
137 | |||
138 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 124 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
139 | struct kmem_cache *p) | 125 | struct kmem_cache *p) |
140 | { | 126 | { |
@@ -192,18 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
192 | return s; | 178 | return s; |
193 | return s->memcg_params->root_cache; | 179 | return s->memcg_params->root_cache; |
194 | } | 180 | } |
195 | #else | 181 | |
196 | static inline bool is_root_cache(struct kmem_cache *s) | 182 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
183 | gfp_t gfp, int order) | ||
197 | { | 184 | { |
198 | return true; | 185 | if (!memcg_kmem_enabled()) |
186 | return 0; | ||
187 | if (is_root_cache(s)) | ||
188 | return 0; | ||
189 | return __memcg_charge_slab(s, gfp, order); | ||
199 | } | 190 | } |
200 | 191 | ||
201 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 192 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
202 | { | 193 | { |
194 | if (!memcg_kmem_enabled()) | ||
195 | return; | ||
196 | if (is_root_cache(s)) | ||
197 | return; | ||
198 | __memcg_uncharge_slab(s, order); | ||
203 | } | 199 | } |
204 | 200 | #else | |
205 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | 201 | static inline bool is_root_cache(struct kmem_cache *s) |
206 | { | 202 | { |
203 | return true; | ||
207 | } | 204 | } |
208 | 205 | ||
209 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 206 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
@@ -227,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
227 | { | 224 | { |
228 | return s; | 225 | return s; |
229 | } | 226 | } |
227 | |||
228 | static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | ||
229 | { | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | ||
234 | { | ||
235 | } | ||
230 | #endif | 236 | #endif |
231 | 237 | ||
232 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 238 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
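The new memcg_charge_slab()/memcg_uncharge_slab() helpers follow the usual split of an always-inline wrapper that performs the cheap checks (accounting disabled, root cache) and an out-of-line __memcg_charge_slab()/__memcg_uncharge_slab() that does the real work. A hedged userspace sketch of that fast-path/slow-path structure; the names and the boolean switch below are illustrative:

```c
#include <stdbool.h>
#include <stdio.h>

static bool accounting_enabled = true;	/* stand-in for memcg_kmem_enabled() */

/* Out-of-line slow path: only reached when work is actually required. */
static int __charge_slow(int order)
{
	printf("charging 2^%d pages to the cgroup\n", order);
	return 0;
}

/* Inline fast path: most callers return after a couple of cheap tests. */
static inline int charge(bool is_root_cache, int order)
{
	if (!accounting_enabled)
		return 0;
	if (is_root_cache)
		return 0;
	return __charge_slow(order);
}

int main(void)
{
	charge(true, 3);	/* root cache: no accounting, no function call */
	charge(false, 3);	/* per-memcg cache: hits the slow path */
	return 0;
}
```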
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 102cc6fca3d3..735e01a0db6f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
160 | 160 | ||
161 | s->refcount = 1; | 161 | s->refcount = 1; |
162 | list_add(&s->list, &slab_caches); | 162 | list_add(&s->list, &slab_caches); |
163 | memcg_register_cache(s); | ||
164 | out: | 163 | out: |
165 | if (err) | 164 | if (err) |
166 | return ERR_PTR(err); | 165 | return ERR_PTR(err); |
@@ -205,6 +204,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
205 | int err; | 204 | int err; |
206 | 205 | ||
207 | get_online_cpus(); | 206 | get_online_cpus(); |
207 | get_online_mems(); | ||
208 | |||
208 | mutex_lock(&slab_mutex); | 209 | mutex_lock(&slab_mutex); |
209 | 210 | ||
210 | err = kmem_cache_sanity_check(name, size); | 211 | err = kmem_cache_sanity_check(name, size); |
@@ -239,6 +240,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
239 | 240 | ||
240 | out_unlock: | 241 | out_unlock: |
241 | mutex_unlock(&slab_mutex); | 242 | mutex_unlock(&slab_mutex); |
243 | |||
244 | put_online_mems(); | ||
242 | put_online_cpus(); | 245 | put_online_cpus(); |
243 | 246 | ||
244 | if (err) { | 247 | if (err) { |
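kmem_cache_create() (and kmem_cache_destroy() further down) now pin memory hotplug alongside CPU hotplug and release the guards in strict reverse order around slab_mutex: get_online_cpus(), get_online_mems(), mutex_lock(), then the mirror-image unwind. A tiny sketch of that symmetric acquire/release shape, with pthread mutexes standing in for what are really hotplug reference counts plus a mutex in the kernel:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpu_guard = PTHREAD_MUTEX_INITIALIZER;	/* get/put_online_cpus */
static pthread_mutex_t mem_guard = PTHREAD_MUTEX_INITIALIZER;	/* get/put_online_mems */
static pthread_mutex_t slab_lock = PTHREAD_MUTEX_INITIALIZER;	/* slab_mutex */

static void create_cache(const char *name)
{
	pthread_mutex_lock(&cpu_guard);		/* outermost guard first */
	pthread_mutex_lock(&mem_guard);
	pthread_mutex_lock(&slab_lock);

	printf("creating cache %s with CPU and memory hotplug held off\n", name);

	pthread_mutex_unlock(&slab_lock);	/* release in reverse order */
	pthread_mutex_unlock(&mem_guard);
	pthread_mutex_unlock(&cpu_guard);
}

int main(void)
{
	create_cache("demo_cache");
	return 0;
}
```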
@@ -258,31 +261,29 @@ EXPORT_SYMBOL(kmem_cache_create); | |||
258 | 261 | ||
259 | #ifdef CONFIG_MEMCG_KMEM | 262 | #ifdef CONFIG_MEMCG_KMEM |
260 | /* | 263 | /* |
261 | * kmem_cache_create_memcg - Create a cache for a memory cgroup. | 264 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
262 | * @memcg: The memory cgroup the new cache is for. | 265 | * @memcg: The memory cgroup the new cache is for. |
263 | * @root_cache: The parent of the new cache. | 266 | * @root_cache: The parent of the new cache. |
267 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
264 | * | 268 | * |
265 | * This function attempts to create a kmem cache that will serve allocation | 269 | * This function attempts to create a kmem cache that will serve allocation |
266 | * requests going from @memcg to @root_cache. The new cache inherits properties | 270 | * requests going from @memcg to @root_cache. The new cache inherits properties |
267 | * from its parent. | 271 | * from its parent. |
268 | */ | 272 | */ |
269 | void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 273 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
274 | struct kmem_cache *root_cache, | ||
275 | const char *memcg_name) | ||
270 | { | 276 | { |
271 | struct kmem_cache *s; | 277 | struct kmem_cache *s = NULL; |
272 | char *cache_name; | 278 | char *cache_name; |
273 | 279 | ||
274 | get_online_cpus(); | 280 | get_online_cpus(); |
275 | mutex_lock(&slab_mutex); | 281 | get_online_mems(); |
276 | 282 | ||
277 | /* | 283 | mutex_lock(&slab_mutex); |
278 | * Since per-memcg caches are created asynchronously on first | ||
279 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
280 | * create the same cache, but only one of them may succeed. | ||
281 | */ | ||
282 | if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) | ||
283 | goto out_unlock; | ||
284 | 284 | ||
285 | cache_name = memcg_create_cache_name(memcg, root_cache); | 285 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
286 | memcg_cache_id(memcg), memcg_name); | ||
286 | if (!cache_name) | 287 | if (!cache_name) |
287 | goto out_unlock; | 288 | goto out_unlock; |
288 | 289 | ||
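The per-memcg cache name is now built inline with kasprintf() in the form "<root cache>(<memcg id>:<memcg name>)". A small sketch with made-up values, just to show the resulting string and the ownership rule (on failure the caller frees the name; on success it belongs to the new cache):

static void example_show_memcg_cache_name(void)
{
        /* Hypothetical values: root cache "dentry", memcg id 3, memcg name "mygroup". */
        char *name = kasprintf(GFP_KERNEL, "%s(%d:%s)", "dentry", 3, "mygroup");

        if (!name)
                return;
        pr_info("per-memcg cache name: %s\n", name);    /* prints "dentry(3:mygroup)" */
        kfree(name);
}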
@@ -292,17 +293,19 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c | |||
292 | memcg, root_cache); | 293 | memcg, root_cache); |
293 | if (IS_ERR(s)) { | 294 | if (IS_ERR(s)) { |
294 | kfree(cache_name); | 295 | kfree(cache_name); |
295 | goto out_unlock; | 296 | s = NULL; |
296 | } | 297 | } |
297 | 298 | ||
298 | s->allocflags |= __GFP_KMEMCG; | ||
299 | |||
300 | out_unlock: | 299 | out_unlock: |
301 | mutex_unlock(&slab_mutex); | 300 | mutex_unlock(&slab_mutex); |
301 | |||
302 | put_online_mems(); | ||
302 | put_online_cpus(); | 303 | put_online_cpus(); |
304 | |||
305 | return s; | ||
303 | } | 306 | } |
304 | 307 | ||
305 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 308 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
306 | { | 309 | { |
307 | int rc; | 310 | int rc; |
308 | 311 | ||
@@ -311,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
311 | return 0; | 314 | return 0; |
312 | 315 | ||
313 | mutex_unlock(&slab_mutex); | 316 | mutex_unlock(&slab_mutex); |
314 | rc = __kmem_cache_destroy_memcg_children(s); | 317 | rc = __memcg_cleanup_cache_params(s); |
315 | mutex_lock(&slab_mutex); | 318 | mutex_lock(&slab_mutex); |
316 | 319 | ||
317 | return rc; | 320 | return rc; |
318 | } | 321 | } |
319 | #else | 322 | #else |
320 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 323 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
321 | { | 324 | { |
322 | return 0; | 325 | return 0; |
323 | } | 326 | } |
@@ -332,27 +335,26 @@ void slab_kmem_cache_release(struct kmem_cache *s) | |||
332 | void kmem_cache_destroy(struct kmem_cache *s) | 335 | void kmem_cache_destroy(struct kmem_cache *s) |
333 | { | 336 | { |
334 | get_online_cpus(); | 337 | get_online_cpus(); |
338 | get_online_mems(); | ||
339 | |||
335 | mutex_lock(&slab_mutex); | 340 | mutex_lock(&slab_mutex); |
336 | 341 | ||
337 | s->refcount--; | 342 | s->refcount--; |
338 | if (s->refcount) | 343 | if (s->refcount) |
339 | goto out_unlock; | 344 | goto out_unlock; |
340 | 345 | ||
341 | if (kmem_cache_destroy_memcg_children(s) != 0) | 346 | if (memcg_cleanup_cache_params(s) != 0) |
342 | goto out_unlock; | 347 | goto out_unlock; |
343 | 348 | ||
344 | list_del(&s->list); | ||
345 | memcg_unregister_cache(s); | ||
346 | |||
347 | if (__kmem_cache_shutdown(s) != 0) { | 349 | if (__kmem_cache_shutdown(s) != 0) { |
348 | list_add(&s->list, &slab_caches); | ||
349 | memcg_register_cache(s); | ||
350 | printk(KERN_ERR "kmem_cache_destroy %s: " | 350 | printk(KERN_ERR "kmem_cache_destroy %s: " |
351 | "Slab cache still has objects\n", s->name); | 351 | "Slab cache still has objects\n", s->name); |
352 | dump_stack(); | 352 | dump_stack(); |
353 | goto out_unlock; | 353 | goto out_unlock; |
354 | } | 354 | } |
355 | 355 | ||
356 | list_del(&s->list); | ||
357 | |||
356 | mutex_unlock(&slab_mutex); | 358 | mutex_unlock(&slab_mutex); |
357 | if (s->flags & SLAB_DESTROY_BY_RCU) | 359 | if (s->flags & SLAB_DESTROY_BY_RCU) |
358 | rcu_barrier(); | 360 | rcu_barrier(); |
@@ -363,15 +365,36 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
363 | #else | 365 | #else |
364 | slab_kmem_cache_release(s); | 366 | slab_kmem_cache_release(s); |
365 | #endif | 367 | #endif |
366 | goto out_put_cpus; | 368 | goto out; |
367 | 369 | ||
368 | out_unlock: | 370 | out_unlock: |
369 | mutex_unlock(&slab_mutex); | 371 | mutex_unlock(&slab_mutex); |
370 | out_put_cpus: | 372 | out: |
373 | put_online_mems(); | ||
371 | put_online_cpus(); | 374 | put_online_cpus(); |
372 | } | 375 | } |
373 | EXPORT_SYMBOL(kmem_cache_destroy); | 376 | EXPORT_SYMBOL(kmem_cache_destroy); |
374 | 377 | ||
378 | /** | ||
379 | * kmem_cache_shrink - Shrink a cache. | ||
380 | * @cachep: The cache to shrink. | ||
381 | * | ||
382 | * Releases as many slabs as possible for a cache. | ||
383 | * To help debugging, a zero exit status indicates all slabs were released. | ||
384 | */ | ||
385 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
386 | { | ||
387 | int ret; | ||
388 | |||
389 | get_online_cpus(); | ||
390 | get_online_mems(); | ||
391 | ret = __kmem_cache_shrink(cachep); | ||
392 | put_online_mems(); | ||
393 | put_online_cpus(); | ||
394 | return ret; | ||
395 | } | ||
396 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
397 | |||
375 | int slab_is_available(void) | 398 | int slab_is_available(void) |
376 | { | 399 | { |
377 | return slab_state >= UP; | 400 | return slab_state >= UP; |
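kmem_cache_shrink() becomes a small common wrapper here: it takes the CPU and memory hotplug references and delegates to the allocator-specific __kmem_cache_shrink(), so SLAB/SLOB/SLUB no longer export it themselves and callers keep the same API. A hedged usage sketch (hypothetical cache and object, not from the patch) showing the lifecycle the surrounding hunks assume — every object freed before kmem_cache_destroy(), and shrink returning 0 when all partial slabs could be released:

#include <linux/slab.h>

struct example_obj {
        int payload;
};

static void example_cache_lifecycle(void)
{
        struct kmem_cache *cachep;
        struct example_obj *obj;

        cachep = kmem_cache_create("example_obj", sizeof(struct example_obj),
                                   0, SLAB_HWCACHE_ALIGN, NULL);
        if (!cachep)
                return;

        obj = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (obj) {
                obj->payload = 42;
                kmem_cache_free(cachep, obj);   /* all objects must be freed ... */
        }

        if (kmem_cache_shrink(cachep) == 0)     /* 0: every partial slab was released */
                pr_debug("example_obj: cache fully shrunk\n");

        kmem_cache_destroy(cachep);             /* ... before the cache is destroyed */
}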
@@ -586,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags) | |||
586 | } | 609 | } |
587 | #endif /* !CONFIG_SLOB */ | 610 | #endif /* !CONFIG_SLOB */ |
588 | 611 | ||
612 | /* | ||
613 | * To avoid unnecessary overhead, we pass through large allocation requests | ||
614 | * directly to the page allocator. We use __GFP_COMP, because we will need to | ||
615 | * know the allocation order to free the pages properly in kfree. | ||
616 | */ | ||
617 | void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | ||
618 | { | ||
619 | void *ret; | ||
620 | struct page *page; | ||
621 | |||
622 | flags |= __GFP_COMP; | ||
623 | page = alloc_kmem_pages(flags, order); | ||
624 | ret = page ? page_address(page) : NULL; | ||
625 | kmemleak_alloc(ret, size, 1, flags); | ||
626 | return ret; | ||
627 | } | ||
628 | EXPORT_SYMBOL(kmalloc_order); | ||
629 | |||
589 | #ifdef CONFIG_TRACING | 630 | #ifdef CONFIG_TRACING |
590 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | 631 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) |
591 | { | 632 | { |
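kmalloc_order() is consolidated into slab_common.c: large requests bypass the slab caches and go straight to the page allocator, and __GFP_COMP turns the result into a compound page so the free path can recover the allocation order. A simplified sketch of that free side, assuming the allocation was made as above (the real kfree() uses __free_kmem_pages(), added elsewhere in this series, so the memcg charge is dropped as well):

/* Sketch only: free a large kmalloc'ed buffer by recovering its order. */
static void example_free_large(const void *x)
{
        struct page *page = virt_to_head_page(x);

        /* compound_order() is only meaningful here because __GFP_COMP was set. */
        __free_pages(page, compound_order(page));
}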
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
620 | return 0; | 620 | return 0; |
621 | } | 621 | } |
622 | 622 | ||
623 | int kmem_cache_shrink(struct kmem_cache *d) | 623 | int __kmem_cache_shrink(struct kmem_cache *d) |
624 | { | 624 | { |
625 | return 0; | 625 | return 0; |
626 | } | 626 | } |
627 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
628 | 627 | ||
629 | struct kmem_cache kmem_cache_boot = { | 628 | struct kmem_cache kmem_cache_boot = { |
630 | .name = "kmem_cache", | 629 | .name = "kmem_cache", |
@@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
403 | stat(s, CMPXCHG_DOUBLE_FAIL); | 403 | stat(s, CMPXCHG_DOUBLE_FAIL); |
404 | 404 | ||
405 | #ifdef SLUB_DEBUG_CMPXCHG | 405 | #ifdef SLUB_DEBUG_CMPXCHG |
406 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 406 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
407 | #endif | 407 | #endif |
408 | 408 | ||
409 | return 0; | 409 | return 0; |
@@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
444 | stat(s, CMPXCHG_DOUBLE_FAIL); | 444 | stat(s, CMPXCHG_DOUBLE_FAIL); |
445 | 445 | ||
446 | #ifdef SLUB_DEBUG_CMPXCHG | 446 | #ifdef SLUB_DEBUG_CMPXCHG |
447 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 447 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
448 | #endif | 448 | #endif |
449 | 449 | ||
450 | return 0; | 450 | return 0; |
@@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t) | |||
546 | if (!t->addr) | 546 | if (!t->addr) |
547 | return; | 547 | return; |
548 | 548 | ||
549 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 549 | pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
551 | #ifdef CONFIG_STACKTRACE | 551 | #ifdef CONFIG_STACKTRACE |
552 | { | 552 | { |
553 | int i; | 553 | int i; |
554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | 554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) |
555 | if (t->addrs[i]) | 555 | if (t->addrs[i]) |
556 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | 556 | pr_err("\t%pS\n", (void *)t->addrs[i]); |
557 | else | 557 | else |
558 | break; | 558 | break; |
559 | } | 559 | } |
@@ -571,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) | |||
571 | 571 | ||
572 | static void print_page_info(struct page *page) | 572 | static void print_page_info(struct page *page) |
573 | { | 573 | { |
574 | printk(KERN_ERR | 574 | pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", |
575 | "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", | ||
576 | page, page->objects, page->inuse, page->freelist, page->flags); | 575 | page, page->objects, page->inuse, page->freelist, page->flags); |
577 | 576 | ||
578 | } | 577 | } |
579 | 578 | ||
580 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) | 579 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) |
581 | { | 580 | { |
581 | struct va_format vaf; | ||
582 | va_list args; | 582 | va_list args; |
583 | char buf[100]; | ||
584 | 583 | ||
585 | va_start(args, fmt); | 584 | va_start(args, fmt); |
586 | vsnprintf(buf, sizeof(buf), fmt, args); | 585 | vaf.fmt = fmt; |
587 | va_end(args); | 586 | vaf.va = &args; |
588 | printk(KERN_ERR "========================================" | 587 | pr_err("=============================================================================\n"); |
589 | "=====================================\n"); | 588 | pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); |
590 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); | 589 | pr_err("-----------------------------------------------------------------------------\n\n"); |
591 | printk(KERN_ERR "----------------------------------------" | ||
592 | "-------------------------------------\n\n"); | ||
593 | 590 | ||
594 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 591 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
592 | va_end(args); | ||
595 | } | 593 | } |
596 | 594 | ||
597 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 595 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
598 | { | 596 | { |
597 | struct va_format vaf; | ||
599 | va_list args; | 598 | va_list args; |
600 | char buf[100]; | ||
601 | 599 | ||
602 | va_start(args, fmt); | 600 | va_start(args, fmt); |
603 | vsnprintf(buf, sizeof(buf), fmt, args); | 601 | vaf.fmt = fmt; |
602 | vaf.va = &args; | ||
603 | pr_err("FIX %s: %pV\n", s->name, &vaf); | ||
604 | va_end(args); | 604 | va_end(args); |
605 | printk(KERN_ERR "FIX %s: %s\n", s->name, buf); | ||
606 | } | 605 | } |
607 | 606 | ||
608 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | 607 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) |
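slab_bug() and slab_fix() drop the fixed 100-byte buffer and forward the caller's format string through printk's %pV extension instead: fill a struct va_format with the format and the va_list, print it, then va_end(). The same pattern works for any varargs wrapper; a minimal sketch (example_report() is a made-up helper, not part of the patch):

static void example_report(const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        pr_err("%s: %pV\n", prefix, &vaf);      /* no intermediate buffer, no truncation */
        va_end(args);
}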
@@ -614,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
614 | 613 | ||
615 | print_page_info(page); | 614 | print_page_info(page); |
616 | 615 | ||
617 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", | 616 | pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", |
618 | p, p - addr, get_freepointer(s, p)); | 617 | p, p - addr, get_freepointer(s, p)); |
619 | 618 | ||
620 | if (p > addr + 16) | 619 | if (p > addr + 16) |
621 | print_section("Bytes b4 ", p - 16, 16); | 620 | print_section("Bytes b4 ", p - 16, 16); |
@@ -698,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
698 | end--; | 697 | end--; |
699 | 698 | ||
700 | slab_bug(s, "%s overwritten", what); | 699 | slab_bug(s, "%s overwritten", what); |
701 | printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", | 700 | pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", |
702 | fault, end - 1, fault[0], value); | 701 | fault, end - 1, fault[0], value); |
703 | print_trailer(s, page, object); | 702 | print_trailer(s, page, object); |
704 | 703 | ||
@@ -931,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
931 | int alloc) | 930 | int alloc) |
932 | { | 931 | { |
933 | if (s->flags & SLAB_TRACE) { | 932 | if (s->flags & SLAB_TRACE) { |
934 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | 933 | pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", |
935 | s->name, | 934 | s->name, |
936 | alloc ? "alloc" : "free", | 935 | alloc ? "alloc" : "free", |
937 | object, page->inuse, | 936 | object, page->inuse, |
@@ -1134,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing( | |||
1134 | slab_err(s, page, "Attempt to free object(0x%p) " | 1133 | slab_err(s, page, "Attempt to free object(0x%p) " |
1135 | "outside of slab", object); | 1134 | "outside of slab", object); |
1136 | } else if (!page->slab_cache) { | 1135 | } else if (!page->slab_cache) { |
1137 | printk(KERN_ERR | 1136 | pr_err("SLUB <none>: no slab for object 0x%p.\n", |
1138 | "SLUB <none>: no slab for object 0x%p.\n", | 1137 | object); |
1139 | object); | ||
1140 | dump_stack(); | 1138 | dump_stack(); |
1141 | } else | 1139 | } else |
1142 | object_err(s, page, object, | 1140 | object_err(s, page, object, |
@@ -1219,8 +1217,8 @@ static int __init setup_slub_debug(char *str) | |||
1219 | slub_debug |= SLAB_FAILSLAB; | 1217 | slub_debug |= SLAB_FAILSLAB; |
1220 | break; | 1218 | break; |
1221 | default: | 1219 | default: |
1222 | printk(KERN_ERR "slub_debug option '%c' " | 1220 | pr_err("slub_debug option '%c' unknown. skipped\n", |
1223 | "unknown. skipped\n", *str); | 1221 | *str); |
1224 | } | 1222 | } |
1225 | } | 1223 | } |
1226 | 1224 | ||
@@ -1314,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1314 | /* | 1312 | /* |
1315 | * Slab allocation and freeing | 1313 | * Slab allocation and freeing |
1316 | */ | 1314 | */ |
1317 | static inline struct page *alloc_slab_page(gfp_t flags, int node, | 1315 | static inline struct page *alloc_slab_page(struct kmem_cache *s, |
1318 | struct kmem_cache_order_objects oo) | 1316 | gfp_t flags, int node, struct kmem_cache_order_objects oo) |
1319 | { | 1317 | { |
1318 | struct page *page; | ||
1320 | int order = oo_order(oo); | 1319 | int order = oo_order(oo); |
1321 | 1320 | ||
1322 | flags |= __GFP_NOTRACK; | 1321 | flags |= __GFP_NOTRACK; |
1323 | 1322 | ||
1323 | if (memcg_charge_slab(s, flags, order)) | ||
1324 | return NULL; | ||
1325 | |||
1324 | if (node == NUMA_NO_NODE) | 1326 | if (node == NUMA_NO_NODE) |
1325 | return alloc_pages(flags, order); | 1327 | page = alloc_pages(flags, order); |
1326 | else | 1328 | else |
1327 | return alloc_pages_exact_node(node, flags, order); | 1329 | page = alloc_pages_exact_node(node, flags, order); |
1330 | |||
1331 | if (!page) | ||
1332 | memcg_uncharge_slab(s, order); | ||
1333 | |||
1334 | return page; | ||
1328 | } | 1335 | } |
1329 | 1336 | ||
1330 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1337 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1346,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1346 | */ | 1353 | */ |
1347 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | 1354 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; |
1348 | 1355 | ||
1349 | page = alloc_slab_page(alloc_gfp, node, oo); | 1356 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1350 | if (unlikely(!page)) { | 1357 | if (unlikely(!page)) { |
1351 | oo = s->min; | 1358 | oo = s->min; |
1352 | alloc_gfp = flags; | 1359 | alloc_gfp = flags; |
@@ -1354,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1354 | * Allocation may have failed due to fragmentation. | 1361 | * Allocation may have failed due to fragmentation. |
1355 | * Try a lower order alloc if possible | 1362 | * Try a lower order alloc if possible |
1356 | */ | 1363 | */ |
1357 | page = alloc_slab_page(alloc_gfp, node, oo); | 1364 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1358 | 1365 | ||
1359 | if (page) | 1366 | if (page) |
1360 | stat(s, ORDER_FALLBACK); | 1367 | stat(s, ORDER_FALLBACK); |
@@ -1415,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1415 | 1422 | ||
1416 | order = compound_order(page); | 1423 | order = compound_order(page); |
1417 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1424 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1418 | memcg_bind_pages(s, order); | ||
1419 | page->slab_cache = s; | 1425 | page->slab_cache = s; |
1420 | __SetPageSlab(page); | 1426 | __SetPageSlab(page); |
1421 | if (page->pfmemalloc) | 1427 | if (page->pfmemalloc) |
@@ -1466,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1466 | __ClearPageSlabPfmemalloc(page); | 1472 | __ClearPageSlabPfmemalloc(page); |
1467 | __ClearPageSlab(page); | 1473 | __ClearPageSlab(page); |
1468 | 1474 | ||
1469 | memcg_release_pages(s, order); | ||
1470 | page_mapcount_reset(page); | 1475 | page_mapcount_reset(page); |
1471 | if (current->reclaim_state) | 1476 | if (current->reclaim_state) |
1472 | current->reclaim_state->reclaimed_slab += pages; | 1477 | current->reclaim_state->reclaimed_slab += pages; |
1473 | __free_memcg_kmem_pages(page, order); | 1478 | __free_pages(page, order); |
1479 | memcg_uncharge_slab(s, order); | ||
1474 | } | 1480 | } |
1475 | 1481 | ||
1476 | #define need_reserve_slab_rcu \ | 1482 | #define need_reserve_slab_rcu \ |
@@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1770 | #ifdef SLUB_DEBUG_CMPXCHG | 1776 | #ifdef SLUB_DEBUG_CMPXCHG |
1771 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); | 1777 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); |
1772 | 1778 | ||
1773 | printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); | 1779 | pr_info("%s %s: cmpxchg redo ", n, s->name); |
1774 | 1780 | ||
1775 | #ifdef CONFIG_PREEMPT | 1781 | #ifdef CONFIG_PREEMPT |
1776 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) | 1782 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) |
1777 | printk("due to cpu change %d -> %d\n", | 1783 | pr_warn("due to cpu change %d -> %d\n", |
1778 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); | 1784 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); |
1779 | else | 1785 | else |
1780 | #endif | 1786 | #endif |
1781 | if (tid_to_event(tid) != tid_to_event(actual_tid)) | 1787 | if (tid_to_event(tid) != tid_to_event(actual_tid)) |
1782 | printk("due to cpu running other code. Event %ld->%ld\n", | 1788 | pr_warn("due to cpu running other code. Event %ld->%ld\n", |
1783 | tid_to_event(tid), tid_to_event(actual_tid)); | 1789 | tid_to_event(tid), tid_to_event(actual_tid)); |
1784 | else | 1790 | else |
1785 | printk("for unknown reason: actual=%lx was=%lx target=%lx\n", | 1791 | pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", |
1786 | actual_tid, tid, next_tid(tid)); | 1792 | actual_tid, tid, next_tid(tid)); |
1787 | #endif | 1793 | #endif |
1788 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1794 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
@@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node) | |||
2121 | return 1; | 2127 | return 1; |
2122 | } | 2128 | } |
2123 | 2129 | ||
2130 | #ifdef CONFIG_SLUB_DEBUG | ||
2124 | static int count_free(struct page *page) | 2131 | static int count_free(struct page *page) |
2125 | { | 2132 | { |
2126 | return page->objects - page->inuse; | 2133 | return page->objects - page->inuse; |
2127 | } | 2134 | } |
2128 | 2135 | ||
2136 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2137 | { | ||
2138 | return atomic_long_read(&n->total_objects); | ||
2139 | } | ||
2140 | #endif /* CONFIG_SLUB_DEBUG */ | ||
2141 | |||
2142 | #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) | ||
2129 | static unsigned long count_partial(struct kmem_cache_node *n, | 2143 | static unsigned long count_partial(struct kmem_cache_node *n, |
2130 | int (*get_count)(struct page *)) | 2144 | int (*get_count)(struct page *)) |
2131 | { | 2145 | { |
@@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n, | |||
2139 | spin_unlock_irqrestore(&n->list_lock, flags); | 2153 | spin_unlock_irqrestore(&n->list_lock, flags); |
2140 | return x; | 2154 | return x; |
2141 | } | 2155 | } |
2142 | 2156 | #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ | |
2143 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2144 | { | ||
2145 | #ifdef CONFIG_SLUB_DEBUG | ||
2146 | return atomic_long_read(&n->total_objects); | ||
2147 | #else | ||
2148 | return 0; | ||
2149 | #endif | ||
2150 | } | ||
2151 | 2157 | ||
2152 | static noinline void | 2158 | static noinline void |
2153 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | 2159 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) |
2154 | { | 2160 | { |
2161 | #ifdef CONFIG_SLUB_DEBUG | ||
2162 | static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
2163 | DEFAULT_RATELIMIT_BURST); | ||
2155 | int node; | 2164 | int node; |
2156 | 2165 | ||
2157 | printk(KERN_WARNING | 2166 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) |
2158 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2167 | return; |
2168 | |||
2169 | pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | ||
2159 | nid, gfpflags); | 2170 | nid, gfpflags); |
2160 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2171 | pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", |
2161 | "default order: %d, min order: %d\n", s->name, s->object_size, | 2172 | s->name, s->object_size, s->size, oo_order(s->oo), |
2162 | s->size, oo_order(s->oo), oo_order(s->min)); | 2173 | oo_order(s->min)); |
2163 | 2174 | ||
2164 | if (oo_order(s->min) > get_order(s->object_size)) | 2175 | if (oo_order(s->min) > get_order(s->object_size)) |
2165 | printk(KERN_WARNING " %s debugging increased min order, use " | 2176 | pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", |
2166 | "slub_debug=O to disable.\n", s->name); | 2177 | s->name); |
2167 | 2178 | ||
2168 | for_each_online_node(node) { | 2179 | for_each_online_node(node) { |
2169 | struct kmem_cache_node *n = get_node(s, node); | 2180 | struct kmem_cache_node *n = get_node(s, node); |
@@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2178 | nr_slabs = node_nr_slabs(n); | 2189 | nr_slabs = node_nr_slabs(n); |
2179 | nr_objs = node_nr_objs(n); | 2190 | nr_objs = node_nr_objs(n); |
2180 | 2191 | ||
2181 | printk(KERN_WARNING | 2192 | pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", |
2182 | " node %d: slabs: %ld, objs: %ld, free: %ld\n", | ||
2183 | node, nr_slabs, nr_objs, nr_free); | 2193 | node, nr_slabs, nr_objs, nr_free); |
2184 | } | 2194 | } |
2195 | #endif | ||
2185 | } | 2196 | } |
2186 | 2197 | ||
2187 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2198 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
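slab_out_of_memory() now does its own rate limiting with a static ratelimit state and honors __GFP_NOWARN, instead of relying on printk_ratelimit() at the call site (which the later __slab_alloc() hunk removes). The generic pattern, as a hedged sketch (example_warn() is a made-up helper):

#include <linux/ratelimit.h>

static void example_warn(gfp_t gfpflags, int err)
{
        static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        /* Stay quiet if the caller asked for silence or we printed recently. */
        if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&example_rs))
                return;

        pr_warn("example: allocation failure %d (gfp=0x%x)\n", err, gfpflags);
}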
@@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2198 | 2209 | ||
2199 | page = new_slab(s, flags, node); | 2210 | page = new_slab(s, flags, node); |
2200 | if (page) { | 2211 | if (page) { |
2201 | c = __this_cpu_ptr(s->cpu_slab); | 2212 | c = raw_cpu_ptr(s->cpu_slab); |
2202 | if (c->page) | 2213 | if (c->page) |
2203 | flush_slab(s, c); | 2214 | flush_slab(s, c); |
2204 | 2215 | ||
@@ -2323,8 +2334,6 @@ redo: | |||
2323 | if (freelist) | 2334 | if (freelist) |
2324 | goto load_freelist; | 2335 | goto load_freelist; |
2325 | 2336 | ||
2326 | stat(s, ALLOC_SLOWPATH); | ||
2327 | |||
2328 | freelist = get_freelist(s, page); | 2337 | freelist = get_freelist(s, page); |
2329 | 2338 | ||
2330 | if (!freelist) { | 2339 | if (!freelist) { |
@@ -2360,9 +2369,7 @@ new_slab: | |||
2360 | freelist = new_slab_objects(s, gfpflags, node, &c); | 2369 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2361 | 2370 | ||
2362 | if (unlikely(!freelist)) { | 2371 | if (unlikely(!freelist)) { |
2363 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2372 | slab_out_of_memory(s, gfpflags, node); |
2364 | slab_out_of_memory(s, gfpflags, node); | ||
2365 | |||
2366 | local_irq_restore(flags); | 2373 | local_irq_restore(flags); |
2367 | return NULL; | 2374 | return NULL; |
2368 | } | 2375 | } |
@@ -2418,7 +2425,7 @@ redo: | |||
2418 | * and the retrieval of the tid. | 2425 | * and the retrieval of the tid. |
2419 | */ | 2426 | */ |
2420 | preempt_disable(); | 2427 | preempt_disable(); |
2421 | c = __this_cpu_ptr(s->cpu_slab); | 2428 | c = this_cpu_ptr(s->cpu_slab); |
2422 | 2429 | ||
2423 | /* | 2430 | /* |
2424 | * The transaction ids are globally unique per cpu and per operation on | 2431 | * The transaction ids are globally unique per cpu and per operation on |
@@ -2431,10 +2438,10 @@ redo: | |||
2431 | 2438 | ||
2432 | object = c->freelist; | 2439 | object = c->freelist; |
2433 | page = c->page; | 2440 | page = c->page; |
2434 | if (unlikely(!object || !node_match(page, node))) | 2441 | if (unlikely(!object || !node_match(page, node))) { |
2435 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2442 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2436 | 2443 | stat(s, ALLOC_SLOWPATH); | |
2437 | else { | 2444 | } else { |
2438 | void *next_object = get_freepointer_safe(s, object); | 2445 | void *next_object = get_freepointer_safe(s, object); |
2439 | 2446 | ||
2440 | /* | 2447 | /* |
@@ -2674,7 +2681,7 @@ redo: | |||
2674 | * during the cmpxchg then the free will succeed. | 2681 | * during the cmpxchg then the free will succeed. |
2675 | */ | 2682 | */ |
2676 | preempt_disable(); | 2683 | preempt_disable(); |
2677 | c = __this_cpu_ptr(s->cpu_slab); | 2684 | c = this_cpu_ptr(s->cpu_slab); |
2678 | 2685 | ||
2679 | tid = c->tid; | 2686 | tid = c->tid; |
2680 | preempt_enable(); | 2687 | preempt_enable(); |
@@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
2894 | 2901 | ||
2895 | BUG_ON(!page); | 2902 | BUG_ON(!page); |
2896 | if (page_to_nid(page) != node) { | 2903 | if (page_to_nid(page) != node) { |
2897 | printk(KERN_ERR "SLUB: Unable to allocate memory from " | 2904 | pr_err("SLUB: Unable to allocate memory from node %d\n", node); |
2898 | "node %d\n", node); | 2905 | pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); |
2899 | printk(KERN_ERR "SLUB: Allocating a useless per node structure " | ||
2900 | "in order to be able to continue\n"); | ||
2901 | } | 2906 | } |
2902 | 2907 | ||
2903 | n = page->freelist; | 2908 | n = page->freelist; |
@@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3182 | for_each_object(p, s, addr, page->objects) { | 3187 | for_each_object(p, s, addr, page->objects) { |
3183 | 3188 | ||
3184 | if (!test_bit(slab_index(p, s, addr), map)) { | 3189 | if (!test_bit(slab_index(p, s, addr), map)) { |
3185 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", | 3190 | pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); |
3186 | p, p - addr); | ||
3187 | print_tracking(s, p); | 3191 | print_tracking(s, p); |
3188 | } | 3192 | } |
3189 | } | 3193 | } |
@@ -3305,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3305 | struct page *page; | 3309 | struct page *page; |
3306 | void *ptr = NULL; | 3310 | void *ptr = NULL; |
3307 | 3311 | ||
3308 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; | 3312 | flags |= __GFP_COMP | __GFP_NOTRACK; |
3309 | page = alloc_pages_node(node, flags, get_order(size)); | 3313 | page = alloc_kmem_pages_node(node, flags, get_order(size)); |
3310 | if (page) | 3314 | if (page) |
3311 | ptr = page_address(page); | 3315 | ptr = page_address(page); |
3312 | 3316 | ||
@@ -3375,7 +3379,7 @@ void kfree(const void *x) | |||
3375 | if (unlikely(!PageSlab(page))) { | 3379 | if (unlikely(!PageSlab(page))) { |
3376 | BUG_ON(!PageCompound(page)); | 3380 | BUG_ON(!PageCompound(page)); |
3377 | kfree_hook(x); | 3381 | kfree_hook(x); |
3378 | __free_memcg_kmem_pages(page, compound_order(page)); | 3382 | __free_kmem_pages(page, compound_order(page)); |
3379 | return; | 3383 | return; |
3380 | } | 3384 | } |
3381 | slab_free(page->slab_cache, page, object, _RET_IP_); | 3385 | slab_free(page->slab_cache, page, object, _RET_IP_); |
@@ -3392,7 +3396,7 @@ EXPORT_SYMBOL(kfree); | |||
3392 | * being allocated from last increasing the chance that the last objects | 3396 | * being allocated from last increasing the chance that the last objects |
3393 | * are freed in them. | 3397 | * are freed in them. |
3394 | */ | 3398 | */ |
3395 | int kmem_cache_shrink(struct kmem_cache *s) | 3399 | int __kmem_cache_shrink(struct kmem_cache *s) |
3396 | { | 3400 | { |
3397 | int node; | 3401 | int node; |
3398 | int i; | 3402 | int i; |
@@ -3448,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3448 | kfree(slabs_by_inuse); | 3452 | kfree(slabs_by_inuse); |
3449 | return 0; | 3453 | return 0; |
3450 | } | 3454 | } |
3451 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
3452 | 3455 | ||
3453 | static int slab_mem_going_offline_callback(void *arg) | 3456 | static int slab_mem_going_offline_callback(void *arg) |
3454 | { | 3457 | { |
@@ -3456,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3456 | 3459 | ||
3457 | mutex_lock(&slab_mutex); | 3460 | mutex_lock(&slab_mutex); |
3458 | list_for_each_entry(s, &slab_caches, list) | 3461 | list_for_each_entry(s, &slab_caches, list) |
3459 | kmem_cache_shrink(s); | 3462 | __kmem_cache_shrink(s); |
3460 | mutex_unlock(&slab_mutex); | 3463 | mutex_unlock(&slab_mutex); |
3461 | 3464 | ||
3462 | return 0; | 3465 | return 0; |
@@ -3650,9 +3653,7 @@ void __init kmem_cache_init(void) | |||
3650 | register_cpu_notifier(&slab_notifier); | 3653 | register_cpu_notifier(&slab_notifier); |
3651 | #endif | 3654 | #endif |
3652 | 3655 | ||
3653 | printk(KERN_INFO | 3656 | pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", |
3654 | "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," | ||
3655 | " CPUs=%d, Nodes=%d\n", | ||
3656 | cache_line_size(), | 3657 | cache_line_size(), |
3657 | slub_min_order, slub_max_order, slub_min_objects, | 3658 | slub_min_order, slub_max_order, slub_min_objects, |
3658 | nr_cpu_ids, nr_node_ids); | 3659 | nr_cpu_ids, nr_node_ids); |
@@ -3934,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3934 | count++; | 3935 | count++; |
3935 | } | 3936 | } |
3936 | if (count != n->nr_partial) | 3937 | if (count != n->nr_partial) |
3937 | printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " | 3938 | pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", |
3938 | "counter=%ld\n", s->name, count, n->nr_partial); | 3939 | s->name, count, n->nr_partial); |
3939 | 3940 | ||
3940 | if (!(s->flags & SLAB_STORE_USER)) | 3941 | if (!(s->flags & SLAB_STORE_USER)) |
3941 | goto out; | 3942 | goto out; |
@@ -3945,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3945 | count++; | 3946 | count++; |
3946 | } | 3947 | } |
3947 | if (count != atomic_long_read(&n->nr_slabs)) | 3948 | if (count != atomic_long_read(&n->nr_slabs)) |
3948 | printk(KERN_ERR "SLUB: %s %ld slabs counted but " | 3949 | pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", |
3949 | "counter=%ld\n", s->name, count, | 3950 | s->name, count, atomic_long_read(&n->nr_slabs)); |
3950 | atomic_long_read(&n->nr_slabs)); | ||
3951 | 3951 | ||
3952 | out: | 3952 | out: |
3953 | spin_unlock_irqrestore(&n->list_lock, flags); | 3953 | spin_unlock_irqrestore(&n->list_lock, flags); |
@@ -4211,53 +4211,50 @@ static void resiliency_test(void) | |||
4211 | 4211 | ||
4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); | 4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); |
4213 | 4213 | ||
4214 | printk(KERN_ERR "SLUB resiliency testing\n"); | 4214 | pr_err("SLUB resiliency testing\n"); |
4215 | printk(KERN_ERR "-----------------------\n"); | 4215 | pr_err("-----------------------\n"); |
4216 | printk(KERN_ERR "A. Corruption after allocation\n"); | 4216 | pr_err("A. Corruption after allocation\n"); |
4217 | 4217 | ||
4218 | p = kzalloc(16, GFP_KERNEL); | 4218 | p = kzalloc(16, GFP_KERNEL); |
4219 | p[16] = 0x12; | 4219 | p[16] = 0x12; |
4220 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | 4220 | pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", |
4221 | " 0x12->0x%p\n\n", p + 16); | 4221 | p + 16); |
4222 | 4222 | ||
4223 | validate_slab_cache(kmalloc_caches[4]); | 4223 | validate_slab_cache(kmalloc_caches[4]); |
4224 | 4224 | ||
4225 | /* Hmmm... The next two are dangerous */ | 4225 | /* Hmmm... The next two are dangerous */ |
4226 | p = kzalloc(32, GFP_KERNEL); | 4226 | p = kzalloc(32, GFP_KERNEL); |
4227 | p[32 + sizeof(void *)] = 0x34; | 4227 | p[32 + sizeof(void *)] = 0x34; |
4228 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | 4228 | pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", |
4229 | " 0x34 -> -0x%p\n", p); | 4229 | p); |
4230 | printk(KERN_ERR | 4230 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4231 | "If allocated object is overwritten then not detectable\n\n"); | ||
4232 | 4231 | ||
4233 | validate_slab_cache(kmalloc_caches[5]); | 4232 | validate_slab_cache(kmalloc_caches[5]); |
4234 | p = kzalloc(64, GFP_KERNEL); | 4233 | p = kzalloc(64, GFP_KERNEL); |
4235 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | 4234 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); |
4236 | *p = 0x56; | 4235 | *p = 0x56; |
4237 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | 4236 | pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", |
4238 | p); | 4237 | p); |
4239 | printk(KERN_ERR | 4238 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4240 | "If allocated object is overwritten then not detectable\n\n"); | ||
4241 | validate_slab_cache(kmalloc_caches[6]); | 4239 | validate_slab_cache(kmalloc_caches[6]); |
4242 | 4240 | ||
4243 | printk(KERN_ERR "\nB. Corruption after free\n"); | 4241 | pr_err("\nB. Corruption after free\n"); |
4244 | p = kzalloc(128, GFP_KERNEL); | 4242 | p = kzalloc(128, GFP_KERNEL); |
4245 | kfree(p); | 4243 | kfree(p); |
4246 | *p = 0x78; | 4244 | *p = 0x78; |
4247 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | 4245 | pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); |
4248 | validate_slab_cache(kmalloc_caches[7]); | 4246 | validate_slab_cache(kmalloc_caches[7]); |
4249 | 4247 | ||
4250 | p = kzalloc(256, GFP_KERNEL); | 4248 | p = kzalloc(256, GFP_KERNEL); |
4251 | kfree(p); | 4249 | kfree(p); |
4252 | p[50] = 0x9a; | 4250 | p[50] = 0x9a; |
4253 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", | 4251 | pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); |
4254 | p); | ||
4255 | validate_slab_cache(kmalloc_caches[8]); | 4252 | validate_slab_cache(kmalloc_caches[8]); |
4256 | 4253 | ||
4257 | p = kzalloc(512, GFP_KERNEL); | 4254 | p = kzalloc(512, GFP_KERNEL); |
4258 | kfree(p); | 4255 | kfree(p); |
4259 | p[512] = 0xab; | 4256 | p[512] = 0xab; |
4260 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | 4257 | pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); |
4261 | validate_slab_cache(kmalloc_caches[9]); | 4258 | validate_slab_cache(kmalloc_caches[9]); |
4262 | } | 4259 | } |
4263 | #else | 4260 | #else |
@@ -4332,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4332 | } | 4329 | } |
4333 | } | 4330 | } |
4334 | 4331 | ||
4335 | lock_memory_hotplug(); | 4332 | get_online_mems(); |
4336 | #ifdef CONFIG_SLUB_DEBUG | 4333 | #ifdef CONFIG_SLUB_DEBUG |
4337 | if (flags & SO_ALL) { | 4334 | if (flags & SO_ALL) { |
4338 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4335 | for_each_node_state(node, N_NORMAL_MEMORY) { |
@@ -4372,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4372 | x += sprintf(buf + x, " N%d=%lu", | 4369 | x += sprintf(buf + x, " N%d=%lu", |
4373 | node, nodes[node]); | 4370 | node, nodes[node]); |
4374 | #endif | 4371 | #endif |
4375 | unlock_memory_hotplug(); | 4372 | put_online_mems(); |
4376 | kfree(nodes); | 4373 | kfree(nodes); |
4377 | return x + sprintf(buf + x, "\n"); | 4374 | return x + sprintf(buf + x, "\n"); |
4378 | } | 4375 | } |
@@ -5303,7 +5300,7 @@ static int __init slab_sysfs_init(void) | |||
5303 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5300 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5304 | if (!slab_kset) { | 5301 | if (!slab_kset) { |
5305 | mutex_unlock(&slab_mutex); | 5302 | mutex_unlock(&slab_mutex); |
5306 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5303 | pr_err("Cannot register slab subsystem.\n"); |
5307 | return -ENOSYS; | 5304 | return -ENOSYS; |
5308 | } | 5305 | } |
5309 | 5306 | ||
@@ -5312,8 +5309,8 @@ static int __init slab_sysfs_init(void) | |||
5312 | list_for_each_entry(s, &slab_caches, list) { | 5309 | list_for_each_entry(s, &slab_caches, list) { |
5313 | err = sysfs_slab_add(s); | 5310 | err = sysfs_slab_add(s); |
5314 | if (err) | 5311 | if (err) |
5315 | printk(KERN_ERR "SLUB: Unable to add boot slab %s" | 5312 | pr_err("SLUB: Unable to add boot slab %s to sysfs\n", |
5316 | " to sysfs\n", s->name); | 5313 | s->name); |
5317 | } | 5314 | } |
5318 | 5315 | ||
5319 | while (alias_list) { | 5316 | while (alias_list) { |
@@ -5322,8 +5319,8 @@ static int __init slab_sysfs_init(void) | |||
5322 | alias_list = alias_list->next; | 5319 | alias_list = alias_list->next; |
5323 | err = sysfs_slab_alias(al->s, al->name); | 5320 | err = sysfs_slab_alias(al->s, al->name); |
5324 | if (err) | 5321 | if (err) |
5325 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5322 | pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", |
5326 | " %s to sysfs\n", al->name); | 5323 | al->name); |
5327 | kfree(al); | 5324 | kfree(al); |
5328 | } | 5325 | } |
5329 | 5326 | ||
@@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) | |||
67 | static void __put_single_page(struct page *page) | 67 | static void __put_single_page(struct page *page) |
68 | { | 68 | { |
69 | __page_cache_release(page); | 69 | __page_cache_release(page); |
70 | free_hot_cold_page(page, 0); | 70 | free_hot_cold_page(page, false); |
71 | } | 71 | } |
72 | 72 | ||
73 | static void __put_compound_page(struct page *page) | 73 | static void __put_compound_page(struct page *page) |
@@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page) | |||
79 | (*dtor)(page); | 79 | (*dtor)(page); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void put_compound_page(struct page *page) | 82 | /** |
83 | * Two special cases here: we could avoid taking compound_lock_irqsave | ||
85 | * and could skip the tail refcounting (in _mapcount). | ||
85 | * | ||
86 | * 1. Hugetlbfs page: | ||
87 | * | ||
88 | * PageHeadHuge will remain true until the compound page | ||
89 | * is released and enters the buddy allocator, and it could | ||
90 | * not be split by __split_huge_page_refcount(). | ||
91 | * | ||
92 | * So if we see PageHeadHuge set, and we have the tail page pin, | ||
93 | * then we could safely put head page. | ||
94 | * | ||
95 | * 2. Slab THP page: | ||
96 | * | ||
97 | * PG_slab is cleared before the slab frees the head page, and | ||
98 | * tail pin cannot be the last reference left on the head page, | ||
99 | * because the slab code is free to reuse the compound page | ||
100 | * after a kfree/kmem_cache_free without having to check if | ||
101 | * there's any tail pin left. In turn all tail pins must always be | ||
102 | * released while the head is still pinned by the slab code, | ||
103 | * and so we know PG_slab will still be set too. | ||
104 | * | ||
105 | * So if we see PageSlab set, and we have the tail page pin, | ||
106 | * then we could safely put head page. | ||
107 | */ | ||
108 | static __always_inline | ||
109 | void put_unrefcounted_compound_page(struct page *page_head, struct page *page) | ||
83 | { | 110 | { |
84 | struct page *page_head; | ||
85 | |||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
88 | /* | ||
89 | * By the time all refcounts have been released | ||
90 | * split_huge_page cannot run anymore from under us. | ||
91 | */ | ||
92 | if (PageHead(page)) | ||
93 | __put_compound_page(page); | ||
94 | else | ||
95 | __put_single_page(page); | ||
96 | } | ||
97 | return; | ||
98 | } | ||
99 | |||
100 | /* __split_huge_page_refcount can run under us */ | ||
101 | page_head = compound_head(page); | ||
102 | |||
103 | /* | 111 | /* |
104 | * THP can not break up slab pages so avoid taking | 112 | * If @page is a THP tail, we must read the tail page |
105 | * compound_lock() and skip the tail page refcounting (in | 113 | * flags after the head page flags. The |
106 | * _mapcount) too. Slab performs non-atomic bit ops on | 114 | * __split_huge_page_refcount side enforces write memory barriers |
107 | * page->flags for better performance. In particular | 115 | * between clearing PageTail and before the head page |
108 | * slab_unlock() in slub used to be a hot path. It is still | 116 | * can be freed and reallocated. |
109 | * hot on arches that do not support | ||
110 | * this_cpu_cmpxchg_double(). | ||
111 | * | ||
112 | * If "page" is part of a slab or hugetlbfs page it cannot be | ||
113 | * splitted and the head page cannot change from under us. And | ||
114 | * if "page" is part of a THP page under splitting, if the | ||
115 | * head page pointed by the THP tail isn't a THP head anymore, | ||
116 | * we'll find PageTail clear after smp_rmb() and we'll treat | ||
117 | * it as a single page. | ||
118 | */ | 117 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | 118 | smp_rmb(); |
119 | if (likely(PageTail(page))) { | ||
120 | /* | 120 | /* |
121 | * If "page" is a THP tail, we must read the tail page | 121 | * __split_huge_page_refcount cannot race |
122 | * flags after the head page flags. The | 122 | * here, see the comment above this function. |
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | 123 | */ |
127 | smp_rmb(); | 124 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
128 | if (likely(PageTail(page))) { | 125 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); |
129 | /* | 126 | if (put_page_testzero(page_head)) { |
130 | * __split_huge_page_refcount cannot race | ||
131 | * here. | ||
132 | */ | ||
133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | ||
134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); | ||
135 | if (put_page_testzero(page_head)) { | ||
136 | /* | ||
137 | * If this is the tail of a slab | ||
138 | * compound page, the tail pin must | ||
139 | * not be the last reference held on | ||
140 | * the page, because the PG_slab | ||
141 | * cannot be cleared before all tail | ||
142 | * pins (which skips the _mapcount | ||
143 | * tail refcounting) have been | ||
144 | * released. For hugetlbfs the tail | ||
145 | * pin may be the last reference on | ||
146 | * the page instead, because | ||
147 | * PageHeadHuge will not go away until | ||
148 | * the compound page enters the buddy | ||
149 | * allocator. | ||
150 | */ | ||
151 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); | ||
152 | __put_compound_page(page_head); | ||
153 | } | ||
154 | return; | ||
155 | } else | ||
156 | /* | 127 | /* |
157 | * __split_huge_page_refcount run before us, | 128 | * If this is the tail of a slab THP page, |
158 | * "page" was a THP tail. The split page_head | 129 | * the tail pin must not be the last reference |
159 | * has been freed and reallocated as slab or | 130 | * held on the page, because the PG_slab cannot |
160 | * hugetlbfs page of smaller order (only | 131 | * be cleared before all tail pins (which skips |
161 | * possible if reallocated as slab on x86). | 132 | * the _mapcount tail refcounting) have been |
133 | * released. | ||
134 | * | ||
135 | * If this is the tail of a hugetlbfs page, | ||
136 | * the tail pin may be the last reference on | ||
137 | * the page instead, because PageHeadHuge will | ||
138 | * not go away until the compound page enters | ||
139 | * the buddy allocator. | ||
162 | */ | 140 | */ |
163 | goto out_put_single; | 141 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); |
164 | } | 142 | __put_compound_page(page_head); |
143 | } | ||
144 | } else | ||
145 | /* | ||
146 | * __split_huge_page_refcount run before us, | ||
147 | * @page was a THP tail. The split @page_head | ||
148 | * has been freed and reallocated as slab or | ||
149 | * hugetlbfs page of smaller order (only | ||
150 | * possible if reallocated as slab on x86). | ||
151 | */ | ||
152 | if (put_page_testzero(page)) | ||
153 | __put_single_page(page); | ||
154 | } | ||
165 | 155 | ||
156 | static __always_inline | ||
157 | void put_refcounted_compound_page(struct page *page_head, struct page *page) | ||
158 | { | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 159 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
167 | unsigned long flags; | 160 | unsigned long flags; |
168 | 161 | ||
169 | /* | 162 | /* |
170 | * page_head wasn't a dangling pointer but it may not | 163 | * @page_head wasn't a dangling pointer but it may not |
171 | * be a head page anymore by the time we obtain the | 164 | * be a head page anymore by the time we obtain the |
172 | * lock. That is ok as long as it can't be freed from | 165 | * lock. That is ok as long as it can't be freed from |
173 | * under us. | 166 | * under us. |
@@ -178,7 +171,7 @@ static void put_compound_page(struct page *page) | |||
178 | compound_unlock_irqrestore(page_head, flags); | 171 | compound_unlock_irqrestore(page_head, flags); |
179 | if (put_page_testzero(page_head)) { | 172 | if (put_page_testzero(page_head)) { |
180 | /* | 173 | /* |
181 | * The head page may have been freed | 174 | * The @page_head may have been freed |
182 | * and reallocated as a compound page | 175 | * and reallocated as a compound page |
183 | * of smaller order and then freed | 176 | * of smaller order and then freed |
184 | * again. All we know is that it | 177 | * again. All we know is that it |
@@ -222,12 +215,51 @@ out_put_single: | |||
222 | __put_single_page(page_head); | 215 | __put_single_page(page_head); |
223 | } | 216 | } |
224 | } else { | 217 | } else { |
225 | /* page_head is a dangling pointer */ | 218 | /* @page_head is a dangling pointer */ |
226 | VM_BUG_ON_PAGE(PageTail(page), page); | 219 | VM_BUG_ON_PAGE(PageTail(page), page); |
227 | goto out_put_single; | 220 | goto out_put_single; |
228 | } | 221 | } |
229 | } | 222 | } |
230 | 223 | ||
224 | static void put_compound_page(struct page *page) | ||
225 | { | ||
226 | struct page *page_head; | ||
227 | |||
228 | /* | ||
229 | * We see the PageCompound set and PageTail not set, so @page maybe: | ||
230 | * 1. hugetlbfs head page, or | ||
231 | * 2. THP head page. | ||
232 | */ | ||
233 | if (likely(!PageTail(page))) { | ||
234 | if (put_page_testzero(page)) { | ||
235 | /* | ||
236 | * By the time all refcounts have been released | ||
237 | * split_huge_page cannot run anymore from under us. | ||
238 | */ | ||
239 | if (PageHead(page)) | ||
240 | __put_compound_page(page); | ||
241 | else | ||
242 | __put_single_page(page); | ||
243 | } | ||
244 | return; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * We see the PageCompound set and PageTail set, so @page maybe: | ||
249 | * 1. a tail hugetlbfs page, or | ||
250 | * 2. a tail THP page, or | ||
251 | * 3. a split THP page. | ||
252 | * | ||
253 | * Case 3 is possible, as we may race with | ||
254 | * __split_huge_page_refcount tearing down a THP page. | ||
255 | */ | ||
256 | page_head = compound_head_by_tail(page); | ||
257 | if (!__compound_tail_refcounted(page_head)) | ||
258 | put_unrefcounted_compound_page(page_head, page); | ||
259 | else | ||
260 | put_refcounted_compound_page(page_head, page); | ||
261 | } | ||
262 | |||
231 | void put_page(struct page *page) | 263 | void put_page(struct page *page) |
232 | { | 264 | { |
233 | if (unlikely(PageCompound(page))) | 265 | if (unlikely(PageCompound(page))) |
@@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page) | |||
441 | 473 | ||
442 | page_cache_get(page); | 474 | page_cache_get(page); |
443 | local_irq_save(flags); | 475 | local_irq_save(flags); |
444 | pvec = &__get_cpu_var(lru_rotate_pvecs); | 476 | pvec = this_cpu_ptr(&lru_rotate_pvecs); |
445 | if (!pagevec_add(pvec, page)) | 477 | if (!pagevec_add(pvec, page)) |
446 | pagevec_move_tail(pvec); | 478 | pagevec_move_tail(pvec); |
447 | local_irq_restore(flags); | 479 | local_irq_restore(flags); |
@@ -583,12 +615,17 @@ void mark_page_accessed(struct page *page) | |||
583 | EXPORT_SYMBOL(mark_page_accessed); | 615 | EXPORT_SYMBOL(mark_page_accessed); |
584 | 616 | ||
585 | /* | 617 | /* |
586 | * Queue the page for addition to the LRU via pagevec. The decision on whether | 618 | * Used to mark_page_accessed(page) on a page that is not visible yet and while |
587 | * to add the page to the [in]active [file|anon] list is deferred until the | 619 | * it is still safe to use non-atomic ops |
588 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() | ||
589 | * have the page added to the active list using mark_page_accessed(). | ||
590 | */ | 620 | */ |
591 | void __lru_cache_add(struct page *page) | 621 | void init_page_accessed(struct page *page) |
622 | { | ||
623 | if (!PageReferenced(page)) | ||
624 | __SetPageReferenced(page); | ||
625 | } | ||
626 | EXPORT_SYMBOL(init_page_accessed); | ||
627 | |||
628 | static void __lru_cache_add(struct page *page) | ||
592 | { | 629 | { |
593 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | 630 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
594 | 631 | ||
@@ -598,11 +635,34 @@ void __lru_cache_add(struct page *page) | |||
598 | pagevec_add(pvec, page); | 635 | pagevec_add(pvec, page); |
599 | put_cpu_var(lru_add_pvec); | 636 | put_cpu_var(lru_add_pvec); |
600 | } | 637 | } |
601 | EXPORT_SYMBOL(__lru_cache_add); | 638 | |
639 | /** | ||
640 | * lru_cache_add: add a page to the page lists | ||
641 | * @page: the page to add | ||
642 | */ | ||
643 | void lru_cache_add_anon(struct page *page) | ||
644 | { | ||
645 | if (PageActive(page)) | ||
646 | ClearPageActive(page); | ||
647 | __lru_cache_add(page); | ||
648 | } | ||
649 | |||
650 | void lru_cache_add_file(struct page *page) | ||
651 | { | ||
652 | if (PageActive(page)) | ||
653 | ClearPageActive(page); | ||
654 | __lru_cache_add(page); | ||
655 | } | ||
656 | EXPORT_SYMBOL(lru_cache_add_file); | ||
602 | 657 | ||
603 | /** | 658 | /** |
604 | * lru_cache_add - add a page to a page list | 659 | * lru_cache_add - add a page to a page list |
605 | * @page: the page to be added to the LRU. | 660 | * @page: the page to be added to the LRU. |
661 | * | ||
662 | * Queue the page for addition to the LRU via pagevec. The decision on whether | ||
663 | * to add the page to the [in]active [file|anon] list is deferred until the | ||
664 | * pagevec is drained. This gives a chance for the caller of lru_cache_add() | ||
665 | * have the page added to the active list using mark_page_accessed(). | ||
606 | */ | 666 | */ |
607 | void lru_cache_add(struct page *page) | 667 | void lru_cache_add(struct page *page) |
608 | { | 668 | { |
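init_page_accessed() lets a caller set PG_referenced with the cheaper non-atomic __SetPageReferenced() while the page is not yet visible to other CPUs, and the new lru_cache_add_anon()/lru_cache_add_file() helpers clear PG_active before queueing the page on the per-CPU pagevec. A hedged sketch of the intended ordering for a freshly allocated, not-yet-visible page-cache page (example_add_new_file_page() is a made-up helper):

static void example_add_new_file_page(struct page *page)
{
        /* Safe only while the page is still private to this caller. */
        init_page_accessed(page);

        /* Queue for the LRU; PG_active is cleared before the page is added. */
        lru_cache_add_file(page);
}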
@@ -813,7 +873,7 @@ void lru_add_drain_all(void) | |||
813 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | 873 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
814 | * will free it. | 874 | * will free it. |
815 | */ | 875 | */ |
816 | void release_pages(struct page **pages, int nr, int cold) | 876 | void release_pages(struct page **pages, int nr, bool cold) |
817 | { | 877 | { |
818 | int i; | 878 | int i; |
819 | LIST_HEAD(pages_to_free); | 879 | LIST_HEAD(pages_to_free); |
@@ -854,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
854 | } | 914 | } |
855 | 915 | ||
856 | /* Clear Active bit in case of parallel mark_page_accessed */ | 916 | /* Clear Active bit in case of parallel mark_page_accessed */ |
857 | ClearPageActive(page); | 917 | __ClearPageActive(page); |
858 | 918 | ||
859 | list_add(&page->lru, &pages_to_free); | 919 | list_add(&page->lru, &pages_to_free); |
860 | } | 920 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d436..2972eee184a4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
270 | 270 | ||
271 | for (i = 0; i < todo; i++) | 271 | for (i = 0; i < todo; i++) |
272 | free_swap_cache(pagep[i]); | 272 | free_swap_cache(pagep[i]); |
273 | release_pages(pagep, todo, 0); | 273 | release_pages(pagep, todo, false); |
274 | pagep += todo; | 274 | pagep += todo; |
275 | nr -= todo; | 275 | nr -= todo; |
276 | } | 276 | } |
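release_pages() (and free_hot_cold_page() further up) now take a bool for the hot/cold hint, and the batched path may use the non-atomic __ClearPageActive() because it holds the last reference; the mm/swap_state.c hunk above shows the corresponding caller update. A trivial sketch of the calling convention, assuming a hypothetical batch whose references the caller owns:

#include <linux/pagemap.h>

/* Drop one reference on each page in the batch; false = treat pages as cache-hot. */
static void example_put_page_batch(struct page **pages, int nr)
{
        release_pages(pages, nr, false);
}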
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b6..4c524f7bd0bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; | |||
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
52 | long total_swap_pages; | 52 | long total_swap_pages; |
53 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
55 | 54 | ||
56 | static const char Bad_file[] = "Bad swap file entry "; | 55 | static const char Bad_file[] = "Bad swap file entry "; |
57 | static const char Unused_file[] = "Unused swap file entry "; | 56 | static const char Unused_file[] = "Unused swap file entry "; |
58 | static const char Bad_offset[] = "Bad swap offset entry "; | 57 | static const char Bad_offset[] = "Bad swap offset entry "; |
59 | static const char Unused_offset[] = "Unused swap offset entry "; | 58 | static const char Unused_offset[] = "Unused swap offset entry "; |
60 | 59 | ||
61 | struct swap_list_t swap_list = {-1, -1}; | 60 | /* |
61 | * all active swap_info_structs | ||
62 | * protected with swap_lock, and ordered by priority. | ||
63 | */ | ||
64 | PLIST_HEAD(swap_active_head); | ||
65 | |||
66 | /* | ||
67 | * all available (active, not full) swap_info_structs | ||
68 | * protected with swap_avail_lock, ordered by priority. | ||
69 | * This is used by get_swap_page() instead of swap_active_head | ||
70 | * because swap_active_head includes all swap_info_structs, | ||
71 | * but get_swap_page() doesn't need to look at full ones. | ||
72 | * This uses its own lock instead of swap_lock because when a | ||
73 | * swap_info_struct changes between not-full/full, it needs to | ||
74 | * add/remove itself to/from this list, but the swap_info_struct->lock | ||
75 | * is held and the locking order requires swap_lock to be taken | ||
76 | * before any swap_info_struct->lock. | ||
77 | */ | ||
78 | static PLIST_HEAD(swap_avail_head); | ||
79 | static DEFINE_SPINLOCK(swap_avail_lock); | ||
62 | 80 | ||
63 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 81 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
64 | 82 | ||
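The old swap_list_t round-robin bookkeeping is replaced by two priority-sorted lists (plists): swap_active_head for all active swap devices and swap_avail_head, with its own lock, for the not-full ones that get_swap_page() may use. A plist keeps nodes sorted by ascending priority value, so the swap code stores negated priorities to iterate highest-priority devices first. A generic, hedged sketch of that plist pattern with a made-up structure (example_dev), not the swap code itself:

#include <linux/plist.h>
#include <linux/spinlock.h>

struct example_dev {
        struct plist_node avail;
        int prio;
};

static PLIST_HEAD(example_avail_head);
static DEFINE_SPINLOCK(example_avail_lock);

static void example_register(struct example_dev *dev, int prio)
{
        dev->prio = prio;
        /* plist sorts ascending; negate so higher-priority devices come first. */
        plist_node_init(&dev->avail, -prio);

        spin_lock(&example_avail_lock);
        plist_add(&dev->avail, &example_avail_head);
        spin_unlock(&example_avail_lock);
}

static struct example_dev *example_pick_highest(void)
{
        struct example_dev *dev, *found = NULL;

        spin_lock(&example_avail_lock);
        plist_for_each_entry(dev, &example_avail_head, avail) {
                found = dev;    /* first entry == highest device priority */
                break;
        }
        spin_unlock(&example_avail_lock);

        return found;
}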
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
505 | /* | 523 | /* |
506 | * If seek is expensive, start searching for new cluster from | 524 | * If seek is expensive, start searching for new cluster from |
507 | * start of partition, to minimize the span of allocated swap. | 525 | * start of partition, to minimize the span of allocated swap. |
508 | * But if seek is cheap, search from our current position, so | 526 | * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info |
509 | * that swap is allocated from all over the partition: if the | 527 | * case, just handled by scan_swap_map_try_ssd_cluster() above. |
510 | * Flash Translation Layer only remaps within limited zones, | ||
511 | * we don't want to wear out the first zone too quickly. | ||
512 | */ | 528 | */ |
513 | if (!(si->flags & SWP_SOLIDSTATE)) | 529 | scan_base = offset = si->lowest_bit; |
514 | scan_base = offset = si->lowest_bit; | ||
515 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 530 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
516 | 531 | ||
517 | /* Locate the first empty (unaligned) cluster */ | 532 | /* Locate the first empty (unaligned) cluster */ |
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
531 | } | 546 | } |
532 | } | 547 | } |
533 | 548 | ||
534 | offset = si->lowest_bit; | ||
535 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
536 | |||
537 | /* Locate the first empty (unaligned) cluster */ | ||
538 | for (; last_in_cluster < scan_base; offset++) { | ||
539 | if (si->swap_map[offset]) | ||
540 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
541 | else if (offset == last_in_cluster) { | ||
542 | spin_lock(&si->lock); | ||
543 | offset -= SWAPFILE_CLUSTER - 1; | ||
544 | si->cluster_next = offset; | ||
545 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
546 | goto checks; | ||
547 | } | ||
548 | if (unlikely(--latency_ration < 0)) { | ||
549 | cond_resched(); | ||
550 | latency_ration = LATENCY_LIMIT; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | offset = scan_base; | 549 | offset = scan_base; |
555 | spin_lock(&si->lock); | 550 | spin_lock(&si->lock); |
556 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 551 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -591,6 +586,9 @@ checks: | |||
591 | if (si->inuse_pages == si->pages) { | 586 | if (si->inuse_pages == si->pages) { |
592 | si->lowest_bit = si->max; | 587 | si->lowest_bit = si->max; |
593 | si->highest_bit = 0; | 588 | si->highest_bit = 0; |
589 | spin_lock(&swap_avail_lock); | ||
590 | plist_del(&si->avail_list, &swap_avail_head); | ||
591 | spin_unlock(&swap_avail_lock); | ||
594 | } | 592 | } |
595 | si->swap_map[offset] = usage; | 593 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | 594 | inc_cluster_info_page(si, si->cluster_info, offset); |
@@ -640,71 +638,65 @@ no_page: | |||
640 | 638 | ||
641 | swp_entry_t get_swap_page(void) | 639 | swp_entry_t get_swap_page(void) |
642 | { | 640 | { |
643 | struct swap_info_struct *si; | 641 | struct swap_info_struct *si, *next; |
644 | pgoff_t offset; | 642 | pgoff_t offset; |
645 | int type, next; | ||
646 | int wrapped = 0; | ||
647 | int hp_index; | ||
648 | 643 | ||
649 | spin_lock(&swap_lock); | ||
650 | if (atomic_long_read(&nr_swap_pages) <= 0) | 644 | if (atomic_long_read(&nr_swap_pages) <= 0) |
651 | goto noswap; | 645 | goto noswap; |
652 | atomic_long_dec(&nr_swap_pages); | 646 | atomic_long_dec(&nr_swap_pages); |
653 | 647 | ||
654 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 648 | spin_lock(&swap_avail_lock); |
655 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
656 | /* | ||
657 | * highest_priority_index records current highest priority swap | ||
658 | * type which just frees swap entries. If its priority is | ||
659 | * higher than that of swap_list.next swap type, we use it. It | ||
660 | * isn't protected by swap_lock, so it can be an invalid value | ||
661 | * if the corresponding swap type is swapoff. We double check | ||
662 | * the flags here. It's even possible the swap type is swapoff | ||
663 | * and swapon again and its priority is changed. In such rare | ||
664 | * case, low prority swap type might be used, but eventually | ||
665 | * high priority swap will be used after several rounds of | ||
666 | * swap. | ||
667 | */ | ||
668 | if (hp_index != -1 && hp_index != type && | ||
669 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
670 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
671 | type = hp_index; | ||
672 | swap_list.next = type; | ||
673 | } | ||
674 | |||
675 | si = swap_info[type]; | ||
676 | next = si->next; | ||
677 | if (next < 0 || | ||
678 | (!wrapped && si->prio != swap_info[next]->prio)) { | ||
679 | next = swap_list.head; | ||
680 | wrapped++; | ||
681 | } | ||
682 | 649 | ||
650 | start_over: | ||
651 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
652 | /* requeue si to after same-priority siblings */ | ||
653 | plist_requeue(&si->avail_list, &swap_avail_head); | ||
654 | spin_unlock(&swap_avail_lock); | ||
683 | spin_lock(&si->lock); | 655 | spin_lock(&si->lock); |
684 | if (!si->highest_bit) { | 656 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
685 | spin_unlock(&si->lock); | 657 | spin_lock(&swap_avail_lock); |
686 | continue; | 658 | if (plist_node_empty(&si->avail_list)) { |
687 | } | 659 | spin_unlock(&si->lock); |
688 | if (!(si->flags & SWP_WRITEOK)) { | 660 | goto nextsi; |
661 | } | ||
662 | WARN(!si->highest_bit, | ||
663 | "swap_info %d in list but !highest_bit\n", | ||
664 | si->type); | ||
665 | WARN(!(si->flags & SWP_WRITEOK), | ||
666 | "swap_info %d in list but !SWP_WRITEOK\n", | ||
667 | si->type); | ||
668 | plist_del(&si->avail_list, &swap_avail_head); | ||
689 | spin_unlock(&si->lock); | 669 | spin_unlock(&si->lock); |
690 | continue; | 670 | goto nextsi; |
691 | } | 671 | } |
692 | 672 | ||
693 | swap_list.next = next; | ||
694 | |||
695 | spin_unlock(&swap_lock); | ||
696 | /* This is called for allocating swap entry for cache */ | 673 | /* This is called for allocating swap entry for cache */ |
697 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 674 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
698 | spin_unlock(&si->lock); | 675 | spin_unlock(&si->lock); |
699 | if (offset) | 676 | if (offset) |
700 | return swp_entry(type, offset); | 677 | return swp_entry(si->type, offset); |
701 | spin_lock(&swap_lock); | 678 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
702 | next = swap_list.next; | 679 | si->type); |
680 | spin_lock(&swap_avail_lock); | ||
681 | nextsi: | ||
682 | /* | ||
683 | * if we got here, it's likely that si was almost full before, | ||
684 | * and since scan_swap_map() can drop the si->lock, multiple | ||
685 | * callers probably all tried to get a page from the same si | ||
686 | * and it filled up before we could get one; or, the si filled | ||
687 | * up between us dropping swap_avail_lock and taking si->lock. | ||
688 | * Since we dropped the swap_avail_lock, the swap_avail_head | ||
689 | * list may have been modified; so if next is still in the | ||
690 | * swap_avail_head list then try it, otherwise start over. | ||
691 | */ | ||
692 | if (plist_node_empty(&next->avail_list)) | ||
693 | goto start_over; | ||
703 | } | 694 | } |
704 | 695 | ||
696 | spin_unlock(&swap_avail_lock); | ||
697 | |||
705 | atomic_long_inc(&nr_swap_pages); | 698 | atomic_long_inc(&nr_swap_pages); |
706 | noswap: | 699 | noswap: |
707 | spin_unlock(&swap_lock); | ||
708 | return (swp_entry_t) {0}; | 700 | return (swp_entry_t) {0}; |
709 | } | 701 | } |
710 | 702 | ||
@@ -766,27 +758,6 @@ out: | |||
766 | return NULL; | 758 | return NULL; |
767 | } | 759 | } |
768 | 760 | ||
769 | /* | ||
770 | * This swap type frees swap entry, check if it is the highest priority swap | ||
771 | * type which just frees swap entry. get_swap_page() uses | ||
772 | * highest_priority_index to search highest priority swap type. The | ||
773 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
774 | * active, so we use atomic_cmpxchg. | ||
775 | */ | ||
776 | static void set_highest_priority_index(int type) | ||
777 | { | ||
778 | int old_hp_index, new_hp_index; | ||
779 | |||
780 | do { | ||
781 | old_hp_index = atomic_read(&highest_priority_index); | ||
782 | if (old_hp_index != -1 && | ||
783 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
784 | break; | ||
785 | new_hp_index = type; | ||
786 | } while (atomic_cmpxchg(&highest_priority_index, | ||
787 | old_hp_index, new_hp_index) != old_hp_index); | ||
788 | } | ||
789 | |||
790 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 761 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
791 | swp_entry_t entry, unsigned char usage) | 762 | swp_entry_t entry, unsigned char usage) |
792 | { | 763 | { |
@@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
828 | dec_cluster_info_page(p, p->cluster_info, offset); | 799 | dec_cluster_info_page(p, p->cluster_info, offset); |
829 | if (offset < p->lowest_bit) | 800 | if (offset < p->lowest_bit) |
830 | p->lowest_bit = offset; | 801 | p->lowest_bit = offset; |
831 | if (offset > p->highest_bit) | 802 | if (offset > p->highest_bit) { |
803 | bool was_full = !p->highest_bit; | ||
832 | p->highest_bit = offset; | 804 | p->highest_bit = offset; |
833 | set_highest_priority_index(p->type); | 805 | if (was_full && (p->flags & SWP_WRITEOK)) { |
806 | spin_lock(&swap_avail_lock); | ||
807 | WARN_ON(!plist_node_empty(&p->avail_list)); | ||
808 | if (plist_node_empty(&p->avail_list)) | ||
809 | plist_add(&p->avail_list, | ||
810 | &swap_avail_head); | ||
811 | spin_unlock(&swap_avail_lock); | ||
812 | } | ||
813 | } | ||
834 | atomic_long_inc(&nr_swap_pages); | 814 | atomic_long_inc(&nr_swap_pages); |
835 | p->inuse_pages--; | 815 | p->inuse_pages--; |
836 | frontswap_invalidate_page(p->type, offset); | 816 | frontswap_invalidate_page(p->type, offset); |
@@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1765 | unsigned char *swap_map, | 1745 | unsigned char *swap_map, |
1766 | struct swap_cluster_info *cluster_info) | 1746 | struct swap_cluster_info *cluster_info) |
1767 | { | 1747 | { |
1768 | int i, prev; | ||
1769 | |||
1770 | if (prio >= 0) | 1748 | if (prio >= 0) |
1771 | p->prio = prio; | 1749 | p->prio = prio; |
1772 | else | 1750 | else |
1773 | p->prio = --least_priority; | 1751 | p->prio = --least_priority; |
1752 | /* | ||
1753 | * the plist prio is negated because plist ordering is | ||
1754 | * low-to-high, while swap ordering is high-to-low | ||
1755 | */ | ||
1756 | p->list.prio = -p->prio; | ||
1757 | p->avail_list.prio = -p->prio; | ||
1774 | p->swap_map = swap_map; | 1758 | p->swap_map = swap_map; |
1775 | p->cluster_info = cluster_info; | 1759 | p->cluster_info = cluster_info; |
1776 | p->flags |= SWP_WRITEOK; | 1760 | p->flags |= SWP_WRITEOK; |
1777 | atomic_long_add(p->pages, &nr_swap_pages); | 1761 | atomic_long_add(p->pages, &nr_swap_pages); |
1778 | total_swap_pages += p->pages; | 1762 | total_swap_pages += p->pages; |
1779 | 1763 | ||
1780 | /* insert swap space into swap_list: */ | 1764 | assert_spin_locked(&swap_lock); |
1781 | prev = -1; | 1765 | /* |
1782 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | 1766 | * both lists are plists, and thus priority ordered. |
1783 | if (p->prio >= swap_info[i]->prio) | 1767 | * swap_active_head needs to be priority ordered for swapoff(), |
1784 | break; | 1768 | * which on removal of any swap_info_struct with an auto-assigned |
1785 | prev = i; | 1769 | * (i.e. negative) priority increments the auto-assigned priority |
1786 | } | 1770 | * of any lower-priority swap_info_structs. |
1787 | p->next = i; | 1771 | * swap_avail_head needs to be priority ordered for get_swap_page(), |
1788 | if (prev < 0) | 1772 | * which allocates swap pages from the highest available priority |
1789 | swap_list.head = swap_list.next = p->type; | 1773 | * swap_info_struct. |
1790 | else | 1774 | */ |
1791 | swap_info[prev]->next = p->type; | 1775 | plist_add(&p->list, &swap_active_head); |
1776 | spin_lock(&swap_avail_lock); | ||
1777 | plist_add(&p->avail_list, &swap_avail_head); | ||
1778 | spin_unlock(&swap_avail_lock); | ||
1792 | } | 1779 | } |
1793 | 1780 | ||
1794 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1781 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1823 | struct address_space *mapping; | 1810 | struct address_space *mapping; |
1824 | struct inode *inode; | 1811 | struct inode *inode; |
1825 | struct filename *pathname; | 1812 | struct filename *pathname; |
1826 | int i, type, prev; | 1813 | int err, found = 0; |
1827 | int err; | ||
1828 | unsigned int old_block_size; | 1814 | unsigned int old_block_size; |
1829 | 1815 | ||
1830 | if (!capable(CAP_SYS_ADMIN)) | 1816 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1842 | goto out; | 1828 | goto out; |
1843 | 1829 | ||
1844 | mapping = victim->f_mapping; | 1830 | mapping = victim->f_mapping; |
1845 | prev = -1; | ||
1846 | spin_lock(&swap_lock); | 1831 | spin_lock(&swap_lock); |
1847 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | 1832 | plist_for_each_entry(p, &swap_active_head, list) { |
1848 | p = swap_info[type]; | ||
1849 | if (p->flags & SWP_WRITEOK) { | 1833 | if (p->flags & SWP_WRITEOK) { |
1850 | if (p->swap_file->f_mapping == mapping) | 1834 | if (p->swap_file->f_mapping == mapping) { |
1835 | found = 1; | ||
1851 | break; | 1836 | break; |
1837 | } | ||
1852 | } | 1838 | } |
1853 | prev = type; | ||
1854 | } | 1839 | } |
1855 | if (type < 0) { | 1840 | if (!found) { |
1856 | err = -EINVAL; | 1841 | err = -EINVAL; |
1857 | spin_unlock(&swap_lock); | 1842 | spin_unlock(&swap_lock); |
1858 | goto out_dput; | 1843 | goto out_dput; |
@@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1864 | spin_unlock(&swap_lock); | 1849 | spin_unlock(&swap_lock); |
1865 | goto out_dput; | 1850 | goto out_dput; |
1866 | } | 1851 | } |
1867 | if (prev < 0) | 1852 | spin_lock(&swap_avail_lock); |
1868 | swap_list.head = p->next; | 1853 | plist_del(&p->avail_list, &swap_avail_head); |
1869 | else | 1854 | spin_unlock(&swap_avail_lock); |
1870 | swap_info[prev]->next = p->next; | ||
1871 | if (type == swap_list.next) { | ||
1872 | /* just pick something that's safe... */ | ||
1873 | swap_list.next = swap_list.head; | ||
1874 | } | ||
1875 | spin_lock(&p->lock); | 1855 | spin_lock(&p->lock); |
1876 | if (p->prio < 0) { | 1856 | if (p->prio < 0) { |
1877 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1857 | struct swap_info_struct *si = p; |
1878 | swap_info[i]->prio = p->prio--; | 1858 | |
1859 | plist_for_each_entry_continue(si, &swap_active_head, list) { | ||
1860 | si->prio++; | ||
1861 | si->list.prio--; | ||
1862 | si->avail_list.prio--; | ||
1863 | } | ||
1879 | least_priority++; | 1864 | least_priority++; |
1880 | } | 1865 | } |
1866 | plist_del(&p->list, &swap_active_head); | ||
1881 | atomic_long_sub(p->pages, &nr_swap_pages); | 1867 | atomic_long_sub(p->pages, &nr_swap_pages); |
1882 | total_swap_pages -= p->pages; | 1868 | total_swap_pages -= p->pages; |
1883 | p->flags &= ~SWP_WRITEOK; | 1869 | p->flags &= ~SWP_WRITEOK; |
@@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1885 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1886 | 1872 | ||
1887 | set_current_oom_origin(); | 1873 | set_current_oom_origin(); |
1888 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1874 | err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
1889 | clear_current_oom_origin(); | 1875 | clear_current_oom_origin(); |
1890 | 1876 | ||
1891 | if (err) { | 1877 | if (err) { |
@@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1926 | frontswap_map = frontswap_map_get(p); | 1912 | frontswap_map = frontswap_map_get(p); |
1927 | spin_unlock(&p->lock); | 1913 | spin_unlock(&p->lock); |
1928 | spin_unlock(&swap_lock); | 1914 | spin_unlock(&swap_lock); |
1929 | frontswap_invalidate_area(type); | 1915 | frontswap_invalidate_area(p->type); |
1930 | frontswap_map_set(p, NULL); | 1916 | frontswap_map_set(p, NULL); |
1931 | mutex_unlock(&swapon_mutex); | 1917 | mutex_unlock(&swapon_mutex); |
1932 | free_percpu(p->percpu_cluster); | 1918 | free_percpu(p->percpu_cluster); |
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1935 | vfree(cluster_info); | 1921 | vfree(cluster_info); |
1936 | vfree(frontswap_map); | 1922 | vfree(frontswap_map); |
1937 | /* Destroy swap account information */ | 1923 | /* Destroy swap account information */ |
1938 | swap_cgroup_swapoff(type); | 1924 | swap_cgroup_swapoff(p->type); |
1939 | 1925 | ||
1940 | inode = mapping->host; | 1926 | inode = mapping->host; |
1941 | if (S_ISBLK(inode->i_mode)) { | 1927 | if (S_ISBLK(inode->i_mode)) { |
@@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2142 | */ | 2128 | */ |
2143 | } | 2129 | } |
2144 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2130 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2131 | plist_node_init(&p->list, 0); | ||
2132 | plist_node_init(&p->avail_list, 0); | ||
2145 | p->flags = SWP_USED; | 2133 | p->flags = SWP_USED; |
2146 | p->next = -1; | ||
2147 | spin_unlock(&swap_lock); | 2134 | spin_unlock(&swap_lock); |
2148 | spin_lock_init(&p->lock); | 2135 | spin_lock_init(&p->lock); |
2149 | 2136 | ||
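
The swapfile.c hunks above replace the swap_list/highest_priority_index machinery with two priority-sorted plists, swap_active_head and swap_avail_head, and get_swap_page() now requeues each device behind its same-priority siblings so equal-priority swap devices are filled round-robin, while a full device is simply dropped from the avail list. The stand-alone sketch below models only that selection policy with a plain linked list; the struct and helper names are invented for illustration and are not the kernel's plist API. Running it hands pages out alternately to the two priority-10 devices and only falls back to the priority-5 one once both are full.

/* User-space model of the swap_avail_head policy: devices are kept in a
 * list sorted by priority (highest first); after serving an allocation a
 * device is put back *behind* its equal-priority siblings, and a full
 * device simply stays off the list until space is freed again. */
#include <stdio.h>

struct swap_dev {
	int prio;			/* higher value is preferred */
	int free_slots;			/* crude stand-in for free swap slots */
	const char *name;
	struct swap_dev *next;
};

static struct swap_dev *pop(struct swap_dev **head)
{
	struct swap_dev *si = *head;

	if (si)
		*head = si->next;
	return si;
}

/* Re-insert after every entry of greater or equal priority: this is the
 * effect plist_requeue() has in the real get_swap_page() loop. */
static void requeue(struct swap_dev **head, struct swap_dev *si)
{
	struct swap_dev **link = head;

	while (*link && (*link)->prio >= si->prio)
		link = &(*link)->next;
	si->next = *link;
	*link = si;
}

int main(void)
{
	struct swap_dev c = { 5, 4, "swapC", NULL };
	struct swap_dev b = { 10, 2, "swapB", &c };
	struct swap_dev a = { 10, 2, "swapA", &b };
	struct swap_dev *head = &a;
	int i;

	for (i = 0; i < 8; i++) {
		struct swap_dev *si = pop(&head);

		if (!si)
			break;
		si->free_slots--;
		printf("page %d -> %s (prio %d)\n", i, si->name, si->prio);
		if (si->free_slots)
			requeue(&head, si);	/* round-robin among equals */
		/* else: full, stays off the list like plist_del() above */
	}
	return 0;
}
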
diff --git a/mm/vmacache.c b/mm/vmacache.c index 1037a3bab505..9f25af825dec 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
17 | { | 17 | { |
18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
19 | 19 | ||
20 | /* | ||
21 | * Single threaded tasks need not iterate the entire | ||
22 | * list of process. We can avoid the flushing as well | ||
23 | * since the mm's seqnum was increased and don't have | ||
24 | * to worry about other threads' seqnum. Current's | ||
25 | * flush will occur upon the next lookup. | ||
26 | */ | ||
27 | if (atomic_read(&mm->mm_users) == 1) | ||
28 | return; | ||
29 | |||
20 | rcu_read_lock(); | 30 | rcu_read_lock(); |
21 | for_each_process_thread(g, p) { | 31 | for_each_process_thread(g, p) { |
22 | /* | 32 | /* |
@@ -78,6 +88,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
78 | if (!vmacache_valid(mm)) | 88 | if (!vmacache_valid(mm)) |
79 | return NULL; | 89 | return NULL; |
80 | 90 | ||
91 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
92 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | 93 | for (i = 0; i < VMACACHE_SIZE; i++) { |
82 | struct vm_area_struct *vma = current->vmacache[i]; | 94 | struct vm_area_struct *vma = current->vmacache[i]; |
83 | 95 | ||
@@ -85,8 +97,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
85 | continue; | 97 | continue; |
86 | if (WARN_ON_ONCE(vma->vm_mm != mm)) | 98 | if (WARN_ON_ONCE(vma->vm_mm != mm)) |
87 | break; | 99 | break; |
88 | if (vma->vm_start <= addr && vma->vm_end > addr) | 100 | if (vma->vm_start <= addr && vma->vm_end > addr) { |
101 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
89 | return vma; | 102 | return vma; |
103 | } | ||
90 | } | 104 | } |
91 | 105 | ||
92 | return NULL; | 106 | return NULL; |
@@ -102,11 +116,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | |||
102 | if (!vmacache_valid(mm)) | 116 | if (!vmacache_valid(mm)) |
103 | return NULL; | 117 | return NULL; |
104 | 118 | ||
119 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
120 | |||
105 | for (i = 0; i < VMACACHE_SIZE; i++) { | 121 | for (i = 0; i < VMACACHE_SIZE; i++) { |
106 | struct vm_area_struct *vma = current->vmacache[i]; | 122 | struct vm_area_struct *vma = current->vmacache[i]; |
107 | 123 | ||
108 | if (vma && vma->vm_start == start && vma->vm_end == end) | 124 | if (vma && vma->vm_start == start && vma->vm_end == end) { |
125 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
109 | return vma; | 126 | return vma; |
127 | } | ||
110 | } | 128 | } |
111 | 129 | ||
112 | return NULL; | 130 | return NULL; |
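
The vmacache_flush_all() hunk leans on sequence-number invalidation: the mm's vmacache_seqnum has already been increased by the time this function runs, so for a single-user mm there is nothing to walk, and each thread's cache is discarded lazily the next time a lookup notices the mismatch. A minimal stand-alone sketch of that pattern follows; the names are illustrative, not the kernel's.

#include <stdio.h>
#include <stddef.h>

#define CACHE_SLOTS 4

struct addr_space { unsigned int seqnum; };

struct thread_cache {
	unsigned int seqnum;              /* generation the cached slots belong to */
	const char *slots[CACHE_SLOTS];   /* stand-in for cached VMA pointers */
};

/* Invalidation is O(1): bump the generation counter; any cache whose
 * seqnum no longer matches is flushed lazily on its next lookup. */
static void invalidate(struct addr_space *as)
{
	as->seqnum++;
}

static const char *lookup(struct thread_cache *tc, struct addr_space *as,
			  unsigned int slot)
{
	if (tc->seqnum != as->seqnum) {        /* stale: flush and resync */
		for (unsigned int i = 0; i < CACHE_SLOTS; i++)
			tc->slots[i] = NULL;
		tc->seqnum = as->seqnum;
		return NULL;                   /* miss, caller repopulates */
	}
	return tc->slots[slot];
}

int main(void)
{
	struct addr_space as = { 0 };
	struct thread_cache tc = { 0, { "vma0", "vma1", "vma2", "vma3" } };

	printf("before: %s\n", lookup(&tc, &as, 1));   /* hit */
	invalidate(&as);                               /* e.g. after an unmap */
	printf("after:  %s\n", lookup(&tc, &as, 1) ? "hit" : "miss");
	return 0;
}
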
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf233b283319..f64632b67196 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
1268 | vunmap_page_range(addr, end); | 1268 | vunmap_page_range(addr, end); |
1269 | flush_tlb_kernel_range(addr, end); | 1269 | flush_tlb_kernel_range(addr, end); |
1270 | } | 1270 | } |
1271 | EXPORT_SYMBOL_GPL(unmap_kernel_range); | ||
1271 | 1272 | ||
1272 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 1273 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) |
1273 | { | 1274 | { |
@@ -1496,7 +1497,7 @@ void vfree(const void *addr) | |||
1496 | if (!addr) | 1497 | if (!addr) |
1497 | return; | 1498 | return; |
1498 | if (unlikely(in_interrupt())) { | 1499 | if (unlikely(in_interrupt())) { |
1499 | struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); | 1500 | struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); |
1500 | if (llist_add((struct llist_node *)addr, &p->list)) | 1501 | if (llist_add((struct llist_node *)addr, &p->list)) |
1501 | schedule_work(&p->wq); | 1502 | schedule_work(&p->wq); |
1502 | } else | 1503 | } else |
@@ -2619,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p) | |||
2619 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); | 2620 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); |
2620 | 2621 | ||
2621 | if (v->flags & VM_IOREMAP) | 2622 | if (v->flags & VM_IOREMAP) |
2622 | seq_printf(m, " ioremap"); | 2623 | seq_puts(m, " ioremap"); |
2623 | 2624 | ||
2624 | if (v->flags & VM_ALLOC) | 2625 | if (v->flags & VM_ALLOC) |
2625 | seq_printf(m, " vmalloc"); | 2626 | seq_puts(m, " vmalloc"); |
2626 | 2627 | ||
2627 | if (v->flags & VM_MAP) | 2628 | if (v->flags & VM_MAP) |
2628 | seq_printf(m, " vmap"); | 2629 | seq_puts(m, " vmap"); |
2629 | 2630 | ||
2630 | if (v->flags & VM_USERMAP) | 2631 | if (v->flags & VM_USERMAP) |
2631 | seq_printf(m, " user"); | 2632 | seq_puts(m, " user"); |
2632 | 2633 | ||
2633 | if (v->flags & VM_VPAGES) | 2634 | if (v->flags & VM_VPAGES) |
2634 | seq_printf(m, " vpages"); | 2635 | seq_puts(m, " vpages"); |
2635 | 2636 | ||
2636 | show_numa_info(m, v); | 2637 | show_numa_info(m, v); |
2637 | seq_putc(m, '\n'); | 2638 | seq_putc(m, '\n'); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..9149444f947d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
324 | else | 324 | else |
325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); | 325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); |
326 | 326 | ||
327 | trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); | 327 | trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); |
328 | return freed; | 328 | return freed; |
329 | } | 329 | } |
330 | 330 | ||
@@ -1121,7 +1121,7 @@ keep: | |||
1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); | 1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | free_hot_cold_page_list(&free_pages, 1); | 1124 | free_hot_cold_page_list(&free_pages, true); |
1125 | 1125 | ||
1126 | list_splice(&ret_pages, page_list); | 1126 | list_splice(&ret_pages, page_list); |
1127 | count_vm_events(PGACTIVATE, pgactivate); | 1127 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1439 | } | 1439 | } |
1440 | 1440 | ||
1441 | /* | 1441 | /* |
1442 | * If a kernel thread (such as nfsd for loop-back mounts) services | ||
1443 | * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. | ||
1444 | * In that case we should only throttle if the backing device it is | ||
1445 | * writing to is congested. In other cases it is safe to throttle. | ||
1446 | */ | ||
1447 | static int current_may_throttle(void) | ||
1448 | { | ||
1449 | return !(current->flags & PF_LESS_THROTTLE) || | ||
1450 | current->backing_dev_info == NULL || | ||
1451 | bdi_write_congested(current->backing_dev_info); | ||
1452 | } | ||
1453 | |||
1454 | /* | ||
1442 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1455 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1443 | * of reclaimed pages | 1456 | * of reclaimed pages |
1444 | */ | 1457 | */ |
@@ -1519,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1519 | 1532 | ||
1520 | spin_unlock_irq(&zone->lru_lock); | 1533 | spin_unlock_irq(&zone->lru_lock); |
1521 | 1534 | ||
1522 | free_hot_cold_page_list(&page_list, 1); | 1535 | free_hot_cold_page_list(&page_list, true); |
1523 | 1536 | ||
1524 | /* | 1537 | /* |
1525 | * If reclaim is isolating dirty pages under writeback, it implies | 1538 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1566 | * implies that pages are cycling through the LRU faster than | 1579 | * implies that pages are cycling through the LRU faster than |
1567 | * they are written so also forcibly stall. | 1580 | * they are written so also forcibly stall. |
1568 | */ | 1581 | */ |
1569 | if (nr_unqueued_dirty == nr_taken || nr_immediate) | 1582 | if ((nr_unqueued_dirty == nr_taken || nr_immediate) && |
1583 | current_may_throttle()) | ||
1570 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1584 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1571 | } | 1585 | } |
1572 | 1586 | ||
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1575 | * is congested. Allow kswapd to continue until it starts encountering | 1589 | * is congested. Allow kswapd to continue until it starts encountering |
1576 | * unqueued dirty pages or cycling through the LRU too quickly. | 1590 | * unqueued dirty pages or cycling through the LRU too quickly. |
1577 | */ | 1591 | */ |
1578 | if (!sc->hibernation_mode && !current_is_kswapd()) | 1592 | if (!sc->hibernation_mode && !current_is_kswapd() && |
1593 | current_may_throttle()) | ||
1579 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1594 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1580 | 1595 | ||
1581 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1596 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
@@ -1740,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1740 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1755 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1741 | spin_unlock_irq(&zone->lru_lock); | 1756 | spin_unlock_irq(&zone->lru_lock); |
1742 | 1757 | ||
1743 | free_hot_cold_page_list(&l_hold, 1); | 1758 | free_hot_cold_page_list(&l_hold, true); |
1744 | } | 1759 | } |
1745 | 1760 | ||
1746 | #ifdef CONFIG_SWAP | 1761 | #ifdef CONFIG_SWAP |
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1866 | bool force_scan = false; | 1881 | bool force_scan = false; |
1867 | unsigned long ap, fp; | 1882 | unsigned long ap, fp; |
1868 | enum lru_list lru; | 1883 | enum lru_list lru; |
1884 | bool some_scanned; | ||
1885 | int pass; | ||
1869 | 1886 | ||
1870 | /* | 1887 | /* |
1871 | * If the zone or memcg is small, nr[l] can be 0. This | 1888 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1989 | fraction[1] = fp; | 2006 | fraction[1] = fp; |
1990 | denominator = ap + fp + 1; | 2007 | denominator = ap + fp + 1; |
1991 | out: | 2008 | out: |
1992 | for_each_evictable_lru(lru) { | 2009 | some_scanned = false; |
1993 | int file = is_file_lru(lru); | 2010 | /* Only use force_scan on second pass. */ |
1994 | unsigned long size; | 2011 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
1995 | unsigned long scan; | 2012 | for_each_evictable_lru(lru) { |
2013 | int file = is_file_lru(lru); | ||
2014 | unsigned long size; | ||
2015 | unsigned long scan; | ||
1996 | 2016 | ||
1997 | size = get_lru_size(lruvec, lru); | 2017 | size = get_lru_size(lruvec, lru); |
1998 | scan = size >> sc->priority; | 2018 | scan = size >> sc->priority; |
1999 | 2019 | ||
2000 | if (!scan && force_scan) | 2020 | if (!scan && pass && force_scan) |
2001 | scan = min(size, SWAP_CLUSTER_MAX); | 2021 | scan = min(size, SWAP_CLUSTER_MAX); |
2002 | 2022 | ||
2003 | switch (scan_balance) { | 2023 | switch (scan_balance) { |
2004 | case SCAN_EQUAL: | 2024 | case SCAN_EQUAL: |
2005 | /* Scan lists relative to size */ | 2025 | /* Scan lists relative to size */ |
2006 | break; | 2026 | break; |
2007 | case SCAN_FRACT: | 2027 | case SCAN_FRACT: |
2028 | /* | ||
2029 | * Scan types proportional to swappiness and | ||
2030 | * their relative recent reclaim efficiency. | ||
2031 | */ | ||
2032 | scan = div64_u64(scan * fraction[file], | ||
2033 | denominator); | ||
2034 | break; | ||
2035 | case SCAN_FILE: | ||
2036 | case SCAN_ANON: | ||
2037 | /* Scan one type exclusively */ | ||
2038 | if ((scan_balance == SCAN_FILE) != file) | ||
2039 | scan = 0; | ||
2040 | break; | ||
2041 | default: | ||
2042 | /* Look ma, no brain */ | ||
2043 | BUG(); | ||
2044 | } | ||
2045 | nr[lru] = scan; | ||
2008 | /* | 2046 | /* |
2009 | * Scan types proportional to swappiness and | 2047 | * Skip the second pass and don't force_scan, |
2010 | * their relative recent reclaim efficiency. | 2048 | * if we found something to scan. |
2011 | */ | 2049 | */ |
2012 | scan = div64_u64(scan * fraction[file], denominator); | 2050 | some_scanned |= !!scan; |
2013 | break; | ||
2014 | case SCAN_FILE: | ||
2015 | case SCAN_ANON: | ||
2016 | /* Scan one type exclusively */ | ||
2017 | if ((scan_balance == SCAN_FILE) != file) | ||
2018 | scan = 0; | ||
2019 | break; | ||
2020 | default: | ||
2021 | /* Look ma, no brain */ | ||
2022 | BUG(); | ||
2023 | } | 2051 | } |
2024 | nr[lru] = scan; | ||
2025 | } | 2052 | } |
2026 | } | 2053 | } |
2027 | 2054 | ||
@@ -2037,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2037 | unsigned long nr_reclaimed = 0; | 2064 | unsigned long nr_reclaimed = 0; |
2038 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2065 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2039 | struct blk_plug plug; | 2066 | struct blk_plug plug; |
2040 | bool scan_adjusted = false; | 2067 | bool scan_adjusted; |
2041 | 2068 | ||
2042 | get_scan_count(lruvec, sc, nr); | 2069 | get_scan_count(lruvec, sc, nr); |
2043 | 2070 | ||
2044 | /* Record the original scan target for proportional adjustments later */ | 2071 | /* Record the original scan target for proportional adjustments later */ |
2045 | memcpy(targets, nr, sizeof(nr)); | 2072 | memcpy(targets, nr, sizeof(nr)); |
2046 | 2073 | ||
2074 | /* | ||
2075 | * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal | ||
2076 | * event that can occur when there is little memory pressure e.g. | ||
2077 | * multiple streaming readers/writers. Hence, we do not abort scanning | ||
2078 | * when the requested number of pages are reclaimed when scanning at | ||
2079 | * DEF_PRIORITY on the assumption that the fact we are direct | ||
2080 | * reclaiming implies that kswapd is not keeping up and it is best to | ||
2081 | * do a batch of work at once. For memcg reclaim one check is made to | ||
2082 | * abort proportional reclaim if either the file or anon lru has already | ||
2083 | * dropped to zero at the first pass. | ||
2084 | */ | ||
2085 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && | ||
2086 | sc->priority == DEF_PRIORITY); | ||
2087 | |||
2047 | blk_start_plug(&plug); | 2088 | blk_start_plug(&plug); |
2048 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2089 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2049 | nr[LRU_INACTIVE_FILE]) { | 2090 | nr[LRU_INACTIVE_FILE]) { |
@@ -2064,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2064 | continue; | 2105 | continue; |
2065 | 2106 | ||
2066 | /* | 2107 | /* |
2067 | * For global direct reclaim, reclaim only the number of pages | ||
2068 | * requested. Less care is taken to scan proportionally as it | ||
2069 | * is more important to minimise direct reclaim stall latency | ||
2070 | * than it is to properly age the LRU lists. | ||
2071 | */ | ||
2072 | if (global_reclaim(sc) && !current_is_kswapd()) | ||
2073 | break; | ||
2074 | |||
2075 | /* | ||
2076 | * For kswapd and memcg, reclaim at least the number of pages | 2108 | * For kswapd and memcg, reclaim at least the number of pages |
2077 | * requested. Ensure that the anon and file LRUs shrink | 2109 | * requested. Ensure that the anon and file LRUs are scanned |
2078 | * proportionally what was requested by get_scan_count(). We | 2110 | * proportionally what was requested by get_scan_count(). We |
2079 | * stop reclaiming one LRU and reduce the amount scanning | 2111 | * stop reclaiming one LRU and reduce the amount scanning |
2080 | * proportional to the original scan target. | 2112 | * proportional to the original scan target. |
@@ -2082,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2082 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | 2114 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; |
2083 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | 2115 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; |
2084 | 2116 | ||
2117 | /* | ||
2118 | * It's just vindictive to attack the larger once the smaller | ||
2119 | * has gone to zero. And given the way we stop scanning the | ||
2120 | * smaller below, this makes sure that we only make one nudge | ||
2121 | * towards proportionality once we've got nr_to_reclaim. | ||
2122 | */ | ||
2123 | if (!nr_file || !nr_anon) | ||
2124 | break; | ||
2125 | |||
2085 | if (nr_file > nr_anon) { | 2126 | if (nr_file > nr_anon) { |
2086 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | 2127 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + |
2087 | targets[LRU_ACTIVE_ANON] + 1; | 2128 | targets[LRU_ACTIVE_ANON] + 1; |
@@ -2268,9 +2309,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2268 | * there is a buffer of free pages available to give compaction | 2309 | * there is a buffer of free pages available to give compaction |
2269 | * a reasonable chance of completing and allocating the page | 2310 | * a reasonable chance of completing and allocating the page |
2270 | */ | 2311 | */ |
2271 | balance_gap = min(low_wmark_pages(zone), | 2312 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2272 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2313 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2273 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2274 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2314 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
2275 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2315 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
2276 | 2316 | ||
@@ -2525,10 +2565,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2525 | 2565 | ||
2526 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2566 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2527 | zone = &pgdat->node_zones[i]; | 2567 | zone = &pgdat->node_zones[i]; |
2568 | if (!populated_zone(zone)) | ||
2569 | continue; | ||
2570 | |||
2528 | pfmemalloc_reserve += min_wmark_pages(zone); | 2571 | pfmemalloc_reserve += min_wmark_pages(zone); |
2529 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | 2572 | free_pages += zone_page_state(zone, NR_FREE_PAGES); |
2530 | } | 2573 | } |
2531 | 2574 | ||
2575 | /* If there are no reserves (unexpected config) then do not throttle */ | ||
2576 | if (!pfmemalloc_reserve) | ||
2577 | return true; | ||
2578 | |||
2532 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | 2579 | wmark_ok = free_pages > pfmemalloc_reserve / 2; |
2533 | 2580 | ||
2534 | /* kswapd must be awake if processes are being throttled */ | 2581 | /* kswapd must be awake if processes are being throttled */ |
@@ -2553,9 +2600,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2553 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | 2600 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, |
2554 | nodemask_t *nodemask) | 2601 | nodemask_t *nodemask) |
2555 | { | 2602 | { |
2603 | struct zoneref *z; | ||
2556 | struct zone *zone; | 2604 | struct zone *zone; |
2557 | int high_zoneidx = gfp_zone(gfp_mask); | 2605 | pg_data_t *pgdat = NULL; |
2558 | pg_data_t *pgdat; | ||
2559 | 2606 | ||
2560 | /* | 2607 | /* |
2561 | * Kernel threads should not be throttled as they may be indirectly | 2608 | * Kernel threads should not be throttled as they may be indirectly |
@@ -2574,10 +2621,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2574 | if (fatal_signal_pending(current)) | 2621 | if (fatal_signal_pending(current)) |
2575 | goto out; | 2622 | goto out; |
2576 | 2623 | ||
2577 | /* Check if the pfmemalloc reserves are ok */ | 2624 | /* |
2578 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | 2625 | * Check if the pfmemalloc reserves are ok by finding the first node |
2579 | pgdat = zone->zone_pgdat; | 2626 | * with a usable ZONE_NORMAL or lower zone. The expectation is that |
2580 | if (pfmemalloc_watermark_ok(pgdat)) | 2627 | * GFP_KERNEL will be required for allocating network buffers when |
2628 | * swapping over the network so ZONE_HIGHMEM is unusable. | ||
2629 | * | ||
2630 | * Throttling is based on the first usable node and throttled processes | ||
2631 | * wait on a queue until kswapd makes progress and wakes them. There | ||
2632 | * is an affinity then between processes waking up and where reclaim | ||
2633 | * progress has been made assuming the process wakes on the same node. | ||
2634 | * More importantly, processes running on remote nodes will not compete | ||
2635 | * for remote pfmemalloc reserves and processes on different nodes | ||
2636 | * should make reasonable progress. | ||
2637 | */ | ||
2638 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2639 | gfp_mask, nodemask) { | ||
2640 | if (zone_idx(zone) > ZONE_NORMAL) | ||
2641 | continue; | ||
2642 | |||
2643 | /* Throttle based on the first usable node */ | ||
2644 | pgdat = zone->zone_pgdat; | ||
2645 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2646 | goto out; | ||
2647 | break; | ||
2648 | } | ||
2649 | |||
2650 | /* If no zone was usable by the allocation flags then do not throttle */ | ||
2651 | if (!pgdat) | ||
2581 | goto out; | 2652 | goto out; |
2582 | 2653 | ||
2583 | /* Account for the throttling */ | 2654 | /* Account for the throttling */ |
@@ -2891,9 +2962,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2891 | * high wmark plus a "gap" where the gap is either the low | 2962 | * high wmark plus a "gap" where the gap is either the low |
2892 | * watermark or 1% of the zone, whichever is smaller. | 2963 | * watermark or 1% of the zone, whichever is smaller. |
2893 | */ | 2964 | */ |
2894 | balance_gap = min(low_wmark_pages(zone), | 2965 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2895 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2966 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2896 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2897 | 2967 | ||
2898 | /* | 2968 | /* |
2899 | * If there is no low memory pressure or the zone is balanced then no | 2969 | * If there is no low memory pressure or the zone is balanced then no |
@@ -3422,7 +3492,7 @@ int kswapd_run(int nid) | |||
3422 | 3492 | ||
3423 | /* | 3493 | /* |
3424 | * Called by memory hotplug when all memory in a node is offlined. Caller must | 3494 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3425 | * hold lock_memory_hotplug(). | 3495 | * hold mem_hotplug_begin/end(). |
3426 | */ | 3496 | */ |
3427 | void kswapd_stop(int nid) | 3497 | void kswapd_stop(int nid) |
3428 | { | 3498 | { |
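
Two vmscan.c hunks replace the open-coded rounding in the balance_gap calculation with DIV_ROUND_UP(); the kswapd hunk's comment describes the gap as "the low watermark or 1% of the zone, whichever is smaller". The snippet below only demonstrates that the macro and the old expression are equivalent; the ratio of 100 is taken from that 1% description and the zone sizes are made up.

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define BALANCE_GAP_RATIO	100	/* "1% of the zone" per the comment above */

int main(void)
{
	unsigned long sizes[] = { 1, 99, 100, 101, 262144 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long n = sizes[i];
		unsigned long old = (n + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO;
		unsigned long new = DIV_ROUND_UP(n, BALANCE_GAP_RATIO);

		assert(old == new);	/* same rounding, just clearer intent */
		printf("managed_pages=%lu -> gap contribution=%lu\n", n, new);
	}
	return 0;
}
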
diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd076b8bf..b37bd49bfd55 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, | |||
207 | } | 207 | } |
208 | 208 | ||
209 | /* | 209 | /* |
210 | * For use when we know that interrupts are disabled. | 210 | * For use when we know that interrupts are disabled, |
211 | * or when we know that preemption is disabled and that | ||
212 | * particular counter cannot be updated from interrupt context. | ||
211 | */ | 213 | */ |
212 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 214 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
213 | int delta) | 215 | int delta) |
@@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void) | |||
489 | continue; | 491 | continue; |
490 | 492 | ||
491 | if (__this_cpu_read(p->pcp.count)) | 493 | if (__this_cpu_read(p->pcp.count)) |
492 | drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); | 494 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); |
493 | #endif | 495 | #endif |
494 | } | 496 | } |
495 | fold_diff(global_diff); | 497 | fold_diff(global_diff); |
@@ -866,6 +868,10 @@ const char * const vmstat_text[] = { | |||
866 | "nr_tlb_local_flush_one", | 868 | "nr_tlb_local_flush_one", |
867 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | 869 | #endif /* CONFIG_DEBUG_TLBFLUSH */ |
868 | 870 | ||
871 | #ifdef CONFIG_DEBUG_VM_VMACACHE | ||
872 | "vmacache_find_calls", | ||
873 | "vmacache_find_hits", | ||
874 | #endif | ||
869 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 875 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
870 | }; | 876 | }; |
871 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 877 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
@@ -1226,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ; | |||
1226 | static void vmstat_update(struct work_struct *w) | 1232 | static void vmstat_update(struct work_struct *w) |
1227 | { | 1233 | { |
1228 | refresh_cpu_vm_stats(); | 1234 | refresh_cpu_vm_stats(); |
1229 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | 1235 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), |
1230 | round_jiffies_relative(sysctl_stat_interval)); | 1236 | round_jiffies_relative(sysctl_stat_interval)); |
1231 | } | 1237 | } |
1232 | 1238 | ||
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) | |||
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | 247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate |
248 | * a new page. | 248 | * a new page. |
249 | */ | 249 | */ |
250 | int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | 250 | int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, |
251 | unsigned long *handle) | 251 | unsigned long *handle) |
252 | { | 252 | { |
253 | int chunks, i, freechunks; | 253 | int chunks, i, freechunks; |
@@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | |||
255 | enum buddy bud; | 255 | enum buddy bud; |
256 | struct page *page; | 256 | struct page *page; |
257 | 257 | ||
258 | if (size <= 0 || gfp & __GFP_HIGHMEM) | 258 | if (!size || (gfp & __GFP_HIGHMEM)) |
259 | return -EINVAL; | 259 | return -EINVAL; |
260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) | 260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) |
261 | return -ENOSPC; | 261 | return -ENOSPC; |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 36b4591a7a2d..fe78189624cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -141,7 +141,7 @@ | |||
141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
142 | 142 | ||
143 | /* | 143 | /* |
144 | * On systems with 4K page size, this gives 254 size classes! There is a | 144 | * On systems with 4K page size, this gives 255 size classes! There is a |
145 | * trader-off here: | 145 | * trader-off here: |
146 | * - Large number of size classes is potentially wasteful as free page are | 146 | * - Large number of size classes is potentially wasteful as free page are |
147 | * spread across these classes | 147 | * spread across these classes |
@@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1082 | class = &pool->size_class[class_idx]; | 1082 | class = &pool->size_class[class_idx]; |
1083 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1083 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1084 | 1084 | ||
1085 | area = &__get_cpu_var(zs_map_area); | 1085 | area = this_cpu_ptr(&zs_map_area); |
1086 | if (off + class->size <= PAGE_SIZE) | 1086 | if (off + class->size <= PAGE_SIZE) |
1087 | kunmap_atomic(area->vm_addr); | 1087 | kunmap_atomic(area->vm_addr); |
1088 | else { | 1088 | else { |
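
The zsmalloc.c comment fix (254 becomes 255) follows from the class-table arithmetic: the range from ZS_MIN_ALLOC_SIZE to ZS_MAX_ALLOC_SIZE is covered in ZS_SIZE_CLASS_DELTA steps with both endpoints counted. The constants below assume the usual 4K-page values (a 32-byte minimum and a 16-byte step); they are stated as an assumption, not quoted from the hunk.

#include <stdio.h>

#define PAGE_SIZE		4096u
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
#define ZS_MIN_ALLOC_SIZE	32u			/* assumed 4K-page value */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)	/* 16, assumed */

int main(void)
{
	unsigned int classes =
		(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;

	printf("%u size classes\n", classes);	/* prints 255 */
	return 0;
}
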
diff --git a/mm/zswap.c b/mm/zswap.c index aeaef0fb5624..008388fe7b0f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | |||
347 | return NOTIFY_BAD; | 347 | return NOTIFY_BAD; |
348 | } | 348 | } |
349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; | 349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; |
350 | dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); | 350 | dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); |
351 | if (!dst) { | 351 | if (!dst) { |
352 | pr_err("can't allocate compressor buffer\n"); | 352 | pr_err("can't allocate compressor buffer\n"); |
353 | crypto_free_comp(tfm); | 353 | crypto_free_comp(tfm); |
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 34eb2160489d..010b18ef4ea0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl | |||
@@ -24,6 +24,7 @@ my $emacs = 0; | |||
24 | my $terse = 0; | 24 | my $terse = 0; |
25 | my $file = 0; | 25 | my $file = 0; |
26 | my $check = 0; | 26 | my $check = 0; |
27 | my $check_orig = 0; | ||
27 | my $summary = 1; | 28 | my $summary = 1; |
28 | my $mailback = 0; | 29 | my $mailback = 0; |
29 | my $summary_file = 0; | 30 | my $summary_file = 0; |
@@ -146,6 +147,7 @@ GetOptions( | |||
146 | help(0) if ($help); | 147 | help(0) if ($help); |
147 | 148 | ||
148 | $fix = 1 if ($fix_inplace); | 149 | $fix = 1 if ($fix_inplace); |
150 | $check_orig = $check; | ||
149 | 151 | ||
150 | my $exit = 0; | 152 | my $exit = 0; |
151 | 153 | ||
@@ -397,6 +399,11 @@ foreach my $entry (@mode_permission_funcs) { | |||
397 | $mode_perms_search .= $entry->[0]; | 399 | $mode_perms_search .= $entry->[0]; |
398 | } | 400 | } |
399 | 401 | ||
402 | our $declaration_macros = qr{(?x: | ||
403 | (?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(| | ||
404 | (?:$Storage\s+)?LIST_HEAD\s*\( | ||
405 | )}; | ||
406 | |||
400 | our $allowed_asm_includes = qr{(?x: | 407 | our $allowed_asm_includes = qr{(?x: |
401 | irq| | 408 | irq| |
402 | memory | 409 | memory |
@@ -1808,11 +1815,13 @@ sub process { | |||
1808 | $here = "#$linenr: " if (!$file); | 1815 | $here = "#$linenr: " if (!$file); |
1809 | $here = "#$realline: " if ($file); | 1816 | $here = "#$realline: " if ($file); |
1810 | 1817 | ||
1818 | my $found_file = 0; | ||
1811 | # extract the filename as it passes | 1819 | # extract the filename as it passes |
1812 | if ($line =~ /^diff --git.*?(\S+)$/) { | 1820 | if ($line =~ /^diff --git.*?(\S+)$/) { |
1813 | $realfile = $1; | 1821 | $realfile = $1; |
1814 | $realfile =~ s@^([^/]*)/@@ if (!$file); | 1822 | $realfile =~ s@^([^/]*)/@@ if (!$file); |
1815 | $in_commit_log = 0; | 1823 | $in_commit_log = 0; |
1824 | $found_file = 1; | ||
1816 | } elsif ($line =~ /^\+\+\+\s+(\S+)/) { | 1825 | } elsif ($line =~ /^\+\+\+\s+(\S+)/) { |
1817 | $realfile = $1; | 1826 | $realfile = $1; |
1818 | $realfile =~ s@^([^/]*)/@@ if (!$file); | 1827 | $realfile =~ s@^([^/]*)/@@ if (!$file); |
@@ -1829,6 +1838,15 @@ sub process { | |||
1829 | ERROR("MODIFIED_INCLUDE_ASM", | 1838 | ERROR("MODIFIED_INCLUDE_ASM", |
1830 | "do not modify files in include/asm, change architecture specific files in include/asm-<architecture>\n" . "$here$rawline\n"); | 1839 | "do not modify files in include/asm, change architecture specific files in include/asm-<architecture>\n" . "$here$rawline\n"); |
1831 | } | 1840 | } |
1841 | $found_file = 1; | ||
1842 | } | ||
1843 | |||
1844 | if ($found_file) { | ||
1845 | if ($realfile =~ m@^(drivers/net/|net/)@) { | ||
1846 | $check = 1; | ||
1847 | } else { | ||
1848 | $check = $check_orig; | ||
1849 | } | ||
1832 | next; | 1850 | next; |
1833 | } | 1851 | } |
1834 | 1852 | ||
@@ -1926,6 +1944,12 @@ sub process { | |||
1926 | } | 1944 | } |
1927 | } | 1945 | } |
1928 | 1946 | ||
1947 | # Check for old stable address | ||
1948 | if ($line =~ /^\s*cc:\s*.*<?\bstable\@kernel\.org\b>?.*$/i) { | ||
1949 | ERROR("STABLE_ADDRESS", | ||
1950 | "The 'stable' address should be 'stable\@vger.kernel.org'\n" . $herecurr); | ||
1951 | } | ||
1952 | |||
1929 | # Check for unwanted Gerrit info | 1953 | # Check for unwanted Gerrit info |
1930 | if ($in_commit_log && $line =~ /^\s*change-id:/i) { | 1954 | if ($in_commit_log && $line =~ /^\s*change-id:/i) { |
1931 | ERROR("GERRIT_CHANGE_ID", | 1955 | ERROR("GERRIT_CHANGE_ID", |
@@ -2093,8 +2117,10 @@ sub process { | |||
2093 | 2117 | ||
2094 | foreach my $compat (@compats) { | 2118 | foreach my $compat (@compats) { |
2095 | my $compat2 = $compat; | 2119 | my $compat2 = $compat; |
2096 | $compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/; | 2120 | $compat2 =~ s/\,[a-zA-Z0-9]*\-/\,<\.\*>\-/; |
2097 | `grep -Erq "$compat|$compat2" $dt_path`; | 2121 | my $compat3 = $compat; |
2122 | $compat3 =~ s/\,([a-z]*)[0-9]*\-/\,$1<\.\*>\-/; | ||
2123 | `grep -Erq "$compat|$compat2|$compat3" $dt_path`; | ||
2098 | if ( $? >> 8 ) { | 2124 | if ( $? >> 8 ) { |
2099 | WARN("UNDOCUMENTED_DT_STRING", | 2125 | WARN("UNDOCUMENTED_DT_STRING", |
2100 | "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr); | 2126 | "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr); |
@@ -2266,18 +2292,37 @@ sub process { | |||
2266 | } | 2292 | } |
2267 | 2293 | ||
2268 | # check for missing blank lines after declarations | 2294 | # check for missing blank lines after declarations |
2269 | if ($realfile =~ m@^(drivers/net/|net/)@ && | 2295 | if ($sline =~ /^\+\s+\S/ && #Not at char 1 |
2270 | $prevline =~ /^\+\s+$Declare\s+$Ident/ && | 2296 | # actual declarations |
2271 | !($prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || | 2297 | ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || |
2272 | $prevline =~ /(?:\{\s*|\\)$/) && #extended lines | 2298 | # foo bar; where foo is some local typedef or #define |
2273 | $sline =~ /^\+\s+/ && #Not at char 1 | 2299 | $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || |
2274 | !($sline =~ /^\+\s+$Declare/ || | 2300 | # known declaration macros |
2275 | $sline =~ /^\+\s+$Ident\s+$Ident/ || #eg: typedef foo | 2301 | $prevline =~ /^\+\s+$declaration_macros/) && |
2302 | # for "else if" which can look like "$Ident $Ident" | ||
2303 | !($prevline =~ /^\+\s+$c90_Keywords\b/ || | ||
2304 | # other possible extensions of declaration lines | ||
2305 | $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || | ||
2306 | # not starting a section or a macro "\" extended line | ||
2307 | $prevline =~ /(?:\{\s*|\\)$/) && | ||
2308 | # looks like a declaration | ||
2309 | !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || | ||
2310 | # foo bar; where foo is some local typedef or #define | ||
2311 | $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || | ||
2312 | # known declaration macros | ||
2313 | $sline =~ /^\+\s+$declaration_macros/ || | ||
2314 | # start of struct or union or enum | ||
2276 | $sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ || | 2315 | $sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ || |
2277 | $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(])/ || | 2316 | # start or end of block or continuation of declaration |
2278 | $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) { | 2317 | $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || |
2318 | # bitfield continuation | ||
2319 | $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || | ||
2320 | # other possible extensions of declaration lines | ||
2321 | $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && | ||
2322 | # indentation of previous and current line are the same | ||
2323 | (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { | ||
2279 | WARN("SPACING", | 2324 | WARN("SPACING", |
2280 | "networking uses a blank line after declarations\n" . $hereprev); | 2325 | "Missing a blank line after declarations\n" . $hereprev); |
2281 | } | 2326 | } |
2282 | 2327 | ||
2283 | # check for spaces at the beginning of a line. | 2328 | # check for spaces at the beginning of a line. |
@@ -3431,6 +3476,13 @@ sub process { | |||
3431 | } | 3476 | } |
3432 | } | 3477 | } |
3433 | 3478 | ||
3479 | # unnecessary return in a void function? (a single leading tab, then return;) | ||
3480 | if ($sline =~ /^\+\treturn\s*;\s*$/ && | ||
3481 | $prevline =~ /^\+/) { | ||
3482 | WARN("RETURN_VOID", | ||
3483 | "void function return statements are not generally useful\n" . $herecurr); | ||
3484 | } | ||
3485 | |||
3434 | # if statements using unnecessary parentheses - ie: if ((foo == bar)) | 3486 | # if statements using unnecessary parentheses - ie: if ((foo == bar)) |
3435 | if ($^V && $^V ge 5.10.0 && | 3487 | if ($^V && $^V ge 5.10.0 && |
3436 | $line =~ /\bif\s*((?:\(\s*){2,})/) { | 3488 | $line =~ /\bif\s*((?:\(\s*){2,})/) { |
@@ -3782,6 +3834,17 @@ sub process { | |||
3782 | WARN("DO_WHILE_MACRO_WITH_TRAILING_SEMICOLON", | 3834 | WARN("DO_WHILE_MACRO_WITH_TRAILING_SEMICOLON", |
3783 | "do {} while (0) macros should not be semicolon terminated\n" . "$herectx"); | 3835 | "do {} while (0) macros should not be semicolon terminated\n" . "$herectx"); |
3784 | } | 3836 | } |
3837 | } elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) { | ||
3838 | $ctx =~ s/\n*$//; | ||
3839 | my $cnt = statement_rawlines($ctx); | ||
3840 | my $herectx = $here . "\n"; | ||
3841 | |||
3842 | for (my $n = 0; $n < $cnt; $n++) { | ||
3843 | $herectx .= raw_line($linenr, $n) . "\n"; | ||
3844 | } | ||
3845 | |||
3846 | WARN("TRAILING_SEMICOLON", | ||
3847 | "macros should not use a trailing semicolon\n" . "$herectx"); | ||
3785 | } | 3848 | } |
3786 | } | 3849 | } |
3787 | 3850 | ||
@@ -4264,6 +4327,27 @@ sub process { | |||
4264 | "unchecked sscanf return value\n" . "$here\n$stat_real\n"); | 4327 | "unchecked sscanf return value\n" . "$here\n$stat_real\n"); |
4265 | } | 4328 | } |
4266 | 4329 | ||
4330 | # check for simple sscanf that should be kstrto<foo> | ||
4331 | if ($^V && $^V ge 5.10.0 && | ||
4332 | defined $stat && | ||
4333 | $line =~ /\bsscanf\b/) { | ||
4334 | my $lc = $stat =~ tr@\n@@; | ||
4335 | $lc = $lc + $linenr; | ||
4336 | my $stat_real = raw_line($linenr, 0); | ||
4337 | for (my $count = $linenr + 1; $count <= $lc; $count++) { | ||
4338 | $stat_real = $stat_real . "\n" . raw_line($count, 0); | ||
4339 | } | ||
4340 | if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) { | ||
4341 | my $format = $6; | ||
4342 | my $count = $format =~ tr@%@%@; | ||
4343 | if ($count == 1 && | ||
4344 | $format =~ /^"\%(?i:ll[udxi]|[udxi]ll|ll|[hl]h?[udxi]|[udxi][hl]h?|[hl]h?|[udxi])"$/) { | ||
4345 | WARN("SSCANF_TO_KSTRTO", | ||
4346 | "Prefer kstrto<type> to single variable sscanf\n" . "$here\n$stat_real\n"); | ||
4347 | } | ||
4348 | } | ||
4349 | } | ||
4350 | |||
4267 | # check for new externs in .h files. | 4351 | # check for new externs in .h files. |
4268 | if ($realfile =~ /\.h$/ && | 4352 | if ($realfile =~ /\.h$/ && |
4269 | $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { | 4353 | $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { |
@@ -4328,6 +4412,30 @@ sub process { | |||
4328 | "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr); | 4412 | "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr); |
4329 | } | 4413 | } |
4330 | 4414 | ||
4415 | # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc | ||
4416 | if ($^V && $^V ge 5.10.0 && | ||
4417 | $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) { | ||
4418 | my $oldfunc = $3; | ||
4419 | my $a1 = $4; | ||
4420 | my $a2 = $10; | ||
4421 | my $newfunc = "kmalloc_array"; | ||
4422 | $newfunc = "kcalloc" if ($oldfunc eq "kzalloc"); | ||
4423 | if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) { | ||
4424 | if (WARN("ALLOC_WITH_MULTIPLY", | ||
4425 | "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) && | ||
4426 | $fix) { | ||
4427 | my $r1 = $a1; | ||
4428 | my $r2 = $a2; | ||
4429 | if ($a1 =~ /^sizeof\s*\S/) { | ||
4430 | $r1 = $a2; | ||
4431 | $r2 = $a1; | ||
4432 | } | ||
4433 | $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; | ||
4434 | |||
4435 | } | ||
4436 | } | ||
4437 | } | ||
4438 | |||
4331 | # check for krealloc arg reuse | 4439 | # check for krealloc arg reuse |
4332 | if ($^V && $^V ge 5.10.0 && | 4440 | if ($^V && $^V ge 5.10.0 && |
4333 | $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) { | 4441 | $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) { |
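
The ALLOC_WITH_MULTIPLY check (and its --fix rule) rewrites an assignment from kmalloc()/kzalloc() with a multiplied size into kmalloc_array()/kcalloc(), keeping the sizeof() operand as the second argument. A rough sketch of the before/after shape, with invented struct and variable names:

#include <linux/errno.h>
#include <linux/slab.h>

struct item {
	int id;
};

static struct item *items;

/* Flagged: the open-coded multiplication can overflow unnoticed. */
static int alloc_items_old(unsigned int nr)
{
	items = kzalloc(nr * sizeof(*items), GFP_KERNEL);
	return items ? 0 : -ENOMEM;
}

/* Preferred (roughly what --fix would produce): kcalloc() checks the
 * multiplication for overflow before allocating. */
static int alloc_items_new(unsigned int nr)
{
	items = kcalloc(nr, sizeof(*items), GFP_KERNEL);
	return items ? 0 : -ENOMEM;
}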
@@ -4443,10 +4551,10 @@ sub process { | |||
4443 | "$1 is obsolete, use k$3 instead\n" . $herecurr); | 4551 | "$1 is obsolete, use k$3 instead\n" . $herecurr); |
4444 | } | 4552 | } |
4445 | 4553 | ||
4446 | # check for __initcall(), use device_initcall() explicitly please | 4554 | # check for __initcall(), use device_initcall() explicitly or more appropriate function please |
4447 | if ($line =~ /^.\s*__initcall\s*\(/) { | 4555 | if ($line =~ /^.\s*__initcall\s*\(/) { |
4448 | WARN("USE_DEVICE_INITCALL", | 4556 | WARN("USE_DEVICE_INITCALL", |
4449 | "please use device_initcall() instead of __initcall()\n" . $herecurr); | 4557 | "please use device_initcall() or more appropriate function instead of __initcall() (see include/linux/init.h)\n" . $herecurr); |
4450 | } | 4558 | } |
4451 | 4559 | ||
4452 | # check for various ops structs, ensure they are const. | 4560 | # check for various ops structs, ensure they are const. |
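
The reworded USE_DEVICE_INITCALL message points at include/linux/init.h, where __initcall() is simply an alias for device_initcall() alongside a family of level-specific *_initcall() macros. A small sketch of the preferred spelling; the init function is hypothetical:

#include <linux/init.h>

static int __init example_init(void)
{
	return 0;
}

/* Flagged form (triggers USE_DEVICE_INITCALL):
 *	__initcall(example_init);
 * Preferred: name the level explicitly, e.g. device_initcall(), or a more
 * specific variant such as late_initcall() when ordering matters.
 */
device_initcall(example_init);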
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 05654f5e48d5..c4d6d2e20e0d 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c | |||
@@ -32,6 +32,8 @@ | |||
32 | #include <assert.h> | 32 | #include <assert.h> |
33 | #include <ftw.h> | 33 | #include <ftw.h> |
34 | #include <time.h> | 34 | #include <time.h> |
35 | #include <setjmp.h> | ||
36 | #include <signal.h> | ||
35 | #include <sys/types.h> | 37 | #include <sys/types.h> |
36 | #include <sys/errno.h> | 38 | #include <sys/errno.h> |
37 | #include <sys/fcntl.h> | 39 | #include <sys/fcntl.h> |
@@ -824,21 +826,38 @@ static void show_file(const char *name, const struct stat *st) | |||
824 | atime, now - st->st_atime); | 826 | atime, now - st->st_atime); |
825 | } | 827 | } |
826 | 828 | ||
829 | static sigjmp_buf sigbus_jmp; | ||
830 | |||
831 | static void * volatile sigbus_addr; | ||
832 | |||
833 | static void sigbus_handler(int sig, siginfo_t *info, void *ucontex) | ||
834 | { | ||
835 | (void)sig; | ||
836 | (void)ucontex; | ||
837 | sigbus_addr = info ? info->si_addr : NULL; | ||
838 | siglongjmp(sigbus_jmp, 1); | ||
839 | } | ||
840 | |||
841 | static struct sigaction sigbus_action = { | ||
842 | .sa_sigaction = sigbus_handler, | ||
843 | .sa_flags = SA_SIGINFO, | ||
844 | }; | ||
845 | |||
827 | static void walk_file(const char *name, const struct stat *st) | 846 | static void walk_file(const char *name, const struct stat *st) |
828 | { | 847 | { |
829 | uint8_t vec[PAGEMAP_BATCH]; | 848 | uint8_t vec[PAGEMAP_BATCH]; |
830 | uint64_t buf[PAGEMAP_BATCH], flags; | 849 | uint64_t buf[PAGEMAP_BATCH], flags; |
831 | unsigned long nr_pages, pfn, i; | 850 | unsigned long nr_pages, pfn, i; |
851 | off_t off, end = st->st_size; | ||
832 | int fd; | 852 | int fd; |
833 | off_t off; | ||
834 | ssize_t len; | 853 | ssize_t len; |
835 | void *ptr; | 854 | void *ptr; |
836 | int first = 1; | 855 | int first = 1; |
837 | 856 | ||
838 | fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); | 857 | fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); |
839 | 858 | ||
840 | for (off = 0; off < st->st_size; off += len) { | 859 | for (off = 0; off < end; off += len) { |
841 | nr_pages = (st->st_size - off + page_size - 1) / page_size; | 860 | nr_pages = (end - off + page_size - 1) / page_size; |
842 | if (nr_pages > PAGEMAP_BATCH) | 861 | if (nr_pages > PAGEMAP_BATCH) |
843 | nr_pages = PAGEMAP_BATCH; | 862 | nr_pages = PAGEMAP_BATCH; |
844 | len = nr_pages * page_size; | 863 | len = nr_pages * page_size; |
@@ -855,11 +874,19 @@ static void walk_file(const char *name, const struct stat *st) | |||
855 | if (madvise(ptr, len, MADV_RANDOM)) | 874 | if (madvise(ptr, len, MADV_RANDOM)) |
856 | fatal("madvice failed: %s", name); | 875 | fatal("madvice failed: %s", name); |
857 | 876 | ||
877 | if (sigsetjmp(sigbus_jmp, 1)) { | ||
878 | end = off + sigbus_addr ? sigbus_addr - ptr : 0; | ||
879 | fprintf(stderr, "got sigbus at offset %lld: %s\n", | ||
880 | (long long)end, name); | ||
881 | goto got_sigbus; | ||
882 | } | ||
883 | |||
858 | /* populate ptes */ | 884 | /* populate ptes */ |
859 | for (i = 0; i < nr_pages ; i++) { | 885 | for (i = 0; i < nr_pages ; i++) { |
860 | if (vec[i] & 1) | 886 | if (vec[i] & 1) |
861 | (void)*(volatile int *)(ptr + i * page_size); | 887 | (void)*(volatile int *)(ptr + i * page_size); |
862 | } | 888 | } |
889 | got_sigbus: | ||
863 | 890 | ||
864 | /* turn off harvesting reference bits */ | 891 | /* turn off harvesting reference bits */ |
865 | if (madvise(ptr, len, MADV_SEQUENTIAL)) | 892 | if (madvise(ptr, len, MADV_SEQUENTIAL)) |
@@ -910,6 +937,7 @@ static void walk_page_cache(void) | |||
910 | 937 | ||
911 | kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); | 938 | kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); |
912 | pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); | 939 | pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); |
940 | sigaction(SIGBUS, &sigbus_action, NULL); | ||
913 | 941 | ||
914 | if (stat(opt_file, &st)) | 942 | if (stat(opt_file, &st)) |
915 | fatal("stat failed: %s\n", opt_file); | 943 | fatal("stat failed: %s\n", opt_file); |
@@ -925,6 +953,7 @@ static void walk_page_cache(void) | |||
925 | 953 | ||
926 | close(kpageflags_fd); | 954 | close(kpageflags_fd); |
927 | close(pagemap_fd); | 955 | close(pagemap_fd); |
956 | signal(SIGBUS, SIG_DFL); | ||
928 | } | 957 | } |
929 | 958 | ||
930 | static void parse_file(const char *name) | 959 | static void parse_file(const char *name) |
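
The page-types.c changes above guard the page-cache walk against files that are truncated while being scanned: touching a mapped page beyond end-of-file raises SIGBUS, so the tool installs a SA_SIGINFO handler that records the faulting address and siglongjmp()s back into walk_file(), which trims the end offset and stops at the last readable page. The following standalone userspace sketch reproduces the same sigsetjmp()/siglongjmp() pattern outside page-types.c; the temporary file, the two-page mapping and all names are invented for the demonstration:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf sigbus_jmp;
static void * volatile sigbus_addr;

static void sigbus_handler(int sig, siginfo_t *info, void *ucontext)
{
	(void)sig;
	(void)ucontext;
	sigbus_addr = info ? info->si_addr : NULL;
	siglongjmp(sigbus_jmp, 1);	/* jump back to the sigsetjmp() in main() */
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char path[] = "/tmp/sigbus-demo-XXXXXX";
	int fd = mkstemp(path);
	struct sigaction sa = {
		.sa_sigaction = sigbus_handler,
		.sa_flags = SA_SIGINFO,
	};
	char *map;

	if (fd < 0) {
		perror("mkstemp");
		return 1;
	}
	unlink(path);

	/* Back only the first page with file data: the second page of the
	 * mapping lies past EOF, so reading it raises SIGBUS, much like a
	 * truncated file does in walk_file(). */
	if (ftruncate(fd, page) < 0) {
		perror("ftruncate");
		return 1;
	}
	map = mmap(NULL, 2 * page, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	sigaction(SIGBUS, &sa, NULL);

	if (sigsetjmp(sigbus_jmp, 1)) {
		printf("got SIGBUS at offset %ld, stopping early\n",
		       (long)((char *)sigbus_addr - map));
	} else {
		long off;

		for (off = 0; off < 2 * page; off += page)
			(void)*(volatile char *)(map + off);
		printf("no SIGBUS (not expected here)\n");
	}

	signal(SIGBUS, SIG_DFL);
	munmap(map, 2 * page);
	close(fd);
	return 0;
}

The sigsetjmp(..., 1)/siglongjmp() pair is used rather than setjmp()/longjmp() so the signal mask saved before the protected access is restored on the jump back, leaving SIGBUS deliverable again for the next file.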