author     Linus Torvalds <torvalds@linux-foundation.org>  2018-10-26 22:33:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-10-26 22:33:41 -0400
commit     345671ea0f9258f410eb057b9ced9cefbbe5dc78 (patch)
tree       fe97ba3d27679789e6aa34e39b002ee64ce25412
parent     4904008165c8a1c48602b8316139691b8c735e6e (diff)
parent     22146c3ce98962436e401f7b7016a6f664c9ffb5 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- a few misc things
- ocfs2 updates
- most of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
hugetlbfs: dirty pages as they are added to pagecache
mm: export add_swap_extent()
mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE
mm: thp: relocate flush_cache_range() in migrate_misplaced_transhuge_page()
mm: thp: fix mmu_notifier in migrate_misplaced_transhuge_page()
mm: thp: fix MADV_DONTNEED vs migrate_misplaced_transhuge_page race condition
mm/kasan/quarantine.c: make quarantine_lock a raw_spinlock_t
mm/gup: cache dev_pagemap while pinning pages
Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"
mm: return zero_resv_unavail optimization
mm: zero remaining unavailable struct pages
tools/testing/selftests/vm/gup_benchmark.c: add MAP_HUGETLB option
tools/testing/selftests/vm/gup_benchmark.c: add MAP_SHARED option
tools/testing/selftests/vm/gup_benchmark.c: allow user specified file
tools/testing/selftests/vm/gup_benchmark.c: fix 'write' flag usage
mm/gup_benchmark.c: add additional pinning methods
mm/gup_benchmark.c: time put_page()
mm: don't raise MEMCG_OOM event due to failed high-order allocation
mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock
...
156 files changed, 3400 insertions, 1988 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
new file mode 100644
index 000000000000..b8ca28b60215
--- /dev/null
+++ b/Documentation/accounting/psi.txt
@@ -0,0 +1,73 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios are tracked as recent trends over ten, sixty, and three
+hundred second windows, which gives insight into short term events as
+well as medium and long term trends. The total absolute stall time is
+tracked and exported as well, to allow detection of latency spikes
+which wouldn't necessarily make a dent in the time averages, or to
+average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index caf36105a1c7..8384c681a4b2 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -966,6 +966,12 @@ All time durations are in microseconds.
 $PERIOD duration. "max" for $MAX indicates no limit. If only
 one number is written, $MAX is updated.
 
+cpu.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for CPU. See
+Documentation/accounting/psi.txt for details.
+
 
 Memory
 ------
@@ -1127,6 +1133,10 @@ PAGE_SIZE multiple when read back.
 disk readahead. For now OOM in memory cgroup kills
 tasks iff shortage has happened inside page fault.
 
+This event is not raised if the OOM killer is not
+considered as an option, e.g. for failed high-order
+allocations.
+
 oom_kill
 The number of processes belonging to this cgroup
 killed by any kind of OOM killer.
@@ -1271,6 +1281,12 @@ PAGE_SIZE multiple when read back.
 higher than the limit for an extended period of time. This
 reduces the impact on the workload and memory management.
 
+memory.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for memory. See
+Documentation/accounting/psi.txt for details.
+
 
 Usage Guidelines
 ~~~~~~~~~~~~~~~~
@@ -1408,6 +1424,12 @@ IO Interface Files
 
 8:16 rbps=2097152 wbps=max riops=max wiops=max
 
+io.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for IO. See
+Documentation/accounting/psi.txt for details.
+
 
 Writeback
 ~~~~~~~~~
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 47ca5cda0eef..b90fe3b6bc6c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4851,6 +4851,18 @@
 This is actually a boot loader parameter; the value is
 passed to the kernel using a special protocol.
 
+vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y.
+May slow down system boot speed, especially when
+enabled on systems with a large amount of memory.
+All options are enabled by default, and this
+interface is meant to allow for selectively
+enabling or disabling specific virtual memory
+debugging features.
+
+Available options are:
+P Enable page structure init time poisoning
+- Disable all of the above options
+
 vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
 size of <nn>. This can be used to increase the
 minimum size (128MB on x86). It can also be used to
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22b4b00dee31..12a5e6e693b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -858,6 +858,7 @@ Writeback: 0 kB
 AnonPages: 861800 kB
 Mapped: 280372 kB
 Shmem: 644 kB
+KReclaimable: 168048 kB
 Slab: 284364 kB
 SReclaimable: 159856 kB
 SUnreclaim: 124508 kB
@@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables
 ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
 with huge pages
 ShmemPmdMapped: Shared memory mapped into userspace with huge pages
+KReclaimable: Kernel allocations that the kernel will attempt to reclaim
+under memory pressure. Includes SReclaimable (below), and other
+direct allocations with a shrinker.
 Slab: in-kernel data structures cache
 SReclaimable: Part of Slab, that might be reclaimed, such as caches
 SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
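For illustration only (not part of the patch): the new KReclaimable field sits
alongside the existing SReclaimable field in /proc/meminfo and, per the text
above, includes it. A minimal sketch that reads both values:

/* Illustrative sketch, not part of this commit: report KReclaimable and
 * SReclaimable from /proc/meminfo. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/meminfo", "r");
        char line[128];
        unsigned long kreclaimable = 0, sreclaimable = 0;

        if (!f) {
                perror("fopen /proc/meminfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                sscanf(line, "KReclaimable: %lu kB", &kreclaimable);
                sscanf(line, "SReclaimable: %lu kB", &sreclaimable);
        }
        fclose(f);
        printf("KReclaimable: %lu kB (of which SReclaimable: %lu kB)\n",
               kreclaimable, sreclaimable);
        return 0;
}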
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 3a775fd64e2d..195928808bac 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -36,9 +36,10 @@ debugging is enabled. Format:
 
 slub_debug=<Debug-Options>
 Enable options for all slabs
-slub_debug=<Debug-Options>,<slab name>
-Enable options only for select slabs
 
+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+Enable options only for select slabs (no spaces
+after a comma)
 
 Possible debug options are::
 
@@ -62,7 +63,12 @@ Trying to find an issue in the dentry cache? Try::
 
 slub_debug=,dentry
 
-to only enable debugging on the dentry cache.
+to only enable debugging on the dentry cache. You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix. For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs:
+
+slub_debug=P,kmalloc-*,dentry
 
 Red zoning and tracking may realign the slab. We can just apply sanity checks
 to the dentry cache with::
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index 2a4ee6302122..481d8d8536ac 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -90,12 +90,12 @@ pci proc | -- | -- | WC |
 Advanced APIs for drivers
 -------------------------
 A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range,
-vm_insert_pfn
+vmf_insert_pfn
 
 Drivers wanting to export some pages to userspace do it by using mmap
 interface and a combination of
 1) pgprot_noncached()
-2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn()
+2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn()
 
 With PAT support, a new API pgprot_writecombine is being added. So, drivers can
 continue to use the above sequence, with either pgprot_noncached() or
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5b4f88363453..620b0a711ee4 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -31,6 +31,8 @@ config ALPHA | |||
31 | select ODD_RT_SIGACTION | 31 | select ODD_RT_SIGACTION |
32 | select OLD_SIGSUSPEND | 32 | select OLD_SIGSUSPEND |
33 | select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 | 33 | select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 |
34 | select HAVE_MEMBLOCK | ||
35 | select NO_BOOTMEM | ||
34 | help | 36 | help |
35 | The Alpha is a 64-bit general-purpose processor designed and | 37 | The Alpha is a 64-bit general-purpose processor designed and |
36 | marketed by the Digital Equipment Corporation of blessed memory, | 38 | marketed by the Digital Equipment Corporation of blessed memory, |
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index aec757250e07..f70986683fc6 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/initrd.h> | 22 | #include <linux/initrd.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | ||
24 | 25 | ||
25 | #include <asm/ptrace.h> | 26 | #include <asm/ptrace.h> |
26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
@@ -241,8 +242,7 @@ albacore_init_arch(void) | |||
241 | size / 1024); | 242 | size / 1024); |
242 | } | 243 | } |
243 | #endif | 244 | #endif |
244 | reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop - | 245 | memblock_reserve(pci_mem, memtop - pci_mem); |
245 | pci_mem, BOOTMEM_DEFAULT); | ||
246 | printk("irongate_init_arch: temporarily reserving " | 246 | printk("irongate_init_arch: temporarily reserving " |
247 | "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1); | 247 | "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1); |
248 | } | 248 | } |
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5576f7646fb6..4f0d94471bc9 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/ioport.h> | 30 | #include <linux/ioport.h> |
31 | #include <linux/platform_device.h> | 31 | #include <linux/platform_device.h> |
32 | #include <linux/bootmem.h> | 32 | #include <linux/bootmem.h> |
33 | #include <linux/memblock.h> | ||
33 | #include <linux/pci.h> | 34 | #include <linux/pci.h> |
34 | #include <linux/seq_file.h> | 35 | #include <linux/seq_file.h> |
35 | #include <linux/root_dev.h> | 36 | #include <linux/root_dev.h> |
@@ -312,9 +313,7 @@ setup_memory(void *kernel_end) | |||
312 | { | 313 | { |
313 | struct memclust_struct * cluster; | 314 | struct memclust_struct * cluster; |
314 | struct memdesc_struct * memdesc; | 315 | struct memdesc_struct * memdesc; |
315 | unsigned long start_kernel_pfn, end_kernel_pfn; | 316 | unsigned long kernel_size; |
316 | unsigned long bootmap_size, bootmap_pages, bootmap_start; | ||
317 | unsigned long start, end; | ||
318 | unsigned long i; | 317 | unsigned long i; |
319 | 318 | ||
320 | /* Find free clusters, and init and free the bootmem accordingly. */ | 319 | /* Find free clusters, and init and free the bootmem accordingly. */ |
@@ -322,6 +321,8 @@ setup_memory(void *kernel_end) | |||
322 | (hwrpb->mddt_offset + (unsigned long) hwrpb); | 321 | (hwrpb->mddt_offset + (unsigned long) hwrpb); |
323 | 322 | ||
324 | for_each_mem_cluster(memdesc, cluster, i) { | 323 | for_each_mem_cluster(memdesc, cluster, i) { |
324 | unsigned long end; | ||
325 | |||
325 | printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n", | 326 | printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n", |
326 | i, cluster->usage, cluster->start_pfn, | 327 | i, cluster->usage, cluster->start_pfn, |
327 | cluster->start_pfn + cluster->numpages); | 328 | cluster->start_pfn + cluster->numpages); |
@@ -335,6 +336,9 @@ setup_memory(void *kernel_end) | |||
335 | end = cluster->start_pfn + cluster->numpages; | 336 | end = cluster->start_pfn + cluster->numpages; |
336 | if (end > max_low_pfn) | 337 | if (end > max_low_pfn) |
337 | max_low_pfn = end; | 338 | max_low_pfn = end; |
339 | |||
340 | memblock_add(PFN_PHYS(cluster->start_pfn), | ||
341 | cluster->numpages << PAGE_SHIFT); | ||
338 | } | 342 | } |
339 | 343 | ||
340 | /* | 344 | /* |
@@ -363,87 +367,9 @@ setup_memory(void *kernel_end) | |||
363 | max_low_pfn = mem_size_limit; | 367 | max_low_pfn = mem_size_limit; |
364 | } | 368 | } |
365 | 369 | ||
366 | /* Find the bounds of kernel memory. */ | 370 | /* Reserve the kernel memory. */ |
367 | start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); | 371 | kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; |
368 | end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); | 372 | memblock_reserve(KERNEL_START_PHYS, kernel_size); |
369 | bootmap_start = -1; | ||
370 | |||
371 | try_again: | ||
372 | if (max_low_pfn <= end_kernel_pfn) | ||
373 | panic("not enough memory to boot"); | ||
374 | |||
375 | /* We need to know how many physically contiguous pages | ||
376 | we'll need for the bootmap. */ | ||
377 | bootmap_pages = bootmem_bootmap_pages(max_low_pfn); | ||
378 | |||
379 | /* Now find a good region where to allocate the bootmap. */ | ||
380 | for_each_mem_cluster(memdesc, cluster, i) { | ||
381 | if (cluster->usage & 3) | ||
382 | continue; | ||
383 | |||
384 | start = cluster->start_pfn; | ||
385 | end = start + cluster->numpages; | ||
386 | if (start >= max_low_pfn) | ||
387 | continue; | ||
388 | if (end > max_low_pfn) | ||
389 | end = max_low_pfn; | ||
390 | if (start < start_kernel_pfn) { | ||
391 | if (end > end_kernel_pfn | ||
392 | && end - end_kernel_pfn >= bootmap_pages) { | ||
393 | bootmap_start = end_kernel_pfn; | ||
394 | break; | ||
395 | } else if (end > start_kernel_pfn) | ||
396 | end = start_kernel_pfn; | ||
397 | } else if (start < end_kernel_pfn) | ||
398 | start = end_kernel_pfn; | ||
399 | if (end - start >= bootmap_pages) { | ||
400 | bootmap_start = start; | ||
401 | break; | ||
402 | } | ||
403 | } | ||
404 | |||
405 | if (bootmap_start == ~0UL) { | ||
406 | max_low_pfn >>= 1; | ||
407 | goto try_again; | ||
408 | } | ||
409 | |||
410 | /* Allocate the bootmap and mark the whole MM as reserved. */ | ||
411 | bootmap_size = init_bootmem(bootmap_start, max_low_pfn); | ||
412 | |||
413 | /* Mark the free regions. */ | ||
414 | for_each_mem_cluster(memdesc, cluster, i) { | ||
415 | if (cluster->usage & 3) | ||
416 | continue; | ||
417 | |||
418 | start = cluster->start_pfn; | ||
419 | end = cluster->start_pfn + cluster->numpages; | ||
420 | if (start >= max_low_pfn) | ||
421 | continue; | ||
422 | if (end > max_low_pfn) | ||
423 | end = max_low_pfn; | ||
424 | if (start < start_kernel_pfn) { | ||
425 | if (end > end_kernel_pfn) { | ||
426 | free_bootmem(PFN_PHYS(start), | ||
427 | (PFN_PHYS(start_kernel_pfn) | ||
428 | - PFN_PHYS(start))); | ||
429 | printk("freeing pages %ld:%ld\n", | ||
430 | start, start_kernel_pfn); | ||
431 | start = end_kernel_pfn; | ||
432 | } else if (end > start_kernel_pfn) | ||
433 | end = start_kernel_pfn; | ||
434 | } else if (start < end_kernel_pfn) | ||
435 | start = end_kernel_pfn; | ||
436 | if (start >= end) | ||
437 | continue; | ||
438 | |||
439 | free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); | ||
440 | printk("freeing pages %ld:%ld\n", start, end); | ||
441 | } | ||
442 | |||
443 | /* Reserve the bootmap memory. */ | ||
444 | reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size, | ||
445 | BOOTMEM_DEFAULT); | ||
446 | printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); | ||
447 | 373 | ||
448 | #ifdef CONFIG_BLK_DEV_INITRD | 374 | #ifdef CONFIG_BLK_DEV_INITRD |
449 | initrd_start = INITRD_START; | 375 | initrd_start = INITRD_START; |
@@ -459,8 +385,8 @@ setup_memory(void *kernel_end) | |||
459 | initrd_end, | 385 | initrd_end, |
460 | phys_to_virt(PFN_PHYS(max_low_pfn))); | 386 | phys_to_virt(PFN_PHYS(max_low_pfn))); |
461 | } else { | 387 | } else { |
462 | reserve_bootmem(virt_to_phys((void *)initrd_start), | 388 | memblock_reserve(virt_to_phys((void *)initrd_start), |
463 | INITRD_SIZE, BOOTMEM_DEFAULT); | 389 | INITRD_SIZE); |
464 | } | 390 | } |
465 | } | 391 | } |
466 | #endif /* CONFIG_BLK_DEV_INITRD */ | 392 | #endif /* CONFIG_BLK_DEV_INITRD */ |
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index a9e86475f169..26cd925d19b1 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
14 | #include <linux/memblock.h> | ||
14 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
15 | #include <linux/initrd.h> | 16 | #include <linux/initrd.h> |
16 | #include <linux/pfn.h> | 17 | #include <linux/pfn.h> |
@@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end) | |||
59 | struct memclust_struct * cluster; | 60 | struct memclust_struct * cluster; |
60 | struct memdesc_struct * memdesc; | 61 | struct memdesc_struct * memdesc; |
61 | unsigned long start_kernel_pfn, end_kernel_pfn; | 62 | unsigned long start_kernel_pfn, end_kernel_pfn; |
62 | unsigned long bootmap_size, bootmap_pages, bootmap_start; | ||
63 | unsigned long start, end; | 63 | unsigned long start, end; |
64 | unsigned long node_pfn_start, node_pfn_end; | 64 | unsigned long node_pfn_start, node_pfn_end; |
65 | unsigned long node_min_pfn, node_max_pfn; | 65 | unsigned long node_min_pfn, node_max_pfn; |
66 | int i; | 66 | int i; |
67 | unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); | ||
68 | int show_init = 0; | 67 | int show_init = 0; |
69 | 68 | ||
70 | /* Find the bounds of current node */ | 69 | /* Find the bounds of current node */ |
@@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end) | |||
134 | /* Cute trick to make sure our local node data is on local memory */ | 133 | /* Cute trick to make sure our local node data is on local memory */ |
135 | node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); | 134 | node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); |
136 | #endif | 135 | #endif |
137 | /* Quasi-mark the pg_data_t as in-use */ | ||
138 | node_min_pfn += node_datasz; | ||
139 | if (node_min_pfn >= node_max_pfn) { | ||
140 | printk(" not enough mem to reserve NODE_DATA"); | ||
141 | return; | ||
142 | } | ||
143 | NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; | ||
144 | |||
145 | printk(" Detected node memory: start %8lu, end %8lu\n", | 136 | printk(" Detected node memory: start %8lu, end %8lu\n", |
146 | node_min_pfn, node_max_pfn); | 137 | node_min_pfn, node_max_pfn); |
147 | 138 | ||
148 | DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); | 139 | DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); |
149 | DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); | ||
150 | 140 | ||
151 | /* Find the bounds of kernel memory. */ | 141 | /* Find the bounds of kernel memory. */ |
152 | start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); | 142 | start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); |
153 | end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); | 143 | end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); |
154 | bootmap_start = -1; | ||
155 | 144 | ||
156 | if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) | 145 | if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) |
157 | panic("kernel loaded out of ram"); | 146 | panic("kernel loaded out of ram"); |
@@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end) | |||
161 | has much larger alignment than 8Mb, so it's safe. */ | 150 | has much larger alignment than 8Mb, so it's safe. */ |
162 | node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); | 151 | node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); |
163 | 152 | ||
164 | /* We need to know how many physically contiguous pages | 153 | memblock_add(PFN_PHYS(node_min_pfn), |
165 | we'll need for the bootmap. */ | 154 | (node_max_pfn - node_min_pfn) << PAGE_SHIFT); |
166 | bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); | ||
167 | |||
168 | /* Now find a good region where to allocate the bootmap. */ | ||
169 | for_each_mem_cluster(memdesc, cluster, i) { | ||
170 | if (cluster->usage & 3) | ||
171 | continue; | ||
172 | |||
173 | start = cluster->start_pfn; | ||
174 | end = start + cluster->numpages; | ||
175 | |||
176 | if (start >= node_max_pfn || end <= node_min_pfn) | ||
177 | continue; | ||
178 | |||
179 | if (end > node_max_pfn) | ||
180 | end = node_max_pfn; | ||
181 | if (start < node_min_pfn) | ||
182 | start = node_min_pfn; | ||
183 | |||
184 | if (start < start_kernel_pfn) { | ||
185 | if (end > end_kernel_pfn | ||
186 | && end - end_kernel_pfn >= bootmap_pages) { | ||
187 | bootmap_start = end_kernel_pfn; | ||
188 | break; | ||
189 | } else if (end > start_kernel_pfn) | ||
190 | end = start_kernel_pfn; | ||
191 | } else if (start < end_kernel_pfn) | ||
192 | start = end_kernel_pfn; | ||
193 | if (end - start >= bootmap_pages) { | ||
194 | bootmap_start = start; | ||
195 | break; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | if (bootmap_start == -1) | ||
200 | panic("couldn't find a contiguous place for the bootmap"); | ||
201 | |||
202 | /* Allocate the bootmap and mark the whole MM as reserved. */ | ||
203 | bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, | ||
204 | node_min_pfn, node_max_pfn); | ||
205 | DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", | ||
206 | bootmap_start, bootmap_size, bootmap_pages); | ||
207 | 155 | ||
208 | /* Mark the free regions. */ | 156 | NODE_DATA(nid)->node_start_pfn = node_min_pfn; |
209 | for_each_mem_cluster(memdesc, cluster, i) { | 157 | NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn; |
210 | if (cluster->usage & 3) | ||
211 | continue; | ||
212 | |||
213 | start = cluster->start_pfn; | ||
214 | end = cluster->start_pfn + cluster->numpages; | ||
215 | |||
216 | if (start >= node_max_pfn || end <= node_min_pfn) | ||
217 | continue; | ||
218 | |||
219 | if (end > node_max_pfn) | ||
220 | end = node_max_pfn; | ||
221 | if (start < node_min_pfn) | ||
222 | start = node_min_pfn; | ||
223 | |||
224 | if (start < start_kernel_pfn) { | ||
225 | if (end > end_kernel_pfn) { | ||
226 | free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), | ||
227 | (PFN_PHYS(start_kernel_pfn) | ||
228 | - PFN_PHYS(start))); | ||
229 | printk(" freeing pages %ld:%ld\n", | ||
230 | start, start_kernel_pfn); | ||
231 | start = end_kernel_pfn; | ||
232 | } else if (end > start_kernel_pfn) | ||
233 | end = start_kernel_pfn; | ||
234 | } else if (start < end_kernel_pfn) | ||
235 | start = end_kernel_pfn; | ||
236 | if (start >= end) | ||
237 | continue; | ||
238 | |||
239 | free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); | ||
240 | printk(" freeing pages %ld:%ld\n", start, end); | ||
241 | } | ||
242 | |||
243 | /* Reserve the bootmap memory. */ | ||
244 | reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), | ||
245 | bootmap_size, BOOTMEM_DEFAULT); | ||
246 | printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); | ||
247 | 158 | ||
248 | node_set_online(nid); | 159 | node_set_online(nid); |
249 | } | 160 | } |
@@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end) | |||
251 | void __init | 162 | void __init |
252 | setup_memory(void *kernel_end) | 163 | setup_memory(void *kernel_end) |
253 | { | 164 | { |
165 | unsigned long kernel_size; | ||
254 | int nid; | 166 | int nid; |
255 | 167 | ||
256 | show_mem_layout(); | 168 | show_mem_layout(); |
@@ -262,6 +174,9 @@ setup_memory(void *kernel_end) | |||
262 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 174 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
263 | setup_memory_node(nid, kernel_end); | 175 | setup_memory_node(nid, kernel_end); |
264 | 176 | ||
177 | kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; | ||
178 | memblock_reserve(KERNEL_START_PHYS, kernel_size); | ||
179 | |||
265 | #ifdef CONFIG_BLK_DEV_INITRD | 180 | #ifdef CONFIG_BLK_DEV_INITRD |
266 | initrd_start = INITRD_START; | 181 | initrd_start = INITRD_START; |
267 | if (initrd_start) { | 182 | if (initrd_start) { |
@@ -279,9 +194,8 @@ setup_memory(void *kernel_end) | |||
279 | phys_to_virt(PFN_PHYS(max_low_pfn))); | 194 | phys_to_virt(PFN_PHYS(max_low_pfn))); |
280 | } else { | 195 | } else { |
281 | nid = kvaddr_to_nid(initrd_start); | 196 | nid = kvaddr_to_nid(initrd_start); |
282 | reserve_bootmem_node(NODE_DATA(nid), | 197 | memblock_reserve(virt_to_phys((void *)initrd_start), |
283 | virt_to_phys((void *)initrd_start), | 198 | INITRD_SIZE); |
284 | INITRD_SIZE, BOOTMEM_DEFAULT); | ||
285 | } | 199 | } |
286 | } | 200 | } |
287 | #endif /* CONFIG_BLK_DEV_INITRD */ | 201 | #endif /* CONFIG_BLK_DEV_INITRD */ |
@@ -303,9 +217,8 @@ void __init paging_init(void) | |||
303 | dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | 217 | dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
304 | 218 | ||
305 | for_each_online_node(nid) { | 219 | for_each_online_node(nid) { |
306 | bootmem_data_t *bdata = &bootmem_node_data[nid]; | 220 | unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; |
307 | unsigned long start_pfn = bdata->node_min_pfn; | 221 | unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages; |
308 | unsigned long end_pfn = bdata->node_low_pfn; | ||
309 | 222 | ||
310 | if (dma_local_pfn >= end_pfn - start_pfn) | 223 | if (dma_local_pfn >= end_pfn - start_pfn) |
311 | zones_size[ZONE_DMA] = end_pfn - start_pfn; | 224 | zones_size[ZONE_DMA] = end_pfn - start_pfn; |
diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index d4014fbe5ea3..0d9f3918fa7e 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -29,6 +29,7 @@ | |||
29 | * ptes. | 29 | * ptes. |
30 | * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). | 30 | * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). |
31 | */ | 31 | */ |
32 | #define __HAVE_ARCH_HUGE_PTEP_GET | ||
32 | static inline pte_t huge_ptep_get(pte_t *ptep) | 33 | static inline pte_t huge_ptep_get(pte_t *ptep) |
33 | { | 34 | { |
34 | pte_t retval = *ptep; | 35 | pte_t retval = *ptep; |
@@ -37,35 +38,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep) | |||
37 | return retval; | 38 | return retval; |
38 | } | 39 | } |
39 | 40 | ||
40 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
41 | pte_t *ptep, pte_t pte) | ||
42 | { | ||
43 | set_pte_at(mm, addr, ptep, pte); | ||
44 | } | ||
45 | |||
46 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | ||
47 | unsigned long addr, pte_t *ptep) | ||
48 | { | ||
49 | ptep_clear_flush(vma, addr, ptep); | ||
50 | } | ||
51 | |||
52 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
53 | unsigned long addr, pte_t *ptep) | ||
54 | { | ||
55 | ptep_set_wrprotect(mm, addr, ptep); | ||
56 | } | ||
57 | |||
58 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | ||
59 | unsigned long addr, pte_t *ptep) | ||
60 | { | ||
61 | return ptep_get_and_clear(mm, addr, ptep); | ||
62 | } | ||
63 | |||
64 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | ||
65 | unsigned long addr, pte_t *ptep, | ||
66 | pte_t pte, int dirty) | ||
67 | { | ||
68 | return ptep_set_access_flags(vma, addr, ptep, pte, dirty); | ||
69 | } | ||
70 | |||
71 | #endif /* _ASM_ARM_HUGETLB_3LEVEL_H */ | 41 | #endif /* _ASM_ARM_HUGETLB_3LEVEL_H */ |
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 7d26f6c4f0f5..b67256c22b08 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -23,18 +23,8 @@ | |||
23 | #define _ASM_ARM_HUGETLB_H | 23 | #define _ASM_ARM_HUGETLB_H |
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm-generic/hugetlb.h> | ||
27 | |||
28 | #include <asm/hugetlb-3level.h> | 26 | #include <asm/hugetlb-3level.h> |
29 | 27 | #include <asm-generic/hugetlb.h> | |
30 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | ||
31 | unsigned long addr, unsigned long end, | ||
32 | unsigned long floor, | ||
33 | unsigned long ceiling) | ||
34 | { | ||
35 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
36 | } | ||
37 | |||
38 | 28 | ||
39 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 29 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
40 | unsigned long addr, unsigned long len) | 30 | unsigned long addr, unsigned long len) |
@@ -42,27 +32,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
42 | return 0; | 32 | return 0; |
43 | } | 33 | } |
44 | 34 | ||
45 | static inline int prepare_hugepage_range(struct file *file, | ||
46 | unsigned long addr, unsigned long len) | ||
47 | { | ||
48 | struct hstate *h = hstate_file(file); | ||
49 | if (len & ~huge_page_mask(h)) | ||
50 | return -EINVAL; | ||
51 | if (addr & ~huge_page_mask(h)) | ||
52 | return -EINVAL; | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | static inline int huge_pte_none(pte_t pte) | ||
57 | { | ||
58 | return pte_none(pte); | ||
59 | } | ||
60 | |||
61 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
62 | { | ||
63 | return pte_wrprotect(pte); | ||
64 | } | ||
65 | |||
66 | static inline void arch_clear_hugepage_flags(struct page *page) | 35 | static inline void arch_clear_hugepage_flags(struct page *page) |
67 | { | 36 | { |
68 | clear_bit(PG_dcache_clean, &page->flags); | 37 | clear_bit(PG_dcache_clean, &page->flags); |
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index e73f68569624..fb6609875455 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,48 +20,18 @@ | |||
20 | 20 | ||
21 | #include <asm/page.h> | 21 | #include <asm/page.h> |
22 | 22 | ||
23 | #define __HAVE_ARCH_HUGE_PTEP_GET | ||
23 | static inline pte_t huge_ptep_get(pte_t *ptep) | 24 | static inline pte_t huge_ptep_get(pte_t *ptep) |
24 | { | 25 | { |
25 | return READ_ONCE(*ptep); | 26 | return READ_ONCE(*ptep); |
26 | } | 27 | } |
27 | 28 | ||
28 | |||
29 | |||
30 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | ||
31 | unsigned long addr, unsigned long end, | ||
32 | unsigned long floor, | ||
33 | unsigned long ceiling) | ||
34 | { | ||
35 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
36 | } | ||
37 | |||
38 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 29 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
39 | unsigned long addr, unsigned long len) | 30 | unsigned long addr, unsigned long len) |
40 | { | 31 | { |
41 | return 0; | 32 | return 0; |
42 | } | 33 | } |
43 | 34 | ||
44 | static inline int prepare_hugepage_range(struct file *file, | ||
45 | unsigned long addr, unsigned long len) | ||
46 | { | ||
47 | struct hstate *h = hstate_file(file); | ||
48 | if (len & ~huge_page_mask(h)) | ||
49 | return -EINVAL; | ||
50 | if (addr & ~huge_page_mask(h)) | ||
51 | return -EINVAL; | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | static inline int huge_pte_none(pte_t pte) | ||
56 | { | ||
57 | return pte_none(pte); | ||
58 | } | ||
59 | |||
60 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
61 | { | ||
62 | return pte_wrprotect(pte); | ||
63 | } | ||
64 | |||
65 | static inline void arch_clear_hugepage_flags(struct page *page) | 35 | static inline void arch_clear_hugepage_flags(struct page *page) |
66 | { | 36 | { |
67 | clear_bit(PG_dcache_clean, &page->flags); | 37 | clear_bit(PG_dcache_clean, &page->flags); |
@@ -70,20 +40,25 @@ static inline void arch_clear_hugepage_flags(struct page *page) | |||
70 | extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, | 40 | extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, |
71 | struct page *page, int writable); | 41 | struct page *page, int writable); |
72 | #define arch_make_huge_pte arch_make_huge_pte | 42 | #define arch_make_huge_pte arch_make_huge_pte |
43 | #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT | ||
73 | extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 44 | extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
74 | pte_t *ptep, pte_t pte); | 45 | pte_t *ptep, pte_t pte); |
46 | #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS | ||
75 | extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, | 47 | extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
76 | unsigned long addr, pte_t *ptep, | 48 | unsigned long addr, pte_t *ptep, |
77 | pte_t pte, int dirty); | 49 | pte_t pte, int dirty); |
50 | #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR | ||
78 | extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | 51 | extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, |
79 | unsigned long addr, pte_t *ptep); | 52 | unsigned long addr, pte_t *ptep); |
53 | #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT | ||
80 | extern void huge_ptep_set_wrprotect(struct mm_struct *mm, | 54 | extern void huge_ptep_set_wrprotect(struct mm_struct *mm, |
81 | unsigned long addr, pte_t *ptep); | 55 | unsigned long addr, pte_t *ptep); |
56 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH | ||
82 | extern void huge_ptep_clear_flush(struct vm_area_struct *vma, | 57 | extern void huge_ptep_clear_flush(struct vm_area_struct *vma, |
83 | unsigned long addr, pte_t *ptep); | 58 | unsigned long addr, pte_t *ptep); |
59 | #define __HAVE_ARCH_HUGE_PTE_CLEAR | ||
84 | extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, | 60 | extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, |
85 | pte_t *ptep, unsigned long sz); | 61 | pte_t *ptep, unsigned long sz); |
86 | #define huge_pte_clear huge_pte_clear | ||
87 | extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, | 62 | extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, |
88 | pte_t *ptep, pte_t pte, unsigned long sz); | 63 | pte_t *ptep, pte_t pte, unsigned long sz); |
89 | #define set_huge_swap_pte_at set_huge_swap_pte_at | 64 | #define set_huge_swap_pte_at set_huge_swap_pte_at |
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33a5bd5..03a6c256b7ec 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -16,6 +16,7 @@ | |||
16 | #ifndef __ASM_STRING_H | 16 | #ifndef __ASM_STRING_H |
17 | #define __ASM_STRING_H | 17 | #define __ASM_STRING_H |
18 | 18 | ||
19 | #ifndef CONFIG_KASAN | ||
19 | #define __HAVE_ARCH_STRRCHR | 20 | #define __HAVE_ARCH_STRRCHR |
20 | extern char *strrchr(const char *, int c); | 21 | extern char *strrchr(const char *, int c); |
21 | 22 | ||
@@ -34,6 +35,13 @@ extern __kernel_size_t strlen(const char *); | |||
34 | #define __HAVE_ARCH_STRNLEN | 35 | #define __HAVE_ARCH_STRNLEN |
35 | extern __kernel_size_t strnlen(const char *, __kernel_size_t); | 36 | extern __kernel_size_t strnlen(const char *, __kernel_size_t); |
36 | 37 | ||
38 | #define __HAVE_ARCH_MEMCMP | ||
39 | extern int memcmp(const void *, const void *, size_t); | ||
40 | |||
41 | #define __HAVE_ARCH_MEMCHR | ||
42 | extern void *memchr(const void *, int, __kernel_size_t); | ||
43 | #endif | ||
44 | |||
37 | #define __HAVE_ARCH_MEMCPY | 45 | #define __HAVE_ARCH_MEMCPY |
38 | extern void *memcpy(void *, const void *, __kernel_size_t); | 46 | extern void *memcpy(void *, const void *, __kernel_size_t); |
39 | extern void *__memcpy(void *, const void *, __kernel_size_t); | 47 | extern void *__memcpy(void *, const void *, __kernel_size_t); |
@@ -42,16 +50,10 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); | |||
42 | extern void *memmove(void *, const void *, __kernel_size_t); | 50 | extern void *memmove(void *, const void *, __kernel_size_t); |
43 | extern void *__memmove(void *, const void *, __kernel_size_t); | 51 | extern void *__memmove(void *, const void *, __kernel_size_t); |
44 | 52 | ||
45 | #define __HAVE_ARCH_MEMCHR | ||
46 | extern void *memchr(const void *, int, __kernel_size_t); | ||
47 | |||
48 | #define __HAVE_ARCH_MEMSET | 53 | #define __HAVE_ARCH_MEMSET |
49 | extern void *memset(void *, int, __kernel_size_t); | 54 | extern void *memset(void *, int, __kernel_size_t); |
50 | extern void *__memset(void *, int, __kernel_size_t); | 55 | extern void *__memset(void *, int, __kernel_size_t); |
51 | 56 | ||
52 | #define __HAVE_ARCH_MEMCMP | ||
53 | extern int memcmp(const void *, const void *, size_t); | ||
54 | |||
55 | #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE | 57 | #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE |
56 | #define __HAVE_ARCH_MEMCPY_FLUSHCACHE | 58 | #define __HAVE_ARCH_MEMCPY_FLUSHCACHE |
57 | void memcpy_flushcache(void *dst, const void *src, size_t cnt); | 59 | void memcpy_flushcache(void *dst, const void *src, size_t cnt); |
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20b70b2..72f63a59b008 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,20 +44,23 @@ EXPORT_SYMBOL(__arch_copy_in_user); | |||
44 | EXPORT_SYMBOL(memstart_addr); | 44 | EXPORT_SYMBOL(memstart_addr); |
45 | 45 | ||
46 | /* string / mem functions */ | 46 | /* string / mem functions */ |
47 | #ifndef CONFIG_KASAN | ||
47 | EXPORT_SYMBOL(strchr); | 48 | EXPORT_SYMBOL(strchr); |
48 | EXPORT_SYMBOL(strrchr); | 49 | EXPORT_SYMBOL(strrchr); |
49 | EXPORT_SYMBOL(strcmp); | 50 | EXPORT_SYMBOL(strcmp); |
50 | EXPORT_SYMBOL(strncmp); | 51 | EXPORT_SYMBOL(strncmp); |
51 | EXPORT_SYMBOL(strlen); | 52 | EXPORT_SYMBOL(strlen); |
52 | EXPORT_SYMBOL(strnlen); | 53 | EXPORT_SYMBOL(strnlen); |
54 | EXPORT_SYMBOL(memcmp); | ||
55 | EXPORT_SYMBOL(memchr); | ||
56 | #endif | ||
57 | |||
53 | EXPORT_SYMBOL(memset); | 58 | EXPORT_SYMBOL(memset); |
54 | EXPORT_SYMBOL(memcpy); | 59 | EXPORT_SYMBOL(memcpy); |
55 | EXPORT_SYMBOL(memmove); | 60 | EXPORT_SYMBOL(memmove); |
56 | EXPORT_SYMBOL(__memset); | 61 | EXPORT_SYMBOL(__memset); |
57 | EXPORT_SYMBOL(__memcpy); | 62 | EXPORT_SYMBOL(__memcpy); |
58 | EXPORT_SYMBOL(__memmove); | 63 | EXPORT_SYMBOL(__memmove); |
59 | EXPORT_SYMBOL(memchr); | ||
60 | EXPORT_SYMBOL(memcmp); | ||
61 | 64 | ||
62 | /* atomic bitops */ | 65 | /* atomic bitops */ |
63 | EXPORT_SYMBOL(set_bit); | 66 | EXPORT_SYMBOL(set_bit); |
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 4444c1d25f4b..0f164a4baf52 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -30,7 +30,7 @@ | |||
30 | * Returns: | 30 | * Returns: |
31 | * x0 - address of first occurrence of 'c' or 0 | 31 | * x0 - address of first occurrence of 'c' or 0 |
32 | */ | 32 | */ |
33 | ENTRY(memchr) | 33 | WEAK(memchr) |
34 | and w1, w1, #0xff | 34 | and w1, w1, #0xff |
35 | 1: subs x2, x2, #1 | 35 | 1: subs x2, x2, #1 |
36 | b.mi 2f | 36 | b.mi 2f |
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 2a4e239bd17a..fb295f52e9f8 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -58,7 +58,7 @@ pos .req x11 | |||
58 | limit_wd .req x12 | 58 | limit_wd .req x12 |
59 | mask .req x13 | 59 | mask .req x13 |
60 | 60 | ||
61 | ENTRY(memcmp) | 61 | WEAK(memcmp) |
62 | cbz limit, .Lret0 | 62 | cbz limit, .Lret0 |
63 | eor tmp1, src1, src2 | 63 | eor tmp1, src1, src2 |
64 | tst tmp1, #7 | 64 | tst tmp1, #7 |
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index dae0cf5591f9..7c83091d1bcd 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -29,7 +29,7 @@ | |||
29 | * Returns: | 29 | * Returns: |
30 | * x0 - address of first occurrence of 'c' or 0 | 30 | * x0 - address of first occurrence of 'c' or 0 |
31 | */ | 31 | */ |
32 | ENTRY(strchr) | 32 | WEAK(strchr) |
33 | and w1, w1, #0xff | 33 | and w1, w1, #0xff |
34 | 1: ldrb w2, [x0], #1 | 34 | 1: ldrb w2, [x0], #1 |
35 | cmp w2, w1 | 35 | cmp w2, w1 |
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61760ef..7d5d15398bfb 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,7 +60,7 @@ tmp3 .req x9 | |||
60 | zeroones .req x10 | 60 | zeroones .req x10 |
61 | pos .req x11 | 61 | pos .req x11 |
62 | 62 | ||
63 | ENTRY(strcmp) | 63 | WEAK(strcmp) |
64 | eor tmp1, src1, src2 | 64 | eor tmp1, src1, src2 |
65 | mov zeroones, #REP8_01 | 65 | mov zeroones, #REP8_01 |
66 | tst tmp1, #7 | 66 | tst tmp1, #7 |
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 55ccc8e24c08..8e0b14205dcb 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -56,7 +56,7 @@ pos .req x12 | |||
56 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | 56 | #define REP8_7f 0x7f7f7f7f7f7f7f7f |
57 | #define REP8_80 0x8080808080808080 | 57 | #define REP8_80 0x8080808080808080 |
58 | 58 | ||
59 | ENTRY(strlen) | 59 | WEAK(strlen) |
60 | mov zeroones, #REP8_01 | 60 | mov zeroones, #REP8_01 |
61 | bic src, srcin, #15 | 61 | bic src, srcin, #15 |
62 | ands tmp1, srcin, #15 | 62 | ands tmp1, srcin, #15 |
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044761c6..66bd145935d9 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,7 +64,7 @@ limit_wd .req x13 | |||
64 | mask .req x14 | 64 | mask .req x14 |
65 | endloop .req x15 | 65 | endloop .req x15 |
66 | 66 | ||
67 | ENTRY(strncmp) | 67 | WEAK(strncmp) |
68 | cbz limit, .Lret0 | 68 | cbz limit, .Lret0 |
69 | eor tmp1, src1, src2 | 69 | eor tmp1, src1, src2 |
70 | mov zeroones, #REP8_01 | 70 | mov zeroones, #REP8_01 |
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index eae38da6e0bb..355be04441fe 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -59,7 +59,7 @@ limit_wd .req x14 | |||
59 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | 59 | #define REP8_7f 0x7f7f7f7f7f7f7f7f |
60 | #define REP8_80 0x8080808080808080 | 60 | #define REP8_80 0x8080808080808080 |
61 | 61 | ||
62 | ENTRY(strnlen) | 62 | WEAK(strnlen) |
63 | cbz limit, .Lhit_limit | 63 | cbz limit, .Lhit_limit |
64 | mov zeroones, #REP8_01 | 64 | mov zeroones, #REP8_01 |
65 | bic src, srcin, #15 | 65 | bic src, srcin, #15 |
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index f8e2784d5752..ea84924d5990 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -29,7 +29,7 @@ | |||
29 | * Returns: | 29 | * Returns: |
30 | * x0 - address of last occurrence of 'c' or 0 | 30 | * x0 - address of last occurrence of 'c' or 0 |
31 | */ | 31 | */ |
32 | ENTRY(strrchr) | 32 | WEAK(strrchr) |
33 | mov x3, #0 | 33 | mov x3, #0 |
34 | and w1, w1, #0xff | 34 | and w1, w1, #0xff |
35 | 1: ldrb w2, [x0], #1 | 35 | 1: ldrb w2, [x0], #1 |
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 3ef46522e89f..7b25d7c8fa49 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -21,6 +21,9 @@ config HEXAGON | |||
21 | select GENERIC_IRQ_SHOW | 21 | select GENERIC_IRQ_SHOW |
22 | select HAVE_ARCH_KGDB | 22 | select HAVE_ARCH_KGDB |
23 | select HAVE_ARCH_TRACEHOOK | 23 | select HAVE_ARCH_TRACEHOOK |
24 | select HAVE_MEMBLOCK | ||
25 | select ARCH_DISCARD_MEMBLOCK | ||
26 | select NO_BOOTMEM | ||
24 | select NEED_SG_DMA_LENGTH | 27 | select NEED_SG_DMA_LENGTH |
25 | select NO_IOPORT_MAP | 28 | select NO_IOPORT_MAP |
26 | select GENERIC_IOMAP | 29 | select GENERIC_IOMAP |
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1495d45e472d..d789b9cc0189 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | ||
24 | #include <asm/atomic.h> | 25 | #include <asm/atomic.h> |
25 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
26 | #include <asm/tlb.h> | 27 | #include <asm/tlb.h> |
@@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22); | |||
176 | 177 | ||
177 | void __init setup_arch_memory(void) | 178 | void __init setup_arch_memory(void) |
178 | { | 179 | { |
179 | int bootmap_size; | ||
180 | /* XXX Todo: this probably should be cleaned up */ | 180 | /* XXX Todo: this probably should be cleaned up */ |
181 | u32 *segtable = (u32 *) &swapper_pg_dir[0]; | 181 | u32 *segtable = (u32 *) &swapper_pg_dir[0]; |
182 | u32 *segtable_end; | 182 | u32 *segtable_end; |
@@ -195,18 +195,22 @@ void __init setup_arch_memory(void) | |||
195 | bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) & | 195 | bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) & |
196 | ~((BIG_KERNEL_PAGE_SIZE) - 1)); | 196 | ~((BIG_KERNEL_PAGE_SIZE) - 1)); |
197 | 197 | ||
198 | memblock_add(PHYS_OFFSET, | ||
199 | (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); | ||
200 | |||
201 | /* Reserve kernel text/data/bss */ | ||
202 | memblock_reserve(PHYS_OFFSET, | ||
203 | (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); | ||
198 | /* | 204 | /* |
199 | * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached) | 205 | * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached) |
200 | * memory allocation | 206 | * memory allocation |
201 | */ | 207 | */ |
202 | |||
203 | max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES); | 208 | max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES); |
204 | min_low_pfn = ARCH_PFN_OFFSET; | 209 | min_low_pfn = ARCH_PFN_OFFSET; |
205 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn); | 210 | memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES); |
206 | 211 | ||
207 | printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg); | 212 | printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg); |
208 | printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg); | 213 | printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg); |
209 | printk(KERN_INFO "bootmap_size: %d\n", bootmap_size); | ||
210 | printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn); | 214 | printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn); |
211 | printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn); | 215 | printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn); |
212 | 216 | ||
@@ -257,14 +261,6 @@ void __init setup_arch_memory(void) | |||
257 | #endif | 261 | #endif |
258 | 262 | ||
259 | /* | 263 | /* |
260 | * Free all the memory that wasn't taken up by the bootmap, the DMA | ||
261 | * reserve, or kernel itself. | ||
262 | */ | ||
263 | free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size, | ||
264 | PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size - | ||
265 | DMA_RESERVED_BYTES); | ||
266 | |||
267 | /* | ||
268 | * The bootmem allocator seemingly just lives to feed memory | 264 | * The bootmem allocator seemingly just lives to feed memory |
269 | * to the paging system | 265 | * to the paging system |
270 | */ | 266 | */ |
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index 74d2a5540aaf..36cc0396b214 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -3,13 +3,13 @@ | |||
3 | #define _ASM_IA64_HUGETLB_H | 3 | #define _ASM_IA64_HUGETLB_H |
4 | 4 | ||
5 | #include <asm/page.h> | 5 | #include <asm/page.h> |
6 | #include <asm-generic/hugetlb.h> | ||
7 | |||
8 | 6 | ||
7 | #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE | ||
9 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 8 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
10 | unsigned long end, unsigned long floor, | 9 | unsigned long end, unsigned long floor, |
11 | unsigned long ceiling); | 10 | unsigned long ceiling); |
12 | 11 | ||
12 | #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE | ||
13 | int prepare_hugepage_range(struct file *file, | 13 | int prepare_hugepage_range(struct file *file, |
14 | unsigned long addr, unsigned long len); | 14 | unsigned long addr, unsigned long len); |
15 | 15 | ||
@@ -21,53 +21,16 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
21 | REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); | 21 | REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); |
22 | } | 22 | } |
23 | 23 | ||
24 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 24 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH |
25 | pte_t *ptep, pte_t pte) | ||
26 | { | ||
27 | set_pte_at(mm, addr, ptep, pte); | ||
28 | } | ||
29 | |||
30 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | ||
31 | unsigned long addr, pte_t *ptep) | ||
32 | { | ||
33 | return ptep_get_and_clear(mm, addr, ptep); | ||
34 | } | ||
35 | |||
36 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 25 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
37 | unsigned long addr, pte_t *ptep) | 26 | unsigned long addr, pte_t *ptep) |
38 | { | 27 | { |
39 | } | 28 | } |
40 | 29 | ||
41 | static inline int huge_pte_none(pte_t pte) | ||
42 | { | ||
43 | return pte_none(pte); | ||
44 | } | ||
45 | |||
46 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
47 | { | ||
48 | return pte_wrprotect(pte); | ||
49 | } | ||
50 | |||
51 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
52 | unsigned long addr, pte_t *ptep) | ||
53 | { | ||
54 | ptep_set_wrprotect(mm, addr, ptep); | ||
55 | } | ||
56 | |||
57 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | ||
58 | unsigned long addr, pte_t *ptep, | ||
59 | pte_t pte, int dirty) | ||
60 | { | ||
61 | return ptep_set_access_flags(vma, addr, ptep, pte, dirty); | ||
62 | } | ||
63 | |||
64 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
65 | { | ||
66 | return *ptep; | ||
67 | } | ||
68 | |||
69 | static inline void arch_clear_hugepage_flags(struct page *page) | 30 | static inline void arch_clear_hugepage_flags(struct page *page) |
70 | { | 31 | { |
71 | } | 32 | } |
72 | 33 | ||
34 | #include <asm-generic/hugetlb.h> | ||
35 | |||
73 | #endif /* _ASM_IA64_HUGETLB_H */ | 36 | #endif /* _ASM_IA64_HUGETLB_H */ |
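The ia64 conversion above (and the matching mips, parisc, powerpc, sh, sparc and x86 hunks further down) all follow the same scheme: the arch header keeps only the helpers it really customizes, marks each one with a __HAVE_ARCH_* define, and includes <asm-generic/hugetlb.h> last so the generic header can supply defaults for everything that was not overridden. Below is a minimal stand-alone sketch of that override pattern; it is not kernel code, the bit values are made up, and it only assumes the generic side guards its defaults with #ifndef checks on those markers.

/* Sketch of the __HAVE_ARCH_* override pattern, collapsed into one file. */
#include <stdio.h>

/* --- "arch" side: only the helper it customizes, plus its marker --- */
#define __HAVE_ARCH_HUGE_PTE_NONE
static inline int huge_pte_none(unsigned long pte)
{
        /* arch-specific notion of an empty huge PTE (illustrative) */
        return (pte & ~0x1UL) == 0;
}

/* --- "asm-generic" side, included last: defaults guarded by markers --- */
#ifndef __HAVE_ARCH_HUGE_PTE_NONE
static inline int huge_pte_none(unsigned long pte)
{
        return pte == 0;                /* skipped: the arch overrode it */
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
static inline unsigned long huge_pte_wrprotect(unsigned long pte)
{
        return pte & ~0x2UL;            /* clear a pretend "writable" bit */
}
#endif

int main(void)
{
        printf("none(0x1)=%d wrprotect(0x3)=0x%lx\n",
               huge_pte_none(0x1UL), huge_pte_wrprotect(0x3UL));
        return 0;
}

Including the generic header last is what makes this work: the arch's __HAVE_ARCH_* defines are already visible when the generic defaults are evaluated, which is exactly why the hunks move the #include from the top of each arch header to the bottom.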
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 165827774bea..b1e7468eb65a 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h | |||
@@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr; | |||
544 | 544 | ||
545 | # ifdef CONFIG_VIRTUAL_MEM_MAP | 545 | # ifdef CONFIG_VIRTUAL_MEM_MAP |
546 | /* arch mem_map init routine is needed due to holes in a virtual mem_map */ | 546 | /* arch mem_map init routine is needed due to holes in a virtual mem_map */ |
547 | # define __HAVE_ARCH_MEMMAP_INIT | ||
548 | extern void memmap_init (unsigned long size, int nid, unsigned long zone, | 547 | extern void memmap_init (unsigned long size, int nid, unsigned long zone, |
549 | unsigned long start_pfn); | 548 | unsigned long start_pfn); |
550 | # endif /* CONFIG_VIRTUAL_MEM_MAP */ | 549 | # endif /* CONFIG_VIRTUAL_MEM_MAP */ |
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 982bc0685330..425bb6fc3bda 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h | |||
@@ -10,8 +10,6 @@ | |||
10 | #define __ASM_HUGETLB_H | 10 | #define __ASM_HUGETLB_H |
11 | 11 | ||
12 | #include <asm/page.h> | 12 | #include <asm/page.h> |
13 | #include <asm-generic/hugetlb.h> | ||
14 | |||
15 | 13 | ||
16 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 14 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
17 | unsigned long addr, | 15 | unsigned long addr, |
@@ -20,6 +18,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
20 | return 0; | 18 | return 0; |
21 | } | 19 | } |
22 | 20 | ||
21 | #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE | ||
23 | static inline int prepare_hugepage_range(struct file *file, | 22 | static inline int prepare_hugepage_range(struct file *file, |
24 | unsigned long addr, | 23 | unsigned long addr, |
25 | unsigned long len) | 24 | unsigned long len) |
@@ -38,21 +37,7 @@ static inline int prepare_hugepage_range(struct file *file, | |||
38 | return 0; | 37 | return 0; |
39 | } | 38 | } |
40 | 39 | ||
41 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 40 | #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR |
42 | unsigned long addr, | ||
43 | unsigned long end, | ||
44 | unsigned long floor, | ||
45 | unsigned long ceiling) | ||
46 | { | ||
47 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
48 | } | ||
49 | |||
50 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
51 | pte_t *ptep, pte_t pte) | ||
52 | { | ||
53 | set_pte_at(mm, addr, ptep, pte); | ||
54 | } | ||
55 | |||
56 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | 41 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, |
57 | unsigned long addr, pte_t *ptep) | 42 | unsigned long addr, pte_t *ptep) |
58 | { | 43 | { |
@@ -64,29 +49,21 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | |||
64 | return pte; | 49 | return pte; |
65 | } | 50 | } |
66 | 51 | ||
52 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH | ||
67 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 53 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
68 | unsigned long addr, pte_t *ptep) | 54 | unsigned long addr, pte_t *ptep) |
69 | { | 55 | { |
70 | flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); | 56 | flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); |
71 | } | 57 | } |
72 | 58 | ||
59 | #define __HAVE_ARCH_HUGE_PTE_NONE | ||
73 | static inline int huge_pte_none(pte_t pte) | 60 | static inline int huge_pte_none(pte_t pte) |
74 | { | 61 | { |
75 | unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; | 62 | unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; |
76 | return !val || (val == (unsigned long)invalid_pte_table); | 63 | return !val || (val == (unsigned long)invalid_pte_table); |
77 | } | 64 | } |
78 | 65 | ||
79 | static inline pte_t huge_pte_wrprotect(pte_t pte) | 66 | #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS |
80 | { | ||
81 | return pte_wrprotect(pte); | ||
82 | } | ||
83 | |||
84 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
85 | unsigned long addr, pte_t *ptep) | ||
86 | { | ||
87 | ptep_set_wrprotect(mm, addr, ptep); | ||
88 | } | ||
89 | |||
90 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | 67 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
91 | unsigned long addr, | 68 | unsigned long addr, |
92 | pte_t *ptep, pte_t pte, | 69 | pte_t *ptep, pte_t pte, |
@@ -105,13 +82,10 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | |||
105 | return changed; | 82 | return changed; |
106 | } | 83 | } |
107 | 84 | ||
108 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
109 | { | ||
110 | return *ptep; | ||
111 | } | ||
112 | |||
113 | static inline void arch_clear_hugepage_flags(struct page *page) | 85 | static inline void arch_clear_hugepage_flags(struct page *page) |
114 | { | 86 | { |
115 | } | 87 | } |
116 | 88 | ||
89 | #include <asm-generic/hugetlb.h> | ||
90 | |||
117 | #endif /* __ASM_HUGETLB_H */ | 91 | #endif /* __ASM_HUGETLB_H */ |
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 03965692fbfe..2df0c57f2833 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig | |||
@@ -23,6 +23,9 @@ config NIOS2 | |||
23 | select SPARSE_IRQ | 23 | select SPARSE_IRQ |
24 | select USB_ARCH_HAS_HCD if USB_SUPPORT | 24 | select USB_ARCH_HAS_HCD if USB_SUPPORT |
25 | select CPU_NO_EFFICIENT_FFS | 25 | select CPU_NO_EFFICIENT_FFS |
26 | select HAVE_MEMBLOCK | ||
27 | select ARCH_DISCARD_MEMBLOCK | ||
28 | select NO_BOOTMEM | ||
26 | 29 | ||
27 | config GENERIC_CSUM | 30 | config GENERIC_CSUM |
28 | def_bool y | 31 | def_bool y |
diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index 8d7446a4b475..a6d4f7530247 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c | |||
@@ -32,23 +32,6 @@ | |||
32 | 32 | ||
33 | #include <asm/sections.h> | 33 | #include <asm/sections.h> |
34 | 34 | ||
35 | void __init early_init_dt_add_memory_arch(u64 base, u64 size) | ||
36 | { | ||
37 | u64 kernel_start = (u64)virt_to_phys(_text); | ||
38 | |||
39 | if (!memory_size && | ||
40 | (kernel_start >= base) && (kernel_start < (base + size))) | ||
41 | memory_size = size; | ||
42 | |||
43 | } | ||
44 | |||
45 | int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, | ||
46 | bool nomap) | ||
47 | { | ||
48 | reserve_bootmem(base, size, BOOTMEM_DEFAULT); | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | void __init early_init_devtree(void *params) | 35 | void __init early_init_devtree(void *params) |
53 | { | 36 | { |
54 | __be32 *dtb = (u32 *)__dtb_start; | 37 | __be32 *dtb = (u32 *)__dtb_start; |
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 926a02b17b31..2d0011ddd4d5 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/sched/task.h> | 17 | #include <linux/sched/task.h> |
18 | #include <linux/console.h> | 18 | #include <linux/console.h> |
19 | #include <linux/bootmem.h> | 19 | #include <linux/bootmem.h> |
20 | #include <linux/memblock.h> | ||
20 | #include <linux/initrd.h> | 21 | #include <linux/initrd.h> |
21 | #include <linux/of_fdt.h> | 22 | #include <linux/of_fdt.h> |
22 | #include <linux/screen_info.h> | 23 | #include <linux/screen_info.h> |
@@ -143,10 +144,12 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, | |||
143 | 144 | ||
144 | void __init setup_arch(char **cmdline_p) | 145 | void __init setup_arch(char **cmdline_p) |
145 | { | 146 | { |
146 | int bootmap_size; | 147 | int dram_start; |
147 | 148 | ||
148 | console_verbose(); | 149 | console_verbose(); |
149 | 150 | ||
151 | dram_start = memblock_start_of_DRAM(); | ||
152 | memory_size = memblock_phys_mem_size(); | ||
150 | memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); | 153 | memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); |
151 | memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; | 154 | memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; |
152 | 155 | ||
@@ -163,39 +166,11 @@ void __init setup_arch(char **cmdline_p) | |||
163 | max_low_pfn = PFN_DOWN(memory_end); | 166 | max_low_pfn = PFN_DOWN(memory_end); |
164 | max_mapnr = max_low_pfn; | 167 | max_mapnr = max_low_pfn; |
165 | 168 | ||
166 | /* | 169 | memblock_reserve(dram_start, memory_start - dram_start); |
167 | * give all the memory to the bootmap allocator, tell it to put the | ||
168 | * boot mem_map at the start of memory | ||
169 | */ | ||
170 | pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n", | ||
171 | min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn); | ||
172 | bootmap_size = init_bootmem_node(NODE_DATA(0), | ||
173 | min_low_pfn, PFN_DOWN(PHYS_OFFSET), | ||
174 | max_low_pfn); | ||
175 | |||
176 | /* | ||
177 | * free the usable memory, we have to make sure we do not free | ||
178 | * the bootmem bitmap so we then reserve it after freeing it :-) | ||
179 | */ | ||
180 | pr_debug("free_bootmem(%#lx, %#lx)\n", | ||
181 | memory_start, memory_end - memory_start); | ||
182 | free_bootmem(memory_start, memory_end - memory_start); | ||
183 | |||
184 | /* | ||
185 | * Reserve the bootmem bitmap itself as well. We do this in two | ||
186 | * steps (first step was init_bootmem()) because this catches | ||
187 | * the (very unlikely) case of us accidentally initializing the | ||
188 | * bootmem allocator with an invalid RAM area. | ||
189 | * | ||
190 | * Arguments are start, size | ||
191 | */ | ||
192 | pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size); | ||
193 | reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT); | ||
194 | |||
195 | #ifdef CONFIG_BLK_DEV_INITRD | 170 | #ifdef CONFIG_BLK_DEV_INITRD |
196 | if (initrd_start) { | 171 | if (initrd_start) { |
197 | reserve_bootmem(virt_to_phys((void *)initrd_start), | 172 | memblock_reserve(virt_to_phys((void *)initrd_start), |
198 | initrd_end - initrd_start, BOOTMEM_DEFAULT); | 173 | initrd_end - initrd_start); |
199 | } | 174 | } |
200 | #endif /* CONFIG_BLK_DEV_INITRD */ | 175 | #endif /* CONFIG_BLK_DEV_INITRD */ |
201 | 176 | ||
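The nios2 hunks above drop the bootmem bitmap dance (init_bootmem_node(), free_bootmem(), reserve_bootmem()) in favour of memblock: the generic DT code registers the available DRAM, setup_arch() reserves everything below the end of the kernel image (plus the initrd), and whatever was added but not reserved is later handed to the page allocator by generic code. The toy model below mirrors that bookkeeping in userspace; the toy_-prefixed helpers, the single memory bank and the addresses are all made up for illustration and are not the kernel's interfaces.

/* Toy model of the add/reserve bookkeeping: one bank, a few reservations. */
#include <stdio.h>

#define MAX_RESV 8

static unsigned long mem_base, mem_size;                 /* one "added" bank */
static unsigned long resv_base[MAX_RESV], resv_size[MAX_RESV];
static int nr_resv;                                      /* reservations     */

static void toy_memblock_add(unsigned long base, unsigned long size)
{
        mem_base = base;
        mem_size = size;
}

static void toy_memblock_reserve(unsigned long base, unsigned long size)
{
        resv_base[nr_resv] = base;
        resv_size[nr_resv] = size;
        nr_resv++;
}

int main(void)
{
        unsigned long dram_start = 0x00000000, dram_size = 32UL << 20;
        unsigned long kernel_end = 0x00400000;           /* "_end", made up  */
        unsigned long usable;
        int i;

        /* Describe the DRAM bank (the DT code does this in the real flow),
         * then carve out everything below the end of the kernel image, as
         * the converted setup_arch() now does with memblock_reserve(). */
        toy_memblock_add(dram_start, dram_size);
        toy_memblock_reserve(dram_start, kernel_end - dram_start);

        usable = mem_size;
        for (i = 0; i < nr_resv; i++)
                usable -= resv_size[i];
        printf("usable after boot: %lu MiB (base %#lx)\n",
               usable >> 20, mem_base);
        return 0;
}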
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 58e0f4620426..7cb595dcb7d7 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h | |||
@@ -3,12 +3,12 @@ | |||
3 | #define _ASM_PARISC64_HUGETLB_H | 3 | #define _ASM_PARISC64_HUGETLB_H |
4 | 4 | ||
5 | #include <asm/page.h> | 5 | #include <asm/page.h> |
6 | #include <asm-generic/hugetlb.h> | ||
7 | |||
8 | 6 | ||
7 | #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT | ||
9 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 8 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
10 | pte_t *ptep, pte_t pte); | 9 | pte_t *ptep, pte_t pte); |
11 | 10 | ||
11 | #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR | ||
12 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | 12 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
13 | pte_t *ptep); | 13 | pte_t *ptep); |
14 | 14 | ||
@@ -22,6 +22,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
22 | * If the arch doesn't supply something else, assume that hugepage | 22 | * If the arch doesn't supply something else, assume that hugepage |
23 | * size aligned regions are ok without further preparation. | 23 | * size aligned regions are ok without further preparation. |
24 | */ | 24 | */ |
25 | #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE | ||
25 | static inline int prepare_hugepage_range(struct file *file, | 26 | static inline int prepare_hugepage_range(struct file *file, |
26 | unsigned long addr, unsigned long len) | 27 | unsigned long addr, unsigned long len) |
27 | { | 28 | { |
@@ -32,43 +33,25 @@ static inline int prepare_hugepage_range(struct file *file, | |||
32 | return 0; | 33 | return 0; |
33 | } | 34 | } |
34 | 35 | ||
35 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 36 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH |
36 | unsigned long addr, unsigned long end, | ||
37 | unsigned long floor, | ||
38 | unsigned long ceiling) | ||
39 | { | ||
40 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
41 | } | ||
42 | |||
43 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 37 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
44 | unsigned long addr, pte_t *ptep) | 38 | unsigned long addr, pte_t *ptep) |
45 | { | 39 | { |
46 | } | 40 | } |
47 | 41 | ||
48 | static inline int huge_pte_none(pte_t pte) | 42 | #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT |
49 | { | ||
50 | return pte_none(pte); | ||
51 | } | ||
52 | |||
53 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
54 | { | ||
55 | return pte_wrprotect(pte); | ||
56 | } | ||
57 | |||
58 | void huge_ptep_set_wrprotect(struct mm_struct *mm, | 43 | void huge_ptep_set_wrprotect(struct mm_struct *mm, |
59 | unsigned long addr, pte_t *ptep); | 44 | unsigned long addr, pte_t *ptep); |
60 | 45 | ||
46 | #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS | ||
61 | int huge_ptep_set_access_flags(struct vm_area_struct *vma, | 47 | int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
62 | unsigned long addr, pte_t *ptep, | 48 | unsigned long addr, pte_t *ptep, |
63 | pte_t pte, int dirty); | 49 | pte_t pte, int dirty); |
64 | 50 | ||
65 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
66 | { | ||
67 | return *ptep; | ||
68 | } | ||
69 | |||
70 | static inline void arch_clear_hugepage_flags(struct page *page) | 51 | static inline void arch_clear_hugepage_flags(struct page *page) |
71 | { | 52 | { |
72 | } | 53 | } |
73 | 54 | ||
55 | #include <asm-generic/hugetlb.h> | ||
56 | |||
74 | #endif /* _ASM_PARISC64_HUGETLB_H */ | 57 | #endif /* _ASM_PARISC64_HUGETLB_H */ |
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index e61dd3ae5bc0..c21d33704633 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h | |||
@@ -311,12 +311,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, | |||
311 | { | 311 | { |
312 | pte_update(ptep, _PAGE_RW, 0); | 312 | pte_update(ptep, _PAGE_RW, 0); |
313 | } | 313 | } |
314 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
315 | unsigned long addr, pte_t *ptep) | ||
316 | { | ||
317 | ptep_set_wrprotect(mm, addr, ptep); | ||
318 | } | ||
319 | |||
320 | 314 | ||
321 | static inline void __ptep_set_access_flags(struct vm_area_struct *vma, | 315 | static inline void __ptep_set_access_flags(struct vm_area_struct *vma, |
322 | pte_t *ptep, pte_t entry, | 316 | pte_t *ptep, pte_t entry, |
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb5dd4078d42..c4a726c10af5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h | |||
@@ -426,6 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, | |||
426 | pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); | 426 | pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); |
427 | } | 427 | } |
428 | 428 | ||
429 | #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT | ||
429 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | 430 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, |
430 | unsigned long addr, pte_t *ptep) | 431 | unsigned long addr, pte_t *ptep) |
431 | { | 432 | { |
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 2d00cc530083..383da1ab9e23 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h | |||
@@ -4,7 +4,6 @@ | |||
4 | 4 | ||
5 | #ifdef CONFIG_HUGETLB_PAGE | 5 | #ifdef CONFIG_HUGETLB_PAGE |
6 | #include <asm/page.h> | 6 | #include <asm/page.h> |
7 | #include <asm-generic/hugetlb.h> | ||
8 | 7 | ||
9 | extern struct kmem_cache *hugepte_cache; | 8 | extern struct kmem_cache *hugepte_cache; |
10 | 9 | ||
@@ -110,31 +109,12 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, | |||
110 | void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); | 109 | void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); |
111 | #endif | 110 | #endif |
112 | 111 | ||
112 | #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE | ||
113 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 113 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
114 | unsigned long end, unsigned long floor, | 114 | unsigned long end, unsigned long floor, |
115 | unsigned long ceiling); | 115 | unsigned long ceiling); |
116 | 116 | ||
117 | /* | 117 | #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR |
118 | * If the arch doesn't supply something else, assume that hugepage | ||
119 | * size aligned regions are ok without further preparation. | ||
120 | */ | ||
121 | static inline int prepare_hugepage_range(struct file *file, | ||
122 | unsigned long addr, unsigned long len) | ||
123 | { | ||
124 | struct hstate *h = hstate_file(file); | ||
125 | if (len & ~huge_page_mask(h)) | ||
126 | return -EINVAL; | ||
127 | if (addr & ~huge_page_mask(h)) | ||
128 | return -EINVAL; | ||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
133 | pte_t *ptep, pte_t pte) | ||
134 | { | ||
135 | set_pte_at(mm, addr, ptep, pte); | ||
136 | } | ||
137 | |||
138 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | 118 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, |
139 | unsigned long addr, pte_t *ptep) | 119 | unsigned long addr, pte_t *ptep) |
140 | { | 120 | { |
@@ -145,6 +125,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | |||
145 | #endif | 125 | #endif |
146 | } | 126 | } |
147 | 127 | ||
128 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH | ||
148 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 129 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
149 | unsigned long addr, pte_t *ptep) | 130 | unsigned long addr, pte_t *ptep) |
150 | { | 131 | { |
@@ -153,29 +134,17 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | |||
153 | flush_hugetlb_page(vma, addr); | 134 | flush_hugetlb_page(vma, addr); |
154 | } | 135 | } |
155 | 136 | ||
156 | static inline int huge_pte_none(pte_t pte) | 137 | #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS |
157 | { | ||
158 | return pte_none(pte); | ||
159 | } | ||
160 | |||
161 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
162 | { | ||
163 | return pte_wrprotect(pte); | ||
164 | } | ||
165 | |||
166 | extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, | 138 | extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
167 | unsigned long addr, pte_t *ptep, | 139 | unsigned long addr, pte_t *ptep, |
168 | pte_t pte, int dirty); | 140 | pte_t pte, int dirty); |
169 | 141 | ||
170 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
171 | { | ||
172 | return *ptep; | ||
173 | } | ||
174 | |||
175 | static inline void arch_clear_hugepage_flags(struct page *page) | 142 | static inline void arch_clear_hugepage_flags(struct page *page) |
176 | { | 143 | { |
177 | } | 144 | } |
178 | 145 | ||
146 | #include <asm-generic/hugetlb.h> | ||
147 | |||
179 | #else /* ! CONFIG_HUGETLB_PAGE */ | 148 | #else /* ! CONFIG_HUGETLB_PAGE */ |
180 | static inline void flush_hugetlb_page(struct vm_area_struct *vma, | 149 | static inline void flush_hugetlb_page(struct vm_area_struct *vma, |
181 | unsigned long vmaddr) | 150 | unsigned long vmaddr) |
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index f7b129a83054..3ffb0ff5a038 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h | |||
@@ -300,12 +300,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, | |||
300 | 300 | ||
301 | pte_update(ptep, clr, set); | 301 | pte_update(ptep, clr, set); |
302 | } | 302 | } |
303 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
304 | unsigned long addr, pte_t *ptep) | ||
305 | { | ||
306 | ptep_set_wrprotect(mm, addr, ptep); | ||
307 | } | ||
308 | |||
309 | 303 | ||
310 | static inline void __ptep_set_access_flags(struct vm_area_struct *vma, | 304 | static inline void __ptep_set_access_flags(struct vm_area_struct *vma, |
311 | pte_t *ptep, pte_t entry, | 305 | pte_t *ptep, pte_t entry, |
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index dc6bb9da3f23..67421f74efcf 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h | |||
@@ -275,6 +275,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, | |||
275 | pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); | 275 | pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); |
276 | } | 276 | } |
277 | 277 | ||
278 | #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT | ||
278 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | 279 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, |
279 | unsigned long addr, pte_t *ptep) | 280 | unsigned long addr, pte_t *ptep) |
280 | { | 281 | { |
diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c36ef5..5d8e8b6bb1cc 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c | |||
@@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) | |||
49 | cpu = info->policy->cpu; | 49 | cpu = info->policy->cpu; |
50 | busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); | 50 | busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); |
51 | 51 | ||
52 | CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); | 52 | info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); |
53 | pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", | 53 | pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", |
54 | cpu, busy_spus, info->busy_spus); | 54 | cpu, busy_spus, info->busy_spus); |
55 | 55 | ||
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c532169..9fcccb4490b9 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c | |||
@@ -987,9 +987,9 @@ static void spu_calc_load(void) | |||
987 | unsigned long active_tasks; /* fixed-point */ | 987 | unsigned long active_tasks; /* fixed-point */ |
988 | 988 | ||
989 | active_tasks = count_active_contexts() * FIXED_1; | 989 | active_tasks = count_active_contexts() * FIXED_1; |
990 | CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); | 990 | spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); |
991 | CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); | 991 | spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); |
992 | CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); | 992 | spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); |
993 | } | 993 | } |
994 | 994 | ||
995 | static void spusched_wake(struct timer_list *unused) | 995 | static void spusched_wake(struct timer_list *unused) |
@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, | |||
1071 | } | 1071 | } |
1072 | } | 1072 | } |
1073 | 1073 | ||
1074 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
1075 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
1076 | |||
1077 | static int show_spu_loadavg(struct seq_file *s, void *private) | 1074 | static int show_spu_loadavg(struct seq_file *s, void *private) |
1078 | { | 1075 | { |
1079 | int a, b, c; | 1076 | int a, b, c; |
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 433a994b1a89..54f375627532 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c | |||
@@ -25,10 +25,6 @@ | |||
25 | 25 | ||
26 | #include "appldata.h" | 26 | #include "appldata.h" |
27 | 27 | ||
28 | |||
29 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
30 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
31 | |||
32 | /* | 28 | /* |
33 | * OS data | 29 | * OS data |
34 | * | 30 | * |
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 735939c0f513..6f025fe18146 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h | |||
@@ -4,8 +4,6 @@ | |||
4 | 4 | ||
5 | #include <asm/cacheflush.h> | 5 | #include <asm/cacheflush.h> |
6 | #include <asm/page.h> | 6 | #include <asm/page.h> |
7 | #include <asm-generic/hugetlb.h> | ||
8 | |||
9 | 7 | ||
10 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 8 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
11 | unsigned long addr, | 9 | unsigned long addr, |
@@ -17,6 +15,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
17 | * If the arch doesn't supply something else, assume that hugepage | 15 | * If the arch doesn't supply something else, assume that hugepage |
18 | * size aligned regions are ok without further preparation. | 16 | * size aligned regions are ok without further preparation. |
19 | */ | 17 | */ |
18 | #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE | ||
20 | static inline int prepare_hugepage_range(struct file *file, | 19 | static inline int prepare_hugepage_range(struct file *file, |
21 | unsigned long addr, unsigned long len) | 20 | unsigned long addr, unsigned long len) |
22 | { | 21 | { |
@@ -27,62 +26,17 @@ static inline int prepare_hugepage_range(struct file *file, | |||
27 | return 0; | 26 | return 0; |
28 | } | 27 | } |
29 | 28 | ||
30 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 29 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH |
31 | unsigned long addr, unsigned long end, | ||
32 | unsigned long floor, | ||
33 | unsigned long ceiling) | ||
34 | { | ||
35 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
36 | } | ||
37 | |||
38 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
39 | pte_t *ptep, pte_t pte) | ||
40 | { | ||
41 | set_pte_at(mm, addr, ptep, pte); | ||
42 | } | ||
43 | |||
44 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | ||
45 | unsigned long addr, pte_t *ptep) | ||
46 | { | ||
47 | return ptep_get_and_clear(mm, addr, ptep); | ||
48 | } | ||
49 | |||
50 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 30 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
51 | unsigned long addr, pte_t *ptep) | 31 | unsigned long addr, pte_t *ptep) |
52 | { | 32 | { |
53 | } | 33 | } |
54 | 34 | ||
55 | static inline int huge_pte_none(pte_t pte) | ||
56 | { | ||
57 | return pte_none(pte); | ||
58 | } | ||
59 | |||
60 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
61 | { | ||
62 | return pte_wrprotect(pte); | ||
63 | } | ||
64 | |||
65 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
66 | unsigned long addr, pte_t *ptep) | ||
67 | { | ||
68 | ptep_set_wrprotect(mm, addr, ptep); | ||
69 | } | ||
70 | |||
71 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | ||
72 | unsigned long addr, pte_t *ptep, | ||
73 | pte_t pte, int dirty) | ||
74 | { | ||
75 | return ptep_set_access_flags(vma, addr, ptep, pte, dirty); | ||
76 | } | ||
77 | |||
78 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
79 | { | ||
80 | return *ptep; | ||
81 | } | ||
82 | |||
83 | static inline void arch_clear_hugepage_flags(struct page *page) | 35 | static inline void arch_clear_hugepage_flags(struct page *page) |
84 | { | 36 | { |
85 | clear_bit(PG_dcache_clean, &page->flags); | 37 | clear_bit(PG_dcache_clean, &page->flags); |
86 | } | 38 | } |
87 | 39 | ||
40 | #include <asm-generic/hugetlb.h> | ||
41 | |||
88 | #endif /* _ASM_SH_HUGETLB_H */ | 42 | #endif /* _ASM_SH_HUGETLB_H */ |
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 300557c66698..3963f80d1cb3 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h | |||
@@ -3,7 +3,6 @@ | |||
3 | #define _ASM_SPARC64_HUGETLB_H | 3 | #define _ASM_SPARC64_HUGETLB_H |
4 | 4 | ||
5 | #include <asm/page.h> | 5 | #include <asm/page.h> |
6 | #include <asm-generic/hugetlb.h> | ||
7 | 6 | ||
8 | #ifdef CONFIG_HUGETLB_PAGE | 7 | #ifdef CONFIG_HUGETLB_PAGE |
9 | struct pud_huge_patch_entry { | 8 | struct pud_huge_patch_entry { |
@@ -13,9 +12,11 @@ struct pud_huge_patch_entry { | |||
13 | extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; | 12 | extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; |
14 | #endif | 13 | #endif |
15 | 14 | ||
15 | #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT | ||
16 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 16 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
17 | pte_t *ptep, pte_t pte); | 17 | pte_t *ptep, pte_t pte); |
18 | 18 | ||
19 | #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR | ||
19 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | 20 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
20 | pte_t *ptep); | 21 | pte_t *ptep); |
21 | 22 | ||
@@ -25,37 +26,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
25 | return 0; | 26 | return 0; |
26 | } | 27 | } |
27 | 28 | ||
28 | /* | 29 | #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH |
29 | * If the arch doesn't supply something else, assume that hugepage | ||
30 | * size aligned regions are ok without further preparation. | ||
31 | */ | ||
32 | static inline int prepare_hugepage_range(struct file *file, | ||
33 | unsigned long addr, unsigned long len) | ||
34 | { | ||
35 | struct hstate *h = hstate_file(file); | ||
36 | |||
37 | if (len & ~huge_page_mask(h)) | ||
38 | return -EINVAL; | ||
39 | if (addr & ~huge_page_mask(h)) | ||
40 | return -EINVAL; | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | 30 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, |
45 | unsigned long addr, pte_t *ptep) | 31 | unsigned long addr, pte_t *ptep) |
46 | { | 32 | { |
47 | } | 33 | } |
48 | 34 | ||
49 | static inline int huge_pte_none(pte_t pte) | 35 | #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT |
50 | { | ||
51 | return pte_none(pte); | ||
52 | } | ||
53 | |||
54 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
55 | { | ||
56 | return pte_wrprotect(pte); | ||
57 | } | ||
58 | |||
59 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | 36 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, |
60 | unsigned long addr, pte_t *ptep) | 37 | unsigned long addr, pte_t *ptep) |
61 | { | 38 | { |
@@ -63,6 +40,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | |||
63 | set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); | 40 | set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); |
64 | } | 41 | } |
65 | 42 | ||
43 | #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS | ||
66 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | 44 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
67 | unsigned long addr, pte_t *ptep, | 45 | unsigned long addr, pte_t *ptep, |
68 | pte_t pte, int dirty) | 46 | pte_t pte, int dirty) |
@@ -75,17 +53,15 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | |||
75 | return changed; | 53 | return changed; |
76 | } | 54 | } |
77 | 55 | ||
78 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
79 | { | ||
80 | return *ptep; | ||
81 | } | ||
82 | |||
83 | static inline void arch_clear_hugepage_flags(struct page *page) | 56 | static inline void arch_clear_hugepage_flags(struct page *page) |
84 | { | 57 | { |
85 | } | 58 | } |
86 | 59 | ||
60 | #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE | ||
87 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 61 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
88 | unsigned long end, unsigned long floor, | 62 | unsigned long end, unsigned long floor, |
89 | unsigned long ceiling); | 63 | unsigned long ceiling); |
90 | 64 | ||
65 | #include <asm-generic/hugetlb.h> | ||
66 | |||
91 | #endif /* _ASM_SPARC64_HUGETLB_H */ | 67 | #endif /* _ASM_SPARC64_HUGETLB_H */ |
diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 6b9938919f0b..10c15b8853ae 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig | |||
@@ -12,6 +12,8 @@ config UML | |||
12 | select HAVE_UID16 | 12 | select HAVE_UID16 |
13 | select HAVE_FUTEX_CMPXCHG if FUTEX | 13 | select HAVE_FUTEX_CMPXCHG if FUTEX |
14 | select HAVE_DEBUG_KMEMLEAK | 14 | select HAVE_DEBUG_KMEMLEAK |
15 | select HAVE_MEMBLOCK | ||
16 | select NO_BOOTMEM | ||
15 | select GENERIC_IRQ_SHOW | 17 | select GENERIC_IRQ_SHOW |
16 | select GENERIC_CPU_DEVICES | 18 | select GENERIC_CPU_DEVICES |
17 | select GENERIC_CLOCKEVENTS | 19 | select GENERIC_CLOCKEVENTS |
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index f02596e9931d..296a91a04598 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/module.h> | 6 | #include <linux/module.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/memblock.h> | ||
8 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
9 | #include <linux/pfn.h> | 10 | #include <linux/pfn.h> |
10 | #include <asm/page.h> | 11 | #include <asm/page.h> |
@@ -80,28 +81,23 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, | |||
80 | unsigned long len, unsigned long long highmem) | 81 | unsigned long len, unsigned long long highmem) |
81 | { | 82 | { |
82 | unsigned long reserve = reserve_end - start; | 83 | unsigned long reserve = reserve_end - start; |
83 | unsigned long pfn = PFN_UP(__pa(reserve_end)); | 84 | long map_size = len - reserve; |
84 | unsigned long delta = (len - reserve) >> PAGE_SHIFT; | ||
85 | unsigned long offset, bootmap_size; | ||
86 | long map_size; | ||
87 | int err; | 85 | int err; |
88 | 86 | ||
89 | offset = uml_reserved - uml_physmem; | ||
90 | map_size = len - offset; | ||
91 | if(map_size <= 0) { | 87 | if(map_size <= 0) { |
92 | os_warn("Too few physical memory! Needed=%lu, given=%lu\n", | 88 | os_warn("Too few physical memory! Needed=%lu, given=%lu\n", |
93 | offset, len); | 89 | reserve, len); |
94 | exit(1); | 90 | exit(1); |
95 | } | 91 | } |
96 | 92 | ||
97 | physmem_fd = create_mem_file(len + highmem); | 93 | physmem_fd = create_mem_file(len + highmem); |
98 | 94 | ||
99 | err = os_map_memory((void *) uml_reserved, physmem_fd, offset, | 95 | err = os_map_memory((void *) reserve_end, physmem_fd, reserve, |
100 | map_size, 1, 1, 1); | 96 | map_size, 1, 1, 1); |
101 | if (err < 0) { | 97 | if (err < 0) { |
102 | os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " | 98 | os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " |
103 | "failed - errno = %d\n", map_size, | 99 | "failed - errno = %d\n", map_size, |
104 | (void *) uml_reserved, err); | 100 | (void *) reserve_end, err); |
105 | exit(1); | 101 | exit(1); |
106 | } | 102 | } |
107 | 103 | ||
@@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, | |||
113 | os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); | 109 | os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); |
114 | os_fsync_file(physmem_fd); | 110 | os_fsync_file(physmem_fd); |
115 | 111 | ||
116 | bootmap_size = init_bootmem(pfn, pfn + delta); | 112 | memblock_add(__pa(start), len + highmem); |
117 | free_bootmem(__pa(reserve_end) + bootmap_size, | 113 | memblock_reserve(__pa(start), reserve); |
118 | len - bootmap_size - reserve); | 114 | |
115 | min_low_pfn = PFN_UP(__pa(reserve_end)); | ||
116 | max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); | ||
119 | } | 117 | } |
120 | 118 | ||
121 | int phys_mapping(unsigned long phys, unsigned long long *offset_out) | 119 | int phys_mapping(unsigned long phys, unsigned long long *offset_out) |
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 3a3b40f79558..0c5111b206bd 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig | |||
@@ -6,6 +6,7 @@ config UNICORE32 | |||
6 | select ARCH_MIGHT_HAVE_PC_SERIO | 6 | select ARCH_MIGHT_HAVE_PC_SERIO |
7 | select DMA_DIRECT_OPS | 7 | select DMA_DIRECT_OPS |
8 | select HAVE_MEMBLOCK | 8 | select HAVE_MEMBLOCK |
9 | select NO_BOOTMEM | ||
9 | select HAVE_GENERIC_DMA_COHERENT | 10 | select HAVE_GENERIC_DMA_COHERENT |
10 | select HAVE_KERNEL_GZIP | 11 | select HAVE_KERNEL_GZIP |
11 | select HAVE_KERNEL_BZIP2 | 12 | select HAVE_KERNEL_BZIP2 |
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 5f72a8d1d953..8f8699e62bd5 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c | |||
@@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, | |||
84 | } | 84 | } |
85 | } | 85 | } |
86 | 86 | ||
87 | static void __init uc32_bootmem_init(unsigned long start_pfn, | ||
88 | unsigned long end_pfn) | ||
89 | { | ||
90 | struct memblock_region *reg; | ||
91 | unsigned int boot_pages; | ||
92 | phys_addr_t bitmap; | ||
93 | pg_data_t *pgdat; | ||
94 | |||
95 | /* | ||
96 | * Allocate the bootmem bitmap page. This must be in a region | ||
97 | * of memory which has already been mapped. | ||
98 | */ | ||
99 | boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
100 | bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES, | ||
101 | __pfn_to_phys(end_pfn)); | ||
102 | |||
103 | /* | ||
104 | * Initialise the bootmem allocator, handing the | ||
105 | * memory banks over to bootmem. | ||
106 | */ | ||
107 | node_set_online(0); | ||
108 | pgdat = NODE_DATA(0); | ||
109 | init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn); | ||
110 | |||
111 | /* Free the lowmem regions from memblock into bootmem. */ | ||
112 | for_each_memblock(memory, reg) { | ||
113 | unsigned long start = memblock_region_memory_base_pfn(reg); | ||
114 | unsigned long end = memblock_region_memory_end_pfn(reg); | ||
115 | |||
116 | if (end >= end_pfn) | ||
117 | end = end_pfn; | ||
118 | if (start >= end) | ||
119 | break; | ||
120 | |||
121 | free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT); | ||
122 | } | ||
123 | |||
124 | /* Reserve the lowmem memblock reserved regions in bootmem. */ | ||
125 | for_each_memblock(reserved, reg) { | ||
126 | unsigned long start = memblock_region_reserved_base_pfn(reg); | ||
127 | unsigned long end = memblock_region_reserved_end_pfn(reg); | ||
128 | |||
129 | if (end >= end_pfn) | ||
130 | end = end_pfn; | ||
131 | if (start >= end) | ||
132 | break; | ||
133 | |||
134 | reserve_bootmem(__pfn_to_phys(start), | ||
135 | (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT); | ||
136 | } | ||
137 | } | ||
138 | |||
139 | static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, | 87 | static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, |
140 | unsigned long max_high) | 88 | unsigned long max_high) |
141 | { | 89 | { |
@@ -232,7 +180,7 @@ void __init bootmem_init(void) | |||
232 | 180 | ||
233 | find_limits(&min, &max_low, &max_high); | 181 | find_limits(&min, &max_low, &max_high); |
234 | 182 | ||
235 | uc32_bootmem_init(min, max_low); | 183 | node_set_online(0); |
236 | 184 | ||
237 | /* | 185 | /* |
238 | * Sparsemem tries to allocate bootmem in memory_present(), | 186 | * Sparsemem tries to allocate bootmem in memory_present(), |
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 3f9d43f26f63..7eb878561910 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c | |||
@@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image) | |||
39 | 39 | ||
40 | struct linux_binprm; | 40 | struct linux_binprm; |
41 | 41 | ||
42 | static int vdso_fault(const struct vm_special_mapping *sm, | 42 | static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, |
43 | struct vm_area_struct *vma, struct vm_fault *vmf) | 43 | struct vm_area_struct *vma, struct vm_fault *vmf) |
44 | { | 44 | { |
45 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | 45 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; |
@@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm, | |||
84 | return 0; | 84 | return 0; |
85 | } | 85 | } |
86 | 86 | ||
87 | static int vvar_fault(const struct vm_special_mapping *sm, | 87 | static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, |
88 | struct vm_area_struct *vma, struct vm_fault *vmf) | 88 | struct vm_area_struct *vma, struct vm_fault *vmf) |
89 | { | 89 | { |
90 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | 90 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; |
91 | long sym_offset; | 91 | long sym_offset; |
92 | int ret = -EFAULT; | ||
93 | 92 | ||
94 | if (!image) | 93 | if (!image) |
95 | return VM_FAULT_SIGBUS; | 94 | return VM_FAULT_SIGBUS; |
@@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm, | |||
108 | return VM_FAULT_SIGBUS; | 107 | return VM_FAULT_SIGBUS; |
109 | 108 | ||
110 | if (sym_offset == image->sym_vvar_page) { | 109 | if (sym_offset == image->sym_vvar_page) { |
111 | ret = vm_insert_pfn(vma, vmf->address, | 110 | return vmf_insert_pfn(vma, vmf->address, |
112 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); | 111 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); |
113 | } else if (sym_offset == image->sym_pvclock_page) { | 112 | } else if (sym_offset == image->sym_pvclock_page) { |
114 | struct pvclock_vsyscall_time_info *pvti = | 113 | struct pvclock_vsyscall_time_info *pvti = |
115 | pvclock_get_pvti_cpu0_va(); | 114 | pvclock_get_pvti_cpu0_va(); |
116 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { | 115 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { |
117 | ret = vm_insert_pfn_prot( | 116 | return vmf_insert_pfn_prot(vma, vmf->address, |
118 | vma, | 117 | __pa(pvti) >> PAGE_SHIFT, |
119 | vmf->address, | 118 | pgprot_decrypted(vma->vm_page_prot)); |
120 | __pa(pvti) >> PAGE_SHIFT, | ||
121 | pgprot_decrypted(vma->vm_page_prot)); | ||
122 | } | 119 | } |
123 | } else if (sym_offset == image->sym_hvclock_page) { | 120 | } else if (sym_offset == image->sym_hvclock_page) { |
124 | struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); | 121 | struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); |
125 | 122 | ||
126 | if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) | 123 | if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) |
127 | ret = vm_insert_pfn(vma, vmf->address, | 124 | return vmf_insert_pfn(vma, vmf->address, |
128 | vmalloc_to_pfn(tsc_pg)); | 125 | vmalloc_to_pfn(tsc_pg)); |
129 | } | 126 | } |
130 | 127 | ||
131 | if (ret == 0 || ret == -EBUSY) | ||
132 | return VM_FAULT_NOPAGE; | ||
133 | |||
134 | return VM_FAULT_SIGBUS; | 128 | return VM_FAULT_SIGBUS; |
135 | } | 129 | } |
136 | 130 | ||
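The vDSO hunks above switch vdso_fault() and vvar_fault() to return vm_fault_t and to call the vmf_insert_pfn*() helpers, which report a fault status directly, so the old tail that mapped 0 / -EBUSY onto VM_FAULT_NOPAGE disappears. The sketch below only illustrates that shape change; the typedef, the constants and the *_stub helpers are stand-ins, not the kernel's definitions.

/* Before/after shape of an errno-returning vs. vm_fault_t-returning path. */
#include <stdio.h>

typedef unsigned int vm_fault_t;
#define VM_FAULT_NOPAGE 0x0100u
#define VM_FAULT_SIGBUS 0x0200u
#define EBUSY  16
#define EFAULT 14

/* Old-style primitive: reports an errno the caller has to translate. */
static int insert_pfn_old_stub(int ok)
{
        return ok ? 0 : -EFAULT;
}

/* New-style primitive: reports the fault status directly. */
static vm_fault_t insert_pfn_new_stub(int ok)
{
        return ok ? VM_FAULT_NOPAGE : VM_FAULT_SIGBUS;
}

static vm_fault_t fault_old_style(int ok)
{
        int ret = insert_pfn_old_stub(ok);

        /* this is the translation step the patch removes */
        if (ret == 0 || ret == -EBUSY)
                return VM_FAULT_NOPAGE;
        return VM_FAULT_SIGBUS;
}

static vm_fault_t fault_new_style(int ok)
{
        return insert_pfn_new_stub(ok);  /* nothing left to translate */
}

int main(void)
{
        printf("old: %#x/%#x  new: %#x/%#x\n",
               fault_old_style(1), fault_old_style(0),
               fault_new_style(1), fault_new_style(0));
        return 0;
}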
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 5ed826da5e07..7469d321f072 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h | |||
@@ -13,75 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, | |||
13 | return 0; | 13 | return 0; |
14 | } | 14 | } |
15 | 15 | ||
16 | /* | ||
17 | * If the arch doesn't supply something else, assume that hugepage | ||
18 | * size aligned regions are ok without further preparation. | ||
19 | */ | ||
20 | static inline int prepare_hugepage_range(struct file *file, | ||
21 | unsigned long addr, unsigned long len) | ||
22 | { | ||
23 | struct hstate *h = hstate_file(file); | ||
24 | if (len & ~huge_page_mask(h)) | ||
25 | return -EINVAL; | ||
26 | if (addr & ~huge_page_mask(h)) | ||
27 | return -EINVAL; | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | ||
32 | unsigned long addr, unsigned long end, | ||
33 | unsigned long floor, | ||
34 | unsigned long ceiling) | ||
35 | { | ||
36 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
37 | } | ||
38 | |||
39 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
40 | pte_t *ptep, pte_t pte) | ||
41 | { | ||
42 | set_pte_at(mm, addr, ptep, pte); | ||
43 | } | ||
44 | |||
45 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | ||
46 | unsigned long addr, pte_t *ptep) | ||
47 | { | ||
48 | return ptep_get_and_clear(mm, addr, ptep); | ||
49 | } | ||
50 | |||
51 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | ||
52 | unsigned long addr, pte_t *ptep) | ||
53 | { | ||
54 | ptep_clear_flush(vma, addr, ptep); | ||
55 | } | ||
56 | |||
57 | static inline int huge_pte_none(pte_t pte) | ||
58 | { | ||
59 | return pte_none(pte); | ||
60 | } | ||
61 | |||
62 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
63 | { | ||
64 | return pte_wrprotect(pte); | ||
65 | } | ||
66 | |||
67 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
68 | unsigned long addr, pte_t *ptep) | ||
69 | { | ||
70 | ptep_set_wrprotect(mm, addr, ptep); | ||
71 | } | ||
72 | |||
73 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | ||
74 | unsigned long addr, pte_t *ptep, | ||
75 | pte_t pte, int dirty) | ||
76 | { | ||
77 | return ptep_set_access_flags(vma, addr, ptep, pte, dirty); | ||
78 | } | ||
79 | |||
80 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
81 | { | ||
82 | return *ptep; | ||
83 | } | ||
84 | |||
85 | static inline void arch_clear_hugepage_flags(struct page *page) | 16 | static inline void arch_clear_hugepage_flags(struct page *page) |
86 | { | 17 | { |
87 | } | 18 | } |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c88c23c658c1..d1f25c831447 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -1248,7 +1248,6 @@ void __init e820__memblock_setup(void) | |||
1248 | { | 1248 | { |
1249 | int i; | 1249 | int i; |
1250 | u64 end; | 1250 | u64 end; |
1251 | u64 addr = 0; | ||
1252 | 1251 | ||
1253 | /* | 1252 | /* |
1254 | * The bootstrap memblock region count maximum is 128 entries | 1253 | * The bootstrap memblock region count maximum is 128 entries |
@@ -1265,21 +1264,13 @@ void __init e820__memblock_setup(void) | |||
1265 | struct e820_entry *entry = &e820_table->entries[i]; | 1264 | struct e820_entry *entry = &e820_table->entries[i]; |
1266 | 1265 | ||
1267 | end = entry->addr + entry->size; | 1266 | end = entry->addr + entry->size; |
1268 | if (addr < entry->addr) | ||
1269 | memblock_reserve(addr, entry->addr - addr); | ||
1270 | addr = end; | ||
1271 | if (end != (resource_size_t)end) | 1267 | if (end != (resource_size_t)end) |
1272 | continue; | 1268 | continue; |
1273 | 1269 | ||
1274 | /* | ||
1275 | * all !E820_TYPE_RAM ranges (including gap ranges) are put | ||
1276 | * into memblock.reserved to make sure that struct pages in | ||
1277 | * such regions are not left uninitialized after bootup. | ||
1278 | */ | ||
1279 | if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) | 1270 | if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) |
1280 | memblock_reserve(entry->addr, entry->size); | 1271 | continue; |
1281 | else | 1272 | |
1282 | memblock_add(entry->addr, entry->size); | 1273 | memblock_add(entry->addr, entry->size); |
1283 | } | 1274 | } |
1284 | 1275 | ||
1285 | /* Throw away partial pages: */ | 1276 | /* Throw away partial pages: */ |
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 82c756431b49..3310adecafb0 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild | |||
@@ -26,5 +26,6 @@ generic-y += rwsem.h | |||
26 | generic-y += sections.h | 26 | generic-y += sections.h |
27 | generic-y += topology.h | 27 | generic-y += topology.h |
28 | generic-y += trace_clock.h | 28 | generic-y += trace_clock.h |
29 | generic-y += vga.h | ||
29 | generic-y += word-at-a-time.h | 30 | generic-y += word-at-a-time.h |
30 | generic-y += xor.h | 31 | generic-y += xor.h |
diff --git a/arch/xtensa/include/asm/vga.h b/arch/xtensa/include/asm/vga.h deleted file mode 100644 index 1fd8cab3a297..000000000000 --- a/arch/xtensa/include/asm/vga.h +++ /dev/null | |||
@@ -1,19 +0,0 @@ | |||
1 | /* | ||
2 | * include/asm-xtensa/vga.h | ||
3 | * | ||
4 | * This file is subject to the terms and conditions of the GNU General Public | ||
5 | * License. See the file "COPYING" in the main directory of this archive | ||
6 | * for more details. | ||
7 | * | ||
8 | * Copyright (C) 2001 - 2005 Tensilica Inc. | ||
9 | */ | ||
10 | |||
11 | #ifndef _XTENSA_VGA_H | ||
12 | #define _XTENSA_VGA_H | ||
13 | |||
14 | #define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) | ||
15 | |||
16 | #define vga_readb(x) (*(x)) | ||
17 | #define vga_writeb(x,y) (*(y) = (x)) | ||
18 | |||
19 | #endif | ||
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..28f80d227528 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c | |||
@@ -153,7 +153,7 @@ struct iolatency_grp { | |||
153 | #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC | 153 | #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC |
154 | /* | 154 | /* |
155 | * These are the constants used to fake the fixed-point moving average | 155 | * These are the constants used to fake the fixed-point moving average |
156 | * calculation just like load average. The call to CALC_LOAD folds | 156 | * calculation just like load average. The call to calc_load() folds |
157 | * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling | 157 | * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling |
158 | * window size is bucketed to try to approximately calculate average | 158 | * window size is bucketed to try to approximately calculate average |
159 | * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows | 159 | * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows |
@@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, | |||
248 | return; | 248 | return; |
249 | 249 | ||
250 | /* | 250 | /* |
251 | * CALC_LOAD takes in a number stored in fixed point representation. | 251 | * calc_load() takes in a number stored in fixed point representation. |
252 | * Because we are using this for IO time in ns, the values stored | 252 | * Because we are using this for IO time in ns, the values stored |
253 | * are significantly larger than the FIXED_1 denominator (2048). | 253 | * are significantly larger than the FIXED_1 denominator (2048). |
254 | * Therefore, rounding errors in the calculation are negligible and | 254 | * Therefore, rounding errors in the calculation are negligible and |
@@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, | |||
257 | exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, | 257 | exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, |
258 | div64_u64(iolat->cur_win_nsec, | 258 | div64_u64(iolat->cur_win_nsec, |
259 | BLKIOLATENCY_EXP_BUCKET_SIZE)); | 259 | BLKIOLATENCY_EXP_BUCKET_SIZE)); |
260 | CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); | 260 | iolat->lat_avg = calc_load(iolat->lat_avg, |
261 | iolatency_exp_factors[exp_idx], | ||
262 | stat->rqs.mean); | ||
261 | } | 263 | } |
262 | 264 | ||
263 | static inline bool iolatency_may_queue(struct iolatency_grp *iolat, | 265 | static inline bool iolatency_may_queue(struct iolatency_grp *iolat, |
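The comments in the blk-iolatency hunk above describe calc_load() as a fixed-point exponential moving average: each window folds the new sample into the running average with weight (FIXED_1 - exp)/FIXED_1. The stand-alone program below shows that arithmetic together with the FSHIFT/FIXED_1/LOAD_INT/LOAD_FRAC macros visible elsewhere in this diff; the decay factor and the constant sample are example values, and the rounding the real helper applies is ignored.

/* Fixed-point moving average in the style the comment describes. */
#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)            /* 1.0 in fixed point */
#define EXP_1    1884                     /* ~1-minute decay at 5 s ticks */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

/* avg' = (avg * exp + sample * (FIXED_1 - exp)) / FIXED_1 */
static unsigned long calc_load_sketch(unsigned long avg, unsigned long exp,
                                      unsigned long sample)
{
        return (avg * exp + sample * (FIXED_1 - exp)) / FIXED_1;
}

int main(void)
{
        unsigned long avg = 0;
        int tick;

        /* feed a constant "2.5 runnable tasks" sample for two minutes */
        for (tick = 0; tick < 24; tick++)
                avg = calc_load_sketch(avg, EXP_1, 5 * FIXED_1 / 2);

        printf("load after 2 minutes: %lu.%02lu\n",
               LOAD_INT(avg), LOAD_FRAC(avg));
        return 0;
}

Callers such as the hunk above keep the average in fixed point and only split it into integer and fractional parts with LOAD_INT()/LOAD_FRAC() when printing, which is why those two macros could move to a shared header and be deleted from the local copies in this series.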
diff --git a/drivers/base/node.c b/drivers/base/node.c index 1ac4c36e13bb..86d6cd92ce3d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
67 | int nid = dev->id; | 67 | int nid = dev->id; |
68 | struct pglist_data *pgdat = NODE_DATA(nid); | 68 | struct pglist_data *pgdat = NODE_DATA(nid); |
69 | struct sysinfo i; | 69 | struct sysinfo i; |
70 | unsigned long sreclaimable, sunreclaimable; | ||
70 | 71 | ||
71 | si_meminfo_node(&i, nid); | 72 | si_meminfo_node(&i, nid); |
73 | sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE); | ||
74 | sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE); | ||
72 | n = sprintf(buf, | 75 | n = sprintf(buf, |
73 | "Node %d MemTotal: %8lu kB\n" | 76 | "Node %d MemTotal: %8lu kB\n" |
74 | "Node %d MemFree: %8lu kB\n" | 77 | "Node %d MemFree: %8lu kB\n" |
@@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
118 | "Node %d NFS_Unstable: %8lu kB\n" | 121 | "Node %d NFS_Unstable: %8lu kB\n" |
119 | "Node %d Bounce: %8lu kB\n" | 122 | "Node %d Bounce: %8lu kB\n" |
120 | "Node %d WritebackTmp: %8lu kB\n" | 123 | "Node %d WritebackTmp: %8lu kB\n" |
124 | "Node %d KReclaimable: %8lu kB\n" | ||
121 | "Node %d Slab: %8lu kB\n" | 125 | "Node %d Slab: %8lu kB\n" |
122 | "Node %d SReclaimable: %8lu kB\n" | 126 | "Node %d SReclaimable: %8lu kB\n" |
123 | "Node %d SUnreclaim: %8lu kB\n" | 127 | "Node %d SUnreclaim: %8lu kB\n" |
@@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
138 | nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), | 142 | nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), |
139 | nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), | 143 | nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), |
140 | nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), | 144 | nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), |
141 | nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + | 145 | nid, K(sreclaimable + |
142 | node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), | 146 | node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)), |
143 | nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), | 147 | nid, K(sreclaimable + sunreclaimable), |
148 | nid, K(sreclaimable), | ||
149 | nid, K(sunreclaimable) | ||
144 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 150 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
145 | nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), | 151 | , |
146 | nid, K(node_page_state(pgdat, NR_ANON_THPS) * | 152 | nid, K(node_page_state(pgdat, NR_ANON_THPS) * |
147 | HPAGE_PMD_NR), | 153 | HPAGE_PMD_NR), |
148 | nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * | 154 | nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * |
149 | HPAGE_PMD_NR), | 155 | HPAGE_PMD_NR), |
150 | nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * | 156 | nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * |
151 | HPAGE_PMD_NR)); | 157 | HPAGE_PMD_NR) |
152 | #else | ||
153 | nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); | ||
154 | #endif | 158 | #endif |
159 | ); | ||
155 | n += hugetlb_report_node_meminfo(nid, buf + n); | 160 | n += hugetlb_report_node_meminfo(nid, buf + n); |
156 | return n; | 161 | return n; |
157 | } | 162 | } |
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 575a68f31761..71979605246e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c | |||
@@ -130,10 +130,6 @@ struct menu_device { | |||
130 | int interval_ptr; | 130 | int interval_ptr; |
131 | }; | 131 | }; |
132 | 132 | ||
133 | |||
134 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
135 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
136 | |||
137 | static inline int get_loadavg(unsigned long load) | 133 | static inline int get_loadavg(unsigned long load) |
138 | { | 134 | { |
139 | return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; | 135 | return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; |
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e1c7996c018e..475b769e120c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c | |||
@@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler, | |||
77 | static void handle_remove(struct work_struct *work); | 77 | static void handle_remove(struct work_struct *work); |
78 | 78 | ||
79 | static const struct mmu_notifier_ops mn_opts = { | 79 | static const struct mmu_notifier_ops mn_opts = { |
80 | .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, | ||
81 | .invalidate_range_start = mmu_notifier_range_start, | 80 | .invalidate_range_start = mmu_notifier_range_start, |
82 | }; | 81 | }; |
83 | 82 | ||
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 58da65df03f5..fd552235bd13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c | |||
@@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) | |||
427 | } | 427 | } |
428 | 428 | ||
429 | static const struct mmu_notifier_ops iommu_mn = { | 429 | static const struct mmu_notifier_ops iommu_mn = { |
430 | .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, | ||
431 | .release = mn_release, | 430 | .release = mn_release, |
432 | .clear_flush_young = mn_clear_flush_young, | 431 | .clear_flush_young = mn_clear_flush_young, |
433 | .invalidate_range = mn_invalidate_range, | 432 | .invalidate_range = mn_invalidate_range, |
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 4a03e5090952..db301efe126d 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c | |||
@@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) | |||
273 | } | 273 | } |
274 | 274 | ||
275 | static const struct mmu_notifier_ops intel_mmuops = { | 275 | static const struct mmu_notifier_ops intel_mmuops = { |
276 | .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, | ||
277 | .release = intel_mm_release, | 276 | .release = intel_mm_release, |
278 | .change_pte = intel_change_pte, | 277 | .change_pte = intel_change_pte, |
279 | .invalidate_range = intel_invalidate_range, | 278 | .invalidate_range = intel_invalidate_range, |
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index be28f05bfafa..03b49d52092e 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c | |||
@@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) | |||
261 | 261 | ||
262 | 262 | ||
263 | static const struct mmu_notifier_ops gru_mmuops = { | 263 | static const struct mmu_notifier_ops gru_mmuops = { |
264 | .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, | ||
265 | .invalidate_range_start = gru_invalidate_range_start, | 264 | .invalidate_range_start = gru_invalidate_range_start, |
266 | .invalidate_range_end = gru_invalidate_range_end, | 265 | .invalidate_range_end = gru_invalidate_range_end, |
267 | .release = gru_release, | 266 | .release = gru_release, |
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 800ad252cf9c..76c83c1ffeda 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c | |||
@@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) | |||
1127 | { | 1127 | { |
1128 | const u64 phys_offset = MIN_MEMBLOCK_ADDR; | 1128 | const u64 phys_offset = MIN_MEMBLOCK_ADDR; |
1129 | 1129 | ||
1130 | if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { | ||
1131 | pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", | ||
1132 | base, base + size); | ||
1133 | return; | ||
1134 | } | ||
1135 | |||
1130 | if (!PAGE_ALIGNED(base)) { | 1136 | if (!PAGE_ALIGNED(base)) { |
1131 | if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { | ||
1132 | pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", | ||
1133 | base, base + size); | ||
1134 | return; | ||
1135 | } | ||
1136 | size -= PAGE_SIZE - (base & ~PAGE_MASK); | 1137 | size -= PAGE_SIZE - (base & ~PAGE_MASK); |
1137 | base = PAGE_ALIGN(base); | 1138 | base = PAGE_ALIGN(base); |
1138 | } | 1139 | } |
diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c index 9bc56eb48d2a..0d2a95957ee8 100644 --- a/drivers/staging/android/ion/ion_page_pool.c +++ b/drivers/staging/android/ion/ion_page_pool.c | |||
@@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page) | |||
33 | pool->low_count++; | 33 | pool->low_count++; |
34 | } | 34 | } |
35 | 35 | ||
36 | mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, | 36 | mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, |
37 | (1 << (PAGE_SHIFT + pool->order))); | 37 | 1 << pool->order); |
38 | mutex_unlock(&pool->mutex); | 38 | mutex_unlock(&pool->mutex); |
39 | } | 39 | } |
40 | 40 | ||
@@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high) | |||
53 | } | 53 | } |
54 | 54 | ||
55 | list_del(&page->lru); | 55 | list_del(&page->lru); |
56 | mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, | 56 | mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, |
57 | -(1 << (PAGE_SHIFT + pool->order))); | 57 | -(1 << pool->order)); |
58 | return page; | 58 | return page; |
59 | } | 59 | } |
60 | 60 | ||
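The ion page pool hunk above switches from the byte-based NR_INDIRECTLY_RECLAIMABLE_BYTES counter to the page-based NR_KERNEL_MISC_RECLAIMABLE one, so the extra PAGE_SHIFT in the delta disappears. A minimal sketch of the new accounting convention for an order-N allocation follows; the helper name is illustrative, not part of the driver.

    #include <linux/mm.h>
    #include <linux/vmstat.h>

    /* Account a high-order kernel allocation as reclaimable, in pages. */
    static void misc_reclaimable_account(struct page *page, unsigned int order,
                                         bool add)
    {
            long nr_pages = 1L << order;

            mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
                                add ? nr_pages : -nr_pages);
    }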
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f408994fc632..0c35e62f108d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c | |||
@@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
418 | int i; | 418 | int i; |
419 | vma->vm_flags |= VM_MIXEDMAP; | 419 | vma->vm_flags |= VM_MIXEDMAP; |
420 | for (i = 0; i < pages && !ret; i++) { | 420 | for (i = 0; i < pages && !ret; i++) { |
421 | vm_fault_t vmf; | ||
421 | unsigned long off = i * PAGE_SIZE; | 422 | unsigned long off = i * PAGE_SIZE; |
422 | pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); | 423 | pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); |
423 | ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); | 424 | vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); |
425 | if (vmf & VM_FAULT_ERROR) | ||
426 | ret = vm_fault_to_errno(vmf, 0); | ||
424 | } | 427 | } |
425 | } | 428 | } |
426 | 429 | ||
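The cramfs change above follows the tree-wide conversion from the errno-returning vm_insert_mixed() to vmf_insert_mixed(), which returns a vm_fault_t; callers that still need an errno translate failures back with vm_fault_to_errno(). A small sketch of the calling pattern, with an illustrative helper name:

    #include <linux/mm.h>
    #include <linux/pfn_t.h>

    /* Insert one pfn into a VM_MIXEDMAP vma and report an errno on failure. */
    static int insert_one_pfn(struct vm_area_struct *vma, unsigned long addr,
                              pfn_t pfn)
    {
            vm_fault_t vmf = vmf_insert_mixed(vma, addr, pfn);

            if (vmf & VM_FAULT_ERROR)
                    return vm_fault_to_errno(vmf, 0);
            return 0;
    }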
diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..c2e443fb76ae 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) | |||
257 | kmem_cache_free(dentry_cache, dentry); | 257 | kmem_cache_free(dentry_cache, dentry); |
258 | } | 258 | } |
259 | 259 | ||
260 | static void __d_free_external_name(struct rcu_head *head) | ||
261 | { | ||
262 | struct external_name *name = container_of(head, struct external_name, | ||
263 | u.head); | ||
264 | |||
265 | mod_node_page_state(page_pgdat(virt_to_page(name)), | ||
266 | NR_INDIRECTLY_RECLAIMABLE_BYTES, | ||
267 | -ksize(name)); | ||
268 | |||
269 | kfree(name); | ||
270 | } | ||
271 | |||
272 | static void __d_free_external(struct rcu_head *head) | 260 | static void __d_free_external(struct rcu_head *head) |
273 | { | 261 | { |
274 | struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); | 262 | struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); |
275 | 263 | kfree(external_name(dentry)); | |
276 | __d_free_external_name(&external_name(dentry)->u.head); | ||
277 | |||
278 | kmem_cache_free(dentry_cache, dentry); | 264 | kmem_cache_free(dentry_cache, dentry); |
279 | } | 265 | } |
280 | 266 | ||
@@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) | |||
306 | struct external_name *p; | 292 | struct external_name *p; |
307 | p = container_of(name->name, struct external_name, name[0]); | 293 | p = container_of(name->name, struct external_name, name[0]); |
308 | if (unlikely(atomic_dec_and_test(&p->u.count))) | 294 | if (unlikely(atomic_dec_and_test(&p->u.count))) |
309 | call_rcu(&p->u.head, __d_free_external_name); | 295 | kfree_rcu(p, u.head); |
310 | } | 296 | } |
311 | } | 297 | } |
312 | EXPORT_SYMBOL(release_dentry_name_snapshot); | 298 | EXPORT_SYMBOL(release_dentry_name_snapshot); |
@@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); | |||
1606 | 1592 | ||
1607 | struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) | 1593 | struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) |
1608 | { | 1594 | { |
1609 | struct external_name *ext = NULL; | ||
1610 | struct dentry *dentry; | 1595 | struct dentry *dentry; |
1611 | char *dname; | 1596 | char *dname; |
1612 | int err; | 1597 | int err; |
@@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) | |||
1627 | dname = dentry->d_iname; | 1612 | dname = dentry->d_iname; |
1628 | } else if (name->len > DNAME_INLINE_LEN-1) { | 1613 | } else if (name->len > DNAME_INLINE_LEN-1) { |
1629 | size_t size = offsetof(struct external_name, name[1]); | 1614 | size_t size = offsetof(struct external_name, name[1]); |
1630 | 1615 | struct external_name *p = kmalloc(size + name->len, | |
1631 | ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); | 1616 | GFP_KERNEL_ACCOUNT | |
1632 | if (!ext) { | 1617 | __GFP_RECLAIMABLE); |
1618 | if (!p) { | ||
1633 | kmem_cache_free(dentry_cache, dentry); | 1619 | kmem_cache_free(dentry_cache, dentry); |
1634 | return NULL; | 1620 | return NULL; |
1635 | } | 1621 | } |
1636 | atomic_set(&ext->u.count, 1); | 1622 | atomic_set(&p->u.count, 1); |
1637 | dname = ext->name; | 1623 | dname = p->name; |
1638 | } else { | 1624 | } else { |
1639 | dname = dentry->d_iname; | 1625 | dname = dentry->d_iname; |
1640 | } | 1626 | } |
@@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) | |||
1673 | } | 1659 | } |
1674 | } | 1660 | } |
1675 | 1661 | ||
1676 | if (unlikely(ext)) { | ||
1677 | pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); | ||
1678 | mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, | ||
1679 | ksize(ext)); | ||
1680 | } | ||
1681 | |||
1682 | this_cpu_inc(nr_dentry); | 1662 | this_cpu_inc(nr_dentry); |
1683 | 1663 | ||
1684 | return dentry; | 1664 | return dentry; |
@@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) | |||
2707 | dentry->d_name.hash_len = target->d_name.hash_len; | 2687 | dentry->d_name.hash_len = target->d_name.hash_len; |
2708 | } | 2688 | } |
2709 | if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) | 2689 | if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) |
2710 | call_rcu(&old_name->u.head, __d_free_external_name); | 2690 | kfree_rcu(old_name, u.head); |
2711 | } | 2691 | } |
2712 | 2692 | ||
2713 | /* | 2693 | /* |
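With the indirectly-reclaimable byte accounting removed, dcache no longer needs a dedicated RCU callback for external names, and both release paths reduce to kfree_rcu() on the embedded rcu_head. A generic sketch of that idiom, using an illustrative structure rather than the dentry code itself:

    #include <linux/atomic.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct name_blob {
            union {
                    atomic_t count;
                    struct rcu_head head;
            } u;
            unsigned char name[];
    };

    /* Drop a reference; free after a grace period without a named callback. */
    static void put_name_blob(struct name_blob *p)
    {
            if (atomic_dec_and_test(&p->u.count))
                    kfree_rcu(p, u.head);
    }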
diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..90c2febc93ac 100644 --- a/fs/iomap.c +++ b/fs/iomap.c | |||
@@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, | |||
1057 | return length; | 1057 | return length; |
1058 | } | 1058 | } |
1059 | 1059 | ||
1060 | int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) | 1060 | vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) |
1061 | { | 1061 | { |
1062 | struct page *page = vmf->page; | 1062 | struct page *page = vmf->page; |
1063 | struct inode *inode = file_inode(vmf->vma->vm_file); | 1063 | struct inode *inode = file_inode(vmf->vma->vm_file); |
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..fdf527b6d79c 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c | |||
@@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) | |||
236 | sb->s_export_op = &kernfs_export_ops; | 236 | sb->s_export_op = &kernfs_export_ops; |
237 | sb->s_time_gran = 1; | 237 | sb->s_time_gran = 1; |
238 | 238 | ||
239 | /* sysfs dentries and inodes don't require IO to create */ | ||
240 | sb->s_shrink.seeks = 0; | ||
241 | |||
239 | /* get root inode, initialize and unlock it */ | 242 | /* get root inode, initialize and unlock it */ |
240 | mutex_lock(&kernfs_mutex); | 243 | mutex_lock(&kernfs_mutex); |
241 | inode = kernfs_get_inode(sb, info->root->kn); | 244 | inode = kernfs_get_inode(sb, info->root->kn); |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a342f008e42f..d1cbb27808e2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, | |||
5106 | * rightmost extent list. | 5106 | * rightmost extent list. |
5107 | */ | 5107 | */ |
5108 | if (path->p_tree_depth) { | 5108 | if (path->p_tree_depth) { |
5109 | struct ocfs2_extent_block *eb; | ||
5110 | |||
5111 | ret = ocfs2_read_extent_block(et->et_ci, | 5109 | ret = ocfs2_read_extent_block(et->et_ci, |
5112 | ocfs2_et_get_last_eb_blk(et), | 5110 | ocfs2_et_get_last_eb_blk(et), |
5113 | &last_eb_bh); | 5111 | &last_eb_bh); |
@@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, | |||
5115 | mlog_errno(ret); | 5113 | mlog_errno(ret); |
5116 | goto out; | 5114 | goto out; |
5117 | } | 5115 | } |
5118 | |||
5119 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
5120 | } | 5116 | } |
5121 | 5117 | ||
5122 | if (rec->e_cpos == split_rec->e_cpos && | 5118 | if (rec->e_cpos == split_rec->e_cpos && |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 302cd7caa4a7..da578ad4c08f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -1392,8 +1392,7 @@ retry: | |||
1392 | unlock: | 1392 | unlock: |
1393 | spin_unlock(&oi->ip_lock); | 1393 | spin_unlock(&oi->ip_lock); |
1394 | out: | 1394 | out: |
1395 | if (new) | 1395 | kfree(new); |
1396 | kfree(new); | ||
1397 | return ret; | 1396 | return ret; |
1398 | } | 1397 | } |
1399 | 1398 | ||
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 9b984cae4c4e..1d6dc8422899 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) | |||
329 | { | 329 | { |
330 | char *buf; | 330 | char *buf; |
331 | 331 | ||
332 | buf = (char *) get_zeroed_page(GFP_NOFS); | 332 | buf = (char *) get_zeroed_page(GFP_ATOMIC); |
333 | if (buf) { | 333 | if (buf) { |
334 | dump_mle(mle, buf, PAGE_SIZE - 1); | 334 | dump_mle(mle, buf, PAGE_SIZE - 1); |
335 | free_page((unsigned long)buf); | 335 | free_page((unsigned long)buf); |
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 838a06d4066a..074d5de17bb2 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
531 | assert_spin_locked(&res->spinlock); | 531 | assert_spin_locked(&res->spinlock); |
532 | 532 | ||
533 | /* don't shuffle secondary queues */ | 533 | /* don't shuffle secondary queues */ |
534 | if ((res->owner == dlm->node_num)) { | 534 | if (res->owner == dlm->node_num) { |
535 | if (res->state & (DLM_LOCK_RES_MIGRATING | | 535 | if (res->state & (DLM_LOCK_RES_MIGRATING | |
536 | DLM_LOCK_RES_BLOCK_DIRTY)) | 536 | DLM_LOCK_RES_BLOCK_DIRTY)) |
537 | return; | 537 | return; |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..1114ef02e780 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, | |||
4135 | struct buffer_head *ref_root_bh = NULL; | 4135 | struct buffer_head *ref_root_bh = NULL; |
4136 | struct ocfs2_cached_dealloc_ctxt dealloc; | 4136 | struct ocfs2_cached_dealloc_ctxt dealloc; |
4137 | struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); | 4137 | struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); |
4138 | struct ocfs2_refcount_block *rb; | ||
4139 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; | 4138 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; |
4140 | struct ocfs2_refcount_tree *ref_tree; | 4139 | struct ocfs2_refcount_tree *ref_tree; |
4141 | 4140 | ||
@@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, | |||
4162 | mlog_errno(ret); | 4161 | mlog_errno(ret); |
4163 | goto out; | 4162 | goto out; |
4164 | } | 4163 | } |
4165 | rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; | ||
4166 | 4164 | ||
4167 | ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, | 4165 | ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, |
4168 | &ref_tree->rf_ci, ref_root_bh, | 4166 | &ref_tree->rf_ci, ref_root_bh, |
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc5306a31a1d..5792f9e39466 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c | |||
@@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) | |||
516 | */ | 516 | */ |
517 | s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; | 517 | s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; |
518 | 518 | ||
519 | /* procfs dentries and inodes don't require IO to create */ | ||
520 | s->s_shrink.seeks = 0; | ||
521 | |||
519 | pde_get(&proc_root); | 522 | pde_get(&proc_root); |
520 | root_inode = proc_get_inode(s, &proc_root); | 523 | root_inode = proc_get_inode(s, &proc_root); |
521 | if (!root_inode) { | 524 | if (!root_inode) { |
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c | |||
@@ -10,9 +10,6 @@ | |||
10 | #include <linux/seqlock.h> | 10 | #include <linux/seqlock.h> |
11 | #include <linux/time.h> | 11 | #include <linux/time.h> |
12 | 12 | ||
13 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
14 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
15 | |||
16 | static int loadavg_proc_show(struct seq_file *m, void *v) | 13 | static int loadavg_proc_show(struct seq_file *m, void *v) |
17 | { | 14 | { |
18 | unsigned long avnrun[3]; | 15 | unsigned long avnrun[3]; |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb..568d90e17c17 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
38 | long cached; | 38 | long cached; |
39 | long available; | 39 | long available; |
40 | unsigned long pages[NR_LRU_LISTS]; | 40 | unsigned long pages[NR_LRU_LISTS]; |
41 | unsigned long sreclaimable, sunreclaim; | ||
41 | int lru; | 42 | int lru; |
42 | 43 | ||
43 | si_meminfo(&i); | 44 | si_meminfo(&i); |
@@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
53 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); | 54 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); |
54 | 55 | ||
55 | available = si_mem_available(); | 56 | available = si_mem_available(); |
57 | sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); | ||
58 | sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); | ||
56 | 59 | ||
57 | show_val_kb(m, "MemTotal: ", i.totalram); | 60 | show_val_kb(m, "MemTotal: ", i.totalram); |
58 | show_val_kb(m, "MemFree: ", i.freeram); | 61 | show_val_kb(m, "MemFree: ", i.freeram); |
@@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
94 | show_val_kb(m, "Mapped: ", | 97 | show_val_kb(m, "Mapped: ", |
95 | global_node_page_state(NR_FILE_MAPPED)); | 98 | global_node_page_state(NR_FILE_MAPPED)); |
96 | show_val_kb(m, "Shmem: ", i.sharedram); | 99 | show_val_kb(m, "Shmem: ", i.sharedram); |
97 | show_val_kb(m, "Slab: ", | 100 | show_val_kb(m, "KReclaimable: ", sreclaimable + |
98 | global_node_page_state(NR_SLAB_RECLAIMABLE) + | 101 | global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); |
99 | global_node_page_state(NR_SLAB_UNRECLAIMABLE)); | 102 | show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); |
100 | 103 | show_val_kb(m, "SReclaimable: ", sreclaimable); | |
101 | show_val_kb(m, "SReclaimable: ", | 104 | show_val_kb(m, "SUnreclaim: ", sunreclaim); |
102 | global_node_page_state(NR_SLAB_RECLAIMABLE)); | ||
103 | show_val_kb(m, "SUnreclaim: ", | ||
104 | global_node_page_state(NR_SLAB_UNRECLAIMABLE)); | ||
105 | seq_printf(m, "KernelStack: %8lu kB\n", | 105 | seq_printf(m, "KernelStack: %8lu kB\n", |
106 | global_zone_page_state(NR_KERNEL_STACK_KB)); | 106 | global_zone_page_state(NR_KERNEL_STACK_KB)); |
107 | show_val_kb(m, "PageTables: ", | 107 | show_val_kb(m, "PageTables: ", |
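The new KReclaimable line reports reclaimable slab plus the renamed NR_KERNEL_MISC_RECLAIMABLE counter, while Slab keeps summing both slab counters. A hypothetical userspace helper that reads the new field (returning -1 on kernels that do not export it) might look like this:

    #include <stdio.h>

    /* Return the KReclaimable value from /proc/meminfo in kB, or -1. */
    static long read_kreclaimable_kb(void)
    {
            char line[256];
            long kb = -1;
            FILE *f = fopen("/proc/meminfo", "r");

            if (!f)
                    return -1;
            while (fgets(line, sizeof(line), f)) {
                    if (sscanf(line, "KReclaimable: %ld kB", &kb) == 1)
                            break;
            }
            fclose(f);
            return kb;
    }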
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5ea1d64cb0b4..a027473561c6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, | |||
713 | smaps_walk.private = mss; | 713 | smaps_walk.private = mss; |
714 | 714 | ||
715 | #ifdef CONFIG_SHMEM | 715 | #ifdef CONFIG_SHMEM |
716 | /* In case of smaps_rollup, reset the value from previous vma */ | ||
717 | mss->check_shmem_swap = false; | ||
716 | if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { | 718 | if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { |
717 | /* | 719 | /* |
718 | * For shared or readonly shmem mappings we know that all | 720 | * For shared or readonly shmem mappings we know that all |
@@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, | |||
728 | 730 | ||
729 | if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || | 731 | if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || |
730 | !(vma->vm_flags & VM_WRITE)) { | 732 | !(vma->vm_flags & VM_WRITE)) { |
731 | mss->swap = shmem_swapped; | 733 | mss->swap += shmem_swapped; |
732 | } else { | 734 | } else { |
733 | mss->check_shmem_swap = true; | 735 | mss->check_shmem_swap = true; |
734 | smaps_walk.pte_hole = smaps_pte_hole; | 736 | smaps_walk.pte_hole = smaps_pte_hole; |
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bfa0ec69f924..356d2b8568c1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c | |||
@@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, | |||
1026 | struct userfaultfd_ctx *fork_nctx = NULL; | 1026 | struct userfaultfd_ctx *fork_nctx = NULL; |
1027 | 1027 | ||
1028 | /* always take the fd_wqh lock before the fault_pending_wqh lock */ | 1028 | /* always take the fd_wqh lock before the fault_pending_wqh lock */ |
1029 | spin_lock(&ctx->fd_wqh.lock); | 1029 | spin_lock_irq(&ctx->fd_wqh.lock); |
1030 | __add_wait_queue(&ctx->fd_wqh, &wait); | 1030 | __add_wait_queue(&ctx->fd_wqh, &wait); |
1031 | for (;;) { | 1031 | for (;;) { |
1032 | set_current_state(TASK_INTERRUPTIBLE); | 1032 | set_current_state(TASK_INTERRUPTIBLE); |
@@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, | |||
1112 | ret = -EAGAIN; | 1112 | ret = -EAGAIN; |
1113 | break; | 1113 | break; |
1114 | } | 1114 | } |
1115 | spin_unlock(&ctx->fd_wqh.lock); | 1115 | spin_unlock_irq(&ctx->fd_wqh.lock); |
1116 | schedule(); | 1116 | schedule(); |
1117 | spin_lock(&ctx->fd_wqh.lock); | 1117 | spin_lock_irq(&ctx->fd_wqh.lock); |
1118 | } | 1118 | } |
1119 | __remove_wait_queue(&ctx->fd_wqh, &wait); | 1119 | __remove_wait_queue(&ctx->fd_wqh, &wait); |
1120 | __set_current_state(TASK_RUNNING); | 1120 | __set_current_state(TASK_RUNNING); |
1121 | spin_unlock(&ctx->fd_wqh.lock); | 1121 | spin_unlock_irq(&ctx->fd_wqh.lock); |
1122 | 1122 | ||
1123 | if (!ret && msg->event == UFFD_EVENT_FORK) { | 1123 | if (!ret && msg->event == UFFD_EVENT_FORK) { |
1124 | ret = resolve_userfault_fork(ctx, fork_nctx, msg); | 1124 | ret = resolve_userfault_fork(ctx, fork_nctx, msg); |
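The userfaultfd read path now takes fd_wqh.lock with interrupts disabled and keeps that flavor when re-acquiring the lock after schedule(). A generic sketch of the sleep-on-waitqueue pattern with an irq-disabling lock (this mirrors the shape of the loop above, not the userfaultfd code itself):

    #include <linux/sched.h>
    #include <linux/sched/signal.h>
    #include <linux/wait.h>

    /* Sleep until done() reports completion or a signal arrives. */
    static void wait_for_event(wait_queue_head_t *wqh, bool (*done)(void))
    {
            DECLARE_WAITQUEUE(wait, current);

            spin_lock_irq(&wqh->lock);
            __add_wait_queue(wqh, &wait);
            for (;;) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (done() || signal_pending(current))
                            break;
                    spin_unlock_irq(&wqh->lock);
                    schedule();
                    spin_lock_irq(&wqh->lock);
            }
            __remove_wait_queue(wqh, &wait);
            __set_current_state(TASK_RUNNING);
            spin_unlock_irq(&wqh->lock);
    }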
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 9d0cde8ab716..71d7b77eea50 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h | |||
@@ -32,7 +32,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) | |||
32 | return pte_modify(pte, newprot); | 32 | return pte_modify(pte, newprot); |
33 | } | 33 | } |
34 | 34 | ||
35 | #ifndef huge_pte_clear | 35 | #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR |
36 | static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, | 36 | static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, |
37 | pte_t *ptep, unsigned long sz) | 37 | pte_t *ptep, unsigned long sz) |
38 | { | 38 | { |
@@ -40,4 +40,90 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, | |||
40 | } | 40 | } |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE | ||
44 | static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, | ||
45 | unsigned long addr, unsigned long end, | ||
46 | unsigned long floor, unsigned long ceiling) | ||
47 | { | ||
48 | free_pgd_range(tlb, addr, end, floor, ceiling); | ||
49 | } | ||
50 | #endif | ||
51 | |||
52 | #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT | ||
53 | static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
54 | pte_t *ptep, pte_t pte) | ||
55 | { | ||
56 | set_pte_at(mm, addr, ptep, pte); | ||
57 | } | ||
58 | #endif | ||
59 | |||
60 | #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR | ||
61 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | ||
62 | unsigned long addr, pte_t *ptep) | ||
63 | { | ||
64 | return ptep_get_and_clear(mm, addr, ptep); | ||
65 | } | ||
66 | #endif | ||
67 | |||
68 | #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH | ||
69 | static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, | ||
70 | unsigned long addr, pte_t *ptep) | ||
71 | { | ||
72 | ptep_clear_flush(vma, addr, ptep); | ||
73 | } | ||
74 | #endif | ||
75 | |||
76 | #ifndef __HAVE_ARCH_HUGE_PTE_NONE | ||
77 | static inline int huge_pte_none(pte_t pte) | ||
78 | { | ||
79 | return pte_none(pte); | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT | ||
84 | static inline pte_t huge_pte_wrprotect(pte_t pte) | ||
85 | { | ||
86 | return pte_wrprotect(pte); | ||
87 | } | ||
88 | #endif | ||
89 | |||
90 | #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE | ||
91 | static inline int prepare_hugepage_range(struct file *file, | ||
92 | unsigned long addr, unsigned long len) | ||
93 | { | ||
94 | struct hstate *h = hstate_file(file); | ||
95 | |||
96 | if (len & ~huge_page_mask(h)) | ||
97 | return -EINVAL; | ||
98 | if (addr & ~huge_page_mask(h)) | ||
99 | return -EINVAL; | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | #endif | ||
104 | |||
105 | #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT | ||
106 | static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, | ||
107 | unsigned long addr, pte_t *ptep) | ||
108 | { | ||
109 | ptep_set_wrprotect(mm, addr, ptep); | ||
110 | } | ||
111 | #endif | ||
112 | |||
113 | #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS | ||
114 | static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, | ||
115 | unsigned long addr, pte_t *ptep, | ||
116 | pte_t pte, int dirty) | ||
117 | { | ||
118 | return ptep_set_access_flags(vma, addr, ptep, pte, dirty); | ||
119 | } | ||
120 | #endif | ||
121 | |||
122 | #ifndef __HAVE_ARCH_HUGE_PTEP_GET | ||
123 | static inline pte_t huge_ptep_get(pte_t *ptep) | ||
124 | { | ||
125 | return *ptep; | ||
126 | } | ||
127 | #endif | ||
128 | |||
43 | #endif /* _ASM_GENERIC_HUGETLB_H */ | 129 | #endif /* _ASM_GENERIC_HUGETLB_H */ |
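Each default above compiles only when the architecture leaves the matching __HAVE_ARCH_HUGE_* symbol undefined, so an arch opts out of a single default by defining that symbol and supplying its own inline before pulling in the generic header. A minimal sketch of such an override (the body is a placeholder, not any real architecture's implementation):

    /* In an architecture's asm/hugetlb.h: */
    #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
    static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *ptep)
    {
            /* arch-specific TLB flush would go here */
            ptep_get_and_clear(vma->vm_mm, addr, ptep);
    }

    #include <asm-generic/hugetlb.h>        /* defaults for everything else */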
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 88ebc6102c7c..5657a20e0c59 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) | |||
757 | /* | 757 | /* |
758 | * Interfaces that can be used by architecture code to keep track of | 758 | * Interfaces that can be used by architecture code to keep track of |
759 | * memory type of pfn mappings specified by the remap_pfn_range, | 759 | * memory type of pfn mappings specified by the remap_pfn_range, |
760 | * vm_insert_pfn. | 760 | * vmf_insert_pfn. |
761 | */ | 761 | */ |
762 | 762 | ||
763 | /* | 763 | /* |
@@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, | |||
773 | 773 | ||
774 | /* | 774 | /* |
775 | * track_pfn_insert is called when a _new_ single pfn is established | 775 | * track_pfn_insert is called when a _new_ single pfn is established |
776 | * by vm_insert_pfn(). | 776 | * by vmf_insert_pfn(). |
777 | */ | 777 | */ |
778 | static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | 778 | static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
779 | pfn_t pfn) | 779 | pfn_t pfn) |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22254c1fe1c5..5e1694fe035b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/u64_stats_sync.h> | 20 | #include <linux/u64_stats_sync.h> |
21 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
22 | #include <linux/bpf-cgroup.h> | 22 | #include <linux/bpf-cgroup.h> |
23 | #include <linux/psi_types.h> | ||
23 | 24 | ||
24 | #ifdef CONFIG_CGROUPS | 25 | #ifdef CONFIG_CGROUPS |
25 | 26 | ||
@@ -436,6 +437,9 @@ struct cgroup { | |||
436 | /* used to schedule release agent */ | 437 | /* used to schedule release agent */ |
437 | struct work_struct release_agent_work; | 438 | struct work_struct release_agent_work; |
438 | 439 | ||
440 | /* used to track pressure stalls */ | ||
441 | struct psi_group psi; | ||
442 | |||
439 | /* used to store eBPF programs */ | 443 | /* used to store eBPF programs */ |
440 | struct cgroup_bpf bpf; | 444 | struct cgroup_bpf bpf; |
441 | 445 | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b622d6608605..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) | |||
650 | pr_cont_kernfs_path(cgrp->kn); | 650 | pr_cont_kernfs_path(cgrp->kn); |
651 | } | 651 | } |
652 | 652 | ||
653 | static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) | ||
654 | { | ||
655 | return &cgrp->psi; | ||
656 | } | ||
657 | |||
653 | static inline void cgroup_init_kthreadd(void) | 658 | static inline void cgroup_init_kthreadd(void) |
654 | { | 659 | { |
655 | /* | 660 | /* |
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) | |||
703 | return NULL; | 708 | return NULL; |
704 | } | 709 | } |
705 | 710 | ||
711 | static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) | ||
712 | { | ||
713 | return NULL; | ||
714 | } | ||
715 | |||
716 | static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) | ||
717 | { | ||
718 | return NULL; | ||
719 | } | ||
720 | |||
706 | static inline bool task_under_cgroup_hierarchy(struct task_struct *task, | 721 | static inline bool task_under_cgroup_hierarchy(struct task_struct *task, |
707 | struct cgroup *ancestor) | 722 | struct cgroup *ancestor) |
708 | { | 723 | { |
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 31c865d1842e..577d1b25fccd 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h | |||
@@ -57,7 +57,12 @@ struct task_delay_info { | |||
57 | 57 | ||
58 | u64 freepages_start; | 58 | u64 freepages_start; |
59 | u64 freepages_delay; /* wait for memory reclaim */ | 59 | u64 freepages_delay; /* wait for memory reclaim */ |
60 | |||
61 | u64 thrashing_start; | ||
62 | u64 thrashing_delay; /* wait for thrashing page */ | ||
63 | |||
60 | u32 freepages_count; /* total count of memory reclaim */ | 64 | u32 freepages_count; /* total count of memory reclaim */ |
65 | u32 thrashing_count; /* total count of thrash waits */ | ||
61 | }; | 66 | }; |
62 | #endif | 67 | #endif |
63 | 68 | ||
@@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); | |||
76 | extern __u64 __delayacct_blkio_ticks(struct task_struct *); | 81 | extern __u64 __delayacct_blkio_ticks(struct task_struct *); |
77 | extern void __delayacct_freepages_start(void); | 82 | extern void __delayacct_freepages_start(void); |
78 | extern void __delayacct_freepages_end(void); | 83 | extern void __delayacct_freepages_end(void); |
84 | extern void __delayacct_thrashing_start(void); | ||
85 | extern void __delayacct_thrashing_end(void); | ||
79 | 86 | ||
80 | static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) | 87 | static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) |
81 | { | 88 | { |
@@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) | |||
156 | __delayacct_freepages_end(); | 163 | __delayacct_freepages_end(); |
157 | } | 164 | } |
158 | 165 | ||
166 | static inline void delayacct_thrashing_start(void) | ||
167 | { | ||
168 | if (current->delays) | ||
169 | __delayacct_thrashing_start(); | ||
170 | } | ||
171 | |||
172 | static inline void delayacct_thrashing_end(void) | ||
173 | { | ||
174 | if (current->delays) | ||
175 | __delayacct_thrashing_end(); | ||
176 | } | ||
177 | |||
159 | #else | 178 | #else |
160 | static inline void delayacct_set_flag(int flag) | 179 | static inline void delayacct_set_flag(int flag) |
161 | {} | 180 | {} |
@@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) | |||
182 | {} | 201 | {} |
183 | static inline void delayacct_freepages_end(void) | 202 | static inline void delayacct_freepages_end(void) |
184 | {} | 203 | {} |
204 | static inline void delayacct_thrashing_start(void) | ||
205 | {} | ||
206 | static inline void delayacct_thrashing_end(void) | ||
207 | {} | ||
185 | 208 | ||
186 | #endif /* CONFIG_TASK_DELAY_ACCT */ | 209 | #endif /* CONFIG_TASK_DELAY_ACCT */ |
187 | 210 | ||
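The new thrashing fields follow the same convention as the freepages pair: the stall is bracketed by delayacct_thrashing_start() and delayacct_thrashing_end(), and the elapsed time feeds thrashing_delay/thrashing_count. A hedged sketch of the calling pattern; the actual call site in this series is the page-wait path, and the wait below is only a placeholder:

    #include <linux/delayacct.h>
    #include <linux/pagemap.h>

    /* Account the time a task spends blocked on a thrashing page. */
    static void wait_for_thrashing_page(struct page *page)
    {
            delayacct_thrashing_start();
            wait_on_page_locked(page);      /* placeholder for the real wait */
            delayacct_thrashing_end();
    }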
diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4c92e3ba3e16..dde947083d4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h | |||
@@ -107,7 +107,7 @@ enum hmm_pfn_flag_e { | |||
107 | * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory | 107 | * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory |
108 | * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() | 108 | * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() |
109 | * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the | 109 | * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the |
110 | * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not | 110 | * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not |
111 | * be mirrored by a device, because the entry will never have HMM_PFN_VALID | 111 | * be mirrored by a device, because the entry will never have HMM_PFN_VALID |
112 | * set and the pfn value is undefined. | 112 | * set and the pfn value is undefined. |
113 | * | 113 | * |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fdcb45999b26..4663ee96cf59 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page) | |||
213 | } | 213 | } |
214 | 214 | ||
215 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, | 215 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, |
216 | pmd_t *pmd, int flags); | 216 | pmd_t *pmd, int flags, struct dev_pagemap **pgmap); |
217 | struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, | 217 | struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, |
218 | pud_t *pud, int flags); | 218 | pud_t *pud, int flags, struct dev_pagemap **pgmap); |
219 | 219 | ||
220 | extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); | 220 | extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); |
221 | 221 | ||
@@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm) | |||
344 | } | 344 | } |
345 | 345 | ||
346 | static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, | 346 | static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, |
347 | unsigned long addr, pmd_t *pmd, int flags) | 347 | unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) |
348 | { | 348 | { |
349 | return NULL; | 349 | return NULL; |
350 | } | 350 | } |
351 | 351 | ||
352 | static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, | 352 | static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, |
353 | unsigned long addr, pud_t *pud, int flags) | 353 | unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) |
354 | { | 354 | { |
355 | return NULL; | 355 | return NULL; |
356 | } | 356 | } |
diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3555d54bf79a..9a4258154b25 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/bitmap.h> | 6 | #include <linux/bitmap.h> |
7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
8 | #include <linux/types.h> | 8 | #include <linux/types.h> |
9 | #include <linux/mm_types.h> | ||
9 | 10 | ||
10 | struct address_space; | 11 | struct address_space; |
11 | struct fiemap_extent_info; | 12 | struct fiemap_extent_info; |
@@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, | |||
141 | bool *did_zero, const struct iomap_ops *ops); | 142 | bool *did_zero, const struct iomap_ops *ops); |
142 | int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, | 143 | int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, |
143 | const struct iomap_ops *ops); | 144 | const struct iomap_ops *ops); |
144 | int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); | 145 | vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, |
146 | const struct iomap_ops *ops); | ||
145 | int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 147 | int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
146 | loff_t start, loff_t len, const struct iomap_ops *ops); | 148 | loff_t start, loff_t len, const struct iomap_ops *ops); |
147 | loff_t iomap_seek_hole(struct inode *inode, loff_t offset, | 149 | loff_t iomap_seek_hole(struct inode *inode, loff_t offset, |
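iomap_page_mkwrite() now returns a vm_fault_t, so a filesystem's ->page_mkwrite handler can pass the result straight through. A minimal sketch with hypothetical names (real filesystems typically take their own locks around the call):

    #include <linux/iomap.h>
    #include <linux/mm.h>

    extern const struct iomap_ops myfs_iomap_ops;   /* hypothetical ops table */

    static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
    {
            return iomap_page_mkwrite(vmf, &myfs_iomap_ops);
    }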
diff --git a/include/linux/linkage.h b/include/linux/linkage.h index d7618c41f74c..7c47b1a471d4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h | |||
@@ -90,6 +90,7 @@ | |||
90 | #ifndef WEAK | 90 | #ifndef WEAK |
91 | #define WEAK(name) \ | 91 | #define WEAK(name) \ |
92 | .weak name ASM_NL \ | 92 | .weak name ASM_NL \ |
93 | ALIGN ASM_NL \ | ||
93 | name: | 94 | name: |
94 | #endif | 95 | #endif |
95 | 96 | ||
diff --git a/include/linux/math64.h b/include/linux/math64.h index 837f2f2d1d34..bb2c84afb80c 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h | |||
@@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) | |||
281 | } | 281 | } |
282 | #endif /* mul_u64_u32_div */ | 282 | #endif /* mul_u64_u32_div */ |
283 | 283 | ||
284 | #define DIV64_U64_ROUND_UP(ll, d) \ | ||
285 | ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) | ||
286 | |||
284 | #endif /* _LINUX_MATH64_H */ | 287 | #endif /* _LINUX_MATH64_H */ |
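DIV64_U64_ROUND_UP() rounds a 64-bit division up while evaluating the divisor only once. A short usage sketch:

    #include <linux/math64.h>

    /* Number of 4096-byte blocks needed to hold "bytes" bytes. */
    static u64 blocks_needed(u64 bytes)
    {
            return DIV64_U64_ROUND_UP(bytes, 4096ULL);
    }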
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 516920549378..2acdd046df2d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | |||
265 | for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ | 265 | for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ |
266 | nid, flags, p_start, p_end, p_nid) | 266 | nid, flags, p_start, p_end, p_nid) |
267 | 267 | ||
268 | /** | ||
269 | * for_each_resv_unavail_range - iterate through reserved and unavailable memory | ||
270 | * @i: u64 used as loop variable | ||
271 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | ||
272 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | ||
273 | * | ||
274 | * Walks over unavailable but reserved (reserved && !memory) areas of memblock. | ||
275 | * Available as soon as memblock is initialized. | ||
276 | * Note: because this memory does not belong to any physical node, flags and | ||
277 | * nid arguments do not make sense and thus not exported as arguments. | ||
278 | */ | ||
279 | #define for_each_resv_unavail_range(i, p_start, p_end) \ | ||
280 | for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ | ||
281 | NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) | ||
282 | |||
283 | static inline void memblock_set_region_flags(struct memblock_region *r, | 268 | static inline void memblock_set_region_flags(struct memblock_region *r, |
284 | enum memblock_flags flags) | 269 | enum memblock_flags flags) |
285 | { | 270 | { |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 652f602167df..7ab2120155a4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie { | |||
78 | 78 | ||
79 | struct mem_cgroup_id { | 79 | struct mem_cgroup_id { |
80 | int id; | 80 | int id; |
81 | atomic_t ref; | 81 | refcount_t ref; |
82 | }; | 82 | }; |
83 | 83 | ||
84 | /* | 84 | /* |
@@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); | |||
1268 | void memcg_kmem_put_cache(struct kmem_cache *cachep); | 1268 | void memcg_kmem_put_cache(struct kmem_cache *cachep); |
1269 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | 1269 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, |
1270 | struct mem_cgroup *memcg); | 1270 | struct mem_cgroup *memcg); |
1271 | |||
1272 | #ifdef CONFIG_MEMCG_KMEM | ||
1271 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); | 1273 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); |
1272 | void memcg_kmem_uncharge(struct page *page, int order); | 1274 | void memcg_kmem_uncharge(struct page *page, int order); |
1273 | 1275 | ||
1274 | #ifdef CONFIG_MEMCG_KMEM | ||
1275 | extern struct static_key_false memcg_kmem_enabled_key; | 1276 | extern struct static_key_false memcg_kmem_enabled_key; |
1276 | extern struct workqueue_struct *memcg_kmem_cache_wq; | 1277 | extern struct workqueue_struct *memcg_kmem_cache_wq; |
1277 | 1278 | ||
@@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id); | |||
1307 | extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, | 1308 | extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, |
1308 | int nid, int shrinker_id); | 1309 | int nid, int shrinker_id); |
1309 | #else | 1310 | #else |
1311 | |||
1312 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | ||
1313 | { | ||
1314 | return 0; | ||
1315 | } | ||
1316 | |||
1317 | static inline void memcg_kmem_uncharge(struct page *page, int order) | ||
1318 | { | ||
1319 | } | ||
1320 | |||
1310 | #define for_each_memcg_cache_index(_idx) \ | 1321 | #define for_each_memcg_cache_index(_idx) \ |
1311 | for (; NULL; ) | 1322 | for (; NULL; ) |
1312 | 1323 | ||
diff --git a/include/linux/mm.h b/include/linux/mm.h index daa2b8f1e9a8..1e52b8fd1685 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page) | |||
848 | { | 848 | { |
849 | return page_zonenum(page) == ZONE_DEVICE; | 849 | return page_zonenum(page) == ZONE_DEVICE; |
850 | } | 850 | } |
851 | extern void memmap_init_zone_device(struct zone *, unsigned long, | ||
852 | unsigned long, struct dev_pagemap *); | ||
851 | #else | 853 | #else |
852 | static inline bool is_zone_device_page(const struct page *page) | 854 | static inline bool is_zone_device_page(const struct page *page) |
853 | { | 855 | { |
@@ -2304,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, | |||
2304 | unsigned long len, unsigned long prot, unsigned long flags, | 2306 | unsigned long len, unsigned long prot, unsigned long flags, |
2305 | vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, | 2307 | vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, |
2306 | struct list_head *uf); | 2308 | struct list_head *uf); |
2309 | extern int __do_munmap(struct mm_struct *, unsigned long, size_t, | ||
2310 | struct list_head *uf, bool downgrade); | ||
2307 | extern int do_munmap(struct mm_struct *, unsigned long, size_t, | 2311 | extern int do_munmap(struct mm_struct *, unsigned long, size_t, |
2308 | struct list_head *uf); | 2312 | struct list_head *uf); |
2309 | 2313 | ||
@@ -2502,11 +2506,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); | |||
2502 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | 2506 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
2503 | unsigned long pfn, unsigned long size, pgprot_t); | 2507 | unsigned long pfn, unsigned long size, pgprot_t); |
2504 | int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); | 2508 | int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); |
2505 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 2509 | vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
2506 | unsigned long pfn); | 2510 | unsigned long pfn); |
2507 | int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, | 2511 | vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, |
2508 | unsigned long pfn, pgprot_t pgprot); | 2512 | unsigned long pfn, pgprot_t pgprot); |
2509 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 2513 | vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
2510 | pfn_t pfn); | 2514 | pfn_t pfn); |
2511 | vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, | 2515 | vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, |
2512 | unsigned long addr, pfn_t pfn); | 2516 | unsigned long addr, pfn_t pfn); |
@@ -2525,32 +2529,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, | |||
2525 | return VM_FAULT_NOPAGE; | 2529 | return VM_FAULT_NOPAGE; |
2526 | } | 2530 | } |
2527 | 2531 | ||
2528 | static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, | ||
2529 | unsigned long addr, pfn_t pfn) | ||
2530 | { | ||
2531 | int err = vm_insert_mixed(vma, addr, pfn); | ||
2532 | |||
2533 | if (err == -ENOMEM) | ||
2534 | return VM_FAULT_OOM; | ||
2535 | if (err < 0 && err != -EBUSY) | ||
2536 | return VM_FAULT_SIGBUS; | ||
2537 | |||
2538 | return VM_FAULT_NOPAGE; | ||
2539 | } | ||
2540 | |||
2541 | static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, | ||
2542 | unsigned long addr, unsigned long pfn) | ||
2543 | { | ||
2544 | int err = vm_insert_pfn(vma, addr, pfn); | ||
2545 | |||
2546 | if (err == -ENOMEM) | ||
2547 | return VM_FAULT_OOM; | ||
2548 | if (err < 0 && err != -EBUSY) | ||
2549 | return VM_FAULT_SIGBUS; | ||
2550 | |||
2551 | return VM_FAULT_NOPAGE; | ||
2552 | } | ||
2553 | |||
2554 | static inline vm_fault_t vmf_error(int err) | 2532 | static inline vm_fault_t vmf_error(int err) |
2555 | { | 2533 | { |
2556 | if (err == -ENOMEM) | 2534 | if (err == -ENOMEM) |
@@ -2558,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err) | |||
2558 | return VM_FAULT_SIGBUS; | 2536 | return VM_FAULT_SIGBUS; |
2559 | } | 2537 | } |
2560 | 2538 | ||
2561 | struct page *follow_page_mask(struct vm_area_struct *vma, | 2539 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
2562 | unsigned long address, unsigned int foll_flags, | 2540 | unsigned int foll_flags); |
2563 | unsigned int *page_mask); | ||
2564 | |||
2565 | static inline struct page *follow_page(struct vm_area_struct *vma, | ||
2566 | unsigned long address, unsigned int foll_flags) | ||
2567 | { | ||
2568 | unsigned int unused_page_mask; | ||
2569 | return follow_page_mask(vma, address, foll_flags, &unused_page_mask); | ||
2570 | } | ||
2571 | 2541 | ||
2572 | #define FOLL_WRITE 0x01 /* check pte is writable */ | 2542 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
2573 | #define FOLL_TOUCH 0x02 /* mark page accessed */ | 2543 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
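The inline vmf_insert_pfn()/vmf_insert_mixed() wrappers disappear from mm.h because the underlying functions now return vm_fault_t themselves; the errno translation those wrappers performed moves into the implementations. For reference, the mapping the removed wrappers applied was:

    /* errno-to-vm_fault_t translation formerly done by the inline wrappers */
    static vm_fault_t errno_to_vm_fault(int err)
    {
            if (err == -ENOMEM)
                    return VM_FAULT_OOM;
            if (err < 0 && err != -EBUSY)
                    return VM_FAULT_SIGBUS;
            return VM_FAULT_NOPAGE;
    }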
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 133ba78820ee..9893a6432adf 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
@@ -2,7 +2,6 @@ | |||
2 | #ifndef _LINUX_MMU_NOTIFIER_H | 2 | #ifndef _LINUX_MMU_NOTIFIER_H |
3 | #define _LINUX_MMU_NOTIFIER_H | 3 | #define _LINUX_MMU_NOTIFIER_H |
4 | 4 | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/list.h> | 5 | #include <linux/list.h> |
7 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
8 | #include <linux/mm_types.h> | 7 | #include <linux/mm_types.h> |
@@ -11,9 +10,6 @@ | |||
11 | struct mmu_notifier; | 10 | struct mmu_notifier; |
12 | struct mmu_notifier_ops; | 11 | struct mmu_notifier_ops; |
13 | 12 | ||
14 | /* mmu_notifier_ops flags */ | ||
15 | #define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) | ||
16 | |||
17 | #ifdef CONFIG_MMU_NOTIFIER | 13 | #ifdef CONFIG_MMU_NOTIFIER |
18 | 14 | ||
19 | /* | 15 | /* |
@@ -31,15 +27,6 @@ struct mmu_notifier_mm { | |||
31 | 27 | ||
32 | struct mmu_notifier_ops { | 28 | struct mmu_notifier_ops { |
33 | /* | 29 | /* |
34 | * Flags to specify behavior of callbacks for this MMU notifier. | ||
35 | * Used to determine which context an operation may be called. | ||
36 | * | ||
37 | * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not | ||
38 | * block | ||
39 | */ | ||
40 | int flags; | ||
41 | |||
42 | /* | ||
43 | * Called either by mmu_notifier_unregister or when the mm is | 30 | * Called either by mmu_notifier_unregister or when the mm is |
44 | * being destroyed by exit_mmap, always before all pages are | 31 | * being destroyed by exit_mmap, always before all pages are |
45 | * freed. This can run concurrently with other mmu notifier | 32 | * freed. This can run concurrently with other mmu notifier |
@@ -153,7 +140,9 @@ struct mmu_notifier_ops { | |||
153 | * | 140 | * |
154 | * If blockable argument is set to false then the callback cannot | 141 | * If blockable argument is set to false then the callback cannot |
155 | * sleep and has to return with -EAGAIN. 0 should be returned | 142 | * sleep and has to return with -EAGAIN. 0 should be returned |
156 | * otherwise. | 143 | * otherwise. Please note that if invalidate_range_start approves |
144 | * a non-blocking behavior then the same applies to | ||
145 | * invalidate_range_end. | ||
157 | * | 146 | * |
158 | */ | 147 | */ |
159 | int (*invalidate_range_start)(struct mmu_notifier *mn, | 148 | int (*invalidate_range_start)(struct mmu_notifier *mn, |
@@ -181,10 +170,6 @@ struct mmu_notifier_ops { | |||
181 | * Note that this function might be called with just a sub-range | 170 | * Note that this function might be called with just a sub-range |
182 | * of what was passed to invalidate_range_start()/end(), if | 171 | * of what was passed to invalidate_range_start()/end(), if |
183 | * called between those functions. | 172 | * called between those functions. |
184 | * | ||
185 | * If this callback cannot block, and invalidate_range_{start,end} | ||
186 | * cannot block, mmu_notifier_ops.flags should have | ||
187 | * MMU_INVALIDATE_DOES_NOT_BLOCK set. | ||
188 | */ | 173 | */ |
189 | void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, | 174 | void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, |
190 | unsigned long start, unsigned long end); | 175 | unsigned long start, unsigned long end); |
@@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
239 | bool only_end); | 224 | bool only_end); |
240 | extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, | 225 | extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, |
241 | unsigned long start, unsigned long end); | 226 | unsigned long start, unsigned long end); |
242 | extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); | ||
243 | 227 | ||
244 | static inline void mmu_notifier_release(struct mm_struct *mm) | 228 | static inline void mmu_notifier_release(struct mm_struct *mm) |
245 | { | 229 | { |
@@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, | |||
493 | { | 477 | { |
494 | } | 478 | } |
495 | 479 | ||
496 | static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) | ||
497 | { | ||
498 | return false; | ||
499 | } | ||
500 | |||
501 | static inline void mmu_notifier_mm_init(struct mm_struct *mm) | 480 | static inline void mmu_notifier_mm_init(struct mm_struct *mm) |
502 | { | 481 | { |
503 | } | 482 | } |
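With MMU_INVALIDATE_DOES_NOT_BLOCK gone, blocking behaviour is negotiated per call: when invalidate_range_start() is invoked with blockable set to false it must either finish without sleeping or return -EAGAIN, and the same constraint then applies to invalidate_range_end(). A hedged sketch of a conforming callback, with an illustrative context structure:

    #include <linux/mmu_notifier.h>
    #include <linux/mutex.h>

    struct example_ctx {
            struct mmu_notifier mn;
            struct mutex lock;
    };

    static int example_invalidate_range_start(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
                                              unsigned long start,
                                              unsigned long end,
                                              bool blockable)
    {
            struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

            if (blockable)
                    mutex_lock(&ctx->lock);
            else if (!mutex_trylock(&ctx->lock))
                    return -EAGAIN;         /* caller cannot afford to wait */

            /* ... invalidate secondary mappings covering [start, end) ... */

            mutex_unlock(&ctx->lock);
            return 0;
    }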
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d4b0c79d2924..9f0caccd5833 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -161,8 +161,10 @@ enum node_stat_item { | |||
161 | NR_SLAB_UNRECLAIMABLE, | 161 | NR_SLAB_UNRECLAIMABLE, |
162 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ | 162 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ |
163 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ | 163 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ |
164 | WORKINGSET_NODES, | ||
164 | WORKINGSET_REFAULT, | 165 | WORKINGSET_REFAULT, |
165 | WORKINGSET_ACTIVATE, | 166 | WORKINGSET_ACTIVATE, |
167 | WORKINGSET_RESTORE, | ||
166 | WORKINGSET_NODERECLAIM, | 168 | WORKINGSET_NODERECLAIM, |
167 | NR_ANON_MAPPED, /* Mapped anonymous pages */ | 169 | NR_ANON_MAPPED, /* Mapped anonymous pages */ |
168 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. | 170 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. |
@@ -180,7 +182,7 @@ enum node_stat_item { | |||
180 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ | 182 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ |
181 | NR_DIRTIED, /* page dirtyings since bootup */ | 183 | NR_DIRTIED, /* page dirtyings since bootup */ |
182 | NR_WRITTEN, /* page writings since bootup */ | 184 | NR_WRITTEN, /* page writings since bootup */ |
183 | NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ | 185 | NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ |
184 | NR_VM_NODE_STAT_ITEMS | 186 | NR_VM_NODE_STAT_ITEMS |
185 | }; | 187 | }; |
186 | 188 | ||
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 74bee8cecf4c..50ce1bddaf56 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -69,13 +69,14 @@ | |||
69 | */ | 69 | */ |
70 | enum pageflags { | 70 | enum pageflags { |
71 | PG_locked, /* Page is locked. Don't touch. */ | 71 | PG_locked, /* Page is locked. Don't touch. */ |
72 | PG_error, | ||
73 | PG_referenced, | 72 | PG_referenced, |
74 | PG_uptodate, | 73 | PG_uptodate, |
75 | PG_dirty, | 74 | PG_dirty, |
76 | PG_lru, | 75 | PG_lru, |
77 | PG_active, | 76 | PG_active, |
77 | PG_workingset, | ||
78 | PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ | 78 | PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ |
79 | PG_error, | ||
79 | PG_slab, | 80 | PG_slab, |
80 | PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ | 81 | PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ |
81 | PG_arch_1, | 82 | PG_arch_1, |
@@ -162,6 +163,14 @@ static inline int PagePoisoned(const struct page *page) | |||
162 | return page->flags == PAGE_POISON_PATTERN; | 163 | return page->flags == PAGE_POISON_PATTERN; |
163 | } | 164 | } |
164 | 165 | ||
166 | #ifdef CONFIG_DEBUG_VM | ||
167 | void page_init_poison(struct page *page, size_t size); | ||
168 | #else | ||
169 | static inline void page_init_poison(struct page *page, size_t size) | ||
170 | { | ||
171 | } | ||
172 | #endif | ||
173 | |||
165 | /* | 174 | /* |
166 | * Page flags policies wrt compound pages | 175 | * Page flags policies wrt compound pages |
167 | * | 176 | * |
@@ -280,6 +289,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) | |||
280 | PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) | 289 | PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) |
281 | PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) | 290 | PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) |
282 | TESTCLEARFLAG(Active, active, PF_HEAD) | 291 | TESTCLEARFLAG(Active, active, PF_HEAD) |
292 | PAGEFLAG(Workingset, workingset, PF_HEAD) | ||
293 | TESTCLEARFLAG(Workingset, workingset, PF_HEAD) | ||
283 | __PAGEFLAG(Slab, slab, PF_NO_TAIL) | 294 | __PAGEFLAG(Slab, slab, PF_NO_TAIL) |
284 | __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) | 295 | __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) |
285 | PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ | 296 | PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ |
@@ -292,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); | |||
292 | 303 | ||
293 | PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) | 304 | PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) |
294 | __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) | 305 | __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) |
306 | __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) | ||
295 | PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | 307 | PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) |
296 | __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | 308 | __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) |
297 | __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | 309 | __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) |
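The PAGEFLAG()/TESTCLEARFLAG() lines for the new PG_workingset bit expand into the usual page-flag accessors. For reference, a sketch of the helpers generated by the two macros above (page being any struct page pointer):

	PageWorkingset(page);		/* test the bit */
	SetPageWorkingset(page);	/* mark the page as part of the workingset */
	ClearPageWorkingset(page);	/* drop the mark */
	TestClearPageWorkingset(page);	/* clear and return the previous value */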
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 21713dc14ce2..7bb77850c65a 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h | |||
@@ -9,8 +9,10 @@ | |||
9 | * PFN_SG_LAST - pfn references a page and is the last scatterlist entry | 9 | * PFN_SG_LAST - pfn references a page and is the last scatterlist entry |
10 | * PFN_DEV - pfn is not covered by system memmap by default | 10 | * PFN_DEV - pfn is not covered by system memmap by default |
11 | * PFN_MAP - pfn has a dynamic page mapping established by a device driver | 11 | * PFN_MAP - pfn has a dynamic page mapping established by a device driver |
12 | * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not | ||
13 | * get_user_pages | ||
12 | */ | 14 | */ |
13 | #define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) | 15 | #define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) |
14 | #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) | 16 | #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) |
15 | #define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) | 17 | #define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) |
16 | #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) | 18 | #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) |
diff --git a/include/linux/psi.h b/include/linux/psi.h new file mode 100644 index 000000000000..8e0725aac0aa --- /dev/null +++ b/include/linux/psi.h | |||
@@ -0,0 +1,53 @@ | |||
1 | #ifndef _LINUX_PSI_H | ||
2 | #define _LINUX_PSI_H | ||
3 | |||
4 | #include <linux/psi_types.h> | ||
5 | #include <linux/sched.h> | ||
6 | |||
7 | struct seq_file; | ||
8 | struct css_set; | ||
9 | |||
10 | #ifdef CONFIG_PSI | ||
11 | |||
12 | extern bool psi_disabled; | ||
13 | |||
14 | void psi_init(void); | ||
15 | |||
16 | void psi_task_change(struct task_struct *task, int clear, int set); | ||
17 | |||
18 | void psi_memstall_tick(struct task_struct *task, int cpu); | ||
19 | void psi_memstall_enter(unsigned long *flags); | ||
20 | void psi_memstall_leave(unsigned long *flags); | ||
21 | |||
22 | int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); | ||
23 | |||
24 | #ifdef CONFIG_CGROUPS | ||
25 | int psi_cgroup_alloc(struct cgroup *cgrp); | ||
26 | void psi_cgroup_free(struct cgroup *cgrp); | ||
27 | void cgroup_move_task(struct task_struct *p, struct css_set *to); | ||
28 | #endif | ||
29 | |||
30 | #else /* CONFIG_PSI */ | ||
31 | |||
32 | static inline void psi_init(void) {} | ||
33 | |||
34 | static inline void psi_memstall_enter(unsigned long *flags) {} | ||
35 | static inline void psi_memstall_leave(unsigned long *flags) {} | ||
36 | |||
37 | #ifdef CONFIG_CGROUPS | ||
38 | static inline int psi_cgroup_alloc(struct cgroup *cgrp) | ||
39 | { | ||
40 | return 0; | ||
41 | } | ||
42 | static inline void psi_cgroup_free(struct cgroup *cgrp) | ||
43 | { | ||
44 | } | ||
45 | static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) | ||
46 | { | ||
47 | rcu_assign_pointer(p->cgroups, to); | ||
48 | } | ||
49 | #endif | ||
50 | |||
51 | #endif /* CONFIG_PSI */ | ||
52 | |||
53 | #endif /* _LINUX_PSI_H */ | ||
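The psi_memstall_enter()/psi_memstall_leave() pair declared above brackets code in which the current task is forced to wait on memory (reclaim, thrashing refaults, swap-in and similar). A minimal, hedged sketch of the calling convention — the flags word preserves the task's pre-existing stall state across nesting, and do_memory_stall_work() is a placeholder rather than a real kernel function:

	unsigned long pflags;

	psi_memstall_enter(&pflags);	/* current is now accounted as stalled on memory */
	do_memory_stall_work();		/* placeholder for the blocking/reclaiming operation */
	psi_memstall_leave(&pflags);	/* restore the previous stall state */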
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h new file mode 100644 index 000000000000..2cf422db5d18 --- /dev/null +++ b/include/linux/psi_types.h | |||
@@ -0,0 +1,92 @@ | |||
1 | #ifndef _LINUX_PSI_TYPES_H | ||
2 | #define _LINUX_PSI_TYPES_H | ||
3 | |||
4 | #include <linux/seqlock.h> | ||
5 | #include <linux/types.h> | ||
6 | |||
7 | #ifdef CONFIG_PSI | ||
8 | |||
9 | /* Tracked task states */ | ||
10 | enum psi_task_count { | ||
11 | NR_IOWAIT, | ||
12 | NR_MEMSTALL, | ||
13 | NR_RUNNING, | ||
14 | NR_PSI_TASK_COUNTS, | ||
15 | }; | ||
16 | |||
17 | /* Task state bitmasks */ | ||
18 | #define TSK_IOWAIT (1 << NR_IOWAIT) | ||
19 | #define TSK_MEMSTALL (1 << NR_MEMSTALL) | ||
20 | #define TSK_RUNNING (1 << NR_RUNNING) | ||
21 | |||
22 | /* Resources that workloads could be stalled on */ | ||
23 | enum psi_res { | ||
24 | PSI_IO, | ||
25 | PSI_MEM, | ||
26 | PSI_CPU, | ||
27 | NR_PSI_RESOURCES, | ||
28 | }; | ||
29 | |||
30 | /* | ||
31 | * Pressure states for each resource: | ||
32 | * | ||
33 | * SOME: Stalled tasks & working tasks | ||
34 | * FULL: Stalled tasks & no working tasks | ||
35 | */ | ||
36 | enum psi_states { | ||
37 | PSI_IO_SOME, | ||
38 | PSI_IO_FULL, | ||
39 | PSI_MEM_SOME, | ||
40 | PSI_MEM_FULL, | ||
41 | PSI_CPU_SOME, | ||
42 | /* Only per-CPU, to weigh the CPU in the global average: */ | ||
43 | PSI_NONIDLE, | ||
44 | NR_PSI_STATES, | ||
45 | }; | ||
46 | |||
47 | struct psi_group_cpu { | ||
48 | /* 1st cacheline updated by the scheduler */ | ||
49 | |||
50 | /* Aggregator needs to know of concurrent changes */ | ||
51 | seqcount_t seq ____cacheline_aligned_in_smp; | ||
52 | |||
53 | /* States of the tasks belonging to this group */ | ||
54 | unsigned int tasks[NR_PSI_TASK_COUNTS]; | ||
55 | |||
56 | /* Period time sampling buckets for each state of interest (ns) */ | ||
57 | u32 times[NR_PSI_STATES]; | ||
58 | |||
59 | /* Time of last task change in this group (rq_clock) */ | ||
60 | u64 state_start; | ||
61 | |||
62 | /* 2nd cacheline updated by the aggregator */ | ||
63 | |||
64 | /* Delta detection against the sampling buckets */ | ||
65 | u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; | ||
66 | }; | ||
67 | |||
68 | struct psi_group { | ||
69 | /* Protects data updated during an aggregation */ | ||
70 | struct mutex stat_lock; | ||
71 | |||
72 | /* Per-cpu task state & time tracking */ | ||
73 | struct psi_group_cpu __percpu *pcpu; | ||
74 | |||
75 | /* Periodic aggregation state */ | ||
76 | u64 total_prev[NR_PSI_STATES - 1]; | ||
77 | u64 last_update; | ||
78 | u64 next_update; | ||
79 | struct delayed_work clock_work; | ||
80 | |||
81 | /* Total stall times and sampled pressure averages */ | ||
82 | u64 total[NR_PSI_STATES - 1]; | ||
83 | unsigned long avg[NR_PSI_STATES - 1][3]; | ||
84 | }; | ||
85 | |||
86 | #else /* CONFIG_PSI */ | ||
87 | |||
88 | struct psi_group { }; | ||
89 | |||
90 | #endif /* CONFIG_PSI */ | ||
91 | |||
92 | #endif /* _LINUX_PSI_TYPES_H */ | ||
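The TSK_* masks above encode which of a psi_group_cpu's tasks[] counters a state transition must decrement (clear) and increment (set). A hedged sketch of how the scheduler side is expected to feed psi_task_change(), whose prototype appears in psi.h; the real call sites are wrapped in psi_enqueue()/psi_dequeue() helpers added elsewhere in this series:

	/* task blocks on IO: it stops running and starts waiting */
	psi_task_change(p, TSK_RUNNING, TSK_IOWAIT);

	/* task is woken again: reverse the transition */
	psi_task_change(p, TSK_IOWAIT, TSK_RUNNING);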
diff --git a/include/linux/sched.h b/include/linux/sched.h index adfb3f9a7597..8f8a5418b627 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/latencytop.h> | 25 | #include <linux/latencytop.h> |
26 | #include <linux/sched/prio.h> | 26 | #include <linux/sched/prio.h> |
27 | #include <linux/signal_types.h> | 27 | #include <linux/signal_types.h> |
28 | #include <linux/psi_types.h> | ||
28 | #include <linux/mm_types_task.h> | 29 | #include <linux/mm_types_task.h> |
29 | #include <linux/task_io_accounting.h> | 30 | #include <linux/task_io_accounting.h> |
30 | #include <linux/rseq.h> | 31 | #include <linux/rseq.h> |
@@ -706,6 +707,10 @@ struct task_struct { | |||
706 | unsigned sched_contributes_to_load:1; | 707 | unsigned sched_contributes_to_load:1; |
707 | unsigned sched_migrated:1; | 708 | unsigned sched_migrated:1; |
708 | unsigned sched_remote_wakeup:1; | 709 | unsigned sched_remote_wakeup:1; |
710 | #ifdef CONFIG_PSI | ||
711 | unsigned sched_psi_wake_requeue:1; | ||
712 | #endif | ||
713 | |||
709 | /* Force alignment to the next boundary: */ | 714 | /* Force alignment to the next boundary: */ |
710 | unsigned :0; | 715 | unsigned :0; |
711 | 716 | ||
@@ -719,9 +724,6 @@ struct task_struct { | |||
719 | #endif | 724 | #endif |
720 | #ifdef CONFIG_MEMCG | 725 | #ifdef CONFIG_MEMCG |
721 | unsigned in_user_fault:1; | 726 | unsigned in_user_fault:1; |
722 | #ifdef CONFIG_MEMCG_KMEM | ||
723 | unsigned memcg_kmem_skip_account:1; | ||
724 | #endif | ||
725 | #endif | 727 | #endif |
726 | #ifdef CONFIG_COMPAT_BRK | 728 | #ifdef CONFIG_COMPAT_BRK |
727 | unsigned brk_randomized:1; | 729 | unsigned brk_randomized:1; |
@@ -965,6 +967,10 @@ struct task_struct { | |||
965 | kernel_siginfo_t *last_siginfo; | 967 | kernel_siginfo_t *last_siginfo; |
966 | 968 | ||
967 | struct task_io_accounting ioac; | 969 | struct task_io_accounting ioac; |
970 | #ifdef CONFIG_PSI | ||
971 | /* Pressure stall state */ | ||
972 | unsigned int psi_flags; | ||
973 | #endif | ||
968 | #ifdef CONFIG_TASK_XACCT | 974 | #ifdef CONFIG_TASK_XACCT |
969 | /* Accumulated RSS usage: */ | 975 | /* Accumulated RSS usage: */ |
970 | u64 acct_rss_mem1; | 976 | u64 acct_rss_mem1; |
@@ -1391,6 +1397,7 @@ extern struct pid *cad_pid; | |||
1391 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ | 1397 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
1392 | #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ | 1398 | #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ |
1393 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1399 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
1400 | #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ | ||
1394 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ | 1401 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ |
1395 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | 1402 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ |
1396 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | 1403 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 80bc84ba5d2a..4859bea47a7b 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h | |||
@@ -22,10 +22,26 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); | |||
22 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ | 22 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ |
23 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ | 23 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ |
24 | 24 | ||
25 | #define CALC_LOAD(load,exp,n) \ | 25 | /* |
26 | load *= exp; \ | 26 | * a1 = a0 * e + a * (1 - e) |
27 | load += n*(FIXED_1-exp); \ | 27 | */ |
28 | load >>= FSHIFT; | 28 | static inline unsigned long |
29 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
30 | { | ||
31 | unsigned long newload; | ||
32 | |||
33 | newload = load * exp + active * (FIXED_1 - exp); | ||
34 | if (active >= load) | ||
35 | newload += FIXED_1-1; | ||
36 | |||
37 | return newload / FIXED_1; | ||
38 | } | ||
39 | |||
40 | extern unsigned long calc_load_n(unsigned long load, unsigned long exp, | ||
41 | unsigned long active, unsigned int n); | ||
42 | |||
43 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
44 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
29 | 45 | ||
30 | extern void calc_global_load(unsigned long ticks); | 46 | extern void calc_global_load(unsigned long ticks); |
31 | 47 | ||
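calc_load() and LOAD_INT()/LOAD_FRAC() work on FSHIFT-bit fixed-point values (FIXED_1 == 1 << FSHIFT == 2048 with the header's FSHIFT of 11). A worked example using EXP_1 == 1884, the 1-minute decay constant defined in the same header: folding one fully-busy 5-second sample into a zero average gives

	calc_load(0, 1884, 2048)
		= (0 * 1884 + 2048 * (2048 - 1884) + 2047) / 2048
		= (2048 * 164 + 2047) / 2048
		= 164				/* i.e. 164/2048 of full scale */

	LOAD_INT(164)  == 164 >> 11                  == 0
	LOAD_FRAC(164) == ((164 & 2047) * 100) >> 11 == 8

so the value would be reported as 0.08, and repeated busy samples keep converging toward 1.00.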
diff --git a/include/linux/slab.h b/include/linux/slab.h index ed9cbddeb4a6..918f374e7156 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -295,12 +295,43 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, | |||
295 | #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ | 295 | #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ |
296 | (KMALLOC_MIN_SIZE) : 16) | 296 | (KMALLOC_MIN_SIZE) : 16) |
297 | 297 | ||
298 | /* | ||
299 | * Whenever changing this, take care of that kmalloc_type() and | ||
300 | * create_kmalloc_caches() still work as intended. | ||
301 | */ | ||
302 | enum kmalloc_cache_type { | ||
303 | KMALLOC_NORMAL = 0, | ||
304 | KMALLOC_RECLAIM, | ||
305 | #ifdef CONFIG_ZONE_DMA | ||
306 | KMALLOC_DMA, | ||
307 | #endif | ||
308 | NR_KMALLOC_TYPES | ||
309 | }; | ||
310 | |||
298 | #ifndef CONFIG_SLOB | 311 | #ifndef CONFIG_SLOB |
299 | extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; | 312 | extern struct kmem_cache * |
313 | kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; | ||
314 | |||
315 | static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) | ||
316 | { | ||
317 | int is_dma = 0; | ||
318 | int type_dma = 0; | ||
319 | int is_reclaimable; | ||
320 | |||
300 | #ifdef CONFIG_ZONE_DMA | 321 | #ifdef CONFIG_ZONE_DMA |
301 | extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; | 322 | is_dma = !!(flags & __GFP_DMA); |
323 | type_dma = is_dma * KMALLOC_DMA; | ||
302 | #endif | 324 | #endif |
303 | 325 | ||
326 | is_reclaimable = !!(flags & __GFP_RECLAIMABLE); | ||
327 | |||
328 | /* | ||
329 | * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return | ||
330 | * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE | ||
331 | */ | ||
332 | return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM; | ||
333 | } | ||
334 | |||
304 | /* | 335 | /* |
305 | * Figure out which kmalloc slab an allocation of a certain size | 336 | * Figure out which kmalloc slab an allocation of a certain size |
306 | * belongs to. | 337 | * belongs to. |
@@ -501,18 +532,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) | |||
501 | static __always_inline void *kmalloc(size_t size, gfp_t flags) | 532 | static __always_inline void *kmalloc(size_t size, gfp_t flags) |
502 | { | 533 | { |
503 | if (__builtin_constant_p(size)) { | 534 | if (__builtin_constant_p(size)) { |
535 | #ifndef CONFIG_SLOB | ||
536 | unsigned int index; | ||
537 | #endif | ||
504 | if (size > KMALLOC_MAX_CACHE_SIZE) | 538 | if (size > KMALLOC_MAX_CACHE_SIZE) |
505 | return kmalloc_large(size, flags); | 539 | return kmalloc_large(size, flags); |
506 | #ifndef CONFIG_SLOB | 540 | #ifndef CONFIG_SLOB |
507 | if (!(flags & GFP_DMA)) { | 541 | index = kmalloc_index(size); |
508 | unsigned int index = kmalloc_index(size); | ||
509 | 542 | ||
510 | if (!index) | 543 | if (!index) |
511 | return ZERO_SIZE_PTR; | 544 | return ZERO_SIZE_PTR; |
512 | 545 | ||
513 | return kmem_cache_alloc_trace(kmalloc_caches[index], | 546 | return kmem_cache_alloc_trace( |
514 | flags, size); | 547 | kmalloc_caches[kmalloc_type(flags)][index], |
515 | } | 548 | flags, size); |
516 | #endif | 549 | #endif |
517 | } | 550 | } |
518 | return __kmalloc(size, flags); | 551 | return __kmalloc(size, flags); |
@@ -542,13 +575,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
542 | { | 575 | { |
543 | #ifndef CONFIG_SLOB | 576 | #ifndef CONFIG_SLOB |
544 | if (__builtin_constant_p(size) && | 577 | if (__builtin_constant_p(size) && |
545 | size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { | 578 | size <= KMALLOC_MAX_CACHE_SIZE) { |
546 | unsigned int i = kmalloc_index(size); | 579 | unsigned int i = kmalloc_index(size); |
547 | 580 | ||
548 | if (!i) | 581 | if (!i) |
549 | return ZERO_SIZE_PTR; | 582 | return ZERO_SIZE_PTR; |
550 | 583 | ||
551 | return kmem_cache_alloc_node_trace(kmalloc_caches[i], | 584 | return kmem_cache_alloc_node_trace( |
585 | kmalloc_caches[kmalloc_type(flags)][i], | ||
552 | flags, node, size); | 586 | flags, node, size); |
553 | } | 587 | } |
554 | #endif | 588 | #endif |
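kmalloc_type() picks one of the new cache arrays from the allocation's GFP flags, and kmalloc()/kmalloc_node() then index kmalloc_caches[type][size-index]. A few illustrative mappings that follow directly from the function above, assuming CONFIG_ZONE_DMA is enabled:

	kmalloc_type(GFP_KERNEL)                     == KMALLOC_NORMAL
	kmalloc_type(GFP_KERNEL | __GFP_RECLAIMABLE) == KMALLOC_RECLAIM
	kmalloc_type(GFP_KERNEL | GFP_DMA)           == KMALLOC_DMA
	kmalloc_type(GFP_KERNEL | GFP_DMA | __GFP_RECLAIMABLE)
	                                             == KMALLOC_DMA	/* DMA wins, __GFP_RECLAIMABLE is ignored */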
diff --git a/include/linux/swap.h b/include/linux/swap.h index 8e2c11e692ba..38195f5c96b1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -167,13 +167,14 @@ enum { | |||
167 | SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ | 167 | SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ |
168 | SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ | 168 | SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ |
169 | SWP_BLKDEV = (1 << 6), /* its a block device */ | 169 | SWP_BLKDEV = (1 << 6), /* its a block device */ |
170 | SWP_FILE = (1 << 7), /* set after swap_activate success */ | 170 | SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ |
171 | SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ | 171 | SWP_FS = (1 << 8), /* swap file goes through fs */ |
172 | SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ | 172 | SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ |
173 | SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ | 173 | SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ |
174 | SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ | 174 | SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ |
175 | SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ | ||
175 | /* add others here before... */ | 176 | /* add others here before... */ |
176 | SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ | 177 | SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ |
177 | }; | 178 | }; |
178 | 179 | ||
179 | #define SWAP_CLUSTER_MAX 32UL | 180 | #define SWAP_CLUSTER_MAX 32UL |
@@ -296,7 +297,7 @@ struct vma_swap_readahead { | |||
296 | 297 | ||
297 | /* linux/mm/workingset.c */ | 298 | /* linux/mm/workingset.c */ |
298 | void *workingset_eviction(struct address_space *mapping, struct page *page); | 299 | void *workingset_eviction(struct address_space *mapping, struct page *page); |
299 | bool workingset_refault(void *shadow); | 300 | void workingset_refault(struct page *page, void *shadow); |
300 | void workingset_activation(struct page *page); | 301 | void workingset_activation(struct page *page); |
301 | 302 | ||
302 | /* Do not use directly, use workingset_lookup_update */ | 303 | /* Do not use directly, use workingset_lookup_update */ |
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a81cffb76d89..a1675d43777e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h | |||
@@ -88,6 +88,7 @@ | |||
88 | {1UL << PG_dirty, "dirty" }, \ | 88 | {1UL << PG_dirty, "dirty" }, \ |
89 | {1UL << PG_lru, "lru" }, \ | 89 | {1UL << PG_lru, "lru" }, \ |
90 | {1UL << PG_active, "active" }, \ | 90 | {1UL << PG_active, "active" }, \ |
91 | {1UL << PG_workingset, "workingset" }, \ | ||
91 | {1UL << PG_slab, "slab" }, \ | 92 | {1UL << PG_slab, "slab" }, \ |
92 | {1UL << PG_owner_priv_1, "owner_priv_1" }, \ | 93 | {1UL << PG_owner_priv_1, "owner_priv_1" }, \ |
93 | {1UL << PG_arch_1, "arch_1" }, \ | 94 | {1UL << PG_arch_1, "arch_1" }, \ |
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb2349f..5e8ca16a9079 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h | |||
@@ -34,7 +34,7 @@ | |||
34 | */ | 34 | */ |
35 | 35 | ||
36 | 36 | ||
37 | #define TASKSTATS_VERSION 8 | 37 | #define TASKSTATS_VERSION 9 |
38 | #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN | 38 | #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN |
39 | * in linux/sched.h */ | 39 | * in linux/sched.h */ |
40 | 40 | ||
@@ -164,6 +164,10 @@ struct taskstats { | |||
164 | /* Delay waiting for memory reclaim */ | 164 | /* Delay waiting for memory reclaim */ |
165 | __u64 freepages_count; | 165 | __u64 freepages_count; |
166 | __u64 freepages_delay_total; | 166 | __u64 freepages_delay_total; |
167 | |||
168 | /* Delay waiting for thrashing page */ | ||
169 | __u64 thrashing_count; | ||
170 | __u64 thrashing_delay_total; | ||
167 | }; | 171 | }; |
168 | 172 | ||
169 | 173 | ||
diff --git a/init/Kconfig b/init/Kconfig index 317d5ccb5191..a4112e95724a 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -490,6 +490,25 @@ config TASK_IO_ACCOUNTING | |||
490 | 490 | ||
491 | Say N if unsure. | 491 | Say N if unsure. |
492 | 492 | ||
493 | config PSI | ||
494 | bool "Pressure stall information tracking" | ||
495 | help | ||
496 | Collect metrics that indicate how overcommitted the CPU, memory, | ||
497 | and IO capacity are in the system. | ||
498 | |||
499 | If you say Y here, the kernel will create /proc/pressure/ with the | ||
500 | pressure statistics files cpu, memory, and io. These will indicate | ||
501 | the share of walltime in which some or all tasks in the system are | ||
502 | delayed due to contention of the respective resource. | ||
503 | |||
504 | In kernels with cgroup support, cgroups (cgroup2 only) will | ||
505 | have cpu.pressure, memory.pressure, and io.pressure files, | ||
506 | which aggregate pressure stalls for the grouped tasks only. | ||
507 | |||
508 | For more details see Documentation/accounting/psi.txt. | ||
509 | |||
510 | Say N if unsure. | ||
511 | |||
493 | endmenu # "CPU/Task time and stats accounting" | 512 | endmenu # "CPU/Task time and stats accounting" |
494 | 513 | ||
495 | config CPU_ISOLATION | 514 | config CPU_ISOLATION |
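The help text above describes the new /proc/pressure/ directory. A hedged userspace sketch of consuming one of these files; the line format shown in the comment follows Documentation/accounting/psi.txt and is illustrative rather than an ABI specification:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/pressure/memory", "r");

		if (!f)
			return 1;	/* CONFIG_PSI is off or psi_disabled was set */

		/*
		 * Expected output, one line per pressure state:
		 *   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
		 *   full avg10=0.00 avg60=0.00 avg300=0.00 total=0
		 */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);

		fclose(f);
		return 0;
	}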
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/nsproxy.h> | 55 | #include <linux/nsproxy.h> |
56 | #include <linux/file.h> | 56 | #include <linux/file.h> |
57 | #include <linux/sched/cputime.h> | 57 | #include <linux/sched/cputime.h> |
58 | #include <linux/psi.h> | ||
58 | #include <net/sock.h> | 59 | #include <net/sock.h> |
59 | 60 | ||
60 | #define CREATE_TRACE_POINTS | 61 | #define CREATE_TRACE_POINTS |
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, | |||
862 | */ | 863 | */ |
863 | WARN_ON_ONCE(task->flags & PF_EXITING); | 864 | WARN_ON_ONCE(task->flags & PF_EXITING); |
864 | 865 | ||
865 | rcu_assign_pointer(task->cgroups, to_cset); | 866 | cgroup_move_task(task, to_cset); |
866 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : | 867 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : |
867 | &to_cset->tasks); | 868 | &to_cset->tasks); |
868 | } | 869 | } |
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) | |||
3446 | return ret; | 3447 | return ret; |
3447 | } | 3448 | } |
3448 | 3449 | ||
3450 | #ifdef CONFIG_PSI | ||
3451 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) | ||
3452 | { | ||
3453 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); | ||
3454 | } | ||
3455 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) | ||
3456 | { | ||
3457 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); | ||
3458 | } | ||
3459 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) | ||
3460 | { | ||
3461 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); | ||
3462 | } | ||
3463 | #endif | ||
3464 | |||
3449 | static int cgroup_file_open(struct kernfs_open_file *of) | 3465 | static int cgroup_file_open(struct kernfs_open_file *of) |
3450 | { | 3466 | { |
3451 | struct cftype *cft = of->kn->priv; | 3467 | struct cftype *cft = of->kn->priv; |
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { | |||
4576 | .flags = CFTYPE_NOT_ON_ROOT, | 4592 | .flags = CFTYPE_NOT_ON_ROOT, |
4577 | .seq_show = cpu_stat_show, | 4593 | .seq_show = cpu_stat_show, |
4578 | }, | 4594 | }, |
4595 | #ifdef CONFIG_PSI | ||
4596 | { | ||
4597 | .name = "io.pressure", | ||
4598 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4599 | .seq_show = cgroup_io_pressure_show, | ||
4600 | }, | ||
4601 | { | ||
4602 | .name = "memory.pressure", | ||
4603 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4604 | .seq_show = cgroup_memory_pressure_show, | ||
4605 | }, | ||
4606 | { | ||
4607 | .name = "cpu.pressure", | ||
4608 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4609 | .seq_show = cgroup_cpu_pressure_show, | ||
4610 | }, | ||
4611 | #endif | ||
4579 | { } /* terminate */ | 4612 | { } /* terminate */ |
4580 | }; | 4613 | }; |
4581 | 4614 | ||
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) | |||
4636 | */ | 4669 | */ |
4637 | cgroup_put(cgroup_parent(cgrp)); | 4670 | cgroup_put(cgroup_parent(cgrp)); |
4638 | kernfs_put(cgrp->kn); | 4671 | kernfs_put(cgrp->kn); |
4672 | psi_cgroup_free(cgrp); | ||
4639 | if (cgroup_on_dfl(cgrp)) | 4673 | if (cgroup_on_dfl(cgrp)) |
4640 | cgroup_rstat_exit(cgrp); | 4674 | cgroup_rstat_exit(cgrp); |
4641 | kfree(cgrp); | 4675 | kfree(cgrp); |
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4892 | cgrp->self.parent = &parent->self; | 4926 | cgrp->self.parent = &parent->self; |
4893 | cgrp->root = root; | 4927 | cgrp->root = root; |
4894 | cgrp->level = level; | 4928 | cgrp->level = level; |
4895 | ret = cgroup_bpf_inherit(cgrp); | 4929 | |
4930 | ret = psi_cgroup_alloc(cgrp); | ||
4896 | if (ret) | 4931 | if (ret) |
4897 | goto out_idr_free; | 4932 | goto out_idr_free; |
4898 | 4933 | ||
4934 | ret = cgroup_bpf_inherit(cgrp); | ||
4935 | if (ret) | ||
4936 | goto out_psi_free; | ||
4937 | |||
4899 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { | 4938 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
4900 | cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; | 4939 | cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; |
4901 | 4940 | ||
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4933 | 4972 | ||
4934 | return cgrp; | 4973 | return cgrp; |
4935 | 4974 | ||
4975 | out_psi_free: | ||
4976 | psi_cgroup_free(cgrp); | ||
4936 | out_idr_free: | 4977 | out_idr_free: |
4937 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); | 4978 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
4938 | out_stat_exit: | 4979 | out_stat_exit: |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) | |||
2556 | } | 2556 | } |
2557 | kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); | 2557 | kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); |
2558 | 2558 | ||
2559 | /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ | ||
2560 | |||
2561 | #define LOAD_INT(x) ((x) >> FSHIFT) | ||
2562 | #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) | ||
2563 | kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", | 2559 | kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", |
2564 | LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), | 2560 | LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), |
2565 | LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), | 2561 | LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), |
2566 | LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); | 2562 | LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); |
2567 | #undef LOAD_INT | 2563 | |
2568 | #undef LOAD_FRAC | ||
2569 | /* Display in kilobytes */ | 2564 | /* Display in kilobytes */ |
2570 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 2565 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
2571 | kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" | 2566 | kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
135 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; | 135 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; |
136 | tmp = d->freepages_delay_total + tsk->delays->freepages_delay; | 136 | tmp = d->freepages_delay_total + tsk->delays->freepages_delay; |
137 | d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; | 137 | d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; |
138 | tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; | ||
139 | d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; | ||
138 | d->blkio_count += tsk->delays->blkio_count; | 140 | d->blkio_count += tsk->delays->blkio_count; |
139 | d->swapin_count += tsk->delays->swapin_count; | 141 | d->swapin_count += tsk->delays->swapin_count; |
140 | d->freepages_count += tsk->delays->freepages_count; | 142 | d->freepages_count += tsk->delays->freepages_count; |
143 | d->thrashing_count += tsk->delays->thrashing_count; | ||
141 | raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); | 144 | raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); |
142 | 145 | ||
143 | return 0; | 146 | return 0; |
@@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) | |||
169 | ¤t->delays->freepages_count); | 172 | ¤t->delays->freepages_count); |
170 | } | 173 | } |
171 | 174 | ||
175 | void __delayacct_thrashing_start(void) | ||
176 | { | ||
177 | current->delays->thrashing_start = ktime_get_ns(); | ||
178 | } | ||
179 | |||
180 | void __delayacct_thrashing_end(void) | ||
181 | { | ||
182 | delayacct_end(&current->delays->lock, | ||
183 | &current->delays->thrashing_start, | ||
184 | &current->delays->thrashing_delay, | ||
185 | &current->delays->thrashing_count); | ||
186 | } | ||
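__delayacct_thrashing_start()/__delayacct_thrashing_end() bracket the time a task spends blocked on a thrashing page, i.e. a workingset page that was recently evicted and is being refaulted. A rough, hedged sketch of the intended calling pattern; the non-underscore delayacct_thrashing_*() wrappers and the exact page-wait call site live elsewhere in this series and are only approximated here:

	bool thrashing = false;

	if (PageWorkingset(page) && !PageUptodate(page)) {
		/* stalling on a page we recently evicted: count it as thrashing */
		delayacct_thrashing_start();
		thrashing = true;
	}

	wait_on_page_locked(page);	/* the actual stall */

	if (thrashing)
		delayacct_thrashing_end();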
diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..8f82a3bdcb8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
223 | return s->addr; | 223 | return s->addr; |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | ||
227 | * Allocated stacks are cached and later reused by new threads, | ||
228 | * so memcg accounting is performed manually on assigning/releasing | ||
229 | * stacks to tasks. Drop __GFP_ACCOUNT. | ||
230 | */ | ||
226 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, | 231 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, |
227 | VMALLOC_START, VMALLOC_END, | 232 | VMALLOC_START, VMALLOC_END, |
228 | THREADINFO_GFP, | 233 | THREADINFO_GFP & ~__GFP_ACCOUNT, |
229 | PAGE_KERNEL, | 234 | PAGE_KERNEL, |
230 | 0, node, __builtin_return_address(0)); | 235 | 0, node, __builtin_return_address(0)); |
231 | 236 | ||
@@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
248 | static inline void free_thread_stack(struct task_struct *tsk) | 253 | static inline void free_thread_stack(struct task_struct *tsk) |
249 | { | 254 | { |
250 | #ifdef CONFIG_VMAP_STACK | 255 | #ifdef CONFIG_VMAP_STACK |
251 | if (task_stack_vm_area(tsk)) { | 256 | struct vm_struct *vm = task_stack_vm_area(tsk); |
257 | |||
258 | if (vm) { | ||
252 | int i; | 259 | int i; |
253 | 260 | ||
261 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { | ||
262 | mod_memcg_page_state(vm->pages[i], | ||
263 | MEMCG_KERNEL_STACK_KB, | ||
264 | -(int)(PAGE_SIZE / 1024)); | ||
265 | |||
266 | memcg_kmem_uncharge(vm->pages[i], 0); | ||
267 | } | ||
268 | |||
254 | for (i = 0; i < NR_CACHED_STACKS; i++) { | 269 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
255 | if (this_cpu_cmpxchg(cached_stacks[i], | 270 | if (this_cpu_cmpxchg(cached_stacks[i], |
256 | NULL, tsk->stack_vm_area) != NULL) | 271 | NULL, tsk->stack_vm_area) != NULL) |
@@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
351 | NR_KERNEL_STACK_KB, | 366 | NR_KERNEL_STACK_KB, |
352 | PAGE_SIZE / 1024 * account); | 367 | PAGE_SIZE / 1024 * account); |
353 | } | 368 | } |
354 | |||
355 | /* All stack pages belong to the same memcg. */ | ||
356 | mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, | ||
357 | account * (THREAD_SIZE / 1024)); | ||
358 | } else { | 369 | } else { |
359 | /* | 370 | /* |
360 | * All stack pages are in the same zone and belong to the | 371 | * All stack pages are in the same zone and belong to the |
@@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
370 | } | 381 | } |
371 | } | 382 | } |
372 | 383 | ||
384 | static int memcg_charge_kernel_stack(struct task_struct *tsk) | ||
385 | { | ||
386 | #ifdef CONFIG_VMAP_STACK | ||
387 | struct vm_struct *vm = task_stack_vm_area(tsk); | ||
388 | int ret; | ||
389 | |||
390 | if (vm) { | ||
391 | int i; | ||
392 | |||
393 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { | ||
394 | /* | ||
395 | * If memcg_kmem_charge() fails, page->mem_cgroup | ||
396 | * pointer is NULL, and both memcg_kmem_uncharge() | ||
397 | * and mod_memcg_page_state() in free_thread_stack() | ||
398 | * will ignore this page. So it's safe. | ||
399 | */ | ||
400 | ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); | ||
401 | if (ret) | ||
402 | return ret; | ||
403 | |||
404 | mod_memcg_page_state(vm->pages[i], | ||
405 | MEMCG_KERNEL_STACK_KB, | ||
406 | PAGE_SIZE / 1024); | ||
407 | } | ||
408 | } | ||
409 | #endif | ||
410 | return 0; | ||
411 | } | ||
412 | |||
373 | static void release_task_stack(struct task_struct *tsk) | 413 | static void release_task_stack(struct task_struct *tsk) |
374 | { | 414 | { |
375 | if (WARN_ON(tsk->state != TASK_DEAD)) | 415 | if (WARN_ON(tsk->state != TASK_DEAD)) |
@@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
807 | if (!stack) | 847 | if (!stack) |
808 | goto free_tsk; | 848 | goto free_tsk; |
809 | 849 | ||
850 | if (memcg_charge_kernel_stack(tsk)) | ||
851 | goto free_stack; | ||
852 | |||
810 | stack_vm_area = task_stack_vm_area(tsk); | 853 | stack_vm_area = task_stack_vm_area(tsk); |
811 | 854 | ||
812 | err = arch_dup_task_struct(tsk, orig); | 855 | err = arch_dup_task_struct(tsk, orig); |
@@ -1779,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( | |||
1779 | 1822 | ||
1780 | p->default_timer_slack_ns = current->timer_slack_ns; | 1823 | p->default_timer_slack_ns = current->timer_slack_ns; |
1781 | 1824 | ||
1825 | #ifdef CONFIG_PSI | ||
1826 | p->psi_flags = 0; | ||
1827 | #endif | ||
1828 | |||
1782 | task_io_accounting_init(&p->ioac); | 1829 | task_io_accounting_init(&p->ioac); |
1783 | acct_clear_integrals(p); | 1830 | acct_clear_integrals(p); |
1784 | 1831 | ||
diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..620fc4d2559a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
175 | struct vmem_altmap *altmap = pgmap->altmap_valid ? | 175 | struct vmem_altmap *altmap = pgmap->altmap_valid ? |
176 | &pgmap->altmap : NULL; | 176 | &pgmap->altmap : NULL; |
177 | struct resource *res = &pgmap->res; | 177 | struct resource *res = &pgmap->res; |
178 | unsigned long pfn, pgoff, order; | 178 | struct dev_pagemap *conflict_pgmap; |
179 | pgprot_t pgprot = PAGE_KERNEL; | 179 | pgprot_t pgprot = PAGE_KERNEL; |
180 | unsigned long pgoff, order; | ||
180 | int error, nid, is_ram; | 181 | int error, nid, is_ram; |
181 | struct dev_pagemap *conflict_pgmap; | ||
182 | 182 | ||
183 | align_start = res->start & ~(SECTION_SIZE - 1); | 183 | align_start = res->start & ~(SECTION_SIZE - 1); |
184 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) | 184 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
@@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
256 | if (error) | 256 | if (error) |
257 | goto err_add_memory; | 257 | goto err_add_memory; |
258 | 258 | ||
259 | for_each_device_pfn(pfn, pgmap) { | 259 | /* |
260 | struct page *page = pfn_to_page(pfn); | 260 | * Initialization of the pages has been deferred until now in order |
261 | 261 | * to allow us to do the work while not holding the hotplug lock. | |
262 | /* | 262 | */ |
263 | * ZONE_DEVICE pages union ->lru with a ->pgmap back | 263 | memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], |
264 | * pointer. It is a bug if a ZONE_DEVICE page is ever | 264 | align_start >> PAGE_SHIFT, |
265 | * freed or placed on a driver-private list. Seed the | 265 | align_size >> PAGE_SHIFT, pgmap); |
266 | * storage with LIST_POISON* values. | 266 | percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); |
267 | */ | ||
268 | list_del(&page->lru); | ||
269 | page->pgmap = pgmap; | ||
270 | percpu_ref_get(pgmap->ref); | ||
271 | } | ||
272 | 267 | ||
273 | devm_add_action(dev, devm_memremap_pages_release, pgmap); | 268 | devm_add_action(dev, devm_memremap_pages_release, pgmap); |
274 | 269 | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o | |||
29 | obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o | 29 | obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o |
30 | obj-$(CONFIG_MEMBARRIER) += membarrier.o | 30 | obj-$(CONFIG_MEMBARRIER) += membarrier.o |
31 | obj-$(CONFIG_CPU_ISOLATION) += isolation.o | 31 | obj-$(CONFIG_CPU_ISOLATION) += isolation.o |
32 | obj-$(CONFIG_PSI) += psi.o | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e696b03e99d..fd2fce8a001b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
722 | if (!(flags & ENQUEUE_NOCLOCK)) | 722 | if (!(flags & ENQUEUE_NOCLOCK)) |
723 | update_rq_clock(rq); | 723 | update_rq_clock(rq); |
724 | 724 | ||
725 | if (!(flags & ENQUEUE_RESTORE)) | 725 | if (!(flags & ENQUEUE_RESTORE)) { |
726 | sched_info_queued(rq, p); | 726 | sched_info_queued(rq, p); |
727 | psi_enqueue(p, flags & ENQUEUE_WAKEUP); | ||
728 | } | ||
727 | 729 | ||
728 | p->sched_class->enqueue_task(rq, p, flags); | 730 | p->sched_class->enqueue_task(rq, p, flags); |
729 | } | 731 | } |
@@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
733 | if (!(flags & DEQUEUE_NOCLOCK)) | 735 | if (!(flags & DEQUEUE_NOCLOCK)) |
734 | update_rq_clock(rq); | 736 | update_rq_clock(rq); |
735 | 737 | ||
736 | if (!(flags & DEQUEUE_SAVE)) | 738 | if (!(flags & DEQUEUE_SAVE)) { |
737 | sched_info_dequeued(rq, p); | 739 | sched_info_dequeued(rq, p); |
740 | psi_dequeue(p, flags & DEQUEUE_SLEEP); | ||
741 | } | ||
738 | 742 | ||
739 | p->sched_class->dequeue_task(rq, p, flags); | 743 | p->sched_class->dequeue_task(rq, p, flags); |
740 | } | 744 | } |
@@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2037 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); | 2041 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
2038 | if (task_cpu(p) != cpu) { | 2042 | if (task_cpu(p) != cpu) { |
2039 | wake_flags |= WF_MIGRATED; | 2043 | wake_flags |= WF_MIGRATED; |
2044 | psi_ttwu_dequeue(p); | ||
2040 | set_task_cpu(p, cpu); | 2045 | set_task_cpu(p, cpu); |
2041 | } | 2046 | } |
2042 | 2047 | ||
@@ -3051,6 +3056,7 @@ void scheduler_tick(void) | |||
3051 | curr->sched_class->task_tick(rq, curr, 0); | 3056 | curr->sched_class->task_tick(rq, curr, 0); |
3052 | cpu_load_update_active(rq); | 3057 | cpu_load_update_active(rq); |
3053 | calc_global_load_tick(rq); | 3058 | calc_global_load_tick(rq); |
3059 | psi_task_tick(rq); | ||
3054 | 3060 | ||
3055 | rq_unlock(rq, &rf); | 3061 | rq_unlock(rq, &rf); |
3056 | 3062 | ||
@@ -4933,9 +4939,7 @@ static void do_sched_yield(void) | |||
4933 | struct rq_flags rf; | 4939 | struct rq_flags rf; |
4934 | struct rq *rq; | 4940 | struct rq *rq; |
4935 | 4941 | ||
4936 | local_irq_disable(); | 4942 | rq = this_rq_lock_irq(&rf); |
4937 | rq = this_rq(); | ||
4938 | rq_lock(rq, &rf); | ||
4939 | 4943 | ||
4940 | schedstat_inc(rq->yld_count); | 4944 | schedstat_inc(rq->yld_count); |
4941 | current->sched_class->yield_task(rq); | 4945 | current->sched_class->yield_task(rq); |
@@ -6069,6 +6073,8 @@ void __init sched_init(void) | |||
6069 | 6073 | ||
6070 | init_schedstats(); | 6074 | init_schedstats(); |
6071 | 6075 | ||
6076 | psi_init(); | ||
6077 | |||
6072 | scheduler_running = 1; | 6078 | scheduler_running = 1; |
6073 | } | 6079 | } |
6074 | 6080 | ||
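The psi_enqueue()/psi_dequeue()/psi_ttwu_dequeue()/psi_task_tick() calls added above are thin wrappers around psi_task_change() and psi_memstall_tick(), defined in the scheduler's stats header elsewhere in this series. A simplified, hedged sketch of the enqueue-side wrapper — the wakeup-requeue corner case is deliberately abbreviated:

	static inline void psi_enqueue(struct task_struct *p, bool wakeup)
	{
		int clear = 0, set = TSK_RUNNING;

		if (psi_disabled)
			return;

		if (!wakeup) {
			/* requeue: the task may still be in a memory stall */
			if (p->flags & PF_MEMSTALL)
				set |= TSK_MEMSTALL;
		} else if (p->in_iowait) {
			/* waking from an iowait sleep ends the IO wait */
			clear |= TSK_IOWAIT;
		}

		psi_task_change(p, clear, set);
	}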
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
@@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) | |||
91 | return delta; | 91 | return delta; |
92 | } | 92 | } |
93 | 93 | ||
94 | /* | 94 | /** |
95 | * a1 = a0 * e + a * (1 - e) | 95 | * fixed_power_int - compute: x^n, in O(log n) time |
96 | * | ||
97 | * @x: base of the power | ||
98 | * @frac_bits: fractional bits of @x | ||
99 | * @n: power to raise @x to. | ||
100 | * | ||
101 | * By exploiting the relation between the definition of the natural power | ||
102 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
103 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
104 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
105 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
106 | * of course trivially computable in O(log_2 n), the length of our binary | ||
107 | * vector. | ||
96 | */ | 108 | */ |
97 | static unsigned long | 109 | static unsigned long |
98 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 110 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) |
99 | { | 111 | { |
100 | unsigned long newload; | 112 | unsigned long result = 1UL << frac_bits; |
113 | |||
114 | if (n) { | ||
115 | for (;;) { | ||
116 | if (n & 1) { | ||
117 | result *= x; | ||
118 | result += 1UL << (frac_bits - 1); | ||
119 | result >>= frac_bits; | ||
120 | } | ||
121 | n >>= 1; | ||
122 | if (!n) | ||
123 | break; | ||
124 | x *= x; | ||
125 | x += 1UL << (frac_bits - 1); | ||
126 | x >>= frac_bits; | ||
127 | } | ||
128 | } | ||
101 | 129 | ||
102 | newload = load * exp + active * (FIXED_1 - exp); | 130 | return result; |
103 | if (active >= load) | 131 | } |
104 | newload += FIXED_1-1; | ||
105 | 132 | ||
106 | return newload / FIXED_1; | 133 | /* |
134 | * a1 = a0 * e + a * (1 - e) | ||
135 | * | ||
136 | * a2 = a1 * e + a * (1 - e) | ||
137 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
138 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
139 | * | ||
140 | * a3 = a2 * e + a * (1 - e) | ||
141 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
142 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
143 | * | ||
144 | * ... | ||
145 | * | ||
146 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
147 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
148 | * = a0 * e^n + a * (1 - e^n) | ||
149 | * | ||
150 | * [1] application of the geometric series: | ||
151 | * | ||
152 | * n 1 - x^(n+1) | ||
153 | * S_n := \Sum x^i = ------------- | ||
154 | * i=0 1 - x | ||
155 | */ | ||
156 | unsigned long | ||
157 | calc_load_n(unsigned long load, unsigned long exp, | ||
158 | unsigned long active, unsigned int n) | ||
159 | { | ||
160 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
107 | } | 161 | } |
108 | 162 | ||
109 | #ifdef CONFIG_NO_HZ_COMMON | 163 | #ifdef CONFIG_NO_HZ_COMMON |
@@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void) | |||
225 | return delta; | 279 | return delta; |
226 | } | 280 | } |
227 | 281 | ||
228 | /** | ||
229 | * fixed_power_int - compute: x^n, in O(log n) time | ||
230 | * | ||
231 | * @x: base of the power | ||
232 | * @frac_bits: fractional bits of @x | ||
233 | * @n: power to raise @x to. | ||
234 | * | ||
235 | * By exploiting the relation between the definition of the natural power | ||
236 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
237 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
238 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
239 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
240 | * of course trivially computable in O(log_2 n), the length of our binary | ||
241 | * vector. | ||
242 | */ | ||
243 | static unsigned long | ||
244 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
245 | { | ||
246 | unsigned long result = 1UL << frac_bits; | ||
247 | |||
248 | if (n) { | ||
249 | for (;;) { | ||
250 | if (n & 1) { | ||
251 | result *= x; | ||
252 | result += 1UL << (frac_bits - 1); | ||
253 | result >>= frac_bits; | ||
254 | } | ||
255 | n >>= 1; | ||
256 | if (!n) | ||
257 | break; | ||
258 | x *= x; | ||
259 | x += 1UL << (frac_bits - 1); | ||
260 | x >>= frac_bits; | ||
261 | } | ||
262 | } | ||
263 | |||
264 | return result; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * a1 = a0 * e + a * (1 - e) | ||
269 | * | ||
270 | * a2 = a1 * e + a * (1 - e) | ||
271 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
272 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
273 | * | ||
274 | * a3 = a2 * e + a * (1 - e) | ||
275 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
276 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
277 | * | ||
278 | * ... | ||
279 | * | ||
280 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
281 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
282 | * = a0 * e^n + a * (1 - e^n) | ||
283 | * | ||
284 | * [1] application of the geometric series: | ||
285 | * | ||
286 | * n 1 - x^(n+1) | ||
287 | * S_n := \Sum x^i = ------------- | ||
288 | * i=0 1 - x | ||
289 | */ | ||
290 | static unsigned long | ||
291 | calc_load_n(unsigned long load, unsigned long exp, | ||
292 | unsigned long active, unsigned int n) | ||
293 | { | ||
294 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
295 | } | ||
296 | |||
297 | /* | 282 | /* |
298 | * NO_HZ can leave us missing all per-CPU ticks calling | 283 | * NO_HZ can leave us missing all per-CPU ticks calling |
299 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into | 284 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
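calc_load_n() catches up on n missed updates in one step by raising the decay factor to the n-th power with fixed_power_int(). A worked example using the 10-second PSI constant defined in the psi.c hunk below (EXP_10s == 1677) and FIXED_1 == 2048: three missed, completely idle 2-second periods decay an average as

	calc_load_n(avg, 1677, 0, 3)
		= calc_load(avg, fixed_power_int(1677, 11, 3), 0)
		= calc_load(avg, 1124, 0)	/* 1677^3 / 2048^2, rounded */
		≈ avg * 1124 / 2048		/* about 55% of the old value */

which is the same result as applying calc_load(avg, 1677, 0) three times in succession.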
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..7cdecfc010af --- /dev/null +++ b/kernel/sched/psi.c | |||
@@ -0,0 +1,759 @@ | |||
1 | /* | ||
2 | * Pressure stall information for CPU, memory and IO | ||
3 | * | ||
4 | * Copyright (c) 2018 Facebook, Inc. | ||
5 | * Author: Johannes Weiner <hannes@cmpxchg.org> | ||
6 | * | ||
7 | * When CPU, memory and IO are contended, tasks experience delays that | ||
8 | * reduce throughput and introduce latencies into the workload. Memory | ||
9 | * and IO contention, in addition, can cause a full loss of forward | ||
10 | * progress in which the CPU goes idle. | ||
11 | * | ||
12 | * This code aggregates individual task delays into resource pressure | ||
13 | * metrics that indicate problems with both workload health and | ||
14 | * resource utilization. | ||
15 | * | ||
16 | * Model | ||
17 | * | ||
18 | * The time in which a task can execute on a CPU is our baseline for | ||
19 | * productivity. Pressure expresses the amount of time in which this | ||
20 | * potential cannot be realized due to resource contention. | ||
21 | * | ||
22 | * This concept of productivity has two components: the workload and | ||
23 | * the CPU. To measure the impact of pressure on both, we define two | ||
24 | * contention states for a resource: SOME and FULL. | ||
25 | * | ||
26 | * In the SOME state of a given resource, one or more tasks are | ||
27 | * delayed on that resource. This affects the workload's ability to | ||
28 | * perform work, but the CPU may still be executing other tasks. | ||
29 | * | ||
30 | * In the FULL state of a given resource, all non-idle tasks are | ||
31 | * delayed on that resource such that nobody is advancing and the CPU | ||
32 | * goes idle. This leaves both workload and CPU unproductive. | ||
33 | * | ||
34 | * (Naturally, the FULL state doesn't exist for the CPU resource.) | ||
35 | * | ||
36 | * SOME = nr_delayed_tasks != 0 | ||
37 | * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 | ||
38 | * | ||
39 | * The percentage of wallclock time spent in those compound stall | ||
40 | * states gives pressure numbers between 0 and 100 for each resource, | ||
41 | * where the SOME percentage indicates workload slowdowns and the FULL | ||
42 | * percentage indicates reduced CPU utilization: | ||
43 | * | ||
44 | * %SOME = time(SOME) / period | ||
45 | * %FULL = time(FULL) / period | ||
46 | * | ||
47 | * Multiple CPUs | ||
48 | * | ||
49 | * The more tasks and available CPUs there are, the more work can be | ||
50 | * performed concurrently. This means that the potential that can go | ||
51 | * unrealized due to resource contention *also* scales with non-idle | ||
52 | * tasks and CPUs. | ||
53 | * | ||
54 | * Consider a scenario where 257 number crunching tasks are trying to | ||
55 | * run concurrently on 256 CPUs. If we simply aggregated the task | ||
56 | * states, we would have to conclude a CPU SOME pressure number of | ||
57 | * 100%, since *somebody* is waiting on a runqueue at all | ||
58 | * times. However, that is clearly not the amount of contention the | ||
59 | * workload is experiencing: only one out of 256 possible execution | ||
60 | * threads will be contended at any given time, or about 0.4%. | ||
61 | * | ||
62 | * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any | ||
63 | * given time *one* of the tasks is delayed due to a lack of memory. | ||
64 | * Again, looking purely at the task state would yield a memory FULL | ||
65 | * pressure number of 0%, since *somebody* is always making forward | ||
66 | * progress. But again this wouldn't capture the amount of execution | ||
67 | * potential lost, which is 1 out of 4 CPUs, or 25%. | ||
68 | * | ||
69 | * To calculate wasted potential (pressure) with multiple processors, | ||
70 | * we have to base our calculation on the number of non-idle tasks in | ||
71 | * conjunction with the number of available CPUs, which is the number | ||
72 | * of potential execution threads. SOME becomes then the proportion of | ||
73 | * delayed tasks to possible threads, and FULL is the share of possible | ||
74 | * threads that are unproductive due to delays: | ||
75 | * | ||
76 | * threads = min(nr_nonidle_tasks, nr_cpus) | ||
77 | * SOME = min(nr_delayed_tasks / threads, 1) | ||
78 | * FULL = (threads - min(nr_running_tasks, threads)) / threads | ||
79 | * | ||
80 | * For the 257 number crunchers on 256 CPUs, this yields: | ||
81 | * | ||
82 | * threads = min(257, 256) | ||
83 | * SOME = min(1 / 256, 1) = 0.4% | ||
84 | * FULL = (256 - min(257, 256)) / 256 = 0% | ||
85 | * | ||
86 | * For the 1 out of 4 memory-delayed tasks, this yields: | ||
87 | * | ||
88 | * threads = min(4, 4) | ||
89 | * SOME = min(1 / 4, 1) = 25% | ||
90 | * FULL = (4 - min(3, 4)) / 4 = 25% | ||
91 | * | ||
92 | * [ Substitute nr_cpus with 1, and you can see that it's a natural | ||
93 | * extension of the single-CPU model. ] | ||
94 | * | ||
95 | * Implementation | ||
96 | * | ||
97 | * To assess the precise time spent in each such state, we would have | ||
98 | * to freeze the system on task changes and start/stop the state | ||
99 | * clocks accordingly. Obviously that doesn't scale in practice. | ||
100 | * | ||
101 | * Because the scheduler aims to distribute the compute load evenly | ||
102 | * among the available CPUs, we can track task state locally to each | ||
103 | * CPU and, at much lower frequency, extrapolate the global state for | ||
104 | * the cumulative stall times and the running averages. | ||
105 | * | ||
106 | * For each runqueue, we track: | ||
107 | * | ||
108 | * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) | ||
109 | * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) | ||
110 | * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) | ||
111 | * | ||
112 | * and then periodically aggregate: | ||
113 | * | ||
114 | * tNONIDLE = sum(tNONIDLE[i]) | ||
115 | * | ||
116 | * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE | ||
117 | * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE | ||
118 | * | ||
119 | * %SOME = tSOME / period | ||
120 | * %FULL = tFULL / period | ||
121 | * | ||
122 | * This gives us an approximation of pressure that is practical | ||
123 | * cost-wise, yet way more sensitive and accurate than periodic | ||
124 | * sampling of the aggregate task states would be. | ||
125 | */ | ||
126 | |||
127 | #include <linux/sched/loadavg.h> | ||
128 | #include <linux/seq_file.h> | ||
129 | #include <linux/proc_fs.h> | ||
130 | #include <linux/seqlock.h> | ||
131 | #include <linux/cgroup.h> | ||
132 | #include <linux/module.h> | ||
133 | #include <linux/sched.h> | ||
134 | #include <linux/psi.h> | ||
135 | #include "sched.h" | ||
136 | |||
137 | static int psi_bug __read_mostly; | ||
138 | |||
139 | bool psi_disabled __read_mostly; | ||
140 | core_param(psi_disabled, psi_disabled, bool, 0644); | ||
141 | |||
142 | /* Running averages - we need to be higher-res than loadavg */ | ||
143 | #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ | ||
144 | #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ | ||
145 | #define EXP_60s 1981 /* 1/exp(2s/60s) */ | ||
146 | #define EXP_300s 2034 /* 1/exp(2s/300s) */ | ||
147 | |||
148 | /* Sampling frequency in nanoseconds */ | ||
149 | static u64 psi_period __read_mostly; | ||
150 | |||
151 | /* System-level pressure and stall tracking */ | ||
152 | static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); | ||
153 | static struct psi_group psi_system = { | ||
154 | .pcpu = &system_group_pcpu, | ||
155 | }; | ||
156 | |||
157 | static void psi_update_work(struct work_struct *work); | ||
158 | |||
159 | static void group_init(struct psi_group *group) | ||
160 | { | ||
161 | int cpu; | ||
162 | |||
163 | for_each_possible_cpu(cpu) | ||
164 | seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); | ||
165 | group->next_update = sched_clock() + psi_period; | ||
166 | INIT_DELAYED_WORK(&group->clock_work, psi_update_work); | ||
167 | mutex_init(&group->stat_lock); | ||
168 | } | ||
169 | |||
170 | void __init psi_init(void) | ||
171 | { | ||
172 | if (psi_disabled) | ||
173 | return; | ||
174 | |||
175 | psi_period = jiffies_to_nsecs(PSI_FREQ); | ||
176 | group_init(&psi_system); | ||
177 | } | ||
178 | |||
179 | static bool test_state(unsigned int *tasks, enum psi_states state) | ||
180 | { | ||
181 | switch (state) { | ||
182 | case PSI_IO_SOME: | ||
183 | return tasks[NR_IOWAIT]; | ||
184 | case PSI_IO_FULL: | ||
185 | return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; | ||
186 | case PSI_MEM_SOME: | ||
187 | return tasks[NR_MEMSTALL]; | ||
188 | case PSI_MEM_FULL: | ||
189 | return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; | ||
190 | case PSI_CPU_SOME: | ||
191 | return tasks[NR_RUNNING] > 1; | ||
192 | case PSI_NONIDLE: | ||
193 | return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || | ||
194 | tasks[NR_RUNNING]; | ||
195 | default: | ||
196 | return false; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | static void get_recent_times(struct psi_group *group, int cpu, u32 *times) | ||
201 | { | ||
202 | struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); | ||
203 | unsigned int tasks[NR_PSI_TASK_COUNTS]; | ||
204 | u64 now, state_start; | ||
205 | unsigned int seq; | ||
206 | int s; | ||
207 | |||
208 | /* Snapshot a coherent view of the CPU state */ | ||
209 | do { | ||
210 | seq = read_seqcount_begin(&groupc->seq); | ||
211 | now = cpu_clock(cpu); | ||
212 | memcpy(times, groupc->times, sizeof(groupc->times)); | ||
213 | memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); | ||
214 | state_start = groupc->state_start; | ||
215 | } while (read_seqcount_retry(&groupc->seq, seq)); | ||
216 | |||
217 | /* Calculate state time deltas against the previous snapshot */ | ||
218 | for (s = 0; s < NR_PSI_STATES; s++) { | ||
219 | u32 delta; | ||
220 | /* | ||
221 | * In addition to already concluded states, we also | ||
222 | * incorporate currently active states on the CPU, | ||
223 | * since states may last for many sampling periods. | ||
224 | * | ||
225 | * This way we keep our delta sampling buckets small | ||
226 | * (u32) and our reported pressure close to what's | ||
227 | * actually happening. | ||
228 | */ | ||
229 | if (test_state(tasks, s)) | ||
230 | times[s] += now - state_start; | ||
231 | |||
232 | delta = times[s] - groupc->times_prev[s]; | ||
233 | groupc->times_prev[s] = times[s]; | ||
234 | |||
235 | times[s] = delta; | ||
236 | } | ||
237 | } | ||
238 | |||
239 | static void calc_avgs(unsigned long avg[3], int missed_periods, | ||
240 | u64 time, u64 period) | ||
241 | { | ||
242 | unsigned long pct; | ||
243 | |||
244 | /* Fill in zeroes for periods of no activity */ | ||
245 | if (missed_periods) { | ||
246 | avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); | ||
247 | avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); | ||
248 | avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); | ||
249 | } | ||
250 | |||
251 | /* Sample the most recent active period */ | ||
252 | pct = div_u64(time * 100, period); | ||
253 | pct *= FIXED_1; | ||
254 | avg[0] = calc_load(avg[0], EXP_10s, pct); | ||
255 | avg[1] = calc_load(avg[1], EXP_60s, pct); | ||
256 | avg[2] = calc_load(avg[2], EXP_300s, pct); | ||
257 | } | ||
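As a hedged aside on the arithmetic: the EXP_* constants are exp(-2s/window) scaled by FIXED_1 (2048, the fixed-point 1.0 used by the loadavg code), so on each 2-second period the old average keeps that fraction and the newest sample contributes the remainder; missed_periods simply feeds zeroes through calc_load_n() for the periods that saw no activity. A minimal stand-alone sketch of the avg10 update, with calc_load() restated in simplified form (the kernel helper also rounds):

    /* Hedged model of the avg10 exponential average; calc_load() is simplified. */
    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)        /* 2048 == 1.0 */
    #define EXP_10s 1677                   /* ~exp(-2s/10s) * FIXED_1, as in the patch */

    static unsigned long calc_load(unsigned long avg, unsigned long exp,
                                   unsigned long active)
    {
        return (avg * exp + active * (FIXED_1 - exp)) >> FSHIFT;
    }

    int main(void)
    {
        unsigned long avg = 0;
        unsigned long pct = 50 * FIXED_1;  /* stalled 50% of every 2s period */
        int i;

        for (i = 1; i <= 5; i++) {
            avg = calc_load(avg, EXP_10s, pct);
            printf("period %d: avg10 = %lu.%02lu%%\n", i,
                   avg >> FSHIFT, ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
    }

Run for a workload stalled half of every period, this climbs toward 50% from below, which is the intended smoothing behaviour of the three decaying averages.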
258 | |||
259 | static bool update_stats(struct psi_group *group) | ||
260 | { | ||
261 | u64 deltas[NR_PSI_STATES - 1] = { 0, }; | ||
262 | unsigned long missed_periods = 0; | ||
263 | unsigned long nonidle_total = 0; | ||
264 | u64 now, expires, period; | ||
265 | int cpu; | ||
266 | int s; | ||
267 | |||
268 | mutex_lock(&group->stat_lock); | ||
269 | |||
270 | /* | ||
271 | * Collect the per-cpu time buckets and average them into a | ||
272 | * single time sample that is normalized to wallclock time. | ||
273 | * | ||
274 | * For averaging, each CPU is weighted by its non-idle time in | ||
275 | * the sampling period. This eliminates artifacts from uneven | ||
276 | * loading, or even entirely idle CPUs. | ||
277 | */ | ||
278 | for_each_possible_cpu(cpu) { | ||
279 | u32 times[NR_PSI_STATES]; | ||
280 | u32 nonidle; | ||
281 | |||
282 | get_recent_times(group, cpu, times); | ||
283 | |||
284 | nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); | ||
285 | nonidle_total += nonidle; | ||
286 | |||
287 | for (s = 0; s < PSI_NONIDLE; s++) | ||
288 | deltas[s] += (u64)times[s] * nonidle; | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * Integrate the sample into the running statistics that are | ||
293 | * reported to userspace: the cumulative stall times and the | ||
294 | * decaying averages. | ||
295 | * | ||
296 | * Pressure percentages are sampled at PSI_FREQ. We might be | ||
297 | * called more often when the user polls more frequently than | ||
298 | * that; we might be called less often when there is no task | ||
299 | * activity, thus no data, and clock ticks are sporadic. The | ||
300 | * below handles both. | ||
301 | */ | ||
302 | |||
303 | /* total= */ | ||
304 | for (s = 0; s < NR_PSI_STATES - 1; s++) | ||
305 | group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); | ||
306 | |||
307 | /* avgX= */ | ||
308 | now = sched_clock(); | ||
309 | expires = group->next_update; | ||
310 | if (now < expires) | ||
311 | goto out; | ||
312 | if (now - expires > psi_period) | ||
313 | missed_periods = div_u64(now - expires, psi_period); | ||
314 | |||
315 | /* | ||
316 | * The periodic clock tick can get delayed for various | ||
317 | * reasons, especially on loaded systems. To avoid clock | ||
318 | * drift, we schedule the clock in fixed psi_period intervals. | ||
319 | * But the deltas we sample out of the per-cpu buckets above | ||
320 | * are based on the actual time elapsing between clock ticks. | ||
321 | */ | ||
322 | group->next_update = expires + ((1 + missed_periods) * psi_period); | ||
323 | period = now - (group->last_update + (missed_periods * psi_period)); | ||
324 | group->last_update = now; | ||
325 | |||
326 | for (s = 0; s < NR_PSI_STATES - 1; s++) { | ||
327 | u32 sample; | ||
328 | |||
329 | sample = group->total[s] - group->total_prev[s]; | ||
330 | /* | ||
331 | * Due to the lockless sampling of the time buckets, | ||
332 | * recorded time deltas can slip into the next period, | ||
333 | * which under full pressure can result in samples in | ||
334 | * excess of the period length. | ||
335 | * | ||
336 | * We don't want to report nonsensical pressures in | ||
337 | * excess of 100%, nor do we want to drop such events | ||
338 | * on the floor. Instead we punt any overage into the | ||
339 | * future until pressure subsides. By doing this we | ||
340 | * don't underreport the occurring pressure curve, we | ||
341 | * just report it delayed by one period length. | ||
342 | * | ||
343 | * The error isn't cumulative. As soon as another | ||
344 | * delta slips from a period P to P+1, by definition | ||
345 | * it frees up its time T in P. | ||
346 | */ | ||
347 | if (sample > period) | ||
348 | sample = period; | ||
349 | group->total_prev[s] += sample; | ||
350 | calc_avgs(group->avg[s], missed_periods, sample, period); | ||
351 | } | ||
352 | out: | ||
353 | mutex_unlock(&group->stat_lock); | ||
354 | return nonidle_total; | ||
355 | } | ||
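A hedged worked example of the weighting and clamping above: over one 2s period, suppose CPU0 was non-idle for 1.5s, 0.6s of which were memory-stalled (SOME), CPU1 was non-idle for 0.5s with no stall, and CPU2 stayed idle. The amount added to total[PSI_MEM_SOME] is (0.6 * 1.5 + 0 * 0.5) / (1.5 + 0.5) = 0.45s: the idle CPU carries zero weight and does not dilute the aggregate, while the briefly busy CPU1 dilutes it only in proportion to its non-idle time. The later clamp of sample to period then guarantees that no single reported period exceeds 100%, with any overage deferred to the next period exactly as the comment explains.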
356 | |||
357 | static void psi_update_work(struct work_struct *work) | ||
358 | { | ||
359 | struct delayed_work *dwork; | ||
360 | struct psi_group *group; | ||
361 | bool nonidle; | ||
362 | |||
363 | dwork = to_delayed_work(work); | ||
364 | group = container_of(dwork, struct psi_group, clock_work); | ||
365 | |||
366 | /* | ||
367 | * If there is task activity, periodically fold the per-cpu | ||
368 | * times and feed samples into the running averages. If things | ||
369 | * are idle and there is no data to process, stop the clock. | ||
370 | * Once restarted, we'll catch up the running averages in one | ||
371 | * go - see calc_avgs() and missed_periods. | ||
372 | */ | ||
373 | |||
374 | nonidle = update_stats(group); | ||
375 | |||
376 | if (nonidle) { | ||
377 | unsigned long delay = 0; | ||
378 | u64 now; | ||
379 | |||
380 | now = sched_clock(); | ||
381 | if (group->next_update > now) | ||
382 | delay = nsecs_to_jiffies(group->next_update - now) + 1; | ||
383 | schedule_delayed_work(dwork, delay); | ||
384 | } | ||
385 | } | ||

386 | |||
387 | static void record_times(struct psi_group_cpu *groupc, int cpu, | ||
388 | bool memstall_tick) | ||
389 | { | ||
390 | u32 delta; | ||
391 | u64 now; | ||
392 | |||
393 | now = cpu_clock(cpu); | ||
394 | delta = now - groupc->state_start; | ||
395 | groupc->state_start = now; | ||
396 | |||
397 | if (test_state(groupc->tasks, PSI_IO_SOME)) { | ||
398 | groupc->times[PSI_IO_SOME] += delta; | ||
399 | if (test_state(groupc->tasks, PSI_IO_FULL)) | ||
400 | groupc->times[PSI_IO_FULL] += delta; | ||
401 | } | ||
402 | |||
403 | if (test_state(groupc->tasks, PSI_MEM_SOME)) { | ||
404 | groupc->times[PSI_MEM_SOME] += delta; | ||
405 | if (test_state(groupc->tasks, PSI_MEM_FULL)) | ||
406 | groupc->times[PSI_MEM_FULL] += delta; | ||
407 | else if (memstall_tick) { | ||
408 | u32 sample; | ||
409 | /* | ||
410 | * Since we care about lost potential, a | ||
411 | * memstall is FULL when there are no other | ||
412 | * working tasks, but also when the CPU is | ||
413 | * actively reclaiming and nothing productive | ||
414 | * could run even if it were runnable. | ||
415 | * | ||
416 | * When the timer tick sees a reclaiming CPU, | ||
417 | * regardless of runnable tasks, sample a FULL | ||
418 | * tick (or less if it hasn't been a full tick | ||
419 | * since the last state change). | ||
420 | */ | ||
421 | sample = min(delta, (u32)jiffies_to_nsecs(1)); | ||
422 | groupc->times[PSI_MEM_FULL] += sample; | ||
423 | } | ||
424 | } | ||
425 | |||
426 | if (test_state(groupc->tasks, PSI_CPU_SOME)) | ||
427 | groupc->times[PSI_CPU_SOME] += delta; | ||
428 | |||
429 | if (test_state(groupc->tasks, PSI_NONIDLE)) | ||
430 | groupc->times[PSI_NONIDLE] += delta; | ||
431 | } | ||
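For scale (hedged, this depends on CONFIG_HZ): jiffies_to_nsecs(1) is one second divided by HZ, so with HZ=250 a timer tick that finds the CPU reclaiming adds at most 4ms of MEM FULL time, and with HZ=1000 at most 1ms, or less if the group's state changed more recently than a full tick ago.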
432 | |||
433 | static void psi_group_change(struct psi_group *group, int cpu, | ||
434 | unsigned int clear, unsigned int set) | ||
435 | { | ||
436 | struct psi_group_cpu *groupc; | ||
437 | unsigned int t, m; | ||
438 | |||
439 | groupc = per_cpu_ptr(group->pcpu, cpu); | ||
440 | |||
441 | /* | ||
442 | * First we assess the aggregate resource states this CPU's | ||
443 | * tasks have been in since the last change, and account any | ||
444 | * SOME and FULL time these may have resulted in. | ||
445 | * | ||
446 | * Then we update the task counts according to the state | ||
447 | * change requested through the @clear and @set bits. | ||
448 | */ | ||
449 | write_seqcount_begin(&groupc->seq); | ||
450 | |||
451 | record_times(groupc, cpu, false); | ||
452 | |||
453 | for (t = 0, m = clear; m; m &= ~(1 << t), t++) { | ||
454 | if (!(m & (1 << t))) | ||
455 | continue; | ||
456 | if (groupc->tasks[t] == 0 && !psi_bug) { | ||
457 | printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", | ||
458 | cpu, t, groupc->tasks[0], | ||
459 | groupc->tasks[1], groupc->tasks[2], | ||
460 | clear, set); | ||
461 | psi_bug = 1; | ||
462 | } | ||
463 | groupc->tasks[t]--; | ||
464 | } | ||
465 | |||
466 | for (t = 0; set; set &= ~(1 << t), t++) | ||
467 | if (set & (1 << t)) | ||
468 | groupc->tasks[t]++; | ||
469 | |||
470 | write_seqcount_end(&groupc->seq); | ||
471 | |||
472 | if (!delayed_work_pending(&group->clock_work)) | ||
473 | schedule_delayed_work(&group->clock_work, PSI_FREQ); | ||
474 | } | ||
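The @clear/@set walk above is a plain bit-per-counter update; a hedged stand-alone rendering (the flag values mirror the kernel's TSK_* bits only conceptually, as their definitions are not part of this hunk):

    #include <stdio.h>

    enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_COUNTS };

    /* Illustrative flag values; bit t corresponds to counter tasks[t]. */
    #define TSK_IOWAIT   (1 << NR_IOWAIT)
    #define TSK_MEMSTALL (1 << NR_MEMSTALL)
    #define TSK_RUNNING  (1 << NR_RUNNING)

    static void group_change(unsigned int *tasks, unsigned int clear,
                             unsigned int set)
    {
        unsigned int t, m;

        for (t = 0, m = clear; m; m &= ~(1 << t), t++)
            if (m & (1 << t))
                tasks[t]--;

        for (t = 0; set; set &= ~(1 << t), t++)
            if (set & (1 << t))
                tasks[t]++;
    }

    int main(void)
    {
        unsigned int tasks[NR_COUNTS] = { 0, 0, 1 };   /* one runnable task */

        /* The task blocks on I/O: clear RUNNING, set IOWAIT. */
        group_change(tasks, TSK_RUNNING, TSK_IOWAIT);
        printf("iowait=%u memstall=%u running=%u\n",
               tasks[NR_IOWAIT], tasks[NR_MEMSTALL], tasks[NR_RUNNING]);
        return 0;
    }

In the real function, the underflow check before the decrement is what the "psi: task underflow!" message guards: clearing a bit that was never set would wrap the unsigned counter.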
475 | |||
476 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) | ||
477 | { | ||
478 | #ifdef CONFIG_CGROUPS | ||
479 | struct cgroup *cgroup = NULL; | ||
480 | |||
481 | if (!*iter) | ||
482 | cgroup = task->cgroups->dfl_cgrp; | ||
483 | else if (*iter == &psi_system) | ||
484 | return NULL; | ||
485 | else | ||
486 | cgroup = cgroup_parent(*iter); | ||
487 | |||
488 | if (cgroup && cgroup_parent(cgroup)) { | ||
489 | *iter = cgroup; | ||
490 | return cgroup_psi(cgroup); | ||
491 | } | ||
492 | #else | ||
493 | if (*iter) | ||
494 | return NULL; | ||
495 | #endif | ||
496 | *iter = &psi_system; | ||
497 | return &psi_system; | ||
498 | } | ||
499 | |||
500 | void psi_task_change(struct task_struct *task, int clear, int set) | ||
501 | { | ||
502 | int cpu = task_cpu(task); | ||
503 | struct psi_group *group; | ||
504 | void *iter = NULL; | ||
505 | |||
506 | if (!task->pid) | ||
507 | return; | ||
508 | |||
509 | if (((task->psi_flags & set) || | ||
510 | (task->psi_flags & clear) != clear) && | ||
511 | !psi_bug) { | ||
512 | printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", | ||
513 | task->pid, task->comm, cpu, | ||
514 | task->psi_flags, clear, set); | ||
515 | psi_bug = 1; | ||
516 | } | ||
517 | |||
518 | task->psi_flags &= ~clear; | ||
519 | task->psi_flags |= set; | ||
520 | |||
521 | while ((group = iterate_groups(task, &iter))) | ||
522 | psi_group_change(group, cpu, clear, set); | ||
523 | } | ||
524 | |||
525 | void psi_memstall_tick(struct task_struct *task, int cpu) | ||
526 | { | ||
527 | struct psi_group *group; | ||
528 | void *iter = NULL; | ||
529 | |||
530 | while ((group = iterate_groups(task, &iter))) { | ||
531 | struct psi_group_cpu *groupc; | ||
532 | |||
533 | groupc = per_cpu_ptr(group->pcpu, cpu); | ||
534 | write_seqcount_begin(&groupc->seq); | ||
535 | record_times(groupc, cpu, true); | ||
536 | write_seqcount_end(&groupc->seq); | ||
537 | } | ||
538 | } | ||
539 | |||
540 | /** | ||
541 | * psi_memstall_enter - mark the beginning of a memory stall section | ||
542 | * @flags: flags to handle nested sections | ||
543 | * | ||
544 | * Marks the calling task as being stalled due to a lack of memory, | ||
545 | * such as waiting for a refault or performing reclaim. | ||
546 | */ | ||
547 | void psi_memstall_enter(unsigned long *flags) | ||
548 | { | ||
549 | struct rq_flags rf; | ||
550 | struct rq *rq; | ||
551 | |||
552 | if (psi_disabled) | ||
553 | return; | ||
554 | |||
555 | *flags = current->flags & PF_MEMSTALL; | ||
556 | if (*flags) | ||
557 | return; | ||
558 | /* | ||
559 | * PF_MEMSTALL setting & accounting needs to be atomic wrt | ||
560 | * changes to the task's scheduling state, otherwise we can | ||
561 | * race with CPU migration. | ||
562 | */ | ||
563 | rq = this_rq_lock_irq(&rf); | ||
564 | |||
565 | current->flags |= PF_MEMSTALL; | ||
566 | psi_task_change(current, 0, TSK_MEMSTALL); | ||
567 | |||
568 | rq_unlock_irq(rq, &rf); | ||
569 | } | ||
570 | |||
571 | /** | ||
572 | * psi_memstall_leave - mark the end of a memory stall section | ||
573 | * @flags: flags to handle nested memdelay sections | ||
574 | * | ||
575 | * Marks the calling task as no longer stalled due to lack of memory. | ||
576 | */ | ||
577 | void psi_memstall_leave(unsigned long *flags) | ||
578 | { | ||
579 | struct rq_flags rf; | ||
580 | struct rq *rq; | ||
581 | |||
582 | if (psi_disabled) | ||
583 | return; | ||
584 | |||
585 | if (*flags) | ||
586 | return; | ||
587 | /* | ||
588 | * PF_MEMSTALL clearing & accounting needs to be atomic wrt | ||
589 | * changes to the task's scheduling state, otherwise we could | ||
590 | * race with CPU migration. | ||
591 | */ | ||
592 | rq = this_rq_lock_irq(&rf); | ||
593 | |||
594 | current->flags &= ~PF_MEMSTALL; | ||
595 | psi_task_change(current, TSK_MEMSTALL, 0); | ||
596 | |||
597 | rq_unlock_irq(rq, &rf); | ||
598 | } | ||
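Call sites simply bracket the stalling work with the pair above, passing the same flags word so nested sections stay balanced; a minimal sketch of the calling convention (the function and the work it wraps are hypothetical placeholders, but the pattern matches the mm/compaction.c and mm/filemap.c hunks later in this patch):

    /* Hypothetical caller; only the enter/leave bracketing is the point. */
    static void example_reclaim_path(void)
    {
        unsigned long pflags;

        psi_memstall_enter(&pflags);   /* no-op if already inside a stall section */
        do_reclaim_work();             /* placeholder for the actual stalling work */
        psi_memstall_leave(&pflags);
    }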
599 | |||
600 | #ifdef CONFIG_CGROUPS | ||
601 | int psi_cgroup_alloc(struct cgroup *cgroup) | ||
602 | { | ||
603 | if (psi_disabled) | ||
604 | return 0; | ||
605 | |||
606 | cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); | ||
607 | if (!cgroup->psi.pcpu) | ||
608 | return -ENOMEM; | ||
609 | group_init(&cgroup->psi); | ||
610 | return 0; | ||
611 | } | ||
612 | |||
613 | void psi_cgroup_free(struct cgroup *cgroup) | ||
614 | { | ||
615 | if (psi_disabled) | ||
616 | return; | ||
617 | |||
618 | cancel_delayed_work_sync(&cgroup->psi.clock_work); | ||
619 | free_percpu(cgroup->psi.pcpu); | ||
620 | } | ||
621 | |||
622 | /** | ||
623 | * cgroup_move_task - move task to a different cgroup | ||
624 | * @task: the task | ||
625 | * @to: the target css_set | ||
626 | * | ||
627 | * Move task to a new cgroup and safely migrate its associated stall | ||
628 | * state between the different groups. | ||
629 | * | ||
630 | * This function acquires the task's rq lock to lock out concurrent | ||
631 | * changes to the task's scheduling state and - in case the task is | ||
632 | * running - concurrent changes to its stall state. | ||
633 | */ | ||
634 | void cgroup_move_task(struct task_struct *task, struct css_set *to) | ||
635 | { | ||
636 | bool move_psi = !psi_disabled; | ||
637 | unsigned int task_flags = 0; | ||
638 | struct rq_flags rf; | ||
639 | struct rq *rq; | ||
640 | |||
641 | if (move_psi) { | ||
642 | rq = task_rq_lock(task, &rf); | ||
643 | |||
644 | if (task_on_rq_queued(task)) | ||
645 | task_flags = TSK_RUNNING; | ||
646 | else if (task->in_iowait) | ||
647 | task_flags = TSK_IOWAIT; | ||
648 | |||
649 | if (task->flags & PF_MEMSTALL) | ||
650 | task_flags |= TSK_MEMSTALL; | ||
651 | |||
652 | if (task_flags) | ||
653 | psi_task_change(task, task_flags, 0); | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * Lame to do this here, but the scheduler cannot be locked | ||
658 | * from the outside, so we move cgroups from inside sched/. | ||
659 | */ | ||
660 | rcu_assign_pointer(task->cgroups, to); | ||
661 | |||
662 | if (move_psi) { | ||
663 | if (task_flags) | ||
664 | psi_task_change(task, 0, task_flags); | ||
665 | |||
666 | task_rq_unlock(rq, task, &rf); | ||
667 | } | ||
668 | } | ||
669 | #endif /* CONFIG_CGROUPS */ | ||
670 | |||
671 | int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | ||
672 | { | ||
673 | int full; | ||
674 | |||
675 | if (psi_disabled) | ||
676 | return -EOPNOTSUPP; | ||
677 | |||
678 | update_stats(group); | ||
679 | |||
680 | for (full = 0; full < 2 - (res == PSI_CPU); full++) { | ||
681 | unsigned long avg[3]; | ||
682 | u64 total; | ||
683 | int w; | ||
684 | |||
685 | for (w = 0; w < 3; w++) | ||
686 | avg[w] = group->avg[res * 2 + full][w]; | ||
687 | total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); | ||
688 | |||
689 | seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", | ||
690 | full ? "full" : "some", | ||
691 | LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), | ||
692 | LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), | ||
693 | LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), | ||
694 | total); | ||
695 | } | ||
696 | |||
697 | return 0; | ||
698 | } | ||
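For io and memory this prints a "some" and a "full" line; for cpu only "some", since the loop bound 2 - (res == PSI_CPU) is 1 there. LOAD_INT()/LOAD_FRAC() turn the fixed-point averages back into two-decimal percentages, and total is the cumulative stall time in microseconds. Illustrative output (values made up):

    some avg10=0.12 avg60=0.08 avg300=0.02 total=271562
    full avg10=0.00 avg60=0.01 avg300=0.00 total=58341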
699 | |||
700 | static int psi_io_show(struct seq_file *m, void *v) | ||
701 | { | ||
702 | return psi_show(m, &psi_system, PSI_IO); | ||
703 | } | ||
704 | |||
705 | static int psi_memory_show(struct seq_file *m, void *v) | ||
706 | { | ||
707 | return psi_show(m, &psi_system, PSI_MEM); | ||
708 | } | ||
709 | |||
710 | static int psi_cpu_show(struct seq_file *m, void *v) | ||
711 | { | ||
712 | return psi_show(m, &psi_system, PSI_CPU); | ||
713 | } | ||
714 | |||
715 | static int psi_io_open(struct inode *inode, struct file *file) | ||
716 | { | ||
717 | return single_open(file, psi_io_show, NULL); | ||
718 | } | ||
719 | |||
720 | static int psi_memory_open(struct inode *inode, struct file *file) | ||
721 | { | ||
722 | return single_open(file, psi_memory_show, NULL); | ||
723 | } | ||
724 | |||
725 | static int psi_cpu_open(struct inode *inode, struct file *file) | ||
726 | { | ||
727 | return single_open(file, psi_cpu_show, NULL); | ||
728 | } | ||
729 | |||
730 | static const struct file_operations psi_io_fops = { | ||
731 | .open = psi_io_open, | ||
732 | .read = seq_read, | ||
733 | .llseek = seq_lseek, | ||
734 | .release = single_release, | ||
735 | }; | ||
736 | |||
737 | static const struct file_operations psi_memory_fops = { | ||
738 | .open = psi_memory_open, | ||
739 | .read = seq_read, | ||
740 | .llseek = seq_lseek, | ||
741 | .release = single_release, | ||
742 | }; | ||
743 | |||
744 | static const struct file_operations psi_cpu_fops = { | ||
745 | .open = psi_cpu_open, | ||
746 | .read = seq_read, | ||
747 | .llseek = seq_lseek, | ||
748 | .release = single_release, | ||
749 | }; | ||
750 | |||
751 | static int __init psi_proc_init(void) | ||
752 | { | ||
753 | proc_mkdir("pressure", NULL); | ||
754 | proc_create("pressure/io", 0, NULL, &psi_io_fops); | ||
755 | proc_create("pressure/memory", 0, NULL, &psi_memory_fops); | ||
756 | proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); | ||
757 | return 0; | ||
758 | } | ||
759 | module_init(psi_proc_init); | ||
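A hedged user-space example of consuming the files registered above (assumes a kernel with this patch and psi not disabled; error handling is kept minimal). Each read goes through psi_show() and therefore update_stats(), so the numbers are current at read time.

    /* Read the memory pressure file created by psi_proc_init() above. */
    #include <stdio.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/pressure/memory", "r");

        if (!f) {
            perror("/proc/pressure/memory");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);   /* "some ..." and "full ..." lines */
        fclose(f);
        return 0;
    }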
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8c007713b3b..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/proc_fs.h> | 54 | #include <linux/proc_fs.h> |
55 | #include <linux/prefetch.h> | 55 | #include <linux/prefetch.h> |
56 | #include <linux/profile.h> | 56 | #include <linux/profile.h> |
57 | #include <linux/psi.h> | ||
57 | #include <linux/rcupdate_wait.h> | 58 | #include <linux/rcupdate_wait.h> |
58 | #include <linux/security.h> | 59 | #include <linux/security.h> |
59 | #include <linux/stop_machine.h> | 60 | #include <linux/stop_machine.h> |
@@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); | |||
319 | #ifdef CONFIG_CGROUP_SCHED | 320 | #ifdef CONFIG_CGROUP_SCHED |
320 | 321 | ||
321 | #include <linux/cgroup.h> | 322 | #include <linux/cgroup.h> |
323 | #include <linux/psi.h> | ||
322 | 324 | ||
323 | struct cfs_rq; | 325 | struct cfs_rq; |
324 | struct rt_rq; | 326 | struct rt_rq; |
@@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | |||
957 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 959 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
958 | #define raw_rq() raw_cpu_ptr(&runqueues) | 960 | #define raw_rq() raw_cpu_ptr(&runqueues) |
959 | 961 | ||
962 | extern void update_rq_clock(struct rq *rq); | ||
963 | |||
960 | static inline u64 __rq_clock_broken(struct rq *rq) | 964 | static inline u64 __rq_clock_broken(struct rq *rq) |
961 | { | 965 | { |
962 | return READ_ONCE(rq->clock); | 966 | return READ_ONCE(rq->clock); |
@@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) | |||
1075 | #endif | 1079 | #endif |
1076 | } | 1080 | } |
1077 | 1081 | ||
1082 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | ||
1083 | __acquires(rq->lock); | ||
1084 | |||
1085 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | ||
1086 | __acquires(p->pi_lock) | ||
1087 | __acquires(rq->lock); | ||
1088 | |||
1089 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) | ||
1090 | __releases(rq->lock) | ||
1091 | { | ||
1092 | rq_unpin_lock(rq, rf); | ||
1093 | raw_spin_unlock(&rq->lock); | ||
1094 | } | ||
1095 | |||
1096 | static inline void | ||
1097 | task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) | ||
1098 | __releases(rq->lock) | ||
1099 | __releases(p->pi_lock) | ||
1100 | { | ||
1101 | rq_unpin_lock(rq, rf); | ||
1102 | raw_spin_unlock(&rq->lock); | ||
1103 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); | ||
1104 | } | ||
1105 | |||
1106 | static inline void | ||
1107 | rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) | ||
1108 | __acquires(rq->lock) | ||
1109 | { | ||
1110 | raw_spin_lock_irqsave(&rq->lock, rf->flags); | ||
1111 | rq_pin_lock(rq, rf); | ||
1112 | } | ||
1113 | |||
1114 | static inline void | ||
1115 | rq_lock_irq(struct rq *rq, struct rq_flags *rf) | ||
1116 | __acquires(rq->lock) | ||
1117 | { | ||
1118 | raw_spin_lock_irq(&rq->lock); | ||
1119 | rq_pin_lock(rq, rf); | ||
1120 | } | ||
1121 | |||
1122 | static inline void | ||
1123 | rq_lock(struct rq *rq, struct rq_flags *rf) | ||
1124 | __acquires(rq->lock) | ||
1125 | { | ||
1126 | raw_spin_lock(&rq->lock); | ||
1127 | rq_pin_lock(rq, rf); | ||
1128 | } | ||
1129 | |||
1130 | static inline void | ||
1131 | rq_relock(struct rq *rq, struct rq_flags *rf) | ||
1132 | __acquires(rq->lock) | ||
1133 | { | ||
1134 | raw_spin_lock(&rq->lock); | ||
1135 | rq_repin_lock(rq, rf); | ||
1136 | } | ||
1137 | |||
1138 | static inline void | ||
1139 | rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) | ||
1140 | __releases(rq->lock) | ||
1141 | { | ||
1142 | rq_unpin_lock(rq, rf); | ||
1143 | raw_spin_unlock_irqrestore(&rq->lock, rf->flags); | ||
1144 | } | ||
1145 | |||
1146 | static inline void | ||
1147 | rq_unlock_irq(struct rq *rq, struct rq_flags *rf) | ||
1148 | __releases(rq->lock) | ||
1149 | { | ||
1150 | rq_unpin_lock(rq, rf); | ||
1151 | raw_spin_unlock_irq(&rq->lock); | ||
1152 | } | ||
1153 | |||
1154 | static inline void | ||
1155 | rq_unlock(struct rq *rq, struct rq_flags *rf) | ||
1156 | __releases(rq->lock) | ||
1157 | { | ||
1158 | rq_unpin_lock(rq, rf); | ||
1159 | raw_spin_unlock(&rq->lock); | ||
1160 | } | ||
1161 | |||
1162 | static inline struct rq * | ||
1163 | this_rq_lock_irq(struct rq_flags *rf) | ||
1164 | __acquires(rq->lock) | ||
1165 | { | ||
1166 | struct rq *rq; | ||
1167 | |||
1168 | local_irq_disable(); | ||
1169 | rq = this_rq(); | ||
1170 | rq_lock(rq, rf); | ||
1171 | return rq; | ||
1172 | } | ||
1173 | |||
1078 | #ifdef CONFIG_NUMA | 1174 | #ifdef CONFIG_NUMA |
1079 | enum numa_topology_type { | 1175 | enum numa_topology_type { |
1080 | NUMA_DIRECT, | 1176 | NUMA_DIRECT, |
@@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) | |||
1717 | sched_update_tick_dependency(rq); | 1813 | sched_update_tick_dependency(rq); |
1718 | } | 1814 | } |
1719 | 1815 | ||
1720 | extern void update_rq_clock(struct rq *rq); | ||
1721 | |||
1722 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1816 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
1723 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | 1817 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); |
1724 | 1818 | ||
@@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) | |||
1783 | #endif | 1877 | #endif |
1784 | #endif | 1878 | #endif |
1785 | 1879 | ||
1786 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | ||
1787 | __acquires(rq->lock); | ||
1788 | |||
1789 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | ||
1790 | __acquires(p->pi_lock) | ||
1791 | __acquires(rq->lock); | ||
1792 | |||
1793 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) | ||
1794 | __releases(rq->lock) | ||
1795 | { | ||
1796 | rq_unpin_lock(rq, rf); | ||
1797 | raw_spin_unlock(&rq->lock); | ||
1798 | } | ||
1799 | |||
1800 | static inline void | ||
1801 | task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) | ||
1802 | __releases(rq->lock) | ||
1803 | __releases(p->pi_lock) | ||
1804 | { | ||
1805 | rq_unpin_lock(rq, rf); | ||
1806 | raw_spin_unlock(&rq->lock); | ||
1807 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); | ||
1808 | } | ||
1809 | |||
1810 | static inline void | ||
1811 | rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) | ||
1812 | __acquires(rq->lock) | ||
1813 | { | ||
1814 | raw_spin_lock_irqsave(&rq->lock, rf->flags); | ||
1815 | rq_pin_lock(rq, rf); | ||
1816 | } | ||
1817 | |||
1818 | static inline void | ||
1819 | rq_lock_irq(struct rq *rq, struct rq_flags *rf) | ||
1820 | __acquires(rq->lock) | ||
1821 | { | ||
1822 | raw_spin_lock_irq(&rq->lock); | ||
1823 | rq_pin_lock(rq, rf); | ||
1824 | } | ||
1825 | |||
1826 | static inline void | ||
1827 | rq_lock(struct rq *rq, struct rq_flags *rf) | ||
1828 | __acquires(rq->lock) | ||
1829 | { | ||
1830 | raw_spin_lock(&rq->lock); | ||
1831 | rq_pin_lock(rq, rf); | ||
1832 | } | ||
1833 | |||
1834 | static inline void | ||
1835 | rq_relock(struct rq *rq, struct rq_flags *rf) | ||
1836 | __acquires(rq->lock) | ||
1837 | { | ||
1838 | raw_spin_lock(&rq->lock); | ||
1839 | rq_repin_lock(rq, rf); | ||
1840 | } | ||
1841 | |||
1842 | static inline void | ||
1843 | rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) | ||
1844 | __releases(rq->lock) | ||
1845 | { | ||
1846 | rq_unpin_lock(rq, rf); | ||
1847 | raw_spin_unlock_irqrestore(&rq->lock, rf->flags); | ||
1848 | } | ||
1849 | |||
1850 | static inline void | ||
1851 | rq_unlock_irq(struct rq *rq, struct rq_flags *rf) | ||
1852 | __releases(rq->lock) | ||
1853 | { | ||
1854 | rq_unpin_lock(rq, rf); | ||
1855 | raw_spin_unlock_irq(&rq->lock); | ||
1856 | } | ||
1857 | |||
1858 | static inline void | ||
1859 | rq_unlock(struct rq *rq, struct rq_flags *rf) | ||
1860 | __releases(rq->lock) | ||
1861 | { | ||
1862 | rq_unpin_lock(rq, rf); | ||
1863 | raw_spin_unlock(&rq->lock); | ||
1864 | } | ||
1865 | |||
1866 | #ifdef CONFIG_SMP | 1880 | #ifdef CONFIG_SMP |
1867 | #ifdef CONFIG_PREEMPT | 1881 | #ifdef CONFIG_PREEMPT |
1868 | 1882 | ||
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt | |||
55 | # define schedstat_val_or_zero(var) 0 | 55 | # define schedstat_val_or_zero(var) 0 |
56 | #endif /* CONFIG_SCHEDSTATS */ | 56 | #endif /* CONFIG_SCHEDSTATS */ |
57 | 57 | ||
58 | #ifdef CONFIG_PSI | ||
59 | /* | ||
60 | * PSI tracks state that persists across sleeps, such as iowaits and | ||
61 | * memory stalls. As a result, it has to distinguish between sleeps, | ||
62 | * where a task's runnable state changes, and requeues, where a task | ||
63 | * and its state are being moved between CPUs and runqueues. | ||
64 | */ | ||
65 | static inline void psi_enqueue(struct task_struct *p, bool wakeup) | ||
66 | { | ||
67 | int clear = 0, set = TSK_RUNNING; | ||
68 | |||
69 | if (psi_disabled) | ||
70 | return; | ||
71 | |||
72 | if (!wakeup || p->sched_psi_wake_requeue) { | ||
73 | if (p->flags & PF_MEMSTALL) | ||
74 | set |= TSK_MEMSTALL; | ||
75 | if (p->sched_psi_wake_requeue) | ||
76 | p->sched_psi_wake_requeue = 0; | ||
77 | } else { | ||
78 | if (p->in_iowait) | ||
79 | clear |= TSK_IOWAIT; | ||
80 | } | ||
81 | |||
82 | psi_task_change(p, clear, set); | ||
83 | } | ||
84 | |||
85 | static inline void psi_dequeue(struct task_struct *p, bool sleep) | ||
86 | { | ||
87 | int clear = TSK_RUNNING, set = 0; | ||
88 | |||
89 | if (psi_disabled) | ||
90 | return; | ||
91 | |||
92 | if (!sleep) { | ||
93 | if (p->flags & PF_MEMSTALL) | ||
94 | clear |= TSK_MEMSTALL; | ||
95 | } else { | ||
96 | if (p->in_iowait) | ||
97 | set |= TSK_IOWAIT; | ||
98 | } | ||
99 | |||
100 | psi_task_change(p, clear, set); | ||
101 | } | ||
102 | |||
103 | static inline void psi_ttwu_dequeue(struct task_struct *p) | ||
104 | { | ||
105 | if (psi_disabled) | ||
106 | return; | ||
107 | /* | ||
108 | * Is the task being migrated during a wakeup? Make sure to | ||
109 | * deregister its sleep-persistent psi states from the old | ||
110 | * queue, and let psi_enqueue() know it has to requeue. | ||
111 | */ | ||
112 | if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { | ||
113 | struct rq_flags rf; | ||
114 | struct rq *rq; | ||
115 | int clear = 0; | ||
116 | |||
117 | if (p->in_iowait) | ||
118 | clear |= TSK_IOWAIT; | ||
119 | if (p->flags & PF_MEMSTALL) | ||
120 | clear |= TSK_MEMSTALL; | ||
121 | |||
122 | rq = __task_rq_lock(p, &rf); | ||
123 | psi_task_change(p, clear, 0); | ||
124 | p->sched_psi_wake_requeue = 1; | ||
125 | __task_rq_unlock(rq, &rf); | ||
126 | } | ||
127 | } | ||
128 | |||
129 | static inline void psi_task_tick(struct rq *rq) | ||
130 | { | ||
131 | if (psi_disabled) | ||
132 | return; | ||
133 | |||
134 | if (unlikely(rq->curr->flags & PF_MEMSTALL)) | ||
135 | psi_memstall_tick(rq->curr, cpu_of(rq)); | ||
136 | } | ||
137 | #else /* CONFIG_PSI */ | ||
138 | static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} | ||
139 | static inline void psi_dequeue(struct task_struct *p, bool sleep) {} | ||
140 | static inline void psi_ttwu_dequeue(struct task_struct *p) {} | ||
141 | static inline void psi_task_tick(struct rq *rq) {} | ||
142 | #endif /* CONFIG_PSI */ | ||
143 | |||
58 | #ifdef CONFIG_SCHED_INFO | 144 | #ifdef CONFIG_SCHED_INFO |
59 | static inline void sched_info_reset_dequeued(struct task_struct *t) | 145 | static inline void sched_info_reset_dequeued(struct task_struct *t) |
60 | { | 146 | { |
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index ec657105edbf..51b78405bf24 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
@@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void) | |||
579 | kmem_cache_destroy(cache); | 579 | kmem_cache_destroy(cache); |
580 | } | 580 | } |
581 | 581 | ||
582 | static noinline void __init kasan_memchr(void) | ||
583 | { | ||
584 | char *ptr; | ||
585 | size_t size = 24; | ||
586 | |||
587 | pr_info("out-of-bounds in memchr\n"); | ||
588 | ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); | ||
589 | if (!ptr) | ||
590 | return; | ||
591 | |||
592 | memchr(ptr, '1', size + 1); | ||
593 | kfree(ptr); | ||
594 | } | ||
595 | |||
596 | static noinline void __init kasan_memcmp(void) | ||
597 | { | ||
598 | char *ptr; | ||
599 | size_t size = 24; | ||
600 | int arr[9]; | ||
601 | |||
602 | pr_info("out-of-bounds in memcmp\n"); | ||
603 | ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); | ||
604 | if (!ptr) | ||
605 | return; | ||
606 | |||
607 | memset(arr, 0, sizeof(arr)); | ||
608 | memcmp(ptr, arr, size+1); | ||
609 | kfree(ptr); | ||
610 | } | ||
611 | |||
612 | static noinline void __init kasan_strings(void) | ||
613 | { | ||
614 | char *ptr; | ||
615 | size_t size = 24; | ||
616 | |||
617 | pr_info("use-after-free in strchr\n"); | ||
618 | ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); | ||
619 | if (!ptr) | ||
620 | return; | ||
621 | |||
622 | kfree(ptr); | ||
623 | |||
624 | /* | ||
625 | * Try to cause only 1 invalid access (less spam in dmesg). | ||
626 | * For that we need ptr to point to zeroed byte. | ||
627 | * Skip metadata that could be stored in freed object so ptr | ||
628 | * will likely point to zeroed byte. | ||
629 | */ | ||
630 | ptr += 16; | ||
631 | strchr(ptr, '1'); | ||
632 | |||
633 | pr_info("use-after-free in strrchr\n"); | ||
634 | strrchr(ptr, '1'); | ||
635 | |||
636 | pr_info("use-after-free in strcmp\n"); | ||
637 | strcmp(ptr, "2"); | ||
638 | |||
639 | pr_info("use-after-free in strncmp\n"); | ||
640 | strncmp(ptr, "2", 1); | ||
641 | |||
642 | pr_info("use-after-free in strlen\n"); | ||
643 | strlen(ptr); | ||
644 | |||
645 | pr_info("use-after-free in strnlen\n"); | ||
646 | strnlen(ptr, 1); | ||
647 | } | ||
648 | |||
582 | static int __init kmalloc_tests_init(void) | 649 | static int __init kmalloc_tests_init(void) |
583 | { | 650 | { |
584 | /* | 651 | /* |
@@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void) | |||
618 | use_after_scope_test(); | 685 | use_after_scope_test(); |
619 | kmem_cache_double_free(); | 686 | kmem_cache_double_free(); |
620 | kmem_cache_invalid_free(); | 687 | kmem_cache_invalid_free(); |
688 | kasan_memchr(); | ||
689 | kasan_memcmp(); | ||
690 | kasan_strings(); | ||
621 | 691 | ||
622 | kasan_restore_multi_shot(multishot); | 692 | kasan_restore_multi_shot(multishot); |
623 | 693 | ||
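These new cases run from kmalloc_tests_init() together with the existing ones: on a KASAN-enabled kernel with the test built as a module (CONFIG_TEST_KASAN=m in the surrounding tree, not part of this patch), loading test_kasan executes them, and each pr_info() line above should be followed by the matching KASAN report in the kernel log.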
diff --git a/mm/compaction.c b/mm/compaction.c index faca45ebe62d..7c607479de4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/freezer.h> | 23 | #include <linux/freezer.h> |
24 | #include <linux/page_owner.h> | 24 | #include <linux/page_owner.h> |
25 | #include <linux/psi.h> | ||
25 | #include "internal.h" | 26 | #include "internal.h" |
26 | 27 | ||
27 | #ifdef CONFIG_COMPACTION | 28 | #ifdef CONFIG_COMPACTION |
@@ -2068,11 +2069,15 @@ static int kcompactd(void *p) | |||
2068 | pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; | 2069 | pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; |
2069 | 2070 | ||
2070 | while (!kthread_should_stop()) { | 2071 | while (!kthread_should_stop()) { |
2072 | unsigned long pflags; | ||
2073 | |||
2071 | trace_mm_compaction_kcompactd_sleep(pgdat->node_id); | 2074 | trace_mm_compaction_kcompactd_sleep(pgdat->node_id); |
2072 | wait_event_freezable(pgdat->kcompactd_wait, | 2075 | wait_event_freezable(pgdat->kcompactd_wait, |
2073 | kcompactd_work_requested(pgdat)); | 2076 | kcompactd_work_requested(pgdat)); |
2074 | 2077 | ||
2078 | psi_memstall_enter(&pflags); | ||
2075 | kcompactd_do_work(pgdat); | 2079 | kcompactd_do_work(pgdat); |
2080 | psi_memstall_leave(&pflags); | ||
2076 | } | 2081 | } |
2077 | 2082 | ||
2078 | return 0; | 2083 | return 0; |
diff --git a/mm/debug.c b/mm/debug.c index bd10aad8539a..cdacba12e09a 100644 --- a/mm/debug.c +++ b/mm/debug.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <trace/events/mmflags.h> | 13 | #include <trace/events/mmflags.h> |
14 | #include <linux/migrate.h> | 14 | #include <linux/migrate.h> |
15 | #include <linux/page_owner.h> | 15 | #include <linux/page_owner.h> |
16 | #include <linux/ctype.h> | ||
16 | 17 | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
@@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm) | |||
175 | ); | 176 | ); |
176 | } | 177 | } |
177 | 178 | ||
179 | static bool page_init_poisoning __read_mostly = true; | ||
180 | |||
181 | static int __init setup_vm_debug(char *str) | ||
182 | { | ||
183 | bool __page_init_poisoning = true; | ||
184 | |||
185 | /* | ||
186 | * Calling vm_debug with no arguments is equivalent to requesting | ||
187 | * to enable all debugging options we can control. | ||
188 | */ | ||
189 | if (*str++ != '=' || !*str) | ||
190 | goto out; | ||
191 | |||
192 | __page_init_poisoning = false; | ||
193 | if (*str == '-') | ||
194 | goto out; | ||
195 | |||
196 | while (*str) { | ||
197 | switch (tolower(*str)) { | ||
198 | case 'p': | ||
199 | __page_init_poisoning = true; | ||
200 | break; | ||
201 | default: | ||
202 | pr_err("vm_debug option '%c' unknown. skipped\n", | ||
203 | *str); | ||
204 | } | ||
205 | |||
206 | str++; | ||
207 | } | ||
208 | out: | ||
209 | if (page_init_poisoning && !__page_init_poisoning) | ||
210 | pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); | ||
211 | |||
212 | page_init_poisoning = __page_init_poisoning; | ||
213 | |||
214 | return 1; | ||
215 | } | ||
216 | __setup("vm_debug", setup_vm_debug); | ||
217 | |||
218 | void page_init_poison(struct page *page, size_t size) | ||
219 | { | ||
220 | if (page_init_poisoning) | ||
221 | memset(page, PAGE_POISON_PATTERN, size); | ||
222 | } | ||
223 | EXPORT_SYMBOL_GPL(page_init_poison); | ||
178 | #endif /* CONFIG_DEBUG_VM */ | 224 | #endif /* CONFIG_DEBUG_VM */ |
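In practical terms, the parser above accepts for example vm_debug (or a bare vm_debug=) to keep all controllable debug options enabled, vm_debug=- to switch them all off, which currently just disables struct page init poisoning, and vm_debug=p to request poisoning explicitly; unrecognized option characters are reported and skipped, as implemented by setup_vm_debug() above.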
diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..3968da1f7f5a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -36,6 +36,8 @@ | |||
36 | #include <linux/cleancache.h> | 36 | #include <linux/cleancache.h> |
37 | #include <linux/shmem_fs.h> | 37 | #include <linux/shmem_fs.h> |
38 | #include <linux/rmap.h> | 38 | #include <linux/rmap.h> |
39 | #include <linux/delayacct.h> | ||
40 | #include <linux/psi.h> | ||
39 | #include "internal.h" | 41 | #include "internal.h" |
40 | 42 | ||
41 | #define CREATE_TRACE_POINTS | 43 | #define CREATE_TRACE_POINTS |
@@ -915,12 +917,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
915 | * data from the working set, only to cache data that will | 917 | * data from the working set, only to cache data that will |
916 | * get overwritten with something else, is a waste of memory. | 918 | * get overwritten with something else, is a waste of memory. |
917 | */ | 919 | */ |
918 | if (!(gfp_mask & __GFP_WRITE) && | 920 | WARN_ON_ONCE(PageActive(page)); |
919 | shadow && workingset_refault(shadow)) { | 921 | if (!(gfp_mask & __GFP_WRITE) && shadow) |
920 | SetPageActive(page); | 922 | workingset_refault(page, shadow); |
921 | workingset_activation(page); | ||
922 | } else | ||
923 | ClearPageActive(page); | ||
924 | lru_cache_add(page); | 923 | lru_cache_add(page); |
925 | } | 924 | } |
926 | return ret; | 925 | return ret; |
@@ -1076,8 +1075,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, | |||
1076 | { | 1075 | { |
1077 | struct wait_page_queue wait_page; | 1076 | struct wait_page_queue wait_page; |
1078 | wait_queue_entry_t *wait = &wait_page.wait; | 1077 | wait_queue_entry_t *wait = &wait_page.wait; |
1078 | bool thrashing = false; | ||
1079 | unsigned long pflags; | ||
1079 | int ret = 0; | 1080 | int ret = 0; |
1080 | 1081 | ||
1082 | if (bit_nr == PG_locked && | ||
1083 | !PageUptodate(page) && PageWorkingset(page)) { | ||
1084 | if (!PageSwapBacked(page)) | ||
1085 | delayacct_thrashing_start(); | ||
1086 | psi_memstall_enter(&pflags); | ||
1087 | thrashing = true; | ||
1088 | } | ||
1089 | |||
1081 | init_wait(wait); | 1090 | init_wait(wait); |
1082 | wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; | 1091 | wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; |
1083 | wait->func = wake_page_function; | 1092 | wait->func = wake_page_function; |
@@ -1116,6 +1125,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, | |||
1116 | 1125 | ||
1117 | finish_wait(q, wait); | 1126 | finish_wait(q, wait); |
1118 | 1127 | ||
1128 | if (thrashing) { | ||
1129 | if (!PageSwapBacked(page)) | ||
1130 | delayacct_thrashing_end(); | ||
1131 | psi_memstall_leave(&pflags); | ||
1132 | } | ||
1133 | |||
1119 | /* | 1134 | /* |
1120 | * A signal could leave PageWaiters set. Clearing it here if | 1135 | * A signal could leave PageWaiters set. Clearing it here if |
1121 | * !waitqueue_active would be possible (by open-coding finish_wait), | 1136 | * !waitqueue_active would be possible (by open-coding finish_wait), |
@@ -2581,9 +2596,7 @@ no_cached_page: | |||
2581 | * system is low on memory, or a problem occurs while trying | 2596 | * system is low on memory, or a problem occurs while trying |
2582 | * to schedule I/O. | 2597 | * to schedule I/O. |
2583 | */ | 2598 | */ |
2584 | if (error == -ENOMEM) | 2599 | return vmf_error(error); |
2585 | return VM_FAULT_OOM; | ||
2586 | return VM_FAULT_SIGBUS; | ||
2587 | 2600 | ||
2588 | page_not_uptodate: | 2601 | page_not_uptodate: |
2589 | /* | 2602 | /* |
@@ -2748,9 +2761,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) | |||
2748 | return generic_file_mmap(file, vma); | 2761 | return generic_file_mmap(file, vma); |
2749 | } | 2762 | } |
2750 | #else | 2763 | #else |
2751 | int filemap_page_mkwrite(struct vm_fault *vmf) | 2764 | vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) |
2752 | { | 2765 | { |
2753 | return -ENOSYS; | 2766 | return VM_FAULT_SIGBUS; |
2754 | } | 2767 | } |
2755 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | 2768 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) |
2756 | { | 2769 | { |
@@ -3012,7 +3025,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) | |||
3012 | if (iocb->ki_flags & IOCB_NOWAIT) { | 3025 | if (iocb->ki_flags & IOCB_NOWAIT) { |
3013 | /* If there are pages to writeback, return */ | 3026 | /* If there are pages to writeback, return */ |
3014 | if (filemap_range_has_page(inode->i_mapping, pos, | 3027 | if (filemap_range_has_page(inode->i_mapping, pos, |
3015 | pos + iov_iter_count(from))) | 3028 | pos + write_len)) |
3016 | return -EAGAIN; | 3029 | return -EAGAIN; |
3017 | } else { | 3030 | } else { |
3018 | written = filemap_write_and_wait_range(mapping, pos, | 3031 | written = filemap_write_and_wait_range(mapping, pos, |
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -20,6 +20,11 @@ | |||
20 | 20 | ||
21 | #include "internal.h" | 21 | #include "internal.h" |
22 | 22 | ||
23 | struct follow_page_context { | ||
24 | struct dev_pagemap *pgmap; | ||
25 | unsigned int page_mask; | ||
26 | }; | ||
27 | |||
23 | static struct page *no_page_table(struct vm_area_struct *vma, | 28 | static struct page *no_page_table(struct vm_area_struct *vma, |
24 | unsigned int flags) | 29 | unsigned int flags) |
25 | { | 30 | { |
@@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) | |||
71 | } | 76 | } |
72 | 77 | ||
73 | static struct page *follow_page_pte(struct vm_area_struct *vma, | 78 | static struct page *follow_page_pte(struct vm_area_struct *vma, |
74 | unsigned long address, pmd_t *pmd, unsigned int flags) | 79 | unsigned long address, pmd_t *pmd, unsigned int flags, |
80 | struct dev_pagemap **pgmap) | ||
75 | { | 81 | { |
76 | struct mm_struct *mm = vma->vm_mm; | 82 | struct mm_struct *mm = vma->vm_mm; |
77 | struct dev_pagemap *pgmap = NULL; | ||
78 | struct page *page; | 83 | struct page *page; |
79 | spinlock_t *ptl; | 84 | spinlock_t *ptl; |
80 | pte_t *ptep, pte; | 85 | pte_t *ptep, pte; |
@@ -116,8 +121,8 @@ retry: | |||
116 | * Only return device mapping pages in the FOLL_GET case since | 121 | * Only return device mapping pages in the FOLL_GET case since |
117 | * they are only valid while holding the pgmap reference. | 122 | * they are only valid while holding the pgmap reference. |
118 | */ | 123 | */ |
119 | pgmap = get_dev_pagemap(pte_pfn(pte), NULL); | 124 | *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); |
120 | if (pgmap) | 125 | if (*pgmap) |
121 | page = pte_page(pte); | 126 | page = pte_page(pte); |
122 | else | 127 | else |
123 | goto no_page; | 128 | goto no_page; |
@@ -152,15 +157,8 @@ retry: | |||
152 | goto retry; | 157 | goto retry; |
153 | } | 158 | } |
154 | 159 | ||
155 | if (flags & FOLL_GET) { | 160 | if (flags & FOLL_GET) |
156 | get_page(page); | 161 | get_page(page); |
157 | |||
158 | /* drop the pgmap reference now that we hold the page */ | ||
159 | if (pgmap) { | ||
160 | put_dev_pagemap(pgmap); | ||
161 | pgmap = NULL; | ||
162 | } | ||
163 | } | ||
164 | if (flags & FOLL_TOUCH) { | 162 | if (flags & FOLL_TOUCH) { |
165 | if ((flags & FOLL_WRITE) && | 163 | if ((flags & FOLL_WRITE) && |
166 | !pte_dirty(pte) && !PageDirty(page)) | 164 | !pte_dirty(pte) && !PageDirty(page)) |
@@ -210,7 +208,8 @@ no_page: | |||
210 | 208 | ||
211 | static struct page *follow_pmd_mask(struct vm_area_struct *vma, | 209 | static struct page *follow_pmd_mask(struct vm_area_struct *vma, |
212 | unsigned long address, pud_t *pudp, | 210 | unsigned long address, pud_t *pudp, |
213 | unsigned int flags, unsigned int *page_mask) | 211 | unsigned int flags, |
212 | struct follow_page_context *ctx) | ||
214 | { | 213 | { |
215 | pmd_t *pmd, pmdval; | 214 | pmd_t *pmd, pmdval; |
216 | spinlock_t *ptl; | 215 | spinlock_t *ptl; |
@@ -258,13 +257,13 @@ retry: | |||
258 | } | 257 | } |
259 | if (pmd_devmap(pmdval)) { | 258 | if (pmd_devmap(pmdval)) { |
260 | ptl = pmd_lock(mm, pmd); | 259 | ptl = pmd_lock(mm, pmd); |
261 | page = follow_devmap_pmd(vma, address, pmd, flags); | 260 | page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); |
262 | spin_unlock(ptl); | 261 | spin_unlock(ptl); |
263 | if (page) | 262 | if (page) |
264 | return page; | 263 | return page; |
265 | } | 264 | } |
266 | if (likely(!pmd_trans_huge(pmdval))) | 265 | if (likely(!pmd_trans_huge(pmdval))) |
267 | return follow_page_pte(vma, address, pmd, flags); | 266 | return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); |
268 | 267 | ||
269 | if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) | 268 | if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) |
270 | return no_page_table(vma, flags); | 269 | return no_page_table(vma, flags); |
@@ -284,7 +283,7 @@ retry_locked: | |||
284 | } | 283 | } |
285 | if (unlikely(!pmd_trans_huge(*pmd))) { | 284 | if (unlikely(!pmd_trans_huge(*pmd))) { |
286 | spin_unlock(ptl); | 285 | spin_unlock(ptl); |
287 | return follow_page_pte(vma, address, pmd, flags); | 286 | return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); |
288 | } | 287 | } |
289 | if (flags & FOLL_SPLIT) { | 288 | if (flags & FOLL_SPLIT) { |
290 | int ret; | 289 | int ret; |
@@ -307,18 +306,18 @@ retry_locked: | |||
307 | } | 306 | } |
308 | 307 | ||
309 | return ret ? ERR_PTR(ret) : | 308 | return ret ? ERR_PTR(ret) : |
310 | follow_page_pte(vma, address, pmd, flags); | 309 | follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); |
311 | } | 310 | } |
312 | page = follow_trans_huge_pmd(vma, address, pmd, flags); | 311 | page = follow_trans_huge_pmd(vma, address, pmd, flags); |
313 | spin_unlock(ptl); | 312 | spin_unlock(ptl); |
314 | *page_mask = HPAGE_PMD_NR - 1; | 313 | ctx->page_mask = HPAGE_PMD_NR - 1; |
315 | return page; | 314 | return page; |
316 | } | 315 | } |
317 | 316 | ||
318 | |||
319 | static struct page *follow_pud_mask(struct vm_area_struct *vma, | 317 | static struct page *follow_pud_mask(struct vm_area_struct *vma, |
320 | unsigned long address, p4d_t *p4dp, | 318 | unsigned long address, p4d_t *p4dp, |
321 | unsigned int flags, unsigned int *page_mask) | 319 | unsigned int flags, |
320 | struct follow_page_context *ctx) | ||
322 | { | 321 | { |
323 | pud_t *pud; | 322 | pud_t *pud; |
324 | spinlock_t *ptl; | 323 | spinlock_t *ptl; |
@@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, | |||
344 | } | 343 | } |
345 | if (pud_devmap(*pud)) { | 344 | if (pud_devmap(*pud)) { |
346 | ptl = pud_lock(mm, pud); | 345 | ptl = pud_lock(mm, pud); |
347 | page = follow_devmap_pud(vma, address, pud, flags); | 346 | page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); |
348 | spin_unlock(ptl); | 347 | spin_unlock(ptl); |
349 | if (page) | 348 | if (page) |
350 | return page; | 349 | return page; |
@@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, | |||
352 | if (unlikely(pud_bad(*pud))) | 351 | if (unlikely(pud_bad(*pud))) |
353 | return no_page_table(vma, flags); | 352 | return no_page_table(vma, flags); |
354 | 353 | ||
355 | return follow_pmd_mask(vma, address, pud, flags, page_mask); | 354 | return follow_pmd_mask(vma, address, pud, flags, ctx); |
356 | } | 355 | } |
357 | 356 | ||
358 | |||
359 | static struct page *follow_p4d_mask(struct vm_area_struct *vma, | 357 | static struct page *follow_p4d_mask(struct vm_area_struct *vma, |
360 | unsigned long address, pgd_t *pgdp, | 358 | unsigned long address, pgd_t *pgdp, |
361 | unsigned int flags, unsigned int *page_mask) | 359 | unsigned int flags, |
360 | struct follow_page_context *ctx) | ||
362 | { | 361 | { |
363 | p4d_t *p4d; | 362 | p4d_t *p4d; |
364 | struct page *page; | 363 | struct page *page; |
@@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, | |||
378 | return page; | 377 | return page; |
379 | return no_page_table(vma, flags); | 378 | return no_page_table(vma, flags); |
380 | } | 379 | } |
381 | return follow_pud_mask(vma, address, p4d, flags, page_mask); | 380 | return follow_pud_mask(vma, address, p4d, flags, ctx); |
382 | } | 381 | } |
383 | 382 | ||
384 | /** | 383 | /** |
@@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, | |||
396 | */ | 395 | */ |
397 | struct page *follow_page_mask(struct vm_area_struct *vma, | 396 | struct page *follow_page_mask(struct vm_area_struct *vma, |
398 | unsigned long address, unsigned int flags, | 397 | unsigned long address, unsigned int flags, |
399 | unsigned int *page_mask) | 398 | struct follow_page_context *ctx) |
400 | { | 399 | { |
401 | pgd_t *pgd; | 400 | pgd_t *pgd; |
402 | struct page *page; | 401 | struct page *page; |
403 | struct mm_struct *mm = vma->vm_mm; | 402 | struct mm_struct *mm = vma->vm_mm; |
404 | 403 | ||
405 | *page_mask = 0; | 404 | ctx->page_mask = 0; |
406 | 405 | ||
407 | /* make this handle hugepd */ | 406 | /* make this handle hugepd */ |
408 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 407 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
@@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
431 | return no_page_table(vma, flags); | 430 | return no_page_table(vma, flags); |
432 | } | 431 | } |
433 | 432 | ||
434 | return follow_p4d_mask(vma, address, pgd, flags, page_mask); | 433 | return follow_p4d_mask(vma, address, pgd, flags, ctx); |
434 | } | ||
435 | |||
436 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | ||
437 | unsigned int foll_flags) | ||
438 | { | ||
439 | struct follow_page_context ctx = { NULL }; | ||
440 | struct page *page; | ||
441 | |||
442 | page = follow_page_mask(vma, address, foll_flags, &ctx); | ||
443 | if (ctx.pgmap) | ||
444 | put_dev_pagemap(ctx.pgmap); | ||
445 | return page; | ||
435 | } | 446 | } |
436 | 447 | ||
437 | static int get_gate_page(struct mm_struct *mm, unsigned long address, | 448 | static int get_gate_page(struct mm_struct *mm, unsigned long address, |
@@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
659 | unsigned int gup_flags, struct page **pages, | 670 | unsigned int gup_flags, struct page **pages, |
660 | struct vm_area_struct **vmas, int *nonblocking) | 671 | struct vm_area_struct **vmas, int *nonblocking) |
661 | { | 672 | { |
662 | long i = 0; | 673 | long ret = 0, i = 0; |
663 | unsigned int page_mask; | ||
664 | struct vm_area_struct *vma = NULL; | 674 | struct vm_area_struct *vma = NULL; |
675 | struct follow_page_context ctx = { NULL }; | ||
665 | 676 | ||
666 | if (!nr_pages) | 677 | if (!nr_pages) |
667 | return 0; | 678 | return 0; |
@@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
691 | pages ? &pages[i] : NULL); | 702 | pages ? &pages[i] : NULL); |
692 | if (ret) | 703 | if (ret) |
693 | return i ? : ret; | 704 | return i ? : ret; |
694 | page_mask = 0; | 705 | ctx.page_mask = 0; |
695 | goto next_page; | 706 | goto next_page; |
696 | } | 707 | } |
697 | 708 | ||
698 | if (!vma || check_vma_flags(vma, gup_flags)) | 709 | if (!vma || check_vma_flags(vma, gup_flags)) { |
699 | return i ? : -EFAULT; | 710 | ret = -EFAULT; |
711 | goto out; | ||
712 | } | ||
700 | if (is_vm_hugetlb_page(vma)) { | 713 | if (is_vm_hugetlb_page(vma)) { |
701 | i = follow_hugetlb_page(mm, vma, pages, vmas, | 714 | i = follow_hugetlb_page(mm, vma, pages, vmas, |
702 | &start, &nr_pages, i, | 715 | &start, &nr_pages, i, |
@@ -709,23 +722,26 @@ retry: | |||
709 | * If we have a pending SIGKILL, don't keep faulting pages and | 722 | * If we have a pending SIGKILL, don't keep faulting pages and |
710 | * potentially allocating memory. | 723 | * potentially allocating memory. |
711 | */ | 724 | */ |
712 | if (unlikely(fatal_signal_pending(current))) | 725 | if (unlikely(fatal_signal_pending(current))) { |
713 | return i ? i : -ERESTARTSYS; | 726 | ret = -ERESTARTSYS; |
727 | goto out; | ||
728 | } | ||
714 | cond_resched(); | 729 | cond_resched(); |
715 | page = follow_page_mask(vma, start, foll_flags, &page_mask); | 730 | |
731 | page = follow_page_mask(vma, start, foll_flags, &ctx); | ||
716 | if (!page) { | 732 | if (!page) { |
717 | int ret; | ||
718 | ret = faultin_page(tsk, vma, start, &foll_flags, | 733 | ret = faultin_page(tsk, vma, start, &foll_flags, |
719 | nonblocking); | 734 | nonblocking); |
720 | switch (ret) { | 735 | switch (ret) { |
721 | case 0: | 736 | case 0: |
722 | goto retry; | 737 | goto retry; |
738 | case -EBUSY: | ||
739 | ret = 0; | ||
740 | /* FALLTHRU */ | ||
723 | case -EFAULT: | 741 | case -EFAULT: |
724 | case -ENOMEM: | 742 | case -ENOMEM: |
725 | case -EHWPOISON: | 743 | case -EHWPOISON: |
726 | return i ? i : ret; | 744 | goto out; |
727 | case -EBUSY: | ||
728 | return i; | ||
729 | case -ENOENT: | 745 | case -ENOENT: |
730 | goto next_page; | 746 | goto next_page; |
731 | } | 747 | } |
@@ -737,27 +753,31 @@ retry: | |||
737 | */ | 753 | */ |
738 | goto next_page; | 754 | goto next_page; |
739 | } else if (IS_ERR(page)) { | 755 | } else if (IS_ERR(page)) { |
740 | return i ? i : PTR_ERR(page); | 756 | ret = PTR_ERR(page); |
757 | goto out; | ||
741 | } | 758 | } |
742 | if (pages) { | 759 | if (pages) { |
743 | pages[i] = page; | 760 | pages[i] = page; |
744 | flush_anon_page(vma, page, start); | 761 | flush_anon_page(vma, page, start); |
745 | flush_dcache_page(page); | 762 | flush_dcache_page(page); |
746 | page_mask = 0; | 763 | ctx.page_mask = 0; |
747 | } | 764 | } |
748 | next_page: | 765 | next_page: |
749 | if (vmas) { | 766 | if (vmas) { |
750 | vmas[i] = vma; | 767 | vmas[i] = vma; |
751 | page_mask = 0; | 768 | ctx.page_mask = 0; |
752 | } | 769 | } |
753 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | 770 | page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); |
754 | if (page_increm > nr_pages) | 771 | if (page_increm > nr_pages) |
755 | page_increm = nr_pages; | 772 | page_increm = nr_pages; |
756 | i += page_increm; | 773 | i += page_increm; |
757 | start += page_increm * PAGE_SIZE; | 774 | start += page_increm * PAGE_SIZE; |
758 | nr_pages -= page_increm; | 775 | nr_pages -= page_increm; |
759 | } while (nr_pages); | 776 | } while (nr_pages); |
760 | return i; | 777 | out: |
778 | if (ctx.pgmap) | ||
779 | put_dev_pagemap(ctx.pgmap); | ||
780 | return i ? i : ret; | ||
761 | } | 781 | } |
762 | 782 | ||
763 | static bool vma_permits_fault(struct vm_area_struct *vma, | 783 | static bool vma_permits_fault(struct vm_area_struct *vma, |
@@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) | |||
1780 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1800 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, |
1781 | struct page **pages) | 1801 | struct page **pages) |
1782 | { | 1802 | { |
1783 | unsigned long addr, len, end; | 1803 | unsigned long len, end; |
1784 | unsigned long flags; | 1804 | unsigned long flags; |
1785 | int nr = 0; | 1805 | int nr = 0; |
1786 | 1806 | ||
1787 | start &= PAGE_MASK; | 1807 | start &= PAGE_MASK; |
1788 | addr = start; | ||
1789 | len = (unsigned long) nr_pages << PAGE_SHIFT; | 1808 | len = (unsigned long) nr_pages << PAGE_SHIFT; |
1790 | end = start + len; | 1809 | end = start + len; |
1791 | 1810 | ||
@@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1807 | 1826 | ||
1808 | if (gup_fast_permitted(start, nr_pages, write)) { | 1827 | if (gup_fast_permitted(start, nr_pages, write)) { |
1809 | local_irq_save(flags); | 1828 | local_irq_save(flags); |
1810 | gup_pgd_range(addr, end, write, pages, &nr); | 1829 | gup_pgd_range(start, end, write, pages, &nr); |
1811 | local_irq_restore(flags); | 1830 | local_irq_restore(flags); |
1812 | } | 1831 | } |
1813 | 1832 | ||
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7405c9d89d65..debf11388a60 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c | |||
@@ -6,13 +6,17 @@ | |||
6 | #include <linux/debugfs.h> | 6 | #include <linux/debugfs.h> |
7 | 7 | ||
8 | #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) | 8 | #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) |
9 | #define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) | ||
10 | #define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) | ||
9 | 11 | ||
10 | struct gup_benchmark { | 12 | struct gup_benchmark { |
11 | __u64 delta_usec; | 13 | __u64 get_delta_usec; |
14 | __u64 put_delta_usec; | ||
12 | __u64 addr; | 15 | __u64 addr; |
13 | __u64 size; | 16 | __u64 size; |
14 | __u32 nr_pages_per_call; | 17 | __u32 nr_pages_per_call; |
15 | __u32 flags; | 18 | __u32 flags; |
19 | __u64 expansion[10]; /* For future use */ | ||
16 | }; | 20 | }; |
17 | 21 | ||
18 | static int __gup_benchmark_ioctl(unsigned int cmd, | 22 | static int __gup_benchmark_ioctl(unsigned int cmd, |
@@ -41,21 +45,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd, | |||
41 | nr = (next - addr) / PAGE_SIZE; | 45 | nr = (next - addr) / PAGE_SIZE; |
42 | } | 46 | } |
43 | 47 | ||
44 | nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); | 48 | switch (cmd) { |
49 | case GUP_FAST_BENCHMARK: | ||
50 | nr = get_user_pages_fast(addr, nr, gup->flags & 1, | ||
51 | pages + i); | ||
52 | break; | ||
53 | case GUP_LONGTERM_BENCHMARK: | ||
54 | nr = get_user_pages_longterm(addr, nr, gup->flags & 1, | ||
55 | pages + i, NULL); | ||
56 | break; | ||
57 | case GUP_BENCHMARK: | ||
58 | nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, | ||
59 | NULL); | ||
60 | break; | ||
61 | default: | ||
62 | return -1; | ||
63 | } | ||
64 | |||
45 | if (nr <= 0) | 65 | if (nr <= 0) |
46 | break; | 66 | break; |
47 | i += nr; | 67 | i += nr; |
48 | } | 68 | } |
49 | end_time = ktime_get(); | 69 | end_time = ktime_get(); |
50 | 70 | ||
51 | gup->delta_usec = ktime_us_delta(end_time, start_time); | 71 | gup->get_delta_usec = ktime_us_delta(end_time, start_time); |
52 | gup->size = addr - gup->addr; | 72 | gup->size = addr - gup->addr; |
53 | 73 | ||
74 | start_time = ktime_get(); | ||
54 | for (i = 0; i < nr_pages; i++) { | 75 | for (i = 0; i < nr_pages; i++) { |
55 | if (!pages[i]) | 76 | if (!pages[i]) |
56 | break; | 77 | break; |
57 | put_page(pages[i]); | 78 | put_page(pages[i]); |
58 | } | 79 | } |
80 | end_time = ktime_get(); | ||
81 | gup->put_delta_usec = ktime_us_delta(end_time, start_time); | ||
59 | 82 | ||
60 | kvfree(pages); | 83 | kvfree(pages); |
61 | return 0; | 84 | return 0; |
@@ -67,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, | |||
67 | struct gup_benchmark gup; | 90 | struct gup_benchmark gup; |
68 | int ret; | 91 | int ret; |
69 | 92 | ||
70 | if (cmd != GUP_FAST_BENCHMARK) | 93 | switch (cmd) { |
94 | case GUP_FAST_BENCHMARK: | ||
95 | case GUP_LONGTERM_BENCHMARK: | ||
96 | case GUP_BENCHMARK: | ||
97 | break; | ||
98 | default: | ||
71 | return -EINVAL; | 99 | return -EINVAL; |
100 | } | ||
72 | 101 | ||
73 | if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) | 102 | if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) |
74 | return -EFAULT; | 103 | return -EFAULT; |
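With the ioctl handler now accepting GUP_LONGTERM_BENCHMARK and GUP_BENCHMARK in addition to GUP_FAST_BENCHMARK, and reporting separate get and put timings, the debugfs interface can be driven from user space roughly like the selftest does. The sketch below mirrors tools/testing/selftests/vm/gup_benchmark.c; the /sys/kernel/debug mount point and a kernel built with the benchmark enabled are assumptions about the running system.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/types.h>
	#include <unistd.h>
	#include <linux/types.h>

	/* Must match the kernel-side struct and ioctl numbers shown above. */
	struct gup_benchmark {
		__u64 get_delta_usec;
		__u64 put_delta_usec;
		__u64 addr;
		__u64 size;
		__u32 nr_pages_per_call;
		__u32 flags;
		__u64 expansion[10];
	};

	#define GUP_FAST_BENCHMARK	_IOWR('g', 1, struct gup_benchmark)
	#define GUP_LONGTERM_BENCHMARK	_IOWR('g', 2, struct gup_benchmark)
	#define GUP_BENCHMARK		_IOWR('g', 3, struct gup_benchmark)

	int main(void)
	{
		struct gup_benchmark gup = { 0 };
		size_t size = 128 << 20;	/* 128 MB of anonymous memory */
		void *buf;
		int fd;

		fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
		if (fd < 0) { perror("open"); return 1; }

		buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED) { perror("mmap"); return 1; }

		gup.addr = (unsigned long)buf;
		gup.size = size;
		gup.nr_pages_per_call = 1024;
		gup.flags = 1;			/* bit 0 is passed as the write argument */

		if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) { perror("ioctl"); return 1; }

		printf("get: %llu us, put: %llu us\n",
		       (unsigned long long)gup.get_delta_usec,
		       (unsigned long long)gup.put_delta_usec);
		return 0;
	}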
@@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) | |||
1024 | resource_size_t key, align_start, align_size, align_end; | 1024 | resource_size_t key, align_start, align_size, align_end; |
1025 | struct device *device = devmem->device; | 1025 | struct device *device = devmem->device; |
1026 | int ret, nid, is_ram; | 1026 | int ret, nid, is_ram; |
1027 | unsigned long pfn; | ||
1028 | 1027 | ||
1029 | align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); | 1028 | align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); |
1030 | align_size = ALIGN(devmem->resource->start + | 1029 | align_size = ALIGN(devmem->resource->start + |
@@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) | |||
1109 | align_size >> PAGE_SHIFT, NULL); | 1108 | align_size >> PAGE_SHIFT, NULL); |
1110 | mem_hotplug_done(); | 1109 | mem_hotplug_done(); |
1111 | 1110 | ||
1112 | for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { | 1111 | /* |
1113 | struct page *page = pfn_to_page(pfn); | 1112 | * Initialization of the pages has been deferred until now in order |
1113 | * to allow us to do the work while not holding the hotplug lock. | ||
1114 | */ | ||
1115 | memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | ||
1116 | align_start >> PAGE_SHIFT, | ||
1117 | align_size >> PAGE_SHIFT, &devmem->pagemap); | ||
1114 | 1118 | ||
1115 | page->pgmap = &devmem->pagemap; | ||
1116 | } | ||
1117 | return 0; | 1119 | return 0; |
1118 | 1120 | ||
1119 | error_add_memory: | 1121 | error_add_memory: |
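The hmm hunk above replaces the open-coded pfn loop with memmap_init_zone_device(), and the new comment spells out why: the per-page work now runs after the hotplug lock has been dropped. A toy model of that lock-scope change follows, with a pthread mutex standing in for the hotplug lock and an int array standing in for the struct pages; all names here are illustrative only.

	#include <pthread.h>
	#include <stdio.h>

	#define NPAGES 4096

	static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
	static int page_owner[NPAGES];

	static void add_device_range(int owner)
	{
		pthread_mutex_lock(&hotplug_lock);
		/* ...only the structural zone/section bookkeeping happens here... */
		pthread_mutex_unlock(&hotplug_lock);

		/* Deferred: the long per-page loop no longer holds the hotplug lock. */
		for (int i = 0; i < NPAGES; i++)
			page_owner[i] = owner;
	}

	int main(void)
	{
		add_device_range(42);
		printf("page 0 owner: %d\n", page_owner[0]);
		return 0;
	}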
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index deed97fba979..25ef59b7ee34 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
852 | } | 852 | } |
853 | 853 | ||
854 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, | 854 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, |
855 | pmd_t *pmd, int flags) | 855 | pmd_t *pmd, int flags, struct dev_pagemap **pgmap) |
856 | { | 856 | { |
857 | unsigned long pfn = pmd_pfn(*pmd); | 857 | unsigned long pfn = pmd_pfn(*pmd); |
858 | struct mm_struct *mm = vma->vm_mm; | 858 | struct mm_struct *mm = vma->vm_mm; |
859 | struct dev_pagemap *pgmap; | ||
860 | struct page *page; | 859 | struct page *page; |
861 | 860 | ||
862 | assert_spin_locked(pmd_lockptr(mm, pmd)); | 861 | assert_spin_locked(pmd_lockptr(mm, pmd)); |
@@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
886 | return ERR_PTR(-EEXIST); | 885 | return ERR_PTR(-EEXIST); |
887 | 886 | ||
888 | pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; | 887 | pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; |
889 | pgmap = get_dev_pagemap(pfn, NULL); | 888 | *pgmap = get_dev_pagemap(pfn, *pgmap); |
890 | if (!pgmap) | 889 | if (!*pgmap) |
891 | return ERR_PTR(-EFAULT); | 890 | return ERR_PTR(-EFAULT); |
892 | page = pfn_to_page(pfn); | 891 | page = pfn_to_page(pfn); |
893 | get_page(page); | 892 | get_page(page); |
894 | put_dev_pagemap(pgmap); | ||
895 | 893 | ||
896 | return page; | 894 | return page; |
897 | } | 895 | } |
@@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, | |||
1000 | } | 998 | } |
1001 | 999 | ||
1002 | struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, | 1000 | struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, |
1003 | pud_t *pud, int flags) | 1001 | pud_t *pud, int flags, struct dev_pagemap **pgmap) |
1004 | { | 1002 | { |
1005 | unsigned long pfn = pud_pfn(*pud); | 1003 | unsigned long pfn = pud_pfn(*pud); |
1006 | struct mm_struct *mm = vma->vm_mm; | 1004 | struct mm_struct *mm = vma->vm_mm; |
1007 | struct dev_pagemap *pgmap; | ||
1008 | struct page *page; | 1005 | struct page *page; |
1009 | 1006 | ||
1010 | assert_spin_locked(pud_lockptr(mm, pud)); | 1007 | assert_spin_locked(pud_lockptr(mm, pud)); |
@@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, | |||
1028 | return ERR_PTR(-EEXIST); | 1025 | return ERR_PTR(-EEXIST); |
1029 | 1026 | ||
1030 | pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; | 1027 | pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; |
1031 | pgmap = get_dev_pagemap(pfn, NULL); | 1028 | *pgmap = get_dev_pagemap(pfn, *pgmap); |
1032 | if (!pgmap) | 1029 | if (!*pgmap) |
1033 | return ERR_PTR(-EFAULT); | 1030 | return ERR_PTR(-EFAULT); |
1034 | page = pfn_to_page(pfn); | 1031 | page = pfn_to_page(pfn); |
1035 | get_page(page); | 1032 | get_page(page); |
1036 | put_dev_pagemap(pgmap); | ||
1037 | 1033 | ||
1038 | return page; | 1034 | return page; |
1039 | } | 1035 | } |
@@ -1562,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1562 | * We are not sure a pending tlb flush here is for a huge page | 1558 | * We are not sure a pending tlb flush here is for a huge page |
1563 | * mapping or not. Hence use the tlb range variant | 1559 | * mapping or not. Hence use the tlb range variant |
1564 | */ | 1560 | */ |
1565 | if (mm_tlb_flush_pending(vma->vm_mm)) | 1561 | if (mm_tlb_flush_pending(vma->vm_mm)) { |
1566 | flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); | 1562 | flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); |
1563 | /* | ||
1564 | * change_huge_pmd() released the pmd lock before | ||
1565 | * invalidating the secondary MMUs sharing the primary | ||
1566 | * MMU pagetables (with ->invalidate_range()). The | ||
1567 | * mmu_notifier_invalidate_range_end() (which | ||
1568 | * internally calls ->invalidate_range()) in | ||
1569 | * change_pmd_range() will run after us, so we can't | ||
1570 | * rely on it here and we need an explicit invalidate. | ||
1571 | */ | ||
1572 | mmu_notifier_invalidate_range(vma->vm_mm, haddr, | ||
1573 | haddr + HPAGE_PMD_SIZE); | ||
1574 | } | ||
1567 | 1575 | ||
1568 | /* | 1576 | /* |
1569 | * Migrate the THP to the requested node, returns with page unlocked | 1577 | * Migrate the THP to the requested node, returns with page unlocked |
@@ -2369,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail, | |||
2369 | (1L << PG_mlocked) | | 2377 | (1L << PG_mlocked) | |
2370 | (1L << PG_uptodate) | | 2378 | (1L << PG_uptodate) | |
2371 | (1L << PG_active) | | 2379 | (1L << PG_active) | |
2380 | (1L << PG_workingset) | | ||
2372 | (1L << PG_locked) | | 2381 | (1L << PG_locked) | |
2373 | (1L << PG_unevictable) | | 2382 | (1L << PG_unevictable) | |
2374 | (1L << PG_dirty))); | 2383 | (1L << PG_dirty))); |
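Besides routing the cached dev_pagemap through a new **pgmap argument (the same caching pattern as in mm/gup.c above) and adding an explicit mmu_notifier_invalidate_range() on the NUMA-fault path, the huge_memory.c hunk extends the set of flags a THP tail page inherits from its head with PG_workingset. A standalone model of that flag copy is below; the bit positions and the exact preserved mask are illustrative, only the inclusion of the workingset bit reflects the change.

	#include <stdio.h>

	enum { PG_locked, PG_dirty, PG_active, PG_workingset, PG_unevictable, PG_reclaim };

	/* Bits a tail page takes over from the head page in this toy model. */
	#define TAIL_PRESERVED ((1UL << PG_locked) | (1UL << PG_dirty) |	\
				(1UL << PG_active) | (1UL << PG_unevictable) |	\
				(1UL << PG_workingset))	/* newly added bit */

	int main(void)
	{
		unsigned long head = (1UL << PG_workingset) | (1UL << PG_reclaim);
		unsigned long tail = 0;

		tail |= head & TAIL_PRESERVED;	/* bits outside the mask are not copied */
		printf("tail flags: %#lx\n", tail);
		return 0;
	}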
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5c390f5a5207..7b5c0ad9a6bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, | |||
3690 | return err; | 3690 | return err; |
3691 | ClearPagePrivate(page); | 3691 | ClearPagePrivate(page); |
3692 | 3692 | ||
3693 | /* | ||
3694 | * set page dirty so that it will not be removed from cache/file | ||
3695 | * by non-hugetlbfs specific code paths. | ||
3696 | */ | ||
3697 | set_page_dirty(page); | ||
3698 | |||
3693 | spin_lock(&inode->i_lock); | 3699 | spin_lock(&inode->i_lock); |
3694 | inode->i_blocks += blocks_per_huge_page(h); | 3700 | inode->i_blocks += blocks_per_huge_page(h); |
3695 | spin_unlock(&inode->i_lock); | 3701 | spin_unlock(&inode->i_lock); |
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..b209dbaefde8 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c | |||
@@ -103,7 +103,7 @@ static int quarantine_head; | |||
103 | static int quarantine_tail; | 103 | static int quarantine_tail; |
104 | /* Total size of all objects in global_quarantine across all batches. */ | 104 | /* Total size of all objects in global_quarantine across all batches. */ |
105 | static unsigned long quarantine_size; | 105 | static unsigned long quarantine_size; |
106 | static DEFINE_SPINLOCK(quarantine_lock); | 106 | static DEFINE_RAW_SPINLOCK(quarantine_lock); |
107 | DEFINE_STATIC_SRCU(remove_cache_srcu); | 107 | DEFINE_STATIC_SRCU(remove_cache_srcu); |
108 | 108 | ||
109 | /* Maximum size of the global queue. */ | 109 | /* Maximum size of the global queue. */ |
@@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) | |||
190 | if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { | 190 | if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { |
191 | qlist_move_all(q, &temp); | 191 | qlist_move_all(q, &temp); |
192 | 192 | ||
193 | spin_lock(&quarantine_lock); | 193 | raw_spin_lock(&quarantine_lock); |
194 | WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); | 194 | WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); |
195 | qlist_move_all(&temp, &global_quarantine[quarantine_tail]); | 195 | qlist_move_all(&temp, &global_quarantine[quarantine_tail]); |
196 | if (global_quarantine[quarantine_tail].bytes >= | 196 | if (global_quarantine[quarantine_tail].bytes >= |
@@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) | |||
203 | if (new_tail != quarantine_head) | 203 | if (new_tail != quarantine_head) |
204 | quarantine_tail = new_tail; | 204 | quarantine_tail = new_tail; |
205 | } | 205 | } |
206 | spin_unlock(&quarantine_lock); | 206 | raw_spin_unlock(&quarantine_lock); |
207 | } | 207 | } |
208 | 208 | ||
209 | local_irq_restore(flags); | 209 | local_irq_restore(flags); |
@@ -230,7 +230,7 @@ void quarantine_reduce(void) | |||
230 | * expected case). | 230 | * expected case). |
231 | */ | 231 | */ |
232 | srcu_idx = srcu_read_lock(&remove_cache_srcu); | 232 | srcu_idx = srcu_read_lock(&remove_cache_srcu); |
233 | spin_lock_irqsave(&quarantine_lock, flags); | 233 | raw_spin_lock_irqsave(&quarantine_lock, flags); |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * Update quarantine size in case of hotplug. Allocate a fraction of | 236 | * Update quarantine size in case of hotplug. Allocate a fraction of |
@@ -254,7 +254,7 @@ void quarantine_reduce(void) | |||
254 | quarantine_head = 0; | 254 | quarantine_head = 0; |
255 | } | 255 | } |
256 | 256 | ||
257 | spin_unlock_irqrestore(&quarantine_lock, flags); | 257 | raw_spin_unlock_irqrestore(&quarantine_lock, flags); |
258 | 258 | ||
259 | qlist_free_all(&to_free, NULL); | 259 | qlist_free_all(&to_free, NULL); |
260 | srcu_read_unlock(&remove_cache_srcu, srcu_idx); | 260 | srcu_read_unlock(&remove_cache_srcu, srcu_idx); |
@@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) | |||
310 | */ | 310 | */ |
311 | on_each_cpu(per_cpu_remove_cache, cache, 1); | 311 | on_each_cpu(per_cpu_remove_cache, cache, 1); |
312 | 312 | ||
313 | spin_lock_irqsave(&quarantine_lock, flags); | 313 | raw_spin_lock_irqsave(&quarantine_lock, flags); |
314 | for (i = 0; i < QUARANTINE_BATCHES; i++) { | 314 | for (i = 0; i < QUARANTINE_BATCHES; i++) { |
315 | if (qlist_empty(&global_quarantine[i])) | 315 | if (qlist_empty(&global_quarantine[i])) |
316 | continue; | 316 | continue; |
317 | qlist_move_cache(&global_quarantine[i], &to_free, cache); | 317 | qlist_move_cache(&global_quarantine[i], &to_free, cache); |
318 | /* Scanning whole quarantine can take a while. */ | 318 | /* Scanning whole quarantine can take a while. */ |
319 | spin_unlock_irqrestore(&quarantine_lock, flags); | 319 | raw_spin_unlock_irqrestore(&quarantine_lock, flags); |
320 | cond_resched(); | 320 | cond_resched(); |
321 | spin_lock_irqsave(&quarantine_lock, flags); | 321 | raw_spin_lock_irqsave(&quarantine_lock, flags); |
322 | } | 322 | } |
323 | spin_unlock_irqrestore(&quarantine_lock, flags); | 323 | raw_spin_unlock_irqrestore(&quarantine_lock, flags); |
324 | 324 | ||
325 | qlist_free_all(&to_free, cache); | 325 | qlist_free_all(&to_free, cache); |
326 | 326 | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17dd883198ae..4f7e4b5a2f08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #include <linux/seq_file.h> | 86 | #include <linux/seq_file.h> |
87 | #include <linux/cpumask.h> | 87 | #include <linux/cpumask.h> |
88 | #include <linux/spinlock.h> | 88 | #include <linux/spinlock.h> |
89 | #include <linux/module.h> | ||
89 | #include <linux/mutex.h> | 90 | #include <linux/mutex.h> |
90 | #include <linux/rcupdate.h> | 91 | #include <linux/rcupdate.h> |
91 | #include <linux/stacktrace.h> | 92 | #include <linux/stacktrace.h> |
@@ -181,6 +182,7 @@ struct kmemleak_object { | |||
181 | /* flag set to not scan the object */ | 182 | /* flag set to not scan the object */ |
182 | #define OBJECT_NO_SCAN (1 << 2) | 183 | #define OBJECT_NO_SCAN (1 << 2) |
183 | 184 | ||
185 | #define HEX_PREFIX " " | ||
184 | /* number of bytes to print per line; must be 16 or 32 */ | 186 | /* number of bytes to print per line; must be 16 or 32 */ |
185 | #define HEX_ROW_SIZE 16 | 187 | #define HEX_ROW_SIZE 16 |
186 | /* number of bytes to print at a time (1, 2, 4, 8) */ | 188 | /* number of bytes to print at a time (1, 2, 4, 8) */ |
@@ -235,6 +237,9 @@ static int kmemleak_skip_disable; | |||
235 | /* If there are leaks that can be reported */ | 237 | /* If there are leaks that can be reported */ |
236 | static bool kmemleak_found_leaks; | 238 | static bool kmemleak_found_leaks; |
237 | 239 | ||
240 | static bool kmemleak_verbose; | ||
241 | module_param_named(verbose, kmemleak_verbose, bool, 0600); | ||
242 | |||
238 | /* | 243 | /* |
239 | * Early object allocation/freeing logging. Kmemleak is initialized after the | 244 | * Early object allocation/freeing logging. Kmemleak is initialized after the |
240 | * kernel allocator. However, both the kernel allocator and kmemleak may | 245 | * kernel allocator. However, both the kernel allocator and kmemleak may |
@@ -299,6 +304,25 @@ static void kmemleak_disable(void); | |||
299 | kmemleak_disable(); \ | 304 | kmemleak_disable(); \ |
300 | } while (0) | 305 | } while (0) |
301 | 306 | ||
307 | #define warn_or_seq_printf(seq, fmt, ...) do { \ | ||
308 | if (seq) \ | ||
309 | seq_printf(seq, fmt, ##__VA_ARGS__); \ | ||
310 | else \ | ||
311 | pr_warn(fmt, ##__VA_ARGS__); \ | ||
312 | } while (0) | ||
313 | |||
314 | static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type, | ||
315 | int rowsize, int groupsize, const void *buf, | ||
316 | size_t len, bool ascii) | ||
317 | { | ||
318 | if (seq) | ||
319 | seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize, | ||
320 | buf, len, ascii); | ||
321 | else | ||
322 | print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type, | ||
323 | rowsize, groupsize, buf, len, ascii); | ||
324 | } | ||
325 | |||
302 | /* | 326 | /* |
303 | * Printing of the objects hex dump to the seq file. The number of lines to be | 327 | * Printing of the objects hex dump to the seq file. The number of lines to be |
304 | * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The | 328 | * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The |
@@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq, | |||
314 | /* limit the number of lines to HEX_MAX_LINES */ | 338 | /* limit the number of lines to HEX_MAX_LINES */ |
315 | len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); | 339 | len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); |
316 | 340 | ||
317 | seq_printf(seq, " hex dump (first %zu bytes):\n", len); | 341 | warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); |
318 | kasan_disable_current(); | 342 | kasan_disable_current(); |
319 | seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, | 343 | warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, |
320 | HEX_GROUP_SIZE, ptr, len, HEX_ASCII); | 344 | HEX_GROUP_SIZE, ptr, len, HEX_ASCII); |
321 | kasan_enable_current(); | 345 | kasan_enable_current(); |
322 | } | 346 | } |
323 | 347 | ||
@@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq, | |||
365 | int i; | 389 | int i; |
366 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | 390 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); |
367 | 391 | ||
368 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 392 | warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
369 | object->pointer, object->size); | 393 | object->pointer, object->size); |
370 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", | 394 | warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
371 | object->comm, object->pid, object->jiffies, | 395 | object->comm, object->pid, object->jiffies, |
372 | msecs_age / 1000, msecs_age % 1000); | 396 | msecs_age / 1000, msecs_age % 1000); |
373 | hex_dump_object(seq, object); | 397 | hex_dump_object(seq, object); |
374 | seq_printf(seq, " backtrace:\n"); | 398 | warn_or_seq_printf(seq, " backtrace:\n"); |
375 | 399 | ||
376 | for (i = 0; i < object->trace_len; i++) { | 400 | for (i = 0; i < object->trace_len; i++) { |
377 | void *ptr = (void *)object->trace[i]; | 401 | void *ptr = (void *)object->trace[i]; |
378 | seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); | 402 | warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); |
379 | } | 403 | } |
380 | } | 404 | } |
381 | 405 | ||
@@ -1598,6 +1622,10 @@ static void kmemleak_scan(void) | |||
1598 | if (unreferenced_object(object) && | 1622 | if (unreferenced_object(object) && |
1599 | !(object->flags & OBJECT_REPORTED)) { | 1623 | !(object->flags & OBJECT_REPORTED)) { |
1600 | object->flags |= OBJECT_REPORTED; | 1624 | object->flags |= OBJECT_REPORTED; |
1625 | |||
1626 | if (kmemleak_verbose) | ||
1627 | print_unreferenced(NULL, object); | ||
1628 | |||
1601 | new_leaks++; | 1629 | new_leaks++; |
1602 | } | 1630 | } |
1603 | spin_unlock_irqrestore(&object->lock, flags); | 1631 | spin_unlock_irqrestore(&object->lock, flags); |
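The kmemleak changes make the report printer usable outside the seq_file path: warn_or_seq_printf() and warn_or_seq_hex_dump() send the text to the seq_file when one is supplied and to the kernel log otherwise, and the new verbose module parameter lets the scanner print each new leak as it is found. A standalone model of the dual-sink helper, with FILE * standing in for the seq_file and stderr for pr_warn():

	#include <stdarg.h>
	#include <stdio.h>

	static void warn_or_stream_printf(FILE *seq, const char *fmt, ...)
	{
		va_list ap;

		va_start(ap, fmt);
		if (seq)
			vfprintf(seq, fmt, ap);		/* the debugfs reader case */
		else
			vfprintf(stderr, fmt, ap);	/* verbose mode: straight to the log */
		va_end(ap);
	}

	int main(void)
	{
		warn_or_stream_printf(stdout, "unreferenced object %#lx (size %zu)\n",
				      0xdeadbeefUL, (size_t)64);
		warn_or_stream_printf(NULL, "same report, but to the 'log'\n");
		return 0;
	}

Since the parameter is registered with mode 0600, writing 1 to /sys/module/kmemleak/parameters/verbose (or booting with kmemleak.verbose=1) should be enough to get new reports into dmesg without reading the debugfs file.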
diff --git a/mm/memblock.c b/mm/memblock.c index 237944479d25..a85315083b5a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw( | |||
1444 | 1444 | ||
1445 | ptr = memblock_virt_alloc_internal(size, align, | 1445 | ptr = memblock_virt_alloc_internal(size, align, |
1446 | min_addr, max_addr, nid); | 1446 | min_addr, max_addr, nid); |
1447 | #ifdef CONFIG_DEBUG_VM | ||
1448 | if (ptr && size > 0) | 1447 | if (ptr && size > 0) |
1449 | memset(ptr, PAGE_POISON_PATTERN, size); | 1448 | page_init_poison(ptr, size); |
1450 | #endif | 1449 | |
1451 | return ptr; | 1450 | return ptr; |
1452 | } | 1451 | } |
1453 | 1452 | ||
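The memblock hunk folds the CONFIG_DEBUG_VM check into a page_init_poison()-style helper so the call site no longer carries an #ifdef. A minimal userspace sketch of that refactor; the poison value and the debug_vm flag are stand-ins for the kernel's PAGE_POISON_PATTERN and config option.

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	#define POISON_PATTERN 0xaa			/* stand-in value */
	static bool debug_vm = true;			/* stands in for CONFIG_DEBUG_VM */

	static void page_init_poison(void *ptr, size_t size)
	{
		if (debug_vm)
			memset(ptr, POISON_PATTERN, size);
	}

	int main(void)
	{
		unsigned char buf[16] = { 0 };

		page_init_poison(buf, sizeof(buf));	/* unconditional at the call site */
		printf("%#x\n", buf[0]);
		return 0;
	}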
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..10a9b554d69f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int | |||
1669 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 1669 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1670 | return OOM_SKIPPED; | 1670 | return OOM_SKIPPED; |
1671 | 1671 | ||
1672 | memcg_memory_event(memcg, MEMCG_OOM); | ||
1673 | |||
1672 | /* | 1674 | /* |
1673 | * We are in the middle of the charge context here, so we | 1675 | * We are in the middle of the charge context here, so we |
1674 | * don't want to block when potentially sitting on a callstack | 1676 | * don't want to block when potentially sitting on a callstack |
@@ -2250,8 +2252,6 @@ retry: | |||
2250 | if (fatal_signal_pending(current)) | 2252 | if (fatal_signal_pending(current)) |
2251 | goto force; | 2253 | goto force; |
2252 | 2254 | ||
2253 | memcg_memory_event(mem_over_limit, MEMCG_OOM); | ||
2254 | |||
2255 | /* | 2255 | /* |
2256 | * keep retrying as long as the memcg oom killer is able to make | 2256 | * keep retrying as long as the memcg oom killer is able to make |
2257 | * a forward progress or bypass the charge if the oom killer | 2257 | * a forward progress or bypass the charge if the oom killer |
@@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) | |||
2460 | /* | 2460 | /* |
2461 | * Enqueue the creation of a per-memcg kmem_cache. | 2461 | * Enqueue the creation of a per-memcg kmem_cache. |
2462 | */ | 2462 | */ |
2463 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | 2463 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
2464 | struct kmem_cache *cachep) | 2464 | struct kmem_cache *cachep) |
2465 | { | 2465 | { |
2466 | struct memcg_kmem_cache_create_work *cw; | 2466 | struct memcg_kmem_cache_create_work *cw; |
@@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | |||
2478 | queue_work(memcg_kmem_cache_wq, &cw->work); | 2478 | queue_work(memcg_kmem_cache_wq, &cw->work); |
2479 | } | 2479 | } |
2480 | 2480 | ||
2481 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | ||
2482 | struct kmem_cache *cachep) | ||
2483 | { | ||
2484 | /* | ||
2485 | * We need to stop accounting when we kmalloc, because if the | ||
2486 | * corresponding kmalloc cache is not yet created, the first allocation | ||
2487 | * in __memcg_schedule_kmem_cache_create will recurse. | ||
2488 | * | ||
2489 | * However, it is better to enclose the whole function. Depending on | ||
2490 | * the debugging options enabled, INIT_WORK(), for instance, can | ||
2491 | * trigger an allocation. This too, will make us recurse. Because at | ||
2492 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | ||
2493 | * the safest choice is to do it like this, wrapping the whole function. | ||
2494 | */ | ||
2495 | current->memcg_kmem_skip_account = 1; | ||
2496 | __memcg_schedule_kmem_cache_create(memcg, cachep); | ||
2497 | current->memcg_kmem_skip_account = 0; | ||
2498 | } | ||
2499 | |||
2500 | static inline bool memcg_kmem_bypass(void) | 2481 | static inline bool memcg_kmem_bypass(void) |
2501 | { | 2482 | { |
2502 | if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) | 2483 | if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) |
@@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
2531 | if (memcg_kmem_bypass()) | 2512 | if (memcg_kmem_bypass()) |
2532 | return cachep; | 2513 | return cachep; |
2533 | 2514 | ||
2534 | if (current->memcg_kmem_skip_account) | ||
2535 | return cachep; | ||
2536 | |||
2537 | memcg = get_mem_cgroup_from_current(); | 2515 | memcg = get_mem_cgroup_from_current(); |
2538 | kmemcg_id = READ_ONCE(memcg->kmemcg_id); | 2516 | kmemcg_id = READ_ONCE(memcg->kmemcg_id); |
2539 | if (kmemcg_id < 0) | 2517 | if (kmemcg_id < 0) |
@@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) | |||
4321 | 4299 | ||
4322 | static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) | 4300 | static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) |
4323 | { | 4301 | { |
4324 | VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); | 4302 | refcount_add(n, &memcg->id.ref); |
4325 | atomic_add(n, &memcg->id.ref); | ||
4326 | } | 4303 | } |
4327 | 4304 | ||
4328 | static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) | 4305 | static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) |
4329 | { | 4306 | { |
4330 | VM_BUG_ON(atomic_read(&memcg->id.ref) < n); | 4307 | if (refcount_sub_and_test(n, &memcg->id.ref)) { |
4331 | if (atomic_sub_and_test(n, &memcg->id.ref)) { | ||
4332 | mem_cgroup_id_remove(memcg); | 4308 | mem_cgroup_id_remove(memcg); |
4333 | 4309 | ||
4334 | /* Memcg ID pins CSS */ | 4310 | /* Memcg ID pins CSS */ |
@@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
4545 | } | 4521 | } |
4546 | 4522 | ||
4547 | /* Online state pins memcg ID, memcg ID pins CSS */ | 4523 | /* Online state pins memcg ID, memcg ID pins CSS */ |
4548 | atomic_set(&memcg->id.ref, 1); | 4524 | refcount_set(&memcg->id.ref, 1); |
4549 | css_get(css); | 4525 | css_get(css); |
4550 | return 0; | 4526 | return 0; |
4551 | } | 4527 | } |
@@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
4573 | memcg_offline_kmem(memcg); | 4549 | memcg_offline_kmem(memcg); |
4574 | wb_memcg_offline(memcg); | 4550 | wb_memcg_offline(memcg); |
4575 | 4551 | ||
4552 | drain_all_stock(memcg); | ||
4553 | |||
4576 | mem_cgroup_id_put(memcg); | 4554 | mem_cgroup_id_put(memcg); |
4577 | } | 4555 | } |
4578 | 4556 | ||
@@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5595 | seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); | 5573 | seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); |
5596 | seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); | 5574 | seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); |
5597 | 5575 | ||
5576 | seq_printf(m, "workingset_refault %lu\n", | ||
5577 | acc.stat[WORKINGSET_REFAULT]); | ||
5578 | seq_printf(m, "workingset_activate %lu\n", | ||
5579 | acc.stat[WORKINGSET_ACTIVATE]); | ||
5580 | seq_printf(m, "workingset_nodereclaim %lu\n", | ||
5581 | acc.stat[WORKINGSET_NODERECLAIM]); | ||
5582 | |||
5598 | seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); | 5583 | seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); |
5599 | seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + | 5584 | seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + |
5600 | acc.events[PGSCAN_DIRECT]); | 5585 | acc.events[PGSCAN_DIRECT]); |
@@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5605 | seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); | 5590 | seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); |
5606 | seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); | 5591 | seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); |
5607 | 5592 | ||
5608 | seq_printf(m, "workingset_refault %lu\n", | ||
5609 | acc.stat[WORKINGSET_REFAULT]); | ||
5610 | seq_printf(m, "workingset_activate %lu\n", | ||
5611 | acc.stat[WORKINGSET_ACTIVATE]); | ||
5612 | seq_printf(m, "workingset_nodereclaim %lu\n", | ||
5613 | acc.stat[WORKINGSET_NODERECLAIM]); | ||
5614 | |||
5615 | return 0; | 5593 | return 0; |
5616 | } | 5594 | } |
5617 | 5595 | ||
@@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init); | |||
6377 | #ifdef CONFIG_MEMCG_SWAP | 6355 | #ifdef CONFIG_MEMCG_SWAP |
6378 | static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) | 6356 | static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) |
6379 | { | 6357 | { |
6380 | while (!atomic_inc_not_zero(&memcg->id.ref)) { | 6358 | while (!refcount_inc_not_zero(&memcg->id.ref)) { |
6381 | /* | 6359 | /* |
6382 | * The root cgroup cannot be destroyed, so its refcount must | 6360 | * The root cgroup cannot be destroyed, so its refcount must |
6383 | * always be >= 1. | 6361 | * always be >= 1. |
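Several independent memcg changes sit in this file: the MEMCG_OOM event is raised from mem_cgroup_oom() rather than the charge retry loop, per-cpu stock is drained at css offline, the workingset counters move earlier in the memory.stat output, and the memcg ID reference count becomes a refcount_t. The sketch below models that last point, the ID lifetime, with C11 atomics standing in for refcount_t: a reference can only be taken while the count is non-zero, and dropping the last reference releases the ID. It is a toy, not the kernel's refcount implementation.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int id_ref = 1;	/* set to 1 when the memcg comes online */

	/* Models refcount_inc_not_zero(): succeed only while the count is live. */
	static bool id_get_not_zero(void)
	{
		int old = atomic_load(&id_ref);

		while (old != 0)
			if (atomic_compare_exchange_weak(&id_ref, &old, old + 1))
				return true;
		return false;
	}

	/* Models refcount_sub_and_test()/put: last drop releases the ID. */
	static void id_put(void)
	{
		if (atomic_fetch_sub(&id_ref, 1) == 1)
			printf("last reference dropped: free the ID, release the CSS pin\n");
	}

	int main(void)
	{
		if (id_get_not_zero())
			printf("got a reference\n");
		id_put();	/* our reference */
		id_put();	/* the online reference; triggers the release path */
		return 0;
	}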
diff --git a/mm/memory.c b/mm/memory.c index 21a5e6e4758b..072139579d89 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1520 | } | 1520 | } |
1521 | EXPORT_SYMBOL(vm_insert_page); | 1521 | EXPORT_SYMBOL(vm_insert_page); |
1522 | 1522 | ||
1523 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1523 | static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1524 | pfn_t pfn, pgprot_t prot, bool mkwrite) | 1524 | pfn_t pfn, pgprot_t prot, bool mkwrite) |
1525 | { | 1525 | { |
1526 | struct mm_struct *mm = vma->vm_mm; | 1526 | struct mm_struct *mm = vma->vm_mm; |
1527 | int retval; | ||
1528 | pte_t *pte, entry; | 1527 | pte_t *pte, entry; |
1529 | spinlock_t *ptl; | 1528 | spinlock_t *ptl; |
1530 | 1529 | ||
1531 | retval = -ENOMEM; | ||
1532 | pte = get_locked_pte(mm, addr, &ptl); | 1530 | pte = get_locked_pte(mm, addr, &ptl); |
1533 | if (!pte) | 1531 | if (!pte) |
1534 | goto out; | 1532 | return VM_FAULT_OOM; |
1535 | retval = -EBUSY; | ||
1536 | if (!pte_none(*pte)) { | 1533 | if (!pte_none(*pte)) { |
1537 | if (mkwrite) { | 1534 | if (mkwrite) { |
1538 | /* | 1535 | /* |
@@ -1565,56 +1562,32 @@ out_mkwrite: | |||
1565 | set_pte_at(mm, addr, pte, entry); | 1562 | set_pte_at(mm, addr, pte, entry); |
1566 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ | 1563 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1567 | 1564 | ||
1568 | retval = 0; | ||
1569 | out_unlock: | 1565 | out_unlock: |
1570 | pte_unmap_unlock(pte, ptl); | 1566 | pte_unmap_unlock(pte, ptl); |
1571 | out: | 1567 | return VM_FAULT_NOPAGE; |
1572 | return retval; | ||
1573 | } | ||
1574 | |||
1575 | /** | ||
1576 | * vm_insert_pfn - insert single pfn into user vma | ||
1577 | * @vma: user vma to map to | ||
1578 | * @addr: target user address of this page | ||
1579 | * @pfn: source kernel pfn | ||
1580 | * | ||
1581 | * Similar to vm_insert_page, this allows drivers to insert individual pages | ||
1582 | * they've allocated into a user vma. Same comments apply. | ||
1583 | * | ||
1584 | * This function should only be called from a vm_ops->fault handler, and | ||
1585 | * in that case the handler should return NULL. | ||
1586 | * | ||
1587 | * vma cannot be a COW mapping. | ||
1588 | * | ||
1589 | * As this is called only for pages that do not currently exist, we | ||
1590 | * do not need to flush old virtual caches or the TLB. | ||
1591 | */ | ||
1592 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | ||
1593 | unsigned long pfn) | ||
1594 | { | ||
1595 | return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); | ||
1596 | } | 1568 | } |
1597 | EXPORT_SYMBOL(vm_insert_pfn); | ||
1598 | 1569 | ||
1599 | /** | 1570 | /** |
1600 | * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot | 1571 | * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot |
1601 | * @vma: user vma to map to | 1572 | * @vma: user vma to map to |
1602 | * @addr: target user address of this page | 1573 | * @addr: target user address of this page |
1603 | * @pfn: source kernel pfn | 1574 | * @pfn: source kernel pfn |
1604 | * @pgprot: pgprot flags for the inserted page | 1575 | * @pgprot: pgprot flags for the inserted page |
1605 | * | 1576 | * |
1606 | * This is exactly like vm_insert_pfn, except that it allows drivers to | 1577 | * This is exactly like vmf_insert_pfn(), except that it allows drivers to |
1607 | * to override pgprot on a per-page basis. | 1578 | * to override pgprot on a per-page basis. |
1608 | * | 1579 | * |
1609 | * This only makes sense for IO mappings, and it makes no sense for | 1580 | * This only makes sense for IO mappings, and it makes no sense for |
1610 | * cow mappings. In general, using multiple vmas is preferable; | 1581 | * COW mappings. In general, using multiple vmas is preferable; |
1611 | * vm_insert_pfn_prot should only be used if using multiple VMAs is | 1582 | * vmf_insert_pfn_prot should only be used if using multiple VMAs is |
1612 | * impractical. | 1583 | * impractical. |
1584 | * | ||
1585 | * Context: Process context. May allocate using %GFP_KERNEL. | ||
1586 | * Return: vm_fault_t value. | ||
1613 | */ | 1587 | */ |
1614 | int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, | 1588 | vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, |
1615 | unsigned long pfn, pgprot_t pgprot) | 1589 | unsigned long pfn, pgprot_t pgprot) |
1616 | { | 1590 | { |
1617 | int ret; | ||
1618 | /* | 1591 | /* |
1619 | * Technically, architectures with pte_special can avoid all these | 1592 | * Technically, architectures with pte_special can avoid all these |
1620 | * restrictions (same for remap_pfn_range). However we would like | 1593 | * restrictions (same for remap_pfn_range). However we would like |
@@ -1628,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, | |||
1628 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | 1601 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); |
1629 | 1602 | ||
1630 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1603 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1631 | return -EFAULT; | 1604 | return VM_FAULT_SIGBUS; |
1632 | 1605 | ||
1633 | if (!pfn_modify_allowed(pfn, pgprot)) | 1606 | if (!pfn_modify_allowed(pfn, pgprot)) |
1634 | return -EACCES; | 1607 | return VM_FAULT_SIGBUS; |
1635 | 1608 | ||
1636 | track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); | 1609 | track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); |
1637 | 1610 | ||
1638 | ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, | 1611 | return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, |
1639 | false); | 1612 | false); |
1613 | } | ||
1614 | EXPORT_SYMBOL(vmf_insert_pfn_prot); | ||
1640 | 1615 | ||
1641 | return ret; | 1616 | /** |
1617 | * vmf_insert_pfn - insert single pfn into user vma | ||
1618 | * @vma: user vma to map to | ||
1619 | * @addr: target user address of this page | ||
1620 | * @pfn: source kernel pfn | ||
1621 | * | ||
1622 | * Similar to vm_insert_page, this allows drivers to insert individual pages | ||
1623 | * they've allocated into a user vma. Same comments apply. | ||
1624 | * | ||
1625 | * This function should only be called from a vm_ops->fault handler, and | ||
1626 | * in that case the handler should return the result of this function. | ||
1627 | * | ||
1628 | * vma cannot be a COW mapping. | ||
1629 | * | ||
1630 | * As this is called only for pages that do not currently exist, we | ||
1631 | * do not need to flush old virtual caches or the TLB. | ||
1632 | * | ||
1633 | * Context: Process context. May allocate using %GFP_KERNEL. | ||
1634 | * Return: vm_fault_t value. | ||
1635 | */ | ||
1636 | vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | ||
1637 | unsigned long pfn) | ||
1638 | { | ||
1639 | return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); | ||
1642 | } | 1640 | } |
1643 | EXPORT_SYMBOL(vm_insert_pfn_prot); | 1641 | EXPORT_SYMBOL(vmf_insert_pfn); |
1644 | 1642 | ||
1645 | static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) | 1643 | static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) |
1646 | { | 1644 | { |
@@ -1656,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) | |||
1656 | return false; | 1654 | return false; |
1657 | } | 1655 | } |
1658 | 1656 | ||
1659 | static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1657 | static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, |
1660 | pfn_t pfn, bool mkwrite) | 1658 | unsigned long addr, pfn_t pfn, bool mkwrite) |
1661 | { | 1659 | { |
1662 | pgprot_t pgprot = vma->vm_page_prot; | 1660 | pgprot_t pgprot = vma->vm_page_prot; |
1661 | int err; | ||
1663 | 1662 | ||
1664 | BUG_ON(!vm_mixed_ok(vma, pfn)); | 1663 | BUG_ON(!vm_mixed_ok(vma, pfn)); |
1665 | 1664 | ||
1666 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1665 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1667 | return -EFAULT; | 1666 | return VM_FAULT_SIGBUS; |
1668 | 1667 | ||
1669 | track_pfn_insert(vma, &pgprot, pfn); | 1668 | track_pfn_insert(vma, &pgprot, pfn); |
1670 | 1669 | ||
1671 | if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) | 1670 | if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) |
1672 | return -EACCES; | 1671 | return VM_FAULT_SIGBUS; |
1673 | 1672 | ||
1674 | /* | 1673 | /* |
1675 | * If we don't have pte special, then we have to use the pfn_valid() | 1674 | * If we don't have pte special, then we have to use the pfn_valid() |
@@ -1688,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | |||
1688 | * result in pfn_t_has_page() == false. | 1687 | * result in pfn_t_has_page() == false. |
1689 | */ | 1688 | */ |
1690 | page = pfn_to_page(pfn_t_to_pfn(pfn)); | 1689 | page = pfn_to_page(pfn_t_to_pfn(pfn)); |
1691 | return insert_page(vma, addr, page, pgprot); | 1690 | err = insert_page(vma, addr, page, pgprot); |
1691 | } else { | ||
1692 | return insert_pfn(vma, addr, pfn, pgprot, mkwrite); | ||
1692 | } | 1693 | } |
1693 | return insert_pfn(vma, addr, pfn, pgprot, mkwrite); | 1694 | |
1695 | if (err == -ENOMEM) | ||
1696 | return VM_FAULT_OOM; | ||
1697 | if (err < 0 && err != -EBUSY) | ||
1698 | return VM_FAULT_SIGBUS; | ||
1699 | |||
1700 | return VM_FAULT_NOPAGE; | ||
1694 | } | 1701 | } |
1695 | 1702 | ||
1696 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1703 | vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1697 | pfn_t pfn) | 1704 | pfn_t pfn) |
1698 | { | 1705 | { |
1699 | return __vm_insert_mixed(vma, addr, pfn, false); | 1706 | return __vm_insert_mixed(vma, addr, pfn, false); |
1700 | |||
1701 | } | 1707 | } |
1702 | EXPORT_SYMBOL(vm_insert_mixed); | 1708 | EXPORT_SYMBOL(vmf_insert_mixed); |
1703 | 1709 | ||
1704 | /* | 1710 | /* |
1705 | * If the insertion of PTE failed because someone else already added a | 1711 | * If the insertion of PTE failed because someone else already added a |
1706 | * different entry in the mean time, we treat that as success as we assume | 1712 | * different entry in the mean time, we treat that as success as we assume |
1707 | * the same entry was actually inserted. | 1713 | * the same entry was actually inserted. |
1708 | */ | 1714 | */ |
1709 | |||
1710 | vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, | 1715 | vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, |
1711 | unsigned long addr, pfn_t pfn) | 1716 | unsigned long addr, pfn_t pfn) |
1712 | { | 1717 | { |
1713 | int err; | 1718 | return __vm_insert_mixed(vma, addr, pfn, true); |
1714 | |||
1715 | err = __vm_insert_mixed(vma, addr, pfn, true); | ||
1716 | if (err == -ENOMEM) | ||
1717 | return VM_FAULT_OOM; | ||
1718 | if (err < 0 && err != -EBUSY) | ||
1719 | return VM_FAULT_SIGBUS; | ||
1720 | return VM_FAULT_NOPAGE; | ||
1721 | } | 1719 | } |
1722 | EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); | 1720 | EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); |
1723 | 1721 | ||
@@ -3498,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf) | |||
3498 | struct vm_area_struct *vma = vmf->vma; | 3496 | struct vm_area_struct *vma = vmf->vma; |
3499 | vm_fault_t ret; | 3497 | vm_fault_t ret; |
3500 | 3498 | ||
3501 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3499 | /* |
3502 | if (!vma->vm_ops->fault) | 3500 | * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND |
3503 | ret = VM_FAULT_SIGBUS; | 3501 | */ |
3504 | else if (!(vmf->flags & FAULT_FLAG_WRITE)) | 3502 | if (!vma->vm_ops->fault) { |
3503 | /* | ||
3504 | * If we find a migration pmd entry or a none pmd entry, which | ||
3505 | * should never happen, return SIGBUS | ||
3506 | */ | ||
3507 | if (unlikely(!pmd_present(*vmf->pmd))) | ||
3508 | ret = VM_FAULT_SIGBUS; | ||
3509 | else { | ||
3510 | vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, | ||
3511 | vmf->pmd, | ||
3512 | vmf->address, | ||
3513 | &vmf->ptl); | ||
3514 | /* | ||
3515 | * Make sure this is not a temporary clearing of pte | ||
3516 | * by holding ptl and checking again. A R/M/W update | ||
3517 | * of pte involves: take ptl, clearing the pte so that | ||
3518 | * we don't have concurrent modification by hardware | ||
3519 | * followed by an update. | ||
3520 | */ | ||
3521 | if (unlikely(pte_none(*vmf->pte))) | ||
3522 | ret = VM_FAULT_SIGBUS; | ||
3523 | else | ||
3524 | ret = VM_FAULT_NOPAGE; | ||
3525 | |||
3526 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
3527 | } | ||
3528 | } else if (!(vmf->flags & FAULT_FLAG_WRITE)) | ||
3505 | ret = do_read_fault(vmf); | 3529 | ret = do_read_fault(vmf); |
3506 | else if (!(vma->vm_flags & VM_SHARED)) | 3530 | else if (!(vma->vm_flags & VM_SHARED)) |
3507 | ret = do_cow_fault(vmf); | 3531 | ret = do_cow_fault(vmf); |
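The mm/memory.c hunks convert the pfn insertion helpers from returning an errno to returning a vm_fault_t (vm_insert_pfn, vm_insert_pfn_prot and vm_insert_mixed become vmf_insert_pfn, vmf_insert_pfn_prot and vmf_insert_mixed), so a ->fault handler can hand their result back directly, and the errno translation that callers used to do now lives in __vm_insert_mixed(). A standalone model of that translation; the constants are chosen to match the usual kernel values but are defined locally here.

	#include <errno.h>
	#include <stdio.h>

	enum vm_fault { VM_FAULT_OOM = 0x1, VM_FAULT_SIGBUS = 0x2, VM_FAULT_NOPAGE = 0x100 };

	/*
	 * Mirrors the logic folded into __vm_insert_mixed() above: -ENOMEM is an
	 * OOM fault, any other error except -EBUSY is SIGBUS, and success or the
	 * "someone already inserted the same entry" -EBUSY case is NOPAGE.
	 */
	static enum vm_fault errno_to_fault(int err)
	{
		if (err == -ENOMEM)
			return VM_FAULT_OOM;
		if (err < 0 && err != -EBUSY)
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;
	}

	int main(void)
	{
		printf("%#x %#x %#x\n", errno_to_fault(0),
		       errno_to_fault(-ENOMEM), errno_to_fault(-EBUSY));
		return 0;
	}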
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 38d94b703e9d..7e6509a53d79 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
687 | struct zone *zone, struct memory_notify *arg) | 687 | struct zone *zone, struct memory_notify *arg) |
688 | { | 688 | { |
689 | int nid = zone_to_nid(zone); | 689 | int nid = zone_to_nid(zone); |
690 | enum zone_type zone_last = ZONE_NORMAL; | ||
691 | 690 | ||
692 | /* | 691 | arg->status_change_nid = -1; |
693 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | 692 | arg->status_change_nid_normal = -1; |
694 | * contains nodes which have zones of 0...ZONE_NORMAL, | 693 | arg->status_change_nid_high = -1; |
695 | * set zone_last to ZONE_NORMAL. | ||
696 | * | ||
697 | * If we don't have HIGHMEM nor movable node, | ||
698 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
699 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
700 | */ | ||
701 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
702 | zone_last = ZONE_MOVABLE; | ||
703 | 694 | ||
704 | /* | 695 | if (!node_state(nid, N_MEMORY)) |
705 | * if the memory to be online is in a zone of 0...zone_last, and | 696 | arg->status_change_nid = nid; |
706 | * the zones of 0...zone_last don't have memory before online, we will | 697 | if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) |
707 | * need to set the node to node_states[N_NORMAL_MEMORY] after | ||
708 | * the memory is online. | ||
709 | */ | ||
710 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) | ||
711 | arg->status_change_nid_normal = nid; | 698 | arg->status_change_nid_normal = nid; |
712 | else | ||
713 | arg->status_change_nid_normal = -1; | ||
714 | |||
715 | #ifdef CONFIG_HIGHMEM | 699 | #ifdef CONFIG_HIGHMEM |
716 | /* | 700 | if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) |
717 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
718 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
719 | * set zone_last to ZONE_HIGHMEM. | ||
720 | * | ||
721 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
722 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
723 | * set zone_last to ZONE_MOVABLE. | ||
724 | */ | ||
725 | zone_last = ZONE_HIGHMEM; | ||
726 | if (N_MEMORY == N_HIGH_MEMORY) | ||
727 | zone_last = ZONE_MOVABLE; | ||
728 | |||
729 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) | ||
730 | arg->status_change_nid_high = nid; | 701 | arg->status_change_nid_high = nid; |
731 | else | ||
732 | arg->status_change_nid_high = -1; | ||
733 | #else | ||
734 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
735 | #endif | 702 | #endif |
736 | |||
737 | /* | ||
738 | * if the node don't have memory befor online, we will need to | ||
739 | * set the node to node_states[N_MEMORY] after the memory | ||
740 | * is online. | ||
741 | */ | ||
742 | if (!node_state(nid, N_MEMORY)) | ||
743 | arg->status_change_nid = nid; | ||
744 | else | ||
745 | arg->status_change_nid = -1; | ||
746 | } | 703 | } |
747 | 704 | ||
748 | static void node_states_set_node(int node, struct memory_notify *arg) | 705 | static void node_states_set_node(int node, struct memory_notify *arg) |
@@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
753 | if (arg->status_change_nid_high >= 0) | 710 | if (arg->status_change_nid_high >= 0) |
754 | node_set_state(node, N_HIGH_MEMORY); | 711 | node_set_state(node, N_HIGH_MEMORY); |
755 | 712 | ||
756 | node_set_state(node, N_MEMORY); | 713 | if (arg->status_change_nid >= 0) |
714 | node_set_state(node, N_MEMORY); | ||
757 | } | 715 | } |
758 | 716 | ||
759 | static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, | 717 | static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, |
@@ -1505,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
1505 | { | 1463 | { |
1506 | struct pglist_data *pgdat = zone->zone_pgdat; | 1464 | struct pglist_data *pgdat = zone->zone_pgdat; |
1507 | unsigned long present_pages = 0; | 1465 | unsigned long present_pages = 0; |
1508 | enum zone_type zt, zone_last = ZONE_NORMAL; | 1466 | enum zone_type zt; |
1509 | 1467 | ||
1510 | /* | 1468 | arg->status_change_nid = -1; |
1511 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | 1469 | arg->status_change_nid_normal = -1; |
1512 | * contains nodes which have zones of 0...ZONE_NORMAL, | 1470 | arg->status_change_nid_high = -1; |
1513 | * set zone_last to ZONE_NORMAL. | ||
1514 | * | ||
1515 | * If we don't have HIGHMEM nor movable node, | ||
1516 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
1517 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
1518 | */ | ||
1519 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
1520 | zone_last = ZONE_MOVABLE; | ||
1521 | 1471 | ||
1522 | /* | 1472 | /* |
1523 | * check whether node_states[N_NORMAL_MEMORY] will be changed. | 1473 | * Check whether node_states[N_NORMAL_MEMORY] will be changed. |
1524 | * If the memory to be offline is in a zone of 0...zone_last, | 1474 | * If the memory to be offline is within the range |
1525 | * and it is the last present memory, 0...zone_last will | 1475 | * [0..ZONE_NORMAL], and it is the last present memory there, |
1526 | * become empty after offline , thus we can determind we will | 1476 | * the zones in that range will become empty after the offlining, |
1527 | * need to clear the node from node_states[N_NORMAL_MEMORY]. | 1477 | * thus we can determine that we need to clear the node from |
1478 | * node_states[N_NORMAL_MEMORY]. | ||
1528 | */ | 1479 | */ |
1529 | for (zt = 0; zt <= zone_last; zt++) | 1480 | for (zt = 0; zt <= ZONE_NORMAL; zt++) |
1530 | present_pages += pgdat->node_zones[zt].present_pages; | 1481 | present_pages += pgdat->node_zones[zt].present_pages; |
1531 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | 1482 | if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) |
1532 | arg->status_change_nid_normal = zone_to_nid(zone); | 1483 | arg->status_change_nid_normal = zone_to_nid(zone); |
1533 | else | ||
1534 | arg->status_change_nid_normal = -1; | ||
1535 | 1484 | ||
1536 | #ifdef CONFIG_HIGHMEM | 1485 | #ifdef CONFIG_HIGHMEM |
1537 | /* | 1486 | /* |
1538 | * If we have movable node, node_states[N_HIGH_MEMORY] | 1487 | * node_states[N_HIGH_MEMORY] contains nodes which |
1539 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | 1488 | * have normal memory or high memory. |
1540 | * set zone_last to ZONE_HIGHMEM. | 1489 | * Here we add the present_pages belonging to ZONE_HIGHMEM. |
1541 | * | 1490 | * If the zone is within the range of [0..ZONE_HIGHMEM), and |
1542 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | 1491 | * we determine that the zones in that range become empty, |
1543 | * contains nodes which have zones of 0...ZONE_MOVABLE, | 1492 | * we need to clear the node for N_HIGH_MEMORY. |
1544 | * set zone_last to ZONE_MOVABLE. | ||
1545 | */ | 1493 | */ |
1546 | zone_last = ZONE_HIGHMEM; | 1494 | present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
1547 | if (N_MEMORY == N_HIGH_MEMORY) | 1495 | if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) |
1548 | zone_last = ZONE_MOVABLE; | ||
1549 | |||
1550 | for (; zt <= zone_last; zt++) | ||
1551 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1552 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1553 | arg->status_change_nid_high = zone_to_nid(zone); | 1496 | arg->status_change_nid_high = zone_to_nid(zone); |
1554 | else | ||
1555 | arg->status_change_nid_high = -1; | ||
1556 | #else | ||
1557 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
1558 | #endif | 1497 | #endif |
1559 | 1498 | ||
1560 | /* | 1499 | /* |
1561 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | 1500 | * We have accounted the pages from [0..ZONE_NORMAL), and |
1501 | * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM | ||
1502 | * as well. | ||
1503 | * Here we count the possible pages from ZONE_MOVABLE. | ||
1504 | * If after having accounted all the pages, we see that the nr_pages | ||
1505 | * to be offlined is over or equal to the accounted pages, | ||
1506 | * we know that the node will become empty, and so, we can clear | ||
1507 | * it for N_MEMORY as well. | ||
1562 | */ | 1508 | */ |
1563 | zone_last = ZONE_MOVABLE; | 1509 | present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; |
1564 | 1510 | ||
1565 | /* | ||
1566 | * check whether node_states[N_HIGH_MEMORY] will be changed | ||
1567 | * If we try to offline the last present @nr_pages from the node, | ||
1568 | * we can determind we will need to clear the node from | ||
1569 | * node_states[N_HIGH_MEMORY]. | ||
1570 | */ | ||
1571 | for (; zt <= zone_last; zt++) | ||
1572 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1573 | if (nr_pages >= present_pages) | 1511 | if (nr_pages >= present_pages) |
1574 | arg->status_change_nid = zone_to_nid(zone); | 1512 | arg->status_change_nid = zone_to_nid(zone); |
1575 | else | ||
1576 | arg->status_change_nid = -1; | ||
1577 | } | 1513 | } |
1578 | 1514 | ||
1579 | static void node_states_clear_node(int node, struct memory_notify *arg) | 1515 | static void node_states_clear_node(int node, struct memory_notify *arg) |
@@ -1581,12 +1517,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg) | |||
1581 | if (arg->status_change_nid_normal >= 0) | 1517 | if (arg->status_change_nid_normal >= 0) |
1582 | node_clear_state(node, N_NORMAL_MEMORY); | 1518 | node_clear_state(node, N_NORMAL_MEMORY); |
1583 | 1519 | ||
1584 | if ((N_MEMORY != N_NORMAL_MEMORY) && | 1520 | if (arg->status_change_nid_high >= 0) |
1585 | (arg->status_change_nid_high >= 0)) | ||
1586 | node_clear_state(node, N_HIGH_MEMORY); | 1521 | node_clear_state(node, N_HIGH_MEMORY); |
1587 | 1522 | ||
1588 | if ((N_MEMORY != N_HIGH_MEMORY) && | 1523 | if (arg->status_change_nid >= 0) |
1589 | (arg->status_change_nid >= 0)) | ||
1590 | node_clear_state(node, N_MEMORY); | 1524 | node_clear_state(node, N_MEMORY); |
1591 | } | 1525 | } |
1592 | 1526 | ||
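Both node_states_check_changes_online() and its offline counterpart lose the zone_last bookkeeping: the three status_change_nid* fields start at -1 and are only filled in when the corresponding node state would actually change. A standalone model of the online case follows; zone indices and the CONFIG_HIGHMEM-only branch are simplified, and booleans stand in for the node_state() checks.

	#include <stdbool.h>
	#include <stdio.h>

	enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

	struct memory_notify {
		int status_change_nid;
		int status_change_nid_normal;
		int status_change_nid_high;
	};

	static void check_changes_online(int nid, enum zone_type zone, bool has_memory,
					 bool has_normal, bool has_high,
					 struct memory_notify *arg)
	{
		arg->status_change_nid = -1;
		arg->status_change_nid_normal = -1;
		arg->status_change_nid_high = -1;

		if (!has_memory)			/* node gains its first memory */
			arg->status_change_nid = nid;
		if (zone <= ZONE_NORMAL && !has_normal)
			arg->status_change_nid_normal = nid;
		if (zone <= ZONE_HIGHMEM && !has_high)	/* HIGHMEM-only branch in the kernel */
			arg->status_change_nid_high = nid;
	}

	int main(void)
	{
		struct memory_notify arg;

		/* Onlining ZONE_NORMAL memory on a node that previously had none. */
		check_changes_online(1, ZONE_NORMAL, false, false, false, &arg);
		printf("nid=%d normal=%d high=%d\n", arg.status_change_nid,
		       arg.status_change_nid_normal, arg.status_change_nid_high);
		return 0;
	}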
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index da858f794eb6..cfd26d7e61a1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | |||
797 | } | 797 | } |
798 | } | 798 | } |
799 | 799 | ||
800 | static int lookup_node(unsigned long addr) | 800 | static int lookup_node(struct mm_struct *mm, unsigned long addr) |
801 | { | 801 | { |
802 | struct page *p; | 802 | struct page *p; |
803 | int err; | 803 | int err; |
804 | 804 | ||
805 | err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); | 805 | int locked = 1; |
806 | err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); | ||
806 | if (err >= 0) { | 807 | if (err >= 0) { |
807 | err = page_to_nid(p); | 808 | err = page_to_nid(p); |
808 | put_page(p); | 809 | put_page(p); |
809 | } | 810 | } |
811 | if (locked) | ||
812 | up_read(&mm->mmap_sem); | ||
810 | return err; | 813 | return err; |
811 | } | 814 | } |
812 | 815 | ||
@@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
817 | int err; | 820 | int err; |
818 | struct mm_struct *mm = current->mm; | 821 | struct mm_struct *mm = current->mm; |
819 | struct vm_area_struct *vma = NULL; | 822 | struct vm_area_struct *vma = NULL; |
820 | struct mempolicy *pol = current->mempolicy; | 823 | struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; |
821 | 824 | ||
822 | if (flags & | 825 | if (flags & |
823 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 826 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
@@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
857 | 860 | ||
858 | if (flags & MPOL_F_NODE) { | 861 | if (flags & MPOL_F_NODE) { |
859 | if (flags & MPOL_F_ADDR) { | 862 | if (flags & MPOL_F_ADDR) { |
860 | err = lookup_node(addr); | 863 | /* |
864 | * Take a refcount on the mpol, lookup_node() | ||
865 | * wil drop the mmap_sem, so after calling | ||
866 | * lookup_node() only "pol" remains valid, "vma" | ||
867 | * is stale. | ||
868 | */ | ||
869 | pol_refcount = pol; | ||
870 | vma = NULL; | ||
871 | mpol_get(pol); | ||
872 | err = lookup_node(mm, addr); | ||
861 | if (err < 0) | 873 | if (err < 0) |
862 | goto out; | 874 | goto out; |
863 | *policy = err; | 875 | *policy = err; |
@@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
892 | out: | 904 | out: |
893 | mpol_cond_put(pol); | 905 | mpol_cond_put(pol); |
894 | if (vma) | 906 | if (vma) |
895 | up_read(¤t->mm->mmap_sem); | 907 | up_read(&mm->mmap_sem); |
908 | if (pol_refcount) | ||
909 | mpol_put(pol_refcount); | ||
896 | return err; | 910 | return err; |
897 | } | 911 | } |
898 | 912 | ||
@@ -2697,12 +2711,11 @@ static const char * const policy_modes[] = | |||
2697 | int mpol_parse_str(char *str, struct mempolicy **mpol) | 2711 | int mpol_parse_str(char *str, struct mempolicy **mpol) |
2698 | { | 2712 | { |
2699 | struct mempolicy *new = NULL; | 2713 | struct mempolicy *new = NULL; |
2700 | unsigned short mode; | ||
2701 | unsigned short mode_flags; | 2714 | unsigned short mode_flags; |
2702 | nodemask_t nodes; | 2715 | nodemask_t nodes; |
2703 | char *nodelist = strchr(str, ':'); | 2716 | char *nodelist = strchr(str, ':'); |
2704 | char *flags = strchr(str, '='); | 2717 | char *flags = strchr(str, '='); |
2705 | int err = 1; | 2718 | int err = 1, mode; |
2706 | 2719 | ||
2707 | if (nodelist) { | 2720 | if (nodelist) { |
2708 | /* NUL-terminate mode or flags string */ | 2721 | /* NUL-terminate mode or flags string */ |
@@ -2717,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) | |||
2717 | if (flags) | 2730 | if (flags) |
2718 | *flags++ = '\0'; /* terminate mode string */ | 2731 | *flags++ = '\0'; /* terminate mode string */ |
2719 | 2732 | ||
2720 | for (mode = 0; mode < MPOL_MAX; mode++) { | 2733 | mode = match_string(policy_modes, MPOL_MAX, str); |
2721 | if (!strcmp(str, policy_modes[mode])) { | 2734 | if (mode < 0) |
2722 | break; | ||
2723 | } | ||
2724 | } | ||
2725 | if (mode >= MPOL_MAX) | ||
2726 | goto out; | 2735 | goto out; |
2727 | 2736 | ||
2728 | switch (mode) { | 2737 | switch (mode) { |
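
The open-coded strcmp() loop over policy_modes[] is replaced by match_string(), which returns the index of the matching entry or a negative errno. The fragment below is a standalone userspace approximation of that pattern, not the kernel helper itself; the mode names are copied here for illustration only.

/*
 * Standalone sketch (not kernel code): a userspace stand-in for the
 * match_string() pattern used above. The kernel helper returns the
 * index of the matching entry or a negative errno; -EINVAL is used
 * here for "no match".
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static const char * const policy_modes[] = {
	"default", "prefer", "bind", "interleave", "local",
};

static int match_string_sketch(const char * const *array, size_t n,
			       const char *string)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (array[i] && strcmp(array[i], string) == 0)
			return (int)i;
	return -EINVAL;
}

int main(void)
{
	int mode = match_string_sketch(policy_modes, 5, "interleave");

	if (mode < 0)
		printf("unknown mode\n");
	else
		printf("mode index %d\n", mode);	/* prints 3 */
	return 0;
}
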
diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..b6700f2962f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page) | |||
685 | SetPageActive(newpage); | 685 | SetPageActive(newpage); |
686 | } else if (TestClearPageUnevictable(page)) | 686 | } else if (TestClearPageUnevictable(page)) |
687 | SetPageUnevictable(newpage); | 687 | SetPageUnevictable(newpage); |
688 | if (PageWorkingset(page)) | ||
689 | SetPageWorkingset(newpage); | ||
688 | if (PageChecked(page)) | 690 | if (PageChecked(page)) |
689 | SetPageChecked(newpage); | 691 | SetPageChecked(newpage); |
690 | if (PageMappedToDisk(page)) | 692 | if (PageMappedToDisk(page)) |
@@ -1973,8 +1975,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1973 | int isolated = 0; | 1975 | int isolated = 0; |
1974 | struct page *new_page = NULL; | 1976 | struct page *new_page = NULL; |
1975 | int page_lru = page_is_file_cache(page); | 1977 | int page_lru = page_is_file_cache(page); |
1976 | unsigned long mmun_start = address & HPAGE_PMD_MASK; | 1978 | unsigned long start = address & HPAGE_PMD_MASK; |
1977 | unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; | ||
1978 | 1979 | ||
1979 | new_page = alloc_pages_node(node, | 1980 | new_page = alloc_pages_node(node, |
1980 | (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), | 1981 | (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), |
@@ -1997,15 +1998,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1997 | /* anon mapping, we can simply copy page->mapping to the new page: */ | 1998 | /* anon mapping, we can simply copy page->mapping to the new page: */ |
1998 | new_page->mapping = page->mapping; | 1999 | new_page->mapping = page->mapping; |
1999 | new_page->index = page->index; | 2000 | new_page->index = page->index; |
2001 | /* flush the cache before copying using the kernel virtual address */ | ||
2002 | flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); | ||
2000 | migrate_page_copy(new_page, page); | 2003 | migrate_page_copy(new_page, page); |
2001 | WARN_ON(PageLRU(new_page)); | 2004 | WARN_ON(PageLRU(new_page)); |
2002 | 2005 | ||
2003 | /* Recheck the target PMD */ | 2006 | /* Recheck the target PMD */ |
2004 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2005 | ptl = pmd_lock(mm, pmd); | 2007 | ptl = pmd_lock(mm, pmd); |
2006 | if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { | 2008 | if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { |
2007 | spin_unlock(ptl); | 2009 | spin_unlock(ptl); |
2008 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2009 | 2010 | ||
2010 | /* Reverse changes made by migrate_page_copy() */ | 2011 | /* Reverse changes made by migrate_page_copy() */ |
2011 | if (TestClearPageActive(new_page)) | 2012 | if (TestClearPageActive(new_page)) |
@@ -2029,16 +2030,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
2029 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 2030 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
2030 | 2031 | ||
2031 | /* | 2032 | /* |
2032 | * Clear the old entry under pagetable lock and establish the new PTE. | 2033 | * Overwrite the old entry under pagetable lock and establish |
2033 | * Any parallel GUP will either observe the old page blocking on the | 2034 | * the new PTE. Any parallel GUP will either observe the old |
2034 | * page lock, block on the page table lock or observe the new page. | 2035 | * page blocking on the page lock, block on the page table |
2035 | * The SetPageUptodate on the new page and page_add_new_anon_rmap | 2036 | * lock or observe the new page. The SetPageUptodate on the |
2036 | * guarantee the copy is visible before the pagetable update. | 2037 | * new page and page_add_new_anon_rmap guarantee the copy is |
2038 | * visible before the pagetable update. | ||
2037 | */ | 2039 | */ |
2038 | flush_cache_range(vma, mmun_start, mmun_end); | 2040 | page_add_anon_rmap(new_page, vma, start, true); |
2039 | page_add_anon_rmap(new_page, vma, mmun_start, true); | 2041 | /* |
2040 | pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); | 2042 | * At this point the pmd is numa/protnone (i.e. non present) and the TLB |
2041 | set_pmd_at(mm, mmun_start, pmd, entry); | 2043 | * has already been flushed globally. So no TLB can be currently |
2044 | * caching this non present pmd mapping. There's no need to clear the | ||
2045 | * pmd before doing set_pmd_at(), nor to flush the TLB after | ||
2046 | * set_pmd_at(). Clearing the pmd here would introduce a race | ||
2047 | * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the | ||
2048 | * mmap_sem for reading. If the pmd is set to NULL at any given time, | ||
2049 | * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this | ||
2050 | * pmd. | ||
2051 | */ | ||
2052 | set_pmd_at(mm, start, pmd, entry); | ||
2042 | update_mmu_cache_pmd(vma, address, &entry); | 2053 | update_mmu_cache_pmd(vma, address, &entry); |
2043 | 2054 | ||
2044 | page_ref_unfreeze(page, 2); | 2055 | page_ref_unfreeze(page, 2); |
@@ -2047,11 +2058,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
2047 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); | 2058 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); |
2048 | 2059 | ||
2049 | spin_unlock(ptl); | 2060 | spin_unlock(ptl); |
2050 | /* | ||
2051 | * No need to double call mmu_notifier->invalidate_range() callback as | ||
2052 | * the above pmdp_huge_clear_flush_notify() did already call it. | ||
2053 | */ | ||
2054 | mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); | ||
2055 | 2061 | ||
2056 | /* Take an "isolate" reference and put new page on the LRU. */ | 2062 | /* Take an "isolate" reference and put new page on the LRU. */ |
2057 | get_page(new_page); | 2063 | get_page(new_page); |
@@ -2075,7 +2081,7 @@ out_fail: | |||
2075 | ptl = pmd_lock(mm, pmd); | 2081 | ptl = pmd_lock(mm, pmd); |
2076 | if (pmd_same(*pmd, entry)) { | 2082 | if (pmd_same(*pmd, entry)) { |
2077 | entry = pmd_modify(entry, vma->vm_page_prot); | 2083 | entry = pmd_modify(entry, vma->vm_page_prot); |
2078 | set_pmd_at(mm, mmun_start, pmd, entry); | 2084 | set_pmd_at(mm, start, pmd, entry); |
2079 | update_mmu_cache_pmd(vma, address, &entry); | 2085 | update_mmu_cache_pmd(vma, address, &entry); |
2080 | } | 2086 | } |
2081 | spin_unlock(ptl); | 2087 | spin_unlock(ptl); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long | |||
191 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 191 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
192 | { | 192 | { |
193 | unsigned long retval; | 193 | unsigned long retval; |
194 | unsigned long newbrk, oldbrk; | 194 | unsigned long newbrk, oldbrk, origbrk; |
195 | struct mm_struct *mm = current->mm; | 195 | struct mm_struct *mm = current->mm; |
196 | struct vm_area_struct *next; | 196 | struct vm_area_struct *next; |
197 | unsigned long min_brk; | 197 | unsigned long min_brk; |
198 | bool populate; | 198 | bool populate; |
199 | bool downgraded = false; | ||
199 | LIST_HEAD(uf); | 200 | LIST_HEAD(uf); |
200 | 201 | ||
201 | if (down_write_killable(&mm->mmap_sem)) | 202 | if (down_write_killable(&mm->mmap_sem)) |
202 | return -EINTR; | 203 | return -EINTR; |
203 | 204 | ||
205 | origbrk = mm->brk; | ||
206 | |||
204 | #ifdef CONFIG_COMPAT_BRK | 207 | #ifdef CONFIG_COMPAT_BRK |
205 | /* | 208 | /* |
206 | * CONFIG_COMPAT_BRK can still be overridden by setting | 209 | * CONFIG_COMPAT_BRK can still be overridden by setting |
@@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
229 | 232 | ||
230 | newbrk = PAGE_ALIGN(brk); | 233 | newbrk = PAGE_ALIGN(brk); |
231 | oldbrk = PAGE_ALIGN(mm->brk); | 234 | oldbrk = PAGE_ALIGN(mm->brk); |
232 | if (oldbrk == newbrk) | 235 | if (oldbrk == newbrk) { |
233 | goto set_brk; | 236 | mm->brk = brk; |
237 | goto success; | ||
238 | } | ||
234 | 239 | ||
235 | /* Always allow shrinking brk. */ | 240 | /* |
241 | * Always allow shrinking brk. | ||
242 | * __do_munmap() may downgrade mmap_sem to read. | ||
243 | */ | ||
236 | if (brk <= mm->brk) { | 244 | if (brk <= mm->brk) { |
237 | if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) | 245 | int ret; |
238 | goto set_brk; | 246 | |
239 | goto out; | 247 | /* |
248 | * mm->brk must be protected by write mmap_sem, so update it | ||
249 | * before downgrading mmap_sem. When __do_munmap() fails, | ||
250 | * mm->brk will be restored from origbrk. | ||
251 | */ | ||
252 | mm->brk = brk; | ||
253 | ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); | ||
254 | if (ret < 0) { | ||
255 | mm->brk = origbrk; | ||
256 | goto out; | ||
257 | } else if (ret == 1) { | ||
258 | downgraded = true; | ||
259 | } | ||
260 | goto success; | ||
240 | } | 261 | } |
241 | 262 | ||
242 | /* Check against existing mmap mappings. */ | 263 | /* Check against existing mmap mappings. */ |
@@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
247 | /* Ok, looks good - let it rip. */ | 268 | /* Ok, looks good - let it rip. */ |
248 | if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) | 269 | if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) |
249 | goto out; | 270 | goto out; |
250 | |||
251 | set_brk: | ||
252 | mm->brk = brk; | 271 | mm->brk = brk; |
272 | |||
273 | success: | ||
253 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | 274 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; |
254 | up_write(&mm->mmap_sem); | 275 | if (downgraded) |
276 | up_read(&mm->mmap_sem); | ||
277 | else | ||
278 | up_write(&mm->mmap_sem); | ||
255 | userfaultfd_unmap_complete(mm, &uf); | 279 | userfaultfd_unmap_complete(mm, &uf); |
256 | if (populate) | 280 | if (populate) |
257 | mm_populate(oldbrk, newbrk - oldbrk); | 281 | mm_populate(oldbrk, newbrk - oldbrk); |
258 | return brk; | 282 | return brk; |
259 | 283 | ||
260 | out: | 284 | out: |
261 | retval = mm->brk; | 285 | retval = origbrk; |
262 | up_write(&mm->mmap_sem); | 286 | up_write(&mm->mmap_sem); |
263 | return retval; | 287 | return retval; |
264 | } | 288 | } |
@@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2687 | * work. This now handles partial unmappings. | 2711 | * work. This now handles partial unmappings. |
2688 | * Jeremy Fitzhardinge <jeremy@goop.org> | 2712 | * Jeremy Fitzhardinge <jeremy@goop.org> |
2689 | */ | 2713 | */ |
2690 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, | 2714 | int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, |
2691 | struct list_head *uf) | 2715 | struct list_head *uf, bool downgrade) |
2692 | { | 2716 | { |
2693 | unsigned long end; | 2717 | unsigned long end; |
2694 | struct vm_area_struct *vma, *prev, *last; | 2718 | struct vm_area_struct *vma, *prev, *last; |
@@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, | |||
2770 | mm->locked_vm -= vma_pages(tmp); | 2794 | mm->locked_vm -= vma_pages(tmp); |
2771 | munlock_vma_pages_all(tmp); | 2795 | munlock_vma_pages_all(tmp); |
2772 | } | 2796 | } |
2797 | |||
2773 | tmp = tmp->vm_next; | 2798 | tmp = tmp->vm_next; |
2774 | } | 2799 | } |
2775 | } | 2800 | } |
2776 | 2801 | ||
2777 | /* | 2802 | /* Detach vmas from rbtree */ |
2778 | * Remove the vma's, and unmap the actual pages | ||
2779 | */ | ||
2780 | detach_vmas_to_be_unmapped(mm, vma, prev, end); | 2803 | detach_vmas_to_be_unmapped(mm, vma, prev, end); |
2781 | unmap_region(mm, vma, prev, start, end); | ||
2782 | 2804 | ||
2805 | /* | ||
2806 | * mpx unmap needs to be called with mmap_sem held for write. | ||
2807 | * It is safe to call it before unmap_region(). | ||
2808 | */ | ||
2783 | arch_unmap(mm, vma, start, end); | 2809 | arch_unmap(mm, vma, start, end); |
2784 | 2810 | ||
2811 | if (downgrade) | ||
2812 | downgrade_write(&mm->mmap_sem); | ||
2813 | |||
2814 | unmap_region(mm, vma, prev, start, end); | ||
2815 | |||
2785 | /* Fix up all other VM information */ | 2816 | /* Fix up all other VM information */ |
2786 | remove_vma_list(mm, vma); | 2817 | remove_vma_list(mm, vma); |
2787 | 2818 | ||
2788 | return 0; | 2819 | return downgrade ? 1 : 0; |
2789 | } | 2820 | } |
2790 | 2821 | ||
2791 | int vm_munmap(unsigned long start, size_t len) | 2822 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, |
2823 | struct list_head *uf) | ||
2824 | { | ||
2825 | return __do_munmap(mm, start, len, uf, false); | ||
2826 | } | ||
2827 | |||
2828 | static int __vm_munmap(unsigned long start, size_t len, bool downgrade) | ||
2792 | { | 2829 | { |
2793 | int ret; | 2830 | int ret; |
2794 | struct mm_struct *mm = current->mm; | 2831 | struct mm_struct *mm = current->mm; |
@@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len) | |||
2797 | if (down_write_killable(&mm->mmap_sem)) | 2834 | if (down_write_killable(&mm->mmap_sem)) |
2798 | return -EINTR; | 2835 | return -EINTR; |
2799 | 2836 | ||
2800 | ret = do_munmap(mm, start, len, &uf); | 2837 | ret = __do_munmap(mm, start, len, &uf, downgrade); |
2801 | up_write(&mm->mmap_sem); | 2838 | /* |
2839 | * Returning 1 indicates mmap_sem is downgraded. | ||
2840 | * But 1 is not a legal return value of vm_munmap() and munmap(), so reset | ||
2841 | * it to 0 before return. | ||
2842 | */ | ||
2843 | if (ret == 1) { | ||
2844 | up_read(&mm->mmap_sem); | ||
2845 | ret = 0; | ||
2846 | } else | ||
2847 | up_write(&mm->mmap_sem); | ||
2848 | |||
2802 | userfaultfd_unmap_complete(mm, &uf); | 2849 | userfaultfd_unmap_complete(mm, &uf); |
2803 | return ret; | 2850 | return ret; |
2804 | } | 2851 | } |
2852 | |||
2853 | int vm_munmap(unsigned long start, size_t len) | ||
2854 | { | ||
2855 | return __vm_munmap(start, len, false); | ||
2856 | } | ||
2805 | EXPORT_SYMBOL(vm_munmap); | 2857 | EXPORT_SYMBOL(vm_munmap); |
2806 | 2858 | ||
2807 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | 2859 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) |
2808 | { | 2860 | { |
2809 | profile_munmap(addr); | 2861 | profile_munmap(addr); |
2810 | return vm_munmap(addr, len); | 2862 | return __vm_munmap(addr, len, true); |
2811 | } | 2863 | } |
2812 | 2864 | ||
2813 | 2865 | ||
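
The munmap paths above share one new convention: __do_munmap() may downgrade mmap_sem from write to read when asked, and signals that by returning 1, so the caller must release whichever lock it is still holding and report 0 upward. The mock below is a userspace sketch of that calling convention only; the lock and the helpers are toy stand-ins, not the kernel API.

/*
 * Userspace mock (assumption: not the kernel locking API) showing the
 * calling convention only: a helper that may downgrade a write lock to
 * a read lock returns 1 when it did so, and the caller must release the
 * matching lock and report 0 to its own caller.
 */
#include <stdbool.h>
#include <stdio.h>

enum lock_state { UNLOCKED, READ_LOCKED, WRITE_LOCKED };

static enum lock_state mmap_lock = UNLOCKED;

static void down_write(void)      { mmap_lock = WRITE_LOCKED; }
static void up_write(void)        { mmap_lock = UNLOCKED; }
static void up_read(void)         { mmap_lock = UNLOCKED; }
static void downgrade_write(void) { mmap_lock = READ_LOCKED; }

/* Stand-in for __do_munmap(): <0 on error, 1 if downgraded, 0 otherwise. */
static int do_unmap_mock(bool downgrade)
{
	/* ... detach VMAs while still holding the lock for write ... */
	if (downgrade)
		downgrade_write();
	/* ... unmap the pages, which only needs the lock for read ... */
	return downgrade ? 1 : 0;
}

static int vm_munmap_mock(bool downgrade)
{
	int ret;

	down_write();
	ret = do_unmap_mock(downgrade);
	if (ret == 1) {
		up_read();	/* helper left us holding the read lock */
		ret = 0;	/* 1 is not a valid munmap() return value */
	} else {
		up_write();
	}
	return ret;
}

int main(void)
{
	printf("munmap returned %d, lock state %d\n",
	       vm_munmap_mock(true), mmap_lock);
	return 0;
}
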
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 82bb1a939c0e..5119ff846769 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, | |||
247 | } | 247 | } |
248 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); | 248 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); |
249 | 249 | ||
250 | /* | ||
251 | * Must be called while holding mm->mmap_sem for either read or write. | ||
252 | * The result is guaranteed to be valid until mm->mmap_sem is dropped. | ||
253 | */ | ||
254 | bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) | ||
255 | { | ||
256 | struct mmu_notifier *mn; | ||
257 | int id; | ||
258 | bool ret = false; | ||
259 | |||
260 | WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); | ||
261 | |||
262 | if (!mm_has_notifiers(mm)) | ||
263 | return ret; | ||
264 | |||
265 | id = srcu_read_lock(&srcu); | ||
266 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { | ||
267 | if (!mn->ops->invalidate_range && | ||
268 | !mn->ops->invalidate_range_start && | ||
269 | !mn->ops->invalidate_range_end) | ||
270 | continue; | ||
271 | |||
272 | if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { | ||
273 | ret = true; | ||
274 | break; | ||
275 | } | ||
276 | } | ||
277 | srcu_read_unlock(&srcu, id); | ||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 250 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
282 | struct mm_struct *mm, | 251 | struct mm_struct *mm, |
283 | int take_mmap_sem) | 252 | int take_mmap_sem) |
diff --git a/mm/mremap.c b/mm/mremap.c index a9617e72e6b7..7f9f9180e401 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
521 | unsigned long ret = -EINVAL; | 521 | unsigned long ret = -EINVAL; |
522 | unsigned long charged = 0; | 522 | unsigned long charged = 0; |
523 | bool locked = false; | 523 | bool locked = false; |
524 | bool downgraded = false; | ||
524 | struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; | 525 | struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; |
525 | LIST_HEAD(uf_unmap_early); | 526 | LIST_HEAD(uf_unmap_early); |
526 | LIST_HEAD(uf_unmap); | 527 | LIST_HEAD(uf_unmap); |
@@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
557 | /* | 558 | /* |
558 | * Always allow a shrinking remap: that just unmaps | 559 | * Always allow a shrinking remap: that just unmaps |
559 | * the unnecessary pages.. | 560 | * the unnecessary pages.. |
560 | * do_munmap does all the needed commit accounting | 561 | * __do_munmap does all the needed commit accounting, and |
562 | * downgrades mmap_sem to read if so directed. | ||
561 | */ | 563 | */ |
562 | if (old_len >= new_len) { | 564 | if (old_len >= new_len) { |
563 | ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); | 565 | int retval; |
564 | if (ret && old_len != new_len) | 566 | |
567 | retval = __do_munmap(mm, addr+new_len, old_len - new_len, | ||
568 | &uf_unmap, true); | ||
569 | if (retval < 0 && old_len != new_len) { | ||
570 | ret = retval; | ||
565 | goto out; | 571 | goto out; |
572 | /* Returning 1 indicates mmap_sem is downgraded to read. */ | ||
573 | } else if (retval == 1) | ||
574 | downgraded = true; | ||
566 | ret = addr; | 575 | ret = addr; |
567 | goto out; | 576 | goto out; |
568 | } | 577 | } |
@@ -627,7 +636,10 @@ out: | |||
627 | vm_unacct_memory(charged); | 636 | vm_unacct_memory(charged); |
628 | locked = 0; | 637 | locked = 0; |
629 | } | 638 | } |
630 | up_write(¤t->mm->mmap_sem); | 639 | if (downgraded) |
640 | up_read(¤t->mm->mmap_sem); | ||
641 | else | ||
642 | up_write(¤t->mm->mmap_sem); | ||
631 | if (locked && new_len > old_len) | 643 | if (locked && new_len > old_len) |
632 | mm_populate(new_addr + old_len, new_len - old_len); | 644 | mm_populate(new_addr + old_len, new_len - old_len); |
633 | userfaultfd_unmap_complete(mm, &uf_unmap_early); | 645 | userfaultfd_unmap_complete(mm, &uf_unmap_early); |
diff --git a/mm/nommu.c b/mm/nommu.c index e4aac33216ae..749276beb109 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
1709 | return ret; | 1709 | return ret; |
1710 | } | 1710 | } |
1711 | 1711 | ||
1712 | struct page *follow_page_mask(struct vm_area_struct *vma, | 1712 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
1713 | unsigned long address, unsigned int flags, | 1713 | unsigned int foll_flags) |
1714 | unsigned int *page_mask) | ||
1715 | { | 1714 | { |
1716 | *page_mask = 0; | ||
1717 | return NULL; | 1715 | return NULL; |
1718 | } | 1716 | } |
1719 | 1717 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..439a304a6c92 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2149,6 +2149,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); | |||
2149 | * not miss some pages (e.g., because some other process has cleared TOWRITE | 2149 | * not miss some pages (e.g., because some other process has cleared TOWRITE |
2150 | * tag we set). The rule we follow is that TOWRITE tag can be cleared only | 2150 | * tag we set). The rule we follow is that TOWRITE tag can be cleared only |
2151 | * by the process clearing the DIRTY tag (and submitting the page for IO). | 2151 | * by the process clearing the DIRTY tag (and submitting the page for IO). |
2152 | * | ||
2153 | * To avoid deadlocks between range_cyclic writeback and callers that hold | ||
2154 | * pages in PageWriteback to aggregate IO until write_cache_pages() returns, | ||
2155 | * we do not loop back to the start of the file. Doing so causes a page | ||
2156 | * lock/page writeback access order inversion - we should only ever lock | ||
2157 | * multiple pages in ascending page->index order, and looping back to the start | ||
2158 | * of the file violates that rule and causes deadlocks. | ||
2152 | */ | 2159 | */ |
2153 | int write_cache_pages(struct address_space *mapping, | 2160 | int write_cache_pages(struct address_space *mapping, |
2154 | struct writeback_control *wbc, writepage_t writepage, | 2161 | struct writeback_control *wbc, writepage_t writepage, |
@@ -2162,7 +2169,6 @@ int write_cache_pages(struct address_space *mapping, | |||
2162 | pgoff_t index; | 2169 | pgoff_t index; |
2163 | pgoff_t end; /* Inclusive */ | 2170 | pgoff_t end; /* Inclusive */ |
2164 | pgoff_t done_index; | 2171 | pgoff_t done_index; |
2165 | int cycled; | ||
2166 | int range_whole = 0; | 2172 | int range_whole = 0; |
2167 | int tag; | 2173 | int tag; |
2168 | 2174 | ||
@@ -2170,23 +2176,17 @@ int write_cache_pages(struct address_space *mapping, | |||
2170 | if (wbc->range_cyclic) { | 2176 | if (wbc->range_cyclic) { |
2171 | writeback_index = mapping->writeback_index; /* prev offset */ | 2177 | writeback_index = mapping->writeback_index; /* prev offset */ |
2172 | index = writeback_index; | 2178 | index = writeback_index; |
2173 | if (index == 0) | ||
2174 | cycled = 1; | ||
2175 | else | ||
2176 | cycled = 0; | ||
2177 | end = -1; | 2179 | end = -1; |
2178 | } else { | 2180 | } else { |
2179 | index = wbc->range_start >> PAGE_SHIFT; | 2181 | index = wbc->range_start >> PAGE_SHIFT; |
2180 | end = wbc->range_end >> PAGE_SHIFT; | 2182 | end = wbc->range_end >> PAGE_SHIFT; |
2181 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2183 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2182 | range_whole = 1; | 2184 | range_whole = 1; |
2183 | cycled = 1; /* ignore range_cyclic tests */ | ||
2184 | } | 2185 | } |
2185 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2186 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2186 | tag = PAGECACHE_TAG_TOWRITE; | 2187 | tag = PAGECACHE_TAG_TOWRITE; |
2187 | else | 2188 | else |
2188 | tag = PAGECACHE_TAG_DIRTY; | 2189 | tag = PAGECACHE_TAG_DIRTY; |
2189 | retry: | ||
2190 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2190 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2191 | tag_pages_for_writeback(mapping, index, end); | 2191 | tag_pages_for_writeback(mapping, index, end); |
2192 | done_index = index; | 2192 | done_index = index; |
@@ -2272,17 +2272,14 @@ continue_unlock: | |||
2272 | pagevec_release(&pvec); | 2272 | pagevec_release(&pvec); |
2273 | cond_resched(); | 2273 | cond_resched(); |
2274 | } | 2274 | } |
2275 | if (!cycled && !done) { | 2275 | |
2276 | /* | 2276 | /* |
2277 | * range_cyclic: | 2277 | * If we hit the last page and there is more work to be done: wrap |
2278 | * We hit the last page and there is more work to be done: wrap | 2278 | * the index back to the start of the file for the next |
2279 | * back to the start of the file | 2279 | * time we are called. |
2280 | */ | 2280 | */ |
2281 | cycled = 1; | 2281 | if (wbc->range_cyclic && !done) |
2282 | index = 0; | 2282 | done_index = 0; |
2283 | end = writeback_index - 1; | ||
2284 | goto retry; | ||
2285 | } | ||
2286 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 2283 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
2287 | mapping->writeback_index = done_index; | 2284 | mapping->writeback_index = done_index; |
2288 | 2285 | ||
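
With the retry loop gone, a range_cyclic pass walks from the saved writeback index to the end of the file exactly once; if it reaches the end with work left it merely records index 0 so the next invocation starts over. The model below is a rough userspace sketch of that behaviour (the page counts and budget handling are simplified assumptions), showing that pages are only ever visited in ascending order within a single call.

/*
 * Rough userspace model (an assumption, not the kernel code) of the new
 * range_cyclic behaviour: walk from the saved index to the end of the
 * file, and if the end is reached with budget left, record index 0 for
 * the *next* call instead of wrapping around inside this one.
 */
#include <stdio.h>

#define NR_PAGES 8

static unsigned long writeback_index = 5;	/* saved from a previous call */

static void write_cache_pages_model(int nr_to_write)
{
	unsigned long index = writeback_index;
	unsigned long done_index = index;
	int done = 0;

	while (index < NR_PAGES) {
		printf("writing page %lu\n", index);
		done_index = ++index;
		if (--nr_to_write <= 0) {
			done = 1;	/* budget exhausted mid-file */
			break;
		}
	}
	/*
	 * Reached the end of the file with budget left: remember index 0
	 * so the next call starts over.  No wrap-around within this call.
	 */
	if (!done)
		done_index = 0;
	writeback_index = done_index;
}

int main(void)
{
	write_cache_pages_model(2);	/* stops mid-file, will resume at page 7 */
	write_cache_pages_model(16);	/* reaches the end, wraps for next time */
	printf("next call starts at page %lu\n", writeback_index);
	return 0;
}
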
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ef1c17942f..863d46da6586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -66,6 +66,7 @@ | |||
66 | #include <linux/ftrace.h> | 66 | #include <linux/ftrace.h> |
67 | #include <linux/lockdep.h> | 67 | #include <linux/lockdep.h> |
68 | #include <linux/nmi.h> | 68 | #include <linux/nmi.h> |
69 | #include <linux/psi.h> | ||
69 | 70 | ||
70 | #include <asm/sections.h> | 71 | #include <asm/sections.h> |
71 | #include <asm/tlbflush.h> | 72 | #include <asm/tlbflush.h> |
@@ -306,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) | |||
306 | } | 307 | } |
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Returns false when the remaining initialisation should be deferred until | 310 | * Returns true when the remaining initialisation should be deferred until |
310 | * later in the boot cycle when it can be parallelised. | 311 | * later in the boot cycle when it can be parallelised. |
311 | */ | 312 | */ |
312 | static inline bool update_defer_init(pg_data_t *pgdat, | 313 | static bool __meminit |
313 | unsigned long pfn, unsigned long zone_end, | 314 | defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
314 | unsigned long *nr_initialised) | ||
315 | { | 315 | { |
316 | static unsigned long prev_end_pfn, nr_initialised; | ||
317 | |||
318 | /* | ||
319 | * prev_end_pfn is a static that contains the end of the previous zone. | ||
320 | * No need to protect because called very early in boot before smp_init. | ||
321 | */ | ||
322 | if (prev_end_pfn != end_pfn) { | ||
323 | prev_end_pfn = end_pfn; | ||
324 | nr_initialised = 0; | ||
325 | } | ||
326 | |||
316 | /* Always populate low zones for address-constrained allocations */ | 327 | /* Always populate low zones for address-constrained allocations */ |
317 | if (zone_end < pgdat_end_pfn(pgdat)) | 328 | if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
318 | return true; | ||
319 | (*nr_initialised)++; | ||
320 | if ((*nr_initialised > pgdat->static_init_pgcnt) && | ||
321 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | ||
322 | pgdat->first_deferred_pfn = pfn; | ||
323 | return false; | 329 | return false; |
330 | nr_initialised++; | ||
331 | if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && | ||
332 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | ||
333 | NODE_DATA(nid)->first_deferred_pfn = pfn; | ||
334 | return true; | ||
324 | } | 335 | } |
325 | 336 | return false; | |
326 | return true; | ||
327 | } | 337 | } |
328 | #else | 338 | #else |
329 | static inline bool early_page_uninitialised(unsigned long pfn) | 339 | static inline bool early_page_uninitialised(unsigned long pfn) |
@@ -331,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn) | |||
331 | return false; | 341 | return false; |
332 | } | 342 | } |
333 | 343 | ||
334 | static inline bool update_defer_init(pg_data_t *pgdat, | 344 | static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
335 | unsigned long pfn, unsigned long zone_end, | ||
336 | unsigned long *nr_initialised) | ||
337 | { | 345 | { |
338 | return true; | 346 | return false; |
339 | } | 347 | } |
340 | #endif | 348 | #endif |
341 | 349 | ||
@@ -1231,7 +1239,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) | |||
1231 | /* Avoid false-positive PageTail() */ | 1239 | /* Avoid false-positive PageTail() */ |
1232 | INIT_LIST_HEAD(&page->lru); | 1240 | INIT_LIST_HEAD(&page->lru); |
1233 | 1241 | ||
1234 | SetPageReserved(page); | 1242 | /* |
1243 | * no need for atomic set_bit because the struct | ||
1244 | * page is not visible yet so nobody should | ||
1245 | * access it yet. | ||
1246 | */ | ||
1247 | __SetPageReserved(page); | ||
1235 | } | 1248 | } |
1236 | } | 1249 | } |
1237 | } | 1250 | } |
@@ -2015,10 +2028,6 @@ static int move_freepages(struct zone *zone, | |||
2015 | pfn_valid(page_to_pfn(end_page)) && | 2028 | pfn_valid(page_to_pfn(end_page)) && |
2016 | page_zone(start_page) != page_zone(end_page)); | 2029 | page_zone(start_page) != page_zone(end_page)); |
2017 | #endif | 2030 | #endif |
2018 | |||
2019 | if (num_movable) | ||
2020 | *num_movable = 0; | ||
2021 | |||
2022 | for (page = start_page; page <= end_page;) { | 2031 | for (page = start_page; page <= end_page;) { |
2023 | if (!pfn_valid_within(page_to_pfn(page))) { | 2032 | if (!pfn_valid_within(page_to_pfn(page))) { |
2024 | page++; | 2033 | page++; |
@@ -2058,6 +2067,9 @@ int move_freepages_block(struct zone *zone, struct page *page, | |||
2058 | unsigned long start_pfn, end_pfn; | 2067 | unsigned long start_pfn, end_pfn; |
2059 | struct page *start_page, *end_page; | 2068 | struct page *start_page, *end_page; |
2060 | 2069 | ||
2070 | if (num_movable) | ||
2071 | *num_movable = 0; | ||
2072 | |||
2061 | start_pfn = page_to_pfn(page); | 2073 | start_pfn = page_to_pfn(page); |
2062 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | 2074 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
2063 | start_page = pfn_to_page(start_pfn); | 2075 | start_page = pfn_to_page(start_pfn); |
@@ -3366,26 +3378,12 @@ try_this_zone: | |||
3366 | return NULL; | 3378 | return NULL; |
3367 | } | 3379 | } |
3368 | 3380 | ||
3369 | /* | ||
3370 | * Large machines with many possible nodes should not always dump per-node | ||
3371 | * meminfo in irq context. | ||
3372 | */ | ||
3373 | static inline bool should_suppress_show_mem(void) | ||
3374 | { | ||
3375 | bool ret = false; | ||
3376 | |||
3377 | #if NODES_SHIFT > 8 | ||
3378 | ret = in_interrupt(); | ||
3379 | #endif | ||
3380 | return ret; | ||
3381 | } | ||
3382 | |||
3383 | static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) | 3381 | static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) |
3384 | { | 3382 | { |
3385 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 3383 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
3386 | static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); | 3384 | static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); |
3387 | 3385 | ||
3388 | if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) | 3386 | if (!__ratelimit(&show_mem_rs)) |
3389 | return; | 3387 | return; |
3390 | 3388 | ||
3391 | /* | 3389 | /* |
@@ -3549,15 +3547,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3549 | enum compact_priority prio, enum compact_result *compact_result) | 3547 | enum compact_priority prio, enum compact_result *compact_result) |
3550 | { | 3548 | { |
3551 | struct page *page; | 3549 | struct page *page; |
3550 | unsigned long pflags; | ||
3552 | unsigned int noreclaim_flag; | 3551 | unsigned int noreclaim_flag; |
3553 | 3552 | ||
3554 | if (!order) | 3553 | if (!order) |
3555 | return NULL; | 3554 | return NULL; |
3556 | 3555 | ||
3556 | psi_memstall_enter(&pflags); | ||
3557 | noreclaim_flag = memalloc_noreclaim_save(); | 3557 | noreclaim_flag = memalloc_noreclaim_save(); |
3558 | |||
3558 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, | 3559 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
3559 | prio); | 3560 | prio); |
3561 | |||
3560 | memalloc_noreclaim_restore(noreclaim_flag); | 3562 | memalloc_noreclaim_restore(noreclaim_flag); |
3563 | psi_memstall_leave(&pflags); | ||
3561 | 3564 | ||
3562 | if (*compact_result <= COMPACT_INACTIVE) | 3565 | if (*compact_result <= COMPACT_INACTIVE) |
3563 | return NULL; | 3566 | return NULL; |
@@ -3756,11 +3759,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, | |||
3756 | struct reclaim_state reclaim_state; | 3759 | struct reclaim_state reclaim_state; |
3757 | int progress; | 3760 | int progress; |
3758 | unsigned int noreclaim_flag; | 3761 | unsigned int noreclaim_flag; |
3762 | unsigned long pflags; | ||
3759 | 3763 | ||
3760 | cond_resched(); | 3764 | cond_resched(); |
3761 | 3765 | ||
3762 | /* We now go into synchronous reclaim */ | 3766 | /* We now go into synchronous reclaim */ |
3763 | cpuset_memory_pressure_bump(); | 3767 | cpuset_memory_pressure_bump(); |
3768 | psi_memstall_enter(&pflags); | ||
3764 | fs_reclaim_acquire(gfp_mask); | 3769 | fs_reclaim_acquire(gfp_mask); |
3765 | noreclaim_flag = memalloc_noreclaim_save(); | 3770 | noreclaim_flag = memalloc_noreclaim_save(); |
3766 | reclaim_state.reclaimed_slab = 0; | 3771 | reclaim_state.reclaimed_slab = 0; |
@@ -3772,6 +3777,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, | |||
3772 | current->reclaim_state = NULL; | 3777 | current->reclaim_state = NULL; |
3773 | memalloc_noreclaim_restore(noreclaim_flag); | 3778 | memalloc_noreclaim_restore(noreclaim_flag); |
3774 | fs_reclaim_release(gfp_mask); | 3779 | fs_reclaim_release(gfp_mask); |
3780 | psi_memstall_leave(&pflags); | ||
3775 | 3781 | ||
3776 | cond_resched(); | 3782 | cond_resched(); |
3777 | 3783 | ||
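
Direct reclaim (and, in the earlier hunk, compaction) is now bracketed by psi_memstall_enter()/psi_memstall_leave(), so the time spent there is charged to the pressure accounting. The sketch below models only the shape of that bracket, a timestamp on enter and an accumulation on leave; the real psi bookkeeping is considerably more involved.

/*
 * Shape-only sketch (assumption: not the kernel's psi implementation) of
 * the enter/leave bracket added around direct reclaim and compaction:
 * record when the stall started, and on leave fold the elapsed time into
 * a total that a pressure metric could aggregate.
 */
#include <stdio.h>
#include <time.h>

static double memstall_seconds;

static void psi_memstall_enter_sketch(struct timespec *pflags)
{
	clock_gettime(CLOCK_MONOTONIC, pflags);
}

static void psi_memstall_leave_sketch(struct timespec *pflags)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	memstall_seconds += (now.tv_sec - pflags->tv_sec) +
			    (now.tv_nsec - pflags->tv_nsec) / 1e9;
}

static void perform_reclaim_sketch(void)
{
	struct timespec pflags;

	psi_memstall_enter_sketch(&pflags);
	/* ... the expensive part: scan LRUs, write back, free pages ... */
	psi_memstall_leave_sketch(&pflags);
}

int main(void)
{
	perform_reclaim_sketch();
	printf("time accounted as memory stall: %.6fs\n", memstall_seconds);
	return 0;
}
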
@@ -3922,6 +3928,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3922 | { | 3928 | { |
3923 | struct zone *zone; | 3929 | struct zone *zone; |
3924 | struct zoneref *z; | 3930 | struct zoneref *z; |
3931 | bool ret = false; | ||
3925 | 3932 | ||
3926 | /* | 3933 | /* |
3927 | * Costly allocations might have made a progress but this doesn't mean | 3934 | * Costly allocations might have made a progress but this doesn't mean |
@@ -3985,25 +3992,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3985 | } | 3992 | } |
3986 | } | 3993 | } |
3987 | 3994 | ||
3988 | /* | 3995 | ret = true; |
3989 | * Memory allocation/reclaim might be called from a WQ | 3996 | goto out; |
3990 | * context and the current implementation of the WQ | ||
3991 | * concurrency control doesn't recognize that | ||
3992 | * a particular WQ is congested if the worker thread is | ||
3993 | * looping without ever sleeping. Therefore we have to | ||
3994 | * do a short sleep here rather than calling | ||
3995 | * cond_resched(). | ||
3996 | */ | ||
3997 | if (current->flags & PF_WQ_WORKER) | ||
3998 | schedule_timeout_uninterruptible(1); | ||
3999 | else | ||
4000 | cond_resched(); | ||
4001 | |||
4002 | return true; | ||
4003 | } | 3997 | } |
4004 | } | 3998 | } |
4005 | 3999 | ||
4006 | return false; | 4000 | out: |
4001 | /* | ||
4002 | * Memory allocation/reclaim might be called from a WQ context and the | ||
4003 | * current implementation of the WQ concurrency control doesn't | ||
4004 | * recognize that a particular WQ is congested if the worker thread is | ||
4005 | * looping without ever sleeping. Therefore we have to do a short sleep | ||
4006 | * here rather than calling cond_resched(). | ||
4007 | */ | ||
4008 | if (current->flags & PF_WQ_WORKER) | ||
4009 | schedule_timeout_uninterruptible(1); | ||
4010 | else | ||
4011 | cond_resched(); | ||
4012 | return ret; | ||
4007 | } | 4013 | } |
4008 | 4014 | ||
4009 | static inline bool | 4015 | static inline bool |
@@ -4701,6 +4707,7 @@ long si_mem_available(void) | |||
4701 | unsigned long pagecache; | 4707 | unsigned long pagecache; |
4702 | unsigned long wmark_low = 0; | 4708 | unsigned long wmark_low = 0; |
4703 | unsigned long pages[NR_LRU_LISTS]; | 4709 | unsigned long pages[NR_LRU_LISTS]; |
4710 | unsigned long reclaimable; | ||
4704 | struct zone *zone; | 4711 | struct zone *zone; |
4705 | int lru; | 4712 | int lru; |
4706 | 4713 | ||
@@ -4726,19 +4733,13 @@ long si_mem_available(void) | |||
4726 | available += pagecache; | 4733 | available += pagecache; |
4727 | 4734 | ||
4728 | /* | 4735 | /* |
4729 | * Part of the reclaimable slab consists of items that are in use, | 4736 | * Part of the reclaimable slab and other kernel memory consists of |
4730 | * and cannot be freed. Cap this estimate at the low watermark. | 4737 | * items that are in use, and cannot be freed. Cap this estimate at the |
4738 | * low watermark. | ||
4731 | */ | 4739 | */ |
4732 | available += global_node_page_state(NR_SLAB_RECLAIMABLE) - | 4740 | reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + |
4733 | min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, | 4741 | global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
4734 | wmark_low); | 4742 | available += reclaimable - min(reclaimable / 2, wmark_low); |
4735 | |||
4736 | /* | ||
4737 | * Part of the kernel memory, which can be released under memory | ||
4738 | * pressure. | ||
4739 | */ | ||
4740 | available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> | ||
4741 | PAGE_SHIFT; | ||
4742 | 4743 | ||
4743 | if (available < 0) | 4744 | if (available < 0) |
4744 | available = 0; | 4745 | available = 0; |
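
The two separate terms (reclaimable slab plus the old indirectly-reclaimable counter) are folded into a single reclaimable sum, and only the part above the low watermark is counted as available. A worked example with made-up numbers:

/*
 * Worked example of the new estimate (illustrative numbers, not taken
 * from a real system): available += reclaimable - min(reclaimable / 2,
 * wmark_low), where reclaimable now covers both reclaimable slab and
 * other reclaimable kernel memory.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long slab_reclaimable = 40000;	/* pages, made up */
	unsigned long misc_reclaimable = 10000;	/* pages, made up */
	unsigned long wmark_low = 20000;	/* pages, made up */
	unsigned long available = 100000;	/* running total so far */

	unsigned long reclaimable = slab_reclaimable + misc_reclaimable;

	available += reclaimable - min_ul(reclaimable / 2, wmark_low);
	/* 100000 + 50000 - min(25000, 20000) = 130000 pages */
	printf("estimated available: %lu pages\n", available);
	return 0;
}
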
@@ -5449,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat) | |||
5449 | #endif | 5450 | #endif |
5450 | } | 5451 | } |
5451 | 5452 | ||
5453 | /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ | ||
5454 | static bool __meminit | ||
5455 | overlap_memmap_init(unsigned long zone, unsigned long *pfn) | ||
5456 | { | ||
5457 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
5458 | static struct memblock_region *r; | ||
5459 | |||
5460 | if (mirrored_kernelcore && zone == ZONE_MOVABLE) { | ||
5461 | if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { | ||
5462 | for_each_memblock(memory, r) { | ||
5463 | if (*pfn < memblock_region_memory_end_pfn(r)) | ||
5464 | break; | ||
5465 | } | ||
5466 | } | ||
5467 | if (*pfn >= memblock_region_memory_base_pfn(r) && | ||
5468 | memblock_is_mirror(r)) { | ||
5469 | *pfn = memblock_region_memory_end_pfn(r); | ||
5470 | return true; | ||
5471 | } | ||
5472 | } | ||
5473 | #endif | ||
5474 | return false; | ||
5475 | } | ||
5476 | |||
5452 | /* | 5477 | /* |
5453 | * Initially all pages are reserved - free ones are freed | 5478 | * Initially all pages are reserved - free ones are freed |
5454 | * up by free_all_bootmem() once the early boot process is | 5479 | * up by free_all_bootmem() once the early boot process is |
@@ -5458,67 +5483,118 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
5458 | unsigned long start_pfn, enum memmap_context context, | 5483 | unsigned long start_pfn, enum memmap_context context, |
5459 | struct vmem_altmap *altmap) | 5484 | struct vmem_altmap *altmap) |
5460 | { | 5485 | { |
5461 | unsigned long end_pfn = start_pfn + size; | 5486 | unsigned long pfn, end_pfn = start_pfn + size; |
5462 | pg_data_t *pgdat = NODE_DATA(nid); | ||
5463 | unsigned long pfn; | ||
5464 | unsigned long nr_initialised = 0; | ||
5465 | struct page *page; | 5487 | struct page *page; |
5466 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
5467 | struct memblock_region *r = NULL, *tmp; | ||
5468 | #endif | ||
5469 | 5488 | ||
5470 | if (highest_memmap_pfn < end_pfn - 1) | 5489 | if (highest_memmap_pfn < end_pfn - 1) |
5471 | highest_memmap_pfn = end_pfn - 1; | 5490 | highest_memmap_pfn = end_pfn - 1; |
5472 | 5491 | ||
5492 | #ifdef CONFIG_ZONE_DEVICE | ||
5473 | /* | 5493 | /* |
5474 | * Honor reservation requested by the driver for this ZONE_DEVICE | 5494 | * Honor reservation requested by the driver for this ZONE_DEVICE |
5475 | * memory | 5495 | * memory. We limit the total number of pages to initialize to just |
5496 | * those that might contain the memory mapping. We will defer the | ||
5497 | * ZONE_DEVICE page initialization until after we have released | ||
5498 | * the hotplug lock. | ||
5476 | */ | 5499 | */ |
5477 | if (altmap && start_pfn == altmap->base_pfn) | 5500 | if (zone == ZONE_DEVICE) { |
5478 | start_pfn += altmap->reserve; | 5501 | if (!altmap) |
5502 | return; | ||
5503 | |||
5504 | if (start_pfn == altmap->base_pfn) | ||
5505 | start_pfn += altmap->reserve; | ||
5506 | end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); | ||
5507 | } | ||
5508 | #endif | ||
5479 | 5509 | ||
5480 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 5510 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
5481 | /* | 5511 | /* |
5482 | * There can be holes in boot-time mem_map[]s handed to this | 5512 | * There can be holes in boot-time mem_map[]s handed to this |
5483 | * function. They do not exist on hotplugged memory. | 5513 | * function. They do not exist on hotplugged memory. |
5484 | */ | 5514 | */ |
5485 | if (context != MEMMAP_EARLY) | 5515 | if (context == MEMMAP_EARLY) { |
5486 | goto not_early; | 5516 | if (!early_pfn_valid(pfn)) |
5487 | |||
5488 | if (!early_pfn_valid(pfn)) | ||
5489 | continue; | ||
5490 | if (!early_pfn_in_nid(pfn, nid)) | ||
5491 | continue; | ||
5492 | if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) | ||
5493 | break; | ||
5494 | |||
5495 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
5496 | /* | ||
5497 | * Check given memblock attribute by firmware which can affect | ||
5498 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is | ||
5499 | * mirrored, it's an overlapped memmap init. skip it. | ||
5500 | */ | ||
5501 | if (mirrored_kernelcore && zone == ZONE_MOVABLE) { | ||
5502 | if (!r || pfn >= memblock_region_memory_end_pfn(r)) { | ||
5503 | for_each_memblock(memory, tmp) | ||
5504 | if (pfn < memblock_region_memory_end_pfn(tmp)) | ||
5505 | break; | ||
5506 | r = tmp; | ||
5507 | } | ||
5508 | if (pfn >= memblock_region_memory_base_pfn(r) && | ||
5509 | memblock_is_mirror(r)) { | ||
5510 | /* already initialized as NORMAL */ | ||
5511 | pfn = memblock_region_memory_end_pfn(r); | ||
5512 | continue; | 5517 | continue; |
5513 | } | 5518 | if (!early_pfn_in_nid(pfn, nid)) |
5519 | continue; | ||
5520 | if (overlap_memmap_init(zone, &pfn)) | ||
5521 | continue; | ||
5522 | if (defer_init(nid, pfn, end_pfn)) | ||
5523 | break; | ||
5514 | } | 5524 | } |
5515 | #endif | ||
5516 | 5525 | ||
5517 | not_early: | ||
5518 | page = pfn_to_page(pfn); | 5526 | page = pfn_to_page(pfn); |
5519 | __init_single_page(page, pfn, zone, nid); | 5527 | __init_single_page(page, pfn, zone, nid); |
5520 | if (context == MEMMAP_HOTPLUG) | 5528 | if (context == MEMMAP_HOTPLUG) |
5521 | SetPageReserved(page); | 5529 | __SetPageReserved(page); |
5530 | |||
5531 | /* | ||
5532 | * Mark the block movable so that blocks are reserved for | ||
5533 | * movable at startup. This will force kernel allocations | ||
5534 | * to reserve their blocks rather than leaking throughout | ||
5535 | * the address space during boot when many long-lived | ||
5536 | * kernel allocations are made. | ||
5537 | * | ||
5538 | * The bitmap is created for the zone's valid pfn range, but the | ||
5539 | * memmap can be created for invalid pages (for alignment), so | ||
5540 | * check here to avoid calling set_pageblock_migratetype() against | ||
5541 | * a pfn out of zone. | ||
5542 | */ | ||
5543 | if (!(pfn & (pageblock_nr_pages - 1))) { | ||
5544 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
5545 | cond_resched(); | ||
5546 | } | ||
5547 | } | ||
5548 | } | ||
5549 | |||
5550 | #ifdef CONFIG_ZONE_DEVICE | ||
5551 | void __ref memmap_init_zone_device(struct zone *zone, | ||
5552 | unsigned long start_pfn, | ||
5553 | unsigned long size, | ||
5554 | struct dev_pagemap *pgmap) | ||
5555 | { | ||
5556 | unsigned long pfn, end_pfn = start_pfn + size; | ||
5557 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
5558 | unsigned long zone_idx = zone_idx(zone); | ||
5559 | unsigned long start = jiffies; | ||
5560 | int nid = pgdat->node_id; | ||
5561 | |||
5562 | if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) | ||
5563 | return; | ||
5564 | |||
5565 | /* | ||
5566 | * The call to memmap_init_zone should have already taken care | ||
5567 | * of the pages reserved for the memmap, so we can just jump to | ||
5568 | * the end of that region and start processing the device pages. | ||
5569 | */ | ||
5570 | if (pgmap->altmap_valid) { | ||
5571 | struct vmem_altmap *altmap = &pgmap->altmap; | ||
5572 | |||
5573 | start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); | ||
5574 | size = end_pfn - start_pfn; | ||
5575 | } | ||
5576 | |||
5577 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
5578 | struct page *page = pfn_to_page(pfn); | ||
5579 | |||
5580 | __init_single_page(page, pfn, zone_idx, nid); | ||
5581 | |||
5582 | /* | ||
5583 | * Mark page reserved as it will need to wait for onlining | ||
5584 | * phase for it to be fully associated with a zone. | ||
5585 | * | ||
5586 | * We can use the non-atomic __set_bit operation for setting | ||
5587 | * the flag as we are still initializing the pages. | ||
5588 | */ | ||
5589 | __SetPageReserved(page); | ||
5590 | |||
5591 | /* | ||
5592 | * ZONE_DEVICE pages union ->lru with a ->pgmap back | ||
5593 | * pointer and hmm_data. It is a bug if a ZONE_DEVICE | ||
5594 | * page is ever freed or placed on a driver-private list. | ||
5595 | */ | ||
5596 | page->pgmap = pgmap; | ||
5597 | page->hmm_data = 0; | ||
5522 | 5598 | ||
5523 | /* | 5599 | /* |
5524 | * Mark the block movable so that blocks are reserved for | 5600 | * Mark the block movable so that blocks are reserved for |
@@ -5540,8 +5616,12 @@ not_early: | |||
5540 | cond_resched(); | 5616 | cond_resched(); |
5541 | } | 5617 | } |
5542 | } | 5618 | } |
5619 | |||
5620 | pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), | ||
5621 | size, jiffies_to_msecs(jiffies - start)); | ||
5543 | } | 5622 | } |
5544 | 5623 | ||
5624 | #endif | ||
5545 | static void __meminit zone_init_free_lists(struct zone *zone) | 5625 | static void __meminit zone_init_free_lists(struct zone *zone) |
5546 | { | 5626 | { |
5547 | unsigned int order, t; | 5627 | unsigned int order, t; |
@@ -5551,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
5551 | } | 5631 | } |
5552 | } | 5632 | } |
5553 | 5633 | ||
5554 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 5634 | void __meminit __weak memmap_init(unsigned long size, int nid, |
5555 | #define memmap_init(size, nid, zone, start_pfn) \ | 5635 | unsigned long zone, unsigned long start_pfn) |
5556 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) | 5636 | { |
5557 | #endif | 5637 | memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); |
5638 | } | ||
5558 | 5639 | ||
5559 | static int zone_batchsize(struct zone *zone) | 5640 | static int zone_batchsize(struct zone *zone) |
5560 | { | 5641 | { |
@@ -6428,45 +6509,65 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, | |||
6428 | } | 6509 | } |
6429 | 6510 | ||
6430 | #if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) | 6511 | #if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) |
6512 | |||
6513 | /* | ||
6514 | * Zero all valid struct pages in range [spfn, epfn), return number of struct | ||
6515 | * pages zeroed | ||
6516 | */ | ||
6517 | static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) | ||
6518 | { | ||
6519 | unsigned long pfn; | ||
6520 | u64 pgcnt = 0; | ||
6521 | |||
6522 | for (pfn = spfn; pfn < epfn; pfn++) { | ||
6523 | if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { | ||
6524 | pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) | ||
6525 | + pageblock_nr_pages - 1; | ||
6526 | continue; | ||
6527 | } | ||
6528 | mm_zero_struct_page(pfn_to_page(pfn)); | ||
6529 | pgcnt++; | ||
6530 | } | ||
6531 | |||
6532 | return pgcnt; | ||
6533 | } | ||
6534 | |||
6431 | /* | 6535 | /* |
6432 | * Only struct pages that are backed by physical memory are zeroed and | 6536 | * Only struct pages that are backed by physical memory are zeroed and |
6433 | * initialized by going through __init_single_page(). But, there are some | 6537 | * initialized by going through __init_single_page(). But, there are some |
6434 | * struct pages which are reserved in memblock allocator and their fields | 6538 | * struct pages which are reserved in memblock allocator and their fields |
6435 | * may be accessed (for example page_to_pfn() on some configuration accesses | 6539 | * may be accessed (for example page_to_pfn() on some configuration accesses |
6436 | * flags). We must explicitly zero those struct pages. | 6540 | * flags). We must explicitly zero those struct pages. |
6541 | * | ||
6542 | * This function also addresses a similar issue where struct pages are left | ||
6543 | * uninitialized because the physical address range is not covered by | ||
6544 | * memblock.memory or memblock.reserved. That could happen when memblock | ||
6545 | * layout is manually configured via memmap=. | ||
6437 | */ | 6546 | */ |
6438 | void __init zero_resv_unavail(void) | 6547 | void __init zero_resv_unavail(void) |
6439 | { | 6548 | { |
6440 | phys_addr_t start, end; | 6549 | phys_addr_t start, end; |
6441 | unsigned long pfn; | ||
6442 | u64 i, pgcnt; | 6550 | u64 i, pgcnt; |
6551 | phys_addr_t next = 0; | ||
6443 | 6552 | ||
6444 | /* | 6553 | /* |
6445 | * Loop through ranges that are reserved, but do not have reported | 6554 | * Loop through unavailable ranges not covered by memblock.memory. |
6446 | * physical memory backing. | ||
6447 | */ | 6555 | */ |
6448 | pgcnt = 0; | 6556 | pgcnt = 0; |
6449 | for_each_resv_unavail_range(i, &start, &end) { | 6557 | for_each_mem_range(i, &memblock.memory, NULL, |
6450 | for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { | 6558 | NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { |
6451 | if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { | 6559 | if (next < start) |
6452 | pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) | 6560 | pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); |
6453 | + pageblock_nr_pages - 1; | 6561 | next = end; |
6454 | continue; | ||
6455 | } | ||
6456 | mm_zero_struct_page(pfn_to_page(pfn)); | ||
6457 | pgcnt++; | ||
6458 | } | ||
6459 | } | 6562 | } |
6563 | pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); | ||
6460 | 6564 | ||
6461 | /* | 6565 | /* |
6462 | * Struct pages that do not have backing memory. This could be because | 6566 | * Struct pages that do not have backing memory. This could be because |
6463 | * firmware is using some of this memory, or for some other reasons. | 6567 | * firmware is using some of this memory, or for some other reasons. |
6464 | * Once memblock is changed so such behaviour is not allowed: i.e. | ||
6465 | * list of "reserved" memory must be a subset of list of "memory", then | ||
6466 | * this code can be removed. | ||
6467 | */ | 6568 | */ |
6468 | if (pgcnt) | 6569 | if (pgcnt) |
6469 | pr_info("Reserved but unavailable: %lld pages", pgcnt); | 6570 | pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); |
6470 | } | 6571 | } |
6471 | #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ | 6572 | #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ |
6472 | 6573 | ||
@@ -6803,15 +6904,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
6803 | { | 6904 | { |
6804 | enum zone_type zone_type; | 6905 | enum zone_type zone_type; |
6805 | 6906 | ||
6806 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
6807 | return; | ||
6808 | |||
6809 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | 6907 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { |
6810 | struct zone *zone = &pgdat->node_zones[zone_type]; | 6908 | struct zone *zone = &pgdat->node_zones[zone_type]; |
6811 | if (populated_zone(zone)) { | 6909 | if (populated_zone(zone)) { |
6812 | node_set_state(nid, N_HIGH_MEMORY); | 6910 | if (IS_ENABLED(CONFIG_HIGHMEM)) |
6813 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | 6911 | node_set_state(nid, N_HIGH_MEMORY); |
6814 | zone_type <= ZONE_NORMAL) | 6912 | if (zone_type <= ZONE_NORMAL) |
6815 | node_set_state(nid, N_NORMAL_MEMORY); | 6913 | node_set_state(nid, N_NORMAL_MEMORY); |
6816 | break; | 6914 | break; |
6817 | } | 6915 | } |
diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..a451ffa9491c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
283 | struct swap_info_struct *sis = page_swap_info(page); | 283 | struct swap_info_struct *sis = page_swap_info(page); |
284 | 284 | ||
285 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | 285 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
286 | if (sis->flags & SWP_FILE) { | 286 | if (sis->flags & SWP_FS) { |
287 | struct kiocb kiocb; | 287 | struct kiocb kiocb; |
288 | struct file *swap_file = sis->swap_file; | 288 | struct file *swap_file = sis->swap_file; |
289 | struct address_space *mapping = swap_file->f_mapping; | 289 | struct address_space *mapping = swap_file->f_mapping; |
@@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) | |||
365 | goto out; | 365 | goto out; |
366 | } | 366 | } |
367 | 367 | ||
368 | if (sis->flags & SWP_FILE) { | 368 | if (sis->flags & SWP_FS) { |
369 | struct file *swap_file = sis->swap_file; | 369 | struct file *swap_file = sis->swap_file; |
370 | struct address_space *mapping = swap_file->f_mapping; | 370 | struct address_space *mapping = swap_file->f_mapping; |
371 | 371 | ||
@@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page) | |||
423 | { | 423 | { |
424 | struct swap_info_struct *sis = page_swap_info(page); | 424 | struct swap_info_struct *sis = page_swap_info(page); |
425 | 425 | ||
426 | if (sis->flags & SWP_FILE) { | 426 | if (sis->flags & SWP_FS) { |
427 | struct address_space *mapping = sis->swap_file->f_mapping; | 427 | struct address_space *mapping = sis->swap_file->f_mapping; |
428 | 428 | ||
429 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | 429 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void) | |||
1288 | * Initialize the caches that provide memory for the kmem_cache_node | 1288 | * Initialize the caches that provide memory for the kmem_cache_node |
1289 | * structures first. Without this, further allocations will bug. | 1289 | * structures first. Without this, further allocations will bug. |
1290 | */ | 1290 | */ |
1291 | kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( | 1291 | kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( |
1292 | kmalloc_info[INDEX_NODE].name, | 1292 | kmalloc_info[INDEX_NODE].name, |
1293 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, | 1293 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, |
1294 | 0, kmalloc_size(INDEX_NODE)); | 1294 | 0, kmalloc_size(INDEX_NODE)); |
@@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void) | |||
1304 | for_each_online_node(nid) { | 1304 | for_each_online_node(nid) { |
1305 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); | 1305 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); |
1306 | 1306 | ||
1307 | init_list(kmalloc_caches[INDEX_NODE], | 1307 | init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], |
1308 | &init_kmem_cache_node[SIZE_NODE + nid], nid); | 1308 | &init_kmem_cache_node[SIZE_NODE + nid], nid); |
1309 | } | 1309 | } |
1310 | } | 1310 | } |
@@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | |||
3675 | struct kmem_cache *cachep; | 3675 | struct kmem_cache *cachep; |
3676 | void *ret; | 3676 | void *ret; |
3677 | 3677 | ||
3678 | if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) | ||
3679 | return NULL; | ||
3678 | cachep = kmalloc_slab(size, flags); | 3680 | cachep = kmalloc_slab(size, flags); |
3679 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3681 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3680 | return cachep; | 3682 | return cachep; |
@@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3710 | struct kmem_cache *cachep; | 3712 | struct kmem_cache *cachep; |
3711 | void *ret; | 3713 | void *ret; |
3712 | 3714 | ||
3715 | if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) | ||
3716 | return NULL; | ||
3713 | cachep = kmalloc_slab(size, flags); | 3717 | cachep = kmalloc_slab(size, flags); |
3714 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3718 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3715 | return cachep; | 3719 | return cachep; |
diff --git a/mm/slab_common.c b/mm/slab_common.c index fea3376f9816..7eb8dc136c1c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
973 | return s; | 973 | return s; |
974 | } | 974 | } |
975 | 975 | ||
976 | struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; | 976 | struct kmem_cache * |
977 | kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; | ||
977 | EXPORT_SYMBOL(kmalloc_caches); | 978 | EXPORT_SYMBOL(kmalloc_caches); |
978 | 979 | ||
979 | #ifdef CONFIG_ZONE_DMA | ||
980 | struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; | ||
981 | EXPORT_SYMBOL(kmalloc_dma_caches); | ||
982 | #endif | ||
983 | |||
984 | /* | 980 | /* |
985 | * Conversion table for small slabs sizes / 8 to the index in the | 981 | * Conversion table for small slabs sizes / 8 to the index in the |
986 | * kmalloc array. This is necessary for slabs < 192 since we have non power | 982 | * kmalloc array. This is necessary for slabs < 192 since we have non power |
@@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) | |||
1027 | { | 1023 | { |
1028 | unsigned int index; | 1024 | unsigned int index; |
1029 | 1025 | ||
1030 | if (unlikely(size > KMALLOC_MAX_SIZE)) { | ||
1031 | WARN_ON_ONCE(!(flags & __GFP_NOWARN)); | ||
1032 | return NULL; | ||
1033 | } | ||
1034 | |||
1035 | if (size <= 192) { | 1026 | if (size <= 192) { |
1036 | if (!size) | 1027 | if (!size) |
1037 | return ZERO_SIZE_PTR; | 1028 | return ZERO_SIZE_PTR; |
1038 | 1029 | ||
1039 | index = size_index[size_index_elem(size)]; | 1030 | index = size_index[size_index_elem(size)]; |
1040 | } else | 1031 | } else { |
1032 | if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { | ||
1033 | WARN_ON(1); | ||
1034 | return NULL; | ||
1035 | } | ||
1041 | index = fls(size - 1); | 1036 | index = fls(size - 1); |
1037 | } | ||
1042 | 1038 | ||
1043 | #ifdef CONFIG_ZONE_DMA | 1039 | return kmalloc_caches[kmalloc_type(flags)][index]; |
1044 | if (unlikely((flags & GFP_DMA))) | ||
1045 | return kmalloc_dma_caches[index]; | ||
1046 | |||
1047 | #endif | ||
1048 | return kmalloc_caches[index]; | ||
1049 | } | 1040 | } |
1050 | 1041 | ||
1051 | /* | 1042 | /* |
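For sizes above 192 bytes the reworked kmalloc_slab() above derives the array index straight from fls(size - 1), and only warns when the request exceeds KMALLOC_MAX_CACHE_SIZE. A standalone sketch of that branch, with a portable fls() standing in for the kernel helper and the 192-byte size_index[] path omitted:

#include <stdio.h>

/* Portable stand-in for the kernel's fls(): position of the highest set bit, 1-based. */
static int fls_portable(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	/* Requests above 192 bytes land in the next power-of-two cache. */
	unsigned int sizes[] = { 193, 256, 300, 1024, 1025 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		int index = fls_portable(sizes[i] - 1);

		printf("size %4u -> index %2d -> kmalloc-%u\n",
		       sizes[i], index, 1u << index);
	}
	return 0;
}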
@@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { | |||
1059 | {"kmalloc-16", 16}, {"kmalloc-32", 32}, | 1050 | {"kmalloc-16", 16}, {"kmalloc-32", 32}, |
1060 | {"kmalloc-64", 64}, {"kmalloc-128", 128}, | 1051 | {"kmalloc-64", 64}, {"kmalloc-128", 128}, |
1061 | {"kmalloc-256", 256}, {"kmalloc-512", 512}, | 1052 | {"kmalloc-256", 256}, {"kmalloc-512", 512}, |
1062 | {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, | 1053 | {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, |
1063 | {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, | 1054 | {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, |
1064 | {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, | 1055 | {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, |
1065 | {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, | 1056 | {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, |
1066 | {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, | 1057 | {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, |
1067 | {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, | 1058 | {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, |
1068 | {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, | 1059 | {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, |
1069 | {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, | 1060 | {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, |
1070 | {"kmalloc-67108864", 67108864} | 1061 | {"kmalloc-64M", 67108864} |
1071 | }; | 1062 | }; |
1072 | 1063 | ||
1073 | /* | 1064 | /* |
@@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void) | |||
1117 | } | 1108 | } |
1118 | } | 1109 | } |
1119 | 1110 | ||
1120 | static void __init new_kmalloc_cache(int idx, slab_flags_t flags) | 1111 | static const char * |
1112 | kmalloc_cache_name(const char *prefix, unsigned int size) | ||
1113 | { | ||
1114 | |||
1115 | static const char units[3] = "\0kM"; | ||
1116 | int idx = 0; | ||
1117 | |||
1118 | while (size >= 1024 && (size % 1024 == 0)) { | ||
1119 | size /= 1024; | ||
1120 | idx++; | ||
1121 | } | ||
1122 | |||
1123 | return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); | ||
1124 | } | ||
1125 | |||
1126 | static void __init | ||
1127 | new_kmalloc_cache(int idx, int type, slab_flags_t flags) | ||
1121 | { | 1128 | { |
1122 | kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, | 1129 | const char *name; |
1130 | |||
1131 | if (type == KMALLOC_RECLAIM) { | ||
1132 | flags |= SLAB_RECLAIM_ACCOUNT; | ||
1133 | name = kmalloc_cache_name("kmalloc-rcl", | ||
1134 | kmalloc_info[idx].size); | ||
1135 | BUG_ON(!name); | ||
1136 | } else { | ||
1137 | name = kmalloc_info[idx].name; | ||
1138 | } | ||
1139 | |||
1140 | kmalloc_caches[type][idx] = create_kmalloc_cache(name, | ||
1123 | kmalloc_info[idx].size, flags, 0, | 1141 | kmalloc_info[idx].size, flags, 0, |
1124 | kmalloc_info[idx].size); | 1142 | kmalloc_info[idx].size); |
1125 | } | 1143 | } |
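kmalloc_cache_name() above builds the shortened names introduced in the kmalloc_info[] table ("kmalloc-4k", "kmalloc-rcl-128", ...) by dividing the size by 1024 while it remains an exact multiple and appending the matching suffix from "\0kM". A user-space sketch with snprintf() standing in for kasprintf():

#include <stdio.h>

static void kmalloc_cache_name(char *buf, size_t len,
			       const char *prefix, unsigned int size)
{
	static const char units[3] = "\0kM";
	int idx = 0;

	/* 4096 -> "4k", 1048576 -> "1M"; 192 keeps its byte count. */
	while (size >= 1024 && (size % 1024 == 0)) {
		size /= 1024;
		idx++;
	}
	/* For byte-sized caches units[0] is '\0', which simply ends the string. */
	snprintf(buf, len, "%s-%u%c", prefix, size, units[idx]);
}

int main(void)
{
	char name[64];

	kmalloc_cache_name(name, sizeof(name), "kmalloc-rcl", 4096);
	printf("%s\n", name);		/* kmalloc-rcl-4k */
	kmalloc_cache_name(name, sizeof(name), "dma-kmalloc", 192);
	printf("%s\n", name);		/* dma-kmalloc-192 */
	return 0;
}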
@@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) | |||
1131 | */ | 1149 | */ |
1132 | void __init create_kmalloc_caches(slab_flags_t flags) | 1150 | void __init create_kmalloc_caches(slab_flags_t flags) |
1133 | { | 1151 | { |
1134 | int i; | 1152 | int i, type; |
1135 | 1153 | ||
1136 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 1154 | for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { |
1137 | if (!kmalloc_caches[i]) | 1155 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { |
1138 | new_kmalloc_cache(i, flags); | 1156 | if (!kmalloc_caches[type][i]) |
1157 | new_kmalloc_cache(i, type, flags); | ||
1139 | 1158 | ||
1140 | /* | 1159 | /* |
1141 | * Caches that are not of the two-to-the-power-of size. | 1160 | * Caches that are not of the two-to-the-power-of size. |
1142 | * These have to be created immediately after the | 1161 | * These have to be created immediately after the |
1143 | * earlier power of two caches | 1162 | * earlier power of two caches |
1144 | */ | 1163 | */ |
1145 | if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) | 1164 | if (KMALLOC_MIN_SIZE <= 32 && i == 6 && |
1146 | new_kmalloc_cache(1, flags); | 1165 | !kmalloc_caches[type][1]) |
1147 | if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) | 1166 | new_kmalloc_cache(1, type, flags); |
1148 | new_kmalloc_cache(2, flags); | 1167 | if (KMALLOC_MIN_SIZE <= 64 && i == 7 && |
1168 | !kmalloc_caches[type][2]) | ||
1169 | new_kmalloc_cache(2, type, flags); | ||
1170 | } | ||
1149 | } | 1171 | } |
1150 | 1172 | ||
1151 | /* Kmalloc array is now usable */ | 1173 | /* Kmalloc array is now usable */ |
@@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags) | |||
1153 | 1175 | ||
1154 | #ifdef CONFIG_ZONE_DMA | 1176 | #ifdef CONFIG_ZONE_DMA |
1155 | for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { | 1177 | for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { |
1156 | struct kmem_cache *s = kmalloc_caches[i]; | 1178 | struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; |
1157 | 1179 | ||
1158 | if (s) { | 1180 | if (s) { |
1159 | unsigned int size = kmalloc_size(i); | 1181 | unsigned int size = kmalloc_size(i); |
1160 | char *n = kasprintf(GFP_NOWAIT, | 1182 | const char *n = kmalloc_cache_name("dma-kmalloc", size); |
1161 | "dma-kmalloc-%u", size); | ||
1162 | 1183 | ||
1163 | BUG_ON(!n); | 1184 | BUG_ON(!n); |
1164 | kmalloc_dma_caches[i] = create_kmalloc_cache(n, | 1185 | kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( |
1165 | size, SLAB_CACHE_DMA | flags, 0, 0); | 1186 | n, size, SLAB_CACHE_DMA | flags, 0, 0); |
1166 | } | 1187 | } |
1167 | } | 1188 | } |
1168 | #endif | 1189 | #endif |
@@ -1276,16 +1276,54 @@ out: | |||
1276 | 1276 | ||
1277 | __setup("slub_debug", setup_slub_debug); | 1277 | __setup("slub_debug", setup_slub_debug); |
1278 | 1278 | ||
1279 | /* | ||
1280 | * kmem_cache_flags - apply debugging options to the cache | ||
1281 | * @object_size: the size of an object without meta data | ||
1282 | * @flags: flags to set | ||
1283 | * @name: name of the cache | ||
1284 | * @ctor: constructor function | ||
1285 | * | ||
1286 | * Debug option(s) are applied to @flags. In addition to the debug | ||
1287 | * option(s), if a slab name (or multiple) is specified i.e. | ||
1288 | * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... | ||
1289 | * then only the select slabs will receive the debug option(s). | ||
1290 | */ | ||
1279 | slab_flags_t kmem_cache_flags(unsigned int object_size, | 1291 | slab_flags_t kmem_cache_flags(unsigned int object_size, |
1280 | slab_flags_t flags, const char *name, | 1292 | slab_flags_t flags, const char *name, |
1281 | void (*ctor)(void *)) | 1293 | void (*ctor)(void *)) |
1282 | { | 1294 | { |
1283 | /* | 1295 | char *iter; |
1284 | * Enable debugging if selected on the kernel commandline. | 1296 | size_t len; |
1285 | */ | 1297 | |
1286 | if (slub_debug && (!slub_debug_slabs || (name && | 1298 | /* If slub_debug = 0, it folds into the if conditional. */ |
1287 | !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) | 1299 | if (!slub_debug_slabs) |
1288 | flags |= slub_debug; | 1300 | return flags | slub_debug; |
1301 | |||
1302 | len = strlen(name); | ||
1303 | iter = slub_debug_slabs; | ||
1304 | while (*iter) { | ||
1305 | char *end, *glob; | ||
1306 | size_t cmplen; | ||
1307 | |||
1308 | end = strchr(iter, ','); | ||
1309 | if (!end) | ||
1310 | end = iter + strlen(iter); | ||
1311 | |||
1312 | glob = strnchr(iter, end - iter, '*'); | ||
1313 | if (glob) | ||
1314 | cmplen = glob - iter; | ||
1315 | else | ||
1316 | cmplen = max_t(size_t, len, (end - iter)); | ||
1317 | |||
1318 | if (!strncmp(name, iter, cmplen)) { | ||
1319 | flags |= slub_debug; | ||
1320 | break; | ||
1321 | } | ||
1322 | |||
1323 | if (!*end) | ||
1324 | break; | ||
1325 | iter = end + 1; | ||
1326 | } | ||
1289 | 1327 | ||
1290 | return flags; | 1328 | return flags; |
1291 | } | 1329 | } |
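The rewritten kmem_cache_flags() above walks the comma-separated slab list given on the slub_debug= command line and treats a '*' in an entry as "match the prefix before it", so something like slub_debug=P,kmalloc-* now covers every kmalloc cache. A self-contained sketch of just that matching loop; memchr() replaces the kernel-only strnchr(), and the surrounding flag handling is dropped:

#include <stdio.h>
#include <string.h>

/*
 * Return 1 if @name matches any comma-separated entry in @list.
 * An entry containing '*' matches on the prefix before the '*';
 * otherwise the whole name must match, as in the patch.
 */
static int slab_name_matches(const char *name, const char *list)
{
	size_t len = strlen(name);
	const char *iter = list;

	while (*iter) {
		const char *end, *glob;
		size_t cmplen;

		end = strchr(iter, ',');
		if (!end)
			end = iter + strlen(iter);

		glob = memchr(iter, '*', end - iter);
		if (glob)
			cmplen = glob - iter;	/* compare only the prefix */
		else
			cmplen = len > (size_t)(end - iter) ?
				 len : (size_t)(end - iter);

		if (!strncmp(name, iter, cmplen))
			return 1;

		if (!*end)
			break;
		iter = end + 1;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", slab_name_matches("kmalloc-512", "dentry,kmalloc-*"));	/* 1 */
	printf("%d\n", slab_name_matches("kmalloc-512", "dentry,kmalloc-51"));	/* 0 */
	return 0;
}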
@@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3621 | #ifdef CONFIG_SLUB_DEBUG | 3659 | #ifdef CONFIG_SLUB_DEBUG |
3622 | void *addr = page_address(page); | 3660 | void *addr = page_address(page); |
3623 | void *p; | 3661 | void *p; |
3624 | unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), | 3662 | unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); |
3625 | sizeof(long), | ||
3626 | GFP_ATOMIC); | ||
3627 | if (!map) | 3663 | if (!map) |
3628 | return; | 3664 | return; |
3629 | slab_err(s, page, text, s->name); | 3665 | slab_err(s, page, text, s->name); |
@@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3638 | } | 3674 | } |
3639 | } | 3675 | } |
3640 | slab_unlock(page); | 3676 | slab_unlock(page); |
3641 | kfree(map); | 3677 | bitmap_free(map); |
3642 | #endif | 3678 | #endif |
3643 | } | 3679 | } |
3644 | 3680 | ||
@@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
4411 | { | 4447 | { |
4412 | int node; | 4448 | int node; |
4413 | unsigned long count = 0; | 4449 | unsigned long count = 0; |
4414 | unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), | ||
4415 | sizeof(unsigned long), | ||
4416 | GFP_KERNEL); | ||
4417 | struct kmem_cache_node *n; | 4450 | struct kmem_cache_node *n; |
4451 | unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); | ||
4418 | 4452 | ||
4419 | if (!map) | 4453 | if (!map) |
4420 | return -ENOMEM; | 4454 | return -ENOMEM; |
@@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
4422 | flush_all(s); | 4456 | flush_all(s); |
4423 | for_each_kmem_cache_node(s, node, n) | 4457 | for_each_kmem_cache_node(s, node, n) |
4424 | count += validate_slab_node(s, n, map); | 4458 | count += validate_slab_node(s, n, map); |
4425 | kfree(map); | 4459 | bitmap_free(map); |
4426 | return count; | 4460 | return count; |
4427 | } | 4461 | } |
4428 | /* | 4462 | /* |
@@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4573 | unsigned long i; | 4607 | unsigned long i; |
4574 | struct loc_track t = { 0, 0, NULL }; | 4608 | struct loc_track t = { 0, 0, NULL }; |
4575 | int node; | 4609 | int node; |
4576 | unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), | ||
4577 | sizeof(unsigned long), | ||
4578 | GFP_KERNEL); | ||
4579 | struct kmem_cache_node *n; | 4610 | struct kmem_cache_node *n; |
4611 | unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); | ||
4580 | 4612 | ||
4581 | if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 4613 | if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
4582 | GFP_KERNEL)) { | 4614 | GFP_KERNEL)) { |
4583 | kfree(map); | 4615 | bitmap_free(map); |
4584 | return sprintf(buf, "Out of memory\n"); | 4616 | return sprintf(buf, "Out of memory\n"); |
4585 | } | 4617 | } |
4586 | /* Push back cpu slabs */ | 4618 | /* Push back cpu slabs */ |
@@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4646 | } | 4678 | } |
4647 | 4679 | ||
4648 | free_loc_track(&t); | 4680 | free_loc_track(&t); |
4649 | kfree(map); | 4681 | bitmap_free(map); |
4650 | if (!t.count) | 4682 | if (!t.count) |
4651 | len += sprintf(buf, "No data\n"); | 4683 | len += sprintf(buf, "No data\n"); |
4652 | return len; | 4684 | return len; |
@@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4657 | static void __init resiliency_test(void) | 4689 | static void __init resiliency_test(void) |
4658 | { | 4690 | { |
4659 | u8 *p; | 4691 | u8 *p; |
4692 | int type = KMALLOC_NORMAL; | ||
4660 | 4693 | ||
4661 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); | 4694 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); |
4662 | 4695 | ||
@@ -4669,7 +4702,7 @@ static void __init resiliency_test(void) | |||
4669 | pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", | 4702 | pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", |
4670 | p + 16); | 4703 | p + 16); |
4671 | 4704 | ||
4672 | validate_slab_cache(kmalloc_caches[4]); | 4705 | validate_slab_cache(kmalloc_caches[type][4]); |
4673 | 4706 | ||
4674 | /* Hmmm... The next two are dangerous */ | 4707 | /* Hmmm... The next two are dangerous */ |
4675 | p = kzalloc(32, GFP_KERNEL); | 4708 | p = kzalloc(32, GFP_KERNEL); |
@@ -4678,33 +4711,33 @@ static void __init resiliency_test(void) | |||
4678 | p); | 4711 | p); |
4679 | pr_err("If allocated object is overwritten then not detectable\n\n"); | 4712 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4680 | 4713 | ||
4681 | validate_slab_cache(kmalloc_caches[5]); | 4714 | validate_slab_cache(kmalloc_caches[type][5]); |
4682 | p = kzalloc(64, GFP_KERNEL); | 4715 | p = kzalloc(64, GFP_KERNEL); |
4683 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | 4716 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); |
4684 | *p = 0x56; | 4717 | *p = 0x56; |
4685 | pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | 4718 | pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", |
4686 | p); | 4719 | p); |
4687 | pr_err("If allocated object is overwritten then not detectable\n\n"); | 4720 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4688 | validate_slab_cache(kmalloc_caches[6]); | 4721 | validate_slab_cache(kmalloc_caches[type][6]); |
4689 | 4722 | ||
4690 | pr_err("\nB. Corruption after free\n"); | 4723 | pr_err("\nB. Corruption after free\n"); |
4691 | p = kzalloc(128, GFP_KERNEL); | 4724 | p = kzalloc(128, GFP_KERNEL); |
4692 | kfree(p); | 4725 | kfree(p); |
4693 | *p = 0x78; | 4726 | *p = 0x78; |
4694 | pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | 4727 | pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); |
4695 | validate_slab_cache(kmalloc_caches[7]); | 4728 | validate_slab_cache(kmalloc_caches[type][7]); |
4696 | 4729 | ||
4697 | p = kzalloc(256, GFP_KERNEL); | 4730 | p = kzalloc(256, GFP_KERNEL); |
4698 | kfree(p); | 4731 | kfree(p); |
4699 | p[50] = 0x9a; | 4732 | p[50] = 0x9a; |
4700 | pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | 4733 | pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); |
4701 | validate_slab_cache(kmalloc_caches[8]); | 4734 | validate_slab_cache(kmalloc_caches[type][8]); |
4702 | 4735 | ||
4703 | p = kzalloc(512, GFP_KERNEL); | 4736 | p = kzalloc(512, GFP_KERNEL); |
4704 | kfree(p); | 4737 | kfree(p); |
4705 | p[512] = 0xab; | 4738 | p[512] = 0xab; |
4706 | pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | 4739 | pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); |
4707 | validate_slab_cache(kmalloc_caches[9]); | 4740 | validate_slab_cache(kmalloc_caches[type][9]); |
4708 | } | 4741 | } |
4709 | #else | 4742 | #else |
4710 | #ifdef CONFIG_SYSFS | 4743 | #ifdef CONFIG_SYSFS |
diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..67ad061f7fb8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, | |||
696 | goto out; | 696 | goto out; |
697 | } | 697 | } |
698 | 698 | ||
699 | #ifdef CONFIG_DEBUG_VM | ||
700 | /* | 699 | /* |
701 | * Poison uninitialized struct pages in order to catch invalid flags | 700 | * Poison uninitialized struct pages in order to catch invalid flags |
702 | * combinations. | 701 | * combinations. |
703 | */ | 702 | */ |
704 | memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); | 703 | page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); |
705 | #endif | ||
706 | 704 | ||
707 | section_mark_present(ms); | 705 | section_mark_present(ms); |
708 | sparse_init_one_section(ms, section_nr, memmap, usemap); | 706 | sparse_init_one_section(ms, section_nr, memmap, usemap); |
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
32 | #include <linux/memremap.h> | ||
33 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
34 | #include <linux/gfp.h> | 33 | #include <linux/gfp.h> |
35 | #include <linux/uio.h> | 34 | #include <linux/uio.h> |
diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..0d6a7f268d2e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
448 | /* | 448 | /* |
449 | * Initiate read into locked page and return. | 449 | * Initiate read into locked page and return. |
450 | */ | 450 | */ |
451 | SetPageWorkingset(new_page); | ||
451 | lru_cache_add_anon(new_page); | 452 | lru_cache_add_anon(new_page); |
452 | *new_page_allocated = true; | 453 | *new_page_allocated = true; |
453 | return new_page; | 454 | return new_page; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index d954b71c4f9c..644f746e167a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent) | |||
103 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ | 103 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ |
104 | } | 104 | } |
105 | 105 | ||
106 | /* Reclaim the swap entry anyway if possible */ | ||
107 | #define TTRS_ANYWAY 0x1 | ||
108 | /* | ||
109 | * Reclaim the swap entry if there are no more mappings of the | ||
110 | * corresponding page | ||
111 | */ | ||
112 | #define TTRS_UNMAPPED 0x2 | ||
113 | /* Reclaim the swap entry if swap is getting full*/ | ||
114 | #define TTRS_FULL 0x4 | ||
115 | |||
106 | /* returns 1 if swap entry is freed */ | 116 | /* returns 1 if swap entry is freed */ |
107 | static int | 117 | static int __try_to_reclaim_swap(struct swap_info_struct *si, |
108 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 118 | unsigned long offset, unsigned long flags) |
109 | { | 119 | { |
110 | swp_entry_t entry = swp_entry(si->type, offset); | 120 | swp_entry_t entry = swp_entry(si->type, offset); |
111 | struct page *page; | 121 | struct page *page; |
112 | int ret = 0; | 122 | int ret = 0; |
113 | 123 | ||
114 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); | 124 | page = find_get_page(swap_address_space(entry), offset); |
115 | if (!page) | 125 | if (!page) |
116 | return 0; | 126 | return 0; |
117 | /* | 127 | /* |
118 | * This function is called from scan_swap_map() and it's called | 128 | * When this function is called from scan_swap_map_slots() and it's |
119 | * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. | 129 | * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, |
120 | * We have to use trylock for avoiding deadlock. This is a special | 130 | * here. We have to use trylock for avoiding deadlock. This is a special |
121 | * case and you should use try_to_free_swap() with explicit lock_page() | 131 | * case and you should use try_to_free_swap() with explicit lock_page() |
122 | * in usual operations. | 132 | * in usual operations. |
123 | */ | 133 | */ |
124 | if (trylock_page(page)) { | 134 | if (trylock_page(page)) { |
125 | ret = try_to_free_swap(page); | 135 | if ((flags & TTRS_ANYWAY) || |
136 | ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || | ||
137 | ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) | ||
138 | ret = try_to_free_swap(page); | ||
126 | unlock_page(page); | 139 | unlock_page(page); |
127 | } | 140 | } |
128 | put_page(page); | 141 | put_page(page); |
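__try_to_reclaim_swap() now takes TTRS_* flags so each caller can state under which conditions the swap entry may be reclaimed: the old unconditional behaviour corresponds to TTRS_ANYWAY, while free_swap_and_cache() further down passes TTRS_UNMAPPED | TTRS_FULL. A sketch of just the decision predicate, with page_mapped() and mem_cgroup_swap_full() reduced to plain booleans for illustration:

#include <stdio.h>
#include <stdbool.h>

#define TTRS_ANYWAY	0x1	/* reclaim unconditionally */
#define TTRS_UNMAPPED	0x2	/* reclaim if no more mappings of the page */
#define TTRS_FULL	0x4	/* reclaim if swap is getting full */

/* Mirrors the trylock_page() branch above, with the page state passed in. */
static bool should_reclaim(unsigned long flags, bool page_mapped, bool swap_full)
{
	return (flags & TTRS_ANYWAY) ||
	       ((flags & TTRS_UNMAPPED) && !page_mapped) ||
	       ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
	/* free_swap_and_cache(): page no longer mapped -> reclaim the entry */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, false, false));
	/* same flags, but the page is still mapped and swap has room -> keep it */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, true, false));
	return 0;
}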
@@ -780,7 +793,7 @@ checks: | |||
780 | int swap_was_freed; | 793 | int swap_was_freed; |
781 | unlock_cluster(ci); | 794 | unlock_cluster(ci); |
782 | spin_unlock(&si->lock); | 795 | spin_unlock(&si->lock); |
783 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 796 | swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); |
784 | spin_lock(&si->lock); | 797 | spin_lock(&si->lock); |
785 | /* entry was freed successfully, try to use this again */ | 798 | /* entry was freed successfully, try to use this again */ |
786 | if (swap_was_freed) | 799 | if (swap_was_freed) |
@@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) | |||
919 | struct swap_cluster_info *ci; | 932 | struct swap_cluster_info *ci; |
920 | 933 | ||
921 | ci = lock_cluster(si, offset); | 934 | ci = lock_cluster(si, offset); |
935 | memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); | ||
922 | cluster_set_count_flag(ci, 0, 0); | 936 | cluster_set_count_flag(ci, 0, 0); |
923 | free_cluster(si, idx); | 937 | free_cluster(si, idx); |
924 | unlock_cluster(ci); | 938 | unlock_cluster(ci); |
@@ -989,7 +1003,7 @@ start_over: | |||
989 | goto nextsi; | 1003 | goto nextsi; |
990 | } | 1004 | } |
991 | if (size == SWAPFILE_CLUSTER) { | 1005 | if (size == SWAPFILE_CLUSTER) { |
992 | if (!(si->flags & SWP_FILE)) | 1006 | if (!(si->flags & SWP_FS)) |
993 | n_ret = swap_alloc_cluster(si, swp_entries); | 1007 | n_ret = swap_alloc_cluster(si, swp_entries); |
994 | } else | 1008 | } else |
995 | n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, | 1009 | n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, |
@@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, | |||
1169 | ci = lock_cluster_or_swap_info(p, offset); | 1183 | ci = lock_cluster_or_swap_info(p, offset); |
1170 | usage = __swap_entry_free_locked(p, offset, usage); | 1184 | usage = __swap_entry_free_locked(p, offset, usage); |
1171 | unlock_cluster_or_swap_info(p, ci); | 1185 | unlock_cluster_or_swap_info(p, ci); |
1186 | if (!usage) | ||
1187 | free_swap_slot(entry); | ||
1172 | 1188 | ||
1173 | return usage; | 1189 | return usage; |
1174 | } | 1190 | } |
@@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry) | |||
1199 | struct swap_info_struct *p; | 1215 | struct swap_info_struct *p; |
1200 | 1216 | ||
1201 | p = _swap_info_get(entry); | 1217 | p = _swap_info_get(entry); |
1202 | if (p) { | 1218 | if (p) |
1203 | if (!__swap_entry_free(p, entry, 1)) | 1219 | __swap_entry_free(p, entry, 1); |
1204 | free_swap_slot(entry); | ||
1205 | } | ||
1206 | } | 1220 | } |
1207 | 1221 | ||
1208 | /* | 1222 | /* |
@@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry) | |||
1237 | if (free_entries == SWAPFILE_CLUSTER) { | 1251 | if (free_entries == SWAPFILE_CLUSTER) { |
1238 | unlock_cluster_or_swap_info(si, ci); | 1252 | unlock_cluster_or_swap_info(si, ci); |
1239 | spin_lock(&si->lock); | 1253 | spin_lock(&si->lock); |
1240 | ci = lock_cluster(si, offset); | ||
1241 | memset(map, 0, SWAPFILE_CLUSTER); | ||
1242 | unlock_cluster(ci); | ||
1243 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); | 1254 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); |
1244 | swap_free_cluster(si, idx); | 1255 | swap_free_cluster(si, idx); |
1245 | spin_unlock(&si->lock); | 1256 | spin_unlock(&si->lock); |
@@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page) | |||
1612 | int free_swap_and_cache(swp_entry_t entry) | 1623 | int free_swap_and_cache(swp_entry_t entry) |
1613 | { | 1624 | { |
1614 | struct swap_info_struct *p; | 1625 | struct swap_info_struct *p; |
1615 | struct page *page = NULL; | ||
1616 | unsigned char count; | 1626 | unsigned char count; |
1617 | 1627 | ||
1618 | if (non_swap_entry(entry)) | 1628 | if (non_swap_entry(entry)) |
@@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry) | |||
1622 | if (p) { | 1632 | if (p) { |
1623 | count = __swap_entry_free(p, entry, 1); | 1633 | count = __swap_entry_free(p, entry, 1); |
1624 | if (count == SWAP_HAS_CACHE && | 1634 | if (count == SWAP_HAS_CACHE && |
1625 | !swap_page_trans_huge_swapped(p, entry)) { | 1635 | !swap_page_trans_huge_swapped(p, entry)) |
1626 | page = find_get_page(swap_address_space(entry), | 1636 | __try_to_reclaim_swap(p, swp_offset(entry), |
1627 | swp_offset(entry)); | 1637 | TTRS_UNMAPPED | TTRS_FULL); |
1628 | if (page && !trylock_page(page)) { | ||
1629 | put_page(page); | ||
1630 | page = NULL; | ||
1631 | } | ||
1632 | } else if (!count) | ||
1633 | free_swap_slot(entry); | ||
1634 | } | ||
1635 | if (page) { | ||
1636 | /* | ||
1637 | * Not mapped elsewhere, or swap space full? Free it! | ||
1638 | * Also recheck PageSwapCache now page is locked (above). | ||
1639 | */ | ||
1640 | if (PageSwapCache(page) && !PageWriteback(page) && | ||
1641 | (!page_mapped(page) || mem_cgroup_swap_full(page)) && | ||
1642 | !swap_page_trans_huge_swapped(p, entry)) { | ||
1643 | page = compound_head(page); | ||
1644 | delete_from_swap_cache(page); | ||
1645 | SetPageDirty(page); | ||
1646 | } | ||
1647 | unlock_page(page); | ||
1648 | put_page(page); | ||
1649 | } | 1638 | } |
1650 | return p != NULL; | 1639 | return p != NULL; |
1651 | } | 1640 | } |
@@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
2310 | kfree(se); | 2299 | kfree(se); |
2311 | } | 2300 | } |
2312 | 2301 | ||
2313 | if (sis->flags & SWP_FILE) { | 2302 | if (sis->flags & SWP_ACTIVATED) { |
2314 | struct file *swap_file = sis->swap_file; | 2303 | struct file *swap_file = sis->swap_file; |
2315 | struct address_space *mapping = swap_file->f_mapping; | 2304 | struct address_space *mapping = swap_file->f_mapping; |
2316 | 2305 | ||
2317 | sis->flags &= ~SWP_FILE; | 2306 | sis->flags &= ~SWP_ACTIVATED; |
2318 | mapping->a_ops->swap_deactivate(swap_file); | 2307 | if (mapping->a_ops->swap_deactivate) |
2308 | mapping->a_ops->swap_deactivate(swap_file); | ||
2319 | } | 2309 | } |
2320 | } | 2310 | } |
2321 | 2311 | ||
@@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
2364 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); | 2354 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
2365 | return 1; | 2355 | return 1; |
2366 | } | 2356 | } |
2357 | EXPORT_SYMBOL_GPL(add_swap_extent); | ||
2367 | 2358 | ||
2368 | /* | 2359 | /* |
2369 | * A `swap extent' is a simple thing which maps a contiguous range of pages | 2360 | * A `swap extent' is a simple thing which maps a contiguous range of pages |
@@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
2411 | 2402 | ||
2412 | if (mapping->a_ops->swap_activate) { | 2403 | if (mapping->a_ops->swap_activate) { |
2413 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); | 2404 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
2405 | if (ret >= 0) | ||
2406 | sis->flags |= SWP_ACTIVATED; | ||
2414 | if (!ret) { | 2407 | if (!ret) { |
2415 | sis->flags |= SWP_FILE; | 2408 | sis->flags |= SWP_FS; |
2416 | ret = add_swap_extent(sis, 0, sis->max, 0); | 2409 | ret = add_swap_extent(sis, 0, sis->max, 0); |
2417 | *span = sis->pages; | 2410 | *span = sis->pages; |
2418 | } | 2411 | } |
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node); | |||
435 | * It is slightly more efficient to use kfree() or vfree() if you are certain | 435 | * It is slightly more efficient to use kfree() or vfree() if you are certain |
436 | * that you know which one to use. | 436 | * that you know which one to use. |
437 | * | 437 | * |
438 | * Context: Any context except NMI. | 438 | * Context: Either preemptible task context or not-NMI interrupt. |
439 | */ | 439 | */ |
440 | void kvfree(const void *addr) | 440 | void kvfree(const void *addr) |
441 | { | 441 | { |
@@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
678 | * Part of the kernel memory, which can be released | 678 | * Part of the kernel memory, which can be released |
679 | * under memory pressure. | 679 | * under memory pressure. |
680 | */ | 680 | */ |
681 | free += global_node_page_state( | 681 | free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
682 | NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; | ||
683 | 682 | ||
684 | /* | 683 | /* |
685 | * Leave reserved pages. The pages are not for anonymous pages. | 684 | * Leave reserved pages. The pages are not for anonymous pages. |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a728fc492557..97d4b25d0373 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr) | |||
1577 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling | 1577 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
1578 | * conventions for vfree() arch-dependent would be a really bad idea) | 1578 | * conventions for vfree() arch-dependent would be a really bad idea) |
1579 | * | 1579 | * |
1580 | * May sleep if called *not* from interrupt context. | ||
1581 | * | ||
1580 | * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) | 1582 | * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) |
1581 | */ | 1583 | */ |
1582 | void vfree(const void *addr) | 1584 | void vfree(const void *addr) |
@@ -1585,6 +1587,8 @@ void vfree(const void *addr) | |||
1585 | 1587 | ||
1586 | kmemleak_free(addr); | 1588 | kmemleak_free(addr); |
1587 | 1589 | ||
1590 | might_sleep_if(!in_interrupt()); | ||
1591 | |||
1588 | if (!addr) | 1592 | if (!addr) |
1589 | return; | 1593 | return; |
1590 | if (unlikely(in_interrupt())) | 1594 | if (unlikely(in_interrupt())) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index c5ef7240cbcb..28c9ae5633b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/prefetch.h> | 49 | #include <linux/prefetch.h> |
50 | #include <linux/printk.h> | 50 | #include <linux/printk.h> |
51 | #include <linux/dax.h> | 51 | #include <linux/dax.h> |
52 | #include <linux/psi.h> | ||
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
54 | #include <asm/div64.h> | 55 | #include <asm/div64.h> |
@@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, | |||
473 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); | 474 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); |
474 | 475 | ||
475 | total_scan = nr; | 476 | total_scan = nr; |
476 | delta = freeable >> priority; | 477 | if (shrinker->seeks) { |
477 | delta *= 4; | 478 | delta = freeable >> priority; |
478 | do_div(delta, shrinker->seeks); | 479 | delta *= 4; |
480 | do_div(delta, shrinker->seeks); | ||
481 | } else { | ||
482 | /* | ||
483 | * These objects don't require any IO to create. Trim | ||
484 | * them aggressively under memory pressure to keep | ||
485 | * them from causing refetches in the IO caches. | ||
486 | */ | ||
487 | delta = freeable / 2; | ||
488 | } | ||
479 | 489 | ||
480 | /* | 490 | /* |
481 | * Make sure we apply some minimal pressure on default priority | 491 | * Make sure we apply some minimal pressure on default priority |
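The do_shrink_slab() change above gives shrinkers with ->seeks == 0 their own scan target: instead of the usual freeable >> priority scaled by 4/seeks, half of everything freeable is scanned on each pass, on the grounds that such objects cost no IO to rebuild. A small sketch of the two branches (DEFAULT_SEEKS of 2 and a default reclaim priority of 12 are assumed for the example numbers):

#include <stdio.h>

#define DEFAULT_SEEKS 2

/* Scan target for one shrinker invocation, following the patched logic. */
static unsigned long shrink_delta(unsigned long freeable, int priority, int seeks)
{
	unsigned long delta;

	if (seeks) {
		delta = freeable >> priority;
		delta *= 4;
		delta /= seeks;			/* do_div() in the kernel */
	} else {
		/* No IO needed to recreate these objects: trim aggressively. */
		delta = freeable / 2;
	}
	return delta;
}

int main(void)
{
	/* 1024 freeable objects at the default priority */
	printf("seeks=2: %lu\n", shrink_delta(1024, 12, DEFAULT_SEEKS));	/* 0 */
	printf("seeks=0: %lu\n", shrink_delta(1024, 12, 0));			/* 512 */
	return 0;
}

This is what lets the shadow-node shrinker at the end of mm/workingset.c switch to .seeks = 0 below, since count_shadow_nodes() only reports nodes that are fully expendable.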
@@ -2145,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2145 | } | 2155 | } |
2146 | 2156 | ||
2147 | ClearPageActive(page); /* we are de-activating */ | 2157 | ClearPageActive(page); /* we are de-activating */ |
2158 | SetPageWorkingset(page); | ||
2148 | list_add(&page->lru, &l_inactive); | 2159 | list_add(&page->lru, &l_inactive); |
2149 | } | 2160 | } |
2150 | 2161 | ||
@@ -2456,9 +2467,11 @@ out: | |||
2456 | /* | 2467 | /* |
2457 | * Scan types proportional to swappiness and | 2468 | * Scan types proportional to swappiness and |
2458 | * their relative recent reclaim efficiency. | 2469 | * their relative recent reclaim efficiency. |
2470 | * Make sure we don't miss the last page | ||
2471 | * because of a round-off error. | ||
2459 | */ | 2472 | */ |
2460 | scan = div64_u64(scan * fraction[file], | 2473 | scan = DIV64_U64_ROUND_UP(scan * fraction[file], |
2461 | denominator); | 2474 | denominator); |
2462 | break; | 2475 | break; |
2463 | case SCAN_FILE: | 2476 | case SCAN_FILE: |
2464 | case SCAN_ANON: | 2477 | case SCAN_ANON: |
@@ -3302,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3302 | { | 3315 | { |
3303 | struct zonelist *zonelist; | 3316 | struct zonelist *zonelist; |
3304 | unsigned long nr_reclaimed; | 3317 | unsigned long nr_reclaimed; |
3318 | unsigned long pflags; | ||
3305 | int nid; | 3319 | int nid; |
3306 | unsigned int noreclaim_flag; | 3320 | unsigned int noreclaim_flag; |
3307 | struct scan_control sc = { | 3321 | struct scan_control sc = { |
@@ -3330,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3330 | sc.gfp_mask, | 3344 | sc.gfp_mask, |
3331 | sc.reclaim_idx); | 3345 | sc.reclaim_idx); |
3332 | 3346 | ||
3347 | psi_memstall_enter(&pflags); | ||
3333 | noreclaim_flag = memalloc_noreclaim_save(); | 3348 | noreclaim_flag = memalloc_noreclaim_save(); |
3349 | |||
3334 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 3350 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); |
3351 | |||
3335 | memalloc_noreclaim_restore(noreclaim_flag); | 3352 | memalloc_noreclaim_restore(noreclaim_flag); |
3353 | psi_memstall_leave(&pflags); | ||
3336 | 3354 | ||
3337 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 3355 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
3338 | 3356 | ||
@@ -3497,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3497 | int i; | 3515 | int i; |
3498 | unsigned long nr_soft_reclaimed; | 3516 | unsigned long nr_soft_reclaimed; |
3499 | unsigned long nr_soft_scanned; | 3517 | unsigned long nr_soft_scanned; |
3518 | unsigned long pflags; | ||
3500 | struct zone *zone; | 3519 | struct zone *zone; |
3501 | struct scan_control sc = { | 3520 | struct scan_control sc = { |
3502 | .gfp_mask = GFP_KERNEL, | 3521 | .gfp_mask = GFP_KERNEL, |
@@ -3507,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3507 | .may_swap = 1, | 3526 | .may_swap = 1, |
3508 | }; | 3527 | }; |
3509 | 3528 | ||
3529 | psi_memstall_enter(&pflags); | ||
3510 | __fs_reclaim_acquire(); | 3530 | __fs_reclaim_acquire(); |
3511 | 3531 | ||
3512 | count_vm_event(PAGEOUTRUN); | 3532 | count_vm_event(PAGEOUTRUN); |
@@ -3608,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3608 | out: | 3628 | out: |
3609 | snapshot_refaults(NULL, pgdat); | 3629 | snapshot_refaults(NULL, pgdat); |
3610 | __fs_reclaim_release(); | 3630 | __fs_reclaim_release(); |
3631 | psi_memstall_leave(&pflags); | ||
3611 | /* | 3632 | /* |
3612 | * Return the order kswapd stopped reclaiming at as | 3633 | * Return the order kswapd stopped reclaiming at as |
3613 | * prepare_kswapd_sleep() takes it into account. If another caller | 3634 | * prepare_kswapd_sleep() takes it into account. If another caller |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 7878da76abf2..6038ce593ce3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = { | |||
1143 | "nr_slab_unreclaimable", | 1143 | "nr_slab_unreclaimable", |
1144 | "nr_isolated_anon", | 1144 | "nr_isolated_anon", |
1145 | "nr_isolated_file", | 1145 | "nr_isolated_file", |
1146 | "workingset_nodes", | ||
1146 | "workingset_refault", | 1147 | "workingset_refault", |
1147 | "workingset_activate", | 1148 | "workingset_activate", |
1149 | "workingset_restore", | ||
1148 | "workingset_nodereclaim", | 1150 | "workingset_nodereclaim", |
1149 | "nr_anon_pages", | 1151 | "nr_anon_pages", |
1150 | "nr_mapped", | 1152 | "nr_mapped", |
@@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = { | |||
1161 | "nr_vmscan_immediate_reclaim", | 1163 | "nr_vmscan_immediate_reclaim", |
1162 | "nr_dirtied", | 1164 | "nr_dirtied", |
1163 | "nr_written", | 1165 | "nr_written", |
1164 | "", /* nr_indirectly_reclaimable */ | 1166 | "nr_kernel_misc_reclaimable", |
1165 | 1167 | ||
1166 | /* enum writeback_stat_item counters */ | 1168 | /* enum writeback_stat_item counters */ |
1167 | "nr_dirty_threshold", | 1169 | "nr_dirty_threshold", |
@@ -1663,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) | |||
1663 | stat_items_size += sizeof(struct vm_event_state); | 1665 | stat_items_size += sizeof(struct vm_event_state); |
1664 | #endif | 1666 | #endif |
1665 | 1667 | ||
1668 | BUILD_BUG_ON(stat_items_size != | ||
1669 | ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); | ||
1666 | v = kmalloc(stat_items_size, GFP_KERNEL); | 1670 | v = kmalloc(stat_items_size, GFP_KERNEL); |
1667 | m->private = v; | 1671 | m->private = v; |
1668 | if (!v) | 1672 | if (!v) |
@@ -1706,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg) | |||
1706 | unsigned long *l = arg; | 1710 | unsigned long *l = arg; |
1707 | unsigned long off = l - (unsigned long *)m->private; | 1711 | unsigned long off = l - (unsigned long *)m->private; |
1708 | 1712 | ||
1709 | /* Skip hidden vmstat items. */ | ||
1710 | if (*vmstat_text[off] == '\0') | ||
1711 | return 0; | ||
1712 | |||
1713 | seq_puts(m, vmstat_text[off]); | 1713 | seq_puts(m, vmstat_text[off]); |
1714 | seq_put_decimal_ull(m, " ", *l); | 1714 | seq_put_decimal_ull(m, " ", *l); |
1715 | seq_putc(m, '\n'); | 1715 | seq_putc(m, '\n'); |
diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..cbc13d4dfa79 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -121,7 +121,7 @@ | |||
121 | * the only thing eating into inactive list space is active pages. | 121 | * the only thing eating into inactive list space is active pages. |
122 | * | 122 | * |
123 | * | 123 | * |
124 | * Activating refaulting pages | 124 | * Refaulting inactive pages |
125 | * | 125 | * |
126 | * All that is known about the active list is that the pages have been | 126 | * All that is known about the active list is that the pages have been |
127 | * accessed more than once in the past. This means that at any given | 127 | * accessed more than once in the past. This means that at any given |
@@ -134,6 +134,10 @@ | |||
134 | * used less frequently than the refaulting page - or even not used at | 134 | * used less frequently than the refaulting page - or even not used at |
135 | * all anymore. | 135 | * all anymore. |
136 | * | 136 | * |
137 | * That means if inactive cache is refaulting with a suitable refault | ||
138 | * distance, we assume the cache workingset is transitioning and put | ||
139 | * pressure on the current active list. | ||
140 | * | ||
137 | * If this is wrong and demotion kicks in, the pages which are truly | 141 | * If this is wrong and demotion kicks in, the pages which are truly |
138 | * used more frequently will be reactivated while the less frequently | 142 | * used more frequently will be reactivated while the less frequently |
139 | * used once will be evicted from memory. | 143 | * used once will be evicted from memory. |
@@ -141,6 +145,14 @@ | |||
141 | * But if this is right, the stale pages will be pushed out of memory | 145 | * But if this is right, the stale pages will be pushed out of memory |
142 | * and the used pages get to stay in cache. | 146 | * and the used pages get to stay in cache. |
143 | * | 147 | * |
148 | * Refaulting active pages | ||
149 | * | ||
150 | * If on the other hand the refaulting pages have recently been | ||
151 | * deactivated, it means that the active list is no longer protecting | ||
152 | * actively used cache from reclaim. The cache is NOT transitioning to | ||
153 | * a different workingset; the existing workingset is thrashing in the | ||
154 | * space allocated to the page cache. | ||
155 | * | ||
144 | * | 156 | * |
145 | * Implementation | 157 | * Implementation |
146 | * | 158 | * |
@@ -156,8 +168,7 @@ | |||
156 | */ | 168 | */ |
157 | 169 | ||
158 | #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ | 170 | #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ |
159 | NODES_SHIFT + \ | 171 | 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) |
160 | MEM_CGROUP_ID_SHIFT) | ||
161 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) | 172 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) |
162 | 173 | ||
163 | /* | 174 | /* |
@@ -170,23 +181,28 @@ | |||
170 | */ | 181 | */ |
171 | static unsigned int bucket_order __read_mostly; | 182 | static unsigned int bucket_order __read_mostly; |
172 | 183 | ||
173 | static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) | 184 | static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, |
185 | bool workingset) | ||
174 | { | 186 | { |
175 | eviction >>= bucket_order; | 187 | eviction >>= bucket_order; |
176 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; | 188 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; |
177 | eviction = (eviction << NODES_SHIFT) | pgdat->node_id; | 189 | eviction = (eviction << NODES_SHIFT) | pgdat->node_id; |
190 | eviction = (eviction << 1) | workingset; | ||
178 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); | 191 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); |
179 | 192 | ||
180 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); | 193 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); |
181 | } | 194 | } |
182 | 195 | ||
183 | static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, | 196 | static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, |
184 | unsigned long *evictionp) | 197 | unsigned long *evictionp, bool *workingsetp) |
185 | { | 198 | { |
186 | unsigned long entry = (unsigned long)shadow; | 199 | unsigned long entry = (unsigned long)shadow; |
187 | int memcgid, nid; | 200 | int memcgid, nid; |
201 | bool workingset; | ||
188 | 202 | ||
189 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; | 203 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; |
204 | workingset = entry & 1; | ||
205 | entry >>= 1; | ||
190 | nid = entry & ((1UL << NODES_SHIFT) - 1); | 206 | nid = entry & ((1UL << NODES_SHIFT) - 1); |
191 | entry >>= NODES_SHIFT; | 207 | entry >>= NODES_SHIFT; |
192 | memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); | 208 | memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); |
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, | |||
195 | *memcgidp = memcgid; | 211 | *memcgidp = memcgid; |
196 | *pgdat = NODE_DATA(nid); | 212 | *pgdat = NODE_DATA(nid); |
197 | *evictionp = entry << bucket_order; | 213 | *evictionp = entry << bucket_order; |
214 | *workingsetp = workingset; | ||
198 | } | 215 | } |
199 | 216 | ||
200 | /** | 217 | /** |
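pack_shadow()/unpack_shadow() above gain one extra bit recording whether the page was part of the workingset when it was evicted; the eviction counter, node id, memcg id and that flag all share a single radix-tree exception entry (EVICTION_SHIFT grows by one to match). A user-space sketch of the bit layout, with made-up widths for the shift constants and the bucket_order granularity reduction left out:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative widths; the real values depend on the kernel configuration. */
#define MEM_CGROUP_ID_SHIFT		16
#define NODES_SHIFT			6
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2UL

static void *pack_shadow(int memcgid, int nid, unsigned long eviction,
			 bool workingset)
{
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | nid;
	eviction = (eviction << 1) | workingset;
	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static void unpack_shadow(void *shadow, int *memcgid, int *nid,
			  unsigned long *eviction, bool *workingset)
{
	unsigned long entry = (unsigned long)shadow;

	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
	*workingset = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	*memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;
	*eviction = entry;
}

int main(void)
{
	unsigned long eviction;
	bool workingset;
	int memcgid, nid;
	void *shadow = pack_shadow(42, 3, 123456, true);

	unpack_shadow(shadow, &memcgid, &nid, &eviction, &workingset);
	printf("memcg=%d nid=%d eviction=%lu workingset=%d\n",
	       memcgid, nid, eviction, workingset);
	return 0;
}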
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, | |||
207 | */ | 224 | */ |
208 | void *workingset_eviction(struct address_space *mapping, struct page *page) | 225 | void *workingset_eviction(struct address_space *mapping, struct page *page) |
209 | { | 226 | { |
210 | struct mem_cgroup *memcg = page_memcg(page); | ||
211 | struct pglist_data *pgdat = page_pgdat(page); | 227 | struct pglist_data *pgdat = page_pgdat(page); |
228 | struct mem_cgroup *memcg = page_memcg(page); | ||
212 | int memcgid = mem_cgroup_id(memcg); | 229 | int memcgid = mem_cgroup_id(memcg); |
213 | unsigned long eviction; | 230 | unsigned long eviction; |
214 | struct lruvec *lruvec; | 231 | struct lruvec *lruvec; |
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) | |||
220 | 237 | ||
221 | lruvec = mem_cgroup_lruvec(pgdat, memcg); | 238 | lruvec = mem_cgroup_lruvec(pgdat, memcg); |
222 | eviction = atomic_long_inc_return(&lruvec->inactive_age); | 239 | eviction = atomic_long_inc_return(&lruvec->inactive_age); |
223 | return pack_shadow(memcgid, pgdat, eviction); | 240 | return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); |
224 | } | 241 | } |
225 | 242 | ||
226 | /** | 243 | /** |
227 | * workingset_refault - evaluate the refault of a previously evicted page | 244 | * workingset_refault - evaluate the refault of a previously evicted page |
245 | * @page: the freshly allocated replacement page | ||
228 | * @shadow: shadow entry of the evicted page | 246 | * @shadow: shadow entry of the evicted page |
229 | * | 247 | * |
230 | * Calculates and evaluates the refault distance of the previously | 248 | * Calculates and evaluates the refault distance of the previously |
231 | * evicted page in the context of the node it was allocated in. | 249 | * evicted page in the context of the node it was allocated in. |
232 | * | ||
233 | * Returns %true if the page should be activated, %false otherwise. | ||
234 | */ | 250 | */ |
235 | bool workingset_refault(void *shadow) | 251 | void workingset_refault(struct page *page, void *shadow) |
236 | { | 252 | { |
237 | unsigned long refault_distance; | 253 | unsigned long refault_distance; |
254 | struct pglist_data *pgdat; | ||
238 | unsigned long active_file; | 255 | unsigned long active_file; |
239 | struct mem_cgroup *memcg; | 256 | struct mem_cgroup *memcg; |
240 | unsigned long eviction; | 257 | unsigned long eviction; |
241 | struct lruvec *lruvec; | 258 | struct lruvec *lruvec; |
242 | unsigned long refault; | 259 | unsigned long refault; |
243 | struct pglist_data *pgdat; | 260 | bool workingset; |
244 | int memcgid; | 261 | int memcgid; |
245 | 262 | ||
246 | unpack_shadow(shadow, &memcgid, &pgdat, &eviction); | 263 | unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); |
247 | 264 | ||
248 | rcu_read_lock(); | 265 | rcu_read_lock(); |
249 | /* | 266 | /* |
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) | |||
263 | * configurations instead. | 280 | * configurations instead. |
264 | */ | 281 | */ |
265 | memcg = mem_cgroup_from_id(memcgid); | 282 | memcg = mem_cgroup_from_id(memcgid); |
266 | if (!mem_cgroup_disabled() && !memcg) { | 283 | if (!mem_cgroup_disabled() && !memcg) |
267 | rcu_read_unlock(); | 284 | goto out; |
268 | return false; | ||
269 | } | ||
270 | lruvec = mem_cgroup_lruvec(pgdat, memcg); | 285 | lruvec = mem_cgroup_lruvec(pgdat, memcg); |
271 | refault = atomic_long_read(&lruvec->inactive_age); | 286 | refault = atomic_long_read(&lruvec->inactive_age); |
272 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); | 287 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); |
273 | 288 | ||
274 | /* | 289 | /* |
275 | * The unsigned subtraction here gives an accurate distance | 290 | * Calculate the refault distance |
276 | * across inactive_age overflows in most cases. | ||
277 | * | 291 | * |
278 | * There is a special case: usually, shadow entries have a | 292 | * The unsigned subtraction here gives an accurate distance |
279 | * short lifetime and are either refaulted or reclaimed along | 293 | * across inactive_age overflows in most cases. There is a |
280 | * with the inode before they get too old. But it is not | 294 | * special case: usually, shadow entries have a short lifetime |
281 | * impossible for the inactive_age to lap a shadow entry in | 295 | * and are either refaulted or reclaimed along with the inode |
282 | * the field, which can then can result in a false small | 296 | * before they get too old. But it is not impossible for the |
283 | * refault distance, leading to a false activation should this | 297 | * inactive_age to lap a shadow entry in the field, which can |
284 | * old entry actually refault again. However, earlier kernels | 298 | * then result in a false small refault distance, leading to a |
285 | * used to deactivate unconditionally with *every* reclaim | 299 | * false activation should this old entry actually refault |
286 | * invocation for the longest time, so the occasional | 300 | * again. However, earlier kernels used to deactivate |
287 | * inappropriate activation leading to pressure on the active | 301 | * unconditionally with *every* reclaim invocation for the |
288 | * list is not a problem. | 302 | * longest time, so the occasional inappropriate activation |
303 | * leading to pressure on the active list is not a problem. | ||
289 | */ | 304 | */ |
290 | refault_distance = (refault - eviction) & EVICTION_MASK; | 305 | refault_distance = (refault - eviction) & EVICTION_MASK; |
291 | 306 | ||
292 | inc_lruvec_state(lruvec, WORKINGSET_REFAULT); | 307 | inc_lruvec_state(lruvec, WORKINGSET_REFAULT); |
293 | 308 | ||
294 | if (refault_distance <= active_file) { | 309 | /* |
295 | inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); | 310 | * Compare the distance to the existing workingset size. We |
296 | rcu_read_unlock(); | 311 | * don't act on pages that couldn't stay resident even if all |
297 | return true; | 312 | * the memory was available to the page cache. |
313 | */ | ||
314 | if (refault_distance > active_file) | ||
315 | goto out; | ||
316 | |||
317 | SetPageActive(page); | ||
318 | atomic_long_inc(&lruvec->inactive_age); | ||
319 | inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); | ||
320 | |||
321 | /* Page was active prior to eviction */ | ||
322 | if (workingset) { | ||
323 | SetPageWorkingset(page); | ||
324 | inc_lruvec_state(lruvec, WORKINGSET_RESTORE); | ||
298 | } | 325 | } |
326 | out: | ||
299 | rcu_read_unlock(); | 327 | rcu_read_unlock(); |
300 | return false; | ||
301 | } | 328 | } |
302 | 329 | ||
303 | /** | 330 | /** |
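workingset_refault() now activates the page itself (and re-sets PG_workingset when the shadow entry says the page was previously part of the workingset) instead of returning a verdict to the caller. The core test is unchanged: the refault distance is the unsigned difference between the current inactive_age and the value stored at eviction, masked to the bits that fit in the shadow entry, and only distances up to the active file list size lead to activation. A worked example of the wrap-around-safe arithmetic, with an arbitrary EVICTION_MASK width:

#include <stdio.h>

#define EVICTION_BITS	20
#define EVICTION_MASK	((1UL << EVICTION_BITS) - 1)

/*
 * The unsigned subtraction stays correct even after inactive_age wraps
 * past the number of bits that fit in the shadow entry.
 */
static unsigned long refault_distance(unsigned long refault,
				      unsigned long eviction)
{
	return (refault - eviction) & EVICTION_MASK;
}

int main(void)
{
	unsigned long active_file = 5000;

	/* eviction happened near the top of the 20-bit range ... */
	unsigned long eviction = EVICTION_MASK - 100;
	/* ... and the counter's low bits have since wrapped around to 200 */
	unsigned long refault = 200;

	unsigned long dist = refault_distance(refault, eviction);

	printf("distance=%lu -> %s\n", dist,
	       dist <= active_file ? "activate" : "leave inactive");
	return 0;
}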
@@ -350,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node) | |||
350 | * already where they should be. The list_empty() test is safe | 377 | * already where they should be. The list_empty() test is safe |
351 | * as node->private_list is protected by the i_pages lock. | 378 | * as node->private_list is protected by the i_pages lock. |
352 | */ | 379 | */ |
380 | VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ | ||
381 | |||
353 | if (node->count && node->count == node->exceptional) { | 382 | if (node->count && node->count == node->exceptional) { |
354 | if (list_empty(&node->private_list)) | 383 | if (list_empty(&node->private_list)) { |
355 | list_lru_add(&shadow_nodes, &node->private_list); | 384 | list_lru_add(&shadow_nodes, &node->private_list); |
385 | __inc_lruvec_page_state(virt_to_page(node), | ||
386 | WORKINGSET_NODES); | ||
387 | } | ||
356 | } else { | 388 | } else { |
357 | if (!list_empty(&node->private_list)) | 389 | if (!list_empty(&node->private_list)) { |
358 | list_lru_del(&shadow_nodes, &node->private_list); | 390 | list_lru_del(&shadow_nodes, &node->private_list); |
391 | __dec_lruvec_page_state(virt_to_page(node), | ||
392 | WORKINGSET_NODES); | ||
393 | } | ||
359 | } | 394 | } |
360 | } | 395 | } |
361 | 396 | ||
@@ -364,7 +399,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
364 | { | 399 | { |
365 | unsigned long max_nodes; | 400 | unsigned long max_nodes; |
366 | unsigned long nodes; | 401 | unsigned long nodes; |
367 | unsigned long cache; | 402 | unsigned long pages; |
368 | 403 | ||
369 | nodes = list_lru_shrink_count(&shadow_nodes, sc); | 404 | nodes = list_lru_shrink_count(&shadow_nodes, sc); |
370 | 405 | ||
@@ -390,14 +425,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
390 | * | 425 | * |
391 | * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE | 426 | * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE |
392 | */ | 427 | */ |
428 | #ifdef CONFIG_MEMCG | ||
393 | if (sc->memcg) { | 429 | if (sc->memcg) { |
394 | cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, | 430 | struct lruvec *lruvec; |
395 | LRU_ALL_FILE); | 431 | |
396 | } else { | 432 | pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, |
397 | cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + | 433 | LRU_ALL); |
398 | node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); | 434 | lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); |
399 | } | 435 | pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); |
400 | max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); | 436 | pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); |
437 | } else | ||
438 | #endif | ||
439 | pages = node_present_pages(sc->nid); | ||
440 | |||
441 | max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); | ||
401 | 442 | ||
402 | if (!nodes) | 443 | if (!nodes) |
403 | return SHRINK_EMPTY; | 444 | return SHRINK_EMPTY; |
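count_shadow_nodes() above now sizes the shadow-node limit against everything that can plausibly hold radix-tree nodes on the node (all present pages, or the cgroup's LRU plus slab pages in the memcg case) rather than file cache alone. With the usual 64-slot radix-tree nodes (a RADIX_TREE_MAP_SHIFT of 6 is assumed here), the limit works out to one shadow node per 8 pages:

#include <stdio.h>

#define RADIX_TREE_MAP_SHIFT	6	/* 64 slots per node on typical configs */

int main(void)
{
	/* a node with 1 GiB of 4k pages present */
	unsigned long pages = 262144;
	unsigned long max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);

	printf("pages=%lu -> shadow node limit=%lu\n", pages, max_nodes);
	return 0;
}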
@@ -440,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
440 | } | 481 | } |
441 | 482 | ||
442 | list_lru_isolate(lru, item); | 483 | list_lru_isolate(lru, item); |
484 | __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); | ||
485 | |||
443 | spin_unlock(lru_lock); | 486 | spin_unlock(lru_lock); |
444 | 487 | ||
445 | /* | 488 | /* |
@@ -467,7 +510,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
467 | } | 510 | } |
468 | if (WARN_ON_ONCE(node->exceptional)) | 511 | if (WARN_ON_ONCE(node->exceptional)) |
469 | goto out_invalid; | 512 | goto out_invalid; |
470 | inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); | 513 | __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); |
471 | __radix_tree_delete_node(&mapping->i_pages, node, | 514 | __radix_tree_delete_node(&mapping->i_pages, node, |
472 | workingset_lookup_update(mapping)); | 515 | workingset_lookup_update(mapping)); |
473 | 516 | ||
@@ -491,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, | |||
491 | static struct shrinker workingset_shadow_shrinker = { | 534 | static struct shrinker workingset_shadow_shrinker = { |
492 | .count_objects = count_shadow_nodes, | 535 | .count_objects = count_shadow_nodes, |
493 | .scan_objects = scan_shadow_nodes, | 536 | .scan_objects = scan_shadow_nodes, |
494 | .seeks = DEFAULT_SEEKS, | 537 | .seeks = 0, /* ->count reports only fully expendable nodes */ |
495 | .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, | 538 | .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, |
496 | }; | 539 | }; |
497 | 540 | ||
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9da65552e7ca..0787d33b80d8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, | |||
418 | case ZPOOL_MM_WO: | 418 | case ZPOOL_MM_WO: |
419 | zs_mm = ZS_MM_WO; | 419 | zs_mm = ZS_MM_WO; |
420 | break; | 420 | break; |
421 | case ZPOOL_MM_RW: /* fallthru */ | 421 | case ZPOOL_MM_RW: /* fall through */ |
422 | default: | 422 | default: |
423 | zs_mm = ZS_MM_RW; | 423 | zs_mm = ZS_MM_RW; |
424 | break; | 424 | break; |
diff --git a/scripts/tags.sh b/scripts/tags.sh index 26de7d5aa5c8..4fa070f9231a 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh | |||
@@ -203,7 +203,7 @@ regex_c=( | |||
203 | '/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/' | 203 | '/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/' |
204 | '/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/' | 204 | '/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/' |
205 | '/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/' | 205 | '/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/' |
206 | '/\<DEFINE_HASHTABLE(\([[:alnum:]_]*\)/\1/v/' | 206 | '/\<\(DEFINE\|DECLARE\)_HASHTABLE(\([[:alnum:]_]*\)/\2/v/' |
207 | ) | 207 | ) |
208 | regex_kconfig=( | 208 | regex_kconfig=( |
209 | '/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/' | 209 | '/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/' |
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 9f420d98b5fb..8cb504d30384 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c | |||
@@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) | |||
203 | "SWAP %15s%15s%15s\n" | 203 | "SWAP %15s%15s%15s\n" |
204 | " %15llu%15llu%15llums\n" | 204 | " %15llu%15llu%15llums\n" |
205 | "RECLAIM %12s%15s%15s\n" | 205 | "RECLAIM %12s%15s%15s\n" |
206 | " %15llu%15llu%15llums\n" | ||
207 | "THRASHING%12s%15s%15s\n" | ||
206 | " %15llu%15llu%15llums\n", | 208 | " %15llu%15llu%15llums\n", |
207 | "count", "real total", "virtual total", | 209 | "count", "real total", "virtual total", |
208 | "delay total", "delay average", | 210 | "delay total", "delay average", |
@@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) | |||
222 | "count", "delay total", "delay average", | 224 | "count", "delay total", "delay average", |
223 | (unsigned long long)t->freepages_count, | 225 | (unsigned long long)t->freepages_count, |
224 | (unsigned long long)t->freepages_delay_total, | 226 | (unsigned long long)t->freepages_delay_total, |
225 | average_ms(t->freepages_delay_total, t->freepages_count)); | 227 | average_ms(t->freepages_delay_total, t->freepages_count), |
228 | "count", "delay total", "delay average", | ||
229 | (unsigned long long)t->thrashing_count, | ||
230 | (unsigned long long)t->thrashing_delay_total, | ||
231 | average_ms(t->thrashing_delay_total, t->thrashing_count)); | ||
226 | } | 232 | } |
227 | 233 | ||
228 | static void task_context_switch_counts(struct taskstats *t) | 234 | static void task_context_switch_counts(struct taskstats *t) |
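
The getdelays.c hunk extends the report with the new per-task thrashing delay fields. A hedged sketch of the arithmetic behind the printed averages follows; it is not the tool's own code, and it assumes the updated linux/taskstats.h from this series (version 9), where thrashing_delay_total, like the other delay totals, is accumulated in nanoseconds.

	#include <stdio.h>
	#include <linux/taskstats.h>

	/*
	 * Totals are nanoseconds, so divide by 1e6 for milliseconds and by the
	 * event count (guarding against zero) for the per-event average.
	 */
	static unsigned long long avg_ms(unsigned long long total_ns,
					 unsigned long long count)
	{
		return total_ns / 1000000ULL / (count ? count : 1);
	}

	static void print_thrashing(const struct taskstats *t)
	{
		printf("THRASHING count=%llu total=%llu ns avg=%llu ms\n",
		       (unsigned long long)t->thrashing_count,
		       (unsigned long long)t->thrashing_delay_total,
		       avg_ms(t->thrashing_delay_total, t->thrashing_count));
	}
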
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index af5ff83f6d7f..31b3c98b6d34 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore | |||
@@ -13,3 +13,4 @@ mlock-random-test | |||
13 | virtual_address_range | 13 | virtual_address_range |
14 | gup_benchmark | 14 | gup_benchmark |
15 | va_128TBswitch | 15 | va_128TBswitch |
16 | map_fixed_noreplace | ||
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e94b7b14bcb2..6e67e726e5a5 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
@@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark | |||
12 | TEST_GEN_FILES += hugepage-mmap | 12 | TEST_GEN_FILES += hugepage-mmap |
13 | TEST_GEN_FILES += hugepage-shm | 13 | TEST_GEN_FILES += hugepage-shm |
14 | TEST_GEN_FILES += map_hugetlb | 14 | TEST_GEN_FILES += map_hugetlb |
15 | TEST_GEN_FILES += map_fixed_noreplace | ||
15 | TEST_GEN_FILES += map_populate | 16 | TEST_GEN_FILES += map_populate |
16 | TEST_GEN_FILES += mlock-random-test | 17 | TEST_GEN_FILES += mlock-random-test |
17 | TEST_GEN_FILES += mlock2-tests | 18 | TEST_GEN_FILES += mlock2-tests |
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 36df55132036..880b96fc80d4 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c | |||
@@ -15,9 +15,12 @@ | |||
15 | #define PAGE_SIZE sysconf(_SC_PAGESIZE) | 15 | #define PAGE_SIZE sysconf(_SC_PAGESIZE) |
16 | 16 | ||
17 | #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) | 17 | #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) |
18 | #define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) | ||
19 | #define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) | ||
18 | 20 | ||
19 | struct gup_benchmark { | 21 | struct gup_benchmark { |
20 | __u64 delta_usec; | 22 | __u64 get_delta_usec; |
23 | __u64 put_delta_usec; | ||
21 | __u64 addr; | 24 | __u64 addr; |
22 | __u64 size; | 25 | __u64 size; |
23 | __u32 nr_pages_per_call; | 26 | __u32 nr_pages_per_call; |
@@ -28,10 +31,12 @@ int main(int argc, char **argv) | |||
28 | { | 31 | { |
29 | struct gup_benchmark gup; | 32 | struct gup_benchmark gup; |
30 | unsigned long size = 128 * MB; | 33 | unsigned long size = 128 * MB; |
31 | int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; | 34 | int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; |
35 | int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE; | ||
36 | char *file = "/dev/zero"; | ||
32 | char *p; | 37 | char *p; |
33 | 38 | ||
34 | while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) { | 39 | while ((opt = getopt(argc, argv, "m:r:n:f:tTLUSH")) != -1) { |
35 | switch (opt) { | 40 | switch (opt) { |
36 | case 'm': | 41 | case 'm': |
37 | size = atoi(optarg) * MB; | 42 | size = atoi(optarg) * MB; |
@@ -48,13 +53,36 @@ int main(int argc, char **argv) | |||
48 | case 'T': | 53 | case 'T': |
49 | thp = 0; | 54 | thp = 0; |
50 | break; | 55 | break; |
56 | case 'L': | ||
57 | cmd = GUP_LONGTERM_BENCHMARK; | ||
58 | break; | ||
59 | case 'U': | ||
60 | cmd = GUP_BENCHMARK; | ||
61 | break; | ||
51 | case 'w': | 62 | case 'w': |
52 | write = 1; | 63 | write = 1; |
64 | break; | ||
65 | case 'f': | ||
66 | file = optarg; | ||
67 | break; | ||
68 | case 'S': | ||
69 | flags &= ~MAP_PRIVATE; | ||
70 | flags |= MAP_SHARED; | ||
71 | break; | ||
72 | case 'H': | ||
73 | flags |= MAP_HUGETLB; | ||
74 | break; | ||
53 | default: | 75 | default: |
54 | return -1; | 76 | return -1; |
55 | } | 77 | } |
56 | } | 78 | } |
57 | 79 | ||
80 | filed = open(file, O_RDWR|O_CREAT); | ||
81 | if (filed < 0) { | ||
82 | perror("open"); | ||
83 | exit(filed); | ||
84 | } | ||
85 | |||
58 | gup.nr_pages_per_call = nr_pages; | 86 | gup.nr_pages_per_call = nr_pages; |
59 | gup.flags = write; | 87 | gup.flags = write; |
60 | 88 | ||
@@ -62,8 +90,7 @@ int main(int argc, char **argv) | |||
62 | if (fd == -1) | 90 | if (fd == -1) |
63 | perror("open"), exit(1); | 91 | perror("open"), exit(1); |
64 | 92 | ||
65 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, | 93 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); |
66 | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | ||
67 | if (p == MAP_FAILED) | 94 | if (p == MAP_FAILED) |
68 | perror("mmap"), exit(1); | 95 | perror("mmap"), exit(1); |
69 | gup.addr = (unsigned long)p; | 96 | gup.addr = (unsigned long)p; |
@@ -78,10 +105,11 @@ int main(int argc, char **argv) | |||
78 | 105 | ||
79 | for (i = 0; i < repeats; i++) { | 106 | for (i = 0; i < repeats; i++) { |
80 | gup.size = size; | 107 | gup.size = size; |
81 | if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) | 108 | if (ioctl(fd, cmd, &gup)) |
82 | perror("ioctl"), exit(1); | 109 | perror("ioctl"), exit(1); |
83 | 110 | ||
84 | printf("Time: %lld us", gup.delta_usec); | 111 | printf("Time: get:%lld put:%lld us", gup.get_delta_usec, |
112 | gup.put_delta_usec); | ||
85 | if (gup.size != size) | 113 | if (gup.size != size) |
86 | printf(", truncated (size: %lld)", gup.size); | 114 | printf(", truncated (size: %lld)", gup.size); |
87 | printf("\n"); | 115 | printf("\n"); |
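
The gup_benchmark changes above split the timing into separate get and put phases and add GUP_LONGTERM_BENCHMARK/GUP_BENCHMARK variants plus file-backed, shared and hugetlb mappings. For orientation, here is a minimal hedged sketch of the ioctl convention the selftest relies on; it is not part of the patch, it mirrors the struct layout shown in the hunk, and it only works if that layout matches the kernel's own definition in mm/gup_benchmark.c byte for byte, since the structure size is encoded in the ioctl number.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/types.h>

	#define GUP_FAST_BENCHMARK	_IOWR('g', 1, struct gup_benchmark)

	struct gup_benchmark {		/* layout as in the selftest above */
		__u64 get_delta_usec;
		__u64 put_delta_usec;
		__u64 addr;
		__u64 size;
		__u32 nr_pages_per_call;
		__u32 flags;
	};

	int main(void)
	{
		unsigned long size = 16UL << 20;	/* 16 MiB, arbitrary */
		struct gup_benchmark gup = { .nr_pages_per_call = 1 };
		int fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
		void *p;

		if (fd < 0)
			return perror("open"), 1;

		p = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
		if (p == MAP_FAILED)
			return perror("mmap"), 1;

		gup.addr = (unsigned long)p;
		gup.size = size;

		/* The driver pins gup.size bytes at gup.addr and fills in the
		 * separate get/put timings added by this series. */
		if (ioctl(fd, GUP_FAST_BENCHMARK, &gup))
			return perror("ioctl"), 1;

		printf("get: %llu us, put: %llu us\n",
		       (unsigned long long)gup.get_delta_usec,
		       (unsigned long long)gup.put_delta_usec);
		return 0;
	}

The selftest's new -L and -U options simply select a different ioctl command against the same debugfs file, while -f/-S/-H change what gets mmap()ed before the call.
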
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c new file mode 100644 index 000000000000..d91bde511268 --- /dev/null +++ b/tools/testing/selftests/vm/map_fixed_noreplace.c | |||
@@ -0,0 +1,206 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | /* | ||
4 | * Test that MAP_FIXED_NOREPLACE works. | ||
5 | * | ||
6 | * Copyright 2018, Jann Horn <jannh@google.com> | ||
7 | * Copyright 2018, Michael Ellerman, IBM Corporation. | ||
8 | */ | ||
9 | |||
10 | #include <sys/mman.h> | ||
11 | #include <errno.h> | ||
12 | #include <stdio.h> | ||
13 | #include <stdlib.h> | ||
14 | #include <unistd.h> | ||
15 | |||
16 | #ifndef MAP_FIXED_NOREPLACE | ||
17 | #define MAP_FIXED_NOREPLACE 0x100000 | ||
18 | #endif | ||
19 | |||
20 | #define BASE_ADDRESS (256ul * 1024 * 1024) | ||
21 | |||
22 | |||
23 | static void dump_maps(void) | ||
24 | { | ||
25 | char cmd[32]; | ||
26 | |||
27 | snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); | ||
28 | system(cmd); | ||
29 | } | ||
30 | |||
31 | int main(void) | ||
32 | { | ||
33 | unsigned long flags, addr, size, page_size; | ||
34 | char *p; | ||
35 | |||
36 | page_size = sysconf(_SC_PAGE_SIZE); | ||
37 | |||
38 | flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; | ||
39 | |||
40 | // Check we can map all the areas we need below | ||
41 | errno = 0; | ||
42 | addr = BASE_ADDRESS; | ||
43 | size = 5 * page_size; | ||
44 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
45 | |||
46 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
47 | |||
48 | if (p == MAP_FAILED) { | ||
49 | dump_maps(); | ||
50 | printf("Error: couldn't map the space we need for the test\n"); | ||
51 | return 1; | ||
52 | } | ||
53 | |||
54 | errno = 0; | ||
55 | if (munmap((void *)addr, 5 * page_size) != 0) { | ||
56 | dump_maps(); | ||
57 | printf("Error: munmap failed!?\n"); | ||
58 | return 1; | ||
59 | } | ||
60 | printf("unmap() successful\n"); | ||
61 | |||
62 | errno = 0; | ||
63 | addr = BASE_ADDRESS + page_size; | ||
64 | size = 3 * page_size; | ||
65 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
66 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
67 | |||
68 | if (p == MAP_FAILED) { | ||
69 | dump_maps(); | ||
70 | printf("Error: first mmap() failed unexpectedly\n"); | ||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Exact same mapping again: | ||
76 | * base | free | new | ||
77 | * +1 | mapped | new | ||
78 | * +2 | mapped | new | ||
79 | * +3 | mapped | new | ||
80 | * +4 | free | new | ||
81 | */ | ||
82 | errno = 0; | ||
83 | addr = BASE_ADDRESS; | ||
84 | size = 5 * page_size; | ||
85 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
86 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
87 | |||
88 | if (p != MAP_FAILED) { | ||
89 | dump_maps(); | ||
90 | printf("Error:1: mmap() succeeded when it shouldn't have\n"); | ||
91 | return 1; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Second mapping contained within first: | ||
96 | * | ||
97 | * base | free | | ||
98 | * +1 | mapped | | ||
99 | * +2 | mapped | new | ||
100 | * +3 | mapped | | ||
101 | * +4 | free | | ||
102 | */ | ||
103 | errno = 0; | ||
104 | addr = BASE_ADDRESS + (2 * page_size); | ||
105 | size = page_size; | ||
106 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
107 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
108 | |||
109 | if (p != MAP_FAILED) { | ||
110 | dump_maps(); | ||
111 | printf("Error:2: mmap() succeeded when it shouldn't have\n"); | ||
112 | return 1; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Overlap end of existing mapping: | ||
117 | * base | free | | ||
118 | * +1 | mapped | | ||
119 | * +2 | mapped | | ||
120 | * +3 | mapped | new | ||
121 | * +4 | free | new | ||
122 | */ | ||
123 | errno = 0; | ||
124 | addr = BASE_ADDRESS + (3 * page_size); | ||
125 | size = 2 * page_size; | ||
126 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
127 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
128 | |||
129 | if (p != MAP_FAILED) { | ||
130 | dump_maps(); | ||
131 | printf("Error:3: mmap() succeeded when it shouldn't have\n"); | ||
132 | return 1; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Overlap start of existing mapping: | ||
137 | * base | free | new | ||
138 | * +1 | mapped | new | ||
139 | * +2 | mapped | | ||
140 | * +3 | mapped | | ||
141 | * +4 | free | | ||
142 | */ | ||
143 | errno = 0; | ||
144 | addr = BASE_ADDRESS; | ||
145 | size = 2 * page_size; | ||
146 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
147 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
148 | |||
149 | if (p != MAP_FAILED) { | ||
150 | dump_maps(); | ||
151 | printf("Error:4: mmap() succeeded when it shouldn't have\n"); | ||
152 | return 1; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Adjacent to start of existing mapping: | ||
157 | * base | free | new | ||
158 | * +1 | mapped | | ||
159 | * +2 | mapped | | ||
160 | * +3 | mapped | | ||
161 | * +4 | free | | ||
162 | */ | ||
163 | errno = 0; | ||
164 | addr = BASE_ADDRESS; | ||
165 | size = page_size; | ||
166 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
167 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
168 | |||
169 | if (p == MAP_FAILED) { | ||
170 | dump_maps(); | ||
171 | printf("Error:5: mmap() failed when it shouldn't have\n"); | ||
172 | return 1; | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Adjacent to end of existing mapping: | ||
177 | * base | free | | ||
178 | * +1 | mapped | | ||
179 | * +2 | mapped | | ||
180 | * +3 | mapped | | ||
181 | * +4 | free | new | ||
182 | */ | ||
183 | errno = 0; | ||
184 | addr = BASE_ADDRESS + (4 * page_size); | ||
185 | size = page_size; | ||
186 | p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); | ||
187 | printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); | ||
188 | |||
189 | if (p == MAP_FAILED) { | ||
190 | dump_maps(); | ||
191 | printf("Error:6: mmap() failed when it shouldn't have\n"); | ||
192 | return 1; | ||
193 | } | ||
194 | |||
195 | addr = BASE_ADDRESS; | ||
196 | size = 5 * page_size; | ||
197 | if (munmap((void *)addr, size) != 0) { | ||
198 | dump_maps(); | ||
199 | printf("Error: munmap failed!?\n"); | ||
200 | return 1; | ||
201 | } | ||
202 | printf("unmap() successful\n"); | ||
203 | |||
204 | printf("OK\n"); | ||
205 | return 0; | ||
206 | } | ||
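
The new map_fixed_noreplace.c selftest above probes MAP_FIXED_NOREPLACE against overlapping, contained and adjacent ranges. As a hedged aside (not from the patch; reserve_exact() is an invented name), the typical application-side pattern is to treat EEXIST as "the range is occupied" and to verify the returned address on older kernels that silently ignore the flag:

	#include <sys/mman.h>
	#include <errno.h>
	#include <stdio.h>

	#ifndef MAP_FIXED_NOREPLACE
	#define MAP_FIXED_NOREPLACE 0x100000
	#endif

	/*
	 * Try to claim [hint, hint + len) without clobbering existing mappings.
	 * Returns the mapping on success, NULL if the range is busy or mmap fails.
	 */
	static void *reserve_exact(void *hint, size_t len)
	{
		void *p = mmap(hint, len, PROT_NONE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
			       -1, 0);

		if (p == MAP_FAILED) {
			if (errno == EEXIST)
				fprintf(stderr, "range already mapped\n");
			return NULL;
		}

		/*
		 * Kernels that predate the flag (< 4.17) treat it as an unknown
		 * bit and may place the mapping elsewhere; treat that as failure
		 * rather than a silent move.
		 */
		if (p != hint) {
			munmap(p, len);
			return NULL;
		}
		return p;
	}
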
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7b8171e3128a..5d1db824f73a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c | |||
@@ -34,18 +34,6 @@ | |||
34 | * per-CPU threads 1 by triggering userfaults inside | 34 | * per-CPU threads 1 by triggering userfaults inside |
35 | * pthread_mutex_lock will also verify the atomicity of the memory | 35 | * pthread_mutex_lock will also verify the atomicity of the memory |
36 | * transfer (UFFDIO_COPY). | 36 | * transfer (UFFDIO_COPY). |
37 | * | ||
38 | * The program takes two parameters: the amounts of physical memory in | ||
39 | * megabytes (MiB) of the area and the number of bounces to execute. | ||
40 | * | ||
41 | * # 100MiB 99999 bounces | ||
42 | * ./userfaultfd 100 99999 | ||
43 | * | ||
44 | * # 1GiB 99 bounces | ||
45 | * ./userfaultfd 1000 99 | ||
46 | * | ||
47 | * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers | ||
48 | * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done | ||
49 | */ | 37 | */ |
50 | 38 | ||
51 | #define _GNU_SOURCE | 39 | #define _GNU_SOURCE |
@@ -115,6 +103,30 @@ pthread_attr_t attr; | |||
115 | ~(unsigned long)(sizeof(unsigned long long) \ | 103 | ~(unsigned long)(sizeof(unsigned long long) \ |
116 | - 1))) | 104 | - 1))) |
117 | 105 | ||
106 | const char *examples = | ||
107 | "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" | ||
108 | "./userfaultfd anon 100 99999\n\n" | ||
109 | "# Run share memory test on 1GiB region with 99 bounces:\n" | ||
110 | "./userfaultfd shmem 1000 99\n\n" | ||
111 | "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" | ||
112 | "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" | ||
113 | "# Run the same hugetlb test but using shmem:\n" | ||
114 | "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" | ||
115 | "# 10MiB-~6GiB 999 bounces anonymous test, " | ||
116 | "continue forever unless an error triggers\n" | ||
117 | "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; | ||
118 | |||
119 | static void usage(void) | ||
120 | { | ||
121 | fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " | ||
122 | "[hugetlbfs_file]\n\n"); | ||
123 | fprintf(stderr, "Supported <test type>: anon, hugetlb, " | ||
124 | "hugetlb_shared, shmem\n\n"); | ||
125 | fprintf(stderr, "Examples:\n\n"); | ||
126 | fprintf(stderr, examples); | ||
127 | exit(1); | ||
128 | } | ||
129 | |||
118 | static int anon_release_pages(char *rel_area) | 130 | static int anon_release_pages(char *rel_area) |
119 | { | 131 | { |
120 | int ret = 0; | 132 | int ret = 0; |
@@ -439,6 +451,43 @@ static int copy_page(int ufd, unsigned long offset) | |||
439 | return __copy_page(ufd, offset, false); | 451 | return __copy_page(ufd, offset, false); |
440 | } | 452 | } |
441 | 453 | ||
454 | static int uffd_read_msg(int ufd, struct uffd_msg *msg) | ||
455 | { | ||
456 | int ret = read(uffd, msg, sizeof(*msg)); | ||
457 | |||
458 | if (ret != sizeof(*msg)) { | ||
459 | if (ret < 0) { | ||
460 | if (errno == EAGAIN) | ||
461 | return 1; | ||
462 | else | ||
463 | perror("blocking read error"), exit(1); | ||
464 | } else { | ||
465 | fprintf(stderr, "short read\n"), exit(1); | ||
466 | } | ||
467 | } | ||
468 | |||
469 | return 0; | ||
470 | } | ||
471 | |||
472 | /* Return 1 if page fault handled by us; otherwise 0 */ | ||
473 | static int uffd_handle_page_fault(struct uffd_msg *msg) | ||
474 | { | ||
475 | unsigned long offset; | ||
476 | |||
477 | if (msg->event != UFFD_EVENT_PAGEFAULT) | ||
478 | fprintf(stderr, "unexpected msg event %u\n", | ||
479 | msg->event), exit(1); | ||
480 | |||
481 | if (bounces & BOUNCE_VERIFY && | ||
482 | msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) | ||
483 | fprintf(stderr, "unexpected write fault\n"), exit(1); | ||
484 | |||
485 | offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; | ||
486 | offset &= ~(page_size-1); | ||
487 | |||
488 | return copy_page(uffd, offset); | ||
489 | } | ||
490 | |||
442 | static void *uffd_poll_thread(void *arg) | 491 | static void *uffd_poll_thread(void *arg) |
443 | { | 492 | { |
444 | unsigned long cpu = (unsigned long) arg; | 493 | unsigned long cpu = (unsigned long) arg; |
@@ -446,7 +495,6 @@ static void *uffd_poll_thread(void *arg) | |||
446 | struct uffd_msg msg; | 495 | struct uffd_msg msg; |
447 | struct uffdio_register uffd_reg; | 496 | struct uffdio_register uffd_reg; |
448 | int ret; | 497 | int ret; |
449 | unsigned long offset; | ||
450 | char tmp_chr; | 498 | char tmp_chr; |
451 | unsigned long userfaults = 0; | 499 | unsigned long userfaults = 0; |
452 | 500 | ||
@@ -470,25 +518,15 @@ static void *uffd_poll_thread(void *arg) | |||
470 | if (!(pollfd[0].revents & POLLIN)) | 518 | if (!(pollfd[0].revents & POLLIN)) |
471 | fprintf(stderr, "pollfd[0].revents %d\n", | 519 | fprintf(stderr, "pollfd[0].revents %d\n", |
472 | pollfd[0].revents), exit(1); | 520 | pollfd[0].revents), exit(1); |
473 | ret = read(uffd, &msg, sizeof(msg)); | 521 | if (uffd_read_msg(uffd, &msg)) |
474 | if (ret < 0) { | 522 | continue; |
475 | if (errno == EAGAIN) | ||
476 | continue; | ||
477 | perror("nonblocking read error"), exit(1); | ||
478 | } | ||
479 | switch (msg.event) { | 523 | switch (msg.event) { |
480 | default: | 524 | default: |
481 | fprintf(stderr, "unexpected msg event %u\n", | 525 | fprintf(stderr, "unexpected msg event %u\n", |
482 | msg.event), exit(1); | 526 | msg.event), exit(1); |
483 | break; | 527 | break; |
484 | case UFFD_EVENT_PAGEFAULT: | 528 | case UFFD_EVENT_PAGEFAULT: |
485 | if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) | 529 | userfaults += uffd_handle_page_fault(&msg); |
486 | fprintf(stderr, "unexpected write fault\n"), exit(1); | ||
487 | offset = (char *)(unsigned long)msg.arg.pagefault.address - | ||
488 | area_dst; | ||
489 | offset &= ~(page_size-1); | ||
490 | if (copy_page(uffd, offset)) | ||
491 | userfaults++; | ||
492 | break; | 530 | break; |
493 | case UFFD_EVENT_FORK: | 531 | case UFFD_EVENT_FORK: |
494 | close(uffd); | 532 | close(uffd); |
@@ -516,8 +554,6 @@ static void *uffd_read_thread(void *arg) | |||
516 | { | 554 | { |
517 | unsigned long *this_cpu_userfaults; | 555 | unsigned long *this_cpu_userfaults; |
518 | struct uffd_msg msg; | 556 | struct uffd_msg msg; |
519 | unsigned long offset; | ||
520 | int ret; | ||
521 | 557 | ||
522 | this_cpu_userfaults = (unsigned long *) arg; | 558 | this_cpu_userfaults = (unsigned long *) arg; |
523 | *this_cpu_userfaults = 0; | 559 | *this_cpu_userfaults = 0; |
@@ -526,24 +562,9 @@ static void *uffd_read_thread(void *arg) | |||
526 | /* from here cancellation is ok */ | 562 | /* from here cancellation is ok */ |
527 | 563 | ||
528 | for (;;) { | 564 | for (;;) { |
529 | ret = read(uffd, &msg, sizeof(msg)); | 565 | if (uffd_read_msg(uffd, &msg)) |
530 | if (ret != sizeof(msg)) { | 566 | continue; |
531 | if (ret < 0) | 567 | (*this_cpu_userfaults) += uffd_handle_page_fault(&msg); |
532 | perror("blocking read error"), exit(1); | ||
533 | else | ||
534 | fprintf(stderr, "short read\n"), exit(1); | ||
535 | } | ||
536 | if (msg.event != UFFD_EVENT_PAGEFAULT) | ||
537 | fprintf(stderr, "unexpected msg event %u\n", | ||
538 | msg.event), exit(1); | ||
539 | if (bounces & BOUNCE_VERIFY && | ||
540 | msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) | ||
541 | fprintf(stderr, "unexpected write fault\n"), exit(1); | ||
542 | offset = (char *)(unsigned long)msg.arg.pagefault.address - | ||
543 | area_dst; | ||
544 | offset &= ~(page_size-1); | ||
545 | if (copy_page(uffd, offset)) | ||
546 | (*this_cpu_userfaults)++; | ||
547 | } | 568 | } |
548 | return (void *)NULL; | 569 | return (void *)NULL; |
549 | } | 570 | } |
@@ -605,6 +626,12 @@ static int stress(unsigned long *userfaults) | |||
605 | if (uffd_test_ops->release_pages(area_src)) | 626 | if (uffd_test_ops->release_pages(area_src)) |
606 | return 1; | 627 | return 1; |
607 | 628 | ||
629 | |||
630 | finished = 1; | ||
631 | for (cpu = 0; cpu < nr_cpus; cpu++) | ||
632 | if (pthread_join(locking_threads[cpu], NULL)) | ||
633 | return 1; | ||
634 | |||
608 | for (cpu = 0; cpu < nr_cpus; cpu++) { | 635 | for (cpu = 0; cpu < nr_cpus; cpu++) { |
609 | char c; | 636 | char c; |
610 | if (bounces & BOUNCE_POLL) { | 637 | if (bounces & BOUNCE_POLL) { |
@@ -622,11 +649,6 @@ static int stress(unsigned long *userfaults) | |||
622 | } | 649 | } |
623 | } | 650 | } |
624 | 651 | ||
625 | finished = 1; | ||
626 | for (cpu = 0; cpu < nr_cpus; cpu++) | ||
627 | if (pthread_join(locking_threads[cpu], NULL)) | ||
628 | return 1; | ||
629 | |||
630 | return 0; | 652 | return 0; |
631 | } | 653 | } |
632 | 654 | ||
@@ -1272,8 +1294,7 @@ static void sigalrm(int sig) | |||
1272 | int main(int argc, char **argv) | 1294 | int main(int argc, char **argv) |
1273 | { | 1295 | { |
1274 | if (argc < 4) | 1296 | if (argc < 4) |
1275 | fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), | 1297 | usage(); |
1276 | exit(1); | ||
1277 | 1298 | ||
1278 | if (signal(SIGALRM, sigalrm) == SIG_ERR) | 1299 | if (signal(SIGALRM, sigalrm) == SIG_ERR) |
1279 | fprintf(stderr, "failed to arm SIGALRM"), exit(1); | 1300 | fprintf(stderr, "failed to arm SIGALRM"), exit(1); |
@@ -1286,20 +1307,19 @@ int main(int argc, char **argv) | |||
1286 | nr_cpus; | 1307 | nr_cpus; |
1287 | if (!nr_pages_per_cpu) { | 1308 | if (!nr_pages_per_cpu) { |
1288 | fprintf(stderr, "invalid MiB\n"); | 1309 | fprintf(stderr, "invalid MiB\n"); |
1289 | fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); | 1310 | usage(); |
1290 | } | 1311 | } |
1291 | 1312 | ||
1292 | bounces = atoi(argv[3]); | 1313 | bounces = atoi(argv[3]); |
1293 | if (bounces <= 0) { | 1314 | if (bounces <= 0) { |
1294 | fprintf(stderr, "invalid bounces\n"); | 1315 | fprintf(stderr, "invalid bounces\n"); |
1295 | fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); | 1316 | usage(); |
1296 | } | 1317 | } |
1297 | nr_pages = nr_pages_per_cpu * nr_cpus; | 1318 | nr_pages = nr_pages_per_cpu * nr_cpus; |
1298 | 1319 | ||
1299 | if (test_type == TEST_HUGETLB) { | 1320 | if (test_type == TEST_HUGETLB) { |
1300 | if (argc < 5) | 1321 | if (argc < 5) |
1301 | fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"), | 1322 | usage(); |
1302 | exit(1); | ||
1303 | huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); | 1323 | huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); |
1304 | if (huge_fd < 0) { | 1324 | if (huge_fd < 0) { |
1305 | fprintf(stderr, "Open of %s failed", argv[3]); | 1325 | fprintf(stderr, "Open of %s failed", argv[3]); |
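
The userfaultfd.c refactoring above moves the message parsing into uffd_read_msg()/uffd_handle_page_fault() and joins the locking threads before the poll threads are drained. As a hedged standalone illustration (resolve_fault() is an invented name, not the selftest's helper), the fault-resolution step those helpers wrap boils down to a page-aligned UFFDIO_COPY:

	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	/*
	 * Resolve one fault reported in *msg by copying 'src' (a prepared,
	 * page-sized buffer) into the faulting page of the registered region
	 * starting at 'area_dst'. Returns 0 on success, -1 on error.
	 */
	static int resolve_fault(int uffd, const struct uffd_msg *msg,
				 char *area_dst, char *src,
				 unsigned long page_size)
	{
		struct uffdio_copy copy;
		unsigned long offset;

		if (msg->event != UFFD_EVENT_PAGEFAULT)
			return -1;	/* only page faults are handled here */

		offset = (unsigned long)msg->arg.pagefault.address -
			 (unsigned long)area_dst;
		offset &= ~(page_size - 1);	/* align down to the page start */

		copy.dst = (unsigned long)(area_dst + offset);
		copy.src = (unsigned long)src;
		copy.len = page_size;
		copy.mode = 0;
		copy.copy = 0;	/* filled in by the kernel with bytes copied */

		if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
			return -1;
		return 0;
	}
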
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 786ade1843a2..2679e476b6c3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, | |||
497 | } | 497 | } |
498 | 498 | ||
499 | static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { | 499 | static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { |
500 | .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, | ||
501 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, | 500 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, |
502 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, | 501 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, |
503 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, | 502 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |