116 files changed, 2491 insertions, 1717 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 4f4563277864..71daa35ec2d9 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt | |||
@@ -327,6 +327,85 @@ supported and the interface files "release_agent" and | |||
327 | - use_hierarchy is on by default and the cgroup file for the flag is | 327 | - use_hierarchy is on by default and the cgroup file for the flag is |
328 | not created. | 328 | not created. |
329 | 329 | ||
330 | - The original lower boundary, the soft limit, is defined as a limit | ||
331 | that is unset by default. As a result, the set of cgroups that | ||
332 | global reclaim prefers is opt-in, rather than opt-out. The costs | ||
333 | for optimizing these mostly negative lookups are so high that the | ||
334 | implementation, despite its enormous size, does not even provide the | ||
335 | basic desirable behavior. First off, the soft limit has no | ||
336 | hierarchical meaning. All configured groups are organized in a | ||
337 | global rbtree and treated like equal peers, regardless of where they | ||
338 | are located in the hierarchy. This makes subtree delegation | ||
339 | impossible. Second, the soft limit reclaim pass is so aggressive | ||
340 | that it not only introduces high allocation latencies into the | ||
341 | system, but also impacts system performance due to overreclaim, to | ||
342 | the point where the feature becomes self-defeating. | ||
343 | |||
344 | The memory.low boundary on the other hand is a top-down allocated | ||
345 | reserve. A cgroup enjoys reclaim protection when it and all its | ||
346 | ancestors are below their low boundaries, which makes delegation of | ||
347 | subtrees possible. Second, new cgroups have no reserve by | ||
348 | default, and in the common case most cgroups are eligible for the | ||
349 | preferred reclaim pass. This allows the new low boundary to be | ||
350 | efficiently implemented with just a minor addition to the generic | ||
351 | reclaim code, without the need for out-of-band data structures and | ||
352 | reclaim passes. Because the generic reclaim code considers all | ||
353 | cgroups except for the ones running low in the preferred first | ||
354 | reclaim pass, overreclaim of individual groups is eliminated as | ||
355 | well, resulting in much better overall workload performance. | ||
356 | |||
357 | - The original high boundary, the hard limit, is defined as a strict | ||
358 | limit that can not budge, even if the OOM killer has to be called. | ||
359 | But this generally goes against the goal of making the most out of | ||
360 | the available memory. The memory consumption of workloads varies | ||
361 | during runtime, and that requires users to overcommit. But doing | ||
362 | that with a strict upper limit requires either a fairly accurate | ||
363 | prediction of the working set size or adding slack to the limit. | ||
364 | Since working set size estimation is hard and error prone, and | ||
365 | getting it wrong results in OOM kills, most users tend to err on the | ||
366 | side of a looser limit and end up wasting precious resources. | ||
367 | |||
368 | The memory.high boundary on the other hand can be set much more | ||
369 | conservatively. When hit, it throttles allocations by forcing them | ||
370 | into direct reclaim to work off the excess, but it never invokes the | ||
371 | OOM killer. As a result, a high boundary that is chosen too | ||
372 | aggressively will not terminate the processes, but instead it will | ||
373 | lead to gradual performance degradation. The user can monitor this | ||
374 | and make corrections until the minimal memory footprint that still | ||
375 | gives acceptable performance is found. | ||
376 | |||
377 | In extreme cases, with many concurrent allocations and a complete | ||
378 | breakdown of reclaim progress within the group, the high boundary | ||
379 | can be exceeded. But even then it's mostly better to satisfy the | ||
380 | allocation from the slack available in other groups or the rest of | ||
381 | the system than to kill the group. Otherwise, memory.max is there | ||
382 | to limit this type of spillover and ultimately contain buggy or even | ||
383 | malicious applications. | ||
384 | |||
385 | - The original control file names are unwieldy and inconsistent in | ||
386 | many different ways. For example, the upper boundary hit count is | ||
387 | exported in the memory.failcnt file, but an OOM event count has to | ||
388 | be manually counted by listening to memory.oom_control events, and | ||
389 | lower boundary / soft limit events have to be counted by first | ||
390 | setting a threshold for that value and then counting those events. | ||
391 | Also, usage and limit files encode their units in the filename. | ||
392 | That makes the filenames very long, even though this is not | ||
393 | information that a user needs to be reminded of every time they type | ||
394 | out those names. | ||
395 | |||
396 | To address these naming issues, as well as to signal clearly that | ||
397 | the new interface carries a new configuration model, the naming | ||
398 | conventions in it necessarily differ from those of the old interface. | ||
399 | |||
400 | - The original limit files indicate the state of an unset limit with a | ||
401 | Very High Number, and a configured limit can be unset by echoing -1 | ||
402 | into those files. But that very high number is implementation and | ||
403 | architecture dependent and not very descriptive. And while -1 can | ||
404 | be understood as an underflow into the highest possible value, -2 or | ||
405 | -10M etc. do not work, so it's not consistent. | ||
406 | |||
407 | memory.low, memory.high, and memory.max will use the string | ||
408 | "infinity" to indicate and set the highest possible value. | ||
330 | 409 | ||
331 | 5. Planned Changes | 410 | 5. Planned Changes |
332 | 411 | ||
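The documentation hunk above describes the new memory.low, memory.high, and memory.max files and the "infinity" keyword for an unset limit. As a rough illustration of how a userspace manager might drive that interface, here is a minimal C sketch; the cgroup path "/sys/fs/cgroup/mygroup" and the chosen values are assumptions for illustration, not something this patch establishes.

/*
 * Illustrative only: write a value ("512M", "infinity", ...) into one of
 * the new memory.low / memory.high / memory.max files.  The cgroup path
 * below is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_memcg_knob(const char *file, const char *value)
{
	char path[256];
	ssize_t ret;
	int fd;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/mygroup/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return -1;
	}
	ret = write(fd, value, strlen(value));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	write_memcg_knob("memory.low", "512M");      /* protected reserve */
	write_memcg_knob("memory.high", "1G");       /* throttle above this */
	write_memcg_knob("memory.max", "infinity");  /* no strict cap */
	return 0;
}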
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 79b3cc821e7b..cf8fc2f0b34b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -42,6 +42,7 @@ Table of Contents | |||
42 | 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm | 42 | 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm |
43 | 3.7 /proc/<pid>/task/<tid>/children - Information about task children | 43 | 3.7 /proc/<pid>/task/<tid>/children - Information about task children |
44 | 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file | 44 | 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file |
45 | 3.9 /proc/<pid>/map_files - Information about memory mapped files | ||
45 | 46 | ||
46 | 4 Configuring procfs | 47 | 4 Configuring procfs |
47 | 4.1 Mount options | 48 | 4.1 Mount options |
@@ -1763,6 +1764,28 @@ pair provide additional information particular to the objects they represent. | |||
1763 | with TIMER_ABSTIME option which will be shown in 'settime flags', but 'it_value' | 1764 | with TIMER_ABSTIME option which will be shown in 'settime flags', but 'it_value' |
1764 | still exhibits timer's remaining time. | 1765 | still exhibits timer's remaining time. |
1765 | 1766 | ||
1767 | 3.9 /proc/<pid>/map_files - Information about memory mapped files | ||
1768 | --------------------------------------------------------------------- | ||
1769 | This directory contains symbolic links which represent memory mapped files | ||
1770 | the process is maintaining. Example output: | ||
1771 | |||
1772 | | lr-------- 1 root root 64 Jan 27 11:24 333c600000-333c620000 -> /usr/lib64/ld-2.18.so | ||
1773 | | lr-------- 1 root root 64 Jan 27 11:24 333c81f000-333c820000 -> /usr/lib64/ld-2.18.so | ||
1774 | | lr-------- 1 root root 64 Jan 27 11:24 333c820000-333c821000 -> /usr/lib64/ld-2.18.so | ||
1775 | | ... | ||
1776 | | lr-------- 1 root root 64 Jan 27 11:24 35d0421000-35d0422000 -> /usr/lib64/libselinux.so.1 | ||
1777 | | lr-------- 1 root root 64 Jan 27 11:24 400000-41a000 -> /usr/bin/ls | ||
1778 | |||
1779 | The name of a link represents the virtual memory bounds of a mapping, i.e. | ||
1780 | vm_area_struct::vm_start-vm_area_struct::vm_end. | ||
1781 | |||
1782 | The main purpose of map_files is to retrieve a set of memory mapped | ||
1783 | files in a fast way instead of parsing /proc/<pid>/maps or | ||
1784 | /proc/<pid>/smaps, both of which contain many more records. At the same | ||
1785 | time one can open(2) mappings from the listings of two processes and | ||
1786 | compare their inode numbers to figure out which anonymous memory areas | ||
1787 | are actually shared. | ||
1788 | |||
1766 | ------------------------------------------------------------------------------ | 1789 | ------------------------------------------------------------------------------ |
1767 | Configuring procfs | 1790 | Configuring procfs |
1768 | ------------------------------------------------------------------------------ | 1791 | ------------------------------------------------------------------------------ |
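To make the map_files description above concrete, the sketch below walks /proc/<pid>/map_files from userspace and prints each entry's address range, link target, and inode number; comparing inode numbers of entries from two processes is the sharing check the text refers to. This is an illustration, not part of the patch, and reading map_files typically requires appropriate privileges.

/* Minimal sketch: list /proc/<pid>/map_files and show inode numbers. */
#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char dirpath[64], target[PATH_MAX];
	struct dirent *de;
	struct stat st;
	DIR *dir;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(dirpath, sizeof(dirpath), "/proc/%s/map_files", argv[1]);
	dir = opendir(dirpath);
	if (!dir) {
		perror(dirpath);
		return 1;
	}
	while ((de = readdir(dir)) != NULL) {
		ssize_t len;

		if (de->d_name[0] == '.')
			continue;
		/* stat() follows the link, so st_ino is the mapped file's inode */
		if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0)
			continue;
		len = readlinkat(dirfd(dir), de->d_name, target, sizeof(target) - 1);
		target[len > 0 ? len : 0] = '\0';
		printf("%s -> %s (ino %llu)\n", de->d_name, target,
		       (unsigned long long)st.st_ino);
	}
	closedir(dir);
	return 0;
}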
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index de3afef76837..902b4574acfb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -555,12 +555,12 @@ this is causing problems for your system/application. | |||
555 | 555 | ||
556 | oom_dump_tasks | 556 | oom_dump_tasks |
557 | 557 | ||
558 | Enables a system-wide task dump (excluding kernel threads) to be | 558 | Enables a system-wide task dump (excluding kernel threads) to be produced |
559 | produced when the kernel performs an OOM-killing and includes such | 559 | when the kernel performs an OOM-killing and includes such information as |
560 | information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, | 560 | pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj |
561 | oom_score_adj score, and name. This is helpful to determine why the | 561 | score, and name. This is helpful to determine why the OOM killer was |
562 | OOM killer was invoked, to identify the rogue task that caused it, | 562 | invoked, to identify the rogue task that caused it, and to determine why |
563 | and to determine why the OOM killer chose the task it did to kill. | 563 | the OOM killer chose the task it did to kill. |
564 | 564 | ||
565 | If this is set to zero, this information is suppressed. On very | 565 | If this is set to zero, this information is suppressed. On very |
566 | large systems with thousands of tasks it may not be feasible to dump | 566 | large systems with thousands of tasks it may not be feasible to dump |
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e455c4d2..6fbd55ef6b45 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt | |||
@@ -62,6 +62,8 @@ There are three components to pagemap: | |||
62 | 20. NOPAGE | 62 | 20. NOPAGE |
63 | 21. KSM | 63 | 21. KSM |
64 | 22. THP | 64 | 22. THP |
65 | 23. BALLOON | ||
66 | 24. ZERO_PAGE | ||
65 | 67 | ||
66 | Short descriptions to the page flags: | 68 | Short descriptions to the page flags: |
67 | 69 | ||
@@ -102,6 +104,12 @@ Short descriptions to the page flags: | |||
102 | 22. THP | 104 | 22. THP |
103 | contiguous pages which construct transparent hugepages | 105 | contiguous pages which construct transparent hugepages |
104 | 106 | ||
107 | 23. BALLOON | ||
108 | balloon compaction page | ||
109 | |||
110 | 24. ZERO_PAGE | ||
111 | zero page for pfn_zero or huge_zero page | ||
112 | |||
105 | [IO related page flags] | 113 | [IO related page flags] |
106 | 1. ERROR IO error occurred | 114 | 1. ERROR IO error occurred |
107 | 3. UPTODATE page has up-to-date data | 115 | 3. UPTODATE page has up-to-date data |
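Since the two new flags sit in the same per-page bit vector as the existing ones, a short sketch of how userspace would test them may help: it reads /proc/kpageflags (indexed by PFN, one 64-bit word per page, root only) and checks bits 23 (BALLOON) and 24 (ZERO_PAGE) as listed above. The PFN argument and error handling are simplified for illustration.

/* Sketch: check the BALLOON (23) and ZERO_PAGE (24) bits for one PFN. */
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_BALLOON	23
#define KPF_ZERO_PAGE	24

int main(int argc, char **argv)
{
	uint64_t pfn, flags;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoull(argv[1], NULL, 0);

	fd = open("/proc/kpageflags", O_RDONLY);
	if (fd < 0 ||
	    pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags)) {
		perror("/proc/kpageflags");
		return 1;
	}
	printf("pfn %" PRIu64 ": flags %#" PRIx64 " balloon=%d zero_page=%d\n",
	       pfn, flags,
	       !!(flags & (1ULL << KPF_BALLOON)),
	       !!(flags & (1ULL << KPF_ZERO_PAGE)));
	close(fd);
	return 0;
}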
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index fce22cf88ee9..a9a119592372 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h | |||
@@ -45,7 +45,7 @@ struct vm_area_struct; | |||
45 | #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) | 45 | #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) |
46 | #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) | 46 | #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) |
47 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) | 47 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) |
48 | #define FIRST_USER_ADDRESS 0 | 48 | #define FIRST_USER_ADDRESS 0UL |
49 | 49 | ||
50 | /* Number of pointers that fit on a page: this will go away. */ | 50 | /* Number of pointers that fit on a page: this will go away. */ |
51 | #define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) | 51 | #define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) |
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index bdc8ccaf390d..ffed3b2cf313 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h | |||
@@ -211,7 +211,7 @@ | |||
211 | * No special requirements for lowest virtual address we permit any user space | 211 | * No special requirements for lowest virtual address we permit any user space |
212 | * mapping to be mapped at. | 212 | * mapping to be mapped at. |
213 | */ | 213 | */ |
214 | #define FIRST_USER_ADDRESS 0 | 214 | #define FIRST_USER_ADDRESS 0UL |
215 | 215 | ||
216 | 216 | ||
217 | /**************************************************************** | 217 | /**************************************************************** |
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index bcc5e300413f..bfd662e49a25 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h | |||
@@ -10,6 +10,8 @@ | |||
10 | #ifndef _ASM_PGTABLE_2LEVEL_H | 10 | #ifndef _ASM_PGTABLE_2LEVEL_H |
11 | #define _ASM_PGTABLE_2LEVEL_H | 11 | #define _ASM_PGTABLE_2LEVEL_H |
12 | 12 | ||
13 | #define __PAGETABLE_PMD_FOLDED | ||
14 | |||
13 | /* | 15 | /* |
14 | * Hardware-wise, we have a two level page table structure, where the first | 16 | * Hardware-wise, we have a two level page table structure, where the first |
15 | * level has 4096 entries, and the second level has 256 entries. Each entry | 17 | * level has 4096 entries, and the second level has 256 entries. Each entry |
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index c35e53ee6663..add094d09e3e 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h | |||
@@ -85,7 +85,7 @@ extern unsigned int kobjsize(const void *objp); | |||
85 | #define VMALLOC_START 0UL | 85 | #define VMALLOC_START 0UL |
86 | #define VMALLOC_END 0xffffffffUL | 86 | #define VMALLOC_END 0xffffffffUL |
87 | 87 | ||
88 | #define FIRST_USER_ADDRESS (0) | 88 | #define FIRST_USER_ADDRESS 0UL |
89 | 89 | ||
90 | #include <asm-generic/pgtable.h> | 90 | #include <asm-generic/pgtable.h> |
91 | 91 | ||
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..c72412415093 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c | |||
@@ -36,12 +36,6 @@ | |||
36 | * of type casting from pmd_t * to pte_t *. | 36 | * of type casting from pmd_t * to pte_t *. |
37 | */ | 37 | */ |
38 | 38 | ||
39 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
40 | int write) | ||
41 | { | ||
42 | return ERR_PTR(-EINVAL); | ||
43 | } | ||
44 | |||
45 | int pud_huge(pud_t pud) | 39 | int pud_huge(pud_t pud) |
46 | { | 40 | { |
47 | return 0; | 41 | return 0; |
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 249379535be2..a3681f11dd9f 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c | |||
@@ -97,6 +97,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
97 | 97 | ||
98 | no_pte: | 98 | no_pte: |
99 | pmd_free(mm, new_pmd); | 99 | pmd_free(mm, new_pmd); |
100 | mm_dec_nr_pmds(mm); | ||
100 | no_pmd: | 101 | no_pmd: |
101 | pud_free(mm, new_pud); | 102 | pud_free(mm, new_pud); |
102 | no_pud: | 103 | no_pud: |
@@ -130,9 +131,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) | |||
130 | pte = pmd_pgtable(*pmd); | 131 | pte = pmd_pgtable(*pmd); |
131 | pmd_clear(pmd); | 132 | pmd_clear(pmd); |
132 | pte_free(mm, pte); | 133 | pte_free(mm, pte); |
134 | atomic_long_dec(&mm->nr_ptes); | ||
133 | no_pmd: | 135 | no_pmd: |
134 | pud_clear(pud); | 136 | pud_clear(pud); |
135 | pmd_free(mm, pmd); | 137 | pmd_free(mm, pmd); |
138 | mm_dec_nr_pmds(mm); | ||
136 | no_pud: | 139 | no_pud: |
137 | pgd_clear(pgd); | 140 | pgd_clear(pgd); |
138 | pud_free(mm, pud); | 141 | pud_free(mm, pud); |
@@ -152,6 +155,7 @@ no_pgd: | |||
152 | pmd = pmd_offset(pud, 0); | 155 | pmd = pmd_offset(pud, 0); |
153 | pud_clear(pud); | 156 | pud_clear(pud); |
154 | pmd_free(mm, pmd); | 157 | pmd_free(mm, pmd); |
158 | mm_dec_nr_pmds(mm); | ||
155 | pgd_clear(pgd); | 159 | pgd_clear(pgd); |
156 | pud_free(mm, pud); | 160 | pud_free(mm, pud); |
157 | } | 161 | } |
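The arm pgd teardown above is one instance of a pattern this series applies wherever page-table pages are freed outside the normal unmap path: each pmd_free() is paired with mm_dec_nr_pmds(), and each hand-freed pte table with an atomic_long_dec of mm->nr_ptes, so the per-mm counters stay balanced. A condensed, illustrative sketch of the pairing (kernel context assumed; not a literal excerpt from any one architecture):

#include <linux/mm.h>
#include <asm/pgalloc.h>

/*
 * Illustrative only: tear down one pte table and its pmd table by hand,
 * adjusting the per-mm counters the same way the hunks above do.
 */
static void teardown_one_pmd(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
	pgtable_t pte = pmd_pgtable(*pmd);

	pmd_clear(pmd);
	pte_free(mm, pte);
	atomic_long_dec(&mm->nr_ptes);	/* pte table gone */

	pud_clear(pud);
	pmd_free(mm, pmd);
	mm_dec_nr_pmds(mm);		/* pmd table gone */
}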
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index cf1d9c86f20a..16449c535e50 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h | |||
@@ -45,7 +45,7 @@ | |||
45 | 45 | ||
46 | #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) | 46 | #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) |
47 | 47 | ||
48 | #define FIRST_USER_ADDRESS 0 | 48 | #define FIRST_USER_ADDRESS 0UL |
49 | 49 | ||
50 | #ifndef __ASSEMBLY__ | 50 | #ifndef __ASSEMBLY__ |
51 | extern void __pte_error(const char *file, int line, unsigned long val); | 51 | extern void __pte_error(const char *file, int line, unsigned long val); |
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 023747bf4dd7..2de9d2e59d96 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c | |||
@@ -38,12 +38,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
38 | } | 38 | } |
39 | #endif | 39 | #endif |
40 | 40 | ||
41 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
42 | int write) | ||
43 | { | ||
44 | return ERR_PTR(-EINVAL); | ||
45 | } | ||
46 | |||
47 | int pmd_huge(pmd_t pmd) | 41 | int pmd_huge(pmd_t pmd) |
48 | { | 42 | { |
49 | return !(pmd_val(pmd) & PMD_TABLE_BIT); | 43 | return !(pmd_val(pmd) & PMD_TABLE_BIT); |
diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index ac7a817e2126..35800664076e 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h | |||
@@ -30,7 +30,7 @@ | |||
30 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) | 30 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) |
31 | 31 | ||
32 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) | 32 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) |
33 | #define FIRST_USER_ADDRESS 0 | 33 | #define FIRST_USER_ADDRESS 0UL |
34 | 34 | ||
35 | #ifndef __ASSEMBLY__ | 35 | #ifndef __ASSEMBLY__ |
36 | extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; | 36 | extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; |
diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index e824257971c4..ceefc314d64d 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h | |||
@@ -67,7 +67,7 @@ extern void paging_init(void); | |||
67 | */ | 67 | */ |
68 | 68 | ||
69 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | 69 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) |
70 | #define FIRST_USER_ADDRESS 0 | 70 | #define FIRST_USER_ADDRESS 0UL |
71 | 71 | ||
72 | /* zero page used for uninitialized stuff */ | 72 | /* zero page used for uninitialized stuff */ |
73 | #ifndef __ASSEMBLY__ | 73 | #ifndef __ASSEMBLY__ |
diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index c49699d5902d..93bcf2abd1a1 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h | |||
@@ -140,7 +140,7 @@ extern unsigned long empty_zero_page; | |||
140 | #define PTRS_PER_PTE 4096 | 140 | #define PTRS_PER_PTE 4096 |
141 | 141 | ||
142 | #define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE) | 142 | #define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE) |
143 | #define FIRST_USER_ADDRESS 0 | 143 | #define FIRST_USER_ADDRESS 0UL |
144 | 144 | ||
145 | #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) | 145 | #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) |
146 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) | 146 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) |
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 6e35e71d2aea..49eab8136ec3 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h | |||
@@ -171,7 +171,7 @@ extern unsigned long _dflt_cache_att; | |||
171 | extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ | 171 | extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ |
172 | 172 | ||
173 | /* Seems to be zero even in architectures where the zero page is firewalled? */ | 173 | /* Seems to be zero even in architectures where the zero page is firewalled? */ |
174 | #define FIRST_USER_ADDRESS 0 | 174 | #define FIRST_USER_ADDRESS 0UL |
175 | #define pte_special(pte) 0 | 175 | #define pte_special(pte) 0 |
176 | #define pte_mkspecial(pte) (pte) | 176 | #define pte_mkspecial(pte) (pte) |
177 | 177 | ||
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 2f07bb3dda91..7b6f8801df57 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h | |||
@@ -127,7 +127,7 @@ | |||
127 | #define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT | 127 | #define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT |
128 | #define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) | 128 | #define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) |
129 | #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ | 129 | #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ |
130 | #define FIRST_USER_ADDRESS 0 | 130 | #define FIRST_USER_ADDRESS 0UL |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * All the normal masks have the "page accessed" bits on, as any time | 133 | * All the normal masks have the "page accessed" bits on, as any time |
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..52b7604b5215 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c | |||
@@ -114,12 +114,6 @@ int pud_huge(pud_t pud) | |||
114 | return 0; | 114 | return 0; |
115 | } | 115 | } |
116 | 116 | ||
117 | struct page * | ||
118 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) | ||
119 | { | ||
120 | return NULL; | ||
121 | } | ||
122 | |||
123 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, | 117 | void hugetlb_free_pgd_range(struct mmu_gather *tlb, |
124 | unsigned long addr, unsigned long end, | 118 | unsigned long addr, unsigned long end, |
125 | unsigned long floor, unsigned long ceiling) | 119 | unsigned long floor, unsigned long ceiling) |
diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index 050f7a686e3d..8c1fb902a9ce 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h | |||
@@ -53,7 +53,7 @@ extern unsigned long empty_zero_page[1024]; | |||
53 | #define PGDIR_MASK (~(PGDIR_SIZE - 1)) | 53 | #define PGDIR_MASK (~(PGDIR_SIZE - 1)) |
54 | 54 | ||
55 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) | 55 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) |
56 | #define FIRST_USER_ADDRESS 0 | 56 | #define FIRST_USER_ADDRESS 0UL |
57 | 57 | ||
58 | #ifndef __ASSEMBLY__ | 58 | #ifndef __ASSEMBLY__ |
59 | /* Just any arbitrary offset to the start of the vmalloc VM area: the | 59 | /* Just any arbitrary offset to the start of the vmalloc VM area: the |
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 9f5abbda1ea7..28a145bfbb71 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h | |||
@@ -66,7 +66,7 @@ | |||
66 | #define PTRS_PER_PGD 128 | 66 | #define PTRS_PER_PGD 128 |
67 | #endif | 67 | #endif |
68 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | 68 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) |
69 | #define FIRST_USER_ADDRESS 0 | 69 | #define FIRST_USER_ADDRESS 0UL |
70 | 70 | ||
71 | /* Virtual address region for use by kernel_map() */ | 71 | /* Virtual address region for use by kernel_map() */ |
72 | #ifdef CONFIG_SUN3 | 72 | #ifdef CONFIG_SUN3 |
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c32075d2945..7ca80ac42ed5 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c | |||
@@ -94,12 +94,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
94 | return 0; | 94 | return 0; |
95 | } | 95 | } |
96 | 96 | ||
97 | struct page *follow_huge_addr(struct mm_struct *mm, | ||
98 | unsigned long address, int write) | ||
99 | { | ||
100 | return ERR_PTR(-EINVAL); | ||
101 | } | ||
102 | |||
103 | int pmd_huge(pmd_t pmd) | 97 | int pmd_huge(pmd_t pmd) |
104 | { | 98 | { |
105 | return pmd_page_shift(pmd) > PAGE_SHIFT; | 99 | return pmd_page_shift(pmd) > PAGE_SHIFT; |
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 91b9b46fbb5d..e53b8532353c 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h | |||
@@ -61,6 +61,8 @@ extern int mem_init_done; | |||
61 | 61 | ||
62 | #include <asm-generic/4level-fixup.h> | 62 | #include <asm-generic/4level-fixup.h> |
63 | 63 | ||
64 | #define __PAGETABLE_PMD_FOLDED | ||
65 | |||
64 | #ifdef __KERNEL__ | 66 | #ifdef __KERNEL__ |
65 | #ifndef __ASSEMBLY__ | 67 | #ifndef __ASSEMBLY__ |
66 | 68 | ||
@@ -70,7 +72,7 @@ extern int mem_init_done; | |||
70 | #include <asm/mmu.h> | 72 | #include <asm/mmu.h> |
71 | #include <asm/page.h> | 73 | #include <asm/page.h> |
72 | 74 | ||
73 | #define FIRST_USER_ADDRESS 0 | 75 | #define FIRST_USER_ADDRESS 0UL |
74 | 76 | ||
75 | extern unsigned long va_to_phys(unsigned long address); | 77 | extern unsigned long va_to_phys(unsigned long address); |
76 | extern pte_t *va_to_pte(unsigned long address); | 78 | extern pte_t *va_to_pte(unsigned long address); |
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 16aa9f23e17b..a6be006b6f75 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h | |||
@@ -57,7 +57,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, | |||
57 | #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) | 57 | #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) |
58 | 58 | ||
59 | #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) | 59 | #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) |
60 | #define FIRST_USER_ADDRESS 0 | 60 | #define FIRST_USER_ADDRESS 0UL |
61 | 61 | ||
62 | #define VMALLOC_START MAP_BASE | 62 | #define VMALLOC_START MAP_BASE |
63 | 63 | ||
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 70795a67a276..349995d19c7f 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c | |||
@@ -301,11 +301,9 @@ slow_irqon: | |||
301 | start += nr << PAGE_SHIFT; | 301 | start += nr << PAGE_SHIFT; |
302 | pages += nr; | 302 | pages += nr; |
303 | 303 | ||
304 | down_read(&mm->mmap_sem); | 304 | ret = get_user_pages_unlocked(current, mm, start, |
305 | ret = get_user_pages(current, mm, start, | 305 | (end - start) >> PAGE_SHIFT, |
306 | (end - start) >> PAGE_SHIFT, | 306 | write, 0, pages); |
307 | write, 0, pages, NULL); | ||
308 | up_read(&mm->mmap_sem); | ||
309 | 307 | ||
310 | /* Have to be a bit careful with return values */ | 308 | /* Have to be a bit careful with return values */ |
311 | if (nr > 0) { | 309 | if (nr > 0) { |
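The mips gup slow path above follows the conversion this series applies to every fast-GUP fallback and to several drivers: the open-coded mmap_sem dance around the 8-argument get_user_pages() is replaced by get_user_pages_unlocked(), which takes the semaphore itself and may drop it while faulting pages in. A sketch of the pattern, in kernel context and with the argument lists used at this point in the series:

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Sketch of the conversion only; nr_pages, write and pages would come
 * from the surrounding fast-GUP slow path.
 */
static long gup_slow_path(struct mm_struct *mm, unsigned long start,
			  unsigned long nr_pages, int write,
			  struct page **pages)
{
	long ret;

	/*
	 * Old form: the caller brackets get_user_pages() with mmap_sem:
	 *
	 *	down_read(&mm->mmap_sem);
	 *	ret = get_user_pages(current, mm, start, nr_pages,
	 *			     write, 0, pages, NULL);
	 *	up_read(&mm->mmap_sem);
	 */

	/* New form: the helper takes and releases mmap_sem internally. */
	ret = get_user_pages_unlocked(current, mm, start, nr_pages,
				      write, 0, pages);
	return ret;
}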
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 4ec8ee10d371..06e0f421b41b 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c | |||
@@ -68,12 +68,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | |||
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | struct page * | ||
72 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
73 | { | ||
74 | return ERR_PTR(-EINVAL); | ||
75 | } | ||
76 | |||
77 | int pmd_huge(pmd_t pmd) | 71 | int pmd_huge(pmd_t pmd) |
78 | { | 72 | { |
79 | return (pmd_val(pmd) & _PAGE_HUGE) != 0; | 73 | return (pmd_val(pmd) & _PAGE_HUGE) != 0; |
@@ -83,15 +77,3 @@ int pud_huge(pud_t pud) | |||
83 | { | 77 | { |
84 | return (pud_val(pud) & _PAGE_HUGE) != 0; | 78 | return (pud_val(pud) & _PAGE_HUGE) != 0; |
85 | } | 79 | } |
86 | |||
87 | struct page * | ||
88 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
89 | pmd_t *pmd, int write) | ||
90 | { | ||
91 | struct page *page; | ||
92 | |||
93 | page = pte_page(*(pte_t *)pmd); | ||
94 | if (page) | ||
95 | page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); | ||
96 | return page; | ||
97 | } | ||
diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 629181ae111e..afab728ab65e 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h | |||
@@ -65,7 +65,7 @@ extern void paging_init(void); | |||
65 | #define PGDIR_MASK (~(PGDIR_SIZE - 1)) | 65 | #define PGDIR_MASK (~(PGDIR_SIZE - 1)) |
66 | 66 | ||
67 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) | 67 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) |
68 | #define FIRST_USER_ADDRESS 0 | 68 | #define FIRST_USER_ADDRESS 0UL |
69 | 69 | ||
70 | #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) | 70 | #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) |
71 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) | 71 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) |
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 7b292e3a3138..a213e8c9aad0 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <asm/pgtable-bits.h> | 24 | #include <asm/pgtable-bits.h> |
25 | #include <asm-generic/pgtable-nopmd.h> | 25 | #include <asm-generic/pgtable-nopmd.h> |
26 | 26 | ||
27 | #define FIRST_USER_ADDRESS 0 | 27 | #define FIRST_USER_ADDRESS 0UL |
28 | 28 | ||
29 | #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE | 29 | #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE |
30 | #define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) | 30 | #define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) |
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 18994ccb1185..69c7df0e1420 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h | |||
@@ -77,7 +77,7 @@ extern void paging_init(void); | |||
77 | */ | 77 | */ |
78 | 78 | ||
79 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | 79 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) |
80 | #define FIRST_USER_ADDRESS 0 | 80 | #define FIRST_USER_ADDRESS 0UL |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Kernels own virtual memory area. | 83 | * Kernels own virtual memory area. |
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 1d49a4a7749b..8c966b2270aa 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h | |||
@@ -134,7 +134,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); | |||
134 | * pgd entries used up by user/kernel: | 134 | * pgd entries used up by user/kernel: |
135 | */ | 135 | */ |
136 | 136 | ||
137 | #define FIRST_USER_ADDRESS 0 | 137 | #define FIRST_USER_ADDRESS 0UL |
138 | 138 | ||
139 | /* NB: The tlb miss handlers make certain assumptions about the order */ | 139 | /* NB: The tlb miss handlers make certain assumptions about the order */ |
140 | /* of the following bits, so be careful (One example, bits 25-31 */ | 140 | /* of the following bits, so be careful (One example, bits 25-31 */ |
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index 26ce0ab0a9e4..14bdcbd31670 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h | |||
@@ -45,7 +45,7 @@ extern int icache_44x_need_flush; | |||
45 | #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) | 45 | #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) |
46 | 46 | ||
47 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) | 47 | #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) |
48 | #define FIRST_USER_ADDRESS 0 | 48 | #define FIRST_USER_ADDRESS 0UL |
49 | 49 | ||
50 | #define pte_ERROR(e) \ | 50 | #define pte_ERROR(e) \ |
51 | pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ | 51 | pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ |
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index b9dcc936e2d1..d46532ccc386 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h | |||
@@ -12,7 +12,7 @@ | |||
12 | #endif | 12 | #endif |
13 | #include <asm/barrier.h> | 13 | #include <asm/barrier.h> |
14 | 14 | ||
15 | #define FIRST_USER_ADDRESS 0 | 15 | #define FIRST_USER_ADDRESS 0UL |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * Size of EA range mapped by our pagetables. | 18 | * Size of EA range mapped by our pagetables. |
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5ff4e07d920a..cf0464f4284f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -714,6 +714,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
714 | return NULL; | 714 | return NULL; |
715 | } | 715 | } |
716 | 716 | ||
717 | struct page * | ||
718 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
719 | pud_t *pud, int write) | ||
720 | { | ||
721 | BUG(); | ||
722 | return NULL; | ||
723 | } | ||
724 | |||
717 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, | 725 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, |
718 | unsigned long sz) | 726 | unsigned long sz) |
719 | { | 727 | { |
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 6c0b1f5f8d2c..fa9fb5b4c66c 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c | |||
@@ -134,7 +134,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) | |||
134 | static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, | 134 | static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, |
135 | unsigned long end, struct mm_walk *walk) | 135 | unsigned long end, struct mm_walk *walk) |
136 | { | 136 | { |
137 | struct vm_area_struct *vma = walk->private; | 137 | struct vm_area_struct *vma = walk->vma; |
138 | split_huge_page_pmd(vma, addr, pmd); | 138 | split_huge_page_pmd(vma, addr, pmd); |
139 | return 0; | 139 | return 0; |
140 | } | 140 | } |
@@ -163,9 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, | |||
163 | if (vma->vm_start >= (addr + len)) | 163 | if (vma->vm_start >= (addr + len)) |
164 | break; | 164 | break; |
165 | vma->vm_flags |= VM_NOHUGEPAGE; | 165 | vma->vm_flags |= VM_NOHUGEPAGE; |
166 | subpage_proto_walk.private = vma; | 166 | walk_page_vma(vma, &subpage_proto_walk); |
167 | walk_page_range(vma->vm_start, vma->vm_end, | ||
168 | &subpage_proto_walk); | ||
169 | vma = vma->vm_next; | 167 | vma = vma->vm_next; |
170 | } | 168 | } |
171 | } | 169 | } |
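The subpage-prot change above reflects the reworked page walker in this series: walk_page_vma() walks a single VMA, and walker callbacks can read that VMA from walk->vma instead of smuggling it through walk->private. A condensed sketch of the new usage, modelled on the hunk above (kernel context assumed; function names are placeholders):

#include <linux/mm.h>

/* The callback reads the VMA from the walk itself now. */
static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
			  unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	split_huge_page_pmd(vma, addr, pmd);
	return 0;
}

static void demo_walk_one_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct mm_walk demo_walk = {
		.pmd_entry = demo_pmd_entry,
		.mm = mm,
	};

	/* Walks exactly [vma->vm_start, vma->vm_end); no .private needed. */
	walk_page_vma(vma, &demo_walk);
}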
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0441ec24ae87..fbb5ee3ae57c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -99,7 +99,7 @@ extern unsigned long zero_page_mask; | |||
99 | #endif /* CONFIG_64BIT */ | 99 | #endif /* CONFIG_64BIT */ |
100 | #define PTRS_PER_PGD 2048 | 100 | #define PTRS_PER_PGD 2048 |
101 | 101 | ||
102 | #define FIRST_USER_ADDRESS 0 | 102 | #define FIRST_USER_ADDRESS 0UL |
103 | 103 | ||
104 | #define pte_ERROR(e) \ | 104 | #define pte_ERROR(e) \ |
105 | printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) | 105 | printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) |
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 639fce464008..5c586c78ca8d 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c | |||
@@ -235,10 +235,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
235 | /* Try to get the remaining pages with get_user_pages */ | 235 | /* Try to get the remaining pages with get_user_pages */ |
236 | start += nr << PAGE_SHIFT; | 236 | start += nr << PAGE_SHIFT; |
237 | pages += nr; | 237 | pages += nr; |
238 | down_read(&mm->mmap_sem); | 238 | ret = get_user_pages_unlocked(current, mm, start, |
239 | ret = get_user_pages(current, mm, start, | 239 | nr_pages - nr, write, 0, pages); |
240 | nr_pages - nr, write, 0, pages, NULL); | ||
241 | up_read(&mm->mmap_sem); | ||
242 | /* Have to be a bit careful with return values */ | 240 | /* Have to be a bit careful with return values */ |
243 | if (nr > 0) | 241 | if (nr > 0) |
244 | ret = (ret < 0) ? nr : ret + nr; | 242 | ret = (ret < 0) ? nr : ret + nr; |
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3c80d2e38f03..210ffede0153 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c | |||
@@ -192,12 +192,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
192 | return 0; | 192 | return 0; |
193 | } | 193 | } |
194 | 194 | ||
195 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
196 | int write) | ||
197 | { | ||
198 | return ERR_PTR(-EINVAL); | ||
199 | } | ||
200 | |||
201 | int pmd_huge(pmd_t pmd) | 195 | int pmd_huge(pmd_t pmd) |
202 | { | 196 | { |
203 | if (!MACHINE_HAS_HPAGE) | 197 | if (!MACHINE_HAS_HPAGE) |
@@ -210,17 +204,3 @@ int pud_huge(pud_t pud) | |||
210 | { | 204 | { |
211 | return 0; | 205 | return 0; |
212 | } | 206 | } |
213 | |||
214 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
215 | pmd_t *pmdp, int write) | ||
216 | { | ||
217 | struct page *page; | ||
218 | |||
219 | if (!MACHINE_HAS_HPAGE) | ||
220 | return NULL; | ||
221 | |||
222 | page = pmd_page(*pmdp); | ||
223 | if (page) | ||
224 | page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); | ||
225 | return page; | ||
226 | } | ||
diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index 5170ffdea643..0553e5cd5985 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h | |||
@@ -27,7 +27,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; | |||
27 | #define PTRS_PER_PTE 1024 | 27 | #define PTRS_PER_PTE 1024 |
28 | 28 | ||
29 | #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) | 29 | #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) |
30 | #define FIRST_USER_ADDRESS 0 | 30 | #define FIRST_USER_ADDRESS 0UL |
31 | 31 | ||
32 | #define VMALLOC_START (0xc0000000UL) | 32 | #define VMALLOC_START (0xc0000000UL) |
33 | 33 | ||
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index cf434c64408d..89c513a982fc 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h | |||
@@ -62,7 +62,7 @@ static inline unsigned long long neff_sign_extend(unsigned long val) | |||
62 | /* Entries per level */ | 62 | /* Entries per level */ |
63 | #define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) | 63 | #define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) |
64 | 64 | ||
65 | #define FIRST_USER_ADDRESS 0 | 65 | #define FIRST_USER_ADDRESS 0UL |
66 | 66 | ||
67 | #define PHYS_ADDR_MASK29 0x1fffffff | 67 | #define PHYS_ADDR_MASK29 0x1fffffff |
68 | #define PHYS_ADDR_MASK32 0xffffffff | 68 | #define PHYS_ADDR_MASK32 0xffffffff |
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 37458f38b220..e15f52a17b6c 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c | |||
@@ -257,10 +257,8 @@ slow_irqon: | |||
257 | start += nr << PAGE_SHIFT; | 257 | start += nr << PAGE_SHIFT; |
258 | pages += nr; | 258 | pages += nr; |
259 | 259 | ||
260 | down_read(&mm->mmap_sem); | 260 | ret = get_user_pages_unlocked(current, mm, start, |
261 | ret = get_user_pages(current, mm, start, | 261 | (end - start) >> PAGE_SHIFT, write, 0, pages); |
262 | (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); | ||
263 | up_read(&mm->mmap_sem); | ||
264 | 262 | ||
265 | /* Have to be a bit careful with return values */ | 263 | /* Have to be a bit careful with return values */ |
266 | if (nr > 0) { | 264 | if (nr > 0) { |
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..534bc978af8a 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c | |||
@@ -67,12 +67,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
67 | return 0; | 67 | return 0; |
68 | } | 68 | } |
69 | 69 | ||
70 | struct page *follow_huge_addr(struct mm_struct *mm, | ||
71 | unsigned long address, int write) | ||
72 | { | ||
73 | return ERR_PTR(-EINVAL); | ||
74 | } | ||
75 | |||
76 | int pmd_huge(pmd_t pmd) | 70 | int pmd_huge(pmd_t pmd) |
77 | { | 71 | { |
78 | return 0; | 72 | return 0; |
@@ -82,9 +76,3 @@ int pud_huge(pud_t pud) | |||
82 | { | 76 | { |
83 | return 0; | 77 | return 0; |
84 | } | 78 | } |
85 | |||
86 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
87 | pmd_t *pmd, int write) | ||
88 | { | ||
89 | return NULL; | ||
90 | } | ||
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index b2f7dc46a7d1..f06b36a00a3b 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h | |||
@@ -44,7 +44,7 @@ unsigned long __init bootmem_init(unsigned long *pages_avail); | |||
44 | #define PTRS_PER_PMD SRMMU_PTRS_PER_PMD | 44 | #define PTRS_PER_PMD SRMMU_PTRS_PER_PMD |
45 | #define PTRS_PER_PGD SRMMU_PTRS_PER_PGD | 45 | #define PTRS_PER_PGD SRMMU_PTRS_PER_PGD |
46 | #define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE | 46 | #define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE |
47 | #define FIRST_USER_ADDRESS 0 | 47 | #define FIRST_USER_ADDRESS 0UL |
48 | #define PTE_SIZE (PTRS_PER_PTE*4) | 48 | #define PTE_SIZE (PTRS_PER_PTE*4) |
49 | 49 | ||
50 | #define PAGE_NONE SRMMU_PAGE_NONE | 50 | #define PAGE_NONE SRMMU_PAGE_NONE |
@@ -102,7 +102,8 @@ extern unsigned long empty_zero_page; | |||
102 | */ | 102 | */ |
103 | static inline unsigned long srmmu_swap(unsigned long *addr, unsigned long value) | 103 | static inline unsigned long srmmu_swap(unsigned long *addr, unsigned long value) |
104 | { | 104 | { |
105 | __asm__ __volatile__("swap [%2], %0" : "=&r" (value) : "0" (value), "r" (addr)); | 105 | __asm__ __volatile__("swap [%2], %0" : |
106 | "=&r" (value) : "0" (value), "r" (addr) : "memory"); | ||
106 | return value; | 107 | return value; |
107 | } | 108 | } |
108 | 109 | ||
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 2ac7873ad6fd..dc165ebdf05a 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h | |||
@@ -93,7 +93,7 @@ bool kern_addr_valid(unsigned long addr); | |||
93 | #define PTRS_PER_PGD (1UL << PGDIR_BITS) | 93 | #define PTRS_PER_PGD (1UL << PGDIR_BITS) |
94 | 94 | ||
95 | /* Kernel has a separate 44bit address space. */ | 95 | /* Kernel has a separate 44bit address space. */ |
96 | #define FIRST_USER_ADDRESS 0 | 96 | #define FIRST_USER_ADDRESS 0UL |
97 | 97 | ||
98 | #define pmd_ERROR(e) \ | 98 | #define pmd_ERROR(e) \ |
99 | pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ | 99 | pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ |
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index ae6ce383d4df..2e5c4fc2daa9 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c | |||
@@ -249,10 +249,8 @@ slow: | |||
249 | start += nr << PAGE_SHIFT; | 249 | start += nr << PAGE_SHIFT; |
250 | pages += nr; | 250 | pages += nr; |
251 | 251 | ||
252 | down_read(&mm->mmap_sem); | 252 | ret = get_user_pages_unlocked(current, mm, start, |
253 | ret = get_user_pages(current, mm, start, | 253 | (end - start) >> PAGE_SHIFT, write, 0, pages); |
254 | (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); | ||
255 | up_read(&mm->mmap_sem); | ||
256 | 254 | ||
257 | /* Have to be a bit careful with return values */ | 255 | /* Have to be a bit careful with return values */ |
258 | if (nr > 0) { | 256 | if (nr > 0) { |
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d329537739c6..4242eab12e10 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c | |||
@@ -215,12 +215,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | |||
215 | return entry; | 215 | return entry; |
216 | } | 216 | } |
217 | 217 | ||
218 | struct page *follow_huge_addr(struct mm_struct *mm, | ||
219 | unsigned long address, int write) | ||
220 | { | ||
221 | return ERR_PTR(-EINVAL); | ||
222 | } | ||
223 | |||
224 | int pmd_huge(pmd_t pmd) | 218 | int pmd_huge(pmd_t pmd) |
225 | { | 219 | { |
226 | return 0; | 220 | return 0; |
@@ -230,9 +224,3 @@ int pud_huge(pud_t pud) | |||
230 | { | 224 | { |
231 | return 0; | 225 | return 0; |
232 | } | 226 | } |
233 | |||
234 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
235 | pmd_t *pmd, int write) | ||
236 | { | ||
237 | return NULL; | ||
238 | } | ||
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index bc75b6ef2e79..95a4f19d16c5 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h | |||
@@ -67,7 +67,7 @@ extern void pgtable_cache_init(void); | |||
67 | extern void paging_init(void); | 67 | extern void paging_init(void); |
68 | extern void set_page_homes(void); | 68 | extern void set_page_homes(void); |
69 | 69 | ||
70 | #define FIRST_USER_ADDRESS 0 | 70 | #define FIRST_USER_ADDRESS 0UL |
71 | 71 | ||
72 | #define _PAGE_PRESENT HV_PTE_PRESENT | 72 | #define _PAGE_PRESENT HV_PTE_PRESENT |
73 | #define _PAGE_HUGE_PAGE HV_PTE_PAGE | 73 | #define _PAGE_HUGE_PAGE HV_PTE_PAGE |
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 3270e0019266..8416240c322c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c | |||
@@ -150,12 +150,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
150 | return NULL; | 150 | return NULL; |
151 | } | 151 | } |
152 | 152 | ||
153 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
154 | int write) | ||
155 | { | ||
156 | return ERR_PTR(-EINVAL); | ||
157 | } | ||
158 | |||
159 | int pmd_huge(pmd_t pmd) | 153 | int pmd_huge(pmd_t pmd) |
160 | { | 154 | { |
161 | return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); | 155 | return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); |
@@ -166,28 +160,6 @@ int pud_huge(pud_t pud) | |||
166 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); | 160 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); |
167 | } | 161 | } |
168 | 162 | ||
169 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
170 | pmd_t *pmd, int write) | ||
171 | { | ||
172 | struct page *page; | ||
173 | |||
174 | page = pte_page(*(pte_t *)pmd); | ||
175 | if (page) | ||
176 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
177 | return page; | ||
178 | } | ||
179 | |||
180 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
181 | pud_t *pud, int write) | ||
182 | { | ||
183 | struct page *page; | ||
184 | |||
185 | page = pte_page(*(pte_t *)pud); | ||
186 | if (page) | ||
187 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | ||
188 | return page; | ||
189 | } | ||
190 | |||
191 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | 163 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) |
192 | { | 164 | { |
193 | return 0; | 165 | return 0; |
diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 7afe86035fa7..cfbe59752469 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h | |||
@@ -23,7 +23,7 @@ | |||
23 | #define PTRS_PER_PTE 1024 | 23 | #define PTRS_PER_PTE 1024 |
24 | #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) | 24 | #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) |
25 | #define PTRS_PER_PGD 1024 | 25 | #define PTRS_PER_PGD 1024 |
26 | #define FIRST_USER_ADDRESS 0 | 26 | #define FIRST_USER_ADDRESS 0UL |
27 | 27 | ||
28 | #define pte_ERROR(e) \ | 28 | #define pte_ERROR(e) \ |
29 | printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ | 29 | printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ |
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 344c559c0a17..2b4274e7c095 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h | |||
@@ -41,7 +41,7 @@ | |||
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) | 43 | #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) |
44 | #define FIRST_USER_ADDRESS 0 | 44 | #define FIRST_USER_ADDRESS 0UL |
45 | 45 | ||
46 | #define pte_ERROR(e) \ | 46 | #define pte_ERROR(e) \ |
47 | printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ | 47 | printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ |
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index 08b8d4295e70..2ade20d8eab3 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c | |||
@@ -69,6 +69,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
69 | 69 | ||
70 | no_pte: | 70 | no_pte: |
71 | pmd_free(mm, new_pmd); | 71 | pmd_free(mm, new_pmd); |
72 | mm_dec_nr_pmds(mm); | ||
72 | no_pmd: | 73 | no_pmd: |
73 | free_pages((unsigned long)new_pgd, 0); | 74 | free_pages((unsigned long)new_pgd, 0); |
74 | no_pgd: | 75 | no_pgd: |
@@ -96,7 +97,9 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) | |||
96 | pte = pmd_pgtable(*pmd); | 97 | pte = pmd_pgtable(*pmd); |
97 | pmd_clear(pmd); | 98 | pmd_clear(pmd); |
98 | pte_free(mm, pte); | 99 | pte_free(mm, pte); |
100 | atomic_long_dec(&mm->nr_ptes); | ||
99 | pmd_free(mm, pmd); | 101 | pmd_free(mm, pmd); |
102 | mm_dec_nr_pmds(mm); | ||
100 | free: | 103 | free: |
101 | free_pages((unsigned long) pgd, 0); | 104 | free_pages((unsigned long) pgd, 0); |
102 | } | 105 | } |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5185a4f599ec..3e0230c94cff 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -4,7 +4,7 @@ | |||
4 | #include <linux/const.h> | 4 | #include <linux/const.h> |
5 | #include <asm/page_types.h> | 5 | #include <asm/page_types.h> |
6 | 6 | ||
7 | #define FIRST_USER_ADDRESS 0 | 7 | #define FIRST_USER_ADDRESS 0UL |
8 | 8 | ||
9 | #define _PAGE_BIT_PRESENT 0 /* is present */ | 9 | #define _PAGE_BIT_PRESENT 0 /* is present */ |
10 | #define _PAGE_BIT_RW 1 /* writeable */ | 10 | #define _PAGE_BIT_RW 1 /* writeable */ |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d7547824e763..89df70e0caa6 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -172,7 +172,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
172 | */ | 172 | */ |
173 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | 173 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) |
174 | return 0; | 174 | return 0; |
175 | if (unlikely(pmd_large(pmd))) { | 175 | if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { |
176 | /* | 176 | /* |
177 | * NUMA hinting faults need to be handled in the GUP | 177 | * NUMA hinting faults need to be handled in the GUP |
178 | * slowpath for accounting purposes and so that they | 178 | * slowpath for accounting purposes and so that they |
@@ -388,10 +388,9 @@ slow_irqon: | |||
388 | start += nr << PAGE_SHIFT; | 388 | start += nr << PAGE_SHIFT; |
389 | pages += nr; | 389 | pages += nr; |
390 | 390 | ||
391 | down_read(&mm->mmap_sem); | 391 | ret = get_user_pages_unlocked(current, mm, start, |
392 | ret = get_user_pages(current, mm, start, | 392 | (end - start) >> PAGE_SHIFT, |
393 | (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); | 393 | write, 0, pages); |
394 | up_read(&mm->mmap_sem); | ||
395 | 394 | ||
396 | /* Have to be a bit careful with return values */ | 395 | /* Have to be a bit careful with return values */ |
397 | if (nr > 0) { | 396 | if (nr > 0) { |
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index bca0aa3a003f..42982b26e32b 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -52,23 +52,17 @@ int pud_huge(pud_t pud) | |||
52 | return 0; | 52 | return 0; |
53 | } | 53 | } |
54 | 54 | ||
55 | struct page * | ||
56 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
57 | pmd_t *pmd, int write) | ||
58 | { | ||
59 | return NULL; | ||
60 | } | ||
61 | #else | 55 | #else |
62 | 56 | ||
63 | struct page * | 57 | /* |
64 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | 58 | * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal |
65 | { | 59 | * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. |
66 | return ERR_PTR(-EINVAL); | 60 | * Otherwise, returns 0. |
67 | } | 61 | */ |
68 | |||
69 | int pmd_huge(pmd_t pmd) | 62 | int pmd_huge(pmd_t pmd) |
70 | { | 63 | { |
71 | return !!(pmd_val(pmd) & _PAGE_PSE); | 64 | return !pmd_none(pmd) && |
65 | (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; | ||
72 | } | 66 | } |
73 | 67 | ||
74 | int pud_huge(pud_t pud) | 68 | int pud_huge(pud_t pud) |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | |||
190 | 190 | ||
191 | #endif /* CONFIG_X86_PAE */ | 191 | #endif /* CONFIG_X86_PAE */ |
192 | 192 | ||
193 | static void free_pmds(pmd_t *pmds[]) | 193 | static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) |
194 | { | 194 | { |
195 | int i; | 195 | int i; |
196 | 196 | ||
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) | |||
198 | if (pmds[i]) { | 198 | if (pmds[i]) { |
199 | pgtable_pmd_page_dtor(virt_to_page(pmds[i])); | 199 | pgtable_pmd_page_dtor(virt_to_page(pmds[i])); |
200 | free_page((unsigned long)pmds[i]); | 200 | free_page((unsigned long)pmds[i]); |
201 | mm_dec_nr_pmds(mm); | ||
201 | } | 202 | } |
202 | } | 203 | } |
203 | 204 | ||
204 | static int preallocate_pmds(pmd_t *pmds[]) | 205 | static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) |
205 | { | 206 | { |
206 | int i; | 207 | int i; |
207 | bool failed = false; | 208 | bool failed = false; |
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) | |||
215 | pmd = NULL; | 216 | pmd = NULL; |
216 | failed = true; | 217 | failed = true; |
217 | } | 218 | } |
219 | if (pmd) | ||
220 | mm_inc_nr_pmds(mm); | ||
218 | pmds[i] = pmd; | 221 | pmds[i] = pmd; |
219 | } | 222 | } |
220 | 223 | ||
221 | if (failed) { | 224 | if (failed) { |
222 | free_pmds(pmds); | 225 | free_pmds(mm, pmds); |
223 | return -ENOMEM; | 226 | return -ENOMEM; |
224 | } | 227 | } |
225 | 228 | ||
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |||
246 | 249 | ||
247 | paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); | 250 | paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); |
248 | pmd_free(mm, pmd); | 251 | pmd_free(mm, pmd); |
252 | mm_dec_nr_pmds(mm); | ||
249 | } | 253 | } |
250 | } | 254 | } |
251 | } | 255 | } |
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
283 | 287 | ||
284 | mm->pgd = pgd; | 288 | mm->pgd = pgd; |
285 | 289 | ||
286 | if (preallocate_pmds(pmds) != 0) | 290 | if (preallocate_pmds(mm, pmds) != 0) |
287 | goto out_free_pgd; | 291 | goto out_free_pgd; |
288 | 292 | ||
289 | if (paravirt_pgd_alloc(mm) != 0) | 293 | if (paravirt_pgd_alloc(mm) != 0) |
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
304 | return pgd; | 308 | return pgd; |
305 | 309 | ||
306 | out_free_pmds: | 310 | out_free_pmds: |
307 | free_pmds(pmds); | 311 | free_pmds(mm, pmds); |
308 | out_free_pgd: | 312 | out_free_pgd: |
309 | free_page((unsigned long)pgd); | 313 | free_page((unsigned long)pgd); |
310 | out: | 314 | out: |
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 01b80dce9d65..a5e929a10c20 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h | |||
@@ -57,7 +57,7 @@ | |||
57 | #define PTRS_PER_PGD 1024 | 57 | #define PTRS_PER_PGD 1024 |
58 | #define PGD_ORDER 0 | 58 | #define PGD_ORDER 0 |
59 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) | 59 | #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) |
60 | #define FIRST_USER_ADDRESS 0 | 60 | #define FIRST_USER_ADDRESS 0UL |
61 | #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) | 61 | #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) |
62 | 62 | ||
63 | /* | 63 | /* |
diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index bee2329e0b2e..24152accc66c 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c | |||
@@ -124,10 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, | |||
124 | } | 124 | } |
125 | 125 | ||
126 | /* Get user pages for DMA Xfer */ | 126 | /* Get user pages for DMA Xfer */ |
127 | down_read(¤t->mm->mmap_sem); | 127 | err = get_user_pages_unlocked(current, current->mm, |
128 | err = get_user_pages(current, current->mm, | 128 | user_dma.uaddr, user_dma.page_count, 0, 1, dma->map); |
129 | user_dma.uaddr, user_dma.page_count, 0, 1, dma->map, NULL); | ||
130 | up_read(¤t->mm->mmap_sem); | ||
131 | 129 | ||
132 | if (user_dma.page_count != err) { | 130 | if (user_dma.page_count != err) { |
133 | IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", | 131 | IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", |
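
This conversion (and the scsi/st and pvr2fb hunks below) replaces the open-coded down_read(mmap_sem) / get_user_pages() / up_read() sequence with get_user_pages_unlocked(), which takes and releases mmap_sem itself and may temporarily drop it while faulting pages in. A loose userspace analogue of the caller-side simplification only, with a pthread rwlock standing in for mmap_sem; the fault-retry behaviour of the real helper is not modelled:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for get_user_pages(): caller must already hold the lock for read. */
static int pin_pages_locked(int nr_pages)
{
	return nr_pages;	/* pretend every page was pinned */
}

/* Stand-in for get_user_pages_unlocked(): takes and drops the lock itself. */
static int pin_pages_unlocked(int nr_pages)
{
	int ret;

	pthread_rwlock_rdlock(&mmap_sem);
	ret = pin_pages_locked(nr_pages);
	pthread_rwlock_unlock(&mmap_sem);
	return ret;
}

int main(void)
{
	/* Old caller pattern: three statements and a lock the caller must not forget. */
	pthread_rwlock_rdlock(&mmap_sem);
	int a = pin_pages_locked(8);
	pthread_rwlock_unlock(&mmap_sem);

	/* New caller pattern: one call, locking handled by the helper. */
	int b = pin_pages_unlocked(8);

	printf("%d %d\n", a, b);
	return 0;
}
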
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 128d3b55bdd9..9a1c34205254 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c | |||
@@ -4551,18 +4551,15 @@ static int sgl_map_user_pages(struct st_buffer *STbp, | |||
4551 | return -ENOMEM; | 4551 | return -ENOMEM; |
4552 | 4552 | ||
4553 | /* Try to fault in all of the necessary pages */ | 4553 | /* Try to fault in all of the necessary pages */ |
4554 | down_read(¤t->mm->mmap_sem); | ||
4555 | /* rw==READ means read from drive, write into memory area */ | 4554 | /* rw==READ means read from drive, write into memory area */ |
4556 | res = get_user_pages( | 4555 | res = get_user_pages_unlocked( |
4557 | current, | 4556 | current, |
4558 | current->mm, | 4557 | current->mm, |
4559 | uaddr, | 4558 | uaddr, |
4560 | nr_pages, | 4559 | nr_pages, |
4561 | rw == READ, | 4560 | rw == READ, |
4562 | 0, /* don't force */ | 4561 | 0, /* don't force */ |
4563 | pages, | 4562 | pages); |
4564 | NULL); | ||
4565 | up_read(¤t->mm->mmap_sem); | ||
4566 | 4563 | ||
4567 | /* Errors and no page mapped should return here */ | 4564 | /* Errors and no page mapped should return here */ |
4568 | if (res < nr_pages) | 4565 | if (res < nr_pages) |
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b545d3d1da3e..feafa172b155 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c | |||
@@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) | |||
160 | selected->pid, selected->comm, | 160 | selected->pid, selected->comm, |
161 | selected_oom_score_adj, selected_tasksize); | 161 | selected_oom_score_adj, selected_tasksize); |
162 | lowmem_deathpending_timeout = jiffies + HZ; | 162 | lowmem_deathpending_timeout = jiffies + HZ; |
163 | set_tsk_thread_flag(selected, TIF_MEMDIE); | 163 | /* |
164 | * FIXME: lowmemorykiller shouldn't abuse global OOM killer | ||
165 | * infrastructure. There is no real reason why the selected | ||
166 | * task should have access to the memory reserves. | ||
167 | */ | ||
168 | mark_tsk_oom_victim(selected); | ||
164 | send_sig(SIGKILL, selected, 0); | 169 | send_sig(SIGKILL, selected, 0); |
165 | rem += selected_tasksize; | 170 | rem += selected_tasksize; |
166 | } | 171 | } |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 42bad18c66c9..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
@@ -90,7 +90,7 @@ static void sysrq_handle_loglevel(int key) | |||
90 | 90 | ||
91 | i = key - '0'; | 91 | i = key - '0'; |
92 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; | 92 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; |
93 | printk("Loglevel set to %d\n", i); | 93 | pr_info("Loglevel set to %d\n", i); |
94 | console_loglevel = i; | 94 | console_loglevel = i; |
95 | } | 95 | } |
96 | static struct sysrq_key_op sysrq_loglevel_op = { | 96 | static struct sysrq_key_op sysrq_loglevel_op = { |
@@ -220,7 +220,7 @@ static void showacpu(void *dummy) | |||
220 | return; | 220 | return; |
221 | 221 | ||
222 | spin_lock_irqsave(&show_lock, flags); | 222 | spin_lock_irqsave(&show_lock, flags); |
223 | printk(KERN_INFO "CPU%d:\n", smp_processor_id()); | 223 | pr_info("CPU%d:\n", smp_processor_id()); |
224 | show_stack(NULL, NULL); | 224 | show_stack(NULL, NULL); |
225 | spin_unlock_irqrestore(&show_lock, flags); | 225 | spin_unlock_irqrestore(&show_lock, flags); |
226 | } | 226 | } |
@@ -243,7 +243,7 @@ static void sysrq_handle_showallcpus(int key) | |||
243 | struct pt_regs *regs = get_irq_regs(); | 243 | struct pt_regs *regs = get_irq_regs(); |
244 | 244 | ||
245 | if (regs) { | 245 | if (regs) { |
246 | printk(KERN_INFO "CPU%d:\n", smp_processor_id()); | 246 | pr_info("CPU%d:\n", smp_processor_id()); |
247 | show_regs(regs); | 247 | show_regs(regs); |
248 | } | 248 | } |
249 | schedule_work(&sysrq_showallcpus); | 249 | schedule_work(&sysrq_showallcpus); |
@@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { | |||
355 | 355 | ||
356 | static void moom_callback(struct work_struct *ignored) | 356 | static void moom_callback(struct work_struct *ignored) |
357 | { | 357 | { |
358 | out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, | 358 | if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), |
359 | 0, NULL, true); | 359 | GFP_KERNEL, 0, NULL, true)) |
360 | pr_info("OOM request ignored because killer is disabled\n"); | ||
360 | } | 361 | } |
361 | 362 | ||
362 | static DECLARE_WORK(moom_work, moom_callback); | 363 | static DECLARE_WORK(moom_work, moom_callback); |
@@ -522,7 +523,7 @@ void __handle_sysrq(int key, bool check_mask) | |||
522 | */ | 523 | */ |
523 | orig_log_level = console_loglevel; | 524 | orig_log_level = console_loglevel; |
524 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; | 525 | console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; |
525 | printk(KERN_INFO "SysRq : "); | 526 | pr_info("SysRq : "); |
526 | 527 | ||
527 | op_p = __sysrq_get_key_op(key); | 528 | op_p = __sysrq_get_key_op(key); |
528 | if (op_p) { | 529 | if (op_p) { |
@@ -531,14 +532,14 @@ void __handle_sysrq(int key, bool check_mask) | |||
531 | * should not) and is the invoked operation enabled? | 532 | * should not) and is the invoked operation enabled? |
532 | */ | 533 | */ |
533 | if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { | 534 | if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { |
534 | printk("%s\n", op_p->action_msg); | 535 | pr_cont("%s\n", op_p->action_msg); |
535 | console_loglevel = orig_log_level; | 536 | console_loglevel = orig_log_level; |
536 | op_p->handler(key); | 537 | op_p->handler(key); |
537 | } else { | 538 | } else { |
538 | printk("This sysrq operation is disabled.\n"); | 539 | pr_cont("This sysrq operation is disabled.\n"); |
539 | } | 540 | } |
540 | } else { | 541 | } else { |
541 | printk("HELP : "); | 542 | pr_cont("HELP : "); |
542 | /* Only print the help msg once per handler */ | 543 | /* Only print the help msg once per handler */ |
543 | for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { | 544 | for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { |
544 | if (sysrq_key_table[i]) { | 545 | if (sysrq_key_table[i]) { |
@@ -549,10 +550,10 @@ void __handle_sysrq(int key, bool check_mask) | |||
549 | ; | 550 | ; |
550 | if (j != i) | 551 | if (j != i) |
551 | continue; | 552 | continue; |
552 | printk("%s ", sysrq_key_table[i]->help_msg); | 553 | pr_cont("%s ", sysrq_key_table[i]->help_msg); |
553 | } | 554 | } |
554 | } | 555 | } |
555 | printk("\n"); | 556 | pr_cont("\n"); |
556 | console_loglevel = orig_log_level; | 557 | console_loglevel = orig_log_level; |
557 | } | 558 | } |
558 | rcu_read_unlock(); | 559 | rcu_read_unlock(); |
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index 7c74f58fc101..0e24eb9c219c 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c | |||
@@ -686,10 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, | |||
686 | if (!pages) | 686 | if (!pages) |
687 | return -ENOMEM; | 687 | return -ENOMEM; |
688 | 688 | ||
689 | down_read(¤t->mm->mmap_sem); | 689 | ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf, |
690 | ret = get_user_pages(current, current->mm, (unsigned long)buf, | 690 | nr_pages, WRITE, 0, pages); |
691 | nr_pages, WRITE, 0, pages, NULL); | ||
692 | up_read(¤t->mm->mmap_sem); | ||
693 | 691 | ||
694 | if (ret < nr_pages) { | 692 | if (ret < nr_pages) { |
695 | nr_pages = ret; | 693 | nr_pages = ret; |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 790dbae3343c..c73df6a7c9b6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) | |||
1407 | while (index <= end_index) { | 1407 | while (index <= end_index) { |
1408 | page = find_get_page(inode->i_mapping, index); | 1408 | page = find_get_page(inode->i_mapping, index); |
1409 | BUG_ON(!page); /* Pages should be in the extent_io_tree */ | 1409 | BUG_ON(!page); /* Pages should be in the extent_io_tree */ |
1410 | account_page_redirty(page); | ||
1411 | __set_page_dirty_nobuffers(page); | 1410 | __set_page_dirty_nobuffers(page); |
1411 | account_page_redirty(page); | ||
1412 | page_cache_release(page); | 1412 | page_cache_release(page); |
1413 | index++; | 1413 | index++; |
1414 | } | 1414 | } |
diff --git a/fs/proc/page.c b/fs/proc/page.c index 1e3187da1fed..7eee2d8b97d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/ksm.h> | 5 | #include <linux/ksm.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmzone.h> | 7 | #include <linux/mmzone.h> |
8 | #include <linux/huge_mm.h> | ||
8 | #include <linux/proc_fs.h> | 9 | #include <linux/proc_fs.h> |
9 | #include <linux/seq_file.h> | 10 | #include <linux/seq_file.h> |
10 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page) | |||
121 | * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon | 122 | * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon |
122 | * to make sure a given page is a thp, not a non-huge compound page. | 123 | * to make sure a given page is a thp, not a non-huge compound page. |
123 | */ | 124 | */ |
124 | else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || | 125 | else if (PageTransCompound(page)) { |
125 | PageAnon(compound_head(page)))) | 126 | struct page *head = compound_head(page); |
126 | u |= 1 << KPF_THP; | 127 | |
128 | if (PageLRU(head) || PageAnon(head)) | ||
129 | u |= 1 << KPF_THP; | ||
130 | else if (is_huge_zero_page(head)) { | ||
131 | u |= 1 << KPF_ZERO_PAGE; | ||
132 | u |= 1 << KPF_THP; | ||
133 | } | ||
134 | } else if (is_zero_pfn(page_to_pfn(page))) | ||
135 | u |= 1 << KPF_ZERO_PAGE; | ||
136 | |||
127 | 137 | ||
128 | /* | 138 | /* |
129 | * Caveats on high order pages: page->_count will only be set | 139 | * Caveats on high order pages: page->_count will only be set |
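
The fs/proc/page.c hunk extends stable_page_flags() so that the huge zero page reports both KPF_THP and the new KPF_ZERO_PAGE, the ordinary zero page reports KPF_ZERO_PAGE, and regular THPs keep reporting only KPF_THP. A small userspace model of just that branch; the bit numbers (22 for KPF_THP, 24 for the new flag) follow the uapi kernel-page-flags header but should be treated as illustrative here:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define KPF_THP		22
#define KPF_ZERO_PAGE	24

/* Model of the new branch in stable_page_flags(). */
static uint64_t thp_zero_flags(bool trans_compound, bool head_lru_or_anon,
			       bool is_huge_zero, bool is_zero_pfn)
{
	uint64_t u = 0;

	if (trans_compound) {
		if (head_lru_or_anon)
			u |= 1ULL << KPF_THP;
		else if (is_huge_zero)
			u |= (1ULL << KPF_ZERO_PAGE) | (1ULL << KPF_THP);
	} else if (is_zero_pfn) {
		u |= 1ULL << KPF_ZERO_PAGE;
	}
	return u;
}

int main(void)
{
	printf("anon THP      : %#llx\n", (unsigned long long)thp_zero_flags(true, true, false, false));
	printf("huge zero page: %#llx\n", (unsigned long long)thp_zero_flags(true, false, true, false));
	printf("zero page     : %#llx\n", (unsigned long long)thp_zero_flags(false, false, false, true));
	return 0;
}
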
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88c6687..0e36c1e49fe3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -21,7 +21,7 @@ | |||
21 | 21 | ||
22 | void task_mem(struct seq_file *m, struct mm_struct *mm) | 22 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
23 | { | 23 | { |
24 | unsigned long data, text, lib, swap; | 24 | unsigned long data, text, lib, swap, ptes, pmds; |
25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | 25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; |
26 | 26 | ||
27 | /* | 27 | /* |
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; | 42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; | 43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
44 | swap = get_mm_counter(mm, MM_SWAPENTS); | 44 | swap = get_mm_counter(mm, MM_SWAPENTS); |
45 | ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); | ||
46 | pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); | ||
45 | seq_printf(m, | 47 | seq_printf(m, |
46 | "VmPeak:\t%8lu kB\n" | 48 | "VmPeak:\t%8lu kB\n" |
47 | "VmSize:\t%8lu kB\n" | 49 | "VmSize:\t%8lu kB\n" |
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
54 | "VmExe:\t%8lu kB\n" | 56 | "VmExe:\t%8lu kB\n" |
55 | "VmLib:\t%8lu kB\n" | 57 | "VmLib:\t%8lu kB\n" |
56 | "VmPTE:\t%8lu kB\n" | 58 | "VmPTE:\t%8lu kB\n" |
59 | "VmPMD:\t%8lu kB\n" | ||
57 | "VmSwap:\t%8lu kB\n", | 60 | "VmSwap:\t%8lu kB\n", |
58 | hiwater_vm << (PAGE_SHIFT-10), | 61 | hiwater_vm << (PAGE_SHIFT-10), |
59 | total_vm << (PAGE_SHIFT-10), | 62 | total_vm << (PAGE_SHIFT-10), |
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
63 | total_rss << (PAGE_SHIFT-10), | 66 | total_rss << (PAGE_SHIFT-10), |
64 | data << (PAGE_SHIFT-10), | 67 | data << (PAGE_SHIFT-10), |
65 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, | 68 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
66 | (PTRS_PER_PTE * sizeof(pte_t) * | 69 | ptes >> 10, |
67 | atomic_long_read(&mm->nr_ptes)) >> 10, | 70 | pmds >> 10, |
68 | swap << (PAGE_SHIFT-10)); | 71 | swap << (PAGE_SHIFT-10)); |
69 | } | 72 | } |
70 | 73 | ||
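
With the new counter in place, task_mem() reports page table memory as PTRS_PER_PTE * sizeof(pte_t) * nr_ptes and PTRS_PER_PMD * sizeof(pmd_t) * nr_pmds, shifted right by 10 to print kB. A worked example for an x86-64-like geometry (512 eight-byte entries, i.e. one 4 KiB page per table) and hypothetical counter values:

#include <stdio.h>

int main(void)
{
	/* x86-64-like geometry: 512 eight-byte entries per table page. */
	const unsigned long PTRS_PER_PTE = 512, PTRS_PER_PMD = 512;
	const unsigned long pte_size = 8, pmd_size = 8;

	unsigned long nr_ptes = 20, nr_pmds = 3;	/* hypothetical per-mm counters */

	unsigned long ptes = PTRS_PER_PTE * pte_size * nr_ptes;	/* bytes */
	unsigned long pmds = PTRS_PER_PMD * pmd_size * nr_pmds;	/* bytes */

	printf("VmPTE:\t%8lu kB\n", ptes >> 10);	/* 80 kB */
	printf("VmPMD:\t%8lu kB\n", pmds >> 10);	/* 12 kB */
	return 0;
}
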
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = { | |||
433 | 436 | ||
434 | #ifdef CONFIG_PROC_PAGE_MONITOR | 437 | #ifdef CONFIG_PROC_PAGE_MONITOR |
435 | struct mem_size_stats { | 438 | struct mem_size_stats { |
436 | struct vm_area_struct *vma; | ||
437 | unsigned long resident; | 439 | unsigned long resident; |
438 | unsigned long shared_clean; | 440 | unsigned long shared_clean; |
439 | unsigned long shared_dirty; | 441 | unsigned long shared_dirty; |
@@ -482,7 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, | |||
482 | struct mm_walk *walk) | 484 | struct mm_walk *walk) |
483 | { | 485 | { |
484 | struct mem_size_stats *mss = walk->private; | 486 | struct mem_size_stats *mss = walk->private; |
485 | struct vm_area_struct *vma = mss->vma; | 487 | struct vm_area_struct *vma = walk->vma; |
486 | struct page *page = NULL; | 488 | struct page *page = NULL; |
487 | 489 | ||
488 | if (pte_present(*pte)) { | 490 | if (pte_present(*pte)) { |
@@ -506,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
506 | struct mm_walk *walk) | 508 | struct mm_walk *walk) |
507 | { | 509 | { |
508 | struct mem_size_stats *mss = walk->private; | 510 | struct mem_size_stats *mss = walk->private; |
509 | struct vm_area_struct *vma = mss->vma; | 511 | struct vm_area_struct *vma = walk->vma; |
510 | struct page *page; | 512 | struct page *page; |
511 | 513 | ||
512 | /* FOLL_DUMP will return -EFAULT on huge zero page */ | 514 | /* FOLL_DUMP will return -EFAULT on huge zero page */ |
@@ -527,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
527 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 529 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
528 | struct mm_walk *walk) | 530 | struct mm_walk *walk) |
529 | { | 531 | { |
530 | struct mem_size_stats *mss = walk->private; | 532 | struct vm_area_struct *vma = walk->vma; |
531 | struct vm_area_struct *vma = mss->vma; | ||
532 | pte_t *pte; | 533 | pte_t *pte; |
533 | spinlock_t *ptl; | 534 | spinlock_t *ptl; |
534 | 535 | ||
@@ -620,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
620 | }; | 621 | }; |
621 | 622 | ||
622 | memset(&mss, 0, sizeof mss); | 623 | memset(&mss, 0, sizeof mss); |
623 | mss.vma = vma; | ||
624 | /* mmap_sem is held in m_start */ | 624 | /* mmap_sem is held in m_start */ |
625 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) | 625 | walk_page_vma(vma, &smaps_walk); |
626 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); | ||
627 | 626 | ||
628 | show_map_vma(m, vma, is_pid); | 627 | show_map_vma(m, vma, is_pid); |
629 | 628 | ||
@@ -737,14 +736,13 @@ enum clear_refs_types { | |||
737 | }; | 736 | }; |
738 | 737 | ||
739 | struct clear_refs_private { | 738 | struct clear_refs_private { |
740 | struct vm_area_struct *vma; | ||
741 | enum clear_refs_types type; | 739 | enum clear_refs_types type; |
742 | }; | 740 | }; |
743 | 741 | ||
742 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
744 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | 743 | static inline void clear_soft_dirty(struct vm_area_struct *vma, |
745 | unsigned long addr, pte_t *pte) | 744 | unsigned long addr, pte_t *pte) |
746 | { | 745 | { |
747 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
748 | /* | 746 | /* |
749 | * The soft-dirty tracker uses #PF-s to catch writes | 747 | * The soft-dirty tracker uses #PF-s to catch writes |
750 | * to pages, so write-protect the pte as well. See the | 748 | * to pages, so write-protect the pte as well. See the |
@@ -761,19 +759,60 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
761 | } | 759 | } |
762 | 760 | ||
763 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 761 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
764 | #endif | ||
765 | } | 762 | } |
766 | 763 | ||
764 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | ||
765 | unsigned long addr, pmd_t *pmdp) | ||
766 | { | ||
767 | pmd_t pmd = *pmdp; | ||
768 | |||
769 | pmd = pmd_wrprotect(pmd); | ||
770 | pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); | ||
771 | |||
772 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
773 | vma->vm_flags &= ~VM_SOFTDIRTY; | ||
774 | |||
775 | set_pmd_at(vma->vm_mm, addr, pmdp, pmd); | ||
776 | } | ||
777 | |||
778 | #else | ||
779 | |||
780 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | ||
781 | unsigned long addr, pte_t *pte) | ||
782 | { | ||
783 | } | ||
784 | |||
785 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | ||
786 | unsigned long addr, pmd_t *pmdp) | ||
787 | { | ||
788 | } | ||
789 | #endif | ||
790 | |||
767 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | 791 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, |
768 | unsigned long end, struct mm_walk *walk) | 792 | unsigned long end, struct mm_walk *walk) |
769 | { | 793 | { |
770 | struct clear_refs_private *cp = walk->private; | 794 | struct clear_refs_private *cp = walk->private; |
771 | struct vm_area_struct *vma = cp->vma; | 795 | struct vm_area_struct *vma = walk->vma; |
772 | pte_t *pte, ptent; | 796 | pte_t *pte, ptent; |
773 | spinlock_t *ptl; | 797 | spinlock_t *ptl; |
774 | struct page *page; | 798 | struct page *page; |
775 | 799 | ||
776 | split_huge_page_pmd(vma, addr, pmd); | 800 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
801 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { | ||
802 | clear_soft_dirty_pmd(vma, addr, pmd); | ||
803 | goto out; | ||
804 | } | ||
805 | |||
806 | page = pmd_page(*pmd); | ||
807 | |||
808 | /* Clear accessed and referenced bits. */ | ||
809 | pmdp_test_and_clear_young(vma, addr, pmd); | ||
810 | ClearPageReferenced(page); | ||
811 | out: | ||
812 | spin_unlock(ptl); | ||
813 | return 0; | ||
814 | } | ||
815 | |||
777 | if (pmd_trans_unstable(pmd)) | 816 | if (pmd_trans_unstable(pmd)) |
778 | return 0; | 817 | return 0; |
779 | 818 | ||
@@ -802,6 +841,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
802 | return 0; | 841 | return 0; |
803 | } | 842 | } |
804 | 843 | ||
844 | static int clear_refs_test_walk(unsigned long start, unsigned long end, | ||
845 | struct mm_walk *walk) | ||
846 | { | ||
847 | struct clear_refs_private *cp = walk->private; | ||
848 | struct vm_area_struct *vma = walk->vma; | ||
849 | |||
850 | if (vma->vm_flags & VM_PFNMAP) | ||
851 | return 1; | ||
852 | |||
853 | /* | ||
854 | * Writing 1 to /proc/pid/clear_refs affects all pages. | ||
855 | * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. | ||
856 | * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. | ||
857 | * Writing 4 to /proc/pid/clear_refs affects all pages. | ||
858 | */ | ||
859 | if (cp->type == CLEAR_REFS_ANON && vma->vm_file) | ||
860 | return 1; | ||
861 | if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) | ||
862 | return 1; | ||
863 | return 0; | ||
864 | } | ||
865 | |||
805 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, | 866 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
806 | size_t count, loff_t *ppos) | 867 | size_t count, loff_t *ppos) |
807 | { | 868 | { |
@@ -842,6 +903,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
842 | }; | 903 | }; |
843 | struct mm_walk clear_refs_walk = { | 904 | struct mm_walk clear_refs_walk = { |
844 | .pmd_entry = clear_refs_pte_range, | 905 | .pmd_entry = clear_refs_pte_range, |
906 | .test_walk = clear_refs_test_walk, | ||
845 | .mm = mm, | 907 | .mm = mm, |
846 | .private = &cp, | 908 | .private = &cp, |
847 | }; | 909 | }; |
@@ -861,28 +923,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
861 | } | 923 | } |
862 | mmu_notifier_invalidate_range_start(mm, 0, -1); | 924 | mmu_notifier_invalidate_range_start(mm, 0, -1); |
863 | } | 925 | } |
864 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 926 | walk_page_range(0, ~0UL, &clear_refs_walk); |
865 | cp.vma = vma; | ||
866 | if (is_vm_hugetlb_page(vma)) | ||
867 | continue; | ||
868 | /* | ||
869 | * Writing 1 to /proc/pid/clear_refs affects all pages. | ||
870 | * | ||
871 | * Writing 2 to /proc/pid/clear_refs only affects | ||
872 | * Anonymous pages. | ||
873 | * | ||
874 | * Writing 3 to /proc/pid/clear_refs only affects file | ||
875 | * mapped pages. | ||
876 | * | ||
877 | * Writing 4 to /proc/pid/clear_refs affects all pages. | ||
878 | */ | ||
879 | if (type == CLEAR_REFS_ANON && vma->vm_file) | ||
880 | continue; | ||
881 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) | ||
882 | continue; | ||
883 | walk_page_range(vma->vm_start, vma->vm_end, | ||
884 | &clear_refs_walk); | ||
885 | } | ||
886 | if (type == CLEAR_REFS_SOFT_DIRTY) | 927 | if (type == CLEAR_REFS_SOFT_DIRTY) |
887 | mmu_notifier_invalidate_range_end(mm, 0, -1); | 928 | mmu_notifier_invalidate_range_end(mm, 0, -1); |
888 | flush_tlb_mm(mm); | 929 | flush_tlb_mm(mm); |
@@ -1050,15 +1091,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap | |||
1050 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 1091 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
1051 | struct mm_walk *walk) | 1092 | struct mm_walk *walk) |
1052 | { | 1093 | { |
1053 | struct vm_area_struct *vma; | 1094 | struct vm_area_struct *vma = walk->vma; |
1054 | struct pagemapread *pm = walk->private; | 1095 | struct pagemapread *pm = walk->private; |
1055 | spinlock_t *ptl; | 1096 | spinlock_t *ptl; |
1056 | pte_t *pte; | 1097 | pte_t *pte, *orig_pte; |
1057 | int err = 0; | 1098 | int err = 0; |
1058 | 1099 | ||
1059 | /* find the first VMA at or above 'addr' */ | 1100 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1060 | vma = find_vma(walk->mm, addr); | ||
1061 | if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
1062 | int pmd_flags2; | 1101 | int pmd_flags2; |
1063 | 1102 | ||
1064 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) | 1103 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) |
@@ -1084,51 +1123,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1084 | if (pmd_trans_unstable(pmd)) | 1123 | if (pmd_trans_unstable(pmd)) |
1085 | return 0; | 1124 | return 0; |
1086 | 1125 | ||
1087 | while (1) { | 1126 | /* |
1088 | /* End of address space hole, which we mark as non-present. */ | 1127 | * We can assume that @vma always points to a valid one and @end never |
1089 | unsigned long hole_end; | 1128 | * goes beyond vma->vm_end. |
1090 | 1129 | */ | |
1091 | if (vma) | 1130 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
1092 | hole_end = min(end, vma->vm_start); | 1131 | for (; addr < end; pte++, addr += PAGE_SIZE) { |
1093 | else | 1132 | pagemap_entry_t pme; |
1094 | hole_end = end; | ||
1095 | |||
1096 | for (; addr < hole_end; addr += PAGE_SIZE) { | ||
1097 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | ||
1098 | |||
1099 | err = add_to_pagemap(addr, &pme, pm); | ||
1100 | if (err) | ||
1101 | return err; | ||
1102 | } | ||
1103 | |||
1104 | if (!vma || vma->vm_start >= end) | ||
1105 | break; | ||
1106 | /* | ||
1107 | * We can't possibly be in a hugetlb VMA. In general, | ||
1108 | * for a mm_walk with a pmd_entry and a hugetlb_entry, | ||
1109 | * the pmd_entry can only be called on addresses in a | ||
1110 | * hugetlb if the walk starts in a non-hugetlb VMA and | ||
1111 | * spans a hugepage VMA. Since pagemap_read walks are | ||
1112 | * PMD-sized and PMD-aligned, this will never be true. | ||
1113 | */ | ||
1114 | BUG_ON(is_vm_hugetlb_page(vma)); | ||
1115 | |||
1116 | /* Addresses in the VMA. */ | ||
1117 | for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { | ||
1118 | pagemap_entry_t pme; | ||
1119 | pte = pte_offset_map(pmd, addr); | ||
1120 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); | ||
1121 | pte_unmap(pte); | ||
1122 | err = add_to_pagemap(addr, &pme, pm); | ||
1123 | if (err) | ||
1124 | return err; | ||
1125 | } | ||
1126 | 1133 | ||
1127 | if (addr == end) | 1134 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); |
1135 | err = add_to_pagemap(addr, &pme, pm); | ||
1136 | if (err) | ||
1128 | break; | 1137 | break; |
1129 | |||
1130 | vma = find_vma(walk->mm, addr); | ||
1131 | } | 1138 | } |
1139 | pte_unmap_unlock(orig_pte, ptl); | ||
1132 | 1140 | ||
1133 | cond_resched(); | 1141 | cond_resched(); |
1134 | 1142 | ||
@@ -1154,15 +1162,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
1154 | struct mm_walk *walk) | 1162 | struct mm_walk *walk) |
1155 | { | 1163 | { |
1156 | struct pagemapread *pm = walk->private; | 1164 | struct pagemapread *pm = walk->private; |
1157 | struct vm_area_struct *vma; | 1165 | struct vm_area_struct *vma = walk->vma; |
1158 | int err = 0; | 1166 | int err = 0; |
1159 | int flags2; | 1167 | int flags2; |
1160 | pagemap_entry_t pme; | 1168 | pagemap_entry_t pme; |
1161 | 1169 | ||
1162 | vma = find_vma(walk->mm, addr); | 1170 | if (vma->vm_flags & VM_SOFTDIRTY) |
1163 | WARN_ON_ONCE(!vma); | ||
1164 | |||
1165 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | ||
1166 | flags2 = __PM_SOFT_DIRTY; | 1171 | flags2 = __PM_SOFT_DIRTY; |
1167 | else | 1172 | else |
1168 | flags2 = 0; | 1173 | flags2 = 0; |
@@ -1322,7 +1327,6 @@ const struct file_operations proc_pagemap_operations = { | |||
1322 | #ifdef CONFIG_NUMA | 1327 | #ifdef CONFIG_NUMA |
1323 | 1328 | ||
1324 | struct numa_maps { | 1329 | struct numa_maps { |
1325 | struct vm_area_struct *vma; | ||
1326 | unsigned long pages; | 1330 | unsigned long pages; |
1327 | unsigned long anon; | 1331 | unsigned long anon; |
1328 | unsigned long active; | 1332 | unsigned long active; |
@@ -1391,18 +1395,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | |||
1391 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | 1395 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, |
1392 | unsigned long end, struct mm_walk *walk) | 1396 | unsigned long end, struct mm_walk *walk) |
1393 | { | 1397 | { |
1394 | struct numa_maps *md; | 1398 | struct numa_maps *md = walk->private; |
1399 | struct vm_area_struct *vma = walk->vma; | ||
1395 | spinlock_t *ptl; | 1400 | spinlock_t *ptl; |
1396 | pte_t *orig_pte; | 1401 | pte_t *orig_pte; |
1397 | pte_t *pte; | 1402 | pte_t *pte; |
1398 | 1403 | ||
1399 | md = walk->private; | 1404 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1400 | |||
1401 | if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { | ||
1402 | pte_t huge_pte = *(pte_t *)pmd; | 1405 | pte_t huge_pte = *(pte_t *)pmd; |
1403 | struct page *page; | 1406 | struct page *page; |
1404 | 1407 | ||
1405 | page = can_gather_numa_stats(huge_pte, md->vma, addr); | 1408 | page = can_gather_numa_stats(huge_pte, vma, addr); |
1406 | if (page) | 1409 | if (page) |
1407 | gather_stats(page, md, pte_dirty(huge_pte), | 1410 | gather_stats(page, md, pte_dirty(huge_pte), |
1408 | HPAGE_PMD_SIZE/PAGE_SIZE); | 1411 | HPAGE_PMD_SIZE/PAGE_SIZE); |
@@ -1414,7 +1417,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | |||
1414 | return 0; | 1417 | return 0; |
1415 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 1418 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
1416 | do { | 1419 | do { |
1417 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | 1420 | struct page *page = can_gather_numa_stats(*pte, vma, addr); |
1418 | if (!page) | 1421 | if (!page) |
1419 | continue; | 1422 | continue; |
1420 | gather_stats(page, md, pte_dirty(*pte), 1); | 1423 | gather_stats(page, md, pte_dirty(*pte), 1); |
@@ -1424,7 +1427,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | |||
1424 | return 0; | 1427 | return 0; |
1425 | } | 1428 | } |
1426 | #ifdef CONFIG_HUGETLB_PAGE | 1429 | #ifdef CONFIG_HUGETLB_PAGE |
1427 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1430 | static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, |
1428 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1431 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1429 | { | 1432 | { |
1430 | struct numa_maps *md; | 1433 | struct numa_maps *md; |
@@ -1443,7 +1446,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | |||
1443 | } | 1446 | } |
1444 | 1447 | ||
1445 | #else | 1448 | #else |
1446 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1449 | static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, |
1447 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1450 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1448 | { | 1451 | { |
1449 | return 0; | 1452 | return 0; |
@@ -1461,7 +1464,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1461 | struct numa_maps *md = &numa_priv->md; | 1464 | struct numa_maps *md = &numa_priv->md; |
1462 | struct file *file = vma->vm_file; | 1465 | struct file *file = vma->vm_file; |
1463 | struct mm_struct *mm = vma->vm_mm; | 1466 | struct mm_struct *mm = vma->vm_mm; |
1464 | struct mm_walk walk = {}; | 1467 | struct mm_walk walk = { |
1468 | .hugetlb_entry = gather_hugetlb_stats, | ||
1469 | .pmd_entry = gather_pte_stats, | ||
1470 | .private = md, | ||
1471 | .mm = mm, | ||
1472 | }; | ||
1465 | struct mempolicy *pol; | 1473 | struct mempolicy *pol; |
1466 | char buffer[64]; | 1474 | char buffer[64]; |
1467 | int nid; | 1475 | int nid; |
@@ -1472,13 +1480,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1472 | /* Ensure we start with an empty set of numa_maps statistics. */ | 1480 | /* Ensure we start with an empty set of numa_maps statistics. */ |
1473 | memset(md, 0, sizeof(*md)); | 1481 | memset(md, 0, sizeof(*md)); |
1474 | 1482 | ||
1475 | md->vma = vma; | ||
1476 | |||
1477 | walk.hugetlb_entry = gather_hugetbl_stats; | ||
1478 | walk.pmd_entry = gather_pte_stats; | ||
1479 | walk.private = md; | ||
1480 | walk.mm = mm; | ||
1481 | |||
1482 | pol = __get_vma_policy(vma, vma->vm_start); | 1483 | pol = __get_vma_policy(vma, vma->vm_start); |
1483 | if (pol) { | 1484 | if (pol) { |
1484 | mpol_to_str(buffer, sizeof(buffer), pol); | 1485 | mpol_to_str(buffer, sizeof(buffer), pol); |
@@ -1512,7 +1513,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1512 | if (is_vm_hugetlb_page(vma)) | 1513 | if (is_vm_hugetlb_page(vma)) |
1513 | seq_puts(m, " huge"); | 1514 | seq_puts(m, " huge"); |
1514 | 1515 | ||
1515 | walk_page_range(vma->vm_start, vma->vm_end, &walk); | 1516 | /* mmap_sem is held by m_start */ |
1517 | walk_page_vma(vma, &walk); | ||
1516 | 1518 | ||
1517 | if (!md->pages) | 1519 | if (!md->pages) |
1518 | goto out; | 1520 | goto out; |
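
After this rework, the per-VMA filtering that clear_refs_write() used to do in its own vma loop lives in clear_refs_test_walk(): PFN-mapped VMAs are skipped, CLEAR_REFS_ANON skips file-backed VMAs, CLEAR_REFS_MAPPED skips anonymous ones, and a return of 1 tells the generic walker to skip the VMA while 0 lets it be walked. A small userspace model of that filter, using the same type numbering that is written to /proc/pid/clear_refs:

#include <stdio.h>
#include <stdbool.h>

enum clear_refs_types {		/* values written to /proc/pid/clear_refs */
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
};

/* Model of clear_refs_test_walk(): 1 = skip this vma, 0 = walk it. */
static int test_walk_model(enum clear_refs_types type, bool pfnmap, bool has_file)
{
	if (pfnmap)
		return 1;
	if (type == CLEAR_REFS_ANON && has_file)
		return 1;
	if (type == CLEAR_REFS_MAPPED && !has_file)
		return 1;
	return 0;
}

int main(void)
{
	printf("%d\n", test_walk_model(CLEAR_REFS_ANON,   false, true));	/* 1: skip file-backed vma */
	printf("%d\n", test_walk_model(CLEAR_REFS_ANON,   false, false));	/* 0: walk anonymous vma */
	printf("%d\n", test_walk_model(CLEAR_REFS_MAPPED, false, false));	/* 1: skip anonymous vma */
	printf("%d\n", test_walk_model(CLEAR_REFS_ALL,    true,  true));	/* 1: skip VM_PFNMAP vma */
	return 0;
}
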
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index 77ff547730af..5bdab6bffd23 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #define __ARCH_HAS_4LEVEL_HACK | 4 | #define __ARCH_HAS_4LEVEL_HACK |
5 | #define __PAGETABLE_PUD_FOLDED | 5 | #define __PAGETABLE_PUD_FOLDED |
6 | 6 | ||
7 | #define PUD_SHIFT PGDIR_SHIFT | ||
7 | #define PUD_SIZE PGDIR_SIZE | 8 | #define PUD_SIZE PGDIR_SIZE |
8 | #define PUD_MASK PGDIR_MASK | 9 | #define PUD_MASK PGDIR_MASK |
9 | #define PTRS_PER_PUD 1 | 10 | #define PTRS_PER_PUD 1 |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3238ffa33f68..a014559e4a49 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -12,6 +12,10 @@ | |||
12 | #define COMPACT_PARTIAL 3 | 12 | #define COMPACT_PARTIAL 3 |
13 | /* The full zone was compacted */ | 13 | /* The full zone was compacted */ |
14 | #define COMPACT_COMPLETE 4 | 14 | #define COMPACT_COMPLETE 4 |
15 | /* For more detailed tracepoint output */ | ||
16 | #define COMPACT_NO_SUITABLE_PAGE 5 | ||
17 | #define COMPACT_NOT_SUITABLE_ZONE 6 | ||
18 | /* When adding new state, please change compaction_status_string, too */ | ||
15 | 19 | ||
16 | /* Used to signal whether compaction detected need_sched() or lock contention */ | 20 | /* Used to signal whether compaction detected need_sched() or lock contention */ |
17 | /* No contention detected */ | 21 | /* No contention detected */ |
@@ -21,6 +25,8 @@ | |||
21 | /* Zone lock or lru_lock was contended in async compaction */ | 25 | /* Zone lock or lru_lock was contended in async compaction */ |
22 | #define COMPACT_CONTENDED_LOCK 2 | 26 | #define COMPACT_CONTENDED_LOCK 2 |
23 | 27 | ||
28 | struct alloc_context; /* in mm/internal.h */ | ||
29 | |||
24 | #ifdef CONFIG_COMPACTION | 30 | #ifdef CONFIG_COMPACTION |
25 | extern int sysctl_compact_memory; | 31 | extern int sysctl_compact_memory; |
26 | extern int sysctl_compaction_handler(struct ctl_table *table, int write, | 32 | extern int sysctl_compaction_handler(struct ctl_table *table, int write, |
@@ -30,81 +36,25 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | |||
30 | void __user *buffer, size_t *length, loff_t *ppos); | 36 | void __user *buffer, size_t *length, loff_t *ppos); |
31 | 37 | ||
32 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 38 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
33 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 39 | extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
34 | int order, gfp_t gfp_mask, nodemask_t *mask, | 40 | int alloc_flags, const struct alloc_context *ac, |
35 | enum migrate_mode mode, int *contended, | 41 | enum migrate_mode mode, int *contended); |
36 | int alloc_flags, int classzone_idx); | ||
37 | extern void compact_pgdat(pg_data_t *pgdat, int order); | 42 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
38 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 43 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
39 | extern unsigned long compaction_suitable(struct zone *zone, int order, | 44 | extern unsigned long compaction_suitable(struct zone *zone, int order, |
40 | int alloc_flags, int classzone_idx); | 45 | int alloc_flags, int classzone_idx); |
41 | 46 | ||
42 | /* Do not skip compaction more than 64 times */ | 47 | extern void defer_compaction(struct zone *zone, int order); |
43 | #define COMPACT_MAX_DEFER_SHIFT 6 | 48 | extern bool compaction_deferred(struct zone *zone, int order); |
44 | 49 | extern void compaction_defer_reset(struct zone *zone, int order, | |
45 | /* | 50 | bool alloc_success); |
46 | * Compaction is deferred when compaction fails to result in a page | 51 | extern bool compaction_restarting(struct zone *zone, int order); |
47 | * allocation success. 1 << compact_defer_limit compactions are skipped up | ||
48 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | ||
49 | */ | ||
50 | static inline void defer_compaction(struct zone *zone, int order) | ||
51 | { | ||
52 | zone->compact_considered = 0; | ||
53 | zone->compact_defer_shift++; | ||
54 | |||
55 | if (order < zone->compact_order_failed) | ||
56 | zone->compact_order_failed = order; | ||
57 | |||
58 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | ||
59 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | ||
60 | } | ||
61 | |||
62 | /* Returns true if compaction should be skipped this time */ | ||
63 | static inline bool compaction_deferred(struct zone *zone, int order) | ||
64 | { | ||
65 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | ||
66 | |||
67 | if (order < zone->compact_order_failed) | ||
68 | return false; | ||
69 | |||
70 | /* Avoid possible overflow */ | ||
71 | if (++zone->compact_considered > defer_limit) | ||
72 | zone->compact_considered = defer_limit; | ||
73 | |||
74 | return zone->compact_considered < defer_limit; | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Update defer tracking counters after successful compaction of given order, | ||
79 | * which means an allocation either succeeded (alloc_success == true) or is | ||
80 | * expected to succeed. | ||
81 | */ | ||
82 | static inline void compaction_defer_reset(struct zone *zone, int order, | ||
83 | bool alloc_success) | ||
84 | { | ||
85 | if (alloc_success) { | ||
86 | zone->compact_considered = 0; | ||
87 | zone->compact_defer_shift = 0; | ||
88 | } | ||
89 | if (order >= zone->compact_order_failed) | ||
90 | zone->compact_order_failed = order + 1; | ||
91 | } | ||
92 | |||
93 | /* Returns true if restarting compaction after many failures */ | ||
94 | static inline bool compaction_restarting(struct zone *zone, int order) | ||
95 | { | ||
96 | if (order < zone->compact_order_failed) | ||
97 | return false; | ||
98 | |||
99 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
100 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
101 | } | ||
102 | 52 | ||
103 | #else | 53 | #else |
104 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | 54 | static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, |
105 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 55 | unsigned int order, int alloc_flags, |
106 | enum migrate_mode mode, int *contended, | 56 | const struct alloc_context *ac, |
107 | int alloc_flags, int classzone_idx) | 57 | enum migrate_mode mode, int *contended) |
108 | { | 58 | { |
109 | return COMPACT_CONTINUE; | 59 | return COMPACT_CONTINUE; |
110 | } | 60 | } |
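
The deferral helpers removed from compaction.h above (they move into mm/compaction.c so they can gain tracepoints) implement an exponential back-off: each failed compaction widens the window of allocation attempts that skip compaction, capped at 1 << COMPACT_MAX_DEFER_SHIFT, and a success resets the counters. A minimal userspace model of that policy; the per-order failure tracking (compact_order_failed) is left out and the zone is reduced to two counters:

#include <stdio.h>
#include <stdbool.h>

#define COMPACT_MAX_DEFER_SHIFT 6	/* never defer for more than 1 << 6 attempts */

struct zone_model {
	unsigned int considered;	/* compact_considered */
	unsigned int defer_shift;	/* compact_defer_shift */
};

/* Called when compaction failed to help: widen the back-off window. */
static void defer_compaction(struct zone_model *z)
{
	z->considered = 0;
	if (++z->defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* Should this allocation attempt skip compaction? */
static bool compaction_deferred(struct zone_model *z)
{
	unsigned long limit = 1UL << z->defer_shift;

	if (++z->considered > limit)	/* avoid counter overflow */
		z->considered = limit;
	return z->considered < limit;
}

/* Called after a successful compaction: start trusting it again. */
static void compaction_defer_reset(struct zone_model *z)
{
	z->considered = 0;
	z->defer_shift = 0;
}

int main(void)
{
	struct zone_model z = { 0, 0 };
	int skipped = 0;

	defer_compaction(&z);		/* first failure  */
	defer_compaction(&z);		/* second failure: defer_shift is now 2 */

	for (int i = 0; i < 10; i++)
		if (compaction_deferred(&z))
			skipped++;
	printf("skipped %d of 10 attempts\n", skipped);	/* 3: the next three attempts skip compaction */

	compaction_defer_reset(&z);
	printf("deferred after reset: %d\n", compaction_deferred(&z));	/* 0 */
	return 0;
}
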
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b840e3b2770d..51bd1e72a917 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -334,18 +334,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) | |||
334 | } | 334 | } |
335 | extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, | 335 | extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, |
336 | struct vm_area_struct *vma, unsigned long addr, | 336 | struct vm_area_struct *vma, unsigned long addr, |
337 | int node); | 337 | int node, bool hugepage); |
338 | #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ | ||
339 | alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) | ||
338 | #else | 340 | #else |
339 | #define alloc_pages(gfp_mask, order) \ | 341 | #define alloc_pages(gfp_mask, order) \ |
340 | alloc_pages_node(numa_node_id(), gfp_mask, order) | 342 | alloc_pages_node(numa_node_id(), gfp_mask, order) |
341 | #define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ | 343 | #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ |
344 | alloc_pages(gfp_mask, order) | ||
345 | #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ | ||
342 | alloc_pages(gfp_mask, order) | 346 | alloc_pages(gfp_mask, order) |
343 | #endif | 347 | #endif |
344 | #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) | 348 | #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) |
345 | #define alloc_page_vma(gfp_mask, vma, addr) \ | 349 | #define alloc_page_vma(gfp_mask, vma, addr) \ |
346 | alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) | 350 | alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) |
347 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ | 351 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ |
348 | alloc_pages_vma(gfp_mask, 0, vma, addr, node) | 352 | alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) |
349 | 353 | ||
350 | extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); | 354 | extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); |
351 | extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, | 355 | extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ad9051bab267..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -157,6 +157,13 @@ static inline int hpage_nr_pages(struct page *page) | |||
157 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 157 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
158 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | 158 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); |
159 | 159 | ||
160 | extern struct page *huge_zero_page; | ||
161 | |||
162 | static inline bool is_huge_zero_page(struct page *page) | ||
163 | { | ||
164 | return ACCESS_ONCE(huge_zero_page) == page; | ||
165 | } | ||
166 | |||
160 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 167 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
161 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 168 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
162 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 169 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -206,6 +213,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str | |||
206 | return 0; | 213 | return 0; |
207 | } | 214 | } |
208 | 215 | ||
216 | static inline bool is_huge_zero_page(struct page *page) | ||
217 | { | ||
218 | return false; | ||
219 | } | ||
220 | |||
209 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 221 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
210 | 222 | ||
211 | #endif /* _LINUX_HUGE_MM_H */ | 223 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d7856359920..7b5785032049 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); | |||
99 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | 99 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, |
100 | int write); | 100 | int write); |
101 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 101 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
102 | pmd_t *pmd, int write); | 102 | pmd_t *pmd, int flags); |
103 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | 103 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, |
104 | pud_t *pud, int write); | 104 | pud_t *pud, int flags); |
105 | int pmd_huge(pmd_t pmd); | 105 | int pmd_huge(pmd_t pmd); |
106 | int pud_huge(pud_t pmd); | 106 | int pud_huge(pud_t pmd); |
107 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | 107 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) | |||
133 | static inline void hugetlb_show_meminfo(void) | 133 | static inline void hugetlb_show_meminfo(void) |
134 | { | 134 | { |
135 | } | 135 | } |
136 | #define follow_huge_pmd(mm, addr, pmd, write) NULL | 136 | #define follow_huge_pmd(mm, addr, pmd, flags) NULL |
137 | #define follow_huge_pud(mm, addr, pud, write) NULL | 137 | #define follow_huge_pud(mm, addr, pud, flags) NULL |
138 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) | 138 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) |
139 | #define pmd_huge(x) 0 | 139 | #define pmd_huge(x) 0 |
140 | #define pud_huge(x) 0 | 140 | #define pud_huge(x) 0 |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..d189ee098aa2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -200,17 +200,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, | |||
200 | int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); | 200 | int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); |
201 | #endif | 201 | #endif |
202 | 202 | ||
203 | /* | ||
204 | * Carry out a gup that requires IO. Allow the mm to relinquish the mmap | ||
205 | * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL | ||
206 | * controls whether we retry the gup one more time to completion in that case. | ||
207 | * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp | ||
208 | * handler. | ||
209 | */ | ||
210 | int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, | ||
211 | unsigned long addr, bool write_fault, | ||
212 | struct page **pagep); | ||
213 | |||
214 | enum { | 203 | enum { |
215 | OUTSIDE_GUEST_MODE, | 204 | OUTSIDE_GUEST_MODE, |
216 | IN_GUEST_MODE, | 205 | IN_GUEST_MODE, |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb212e1d700d..6cfd934c7c9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie { | |||
52 | unsigned int generation; | 52 | unsigned int generation; |
53 | }; | 53 | }; |
54 | 54 | ||
55 | enum mem_cgroup_events_index { | ||
56 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
57 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
58 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
59 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
60 | MEM_CGROUP_EVENTS_NSTATS, | ||
61 | /* default hierarchy events */ | ||
62 | MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, | ||
63 | MEMCG_HIGH, | ||
64 | MEMCG_MAX, | ||
65 | MEMCG_OOM, | ||
66 | MEMCG_NR_EVENTS, | ||
67 | }; | ||
68 | |||
55 | #ifdef CONFIG_MEMCG | 69 | #ifdef CONFIG_MEMCG |
70 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
71 | enum mem_cgroup_events_index idx, | ||
72 | unsigned int nr); | ||
73 | |||
74 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); | ||
75 | |||
56 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 76 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
57 | gfp_t gfp_mask, struct mem_cgroup **memcgp); | 77 | gfp_t gfp_mask, struct mem_cgroup **memcgp); |
58 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 78 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
@@ -102,6 +122,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | |||
102 | * For memory reclaim. | 122 | * For memory reclaim. |
103 | */ | 123 | */ |
104 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); | 124 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); |
125 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec); | ||
105 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); | 126 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); |
106 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); | 127 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); |
107 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); | 128 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); |
@@ -138,12 +159,10 @@ static inline bool mem_cgroup_disabled(void) | |||
138 | return false; | 159 | return false; |
139 | } | 160 | } |
140 | 161 | ||
141 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, | 162 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); |
142 | unsigned long *flags); | ||
143 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, | ||
144 | unsigned long *flags); | ||
145 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, | 163 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, |
146 | enum mem_cgroup_stat_index idx, int val); | 164 | enum mem_cgroup_stat_index idx, int val); |
165 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); | ||
147 | 166 | ||
148 | static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, | 167 | static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, |
149 | enum mem_cgroup_stat_index idx) | 168 | enum mem_cgroup_stat_index idx) |
@@ -176,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head); | |||
176 | #else /* CONFIG_MEMCG */ | 195 | #else /* CONFIG_MEMCG */ |
177 | struct mem_cgroup; | 196 | struct mem_cgroup; |
178 | 197 | ||
198 | static inline void mem_cgroup_events(struct mem_cgroup *memcg, | ||
199 | enum mem_cgroup_events_index idx, | ||
200 | unsigned int nr) | ||
201 | { | ||
202 | } | ||
203 | |||
204 | static inline bool mem_cgroup_low(struct mem_cgroup *root, | ||
205 | struct mem_cgroup *memcg) | ||
206 | { | ||
207 | return false; | ||
208 | } | ||
209 | |||
179 | static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 210 | static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
180 | gfp_t gfp_mask, | 211 | gfp_t gfp_mask, |
181 | struct mem_cgroup **memcgp) | 212 | struct mem_cgroup **memcgp) |
@@ -268,6 +299,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
268 | return 1; | 299 | return 1; |
269 | } | 300 | } |
270 | 301 | ||
302 | static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
303 | { | ||
304 | return true; | ||
305 | } | ||
306 | |||
271 | static inline unsigned long | 307 | static inline unsigned long |
272 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 308 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
273 | { | 309 | { |
@@ -285,14 +321,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
285 | { | 321 | { |
286 | } | 322 | } |
287 | 323 | ||
288 | static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 324 | static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) |
289 | bool *locked, unsigned long *flags) | ||
290 | { | 325 | { |
291 | return NULL; | 326 | return NULL; |
292 | } | 327 | } |
293 | 328 | ||
294 | static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, | 329 | static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) |
295 | bool *locked, unsigned long *flags) | ||
296 | { | 330 | { |
297 | } | 331 | } |
298 | 332 | ||
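
mem_cgroup_low(root, memcg), declared above for the new low boundary, is meant to tell reclaim whether a cgroup is still within its protected reserve along the whole ancestor chain up to the reclaim root. A hedged userspace sketch of that ancestor walk over a toy tree; the field names, units and root handling are illustrative, not the kernel's:

#include <stdio.h>
#include <stdbool.h>

struct memcg_model {
	const char *name;
	unsigned long usage;		/* current charge, in pages */
	unsigned long low;		/* memory.low, in pages */
	struct memcg_model *parent;
};

/* True if @memcg and every ancestor below @root is under its low boundary. */
static bool mem_cgroup_low_model(struct memcg_model *root, struct memcg_model *memcg)
{
	for (; memcg && memcg != root; memcg = memcg->parent)
		if (memcg->usage >= memcg->low)
			return false;
	return true;
}

int main(void)
{
	struct memcg_model root   = { "root",   0,    0,    NULL };
	struct memcg_model parent = { "parent", 800,  1000, &root };
	struct memcg_model child  = { "child",  300,  512,  &parent };

	printf("child protected : %d\n", mem_cgroup_low_model(&root, &child));	/* 1 */

	parent.usage = 1200;	/* an ancestor exceeds its reserve */
	printf("child protected : %d\n", mem_cgroup_low_model(&root, &child));	/* 0 */
	return 0;
}
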
diff --git a/include/linux/mm.h b/include/linux/mm.h index 65db4aee738a..a4d24f3c5430 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -484,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page) | |||
484 | 484 | ||
485 | static inline int page_mapcount(struct page *page) | 485 | static inline int page_mapcount(struct page *page) |
486 | { | 486 | { |
487 | return atomic_read(&(page)->_mapcount) + 1; | 487 | VM_BUG_ON_PAGE(PageSlab(page), page); |
488 | return atomic_read(&page->_mapcount) + 1; | ||
488 | } | 489 | } |
489 | 490 | ||
490 | static inline int page_count(struct page *page) | 491 | static inline int page_count(struct page *page) |
@@ -627,29 +628,28 @@ int split_free_page(struct page *page); | |||
627 | * prototype for that function and accessor functions. | 628 | * prototype for that function and accessor functions. |
628 | * These are _only_ valid on the head of a PG_compound page. | 629 | * These are _only_ valid on the head of a PG_compound page. |
629 | */ | 630 | */ |
630 | typedef void compound_page_dtor(struct page *); | ||
631 | 631 | ||
632 | static inline void set_compound_page_dtor(struct page *page, | 632 | static inline void set_compound_page_dtor(struct page *page, |
633 | compound_page_dtor *dtor) | 633 | compound_page_dtor *dtor) |
634 | { | 634 | { |
635 | page[1].lru.next = (void *)dtor; | 635 | page[1].compound_dtor = dtor; |
636 | } | 636 | } |
637 | 637 | ||
638 | static inline compound_page_dtor *get_compound_page_dtor(struct page *page) | 638 | static inline compound_page_dtor *get_compound_page_dtor(struct page *page) |
639 | { | 639 | { |
640 | return (compound_page_dtor *)page[1].lru.next; | 640 | return page[1].compound_dtor; |
641 | } | 641 | } |
642 | 642 | ||
643 | static inline int compound_order(struct page *page) | 643 | static inline int compound_order(struct page *page) |
644 | { | 644 | { |
645 | if (!PageHead(page)) | 645 | if (!PageHead(page)) |
646 | return 0; | 646 | return 0; |
647 | return (unsigned long)page[1].lru.prev; | 647 | return page[1].compound_order; |
648 | } | 648 | } |
649 | 649 | ||
650 | static inline void set_compound_order(struct page *page, unsigned long order) | 650 | static inline void set_compound_order(struct page *page, unsigned long order) |
651 | { | 651 | { |
652 | page[1].lru.prev = (void *)order; | 652 | page[1].compound_order = order; |
653 | } | 653 | } |
654 | 654 | ||
655 | #ifdef CONFIG_MMU | 655 | #ifdef CONFIG_MMU |
@@ -1164,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | |||
1164 | 1164 | ||
1165 | /** | 1165 | /** |
1166 | * mm_walk - callbacks for walk_page_range | 1166 | * mm_walk - callbacks for walk_page_range |
1167 | * @pgd_entry: if set, called for each non-empty PGD (top-level) entry | ||
1168 | * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry | ||
1169 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry | 1167 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry |
1170 | * this handler is required to be able to handle | 1168 | * this handler is required to be able to handle |
1171 | * pmd_trans_huge() pmds. They may simply choose to | 1169 | * pmd_trans_huge() pmds. They may simply choose to |
@@ -1173,16 +1171,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | |||
1173 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry | 1171 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry |
1174 | * @pte_hole: if set, called for each hole at all levels | 1172 | * @pte_hole: if set, called for each hole at all levels |
1175 | * @hugetlb_entry: if set, called for each hugetlb entry | 1173 | * @hugetlb_entry: if set, called for each hugetlb entry |
1176 | * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry | 1174 | * @test_walk: caller specific callback function to determine whether |
1177 | * is used. | 1175 | * we walk over the current vma or not. Returning 0 means |
1176 | * "do page table walk over the current vma," returning 1 |
1177 | * means "skip the current vma," and a negative value means |
1178 | * "abort the current page table walk right now." |
1179 | * @mm: mm_struct representing the target process of page table walk | ||
1180 | * @vma: vma currently walked (NULL if walking outside vmas) | ||
1181 | * @private: private data for callbacks' usage | ||
1178 | * | 1182 | * |
1179 | * (see walk_page_range for more details) | 1183 | * (see the comment on walk_page_range() for more details) |
1180 | */ | 1184 | */ |
1181 | struct mm_walk { | 1185 | struct mm_walk { |
1182 | int (*pgd_entry)(pgd_t *pgd, unsigned long addr, | ||
1183 | unsigned long next, struct mm_walk *walk); | ||
1184 | int (*pud_entry)(pud_t *pud, unsigned long addr, | ||
1185 | unsigned long next, struct mm_walk *walk); | ||
1186 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, | 1186 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, |
1187 | unsigned long next, struct mm_walk *walk); | 1187 | unsigned long next, struct mm_walk *walk); |
1188 | int (*pte_entry)(pte_t *pte, unsigned long addr, | 1188 | int (*pte_entry)(pte_t *pte, unsigned long addr, |
@@ -1192,12 +1192,16 @@ struct mm_walk { | |||
1192 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, | 1192 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, |
1193 | unsigned long addr, unsigned long next, | 1193 | unsigned long addr, unsigned long next, |
1194 | struct mm_walk *walk); | 1194 | struct mm_walk *walk); |
1195 | int (*test_walk)(unsigned long addr, unsigned long next, | ||
1196 | struct mm_walk *walk); | ||
1195 | struct mm_struct *mm; | 1197 | struct mm_struct *mm; |
1198 | struct vm_area_struct *vma; | ||
1196 | void *private; | 1199 | void *private; |
1197 | }; | 1200 | }; |
1198 | 1201 | ||
1199 | int walk_page_range(unsigned long addr, unsigned long end, | 1202 | int walk_page_range(unsigned long addr, unsigned long end, |
1200 | struct mm_walk *walk); | 1203 | struct mm_walk *walk); |
1204 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); | ||
1201 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 1205 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
1202 | unsigned long end, unsigned long floor, unsigned long ceiling); | 1206 | unsigned long end, unsigned long floor, unsigned long ceiling); |
1203 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, | 1207 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, |
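The reworked mm_walk drops the unused pgd/pud callbacks and gains ->vma plus ->test_walk with the skip/walk/abort convention documented above. A hedged sketch of a walker built on that interface; the VM_PFNMAP policy and the helper names are assumptions, only the structure fields and walk_page_range() come from the patch:

/* Count present PTEs in [start, end), skipping VM_PFNMAP vmas. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	if (pte_present(*pte))
		(*nr)++;
	return 0;
}

static int count_test_walk(unsigned long addr, unsigned long next,
			   struct mm_walk *walk)
{
	/* 0: skip this vma, >0: walk it, <0: abort the whole walk */
	return (walk->vma->vm_flags & VM_PFNMAP) ? 0 : 1;
}

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	unsigned long nr = 0;
	struct mm_walk walk = {
		.pte_entry	= count_pte_entry,
		.test_walk	= count_test_walk,
		.mm		= mm,
		.private	= &nr,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return nr;
}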
@@ -1261,6 +1265,17 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1261 | unsigned long start, unsigned long nr_pages, | 1265 | unsigned long start, unsigned long nr_pages, |
1262 | int write, int force, struct page **pages, | 1266 | int write, int force, struct page **pages, |
1263 | struct vm_area_struct **vmas); | 1267 | struct vm_area_struct **vmas); |
1268 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
1269 | unsigned long start, unsigned long nr_pages, | ||
1270 | int write, int force, struct page **pages, | ||
1271 | int *locked); | ||
1272 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
1273 | unsigned long start, unsigned long nr_pages, | ||
1274 | int write, int force, struct page **pages, | ||
1275 | unsigned int gup_flags); | ||
1276 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
1277 | unsigned long start, unsigned long nr_pages, | ||
1278 | int write, int force, struct page **pages); | ||
1264 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1279 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
1265 | struct page **pages); | 1280 | struct page **pages); |
1266 | struct kvec; | 1281 | struct kvec; |
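get_user_pages_locked() lets the fault path drop mmap_sem and reports that back through *locked, so the caller must only release the lock if it is still held. A hedged sketch of that calling protocol (the wrapper itself is an assumption; the prototype is the one declared above):

static long pin_one_user_page(struct task_struct *tsk, struct mm_struct *mm,
			      unsigned long addr, struct page **page)
{
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_locked(tsk, mm, addr, 1,
				    1 /* write */, 0 /* force */,
				    page, &locked);
	/* GUP may have dropped mmap_sem across a blocking fault. */
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}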
@@ -1438,8 +1453,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, | |||
1438 | { | 1453 | { |
1439 | return 0; | 1454 | return 0; |
1440 | } | 1455 | } |
1456 | |||
1457 | static inline unsigned long mm_nr_pmds(struct mm_struct *mm) | ||
1458 | { | ||
1459 | return 0; | ||
1460 | } | ||
1461 | |||
1462 | static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} | ||
1463 | static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} | ||
1464 | |||
1441 | #else | 1465 | #else |
1442 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); | 1466 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); |
1467 | |||
1468 | static inline unsigned long mm_nr_pmds(struct mm_struct *mm) | ||
1469 | { | ||
1470 | return atomic_long_read(&mm->nr_pmds); | ||
1471 | } | ||
1472 | |||
1473 | static inline void mm_inc_nr_pmds(struct mm_struct *mm) | ||
1474 | { | ||
1475 | atomic_long_inc(&mm->nr_pmds); | ||
1476 | } | ||
1477 | |||
1478 | static inline void mm_dec_nr_pmds(struct mm_struct *mm) | ||
1479 | { | ||
1480 | atomic_long_dec(&mm->nr_pmds); | ||
1481 | } | ||
1443 | #endif | 1482 | #endif |
1444 | 1483 | ||
1445 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | 1484 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 07c8bd3f7b48..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -28,6 +28,8 @@ struct mem_cgroup; | |||
28 | IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) | 28 | IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) |
29 | #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) | 29 | #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) |
30 | 30 | ||
31 | typedef void compound_page_dtor(struct page *); | ||
32 | |||
31 | /* | 33 | /* |
32 | * Each physical page in the system has a struct page associated with | 34 | * Each physical page in the system has a struct page associated with |
33 | * it to keep track of whatever it is we are using the page for at the | 35 | * it to keep track of whatever it is we are using the page for at the |
@@ -142,6 +144,12 @@ struct page { | |||
142 | struct rcu_head rcu_head; /* Used by SLAB | 144 | struct rcu_head rcu_head; /* Used by SLAB |
143 | * when destroying via RCU | 145 | * when destroying via RCU |
144 | */ | 146 | */ |
147 | /* First tail page of compound page */ | ||
148 | struct { | ||
149 | compound_page_dtor *compound_dtor; | ||
150 | unsigned long compound_order; | ||
151 | }; | ||
152 | |||
145 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS | 153 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS |
146 | pgtable_t pmd_huge_pte; /* protected by page->ptl */ | 154 | pgtable_t pmd_huge_pte; /* protected by page->ptl */ |
147 | #endif | 155 | #endif |
@@ -355,7 +363,8 @@ struct mm_struct { | |||
355 | pgd_t * pgd; | 363 | pgd_t * pgd; |
356 | atomic_t mm_users; /* How many users with user space? */ | 364 | atomic_t mm_users; /* How many users with user space? */ |
357 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | 365 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ |
358 | atomic_long_t nr_ptes; /* Page table pages */ | 366 | atomic_long_t nr_ptes; /* PTE page table pages */ |
367 | atomic_long_t nr_pmds; /* PMD page table pages */ | ||
359 | int map_count; /* number of VMAs */ | 368 | int map_count; /* number of VMAs */ |
360 | 369 | ||
361 | spinlock_t page_table_lock; /* Protects page tables and some counters */ | 370 | spinlock_t page_table_lock; /* Protects page tables and some counters */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f0856d14b21..f279d9c158cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -426,7 +426,7 @@ struct zone { | |||
426 | const char *name; | 426 | const char *name; |
427 | 427 | ||
428 | /* | 428 | /* |
429 | * Number of MIGRATE_RESEVE page block. To maintain for just | 429 | * Number of MIGRATE_RESERVE page block. To maintain for just |
430 | * optimization. Protected by zone->lock. | 430 | * optimization. Protected by zone->lock. |
431 | */ | 431 | */ |
432 | int nr_migrate_reserve_block; | 432 | int nr_migrate_reserve_block; |
@@ -970,7 +970,6 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) | |||
970 | * @z - The cursor used as a starting point for the search | 970 | * @z - The cursor used as a starting point for the search |
971 | * @highest_zoneidx - The zone index of the highest zone to return | 971 | * @highest_zoneidx - The zone index of the highest zone to return |
972 | * @nodes - An optional nodemask to filter the zonelist with | 972 | * @nodes - An optional nodemask to filter the zonelist with |
973 | * @zone - The first suitable zone found is returned via this parameter | ||
974 | * | 973 | * |
975 | * This function returns the next zone at or below a given zone index that is | 974 | * This function returns the next zone at or below a given zone index that is |
976 | * within the allowed nodemask using a cursor as the starting point for the | 975 | * within the allowed nodemask using a cursor as the starting point for the |
@@ -980,8 +979,7 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) | |||
980 | */ | 979 | */ |
981 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 980 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
982 | enum zone_type highest_zoneidx, | 981 | enum zone_type highest_zoneidx, |
983 | nodemask_t *nodes, | 982 | nodemask_t *nodes); |
984 | struct zone **zone); | ||
985 | 983 | ||
986 | /** | 984 | /** |
987 | * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist | 985 | * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist |
@@ -1000,8 +998,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, | |||
1000 | nodemask_t *nodes, | 998 | nodemask_t *nodes, |
1001 | struct zone **zone) | 999 | struct zone **zone) |
1002 | { | 1000 | { |
1003 | return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, | 1001 | struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, |
1004 | zone); | 1002 | highest_zoneidx, nodes); |
1003 | *zone = zonelist_zone(z); | ||
1004 | return z; | ||
1005 | } | 1005 | } |
1006 | 1006 | ||
1007 | /** | 1007 | /** |
@@ -1018,7 +1018,8 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, | |||
1018 | #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ | 1018 | #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ |
1019 | for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ | 1019 | for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ |
1020 | zone; \ | 1020 | zone; \ |
1021 | z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ | 1021 | z = next_zones_zonelist(++z, highidx, nodemask), \ |
1022 | zone = zonelist_zone(z)) \ | ||
1022 | 1023 | ||
1023 | /** | 1024 | /** |
1024 | * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index | 1025 | * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index |
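Since next_zones_zonelist() no longer returns the zone through an output parameter, iteration derives it from the returned zoneref via zonelist_zone(), as the updated macro shows. A hedged sketch of a caller, unchanged on the outside (the helper and the GFP_KERNEL context are assumptions):

static void dump_usable_zones(void)
{
	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
	enum zone_type high_zoneidx = gfp_zone(GFP_KERNEL);
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, NULL)
		pr_info("zone %s on node %d\n", zone->name, zone_to_nid(zone));
}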
diff --git a/include/linux/oom.h b/include/linux/oom.h index 76200984d1e2..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p) | |||
47 | return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); | 47 | return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); |
48 | } | 48 | } |
49 | 49 | ||
50 | extern void mark_tsk_oom_victim(struct task_struct *tsk); | ||
51 | |||
52 | extern void unmark_oom_victim(void); | ||
53 | |||
50 | extern unsigned long oom_badness(struct task_struct *p, | 54 | extern unsigned long oom_badness(struct task_struct *p, |
51 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 55 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
52 | unsigned long totalpages); | 56 | unsigned long totalpages); |
@@ -68,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
68 | unsigned long totalpages, const nodemask_t *nodemask, | 72 | unsigned long totalpages, const nodemask_t *nodemask, |
69 | bool force_kill); | 73 | bool force_kill); |
70 | 74 | ||
71 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 75 | extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
72 | int order, nodemask_t *mask, bool force_kill); | 76 | int order, nodemask_t *mask, bool force_kill); |
73 | extern int register_oom_notifier(struct notifier_block *nb); | 77 | extern int register_oom_notifier(struct notifier_block *nb); |
74 | extern int unregister_oom_notifier(struct notifier_block *nb); | 78 | extern int unregister_oom_notifier(struct notifier_block *nb); |
75 | 79 | ||
76 | extern bool oom_killer_disabled; | 80 | extern bool oom_killer_disabled; |
77 | 81 | extern bool oom_killer_disable(void); | |
78 | static inline void oom_killer_disable(void) | 82 | extern void oom_killer_enable(void); |
79 | { | ||
80 | oom_killer_disabled = true; | ||
81 | } | ||
82 | |||
83 | static inline void oom_killer_enable(void) | ||
84 | { | ||
85 | oom_killer_disabled = false; | ||
86 | } | ||
87 | 83 | ||
88 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); | 84 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); |
89 | 85 | ||
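oom_killer_disable() is now a synchronous call that can fail rather than a flag flip, so callers must check its result. A hedged sketch of the intended pattern (the surrounding function is an assumption; the freezer path below uses the same shape):

static int enter_oom_free_section(void)
{
	if (!oom_killer_disable())
		return -EBUSY;	/* an OOM victim could not be waited out */

	/* ... no OOM kills can interfere in here ... */

	oom_killer_enable();
	return 0;
}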
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 955421575d16..17fa4f8de3a6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h | |||
@@ -41,7 +41,8 @@ int page_counter_try_charge(struct page_counter *counter, | |||
41 | struct page_counter **fail); | 41 | struct page_counter **fail); |
42 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); | 42 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); |
43 | int page_counter_limit(struct page_counter *counter, unsigned long limit); | 43 | int page_counter_limit(struct page_counter *counter, unsigned long limit); |
44 | int page_counter_memparse(const char *buf, unsigned long *nr_pages); | 44 | int page_counter_memparse(const char *buf, const char *max, |
45 | unsigned long *nr_pages); | ||
45 | 46 | ||
46 | static inline void page_counter_reset_watermark(struct page_counter *counter) | 47 | static inline void page_counter_reset_watermark(struct page_counter *counter) |
47 | { | 48 | { |
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index d2a2c84c72d0..c42981cd99aa 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h | |||
@@ -40,7 +40,7 @@ struct page_ext { | |||
40 | #ifdef CONFIG_PAGE_OWNER | 40 | #ifdef CONFIG_PAGE_OWNER |
41 | unsigned int order; | 41 | unsigned int order; |
42 | gfp_t gfp_mask; | 42 | gfp_t gfp_mask; |
43 | struct stack_trace trace; | 43 | unsigned int nr_entries; |
44 | unsigned long trace_entries[8]; | 44 | unsigned long trace_entries[8]; |
45 | #endif | 45 | #endif |
46 | }; | 46 | }; |
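With page_ext now storing only the entry count and the raw entries, a reader rebuilds a struct stack_trace view on the fly. A hedged sketch of that read side (the function name is an assumption; the two fields are the ones added above):

static void print_stored_trace(struct page_ext *page_ext)
{
	struct stack_trace trace = {
		.nr_entries	= page_ext->nr_entries,
		.entries	= page_ext->trace_entries,
	};

	print_stack_trace(&trace, 0);
}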
diff --git a/include/linux/swap.h b/include/linux/swap.h index 34e8b60ab973..7067eca501e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -437,16 +437,6 @@ extern int reuse_swap_page(struct page *); | |||
437 | extern int try_to_free_swap(struct page *); | 437 | extern int try_to_free_swap(struct page *); |
438 | struct backing_dev_info; | 438 | struct backing_dev_info; |
439 | 439 | ||
440 | #ifdef CONFIG_MEMCG | ||
441 | extern void | ||
442 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); | ||
443 | #else | ||
444 | static inline void | ||
445 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | ||
446 | { | ||
447 | } | ||
448 | #endif | ||
449 | |||
450 | #else /* CONFIG_SWAP */ | 440 | #else /* CONFIG_SWAP */ |
451 | 441 | ||
452 | #define swap_address_space(entry) (NULL) | 442 | #define swap_address_space(entry) (NULL) |
@@ -547,11 +537,6 @@ static inline swp_entry_t get_swap_page(void) | |||
547 | return entry; | 537 | return entry; |
548 | } | 538 | } |
549 | 539 | ||
550 | static inline void | ||
551 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | ||
552 | { | ||
553 | } | ||
554 | |||
555 | #endif /* CONFIG_SWAP */ | 540 | #endif /* CONFIG_SWAP */ |
556 | #endif /* __KERNEL__*/ | 541 | #endif /* __KERNEL__*/ |
557 | #endif /* _LINUX_SWAP_H */ | 542 | #endif /* _LINUX_SWAP_H */ |
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 50cbc876be56..831a3168ab35 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h | |||
@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) | |||
135 | *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); | 135 | *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); |
136 | } | 136 | } |
137 | 137 | ||
138 | extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | ||
139 | spinlock_t *ptl); | ||
138 | extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 140 | extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
139 | unsigned long address); | 141 | unsigned long address); |
140 | extern void migration_entry_wait_huge(struct vm_area_struct *vma, | 142 | extern void migration_entry_wait_huge(struct vm_area_struct *vma, |
@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) | |||
148 | } | 150 | } |
149 | #define migration_entry_to_page(swp) NULL | 151 | #define migration_entry_to_page(swp) NULL |
150 | static inline void make_migration_entry_read(swp_entry_t *entryp) { } | 152 | static inline void make_migration_entry_read(swp_entry_t *entryp) { } |
153 | static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | ||
154 | spinlock_t *ptl) { } | ||
151 | static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 155 | static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
152 | unsigned long address) { } | 156 | unsigned long address) { } |
153 | static inline void migration_entry_wait_huge(struct vm_area_struct *vma, | 157 | static inline void migration_entry_wait_huge(struct vm_area_struct *vma, |
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index c6814b917bdf..9a6a3fe0fb51 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -11,39 +11,55 @@ | |||
11 | 11 | ||
12 | DECLARE_EVENT_CLASS(mm_compaction_isolate_template, | 12 | DECLARE_EVENT_CLASS(mm_compaction_isolate_template, |
13 | 13 | ||
14 | TP_PROTO(unsigned long nr_scanned, | 14 | TP_PROTO( |
15 | unsigned long start_pfn, | ||
16 | unsigned long end_pfn, | ||
17 | unsigned long nr_scanned, | ||
15 | unsigned long nr_taken), | 18 | unsigned long nr_taken), |
16 | 19 | ||
17 | TP_ARGS(nr_scanned, nr_taken), | 20 | TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), |
18 | 21 | ||
19 | TP_STRUCT__entry( | 22 | TP_STRUCT__entry( |
23 | __field(unsigned long, start_pfn) | ||
24 | __field(unsigned long, end_pfn) | ||
20 | __field(unsigned long, nr_scanned) | 25 | __field(unsigned long, nr_scanned) |
21 | __field(unsigned long, nr_taken) | 26 | __field(unsigned long, nr_taken) |
22 | ), | 27 | ), |
23 | 28 | ||
24 | TP_fast_assign( | 29 | TP_fast_assign( |
30 | __entry->start_pfn = start_pfn; | ||
31 | __entry->end_pfn = end_pfn; | ||
25 | __entry->nr_scanned = nr_scanned; | 32 | __entry->nr_scanned = nr_scanned; |
26 | __entry->nr_taken = nr_taken; | 33 | __entry->nr_taken = nr_taken; |
27 | ), | 34 | ), |
28 | 35 | ||
29 | TP_printk("nr_scanned=%lu nr_taken=%lu", | 36 | TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", |
37 | __entry->start_pfn, | ||
38 | __entry->end_pfn, | ||
30 | __entry->nr_scanned, | 39 | __entry->nr_scanned, |
31 | __entry->nr_taken) | 40 | __entry->nr_taken) |
32 | ); | 41 | ); |
33 | 42 | ||
34 | DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, | 43 | DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, |
35 | 44 | ||
36 | TP_PROTO(unsigned long nr_scanned, | 45 | TP_PROTO( |
46 | unsigned long start_pfn, | ||
47 | unsigned long end_pfn, | ||
48 | unsigned long nr_scanned, | ||
37 | unsigned long nr_taken), | 49 | unsigned long nr_taken), |
38 | 50 | ||
39 | TP_ARGS(nr_scanned, nr_taken) | 51 | TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) |
40 | ); | 52 | ); |
41 | 53 | ||
42 | DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, | 54 | DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, |
43 | TP_PROTO(unsigned long nr_scanned, | 55 | |
56 | TP_PROTO( | ||
57 | unsigned long start_pfn, | ||
58 | unsigned long end_pfn, | ||
59 | unsigned long nr_scanned, | ||
44 | unsigned long nr_taken), | 60 | unsigned long nr_taken), |
45 | 61 | ||
46 | TP_ARGS(nr_scanned, nr_taken) | 62 | TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) |
47 | ); | 63 | ); |
48 | 64 | ||
49 | TRACE_EVENT(mm_compaction_migratepages, | 65 | TRACE_EVENT(mm_compaction_migratepages, |
@@ -85,47 +101,198 @@ TRACE_EVENT(mm_compaction_migratepages, | |||
85 | ); | 101 | ); |
86 | 102 | ||
87 | TRACE_EVENT(mm_compaction_begin, | 103 | TRACE_EVENT(mm_compaction_begin, |
88 | TP_PROTO(unsigned long zone_start, unsigned long migrate_start, | 104 | TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, |
89 | unsigned long free_start, unsigned long zone_end), | 105 | unsigned long free_pfn, unsigned long zone_end, bool sync), |
90 | 106 | ||
91 | TP_ARGS(zone_start, migrate_start, free_start, zone_end), | 107 | TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync), |
92 | 108 | ||
93 | TP_STRUCT__entry( | 109 | TP_STRUCT__entry( |
94 | __field(unsigned long, zone_start) | 110 | __field(unsigned long, zone_start) |
95 | __field(unsigned long, migrate_start) | 111 | __field(unsigned long, migrate_pfn) |
96 | __field(unsigned long, free_start) | 112 | __field(unsigned long, free_pfn) |
97 | __field(unsigned long, zone_end) | 113 | __field(unsigned long, zone_end) |
114 | __field(bool, sync) | ||
98 | ), | 115 | ), |
99 | 116 | ||
100 | TP_fast_assign( | 117 | TP_fast_assign( |
101 | __entry->zone_start = zone_start; | 118 | __entry->zone_start = zone_start; |
102 | __entry->migrate_start = migrate_start; | 119 | __entry->migrate_pfn = migrate_pfn; |
103 | __entry->free_start = free_start; | 120 | __entry->free_pfn = free_pfn; |
104 | __entry->zone_end = zone_end; | 121 | __entry->zone_end = zone_end; |
122 | __entry->sync = sync; | ||
105 | ), | 123 | ), |
106 | 124 | ||
107 | TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", | 125 | TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", |
108 | __entry->zone_start, | 126 | __entry->zone_start, |
109 | __entry->migrate_start, | 127 | __entry->migrate_pfn, |
110 | __entry->free_start, | 128 | __entry->free_pfn, |
111 | __entry->zone_end) | 129 | __entry->zone_end, |
130 | __entry->sync ? "sync" : "async") | ||
112 | ); | 131 | ); |
113 | 132 | ||
114 | TRACE_EVENT(mm_compaction_end, | 133 | TRACE_EVENT(mm_compaction_end, |
115 | TP_PROTO(int status), | 134 | TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, |
135 | unsigned long free_pfn, unsigned long zone_end, bool sync, | ||
136 | int status), | ||
116 | 137 | ||
117 | TP_ARGS(status), | 138 | TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status), |
118 | 139 | ||
119 | TP_STRUCT__entry( | 140 | TP_STRUCT__entry( |
141 | __field(unsigned long, zone_start) | ||
142 | __field(unsigned long, migrate_pfn) | ||
143 | __field(unsigned long, free_pfn) | ||
144 | __field(unsigned long, zone_end) | ||
145 | __field(bool, sync) | ||
120 | __field(int, status) | 146 | __field(int, status) |
121 | ), | 147 | ), |
122 | 148 | ||
123 | TP_fast_assign( | 149 | TP_fast_assign( |
150 | __entry->zone_start = zone_start; | ||
151 | __entry->migrate_pfn = migrate_pfn; | ||
152 | __entry->free_pfn = free_pfn; | ||
153 | __entry->zone_end = zone_end; | ||
154 | __entry->sync = sync; | ||
124 | __entry->status = status; | 155 | __entry->status = status; |
125 | ), | 156 | ), |
126 | 157 | ||
127 | TP_printk("status=%d", __entry->status) | 158 | TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", |
159 | __entry->zone_start, | ||
160 | __entry->migrate_pfn, | ||
161 | __entry->free_pfn, | ||
162 | __entry->zone_end, | ||
163 | __entry->sync ? "sync" : "async", | ||
164 | compaction_status_string[__entry->status]) | ||
165 | ); | ||
166 | |||
167 | TRACE_EVENT(mm_compaction_try_to_compact_pages, | ||
168 | |||
169 | TP_PROTO( | ||
170 | int order, | ||
171 | gfp_t gfp_mask, | ||
172 | enum migrate_mode mode), | ||
173 | |||
174 | TP_ARGS(order, gfp_mask, mode), | ||
175 | |||
176 | TP_STRUCT__entry( | ||
177 | __field(int, order) | ||
178 | __field(gfp_t, gfp_mask) | ||
179 | __field(enum migrate_mode, mode) | ||
180 | ), | ||
181 | |||
182 | TP_fast_assign( | ||
183 | __entry->order = order; | ||
184 | __entry->gfp_mask = gfp_mask; | ||
185 | __entry->mode = mode; | ||
186 | ), | ||
187 | |||
188 | TP_printk("order=%d gfp_mask=0x%x mode=%d", | ||
189 | __entry->order, | ||
190 | __entry->gfp_mask, | ||
191 | (int)__entry->mode) | ||
192 | ); | ||
193 | |||
194 | DECLARE_EVENT_CLASS(mm_compaction_suitable_template, | ||
195 | |||
196 | TP_PROTO(struct zone *zone, | ||
197 | int order, | ||
198 | int ret), | ||
199 | |||
200 | TP_ARGS(zone, order, ret), | ||
201 | |||
202 | TP_STRUCT__entry( | ||
203 | __field(int, nid) | ||
204 | __field(char *, name) | ||
205 | __field(int, order) | ||
206 | __field(int, ret) | ||
207 | ), | ||
208 | |||
209 | TP_fast_assign( | ||
210 | __entry->nid = zone_to_nid(zone); | ||
211 | __entry->name = (char *)zone->name; | ||
212 | __entry->order = order; | ||
213 | __entry->ret = ret; | ||
214 | ), | ||
215 | |||
216 | TP_printk("node=%d zone=%-8s order=%d ret=%s", | ||
217 | __entry->nid, | ||
218 | __entry->name, | ||
219 | __entry->order, | ||
220 | compaction_status_string[__entry->ret]) | ||
221 | ); | ||
222 | |||
223 | DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, | ||
224 | |||
225 | TP_PROTO(struct zone *zone, | ||
226 | int order, | ||
227 | int ret), | ||
228 | |||
229 | TP_ARGS(zone, order, ret) | ||
230 | ); | ||
231 | |||
232 | DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, | ||
233 | |||
234 | TP_PROTO(struct zone *zone, | ||
235 | int order, | ||
236 | int ret), | ||
237 | |||
238 | TP_ARGS(zone, order, ret) | ||
239 | ); | ||
240 | |||
241 | #ifdef CONFIG_COMPACTION | ||
242 | DECLARE_EVENT_CLASS(mm_compaction_defer_template, | ||
243 | |||
244 | TP_PROTO(struct zone *zone, int order), | ||
245 | |||
246 | TP_ARGS(zone, order), | ||
247 | |||
248 | TP_STRUCT__entry( | ||
249 | __field(int, nid) | ||
250 | __field(char *, name) | ||
251 | __field(int, order) | ||
252 | __field(unsigned int, considered) | ||
253 | __field(unsigned int, defer_shift) | ||
254 | __field(int, order_failed) | ||
255 | ), | ||
256 | |||
257 | TP_fast_assign( | ||
258 | __entry->nid = zone_to_nid(zone); | ||
259 | __entry->name = (char *)zone->name; | ||
260 | __entry->order = order; | ||
261 | __entry->considered = zone->compact_considered; | ||
262 | __entry->defer_shift = zone->compact_defer_shift; | ||
263 | __entry->order_failed = zone->compact_order_failed; | ||
264 | ), | ||
265 | |||
266 | TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", | ||
267 | __entry->nid, | ||
268 | __entry->name, | ||
269 | __entry->order, | ||
270 | __entry->order_failed, | ||
271 | __entry->considered, | ||
272 | 1UL << __entry->defer_shift) | ||
273 | ); | ||
274 | |||
275 | DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, | ||
276 | |||
277 | TP_PROTO(struct zone *zone, int order), | ||
278 | |||
279 | TP_ARGS(zone, order) | ||
280 | ); | ||
281 | |||
282 | DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, | ||
283 | |||
284 | TP_PROTO(struct zone *zone, int order), | ||
285 | |||
286 | TP_ARGS(zone, order) | ||
287 | ); | ||
288 | |||
289 | DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, | ||
290 | |||
291 | TP_PROTO(struct zone *zone, int order), | ||
292 | |||
293 | TP_ARGS(zone, order) | ||
128 | ); | 294 | ); |
295 | #endif | ||
129 | 296 | ||
130 | #endif /* _TRACE_COMPACTION_H */ | 297 | #endif /* _TRACE_COMPACTION_H */ |
131 | 298 | ||
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index aece1346ceb7..4ad10baecd4d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h | |||
@@ -268,11 +268,11 @@ TRACE_EVENT(mm_page_alloc_extfrag, | |||
268 | 268 | ||
269 | TP_PROTO(struct page *page, | 269 | TP_PROTO(struct page *page, |
270 | int alloc_order, int fallback_order, | 270 | int alloc_order, int fallback_order, |
271 | int alloc_migratetype, int fallback_migratetype, int new_migratetype), | 271 | int alloc_migratetype, int fallback_migratetype), |
272 | 272 | ||
273 | TP_ARGS(page, | 273 | TP_ARGS(page, |
274 | alloc_order, fallback_order, | 274 | alloc_order, fallback_order, |
275 | alloc_migratetype, fallback_migratetype, new_migratetype), | 275 | alloc_migratetype, fallback_migratetype), |
276 | 276 | ||
277 | TP_STRUCT__entry( | 277 | TP_STRUCT__entry( |
278 | __field( struct page *, page ) | 278 | __field( struct page *, page ) |
@@ -289,7 +289,8 @@ TRACE_EVENT(mm_page_alloc_extfrag, | |||
289 | __entry->fallback_order = fallback_order; | 289 | __entry->fallback_order = fallback_order; |
290 | __entry->alloc_migratetype = alloc_migratetype; | 290 | __entry->alloc_migratetype = alloc_migratetype; |
291 | __entry->fallback_migratetype = fallback_migratetype; | 291 | __entry->fallback_migratetype = fallback_migratetype; |
292 | __entry->change_ownership = (new_migratetype == alloc_migratetype); | 292 | __entry->change_ownership = (alloc_migratetype == |
293 | get_pageblock_migratetype(page)); | ||
293 | ), | 294 | ), |
294 | 295 | ||
295 | TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", | 296 | TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", |
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 2f96d233c980..a6c4962e5d46 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h | |||
@@ -32,6 +32,7 @@ | |||
32 | #define KPF_KSM 21 | 32 | #define KPF_KSM 21 |
33 | #define KPF_THP 22 | 33 | #define KPF_THP 22 |
34 | #define KPF_BALLOON 23 | 34 | #define KPF_BALLOON 23 |
35 | #define KPF_ZERO_PAGE 24 | ||
35 | 36 | ||
36 | 37 | ||
37 | #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ | 38 | #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ |
diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) | |||
435 | task_unlock(tsk); | 435 | task_unlock(tsk); |
436 | mm_update_next_owner(mm); | 436 | mm_update_next_owner(mm); |
437 | mmput(mm); | 437 | mmput(mm); |
438 | clear_thread_flag(TIF_MEMDIE); | 438 | if (test_thread_flag(TIF_MEMDIE)) |
439 | unmark_oom_victim(); | ||
439 | } | 440 | } |
440 | 441 | ||
441 | static struct task_struct *find_alive_thread(struct task_struct *p) | 442 | static struct task_struct *find_alive_thread(struct task_struct *p) |
diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..66e19c251581 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
555 | INIT_LIST_HEAD(&mm->mmlist); | 555 | INIT_LIST_HEAD(&mm->mmlist); |
556 | mm->core_state = NULL; | 556 | mm->core_state = NULL; |
557 | atomic_long_set(&mm->nr_ptes, 0); | 557 | atomic_long_set(&mm->nr_ptes, 0); |
558 | #ifndef __PAGETABLE_PMD_FOLDED | ||
559 | atomic_long_set(&mm->nr_pmds, 0); | ||
560 | #endif | ||
558 | mm->map_count = 0; | 561 | mm->map_count = 0; |
559 | mm->locked_vm = 0; | 562 | mm->locked_vm = 0; |
560 | mm->pinned_vm = 0; | 563 | mm->pinned_vm = 0; |
@@ -603,6 +606,14 @@ static void check_mm(struct mm_struct *mm) | |||
603 | printk(KERN_ALERT "BUG: Bad rss-counter state " | 606 | printk(KERN_ALERT "BUG: Bad rss-counter state " |
604 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 607 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
605 | } | 608 | } |
609 | |||
610 | if (atomic_long_read(&mm->nr_ptes)) | ||
611 | pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", | ||
612 | atomic_long_read(&mm->nr_ptes)); | ||
613 | if (mm_nr_pmds(mm)) | ||
614 | pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", | ||
615 | mm_nr_pmds(mm)); | ||
616 | |||
606 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 617 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
607 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); | 618 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
608 | #endif | 619 | #endif |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 5a6ec8678b9a..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only) | |||
84 | elapsed_msecs = elapsed_msecs64; | 84 | elapsed_msecs = elapsed_msecs64; |
85 | 85 | ||
86 | if (todo) { | 86 | if (todo) { |
87 | printk("\n"); | 87 | pr_cont("\n"); |
88 | printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " | 88 | pr_err("Freezing of tasks %s after %d.%03d seconds " |
89 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 89 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
90 | wakeup ? "aborted" : "failed", | 90 | wakeup ? "aborted" : "failed", |
91 | elapsed_msecs / 1000, elapsed_msecs % 1000, | 91 | elapsed_msecs / 1000, elapsed_msecs % 1000, |
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only) | |||
101 | read_unlock(&tasklist_lock); | 101 | read_unlock(&tasklist_lock); |
102 | } | 102 | } |
103 | } else { | 103 | } else { |
104 | printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, | 104 | pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, |
105 | elapsed_msecs % 1000); | 105 | elapsed_msecs % 1000); |
106 | } | 106 | } |
107 | 107 | ||
108 | return todo ? -EBUSY : 0; | 108 | return todo ? -EBUSY : 0; |
109 | } | 109 | } |
110 | 110 | ||
111 | static bool __check_frozen_processes(void) | ||
112 | { | ||
113 | struct task_struct *g, *p; | ||
114 | |||
115 | for_each_process_thread(g, p) | ||
116 | if (p != current && !freezer_should_skip(p) && !frozen(p)) | ||
117 | return false; | ||
118 | |||
119 | return true; | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * Returns true if all freezable tasks (except for current) are frozen already | ||
124 | */ | ||
125 | static bool check_frozen_processes(void) | ||
126 | { | ||
127 | bool ret; | ||
128 | |||
129 | read_lock(&tasklist_lock); | ||
130 | ret = __check_frozen_processes(); | ||
131 | read_unlock(&tasklist_lock); | ||
132 | return ret; | ||
133 | } | ||
134 | |||
135 | /** | 111 | /** |
136 | * freeze_processes - Signal user space processes to enter the refrigerator. | 112 | * freeze_processes - Signal user space processes to enter the refrigerator. |
137 | * The current thread will not be frozen. The same process that calls | 113 | * The current thread will not be frozen. The same process that calls |
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void) | |||
142 | int freeze_processes(void) | 118 | int freeze_processes(void) |
143 | { | 119 | { |
144 | int error; | 120 | int error; |
145 | int oom_kills_saved; | ||
146 | 121 | ||
147 | error = __usermodehelper_disable(UMH_FREEZING); | 122 | error = __usermodehelper_disable(UMH_FREEZING); |
148 | if (error) | 123 | if (error) |
@@ -155,31 +130,24 @@ int freeze_processes(void) | |||
155 | atomic_inc(&system_freezing_cnt); | 130 | atomic_inc(&system_freezing_cnt); |
156 | 131 | ||
157 | pm_wakeup_clear(); | 132 | pm_wakeup_clear(); |
158 | printk("Freezing user space processes ... "); | 133 | pr_info("Freezing user space processes ... "); |
159 | pm_freezing = true; | 134 | pm_freezing = true; |
160 | oom_kills_saved = oom_kills_count(); | ||
161 | error = try_to_freeze_tasks(true); | 135 | error = try_to_freeze_tasks(true); |
162 | if (!error) { | 136 | if (!error) { |
163 | __usermodehelper_set_disable_depth(UMH_DISABLED); | 137 | __usermodehelper_set_disable_depth(UMH_DISABLED); |
164 | oom_killer_disable(); | 138 | pr_cont("done."); |
165 | |||
166 | /* | ||
167 | * There might have been an OOM kill while we were | ||
168 | * freezing tasks and the killed task might be still | ||
169 | * on the way out so we have to double check for race. | ||
170 | */ | ||
171 | if (oom_kills_count() != oom_kills_saved && | ||
172 | !check_frozen_processes()) { | ||
173 | __usermodehelper_set_disable_depth(UMH_ENABLED); | ||
174 | printk("OOM in progress."); | ||
175 | error = -EBUSY; | ||
176 | } else { | ||
177 | printk("done."); | ||
178 | } | ||
179 | } | 139 | } |
180 | printk("\n"); | 140 | pr_cont("\n"); |
181 | BUG_ON(in_atomic()); | 141 | BUG_ON(in_atomic()); |
182 | 142 | ||
143 | /* | ||
144 | * Now that the whole userspace is frozen we need to disable | ||
145 | * the OOM killer to disallow any further interference with | ||
146 | * killable tasks. | ||
147 | */ | ||
148 | if (!error && !oom_killer_disable()) | ||
149 | error = -EBUSY; | ||
150 | |||
183 | if (error) | 151 | if (error) |
184 | thaw_processes(); | 152 | thaw_processes(); |
185 | return error; | 153 | return error; |
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void) | |||
197 | { | 165 | { |
198 | int error; | 166 | int error; |
199 | 167 | ||
200 | printk("Freezing remaining freezable tasks ... "); | 168 | pr_info("Freezing remaining freezable tasks ... "); |
169 | |||
201 | pm_nosig_freezing = true; | 170 | pm_nosig_freezing = true; |
202 | error = try_to_freeze_tasks(false); | 171 | error = try_to_freeze_tasks(false); |
203 | if (!error) | 172 | if (!error) |
204 | printk("done."); | 173 | pr_cont("done."); |
205 | 174 | ||
206 | printk("\n"); | 175 | pr_cont("\n"); |
207 | BUG_ON(in_atomic()); | 176 | BUG_ON(in_atomic()); |
208 | 177 | ||
209 | if (error) | 178 | if (error) |
@@ -224,7 +193,7 @@ void thaw_processes(void) | |||
224 | 193 | ||
225 | oom_killer_enable(); | 194 | oom_killer_enable(); |
226 | 195 | ||
227 | printk("Restarting tasks ... "); | 196 | pr_info("Restarting tasks ... "); |
228 | 197 | ||
229 | __usermodehelper_set_disable_depth(UMH_FREEZING); | 198 | __usermodehelper_set_disable_depth(UMH_FREEZING); |
230 | thaw_workqueues(); | 199 | thaw_workqueues(); |
@@ -243,7 +212,7 @@ void thaw_processes(void) | |||
243 | usermodehelper_enable(); | 212 | usermodehelper_enable(); |
244 | 213 | ||
245 | schedule(); | 214 | schedule(); |
246 | printk("done.\n"); | 215 | pr_cont("done.\n"); |
247 | trace_suspend_resume(TPS("thaw_processes"), 0, false); | 216 | trace_suspend_resume(TPS("thaw_processes"), 0, false); |
248 | } | 217 | } |
249 | 218 | ||
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void) | |||
252 | struct task_struct *g, *p; | 221 | struct task_struct *g, *p; |
253 | 222 | ||
254 | pm_nosig_freezing = false; | 223 | pm_nosig_freezing = false; |
255 | printk("Restarting kernel threads ... "); | 224 | pr_info("Restarting kernel threads ... "); |
256 | 225 | ||
257 | thaw_workqueues(); | 226 | thaw_workqueues(); |
258 | 227 | ||
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void) | |||
264 | read_unlock(&tasklist_lock); | 233 | read_unlock(&tasklist_lock); |
265 | 234 | ||
266 | schedule(); | 235 | schedule(); |
267 | printk("done.\n"); | 236 | pr_cont("done.\n"); |
268 | } | 237 | } |
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
199 | cma->order_per_bit = order_per_bit; | 199 | cma->order_per_bit = order_per_bit; |
200 | *res_cma = cma; | 200 | *res_cma = cma; |
201 | cma_area_count++; | 201 | cma_area_count++; |
202 | totalcma_pages += (size / PAGE_SIZE); | ||
202 | 203 | ||
203 | return 0; | 204 | return 0; |
204 | } | 205 | } |
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
337 | if (ret) | 338 | if (ret) |
338 | goto err; | 339 | goto err; |
339 | 340 | ||
340 | totalcma_pages += (size / PAGE_SIZE); | ||
341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, | 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, |
342 | &base); | 342 | &base); |
343 | return 0; | 343 | return 0; |
diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..b68736c8a1ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -34,6 +34,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
34 | #endif | 34 | #endif |
35 | 35 | ||
36 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 36 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
37 | #ifdef CONFIG_TRACEPOINTS | ||
38 | static const char *const compaction_status_string[] = { | ||
39 | "deferred", | ||
40 | "skipped", | ||
41 | "continue", | ||
42 | "partial", | ||
43 | "complete", | ||
44 | "no_suitable_page", | ||
45 | "not_suitable_zone", | ||
46 | }; | ||
47 | #endif | ||
37 | 48 | ||
38 | #define CREATE_TRACE_POINTS | 49 | #define CREATE_TRACE_POINTS |
39 | #include <trace/events/compaction.h> | 50 | #include <trace/events/compaction.h> |
@@ -113,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | |||
113 | } | 124 | } |
114 | 125 | ||
115 | #ifdef CONFIG_COMPACTION | 126 | #ifdef CONFIG_COMPACTION |
127 | |||
128 | /* Do not skip compaction more than 64 times */ | ||
129 | #define COMPACT_MAX_DEFER_SHIFT 6 | ||
130 | |||
131 | /* | ||
132 | * Compaction is deferred when compaction fails to result in a page | ||
133 | * allocation success. 1 << compact_defer_shift compactions are skipped up | ||
134 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | ||
135 | */ | ||
136 | void defer_compaction(struct zone *zone, int order) | ||
137 | { | ||
138 | zone->compact_considered = 0; | ||
139 | zone->compact_defer_shift++; | ||
140 | |||
141 | if (order < zone->compact_order_failed) | ||
142 | zone->compact_order_failed = order; | ||
143 | |||
144 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | ||
145 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | ||
146 | |||
147 | trace_mm_compaction_defer_compaction(zone, order); | ||
148 | } | ||
149 | |||
150 | /* Returns true if compaction should be skipped this time */ | ||
151 | bool compaction_deferred(struct zone *zone, int order) | ||
152 | { | ||
153 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | ||
154 | |||
155 | if (order < zone->compact_order_failed) | ||
156 | return false; | ||
157 | |||
158 | /* Avoid possible overflow */ | ||
159 | if (++zone->compact_considered > defer_limit) | ||
160 | zone->compact_considered = defer_limit; | ||
161 | |||
162 | if (zone->compact_considered >= defer_limit) | ||
163 | return false; | ||
164 | |||
165 | trace_mm_compaction_deferred(zone, order); | ||
166 | |||
167 | return true; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Update defer tracking counters after successful compaction of given order, | ||
172 | * which means an allocation either succeeded (alloc_success == true) or is | ||
173 | * expected to succeed. | ||
174 | */ | ||
175 | void compaction_defer_reset(struct zone *zone, int order, | ||
176 | bool alloc_success) | ||
177 | { | ||
178 | if (alloc_success) { | ||
179 | zone->compact_considered = 0; | ||
180 | zone->compact_defer_shift = 0; | ||
181 | } | ||
182 | if (order >= zone->compact_order_failed) | ||
183 | zone->compact_order_failed = order + 1; | ||
184 | |||
185 | trace_mm_compaction_defer_reset(zone, order); | ||
186 | } | ||
187 | |||
188 | /* Returns true if restarting compaction after many failures */ | ||
189 | bool compaction_restarting(struct zone *zone, int order) | ||
190 | { | ||
191 | if (order < zone->compact_order_failed) | ||
192 | return false; | ||
193 | |||
194 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
195 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
196 | } | ||
197 | |||
116 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 198 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
117 | static inline bool isolation_suitable(struct compact_control *cc, | 199 | static inline bool isolation_suitable(struct compact_control *cc, |
118 | struct page *page) | 200 | struct page *page) |
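The defer machinery moved above backs off exponentially: every failed compaction doubles the defer window (1 << compact_defer_shift), capped at 1 << COMPACT_MAX_DEFER_SHIFT = 64 skipped attempts. A standalone userspace model of just that arithmetic, with the per-order bookkeeping left out:

#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

struct zone_model { unsigned long considered; unsigned int defer_shift; };

static void defer(struct zone_model *z)
{
	z->considered = 0;
	if (++z->defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* true: skip this compaction attempt */
static int deferred(struct zone_model *z)
{
	unsigned long limit = 1UL << z->defer_shift;

	if (++z->considered > limit)
		z->considered = limit;
	return z->considered < limit;
}

int main(void)
{
	struct zone_model z = { 0, 0 };
	int attempt, skipped = 0;

	defer(&z);	/* first failure */
	defer(&z);	/* second failure: defer_shift = 2, window = 4 */
	for (attempt = 0; attempt < 10; attempt++)
		if (deferred(&z))
			skipped++;
	printf("skipped %d of 10 attempts (defer_shift=%u)\n",
	       skipped, z.defer_shift);
	return 0;
}

Two consecutive failures leave defer_shift at 2, so three of the next ten attempts are skipped before compaction is tried again.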
@@ -421,11 +503,12 @@ isolate_fail: | |||
421 | 503 | ||
422 | } | 504 | } |
423 | 505 | ||
506 | trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, | ||
507 | nr_scanned, total_isolated); | ||
508 | |||
424 | /* Record how far we have got within the block */ | 509 | /* Record how far we have got within the block */ |
425 | *start_pfn = blockpfn; | 510 | *start_pfn = blockpfn; |
426 | 511 | ||
427 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
428 | |||
429 | /* | 512 | /* |
430 | * If strict isolation is requested by CMA then check that all the | 513 | * If strict isolation is requested by CMA then check that all the |
431 | * pages requested were isolated. If there were any failures, 0 is | 514 | * pages requested were isolated. If there were any failures, 0 is |
@@ -581,6 +664,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
581 | unsigned long flags = 0; | 664 | unsigned long flags = 0; |
582 | bool locked = false; | 665 | bool locked = false; |
583 | struct page *page = NULL, *valid_page = NULL; | 666 | struct page *page = NULL, *valid_page = NULL; |
667 | unsigned long start_pfn = low_pfn; | ||
584 | 668 | ||
585 | /* | 669 | /* |
586 | * Ensure that there are not too many pages isolated from the LRU | 670 | * Ensure that there are not too many pages isolated from the LRU |
@@ -741,7 +825,8 @@ isolate_success: | |||
741 | if (low_pfn == end_pfn) | 825 | if (low_pfn == end_pfn) |
742 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 826 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
743 | 827 | ||
744 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 828 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, |
829 | nr_scanned, nr_isolated); | ||
745 | 830 | ||
746 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); | 831 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); |
747 | if (nr_isolated) | 832 | if (nr_isolated) |
@@ -1037,7 +1122,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1037 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; | 1122 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
1038 | } | 1123 | } |
1039 | 1124 | ||
1040 | static int compact_finished(struct zone *zone, struct compact_control *cc, | 1125 | static int __compact_finished(struct zone *zone, struct compact_control *cc, |
1041 | const int migratetype) | 1126 | const int migratetype) |
1042 | { | 1127 | { |
1043 | unsigned int order; | 1128 | unsigned int order; |
@@ -1092,7 +1177,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1092 | return COMPACT_PARTIAL; | 1177 | return COMPACT_PARTIAL; |
1093 | } | 1178 | } |
1094 | 1179 | ||
1095 | return COMPACT_CONTINUE; | 1180 | return COMPACT_NO_SUITABLE_PAGE; |
1181 | } | ||
1182 | |||
1183 | static int compact_finished(struct zone *zone, struct compact_control *cc, | ||
1184 | const int migratetype) | ||
1185 | { | ||
1186 | int ret; | ||
1187 | |||
1188 | ret = __compact_finished(zone, cc, migratetype); | ||
1189 | trace_mm_compaction_finished(zone, cc->order, ret); | ||
1190 | if (ret == COMPACT_NO_SUITABLE_PAGE) | ||
1191 | ret = COMPACT_CONTINUE; | ||
1192 | |||
1193 | return ret; | ||
1096 | } | 1194 | } |
1097 | 1195 | ||
1098 | /* | 1196 | /* |
@@ -1102,7 +1200,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1200 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
1103 | * COMPACT_CONTINUE - If compaction should run now | 1201 | * COMPACT_CONTINUE - If compaction should run now |
1104 | */ | 1202 | */ |
1105 | unsigned long compaction_suitable(struct zone *zone, int order, | 1203 | static unsigned long __compaction_suitable(struct zone *zone, int order, |
1106 | int alloc_flags, int classzone_idx) | 1204 | int alloc_flags, int classzone_idx) |
1107 | { | 1205 | { |
1108 | int fragindex; | 1206 | int fragindex; |
@@ -1146,11 +1244,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, | |||
1146 | */ | 1244 | */ |
1147 | fragindex = fragmentation_index(zone, order); | 1245 | fragindex = fragmentation_index(zone, order); |
1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1246 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
1149 | return COMPACT_SKIPPED; | 1247 | return COMPACT_NOT_SUITABLE_ZONE; |
1150 | 1248 | ||
1151 | return COMPACT_CONTINUE; | 1249 | return COMPACT_CONTINUE; |
1152 | } | 1250 | } |
1153 | 1251 | ||
1252 | unsigned long compaction_suitable(struct zone *zone, int order, | ||
1253 | int alloc_flags, int classzone_idx) | ||
1254 | { | ||
1255 | unsigned long ret; | ||
1256 | |||
1257 | ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); | ||
1258 | trace_mm_compaction_suitable(zone, order, ret); | ||
1259 | if (ret == COMPACT_NOT_SUITABLE_ZONE) | ||
1260 | ret = COMPACT_SKIPPED; | ||
1261 | |||
1262 | return ret; | ||
1263 | } | ||
1264 | |||
1154 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 1265 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
1155 | { | 1266 | { |
1156 | int ret; | 1267 | int ret; |
@@ -1197,7 +1308,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1197 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 1308 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
1198 | } | 1309 | } |
1199 | 1310 | ||
1200 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1311 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, |
1312 | cc->free_pfn, end_pfn, sync); | ||
1201 | 1313 | ||
1202 | migrate_prep_local(); | 1314 | migrate_prep_local(); |
1203 | 1315 | ||
@@ -1299,7 +1411,8 @@ out: | |||
1299 | zone->compact_cached_free_pfn = free_pfn; | 1411 | zone->compact_cached_free_pfn = free_pfn; |
1300 | } | 1412 | } |
1301 | 1413 | ||
1302 | trace_mm_compaction_end(ret); | 1414 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
1415 | cc->free_pfn, end_pfn, sync, ret); | ||
1303 | 1416 | ||
1304 | return ret; | 1417 | return ret; |
1305 | } | 1418 | } |
@@ -1335,22 +1448,20 @@ int sysctl_extfrag_threshold = 500; | |||
1335 | 1448 | ||
1336 | /** | 1449 | /** |
1337 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | 1450 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation |
1338 | * @zonelist: The zonelist used for the current allocation | ||
1339 | * @order: The order of the current allocation | ||
1340 | * @gfp_mask: The GFP mask of the current allocation | 1451 | * @gfp_mask: The GFP mask of the current allocation |
1341 | * @nodemask: The allowed nodes to allocate from | 1452 | * @order: The order of the current allocation |
1453 | * @alloc_flags: The allocation flags of the current allocation | ||
1454 | * @ac: The context of current allocation | ||
1342 | * @mode: The migration mode for async, sync light, or sync migration | 1455 | * @mode: The migration mode for async, sync light, or sync migration |
1343 | * @contended: Return value that determines if compaction was aborted due to | 1456 | * @contended: Return value that determines if compaction was aborted due to |
1344 | * need_resched() or lock contention | 1457 | * need_resched() or lock contention |
1345 | * | 1458 | * |
1346 | * This is the main entry point for direct page compaction. | 1459 | * This is the main entry point for direct page compaction. |
1347 | */ | 1460 | */ |
1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1461 | unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1462 | int alloc_flags, const struct alloc_context *ac, |
1350 | enum migrate_mode mode, int *contended, | 1463 | enum migrate_mode mode, int *contended) |
1351 | int alloc_flags, int classzone_idx) | ||
1352 | { | 1464 | { |
1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1354 | int may_enter_fs = gfp_mask & __GFP_FS; | 1465 | int may_enter_fs = gfp_mask & __GFP_FS; |
1355 | int may_perform_io = gfp_mask & __GFP_IO; | 1466 | int may_perform_io = gfp_mask & __GFP_IO; |
1356 | struct zoneref *z; | 1467 | struct zoneref *z; |
@@ -1364,9 +1475,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1364 | if (!order || !may_enter_fs || !may_perform_io) | 1475 | if (!order || !may_enter_fs || !may_perform_io) |
1365 | return COMPACT_SKIPPED; | 1476 | return COMPACT_SKIPPED; |
1366 | 1477 | ||
1478 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); | ||
1479 | |||
1367 | /* Compact each zone in the list */ | 1480 | /* Compact each zone in the list */ |
1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1481 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
1369 | nodemask) { | 1482 | ac->nodemask) { |
1370 | int status; | 1483 | int status; |
1371 | int zone_contended; | 1484 | int zone_contended; |
1372 | 1485 | ||
@@ -1374,7 +1487,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1374 | continue; | 1487 | continue; |
1375 | 1488 | ||
1376 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1489 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1377 | &zone_contended, alloc_flags, classzone_idx); | 1490 | &zone_contended, alloc_flags, |
1491 | ac->classzone_idx); | ||
1378 | rc = max(status, rc); | 1492 | rc = max(status, rc); |
1379 | /* | 1493 | /* |
1380 | * It takes at least one zone that wasn't lock contended | 1494 | * It takes at least one zone that wasn't lock contended |
@@ -1384,7 +1498,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1384 | 1498 | ||
1385 | /* If a normal allocation would succeed, stop compacting */ | 1499 | /* If a normal allocation would succeed, stop compacting */ |
1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), | 1500 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
1387 | classzone_idx, alloc_flags)) { | 1501 | ac->classzone_idx, alloc_flags)) { |
1388 | /* | 1502 | /* |
1389 | * We think the allocation will succeed in this zone, | 1503 | * We think the allocation will succeed in this zone, |
1390 | * but it is not certain, hence the false. The caller | 1504 | * but it is not certain, hence the false. The caller |
diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c | |||
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) | |||
173 | "get_unmapped_area %p\n" | 173 | "get_unmapped_area %p\n" |
174 | #endif | 174 | #endif |
175 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | 175 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" |
176 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | 176 | "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" |
177 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | 177 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" |
178 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | 178 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" |
179 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | 179 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" |
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) | |||
206 | mm->pgd, atomic_read(&mm->mm_users), | 206 | mm->pgd, atomic_read(&mm->mm_users), |
207 | atomic_read(&mm->mm_count), | 207 | atomic_read(&mm->mm_count), |
208 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | 208 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), |
209 | mm_nr_pmds((struct mm_struct *)mm), | ||
209 | mm->map_count, | 210 | mm->map_count, |
210 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, |
211 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, |
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
167 | if (pud_none(*pud)) | 167 | if (pud_none(*pud)) |
168 | return no_page_table(vma, flags); | 168 | return no_page_table(vma, flags); |
169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
170 | if (flags & FOLL_GET) | 170 | page = follow_huge_pud(mm, address, pud, flags); |
171 | return NULL; | 171 | if (page) |
172 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 172 | return page; |
173 | return page; | 173 | return no_page_table(vma, flags); |
174 | } | 174 | } |
175 | if (unlikely(pud_bad(*pud))) | 175 | if (unlikely(pud_bad(*pud))) |
176 | return no_page_table(vma, flags); | 176 | return no_page_table(vma, flags); |
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
179 | if (pmd_none(*pmd)) | 179 | if (pmd_none(*pmd)) |
180 | return no_page_table(vma, flags); | 180 | return no_page_table(vma, flags); |
181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
182 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 182 | page = follow_huge_pmd(mm, address, pmd, flags); |
183 | if (flags & FOLL_GET) { | 183 | if (page) |
184 | /* | 184 | return page; |
185 | * Refcount on tail pages are not well-defined and | 185 | return no_page_table(vma, flags); |
186 | * shouldn't be taken. The caller should handle a NULL | ||
187 | * return when trying to follow tail pages. | ||
188 | */ | ||
189 | if (PageHead(page)) | ||
190 | get_page(page); | ||
191 | else | ||
192 | page = NULL; | ||
193 | } | ||
194 | return page; | ||
195 | } | 186 | } |
196 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 187 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) |
197 | return no_page_table(vma, flags); | 188 | return no_page_table(vma, flags); |
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
584 | return 0; | 575 | return 0; |
585 | } | 576 | } |
586 | 577 | ||
578 | static __always_inline long __get_user_pages_locked(struct task_struct *tsk, | ||
579 | struct mm_struct *mm, | ||
580 | unsigned long start, | ||
581 | unsigned long nr_pages, | ||
582 | int write, int force, | ||
583 | struct page **pages, | ||
584 | struct vm_area_struct **vmas, | ||
585 | int *locked, bool notify_drop, | ||
586 | unsigned int flags) | ||
587 | { | ||
588 | long ret, pages_done; | ||
589 | bool lock_dropped; | ||
590 | |||
591 | if (locked) { | ||
592 | /* if VM_FAULT_RETRY can be returned, vmas become invalid */ | ||
593 | BUG_ON(vmas); | ||
594 | /* check caller initialized locked */ | ||
595 | BUG_ON(*locked != 1); | ||
596 | } | ||
597 | |||
598 | if (pages) | ||
599 | flags |= FOLL_GET; | ||
600 | if (write) | ||
601 | flags |= FOLL_WRITE; | ||
602 | if (force) | ||
603 | flags |= FOLL_FORCE; | ||
604 | |||
605 | pages_done = 0; | ||
606 | lock_dropped = false; | ||
607 | for (;;) { | ||
608 | ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, | ||
609 | vmas, locked); | ||
610 | if (!locked) | ||
611 | /* VM_FAULT_RETRY couldn't trigger, bypass */ | ||
612 | return ret; | ||
613 | |||
614 | /* VM_FAULT_RETRY cannot return errors */ | ||
615 | if (!*locked) { | ||
616 | BUG_ON(ret < 0); | ||
617 | BUG_ON(ret >= nr_pages); | ||
618 | } | ||
619 | |||
620 | if (!pages) | ||
621 | /* If it's a prefault don't insist harder */ | ||
622 | return ret; | ||
623 | |||
624 | if (ret > 0) { | ||
625 | nr_pages -= ret; | ||
626 | pages_done += ret; | ||
627 | if (!nr_pages) | ||
628 | break; | ||
629 | } | ||
630 | if (*locked) { | ||
631 | /* VM_FAULT_RETRY didn't trigger */ | ||
632 | if (!pages_done) | ||
633 | pages_done = ret; | ||
634 | break; | ||
635 | } | ||
636 | /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ | ||
637 | pages += ret; | ||
638 | start += ret << PAGE_SHIFT; | ||
639 | |||
640 | /* | ||
641 | * Repeat on the address that fired VM_FAULT_RETRY | ||
642 | * without FAULT_FLAG_ALLOW_RETRY but with | ||
643 | * FAULT_FLAG_TRIED. | ||
644 | */ | ||
645 | *locked = 1; | ||
646 | lock_dropped = true; | ||
647 | down_read(&mm->mmap_sem); | ||
648 | ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, | ||
649 | pages, NULL, NULL); | ||
650 | if (ret != 1) { | ||
651 | BUG_ON(ret > 1); | ||
652 | if (!pages_done) | ||
653 | pages_done = ret; | ||
654 | break; | ||
655 | } | ||
656 | nr_pages--; | ||
657 | pages_done++; | ||
658 | if (!nr_pages) | ||
659 | break; | ||
660 | pages++; | ||
661 | start += PAGE_SIZE; | ||
662 | } | ||
663 | if (notify_drop && lock_dropped && *locked) { | ||
664 | /* | ||
665 | * We must let the caller know we temporarily dropped the lock | ||
666 | * and so the critical section protected by it was lost. | ||
667 | */ | ||
668 | up_read(&mm->mmap_sem); | ||
669 | *locked = 0; | ||
670 | } | ||
671 | return pages_done; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | * We can leverage the VM_FAULT_RETRY functionality in the page fault | ||
676 | * paths better by using either get_user_pages_locked() or | ||
677 | * get_user_pages_unlocked(). | ||
678 | * | ||
679 | * get_user_pages_locked() is suitable to replace the form: | ||
680 | * | ||
681 | * down_read(&mm->mmap_sem); | ||
682 | * do_something() | ||
683 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
684 | * up_read(&mm->mmap_sem); | ||
685 | * | ||
686 | * to: | ||
687 | * | ||
688 | * int locked = 1; | ||
689 | * down_read(&mm->mmap_sem); | ||
690 | * do_something() | ||
691 | * get_user_pages_locked(tsk, mm, ..., pages, &locked); | ||
692 | * if (locked) | ||
693 | * up_read(&mm->mmap_sem); | ||
694 | */ | ||
695 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
696 | unsigned long start, unsigned long nr_pages, | ||
697 | int write, int force, struct page **pages, | ||
698 | int *locked) | ||
699 | { | ||
700 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
701 | pages, NULL, locked, true, FOLL_TOUCH); | ||
702 | } | ||
703 | EXPORT_SYMBOL(get_user_pages_locked); | ||
704 | |||
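As a concrete illustration of the conversion described in the comment above, here is a minimal caller-side sketch. The helper name pin_user_buffer() and the NPAGES constant are invented for the example and are not part of this patch; only get_user_pages_locked() and its locking protocol come from the code above.

    #include <linux/mm.h>
    #include <linux/sched.h>

    #define NPAGES 16	/* illustrative buffer size */

    /* Sketch: pin a user buffer while allowing the fault handler to drop
     * mmap_sem (VM_FAULT_RETRY); "locked" reports whether we still hold it. */
    static long pin_user_buffer(unsigned long start, struct page **pages)
    {
            struct mm_struct *mm = current->mm;
            int locked = 1;
            long ret;

            down_read(&mm->mmap_sem);
            /* ... anything else that needs mmap_sem held ... */
            ret = get_user_pages_locked(current, mm, start, NPAGES,
                                        1 /* write */, 0 /* force */,
                                        pages, &locked);
            if (locked)
                    up_read(&mm->mmap_sem);
            return ret;
    }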
705 | /* | ||
706 | * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows the | ||
707 | * caller to pass additional gup_flags as the last parameter (like FOLL_HWPOISON). | ||
708 | * | ||
709 | * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the | ||
710 | * caller if required (just like with __get_user_pages). "FOLL_GET", | ||
711 | * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed | ||
712 | * according to the parameters "pages", "write", "force" | ||
713 | * respectively. | ||
714 | */ | ||
715 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
716 | unsigned long start, unsigned long nr_pages, | ||
717 | int write, int force, struct page **pages, | ||
718 | unsigned int gup_flags) | ||
719 | { | ||
720 | long ret; | ||
721 | int locked = 1; | ||
722 | down_read(&mm->mmap_sem); | ||
723 | ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
724 | pages, NULL, &locked, false, gup_flags); | ||
725 | if (locked) | ||
726 | up_read(&mm->mmap_sem); | ||
727 | return ret; | ||
728 | } | ||
729 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
730 | |||
731 | /* | ||
732 | * get_user_pages_unlocked() is suitable to replace the form: | ||
733 | * | ||
734 | * down_read(&mm->mmap_sem); | ||
735 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
736 | * up_read(&mm->mmap_sem); | ||
737 | * | ||
738 | * with: | ||
739 | * | ||
740 | * get_user_pages_unlocked(tsk, mm, ..., pages); | ||
741 | * | ||
742 | * It is functionally equivalent to get_user_pages_fast so | ||
743 | * get_user_pages_fast should be used instead, if the two parameters | ||
744 | * "tsk" and "mm" are respectively equal to current and current->mm, | ||
745 | * or if "force" shall be set to 1 (get_user_pages_fast misses the | ||
746 | * "force" parameter). | ||
747 | */ | ||
748 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
749 | unsigned long start, unsigned long nr_pages, | ||
750 | int write, int force, struct page **pages) | ||
751 | { | ||
752 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
753 | force, pages, FOLL_TOUCH); | ||
754 | } | ||
755 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
756 | |||
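The unlocked variant is simpler still at the call site, as the hypothetical helper below shows (its name is invented): no mmap_sem handling is needed at all, which is what makes it a near drop-in for get_user_pages_fast() when a non-current mm or the force semantics are needed.

    /* Sketch only: pin nr_pages read-only pages of another task's mm. */
    static long pin_foreign_pages(struct task_struct *tsk, struct mm_struct *mm,
                                  unsigned long start, unsigned long nr_pages,
                                  struct page **pages)
    {
            /* mmap_sem is taken and released internally */
            return get_user_pages_unlocked(tsk, mm, start, nr_pages,
                                           0 /* write */, 0 /* force */, pages);
    }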
587 | /* | 757 | /* |
588 | * get_user_pages() - pin user pages in memory | 758 | * get_user_pages() - pin user pages in memory |
589 | * @tsk: the task_struct to use for page fault accounting, or | 759 | * @tsk: the task_struct to use for page fault accounting, or |
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
633 | * use the correct cache flushing APIs. | 803 | * use the correct cache flushing APIs. |
634 | * | 804 | * |
635 | * See also get_user_pages_fast, for performance critical applications. | 805 | * See also get_user_pages_fast, for performance critical applications. |
806 | * | ||
807 | * get_user_pages should be phased out in favor of | ||
808 | * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing | ||
809 | * should use get_user_pages because it cannot pass | ||
810 | * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. | ||
636 | */ | 811 | */ |
637 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 812 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
638 | unsigned long start, unsigned long nr_pages, int write, | 813 | unsigned long start, unsigned long nr_pages, int write, |
639 | int force, struct page **pages, struct vm_area_struct **vmas) | 814 | int force, struct page **pages, struct vm_area_struct **vmas) |
640 | { | 815 | { |
641 | int flags = FOLL_TOUCH; | 816 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, |
642 | 817 | pages, vmas, NULL, false, FOLL_TOUCH); | |
643 | if (pages) | ||
644 | flags |= FOLL_GET; | ||
645 | if (write) | ||
646 | flags |= FOLL_WRITE; | ||
647 | if (force) | ||
648 | flags |= FOLL_FORCE; | ||
649 | |||
650 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
651 | NULL); | ||
652 | } | 818 | } |
653 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
654 | 820 | ||
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1077 | start += nr << PAGE_SHIFT; | 1243 | start += nr << PAGE_SHIFT; |
1078 | pages += nr; | 1244 | pages += nr; |
1079 | 1245 | ||
1080 | down_read(&mm->mmap_sem); | 1246 | ret = get_user_pages_unlocked(current, mm, start, |
1081 | ret = get_user_pages(current, mm, start, | 1247 | nr_pages - nr, write, 0, pages); |
1082 | nr_pages - nr, write, 0, pages, NULL); | ||
1083 | up_read(&mm->mmap_sem); | ||
1084 | 1248 | ||
1085 | /* Have to be a bit careful with return values */ | 1249 | /* Have to be a bit careful with return values */ |
1086 | if (nr > 0) { | 1250 | if (nr > 0) { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..cb7be110cad3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -171,12 +171,7 @@ static int start_khugepaged(void) | |||
171 | } | 171 | } |
172 | 172 | ||
173 | static atomic_t huge_zero_refcount; | 173 | static atomic_t huge_zero_refcount; |
174 | static struct page *huge_zero_page __read_mostly; | 174 | struct page *huge_zero_page __read_mostly; |
175 | |||
176 | static inline bool is_huge_zero_page(struct page *page) | ||
177 | { | ||
178 | return ACCESS_ONCE(huge_zero_page) == page; | ||
179 | } | ||
180 | 175 | ||
181 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 176 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
182 | { | 177 | { |
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | |||
766 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 761 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
767 | } | 762 | } |
768 | 763 | ||
769 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
770 | struct vm_area_struct *vma, | ||
771 | unsigned long haddr, int nd, | ||
772 | gfp_t extra_gfp) | ||
773 | { | ||
774 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | ||
775 | HPAGE_PMD_ORDER, vma, haddr, nd); | ||
776 | } | ||
777 | |||
778 | /* Caller must hold page table lock. */ | 764 | /* Caller must hold page table lock. */ |
779 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 765 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
780 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 766 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
795 | unsigned long address, pmd_t *pmd, | 781 | unsigned long address, pmd_t *pmd, |
796 | unsigned int flags) | 782 | unsigned int flags) |
797 | { | 783 | { |
784 | gfp_t gfp; | ||
798 | struct page *page; | 785 | struct page *page; |
799 | unsigned long haddr = address & HPAGE_PMD_MASK; | 786 | unsigned long haddr = address & HPAGE_PMD_MASK; |
800 | 787 | ||
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
829 | } | 816 | } |
830 | return 0; | 817 | return 0; |
831 | } | 818 | } |
832 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 819 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
833 | vma, haddr, numa_node_id(), 0); | 820 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); |
834 | if (unlikely(!page)) { | 821 | if (unlikely(!page)) { |
835 | count_vm_event(THP_FAULT_FALLBACK); | 822 | count_vm_event(THP_FAULT_FALLBACK); |
836 | return VM_FAULT_FALLBACK; | 823 | return VM_FAULT_FALLBACK; |
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1118 | spin_unlock(ptl); | 1105 | spin_unlock(ptl); |
1119 | alloc: | 1106 | alloc: |
1120 | if (transparent_hugepage_enabled(vma) && | 1107 | if (transparent_hugepage_enabled(vma) && |
1121 | !transparent_hugepage_debug_cow()) | 1108 | !transparent_hugepage_debug_cow()) { |
1122 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1109 | gfp_t gfp; |
1123 | vma, haddr, numa_node_id(), 0); | 1110 | |
1124 | else | 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
1113 | } else | ||
1125 | new_page = NULL; | 1114 | new_page = NULL; |
1126 | 1115 | ||
1127 | if (unlikely(!new_page)) { | 1116 | if (unlikely(!new_page)) { |
@@ -1423,26 +1412,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1423 | return ret; | 1412 | return ret; |
1424 | } | 1413 | } |
1425 | 1414 | ||
1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1427 | unsigned long addr, unsigned long end, | ||
1428 | unsigned char *vec) | ||
1429 | { | ||
1430 | spinlock_t *ptl; | ||
1431 | int ret = 0; | ||
1432 | |||
1433 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
1434 | /* | ||
1435 | * All logical pages in the range are present | ||
1436 | * if backed by a huge page. | ||
1437 | */ | ||
1438 | spin_unlock(ptl); | ||
1439 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1440 | ret = 1; | ||
1441 | } | ||
1442 | |||
1443 | return ret; | ||
1444 | } | ||
1445 | |||
1446 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1415 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
1447 | unsigned long old_addr, | 1416 | unsigned long old_addr, |
1448 | unsigned long new_addr, unsigned long old_end, | 1417 | unsigned long new_addr, unsigned long old_end, |
@@ -2148,7 +2117,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2148 | { | 2117 | { |
2149 | struct page *page; | 2118 | struct page *page; |
2150 | pte_t *_pte; | 2119 | pte_t *_pte; |
2151 | int referenced = 0, none = 0; | 2120 | int none = 0; |
2121 | bool referenced = false, writable = false; | ||
2152 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2122 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
2153 | _pte++, address += PAGE_SIZE) { | 2123 | _pte++, address += PAGE_SIZE) { |
2154 | pte_t pteval = *_pte; | 2124 | pte_t pteval = *_pte; |
@@ -2158,7 +2128,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2158 | else | 2128 | else |
2159 | goto out; | 2129 | goto out; |
2160 | } | 2130 | } |
2161 | if (!pte_present(pteval) || !pte_write(pteval)) | 2131 | if (!pte_present(pteval)) |
2162 | goto out; | 2132 | goto out; |
2163 | page = vm_normal_page(vma, address, pteval); | 2133 | page = vm_normal_page(vma, address, pteval); |
2164 | if (unlikely(!page)) | 2134 | if (unlikely(!page)) |
@@ -2168,9 +2138,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2168 | VM_BUG_ON_PAGE(!PageAnon(page), page); | 2138 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
2169 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 2139 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
2170 | 2140 | ||
2171 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
2172 | if (page_count(page) != 1) | ||
2173 | goto out; | ||
2174 | /* | 2141 | /* |
2175 | * We can do it before isolate_lru_page because the | 2142 | * We can do it before isolate_lru_page because the |
2176 | * page can't be freed from under us. NOTE: PG_lock | 2143 | * page can't be freed from under us. NOTE: PG_lock |
@@ -2179,6 +2146,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2179 | */ | 2146 | */ |
2180 | if (!trylock_page(page)) | 2147 | if (!trylock_page(page)) |
2181 | goto out; | 2148 | goto out; |
2149 | |||
2150 | /* | ||
2151 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
2152 | * The page must only be referenced by the scanned process | ||
2153 | * and page swap cache. | ||
2154 | */ | ||
2155 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
2156 | unlock_page(page); | ||
2157 | goto out; | ||
2158 | } | ||
2159 | if (pte_write(pteval)) { | ||
2160 | writable = true; | ||
2161 | } else { | ||
2162 | if (PageSwapCache(page) && !reuse_swap_page(page)) { | ||
2163 | unlock_page(page); | ||
2164 | goto out; | ||
2165 | } | ||
2166 | /* | ||
2167 | * Page is not in the swap cache. It can be collapsed | ||
2168 | * into a THP. | ||
2169 | */ | ||
2170 | } | ||
2171 | |||
2182 | /* | 2172 | /* |
2183 | * Isolate the page to avoid collapsing an hugepage | 2173 | * Isolate the page to avoid collapsing an hugepage |
2184 | * currently in use by the VM. | 2174 | * currently in use by the VM. |
@@ -2195,9 +2185,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2195 | /* If there is no mapped pte young don't collapse the page */ | 2185 | /* If there is no mapped pte young don't collapse the page */ |
2196 | if (pte_young(pteval) || PageReferenced(page) || | 2186 | if (pte_young(pteval) || PageReferenced(page) || |
2197 | mmu_notifier_test_young(vma->vm_mm, address)) | 2187 | mmu_notifier_test_young(vma->vm_mm, address)) |
2198 | referenced = 1; | 2188 | referenced = true; |
2199 | } | 2189 | } |
2200 | if (likely(referenced)) | 2190 | if (likely(referenced && writable)) |
2201 | return 1; | 2191 | return 1; |
2202 | out: | 2192 | out: |
2203 | release_pte_pages(pte, _pte); | 2193 | release_pte_pages(pte, _pte); |
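The new page_count() test above is subtle enough to restate in isolation. The helper below is not part of the patch, only a sketch of the rule it implements: a candidate page may be referenced by the scanned PTE mapping plus, if it sits in the swap cache, that one extra reference; anything more is treated as a gup pin and aborts the collapse.

    /* Illustrative only -- mirrors the check used in both
     * __collapse_huge_page_isolate() and khugepaged_scan_pmd(). */
    static inline bool khugepaged_page_looks_pinned(struct page *page)
    {
            int expected = 1 + !!PageSwapCache(page);

            return page_count(page) != expected;
    }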
@@ -2550,11 +2540,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2550 | { | 2540 | { |
2551 | pmd_t *pmd; | 2541 | pmd_t *pmd; |
2552 | pte_t *pte, *_pte; | 2542 | pte_t *pte, *_pte; |
2553 | int ret = 0, referenced = 0, none = 0; | 2543 | int ret = 0, none = 0; |
2554 | struct page *page; | 2544 | struct page *page; |
2555 | unsigned long _address; | 2545 | unsigned long _address; |
2556 | spinlock_t *ptl; | 2546 | spinlock_t *ptl; |
2557 | int node = NUMA_NO_NODE; | 2547 | int node = NUMA_NO_NODE; |
2548 | bool writable = false, referenced = false; | ||
2558 | 2549 | ||
2559 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2550 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2560 | 2551 | ||
@@ -2573,8 +2564,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2573 | else | 2564 | else |
2574 | goto out_unmap; | 2565 | goto out_unmap; |
2575 | } | 2566 | } |
2576 | if (!pte_present(pteval) || !pte_write(pteval)) | 2567 | if (!pte_present(pteval)) |
2577 | goto out_unmap; | 2568 | goto out_unmap; |
2569 | if (pte_write(pteval)) | ||
2570 | writable = true; | ||
2571 | |||
2578 | page = vm_normal_page(vma, _address, pteval); | 2572 | page = vm_normal_page(vma, _address, pteval); |
2579 | if (unlikely(!page)) | 2573 | if (unlikely(!page)) |
2580 | goto out_unmap; | 2574 | goto out_unmap; |
@@ -2591,14 +2585,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2591 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2585 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2592 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2586 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2593 | goto out_unmap; | 2587 | goto out_unmap; |
2594 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2588 | /* |
2595 | if (page_count(page) != 1) | 2589 | * cannot use mapcount: can't collapse if there's a gup pin. |
2590 | * The page must only be referenced by the scanned process | ||
2591 | * and page swap cache. | ||
2592 | */ | ||
2593 | if (page_count(page) != 1 + !!PageSwapCache(page)) | ||
2596 | goto out_unmap; | 2594 | goto out_unmap; |
2597 | if (pte_young(pteval) || PageReferenced(page) || | 2595 | if (pte_young(pteval) || PageReferenced(page) || |
2598 | mmu_notifier_test_young(vma->vm_mm, address)) | 2596 | mmu_notifier_test_young(vma->vm_mm, address)) |
2599 | referenced = 1; | 2597 | referenced = true; |
2600 | } | 2598 | } |
2601 | if (referenced) | 2599 | if (referenced && writable) |
2602 | ret = 1; | 2600 | ret = 1; |
2603 | out_unmap: | 2601 | out_unmap: |
2604 | pte_unmap_unlock(pte, ptl); | 2602 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be0e5d0db5ec..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2657,9 +2657,10 @@ again: | |||
2657 | goto unlock; | 2657 | goto unlock; |
2658 | 2658 | ||
2659 | /* | 2659 | /* |
2660 | * HWPoisoned hugepage is already unmapped and dropped reference | 2660 | * Migrating hugepage or HWPoisoned hugepage is already |
2661 | * unmapped and its refcount is dropped, so just clear pte here. | ||
2661 | */ | 2662 | */ |
2662 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | 2663 | if (unlikely(!pte_present(pte))) { |
2663 | huge_pte_clear(mm, address, ptep); | 2664 | huge_pte_clear(mm, address, ptep); |
2664 | goto unlock; | 2665 | goto unlock; |
2665 | } | 2666 | } |
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3134 | struct page *pagecache_page = NULL; | 3135 | struct page *pagecache_page = NULL; |
3135 | struct hstate *h = hstate_vma(vma); | 3136 | struct hstate *h = hstate_vma(vma); |
3136 | struct address_space *mapping; | 3137 | struct address_space *mapping; |
3138 | int need_wait_lock = 0; | ||
3137 | 3139 | ||
3138 | address &= huge_page_mask(h); | 3140 | address &= huge_page_mask(h); |
3139 | 3141 | ||
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3172 | ret = 0; | 3174 | ret = 0; |
3173 | 3175 | ||
3174 | /* | 3176 | /* |
3177 | * entry could be a migration/hwpoison entry at this point, so this | ||
3178 | * check prevents the kernel from going below assuming that we have | ||
3179 | * an active hugepage in pagecache. This goto expects the 2nd page fault, | ||
3180 | * and is_hugetlb_entry_(migration|hwpoisoned) check will properly | ||
3181 | * handle it. | ||
3182 | */ | ||
3183 | if (!pte_present(entry)) | ||
3184 | goto out_mutex; | ||
3185 | |||
3186 | /* | ||
3175 | * If we are going to COW the mapping later, we examine the pending | 3187 | * If we are going to COW the mapping later, we examine the pending |
3176 | * reservations for this page now. This will ensure that any | 3188 | * reservations for this page now. This will ensure that any |
3177 | * allocations necessary to record that reservation occur outside the | 3189 | * allocations necessary to record that reservation occur outside the |
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3190 | vma, address); | 3202 | vma, address); |
3191 | } | 3203 | } |
3192 | 3204 | ||
3205 | ptl = huge_pte_lock(h, mm, ptep); | ||
3206 | |||
3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
3209 | goto out_ptl; | ||
3210 | |||
3193 | /* | 3211 | /* |
3194 | * hugetlb_cow() requires page locks of pte_page(entry) and | 3212 | * hugetlb_cow() requires page locks of pte_page(entry) and |
3195 | * pagecache_page, so here we need take the former one | 3213 | * pagecache_page, so here we need take the former one |
3196 | * when page != pagecache_page or !pagecache_page. | 3214 | * when page != pagecache_page or !pagecache_page. |
3197 | * Note that locking order is always pagecache_page -> page, | ||
3198 | * so no worry about deadlock. | ||
3199 | */ | 3215 | */ |
3200 | page = pte_page(entry); | 3216 | page = pte_page(entry); |
3201 | get_page(page); | ||
3202 | if (page != pagecache_page) | 3217 | if (page != pagecache_page) |
3203 | lock_page(page); | 3218 | if (!trylock_page(page)) { |
3204 | 3219 | need_wait_lock = 1; | |
3205 | ptl = huge_pte_lockptr(h, mm, ptep); | 3220 | goto out_ptl; |
3206 | spin_lock(ptl); | 3221 | } |
3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
3209 | goto out_ptl; | ||
3210 | 3222 | ||
3223 | get_page(page); | ||
3211 | 3224 | ||
3212 | if (flags & FAULT_FLAG_WRITE) { | 3225 | if (flags & FAULT_FLAG_WRITE) { |
3213 | if (!huge_pte_write(entry)) { | 3226 | if (!huge_pte_write(entry)) { |
3214 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 3227 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
3215 | pagecache_page, ptl); | 3228 | pagecache_page, ptl); |
3216 | goto out_ptl; | 3229 | goto out_put_page; |
3217 | } | 3230 | } |
3218 | entry = huge_pte_mkdirty(entry); | 3231 | entry = huge_pte_mkdirty(entry); |
3219 | } | 3232 | } |
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3221 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 3234 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
3222 | flags & FAULT_FLAG_WRITE)) | 3235 | flags & FAULT_FLAG_WRITE)) |
3223 | update_mmu_cache(vma, address, ptep); | 3236 | update_mmu_cache(vma, address, ptep); |
3224 | 3237 | out_put_page: | |
3238 | if (page != pagecache_page) | ||
3239 | unlock_page(page); | ||
3240 | put_page(page); | ||
3225 | out_ptl: | 3241 | out_ptl: |
3226 | spin_unlock(ptl); | 3242 | spin_unlock(ptl); |
3227 | 3243 | ||
@@ -3229,12 +3245,17 @@ out_ptl: | |||
3229 | unlock_page(pagecache_page); | 3245 | unlock_page(pagecache_page); |
3230 | put_page(pagecache_page); | 3246 | put_page(pagecache_page); |
3231 | } | 3247 | } |
3232 | if (page != pagecache_page) | ||
3233 | unlock_page(page); | ||
3234 | put_page(page); | ||
3235 | |||
3236 | out_mutex: | 3248 | out_mutex: |
3237 | mutex_unlock(&htlb_fault_mutex_table[hash]); | 3249 | mutex_unlock(&htlb_fault_mutex_table[hash]); |
3250 | /* | ||
3251 | * Generally it's safe to hold a refcount while waiting for a page lock. But | ||
3252 | * here we only wait to defer the next page fault and avoid a busy loop, and | ||
3253 | * the page is not used after being unlocked before the current page fault | ||
3254 | * returns. So we are safe from accessing a freed page, even if we wait here | ||
3255 | * without taking a refcount. | ||
3256 | */ | ||
3257 | if (need_wait_lock) | ||
3258 | wait_on_page_locked(page); | ||
3238 | return ret; | 3259 | return ret; |
3239 | } | 3260 | } |
3240 | 3261 | ||
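The need_wait_lock dance above follows a generic trylock-or-defer pattern. The sketch below shows the same shape outside hugetlb; the function name and the -EAGAIN retry convention are illustrative and not taken from this patch: grab the page lock opportunistically while holding the pte lock, and if that fails, drop all locks, sleep on the page lock, and let the caller retry (here, via a second page fault).

    /* Sketch of the pattern, assuming the caller retries on -EAGAIN. */
    static int do_work_under_page_lock(struct page *page, spinlock_t *ptl)
    {
            int need_wait = 0;

            spin_lock(ptl);
            if (!trylock_page(page)) {
                    need_wait = 1;
                    goto out;
            }
            /* ... work that needs both ptl and the page lock ... */
            unlock_page(page);
    out:
            spin_unlock(ptl);
            if (need_wait) {
                    /* sleep outside all locks, then have the caller retry */
                    wait_on_page_locked(page);
                    return -EAGAIN;
            }
            return 0;
    }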
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3364 | spin_unlock(ptl); | 3385 | spin_unlock(ptl); |
3365 | continue; | 3386 | continue; |
3366 | } | 3387 | } |
3367 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3388 | pte = huge_ptep_get(ptep); |
3389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | ||
3390 | spin_unlock(ptl); | ||
3391 | continue; | ||
3392 | } | ||
3393 | if (unlikely(is_hugetlb_entry_migration(pte))) { | ||
3394 | swp_entry_t entry = pte_to_swp_entry(pte); | ||
3395 | |||
3396 | if (is_write_migration_entry(entry)) { | ||
3397 | pte_t newpte; | ||
3398 | |||
3399 | make_migration_entry_read(&entry); | ||
3400 | newpte = swp_entry_to_pte(entry); | ||
3401 | set_huge_pte_at(mm, address, ptep, newpte); | ||
3402 | pages++; | ||
3403 | } | ||
3404 | spin_unlock(ptl); | ||
3405 | continue; | ||
3406 | } | ||
3407 | if (!huge_pte_none(pte)) { | ||
3368 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3408 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3369 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | 3409 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); |
3370 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | 3410 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3558 | if (saddr) { | 3598 | if (saddr) { |
3559 | spte = huge_pte_offset(svma->vm_mm, saddr); | 3599 | spte = huge_pte_offset(svma->vm_mm, saddr); |
3560 | if (spte) { | 3600 | if (spte) { |
3601 | mm_inc_nr_pmds(mm); | ||
3561 | get_page(virt_to_page(spte)); | 3602 | get_page(virt_to_page(spte)); |
3562 | break; | 3603 | break; |
3563 | } | 3604 | } |
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3569 | 3610 | ||
3570 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); | 3611 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); |
3571 | spin_lock(ptl); | 3612 | spin_lock(ptl); |
3572 | if (pud_none(*pud)) | 3613 | if (pud_none(*pud)) { |
3573 | pud_populate(mm, pud, | 3614 | pud_populate(mm, pud, |
3574 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | 3615 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
3575 | else | 3616 | } else { |
3576 | put_page(virt_to_page(spte)); | 3617 | put_page(virt_to_page(spte)); |
3618 | mm_inc_nr_pmds(mm); | ||
3619 | } | ||
3577 | spin_unlock(ptl); | 3620 | spin_unlock(ptl); |
3578 | out: | 3621 | out: |
3579 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3622 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
3604 | 3647 | ||
3605 | pud_clear(pud); | 3648 | pud_clear(pud); |
3606 | put_page(virt_to_page(ptep)); | 3649 | put_page(virt_to_page(ptep)); |
3650 | mm_dec_nr_pmds(mm); | ||
3607 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | 3651 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; |
3608 | return 1; | 3652 | return 1; |
3609 | } | 3653 | } |
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
3660 | return (pte_t *) pmd; | 3704 | return (pte_t *) pmd; |
3661 | } | 3705 | } |
3662 | 3706 | ||
3663 | struct page * | 3707 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
3664 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
3665 | pmd_t *pmd, int write) | ||
3666 | { | ||
3667 | struct page *page; | ||
3668 | 3708 | ||
3669 | page = pte_page(*(pte_t *)pmd); | 3709 | /* |
3670 | if (page) | 3710 | * These functions are overwritable if your architecture needs its own |
3671 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | 3711 | * behavior. |
3672 | return page; | 3712 | */ |
3713 | struct page * __weak | ||
3714 | follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
3715 | int write) | ||
3716 | { | ||
3717 | return ERR_PTR(-EINVAL); | ||
3673 | } | 3718 | } |
3674 | 3719 | ||
3675 | struct page * | 3720 | struct page * __weak |
3676 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3721 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
3677 | pud_t *pud, int write) | 3722 | pmd_t *pmd, int flags) |
3678 | { | 3723 | { |
3679 | struct page *page; | 3724 | struct page *page = NULL; |
3680 | 3725 | spinlock_t *ptl; | |
3681 | page = pte_page(*(pte_t *)pud); | 3726 | retry: |
3682 | if (page) | 3727 | ptl = pmd_lockptr(mm, pmd); |
3683 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | 3728 | spin_lock(ptl); |
3729 | /* | ||
3730 | * make sure that the address range covered by this pmd is not | ||
3731 | * unmapped from other threads. | ||
3732 | */ | ||
3733 | if (!pmd_huge(*pmd)) | ||
3734 | goto out; | ||
3735 | if (pmd_present(*pmd)) { | ||
3736 | page = pte_page(*(pte_t *)pmd) + | ||
3737 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
3738 | if (flags & FOLL_GET) | ||
3739 | get_page(page); | ||
3740 | } else { | ||
3741 | if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { | ||
3742 | spin_unlock(ptl); | ||
3743 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | ||
3744 | goto retry; | ||
3745 | } | ||
3746 | /* | ||
3747 | * hwpoisoned entry is treated as no_page_table in | ||
3748 | * follow_page_mask(). | ||
3749 | */ | ||
3750 | } | ||
3751 | out: | ||
3752 | spin_unlock(ptl); | ||
3684 | return page; | 3753 | return page; |
3685 | } | 3754 | } |
3686 | 3755 | ||
3687 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | ||
3688 | |||
3689 | /* Can be overriden by architectures */ | ||
3690 | struct page * __weak | 3756 | struct page * __weak |
3691 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3757 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
3692 | pud_t *pud, int write) | 3758 | pud_t *pud, int flags) |
3693 | { | 3759 | { |
3694 | BUG(); | 3760 | if (flags & FOLL_GET) |
3695 | return NULL; | 3761 | return NULL; |
3696 | } | ||
3697 | 3762 | ||
3698 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 3763 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
3764 | } | ||
3699 | 3765 | ||
3700 | #ifdef CONFIG_MEMORY_FAILURE | 3766 | #ifdef CONFIG_MEMORY_FAILURE |
3701 | 3767 | ||
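Because follow_huge_addr(), follow_huge_pmd() and follow_huge_pud() are now declared __weak, an architecture that needs different semantics simply ships a strong definition of the same symbol and the linker prefers it over the generic mm/hugetlb.c one above. The file location and body below are a made-up placeholder, not code from any real architecture.

    /* In arch/<arch>/mm/hugetlbpage.c (hypothetical): no __weak here, so this
     * definition overrides the generic one at link time. */
    struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                                  int write)
    {
            /* arch-specific huge-mapping lookup would go here */
            return ERR_PTR(-EINVAL);
    }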
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
279 | return -EINVAL; | 279 | return -EINVAL; |
280 | 280 | ||
281 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
282 | ret = page_counter_memparse(buf, &nr_pages); | 282 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
283 | if (ret) | 283 | if (ret) |
284 | return ret; | 284 | return ret; |
285 | 285 | ||
diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | |||
110 | */ | 110 | */ |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * Structure for holding the mostly immutable allocation parameters passed | ||
114 | * between functions involved in allocations, including the alloc_pages* | ||
115 | * family of functions. | ||
116 | * | ||
117 | * nodemask, migratetype and high_zoneidx are initialized only once in | ||
118 | * __alloc_pages_nodemask() and then never change. | ||
119 | * | ||
120 | * zonelist, preferred_zone and classzone_idx are set first in | ||
121 | * __alloc_pages_nodemask() for the fast path, and might be later changed | ||
122 | * in __alloc_pages_slowpath(). All other functions pass the whole structure | ||
123 | * by a const pointer. | ||
124 | */ | ||
125 | struct alloc_context { | ||
126 | struct zonelist *zonelist; | ||
127 | nodemask_t *nodemask; | ||
128 | struct zone *preferred_zone; | ||
129 | int classzone_idx; | ||
130 | int migratetype; | ||
131 | enum zone_type high_zoneidx; | ||
132 | }; | ||
133 | |||
134 | /* | ||
113 | * Locate the struct page for both the matching buddy in our | 135 | * Locate the struct page for both the matching buddy in our |
114 | * pair (buddy1) and the combined O(n+1) page they form (page). | 136 | * pair (buddy1) and the combined O(n+1) page they form (page). |
115 | * | 137 | * |
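To make the comment on struct alloc_context concrete, here is a rough sketch of how a caller such as __alloc_pages_nodemask() is expected to fill the context once and then hand it to helpers by const pointer, as try_to_compact_pages() now consumes it earlier in this patch. The function is illustrative and heavily trimmed; only the struct fields and the gfp_zone()/gfpflags_to_migratetype() helpers are taken as given.

    /* Sketch: set the immutable fields once, let the slowpath adjust the rest. */
    static struct page *example_alloc_pages(gfp_t gfp_mask, unsigned int order,
                                            struct zonelist *zonelist,
                                            nodemask_t *nodemask)
    {
            struct alloc_context ac = {
                    .high_zoneidx = gfp_zone(gfp_mask),
                    .zonelist = zonelist,
                    .nodemask = nodemask,
                    .migratetype = gfpflags_to_migratetype(gfp_mask),
            };

            /* fast path picks preferred_zone/classzone_idx; slowpath may redo it */
            ac.preferred_zone = NULL;       /* filled via first_zones_zonelist() */
            ac.classzone_idx = 0;

            /* ... get_page_from_freelist(gfp_mask, order, alloc_flags, &ac) ... */
            return NULL;
    }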
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3f8a4f52a0c..095c1f96fbec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys); | |||
72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
74 | 74 | ||
75 | /* Whether the swap controller is active */ | ||
75 | #ifdef CONFIG_MEMCG_SWAP | 76 | #ifdef CONFIG_MEMCG_SWAP |
76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | ||
77 | int do_swap_account __read_mostly; | 77 | int do_swap_account __read_mostly; |
78 | |||
79 | /* for remember boot option*/ | ||
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
81 | static int really_do_swap_account __initdata = 1; | ||
82 | #else | ||
83 | static int really_do_swap_account __initdata; | ||
84 | #endif | ||
85 | |||
86 | #else | 78 | #else |
87 | #define do_swap_account 0 | 79 | #define do_swap_account 0 |
88 | #endif | 80 | #endif |
89 | 81 | ||
90 | |||
91 | static const char * const mem_cgroup_stat_names[] = { | 82 | static const char * const mem_cgroup_stat_names[] = { |
92 | "cache", | 83 | "cache", |
93 | "rss", | 84 | "rss", |
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = { | |||
97 | "swap", | 88 | "swap", |
98 | }; | 89 | }; |
99 | 90 | ||
100 | enum mem_cgroup_events_index { | ||
101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
105 | MEM_CGROUP_EVENTS_NSTATS, | ||
106 | }; | ||
107 | |||
108 | static const char * const mem_cgroup_events_names[] = { | 91 | static const char * const mem_cgroup_events_names[] = { |
109 | "pgpgin", | 92 | "pgpgin", |
110 | "pgpgout", | 93 | "pgpgout", |
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target { | |||
138 | 121 | ||
139 | struct mem_cgroup_stat_cpu { | 122 | struct mem_cgroup_stat_cpu { |
140 | long count[MEM_CGROUP_STAT_NSTATS]; | 123 | long count[MEM_CGROUP_STAT_NSTATS]; |
141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 124 | unsigned long events[MEMCG_NR_EVENTS]; |
142 | unsigned long nr_page_events; | 125 | unsigned long nr_page_events; |
143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 126 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
144 | }; | 127 | }; |
@@ -284,6 +267,10 @@ struct mem_cgroup { | |||
284 | struct page_counter memsw; | 267 | struct page_counter memsw; |
285 | struct page_counter kmem; | 268 | struct page_counter kmem; |
286 | 269 | ||
270 | /* Normal memory consumption range */ | ||
271 | unsigned long low; | ||
272 | unsigned long high; | ||
273 | |||
287 | unsigned long soft_limit; | 274 | unsigned long soft_limit; |
288 | 275 | ||
289 | /* vmpressure notifications */ | 276 | /* vmpressure notifications */ |
@@ -325,9 +312,11 @@ struct mem_cgroup { | |||
325 | /* | 312 | /* |
326 | * set > 0 if pages under this cgroup are moving to other cgroup. | 313 | * set > 0 if pages under this cgroup are moving to other cgroup. |
327 | */ | 314 | */ |
328 | atomic_t moving_account; | 315 | atomic_t moving_account; |
329 | /* taken only while moving_account > 0 */ | 316 | /* taken only while moving_account > 0 */ |
330 | spinlock_t move_lock; | 317 | spinlock_t move_lock; |
318 | struct task_struct *move_lock_task; | ||
319 | unsigned long move_lock_flags; | ||
331 | /* | 320 | /* |
332 | * percpu counter. | 321 | * percpu counter. |
333 | */ | 322 | */ |
@@ -371,21 +360,18 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
371 | 360 | ||
372 | /* Stuffs for move charges at task migration. */ | 361 | /* Stuffs for move charges at task migration. */ |
373 | /* | 362 | /* |
374 | * Types of charges to be moved. "move_charge_at_immitgrate" and | 363 | * Types of charges to be moved. |
375 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. | ||
376 | */ | 364 | */ |
377 | enum move_type { | 365 | #define MOVE_ANON 0x1U |
378 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 366 | #define MOVE_FILE 0x2U |
379 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 367 | #define MOVE_MASK (MOVE_ANON | MOVE_FILE) |
380 | NR_MOVE_TYPE, | ||
381 | }; | ||
382 | 368 | ||
383 | /* "mc" and its members are protected by cgroup_mutex */ | 369 | /* "mc" and its members are protected by cgroup_mutex */ |
384 | static struct move_charge_struct { | 370 | static struct move_charge_struct { |
385 | spinlock_t lock; /* for from, to */ | 371 | spinlock_t lock; /* for from, to */ |
386 | struct mem_cgroup *from; | 372 | struct mem_cgroup *from; |
387 | struct mem_cgroup *to; | 373 | struct mem_cgroup *to; |
388 | unsigned long immigrate_flags; | 374 | unsigned long flags; |
389 | unsigned long precharge; | 375 | unsigned long precharge; |
390 | unsigned long moved_charge; | 376 | unsigned long moved_charge; |
391 | unsigned long moved_swap; | 377 | unsigned long moved_swap; |
@@ -396,16 +382,6 @@ static struct move_charge_struct { | |||
396 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 382 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
397 | }; | 383 | }; |
398 | 384 | ||
399 | static bool move_anon(void) | ||
400 | { | ||
401 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); | ||
402 | } | ||
403 | |||
404 | static bool move_file(void) | ||
405 | { | ||
406 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); | ||
407 | } | ||
408 | |||
409 | /* | 385 | /* |
410 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 386 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
411 | * limit reclaim to prevent infinite loops, if they ever occur. | 387 | * limit reclaim to prevent infinite loops, if they ever occur. |
@@ -1365,6 +1341,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1365 | return inactive * inactive_ratio < active; | 1341 | return inactive * inactive_ratio < active; |
1366 | } | 1342 | } |
1367 | 1343 | ||
1344 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
1345 | { | ||
1346 | struct mem_cgroup_per_zone *mz; | ||
1347 | struct mem_cgroup *memcg; | ||
1348 | |||
1349 | if (mem_cgroup_disabled()) | ||
1350 | return true; | ||
1351 | |||
1352 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
1353 | memcg = mz->memcg; | ||
1354 | |||
1355 | return !!(memcg->css.flags & CSS_ONLINE); | ||
1356 | } | ||
1357 | |||
1368 | #define mem_cgroup_from_counter(counter, member) \ | 1358 | #define mem_cgroup_from_counter(counter, member) \ |
1369 | container_of(counter, struct mem_cgroup, member) | 1359 | container_of(counter, struct mem_cgroup, member) |
1370 | 1360 | ||
@@ -1557,7 +1547,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1557 | * quickly exit and free its memory. | 1547 | * quickly exit and free its memory. |
1558 | */ | 1548 | */ |
1559 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1549 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
1560 | set_thread_flag(TIF_MEMDIE); | 1550 | mark_tsk_oom_victim(current); |
1561 | return; | 1551 | return; |
1562 | } | 1552 | } |
1563 | 1553 | ||
@@ -1931,7 +1921,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
1931 | if (!memcg) | 1921 | if (!memcg) |
1932 | return false; | 1922 | return false; |
1933 | 1923 | ||
1934 | if (!handle) | 1924 | if (!handle || oom_killer_disabled) |
1935 | goto cleanup; | 1925 | goto cleanup; |
1936 | 1926 | ||
1937 | owait.memcg = memcg; | 1927 | owait.memcg = memcg; |
@@ -1977,34 +1967,33 @@ cleanup: | |||
1977 | /** | 1967 | /** |
1978 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 1968 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction |
1979 | * @page: page that is going to change accounted state | 1969 | * @page: page that is going to change accounted state |
1980 | * @locked: &memcg->move_lock slowpath was taken | ||
1981 | * @flags: IRQ-state flags for &memcg->move_lock | ||
1982 | * | 1970 | * |
1983 | * This function must mark the beginning of an accounted page state | 1971 | * This function must mark the beginning of an accounted page state |
1984 | * change to prevent double accounting when the page is concurrently | 1972 | * change to prevent double accounting when the page is concurrently |
1985 | * being moved to another memcg: | 1973 | * being moved to another memcg: |
1986 | * | 1974 | * |
1987 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1975 | * memcg = mem_cgroup_begin_page_stat(page); |
1988 | * if (TestClearPageState(page)) | 1976 | * if (TestClearPageState(page)) |
1989 | * mem_cgroup_update_page_stat(memcg, state, -1); | 1977 | * mem_cgroup_update_page_stat(memcg, state, -1); |
1990 | * mem_cgroup_end_page_stat(memcg, locked, flags); | 1978 | * mem_cgroup_end_page_stat(memcg); |
1991 | * | ||
1992 | * The RCU lock is held throughout the transaction. The fast path can | ||
1993 | * get away without acquiring the memcg->move_lock (@locked is false) | ||
1994 | * because page moving starts with an RCU grace period. | ||
1995 | * | ||
1996 | * The RCU lock also protects the memcg from being freed when the page | ||
1997 | * state that is going to change is the only thing preventing the page | ||
1998 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), | ||
1999 | * which allows migration to go ahead and uncharge the page before the | ||
2000 | * account transaction might be complete. | ||
2001 | */ | 1979 | */ |
2002 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 1980 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) |
2003 | bool *locked, | ||
2004 | unsigned long *flags) | ||
2005 | { | 1981 | { |
2006 | struct mem_cgroup *memcg; | 1982 | struct mem_cgroup *memcg; |
1983 | unsigned long flags; | ||
2007 | 1984 | ||
1985 | /* | ||
1986 | * The RCU lock is held throughout the transaction. The fast | ||
1987 | * path can get away without acquiring the memcg->move_lock | ||
1988 | * because page moving starts with an RCU grace period. | ||
1989 | * | ||
1990 | * The RCU lock also protects the memcg from being freed when | ||
1991 | * the page state that is going to change is the only thing | ||
1992 | * preventing the page from being uncharged. | ||
1993 | * E.g. end-writeback clearing PageWriteback(), which allows | ||
1994 | * migration to go ahead and uncharge the page before the | ||
1995 | * account transaction might be complete. | ||
1996 | */ | ||
2008 | rcu_read_lock(); | 1997 | rcu_read_lock(); |
2009 | 1998 | ||
2010 | if (mem_cgroup_disabled()) | 1999 | if (mem_cgroup_disabled()) |
@@ -2014,16 +2003,22 @@ again: | |||
2014 | if (unlikely(!memcg)) | 2003 | if (unlikely(!memcg)) |
2015 | return NULL; | 2004 | return NULL; |
2016 | 2005 | ||
2017 | *locked = false; | ||
2018 | if (atomic_read(&memcg->moving_account) <= 0) | 2006 | if (atomic_read(&memcg->moving_account) <= 0) |
2019 | return memcg; | 2007 | return memcg; |
2020 | 2008 | ||
2021 | spin_lock_irqsave(&memcg->move_lock, *flags); | 2009 | spin_lock_irqsave(&memcg->move_lock, flags); |
2022 | if (memcg != page->mem_cgroup) { | 2010 | if (memcg != page->mem_cgroup) { |
2023 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 2011 | spin_unlock_irqrestore(&memcg->move_lock, flags); |
2024 | goto again; | 2012 | goto again; |
2025 | } | 2013 | } |
2026 | *locked = true; | 2014 | |
2015 | /* | ||
2016 | * When charge migration first begins, we can have locked and | ||
2017 | * unlocked page stat updates happening concurrently. Track | ||
2018 | * the task who has the lock for mem_cgroup_end_page_stat(). | ||
2019 | */ | ||
2020 | memcg->move_lock_task = current; | ||
2021 | memcg->move_lock_flags = flags; | ||
2027 | 2022 | ||
2028 | return memcg; | 2023 | return memcg; |
2029 | } | 2024 | } |
@@ -2031,14 +2026,17 @@ again: | |||
2031 | /** | 2026 | /** |
2032 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 2027 | * mem_cgroup_end_page_stat - finish a page state statistics transaction |
2033 | * @memcg: the memcg that was accounted against | 2028 | * @memcg: the memcg that was accounted against |
2034 | * @locked: value received from mem_cgroup_begin_page_stat() | ||
2035 | * @flags: value received from mem_cgroup_begin_page_stat() | ||
2036 | */ | 2029 | */ |
2037 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, | 2030 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) |
2038 | unsigned long *flags) | ||
2039 | { | 2031 | { |
2040 | if (memcg && *locked) | 2032 | if (memcg && memcg->move_lock_task == current) { |
2041 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 2033 | unsigned long flags = memcg->move_lock_flags; |
2034 | |||
2035 | memcg->move_lock_task = NULL; | ||
2036 | memcg->move_lock_flags = 0; | ||
2037 | |||
2038 | spin_unlock_irqrestore(&memcg->move_lock, flags); | ||
2039 | } | ||
2042 | 2040 | ||
2043 | rcu_read_unlock(); | 2041 | rcu_read_unlock(); |
2044 | } | 2042 | } |
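A caller-side sketch of the slimmed-down transaction, loosely modelled on file-rmap accounting and in the spirit of the updated kernel-doc above; the helper name is invented, and the point is only that callers no longer thread "locked" and IRQ flags through the begin/end pair.

    /* Sketch only: account a mapcount transition against the page's memcg. */
    static void example_unmap_file_page(struct page *page)
    {
            struct mem_cgroup *memcg;

            memcg = mem_cgroup_begin_page_stat(page);   /* takes rcu_read_lock() */
            if (atomic_add_negative(-1, &page->_mapcount))
                    mem_cgroup_update_page_stat(memcg,
                                                MEM_CGROUP_STAT_FILE_MAPPED, -1);
            mem_cgroup_end_page_stat(memcg);            /* drops move_lock if held */
    }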
@@ -2131,17 +2129,6 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2131 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2129 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2132 | } | 2130 | } |
2133 | 2131 | ||
2134 | static void __init memcg_stock_init(void) | ||
2135 | { | ||
2136 | int cpu; | ||
2137 | |||
2138 | for_each_possible_cpu(cpu) { | ||
2139 | struct memcg_stock_pcp *stock = | ||
2140 | &per_cpu(memcg_stock, cpu); | ||
2141 | INIT_WORK(&stock->work, drain_local_stock); | ||
2142 | } | ||
2143 | } | ||
2144 | |||
2145 | /* | 2132 | /* |
2146 | * Cache charges(val) to local per_cpu area. | 2133 | * Cache charges(val) to local per_cpu area. |
2147 | * This will be consumed by consume_stock() function, later. | 2134 | * This will be consumed by consume_stock() function, later. |
@@ -2291,6 +2278,8 @@ retry: | |||
2291 | if (!(gfp_mask & __GFP_WAIT)) | 2278 | if (!(gfp_mask & __GFP_WAIT)) |
2292 | goto nomem; | 2279 | goto nomem; |
2293 | 2280 | ||
2281 | mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); | ||
2282 | |||
2294 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 2283 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
2295 | gfp_mask, may_swap); | 2284 | gfp_mask, may_swap); |
2296 | 2285 | ||
@@ -2332,6 +2321,8 @@ retry: | |||
2332 | if (fatal_signal_pending(current)) | 2321 | if (fatal_signal_pending(current)) |
2333 | goto bypass; | 2322 | goto bypass; |
2334 | 2323 | ||
2324 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | ||
2325 | |||
2335 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2326 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
2336 | nomem: | 2327 | nomem: |
2337 | if (!(gfp_mask & __GFP_NOFAIL)) | 2328 | if (!(gfp_mask & __GFP_NOFAIL)) |
@@ -2343,6 +2334,16 @@ done_restock: | |||
2343 | css_get_many(&memcg->css, batch); | 2334 | css_get_many(&memcg->css, batch); |
2344 | if (batch > nr_pages) | 2335 | if (batch > nr_pages) |
2345 | refill_stock(memcg, batch - nr_pages); | 2336 | refill_stock(memcg, batch - nr_pages); |
2337 | /* | ||
2338 | * If the hierarchy is above the normal consumption range, | ||
2339 | * make the charging task trim their excess contribution. | ||
2340 | */ | ||
2341 | do { | ||
2342 | if (page_counter_read(&memcg->memory) <= memcg->high) | ||
2343 | continue; | ||
2344 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | ||
2345 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | ||
2346 | } while ((memcg = parent_mem_cgroup(memcg))); | ||
2346 | done: | 2347 | done: |
2347 | return ret; | 2348 | return ret; |
2348 | } | 2349 | } |
@@ -3390,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
3390 | int ret; | 3391 | int ret; |
3391 | 3392 | ||
3392 | buf = strstrip(buf); | 3393 | buf = strstrip(buf); |
3393 | ret = page_counter_memparse(buf, &nr_pages); | 3394 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
3394 | if (ret) | 3395 | if (ret) |
3395 | return ret; | 3396 | return ret; |
3396 | 3397 | ||
@@ -3466,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
3466 | { | 3467 | { |
3467 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3468 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
3468 | 3469 | ||
3469 | if (val >= (1 << NR_MOVE_TYPE)) | 3470 | if (val & ~MOVE_MASK) |
3470 | return -EINVAL; | 3471 | return -EINVAL; |
3471 | 3472 | ||
3472 | /* | 3473 | /* |
@@ -3544,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3544 | struct mem_cgroup *mi; | 3545 | struct mem_cgroup *mi; |
3545 | unsigned int i; | 3546 | unsigned int i; |
3546 | 3547 | ||
3548 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != | ||
3549 | MEM_CGROUP_STAT_NSTATS); | ||
3550 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != | ||
3551 | MEM_CGROUP_EVENTS_NSTATS); | ||
3547 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 3552 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
3548 | 3553 | ||
3549 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3554 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
@@ -3758,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
3758 | unsigned long usage; | 3763 | unsigned long usage; |
3759 | int i, size, ret; | 3764 | int i, size, ret; |
3760 | 3765 | ||
3761 | ret = page_counter_memparse(args, &threshold); | 3766 | ret = page_counter_memparse(args, "-1", &threshold); |
3762 | if (ret) | 3767 | if (ret) |
3763 | return ret; | 3768 | return ret; |
3764 | 3769 | ||
@@ -4248,7 +4253,7 @@ out_kfree: | |||
4248 | return ret; | 4253 | return ret; |
4249 | } | 4254 | } |
4250 | 4255 | ||
4251 | static struct cftype mem_cgroup_files[] = { | 4256 | static struct cftype mem_cgroup_legacy_files[] = { |
4252 | { | 4257 | { |
4253 | .name = "usage_in_bytes", | 4258 | .name = "usage_in_bytes", |
4254 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4259 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
@@ -4359,34 +4364,6 @@ static struct cftype mem_cgroup_files[] = { | |||
4359 | { }, /* terminate */ | 4364 | { }, /* terminate */ |
4360 | }; | 4365 | }; |
4361 | 4366 | ||
4362 | #ifdef CONFIG_MEMCG_SWAP | ||
4363 | static struct cftype memsw_cgroup_files[] = { | ||
4364 | { | ||
4365 | .name = "memsw.usage_in_bytes", | ||
4366 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
4367 | .read_u64 = mem_cgroup_read_u64, | ||
4368 | }, | ||
4369 | { | ||
4370 | .name = "memsw.max_usage_in_bytes", | ||
4371 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
4372 | .write = mem_cgroup_reset, | ||
4373 | .read_u64 = mem_cgroup_read_u64, | ||
4374 | }, | ||
4375 | { | ||
4376 | .name = "memsw.limit_in_bytes", | ||
4377 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
4378 | .write = mem_cgroup_write, | ||
4379 | .read_u64 = mem_cgroup_read_u64, | ||
4380 | }, | ||
4381 | { | ||
4382 | .name = "memsw.failcnt", | ||
4383 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
4384 | .write = mem_cgroup_reset, | ||
4385 | .read_u64 = mem_cgroup_read_u64, | ||
4386 | }, | ||
4387 | { }, /* terminate */ | ||
4388 | }; | ||
4389 | #endif | ||
4390 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4367 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4391 | { | 4368 | { |
4392 | struct mem_cgroup_per_node *pn; | 4369 | struct mem_cgroup_per_node *pn; |
@@ -4482,29 +4459,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4482 | } | 4459 | } |
4483 | EXPORT_SYMBOL(parent_mem_cgroup); | 4460 | EXPORT_SYMBOL(parent_mem_cgroup); |
4484 | 4461 | ||
4485 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
4486 | { | ||
4487 | struct mem_cgroup_tree_per_node *rtpn; | ||
4488 | struct mem_cgroup_tree_per_zone *rtpz; | ||
4489 | int tmp, node, zone; | ||
4490 | |||
4491 | for_each_node(node) { | ||
4492 | tmp = node; | ||
4493 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
4494 | tmp = -1; | ||
4495 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
4496 | BUG_ON(!rtpn); | ||
4497 | |||
4498 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
4499 | |||
4500 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
4501 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
4502 | rtpz->rb_root = RB_ROOT; | ||
4503 | spin_lock_init(&rtpz->lock); | ||
4504 | } | ||
4505 | } | ||
4506 | } | ||
4507 | |||
4508 | static struct cgroup_subsys_state * __ref | 4462 | static struct cgroup_subsys_state * __ref |
4509 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 4463 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
4510 | { | 4464 | { |
@@ -4524,6 +4478,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4524 | if (parent_css == NULL) { | 4478 | if (parent_css == NULL) { |
4525 | root_mem_cgroup = memcg; | 4479 | root_mem_cgroup = memcg; |
4526 | page_counter_init(&memcg->memory, NULL); | 4480 | page_counter_init(&memcg->memory, NULL); |
4481 | memcg->high = PAGE_COUNTER_MAX; | ||
4527 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4482 | memcg->soft_limit = PAGE_COUNTER_MAX; |
4528 | page_counter_init(&memcg->memsw, NULL); | 4483 | page_counter_init(&memcg->memsw, NULL); |
4529 | page_counter_init(&memcg->kmem, NULL); | 4484 | page_counter_init(&memcg->kmem, NULL); |
@@ -4569,6 +4524,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
4569 | 4524 | ||
4570 | if (parent->use_hierarchy) { | 4525 | if (parent->use_hierarchy) { |
4571 | page_counter_init(&memcg->memory, &parent->memory); | 4526 | page_counter_init(&memcg->memory, &parent->memory); |
4527 | memcg->high = PAGE_COUNTER_MAX; | ||
4572 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4528 | memcg->soft_limit = PAGE_COUNTER_MAX; |
4573 | page_counter_init(&memcg->memsw, &parent->memsw); | 4529 | page_counter_init(&memcg->memsw, &parent->memsw); |
4574 | page_counter_init(&memcg->kmem, &parent->kmem); | 4530 | page_counter_init(&memcg->kmem, &parent->kmem); |
@@ -4579,6 +4535,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
4579 | */ | 4535 | */ |
4580 | } else { | 4536 | } else { |
4581 | page_counter_init(&memcg->memory, NULL); | 4537 | page_counter_init(&memcg->memory, NULL); |
4538 | memcg->high = PAGE_COUNTER_MAX; | ||
4582 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4539 | memcg->soft_limit = PAGE_COUNTER_MAX; |
4583 | page_counter_init(&memcg->memsw, NULL); | 4540 | page_counter_init(&memcg->memsw, NULL); |
4584 | page_counter_init(&memcg->kmem, NULL); | 4541 | page_counter_init(&memcg->kmem, NULL); |
@@ -4654,6 +4611,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
4654 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); | 4611 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
4655 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); | 4612 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
4656 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); | 4613 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
4614 | memcg->low = 0; | ||
4615 | memcg->high = PAGE_COUNTER_MAX; | ||
4657 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4616 | memcg->soft_limit = PAGE_COUNTER_MAX; |
4658 | } | 4617 | } |
4659 | 4618 | ||
@@ -4730,12 +4689,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
4730 | if (!page || !page_mapped(page)) | 4689 | if (!page || !page_mapped(page)) |
4731 | return NULL; | 4690 | return NULL; |
4732 | if (PageAnon(page)) { | 4691 | if (PageAnon(page)) { |
4733 | /* we don't move shared anon */ | 4692 | if (!(mc.flags & MOVE_ANON)) |
4734 | if (!move_anon()) | ||
4735 | return NULL; | 4693 | return NULL; |
4736 | } else if (!move_file()) | 4694 | } else { |
4737 | /* we ignore mapcount for file pages */ | 4695 | if (!(mc.flags & MOVE_FILE)) |
4738 | return NULL; | 4696 | return NULL; |
4697 | } | ||
4739 | if (!get_page_unless_zero(page)) | 4698 | if (!get_page_unless_zero(page)) |
4740 | return NULL; | 4699 | return NULL; |
4741 | 4700 | ||
@@ -4749,7 +4708,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
4749 | struct page *page = NULL; | 4708 | struct page *page = NULL; |
4750 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4709 | swp_entry_t ent = pte_to_swp_entry(ptent); |
4751 | 4710 | ||
4752 | if (!move_anon() || non_swap_entry(ent)) | 4711 | if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) |
4753 | return NULL; | 4712 | return NULL; |
4754 | /* | 4713 | /* |
4755 | * Because lookup_swap_cache() updates some statistics counter, | 4714 | * Because lookup_swap_cache() updates some statistics counter, |
@@ -4778,7 +4737,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4778 | 4737 | ||
4779 | if (!vma->vm_file) /* anonymous vma */ | 4738 | if (!vma->vm_file) /* anonymous vma */ |
4780 | return NULL; | 4739 | return NULL; |
4781 | if (!move_file()) | 4740 | if (!(mc.flags & MOVE_FILE)) |
4782 | return NULL; | 4741 | return NULL; |
4783 | 4742 | ||
4784 | mapping = vma->vm_file->f_mapping; | 4743 | mapping = vma->vm_file->f_mapping; |
@@ -4857,7 +4816,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
4857 | 4816 | ||
4858 | page = pmd_page(pmd); | 4817 | page = pmd_page(pmd); |
4859 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 4818 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
4860 | if (!move_anon()) | 4819 | if (!(mc.flags & MOVE_ANON)) |
4861 | return ret; | 4820 | return ret; |
4862 | if (page->mem_cgroup == mc.from) { | 4821 | if (page->mem_cgroup == mc.from) { |
4863 | ret = MC_TARGET_PAGE; | 4822 | ret = MC_TARGET_PAGE; |
@@ -4880,7 +4839,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4880 | unsigned long addr, unsigned long end, | 4839 | unsigned long addr, unsigned long end, |
4881 | struct mm_walk *walk) | 4840 | struct mm_walk *walk) |
4882 | { | 4841 | { |
4883 | struct vm_area_struct *vma = walk->private; | 4842 | struct vm_area_struct *vma = walk->vma; |
4884 | pte_t *pte; | 4843 | pte_t *pte; |
4885 | spinlock_t *ptl; | 4844 | spinlock_t *ptl; |
4886 | 4845 | ||
@@ -4906,20 +4865,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4906 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 4865 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
4907 | { | 4866 | { |
4908 | unsigned long precharge; | 4867 | unsigned long precharge; |
4909 | struct vm_area_struct *vma; | ||
4910 | 4868 | ||
4869 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4870 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4871 | .mm = mm, | ||
4872 | }; | ||
4911 | down_read(&mm->mmap_sem); | 4873 | down_read(&mm->mmap_sem); |
4912 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4874 | walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); |
4913 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4914 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4915 | .mm = mm, | ||
4916 | .private = vma, | ||
4917 | }; | ||
4918 | if (is_vm_hugetlb_page(vma)) | ||
4919 | continue; | ||
4920 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4921 | &mem_cgroup_count_precharge_walk); | ||
4922 | } | ||
4923 | up_read(&mm->mmap_sem); | 4875 | up_read(&mm->mmap_sem); |
4924 | 4876 | ||
4925 | precharge = mc.precharge; | 4877 | precharge = mc.precharge; |
@@ -4999,15 +4951,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
4999 | struct task_struct *p = cgroup_taskset_first(tset); | 4951 | struct task_struct *p = cgroup_taskset_first(tset); |
5000 | int ret = 0; | 4952 | int ret = 0; |
5001 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4953 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5002 | unsigned long move_charge_at_immigrate; | 4954 | unsigned long move_flags; |
5003 | 4955 | ||
5004 | /* | 4956 | /* |
5005 | * We are now committed to this value whatever it is. Changes in this | 4957 | * We are now committed to this value whatever it is. Changes in this |
5006 | * tunable will only affect upcoming migrations, not the current one. | 4958 | * tunable will only affect upcoming migrations, not the current one. |
5007 | * So we need to save it, and keep it going. | 4959 | * So we need to save it, and keep it going. |
5008 | */ | 4960 | */ |
5009 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | 4961 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); |
5010 | if (move_charge_at_immigrate) { | 4962 | if (move_flags) { |
5011 | struct mm_struct *mm; | 4963 | struct mm_struct *mm; |
5012 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 4964 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
5013 | 4965 | ||
@@ -5027,7 +4979,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5027 | spin_lock(&mc.lock); | 4979 | spin_lock(&mc.lock); |
5028 | mc.from = from; | 4980 | mc.from = from; |
5029 | mc.to = memcg; | 4981 | mc.to = memcg; |
5030 | mc.immigrate_flags = move_charge_at_immigrate; | 4982 | mc.flags = move_flags; |
5031 | spin_unlock(&mc.lock); | 4983 | spin_unlock(&mc.lock); |
5032 | /* We set mc.moving_task later */ | 4984 | /* We set mc.moving_task later */ |
5033 | 4985 | ||
@@ -5052,7 +5004,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5052 | struct mm_walk *walk) | 5004 | struct mm_walk *walk) |
5053 | { | 5005 | { |
5054 | int ret = 0; | 5006 | int ret = 0; |
5055 | struct vm_area_struct *vma = walk->private; | 5007 | struct vm_area_struct *vma = walk->vma; |
5056 | pte_t *pte; | 5008 | pte_t *pte; |
5057 | spinlock_t *ptl; | 5009 | spinlock_t *ptl; |
5058 | enum mc_target_type target_type; | 5010 | enum mc_target_type target_type; |
@@ -5148,7 +5100,10 @@ put: /* get_mctgt_type() gets the page */ | |||
5148 | 5100 | ||
5149 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5101 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
5150 | { | 5102 | { |
5151 | struct vm_area_struct *vma; | 5103 | struct mm_walk mem_cgroup_move_charge_walk = { |
5104 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
5105 | .mm = mm, | ||
5106 | }; | ||
5152 | 5107 | ||
5153 | lru_add_drain_all(); | 5108 | lru_add_drain_all(); |
5154 | /* | 5109 | /* |
@@ -5171,24 +5126,11 @@ retry: | |||
5171 | cond_resched(); | 5126 | cond_resched(); |
5172 | goto retry; | 5127 | goto retry; |
5173 | } | 5128 | } |
5174 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5129 | /* |
5175 | int ret; | 5130 | * When we have consumed all precharges and failed in doing |
5176 | struct mm_walk mem_cgroup_move_charge_walk = { | 5131 | * additional charge, the page walk just aborts. |
5177 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 5132 | */ |
5178 | .mm = mm, | 5133 | walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); |
5179 | .private = vma, | ||
5180 | }; | ||
5181 | if (is_vm_hugetlb_page(vma)) | ||
5182 | continue; | ||
5183 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
5184 | &mem_cgroup_move_charge_walk); | ||
5185 | if (ret) | ||
5186 | /* | ||
5187 | * means we have consumed all precharges and failed in | ||
5188 | * doing additional charge. Just abandon here. | ||
5189 | */ | ||
5190 | break; | ||
5191 | } | ||
5192 | up_read(&mm->mmap_sem); | 5134 | up_read(&mm->mmap_sem); |
5193 | atomic_dec(&mc.from->moving_account); | 5135 | atomic_dec(&mc.from->moving_account); |
5194 | } | 5136 | } |
@@ -5239,118 +5181,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
5239 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 5181 | mem_cgroup_from_css(root_css)->use_hierarchy = true; |
5240 | } | 5182 | } |
5241 | 5183 | ||
5242 | struct cgroup_subsys memory_cgrp_subsys = { | 5184 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
5243 | .css_alloc = mem_cgroup_css_alloc, | 5185 | struct cftype *cft) |
5244 | .css_online = mem_cgroup_css_online, | 5186 | { |
5245 | .css_offline = mem_cgroup_css_offline, | 5187 | return mem_cgroup_usage(mem_cgroup_from_css(css), false); |
5246 | .css_free = mem_cgroup_css_free, | 5188 | } |
5247 | .css_reset = mem_cgroup_css_reset, | ||
5248 | .can_attach = mem_cgroup_can_attach, | ||
5249 | .cancel_attach = mem_cgroup_cancel_attach, | ||
5250 | .attach = mem_cgroup_move_task, | ||
5251 | .bind = mem_cgroup_bind, | ||
5252 | .legacy_cftypes = mem_cgroup_files, | ||
5253 | .early_init = 0, | ||
5254 | }; | ||
5255 | 5189 | ||
5256 | #ifdef CONFIG_MEMCG_SWAP | 5190 | static int memory_low_show(struct seq_file *m, void *v) |
5257 | static int __init enable_swap_account(char *s) | ||
5258 | { | 5191 | { |
5259 | if (!strcmp(s, "1")) | 5192 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5260 | really_do_swap_account = 1; | 5193 | unsigned long low = ACCESS_ONCE(memcg->low); |
5261 | else if (!strcmp(s, "0")) | 5194 | |
5262 | really_do_swap_account = 0; | 5195 | if (low == PAGE_COUNTER_MAX) |
5263 | return 1; | 5196 | seq_puts(m, "infinity\n"); |
5197 | else | ||
5198 | seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | ||
5199 | |||
5200 | return 0; | ||
5264 | } | 5201 | } |
5265 | __setup("swapaccount=", enable_swap_account); | ||
5266 | 5202 | ||
5267 | static void __init memsw_file_init(void) | 5203 | static ssize_t memory_low_write(struct kernfs_open_file *of, |
5204 | char *buf, size_t nbytes, loff_t off) | ||
5268 | { | 5205 | { |
5269 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 5206 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
5270 | memsw_cgroup_files)); | 5207 | unsigned long low; |
5208 | int err; | ||
5209 | |||
5210 | buf = strstrip(buf); | ||
5211 | err = page_counter_memparse(buf, "infinity", &low); | ||
5212 | if (err) | ||
5213 | return err; | ||
5214 | |||
5215 | memcg->low = low; | ||
5216 | |||
5217 | return nbytes; | ||
5271 | } | 5218 | } |
5272 | 5219 | ||
5273 | static void __init enable_swap_cgroup(void) | 5220 | static int memory_high_show(struct seq_file *m, void *v) |
5274 | { | 5221 | { |
5275 | if (!mem_cgroup_disabled() && really_do_swap_account) { | 5222 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5276 | do_swap_account = 1; | 5223 | unsigned long high = ACCESS_ONCE(memcg->high); |
5277 | memsw_file_init(); | 5224 | |
5278 | } | 5225 | if (high == PAGE_COUNTER_MAX) |
5226 | seq_puts(m, "infinity\n"); | ||
5227 | else | ||
5228 | seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | ||
5229 | |||
5230 | return 0; | ||
5279 | } | 5231 | } |
5280 | 5232 | ||
5281 | #else | 5233 | static ssize_t memory_high_write(struct kernfs_open_file *of, |
5282 | static void __init enable_swap_cgroup(void) | 5234 | char *buf, size_t nbytes, loff_t off) |
5283 | { | 5235 | { |
5236 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
5237 | unsigned long high; | ||
5238 | int err; | ||
5239 | |||
5240 | buf = strstrip(buf); | ||
5241 | err = page_counter_memparse(buf, "infinity", &high); | ||
5242 | if (err) | ||
5243 | return err; | ||
5244 | |||
5245 | memcg->high = high; | ||
5246 | |||
5247 | return nbytes; | ||
5284 | } | 5248 | } |
5285 | #endif | ||
5286 | 5249 | ||
5287 | #ifdef CONFIG_MEMCG_SWAP | 5250 | static int memory_max_show(struct seq_file *m, void *v) |
5288 | /** | ||
5289 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
5290 | * @page: page whose memsw charge to transfer | ||
5291 | * @entry: swap entry to move the charge to | ||
5292 | * | ||
5293 | * Transfer the memsw charge of @page to @entry. | ||
5294 | */ | ||
5295 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
5296 | { | 5251 | { |
5297 | struct mem_cgroup *memcg; | 5252 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5298 | unsigned short oldid; | 5253 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); |
5299 | 5254 | ||
5300 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5255 | if (max == PAGE_COUNTER_MAX) |
5301 | VM_BUG_ON_PAGE(page_count(page), page); | 5256 | seq_puts(m, "infinity\n"); |
5257 | else | ||
5258 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
5302 | 5259 | ||
5303 | if (!do_swap_account) | 5260 | return 0; |
5304 | return; | 5261 | } |
5305 | 5262 | ||
5306 | memcg = page->mem_cgroup; | 5263 | static ssize_t memory_max_write(struct kernfs_open_file *of, |
5264 | char *buf, size_t nbytes, loff_t off) | ||
5265 | { | ||
5266 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
5267 | unsigned long max; | ||
5268 | int err; | ||
5307 | 5269 | ||
5308 | /* Readahead page, never charged */ | 5270 | buf = strstrip(buf); |
5309 | if (!memcg) | 5271 | err = page_counter_memparse(buf, "infinity", &max); |
5310 | return; | 5272 | if (err) |
5273 | return err; | ||
5311 | 5274 | ||
5312 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | 5275 | err = mem_cgroup_resize_limit(memcg, max); |
5313 | VM_BUG_ON_PAGE(oldid, page); | 5276 | if (err) |
5314 | mem_cgroup_swap_statistics(memcg, true); | 5277 | return err; |
5315 | 5278 | ||
5316 | page->mem_cgroup = NULL; | 5279 | return nbytes; |
5280 | } | ||
5317 | 5281 | ||
5318 | if (!mem_cgroup_is_root(memcg)) | 5282 | static int memory_events_show(struct seq_file *m, void *v) |
5319 | page_counter_uncharge(&memcg->memory, 1); | 5283 | { |
5284 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
5320 | 5285 | ||
5321 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | 5286 | seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); |
5322 | VM_BUG_ON(!irqs_disabled()); | 5287 | seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); |
5288 | seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); | ||
5289 | seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); | ||
5323 | 5290 | ||
5324 | mem_cgroup_charge_statistics(memcg, page, -1); | 5291 | return 0; |
5325 | memcg_check_events(memcg, page); | 5292 | } |
5293 | |||
5294 | static struct cftype memory_files[] = { | ||
5295 | { | ||
5296 | .name = "current", | ||
5297 | .read_u64 = memory_current_read, | ||
5298 | }, | ||
5299 | { | ||
5300 | .name = "low", | ||
5301 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5302 | .seq_show = memory_low_show, | ||
5303 | .write = memory_low_write, | ||
5304 | }, | ||
5305 | { | ||
5306 | .name = "high", | ||
5307 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5308 | .seq_show = memory_high_show, | ||
5309 | .write = memory_high_write, | ||
5310 | }, | ||
5311 | { | ||
5312 | .name = "max", | ||
5313 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5314 | .seq_show = memory_max_show, | ||
5315 | .write = memory_max_write, | ||
5316 | }, | ||
5317 | { | ||
5318 | .name = "events", | ||
5319 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5320 | .seq_show = memory_events_show, | ||
5321 | }, | ||
5322 | { } /* terminate */ | ||
5323 | }; | ||
5324 | |||
5325 | struct cgroup_subsys memory_cgrp_subsys = { | ||
5326 | .css_alloc = mem_cgroup_css_alloc, | ||
5327 | .css_online = mem_cgroup_css_online, | ||
5328 | .css_offline = mem_cgroup_css_offline, | ||
5329 | .css_free = mem_cgroup_css_free, | ||
5330 | .css_reset = mem_cgroup_css_reset, | ||
5331 | .can_attach = mem_cgroup_can_attach, | ||
5332 | .cancel_attach = mem_cgroup_cancel_attach, | ||
5333 | .attach = mem_cgroup_move_task, | ||
5334 | .bind = mem_cgroup_bind, | ||
5335 | .dfl_cftypes = memory_files, | ||
5336 | .legacy_cftypes = mem_cgroup_legacy_files, | ||
5337 | .early_init = 0, | ||
5338 | }; | ||
5339 | |||
5340 | /** | ||
5341 | * mem_cgroup_events - count memory events against a cgroup | ||
5342 | * @memcg: the memory cgroup | ||
5343 | * @idx: the event index | ||
5344 | * @nr: the number of events to account for | ||
5345 | */ | ||
5346 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
5347 | enum mem_cgroup_events_index idx, | ||
5348 | unsigned int nr) | ||
5349 | { | ||
5350 | this_cpu_add(memcg->stat->events[idx], nr); | ||
5326 | } | 5351 | } |
5327 | 5352 | ||
5328 | /** | 5353 | /** |
5329 | * mem_cgroup_uncharge_swap - uncharge a swap entry | 5354 | * mem_cgroup_low - check if memory consumption is below the normal range |
5330 | * @entry: swap entry to uncharge | 5355 | * @root: the highest ancestor to consider |
5356 | * @memcg: the memory cgroup to check | ||
5331 | * | 5357 | * |
5332 | * Drop the memsw charge associated with @entry. | 5358 | * Returns %true if memory consumption of @memcg, and that of all |
5359 | * configurable ancestors up to @root, is below the normal range. | ||
5333 | */ | 5360 | */ |
5334 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | 5361 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) |
5335 | { | 5362 | { |
5336 | struct mem_cgroup *memcg; | 5363 | if (mem_cgroup_disabled()) |
5337 | unsigned short id; | 5364 | return false; |
5338 | 5365 | ||
5339 | if (!do_swap_account) | 5366 | /* |
5340 | return; | 5367 | * The toplevel group doesn't have a configurable range, so |
5368 | * it's never low when looked at directly, and it is not | ||
5369 | * considered an ancestor when assessing the hierarchy. | ||
5370 | */ | ||
5341 | 5371 | ||
5342 | id = swap_cgroup_record(entry, 0); | 5372 | if (memcg == root_mem_cgroup) |
5343 | rcu_read_lock(); | 5373 | return false; |
5344 | memcg = mem_cgroup_lookup(id); | 5374 | |
5345 | if (memcg) { | 5375 | if (page_counter_read(&memcg->memory) > memcg->low) |
5346 | if (!mem_cgroup_is_root(memcg)) | 5376 | return false; |
5347 | page_counter_uncharge(&memcg->memsw, 1); | 5377 | |
5348 | mem_cgroup_swap_statistics(memcg, false); | 5378 | while (memcg != root) { |
5349 | css_put(&memcg->css); | 5379 | memcg = parent_mem_cgroup(memcg); |
5380 | |||
5381 | if (memcg == root_mem_cgroup) | ||
5382 | break; | ||
5383 | |||
5384 | if (page_counter_read(&memcg->memory) > memcg->low) | ||
5385 | return false; | ||
5350 | } | 5386 | } |
5351 | rcu_read_unlock(); | 5387 | return true; |
5352 | } | 5388 | } |
5353 | #endif | ||
5354 | 5389 | ||
5355 | /** | 5390 | /** |
5356 | * mem_cgroup_try_charge - try charging a page | 5391 | * mem_cgroup_try_charge - try charging a page |
@@ -5684,10 +5719,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
5684 | */ | 5719 | */ |
5685 | static int __init mem_cgroup_init(void) | 5720 | static int __init mem_cgroup_init(void) |
5686 | { | 5721 | { |
5722 | int cpu, node; | ||
5723 | |||
5687 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 5724 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
5688 | enable_swap_cgroup(); | 5725 | |
5689 | mem_cgroup_soft_limit_tree_init(); | 5726 | for_each_possible_cpu(cpu) |
5690 | memcg_stock_init(); | 5727 | INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, |
5728 | drain_local_stock); | ||
5729 | |||
5730 | for_each_node(node) { | ||
5731 | struct mem_cgroup_tree_per_node *rtpn; | ||
5732 | int zone; | ||
5733 | |||
5734 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | ||
5735 | node_online(node) ? node : NUMA_NO_NODE); | ||
5736 | |||
5737 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
5738 | struct mem_cgroup_tree_per_zone *rtpz; | ||
5739 | |||
5740 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
5741 | rtpz->rb_root = RB_ROOT; | ||
5742 | spin_lock_init(&rtpz->lock); | ||
5743 | } | ||
5744 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
5745 | } | ||
5746 | |||
5691 | return 0; | 5747 | return 0; |
5692 | } | 5748 | } |
5693 | subsys_initcall(mem_cgroup_init); | 5749 | subsys_initcall(mem_cgroup_init); |
5750 | |||
5751 | #ifdef CONFIG_MEMCG_SWAP | ||
5752 | /** | ||
5753 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
5754 | * @page: page whose memsw charge to transfer | ||
5755 | * @entry: swap entry to move the charge to | ||
5756 | * | ||
5757 | * Transfer the memsw charge of @page to @entry. | ||
5758 | */ | ||
5759 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
5760 | { | ||
5761 | struct mem_cgroup *memcg; | ||
5762 | unsigned short oldid; | ||
5763 | |||
5764 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
5765 | VM_BUG_ON_PAGE(page_count(page), page); | ||
5766 | |||
5767 | if (!do_swap_account) | ||
5768 | return; | ||
5769 | |||
5770 | memcg = page->mem_cgroup; | ||
5771 | |||
5772 | /* Readahead page, never charged */ | ||
5773 | if (!memcg) | ||
5774 | return; | ||
5775 | |||
5776 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | ||
5777 | VM_BUG_ON_PAGE(oldid, page); | ||
5778 | mem_cgroup_swap_statistics(memcg, true); | ||
5779 | |||
5780 | page->mem_cgroup = NULL; | ||
5781 | |||
5782 | if (!mem_cgroup_is_root(memcg)) | ||
5783 | page_counter_uncharge(&memcg->memory, 1); | ||
5784 | |||
5785 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
5786 | VM_BUG_ON(!irqs_disabled()); | ||
5787 | |||
5788 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
5789 | memcg_check_events(memcg, page); | ||
5790 | } | ||
5791 | |||
5792 | /** | ||
5793 | * mem_cgroup_uncharge_swap - uncharge a swap entry | ||
5794 | * @entry: swap entry to uncharge | ||
5795 | * | ||
5796 | * Drop the memsw charge associated with @entry. | ||
5797 | */ | ||
5798 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | ||
5799 | { | ||
5800 | struct mem_cgroup *memcg; | ||
5801 | unsigned short id; | ||
5802 | |||
5803 | if (!do_swap_account) | ||
5804 | return; | ||
5805 | |||
5806 | id = swap_cgroup_record(entry, 0); | ||
5807 | rcu_read_lock(); | ||
5808 | memcg = mem_cgroup_lookup(id); | ||
5809 | if (memcg) { | ||
5810 | if (!mem_cgroup_is_root(memcg)) | ||
5811 | page_counter_uncharge(&memcg->memsw, 1); | ||
5812 | mem_cgroup_swap_statistics(memcg, false); | ||
5813 | css_put(&memcg->css); | ||
5814 | } | ||
5815 | rcu_read_unlock(); | ||
5816 | } | ||
5817 | |||
5818 | /* For remembering the boot option */ | ||
5819 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
5820 | static int really_do_swap_account __initdata = 1; | ||
5821 | #else | ||
5822 | static int really_do_swap_account __initdata; | ||
5823 | #endif | ||
5824 | |||
5825 | static int __init enable_swap_account(char *s) | ||
5826 | { | ||
5827 | if (!strcmp(s, "1")) | ||
5828 | really_do_swap_account = 1; | ||
5829 | else if (!strcmp(s, "0")) | ||
5830 | really_do_swap_account = 0; | ||
5831 | return 1; | ||
5832 | } | ||
5833 | __setup("swapaccount=", enable_swap_account); | ||
5834 | |||
5835 | static struct cftype memsw_cgroup_files[] = { | ||
5836 | { | ||
5837 | .name = "memsw.usage_in_bytes", | ||
5838 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5839 | .read_u64 = mem_cgroup_read_u64, | ||
5840 | }, | ||
5841 | { | ||
5842 | .name = "memsw.max_usage_in_bytes", | ||
5843 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5844 | .write = mem_cgroup_reset, | ||
5845 | .read_u64 = mem_cgroup_read_u64, | ||
5846 | }, | ||
5847 | { | ||
5848 | .name = "memsw.limit_in_bytes", | ||
5849 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5850 | .write = mem_cgroup_write, | ||
5851 | .read_u64 = mem_cgroup_read_u64, | ||
5852 | }, | ||
5853 | { | ||
5854 | .name = "memsw.failcnt", | ||
5855 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5856 | .write = mem_cgroup_reset, | ||
5857 | .read_u64 = mem_cgroup_read_u64, | ||
5858 | }, | ||
5859 | { }, /* terminate */ | ||
5860 | }; | ||
5861 | |||
5862 | static int __init mem_cgroup_swap_init(void) | ||
5863 | { | ||
5864 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
5865 | do_swap_account = 1; | ||
5866 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | ||
5867 | memsw_cgroup_files)); | ||
5868 | } | ||
5869 | return 0; | ||
5870 | } | ||
5871 | subsys_initcall(mem_cgroup_swap_init); | ||
5872 | |||
5873 | #endif /* CONFIG_MEMCG_SWAP */ | ||
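The memcontrol.c changes above wire the new unified-hierarchy interface files (memory.current, memory.low, memory.high, memory.max, memory.events) into memory_cgrp_subsys via dfl_cftypes, while the old control files remain reachable through mem_cgroup_legacy_files. Below is a minimal userspace sketch of how the new knobs might be exercised; it is not part of the patch, the mount point and group name are assumptions, and the write handlers above accept either a byte count or "infinity" via page_counter_memparse().

#include <stdio.h>

/*
 * Sketch only: /sys/fs/cgroup/job is an assumed unified-hierarchy
 * mount point and group name; adjust to the local setup.
 */
static int write_knob(const char *grp, const char *file, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", grp, file);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        const char *grp = "/sys/fs/cgroup/job";
        char path[256], line[128];
        FILE *f;

        write_knob(grp, "memory.low",  "536870912");    /* ~512M reserve  */
        write_knob(grp, "memory.high", "1073741824");   /* throttle at 1G */
        write_knob(grp, "memory.max",  "2147483648");   /* hard cap at 2G */

        /* memory.events reports how often low/high/max/oom were hit. */
        snprintf(path, sizeof(path), "%s/memory.events", grp);
        f = fopen(path, "r");
        if (f) {
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);
                fclose(f);
        }
        return 0;
}

Writing "infinity" restores a boundary to its unset state, matching the PAGE_COUNTER_MAX defaults that mem_cgroup_css_online() and mem_cgroup_css_reset() install above.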
diff --git a/mm/memory.c b/mm/memory.c index d63849b5188f..bbe6a73a899d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
428 | pmd = pmd_offset(pud, start); | 428 | pmd = pmd_offset(pud, start); |
429 | pud_clear(pud); | 429 | pud_clear(pud); |
430 | pmd_free_tlb(tlb, pmd, start); | 430 | pmd_free_tlb(tlb, pmd, start); |
431 | mm_dec_nr_pmds(tlb->mm); | ||
431 | } | 432 | } |
432 | 433 | ||
433 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
3322 | 3323 | ||
3323 | spin_lock(&mm->page_table_lock); | 3324 | spin_lock(&mm->page_table_lock); |
3324 | #ifndef __ARCH_HAS_4LEVEL_HACK | 3325 | #ifndef __ARCH_HAS_4LEVEL_HACK |
3325 | if (pud_present(*pud)) /* Another has populated it */ | 3326 | if (!pud_present(*pud)) { |
3326 | pmd_free(mm, new); | 3327 | mm_inc_nr_pmds(mm); |
3327 | else | ||
3328 | pud_populate(mm, pud, new); | 3328 | pud_populate(mm, pud, new); |
3329 | #else | 3329 | } else /* Another has populated it */ |
3330 | if (pgd_present(*pud)) /* Another has populated it */ | ||
3331 | pmd_free(mm, new); | 3330 | pmd_free(mm, new); |
3332 | else | 3331 | #else |
3332 | if (!pgd_present(*pud)) { | ||
3333 | mm_inc_nr_pmds(mm); | ||
3333 | pgd_populate(mm, pud, new); | 3334 | pgd_populate(mm, pud, new); |
3335 | } else /* Another has populated it */ | ||
3336 | pmd_free(mm, new); | ||
3334 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 3337 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
3335 | spin_unlock(&mm->page_table_lock); | 3338 | spin_unlock(&mm->page_table_lock); |
3336 | return 0; | 3339 | return 0; |
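The free_pmd_range() and __pmd_alloc() hunks above pair every PMD page-table page with an mm-wide counter update. The helpers themselves are outside this excerpt; the sketch below shows what they are presumably doing, by analogy with the existing nr_ptes accounting — the field name and atomic_long type are assumptions, not taken from the patch.

/*
 * Sketch only: assumed shape of the counter helpers referenced above,
 * mirroring how mm->nr_ptes is maintained.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        atomic_long_inc(&mm->nr_pmds);
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        atomic_long_dec(&mm->nr_pmds);
}

Note that __pmd_alloc() only bumps the counter on the branch that actually installs the new table; the racing loser frees its page and leaves the count untouched.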
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..f1bd23803576 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
472 | unsigned long flags); | 472 | unsigned long flags); |
473 | 473 | ||
474 | struct queue_pages { | ||
475 | struct list_head *pagelist; | ||
476 | unsigned long flags; | ||
477 | nodemask_t *nmask; | ||
478 | struct vm_area_struct *prev; | ||
479 | }; | ||
480 | |||
474 | /* | 481 | /* |
475 | * Scan through pages checking if pages follow certain conditions, | 482 | * Scan through pages checking if pages follow certain conditions, |
476 | * and move them to the pagelist if they do. | 483 | * and move them to the pagelist if they do. |
477 | */ | 484 | */ |
478 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 485 | static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, |
479 | unsigned long addr, unsigned long end, | 486 | unsigned long end, struct mm_walk *walk) |
480 | const nodemask_t *nodes, unsigned long flags, | ||
481 | void *private) | ||
482 | { | 487 | { |
483 | pte_t *orig_pte; | 488 | struct vm_area_struct *vma = walk->vma; |
489 | struct page *page; | ||
490 | struct queue_pages *qp = walk->private; | ||
491 | unsigned long flags = qp->flags; | ||
492 | int nid; | ||
484 | pte_t *pte; | 493 | pte_t *pte; |
485 | spinlock_t *ptl; | 494 | spinlock_t *ptl; |
486 | 495 | ||
487 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 496 | split_huge_page_pmd(vma, addr, pmd); |
488 | do { | 497 | if (pmd_trans_unstable(pmd)) |
489 | struct page *page; | 498 | return 0; |
490 | int nid; | ||
491 | 499 | ||
500 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | ||
501 | for (; addr != end; pte++, addr += PAGE_SIZE) { | ||
492 | if (!pte_present(*pte)) | 502 | if (!pte_present(*pte)) |
493 | continue; | 503 | continue; |
494 | page = vm_normal_page(vma, addr, *pte); | 504 | page = vm_normal_page(vma, addr, *pte); |
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
501 | if (PageReserved(page)) | 511 | if (PageReserved(page)) |
502 | continue; | 512 | continue; |
503 | nid = page_to_nid(page); | 513 | nid = page_to_nid(page); |
504 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 514 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
505 | continue; | 515 | continue; |
506 | 516 | ||
507 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 517 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
508 | migrate_page_add(page, private, flags); | 518 | migrate_page_add(page, qp->pagelist, flags); |
509 | else | 519 | } |
510 | break; | 520 | pte_unmap_unlock(pte - 1, ptl); |
511 | } while (pte++, addr += PAGE_SIZE, addr != end); | 521 | cond_resched(); |
512 | pte_unmap_unlock(orig_pte, ptl); | 522 | return 0; |
513 | return addr != end; | ||
514 | } | 523 | } |
515 | 524 | ||
516 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | 525 | static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, |
517 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | 526 | unsigned long addr, unsigned long end, |
518 | void *private) | 527 | struct mm_walk *walk) |
519 | { | 528 | { |
520 | #ifdef CONFIG_HUGETLB_PAGE | 529 | #ifdef CONFIG_HUGETLB_PAGE |
530 | struct queue_pages *qp = walk->private; | ||
531 | unsigned long flags = qp->flags; | ||
521 | int nid; | 532 | int nid; |
522 | struct page *page; | 533 | struct page *page; |
523 | spinlock_t *ptl; | 534 | spinlock_t *ptl; |
524 | pte_t entry; | 535 | pte_t entry; |
525 | 536 | ||
526 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); | 537 | ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); |
527 | entry = huge_ptep_get((pte_t *)pmd); | 538 | entry = huge_ptep_get(pte); |
528 | if (!pte_present(entry)) | 539 | if (!pte_present(entry)) |
529 | goto unlock; | 540 | goto unlock; |
530 | page = pte_page(entry); | 541 | page = pte_page(entry); |
531 | nid = page_to_nid(page); | 542 | nid = page_to_nid(page); |
532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 543 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
533 | goto unlock; | 544 | goto unlock; |
534 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 545 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ |
535 | if (flags & (MPOL_MF_MOVE_ALL) || | 546 | if (flags & (MPOL_MF_MOVE_ALL) || |
536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 547 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
537 | isolate_huge_page(page, private); | 548 | isolate_huge_page(page, qp->pagelist); |
538 | unlock: | 549 | unlock: |
539 | spin_unlock(ptl); | 550 | spin_unlock(ptl); |
540 | #else | 551 | #else |
541 | BUG(); | 552 | BUG(); |
542 | #endif | 553 | #endif |
543 | } | ||
544 | |||
545 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
546 | unsigned long addr, unsigned long end, | ||
547 | const nodemask_t *nodes, unsigned long flags, | ||
548 | void *private) | ||
549 | { | ||
550 | pmd_t *pmd; | ||
551 | unsigned long next; | ||
552 | |||
553 | pmd = pmd_offset(pud, addr); | ||
554 | do { | ||
555 | next = pmd_addr_end(addr, end); | ||
556 | if (!pmd_present(*pmd)) | ||
557 | continue; | ||
558 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | ||
559 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | ||
560 | flags, private); | ||
561 | continue; | ||
562 | } | ||
563 | split_huge_page_pmd(vma, addr, pmd); | ||
564 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
565 | continue; | ||
566 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, | ||
567 | flags, private)) | ||
568 | return -EIO; | ||
569 | } while (pmd++, addr = next, addr != end); | ||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
574 | unsigned long addr, unsigned long end, | ||
575 | const nodemask_t *nodes, unsigned long flags, | ||
576 | void *private) | ||
577 | { | ||
578 | pud_t *pud; | ||
579 | unsigned long next; | ||
580 | |||
581 | pud = pud_offset(pgd, addr); | ||
582 | do { | ||
583 | next = pud_addr_end(addr, end); | ||
584 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | ||
585 | continue; | ||
586 | if (pud_none_or_clear_bad(pud)) | ||
587 | continue; | ||
588 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, | ||
589 | flags, private)) | ||
590 | return -EIO; | ||
591 | } while (pud++, addr = next, addr != end); | ||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | ||
596 | unsigned long addr, unsigned long end, | ||
597 | const nodemask_t *nodes, unsigned long flags, | ||
598 | void *private) | ||
599 | { | ||
600 | pgd_t *pgd; | ||
601 | unsigned long next; | ||
602 | |||
603 | pgd = pgd_offset(vma->vm_mm, addr); | ||
604 | do { | ||
605 | next = pgd_addr_end(addr, end); | ||
606 | if (pgd_none_or_clear_bad(pgd)) | ||
607 | continue; | ||
608 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, | ||
609 | flags, private)) | ||
610 | return -EIO; | ||
611 | } while (pgd++, addr = next, addr != end); | ||
612 | return 0; | 554 | return 0; |
613 | } | 555 | } |
614 | 556 | ||
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
641 | } | 583 | } |
642 | #endif /* CONFIG_NUMA_BALANCING */ | 584 | #endif /* CONFIG_NUMA_BALANCING */ |
643 | 585 | ||
586 | static int queue_pages_test_walk(unsigned long start, unsigned long end, | ||
587 | struct mm_walk *walk) | ||
588 | { | ||
589 | struct vm_area_struct *vma = walk->vma; | ||
590 | struct queue_pages *qp = walk->private; | ||
591 | unsigned long endvma = vma->vm_end; | ||
592 | unsigned long flags = qp->flags; | ||
593 | |||
594 | if (vma->vm_flags & VM_PFNMAP) | ||
595 | return 1; | ||
596 | |||
597 | if (endvma > end) | ||
598 | endvma = end; | ||
599 | if (vma->vm_start > start) | ||
600 | start = vma->vm_start; | ||
601 | |||
602 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | ||
603 | if (!vma->vm_next && vma->vm_end < end) | ||
604 | return -EFAULT; | ||
605 | if (qp->prev && qp->prev->vm_end < vma->vm_start) | ||
606 | return -EFAULT; | ||
607 | } | ||
608 | |||
609 | qp->prev = vma; | ||
610 | |||
611 | if (vma->vm_flags & VM_PFNMAP) | ||
612 | return 1; | ||
613 | |||
614 | if (flags & MPOL_MF_LAZY) { | ||
615 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
616 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
617 | change_prot_numa(vma, start, endvma); | ||
618 | return 1; | ||
619 | } | ||
620 | |||
621 | if ((flags & MPOL_MF_STRICT) || | ||
622 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
623 | vma_migratable(vma))) | ||
624 | /* queue pages from current vma */ | ||
625 | return 0; | ||
626 | return 1; | ||
627 | } | ||
628 | |||
644 | /* | 629 | /* |
645 | * Walk through page tables and collect pages to be migrated. | 630 | * Walk through page tables and collect pages to be migrated. |
646 | * | 631 | * |
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
650 | */ | 635 | */ |
651 | static int | 636 | static int |
652 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 637 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
653 | const nodemask_t *nodes, unsigned long flags, void *private) | 638 | nodemask_t *nodes, unsigned long flags, |
654 | { | 639 | struct list_head *pagelist) |
655 | int err = 0; | 640 | { |
656 | struct vm_area_struct *vma, *prev; | 641 | struct queue_pages qp = { |
657 | 642 | .pagelist = pagelist, | |
658 | vma = find_vma(mm, start); | 643 | .flags = flags, |
659 | if (!vma) | 644 | .nmask = nodes, |
660 | return -EFAULT; | 645 | .prev = NULL, |
661 | prev = NULL; | 646 | }; |
662 | for (; vma && vma->vm_start < end; vma = vma->vm_next) { | 647 | struct mm_walk queue_pages_walk = { |
663 | unsigned long endvma = vma->vm_end; | 648 | .hugetlb_entry = queue_pages_hugetlb, |
664 | 649 | .pmd_entry = queue_pages_pte_range, | |
665 | if (endvma > end) | 650 | .test_walk = queue_pages_test_walk, |
666 | endvma = end; | 651 | .mm = mm, |
667 | if (vma->vm_start > start) | 652 | .private = &qp, |
668 | start = vma->vm_start; | 653 | }; |
669 | 654 | ||
670 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 655 | return walk_page_range(start, end, &queue_pages_walk); |
671 | if (!vma->vm_next && vma->vm_end < end) | ||
672 | return -EFAULT; | ||
673 | if (prev && prev->vm_end < vma->vm_start) | ||
674 | return -EFAULT; | ||
675 | } | ||
676 | |||
677 | if (flags & MPOL_MF_LAZY) { | ||
678 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
679 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
680 | change_prot_numa(vma, start, endvma); | ||
681 | goto next; | ||
682 | } | ||
683 | |||
684 | if ((flags & MPOL_MF_STRICT) || | ||
685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
686 | vma_migratable(vma))) { | ||
687 | |||
688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | ||
689 | flags, private); | ||
690 | if (err) | ||
691 | break; | ||
692 | } | ||
693 | next: | ||
694 | prev = vma; | ||
695 | } | ||
696 | return err; | ||
697 | } | 656 | } |
698 | 657 | ||
699 | /* | 658 | /* |
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1988 | * @order:Order of the GFP allocation. | 1947 | * @order:Order of the GFP allocation. |
1989 | * @vma: Pointer to VMA or NULL if not available. | 1948 | * @vma: Pointer to VMA or NULL if not available. |
1990 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1949 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
1950 | * @node: Which node to prefer for allocation (modulo policy). | ||
1951 | * @hugepage: for hugepages try only the preferred node if possible | ||
1991 | * | 1952 | * |
1992 | * This function allocates a page from the kernel page pool and applies | 1953 | * This function allocates a page from the kernel page pool and applies |
1993 | * a NUMA policy associated with the VMA or the current process. | 1954 | * a NUMA policy associated with the VMA or the current process. |
1994 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | 1955 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the |
1995 | * mm_struct of the VMA to prevent it from going away. Should be used for | 1956 | * mm_struct of the VMA to prevent it from going away. Should be used for |
1996 | * all allocations for pages that will be mapped into | 1957 | * all allocations for pages that will be mapped into user space. Returns |
1997 | * user space. Returns NULL when no page can be allocated. | 1958 | * NULL when no page can be allocated. |
1998 | * | ||
1999 | * Should be called with the mm_sem of the vma hold. | ||
2000 | */ | 1959 | */ |
2001 | struct page * | 1960 | struct page * |
2002 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1961 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
2003 | unsigned long addr, int node) | 1962 | unsigned long addr, int node, bool hugepage) |
2004 | { | 1963 | { |
2005 | struct mempolicy *pol; | 1964 | struct mempolicy *pol; |
2006 | struct page *page; | 1965 | struct page *page; |
2007 | unsigned int cpuset_mems_cookie; | 1966 | unsigned int cpuset_mems_cookie; |
1967 | struct zonelist *zl; | ||
1968 | nodemask_t *nmask; | ||
2008 | 1969 | ||
2009 | retry_cpuset: | 1970 | retry_cpuset: |
2010 | pol = get_vma_policy(vma, addr); | 1971 | pol = get_vma_policy(vma, addr); |
2011 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1972 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2012 | 1973 | ||
2013 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1974 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && |
1975 | pol->mode != MPOL_INTERLEAVE)) { | ||
1976 | /* | ||
1977 | * For hugepage allocation and non-interleave policy which | ||
1978 | * allows the current node, we only try to allocate from the | ||
1979 | * current node and don't fall back to other nodes, as the | ||
1980 | * cost of remote accesses would likely offset THP benefits. | ||
1981 | * | ||
1982 | * If the policy is interleave, or does not allow the current | ||
1983 | * node in its nodemask, we allocate the standard way. | ||
1984 | */ | ||
1985 | nmask = policy_nodemask(gfp, pol); | ||
1986 | if (!nmask || node_isset(node, *nmask)) { | ||
1987 | mpol_cond_put(pol); | ||
1988 | page = alloc_pages_exact_node(node, gfp, order); | ||
1989 | goto out; | ||
1990 | } | ||
1991 | } | ||
1992 | |||
1993 | if (pol->mode == MPOL_INTERLEAVE) { | ||
2014 | unsigned nid; | 1994 | unsigned nid; |
2015 | 1995 | ||
2016 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1996 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
2017 | mpol_cond_put(pol); | 1997 | mpol_cond_put(pol); |
2018 | page = alloc_page_interleave(gfp, order, nid); | 1998 | page = alloc_page_interleave(gfp, order, nid); |
2019 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 1999 | goto out; |
2020 | goto retry_cpuset; | ||
2021 | |||
2022 | return page; | ||
2023 | } | 2000 | } |
2024 | page = __alloc_pages_nodemask(gfp, order, | 2001 | |
2025 | policy_zonelist(gfp, pol, node), | 2002 | nmask = policy_nodemask(gfp, pol); |
2026 | policy_nodemask(gfp, pol)); | 2003 | zl = policy_zonelist(gfp, pol, node); |
2027 | mpol_cond_put(pol); | 2004 | mpol_cond_put(pol); |
2005 | page = __alloc_pages_nodemask(gfp, order, zl, nmask); | ||
2006 | out: | ||
2028 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2007 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2029 | goto retry_cpuset; | 2008 | goto retry_cpuset; |
2030 | return page; | 2009 | return page; |
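The queue_pages_range() rewrite above follows the same shape as the memcontrol.c and mincore.c conversions: the per-VMA loop and the hand-rolled pgd/pud/pmd descent are replaced by a single walk_page_range() call, per-walk state rides in walk->private, and .test_walk prunes whole VMAs before the page-table walk starts. The walker below is a compact illustration of that pattern — it merely counts present PTEs and is not part of the patch; the callback signatures match the struct mm_walk usage visible in these hunks.

/*
 * Illustrative only: a minimal walk_page_range() client in the style
 * of the conversions above.  State travels through walk->private,
 * .test_walk prunes whole VMAs, .pmd_entry visits the PTEs.
 */
struct count_state {
        unsigned long present;
};

static int count_test_walk(unsigned long start, unsigned long end,
                           struct mm_walk *walk)
{
        /* 1 skips the VMA, 0 walks it, a negative value aborts the walk. */
        if (walk->vma->vm_flags & VM_PFNMAP)
                return 1;
        return 0;
}

static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
                           unsigned long end, struct mm_walk *walk)
{
        struct count_state *cs = walk->private;
        spinlock_t *ptl;
        pte_t *pte;

        if (pmd_trans_unstable(pmd))
                return 0;
        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (pte_present(*pte))
                        cs->present++;
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        return 0;
}

static unsigned long count_present(struct mm_struct *mm)
{
        struct count_state cs = { 0 };
        struct mm_walk walk = {
                .pmd_entry = count_pmd_entry,
                .test_walk = count_test_walk,
                .mm = mm,
                .private = &cs,
        };

        down_read(&mm->mmap_sem);
        walk_page_range(0, ~0UL, &walk);
        up_read(&mm->mmap_sem);
        return cs.present;
}

queue_pages_test_walk() above uses the same return convention: 1 skips the VMA, 0 lets queue_pages_pte_range() collect its pages, and -EFAULT aborts the walk on the discontiguity checks.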
diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
197 | * get to the page and wait until migration is finished. | 197 | * get to the page and wait until migration is finished. |
198 | * When we return from this function the fault will be retried. | 198 | * When we return from this function the fault will be retried. |
199 | */ | 199 | */ |
200 | static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | 200 | void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, |
201 | spinlock_t *ptl) | 201 | spinlock_t *ptl) |
202 | { | 202 | { |
203 | pte_t pte; | 203 | pte_t pte; |
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1236 | goto put_and_set; | 1236 | goto put_and_set; |
1237 | 1237 | ||
1238 | if (PageHuge(page)) { | 1238 | if (PageHuge(page)) { |
1239 | isolate_huge_page(page, &pagelist); | 1239 | if (PageHead(page)) |
1240 | isolate_huge_page(page, &pagelist); | ||
1240 | goto put_and_set; | 1241 | goto put_and_set; |
1241 | } | 1242 | } |
1242 | 1243 | ||
diff --git a/mm/mincore.c b/mm/mincore.c index 46527c023e0c..be25efde64a4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -19,38 +19,25 @@ | |||
19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
21 | 21 | ||
22 | static void mincore_hugetlb_page_range(struct vm_area_struct *vma, | 22 | static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, |
23 | unsigned long addr, unsigned long end, | 23 | unsigned long end, struct mm_walk *walk) |
24 | unsigned char *vec) | ||
25 | { | 24 | { |
26 | #ifdef CONFIG_HUGETLB_PAGE | 25 | #ifdef CONFIG_HUGETLB_PAGE |
27 | struct hstate *h; | 26 | unsigned char present; |
27 | unsigned char *vec = walk->private; | ||
28 | 28 | ||
29 | h = hstate_vma(vma); | 29 | /* |
30 | while (1) { | 30 | * Hugepages under user process are always in RAM and never |
31 | unsigned char present; | 31 | * swapped out, but theoretically it needs to be checked. |
32 | pte_t *ptep; | 32 | */ |
33 | /* | 33 | present = pte && !huge_pte_none(huge_ptep_get(pte)); |
34 | * Huge pages are always in RAM for now, but | 34 | for (; addr != end; vec++, addr += PAGE_SIZE) |
35 | * theoretically it needs to be checked. | 35 | *vec = present; |
36 | */ | 36 | walk->private = vec; |
37 | ptep = huge_pte_offset(current->mm, | ||
38 | addr & huge_page_mask(h)); | ||
39 | present = ptep && !huge_pte_none(huge_ptep_get(ptep)); | ||
40 | while (1) { | ||
41 | *vec = present; | ||
42 | vec++; | ||
43 | addr += PAGE_SIZE; | ||
44 | if (addr == end) | ||
45 | return; | ||
46 | /* check hugepage border */ | ||
47 | if (!(addr & ~huge_page_mask(h))) | ||
48 | break; | ||
49 | } | ||
50 | } | ||
51 | #else | 37 | #else |
52 | BUG(); | 38 | BUG(); |
53 | #endif | 39 | #endif |
40 | return 0; | ||
54 | } | 41 | } |
55 | 42 | ||
56 | /* | 43 | /* |
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
94 | return present; | 81 | return present; |
95 | } | 82 | } |
96 | 83 | ||
97 | static void mincore_unmapped_range(struct vm_area_struct *vma, | 84 | static int __mincore_unmapped_range(unsigned long addr, unsigned long end, |
98 | unsigned long addr, unsigned long end, | 85 | struct vm_area_struct *vma, unsigned char *vec) |
99 | unsigned char *vec) | ||
100 | { | 86 | { |
101 | unsigned long nr = (end - addr) >> PAGE_SHIFT; | 87 | unsigned long nr = (end - addr) >> PAGE_SHIFT; |
102 | int i; | 88 | int i; |
@@ -111,23 +97,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, | |||
111 | for (i = 0; i < nr; i++) | 97 | for (i = 0; i < nr; i++) |
112 | vec[i] = 0; | 98 | vec[i] = 0; |
113 | } | 99 | } |
100 | return nr; | ||
101 | } | ||
102 | |||
103 | static int mincore_unmapped_range(unsigned long addr, unsigned long end, | ||
104 | struct mm_walk *walk) | ||
105 | { | ||
106 | walk->private += __mincore_unmapped_range(addr, end, | ||
107 | walk->vma, walk->private); | ||
108 | return 0; | ||
114 | } | 109 | } |
115 | 110 | ||
116 | static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 111 | static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
117 | unsigned long addr, unsigned long end, | 112 | struct mm_walk *walk) |
118 | unsigned char *vec) | ||
119 | { | 113 | { |
120 | unsigned long next; | ||
121 | spinlock_t *ptl; | 114 | spinlock_t *ptl; |
115 | struct vm_area_struct *vma = walk->vma; | ||
122 | pte_t *ptep; | 116 | pte_t *ptep; |
117 | unsigned char *vec = walk->private; | ||
118 | int nr = (end - addr) >> PAGE_SHIFT; | ||
119 | |||
120 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
121 | memset(vec, 1, nr); | ||
122 | spin_unlock(ptl); | ||
123 | goto out; | ||
124 | } | ||
125 | |||
126 | if (pmd_trans_unstable(pmd)) { | ||
127 | __mincore_unmapped_range(addr, end, vma, vec); | ||
128 | goto out; | ||
129 | } | ||
123 | 130 | ||
124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 131 | ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
125 | do { | 132 | for (; addr != end; ptep++, addr += PAGE_SIZE) { |
126 | pte_t pte = *ptep; | 133 | pte_t pte = *ptep; |
127 | 134 | ||
128 | next = addr + PAGE_SIZE; | ||
129 | if (pte_none(pte)) | 135 | if (pte_none(pte)) |
130 | mincore_unmapped_range(vma, addr, next, vec); | 136 | __mincore_unmapped_range(addr, addr + PAGE_SIZE, |
137 | vma, vec); | ||
131 | else if (pte_present(pte)) | 138 | else if (pte_present(pte)) |
132 | *vec = 1; | 139 | *vec = 1; |
133 | else { /* pte is a swap entry */ | 140 | else { /* pte is a swap entry */ |
@@ -150,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
150 | } | 157 | } |
151 | } | 158 | } |
152 | vec++; | 159 | vec++; |
153 | } while (ptep++, addr = next, addr != end); | 160 | } |
154 | pte_unmap_unlock(ptep - 1, ptl); | 161 | pte_unmap_unlock(ptep - 1, ptl); |
155 | } | 162 | out: |
156 | 163 | walk->private += nr; | |
157 | static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 164 | cond_resched(); |
158 | unsigned long addr, unsigned long end, | 165 | return 0; |
159 | unsigned char *vec) | ||
160 | { | ||
161 | unsigned long next; | ||
162 | pmd_t *pmd; | ||
163 | |||
164 | pmd = pmd_offset(pud, addr); | ||
165 | do { | ||
166 | next = pmd_addr_end(addr, end); | ||
167 | if (pmd_trans_huge(*pmd)) { | ||
168 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
169 | vec += (next - addr) >> PAGE_SHIFT; | ||
170 | continue; | ||
171 | } | ||
172 | /* fall through */ | ||
173 | } | ||
174 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
175 | mincore_unmapped_range(vma, addr, next, vec); | ||
176 | else | ||
177 | mincore_pte_range(vma, pmd, addr, next, vec); | ||
178 | vec += (next - addr) >> PAGE_SHIFT; | ||
179 | } while (pmd++, addr = next, addr != end); | ||
180 | } | ||
181 | |||
182 | static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
183 | unsigned long addr, unsigned long end, | ||
184 | unsigned char *vec) | ||
185 | { | ||
186 | unsigned long next; | ||
187 | pud_t *pud; | ||
188 | |||
189 | pud = pud_offset(pgd, addr); | ||
190 | do { | ||
191 | next = pud_addr_end(addr, end); | ||
192 | if (pud_none_or_clear_bad(pud)) | ||
193 | mincore_unmapped_range(vma, addr, next, vec); | ||
194 | else | ||
195 | mincore_pmd_range(vma, pud, addr, next, vec); | ||
196 | vec += (next - addr) >> PAGE_SHIFT; | ||
197 | } while (pud++, addr = next, addr != end); | ||
198 | } | ||
199 | |||
200 | static void mincore_page_range(struct vm_area_struct *vma, | ||
201 | unsigned long addr, unsigned long end, | ||
202 | unsigned char *vec) | ||
203 | { | ||
204 | unsigned long next; | ||
205 | pgd_t *pgd; | ||
206 | |||
207 | pgd = pgd_offset(vma->vm_mm, addr); | ||
208 | do { | ||
209 | next = pgd_addr_end(addr, end); | ||
210 | if (pgd_none_or_clear_bad(pgd)) | ||
211 | mincore_unmapped_range(vma, addr, next, vec); | ||
212 | else | ||
213 | mincore_pud_range(vma, pgd, addr, next, vec); | ||
214 | vec += (next - addr) >> PAGE_SHIFT; | ||
215 | } while (pgd++, addr = next, addr != end); | ||
216 | } | 166 | } |
217 | 167 | ||
218 | /* | 168 | /* |
@@ -224,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
224 | { | 174 | { |
225 | struct vm_area_struct *vma; | 175 | struct vm_area_struct *vma; |
226 | unsigned long end; | 176 | unsigned long end; |
177 | int err; | ||
178 | struct mm_walk mincore_walk = { | ||
179 | .pmd_entry = mincore_pte_range, | ||
180 | .pte_hole = mincore_unmapped_range, | ||
181 | .hugetlb_entry = mincore_hugetlb, | ||
182 | .private = vec, | ||
183 | }; | ||
227 | 184 | ||
228 | vma = find_vma(current->mm, addr); | 185 | vma = find_vma(current->mm, addr); |
229 | if (!vma || addr < vma->vm_start) | 186 | if (!vma || addr < vma->vm_start) |
230 | return -ENOMEM; | 187 | return -ENOMEM; |
231 | 188 | mincore_walk.mm = vma->vm_mm; | |
232 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 189 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
233 | 190 | err = walk_page_range(addr, end, &mincore_walk); | |
234 | if (is_vm_hugetlb_page(vma)) | 191 | if (err < 0) |
235 | mincore_hugetlb_page_range(vma, addr, end, vec); | 192 | return err; |
236 | else | ||
237 | mincore_page_range(vma, addr, end, vec); | ||
238 | |||
239 | return (end - addr) >> PAGE_SHIFT; | 193 | return (end - addr) >> PAGE_SHIFT; |
240 | } | 194 | } |
241 | 195 | ||
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); | |||
152 | */ | 152 | */ |
153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
154 | { | 154 | { |
155 | unsigned long free, allowed, reserve; | 155 | long free, allowed, reserve; |
156 | 156 | ||
157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < |
158 | -(s64)vm_committed_as_batch * num_online_cpus(), | 158 | -(s64)vm_committed_as_batch * num_online_cpus(), |
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
220 | */ | 220 | */ |
221 | if (mm) { | 221 | if (mm) { |
222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
223 | allowed -= min(mm->total_vm / 32, reserve); | 223 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
224 | } | 224 | } |
225 | 225 | ||
226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
@@ -2851,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm) | |||
2851 | vma = remove_vma(vma); | 2851 | vma = remove_vma(vma); |
2852 | } | 2852 | } |
2853 | vm_unacct_memory(nr_accounted); | 2853 | vm_unacct_memory(nr_accounted); |
2854 | |||
2855 | WARN_ON(atomic_long_read(&mm->nr_ptes) > | ||
2856 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | ||
2857 | } | 2854 | } |
2858 | 2855 | ||
2859 | /* Insert vm structure into process list sorted by address | 2856 | /* Insert vm structure into process list sorted by address |
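The __vm_enough_memory() hunks above switch the accounting locals from unsigned long to long and use min_t(long, ...), so that an intermediate shortfall shows up as a negative number instead of wrapping into an enormous unsigned value that would let the overcommit check pass. A tiny standalone illustration of the wrap being avoided; the numbers are made up.

#include <stdio.h>

int main(void)
{
        unsigned long free_u = 100;     /* pretend "free pages" tally */
        long free_s = 100;

        free_u -= 300;  /* wraps: looks like an enormous amount free   */
        free_s -= 300;  /* -200: the deficit is visible to comparisons */

        printf("unsigned: %lu\nsigned:   %ld\n", free_u, free_s);
        return 0;
}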
diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) | |||
54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ | 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ |
55 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
56 | enum zone_type highest_zoneidx, | 56 | enum zone_type highest_zoneidx, |
57 | nodemask_t *nodes, | 57 | nodemask_t *nodes) |
58 | struct zone **zone) | ||
59 | { | 58 | { |
60 | /* | 59 | /* |
61 | * Find the next suitable zone to use for the allocation. | 60 | * Find the next suitable zone to use for the allocation. |
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
69 | (z->zone && !zref_in_nodemask(z, nodes))) | 68 | (z->zone && !zref_in_nodemask(z, nodes))) |
70 | z++; | 69 | z++; |
71 | 70 | ||
72 | *zone = zonelist_zone(z); | ||
73 | return z; | 71 | return z; |
74 | } | 72 | } |
75 | 73 | ||
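Caller-side sketch of the new contract, inferred from this hunk: with the **zone output parameter gone, iterators read the zone back from the returned zoneref, roughly:

    z = next_zones_zonelist(++z, highest_zoneidx, nodes);
    zone = zonelist_zone(z);        /* previously filled in via the removed **zone argument */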
diff --git a/mm/nommu.c b/mm/nommu.c index 541bed64e348..1a19fb3b0463 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
214 | } | 214 | } |
215 | EXPORT_SYMBOL(get_user_pages); | 215 | EXPORT_SYMBOL(get_user_pages); |
216 | 216 | ||
217 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
218 | unsigned long start, unsigned long nr_pages, | ||
219 | int write, int force, struct page **pages, | ||
220 | int *locked) | ||
221 | { | ||
222 | return get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
223 | pages, NULL); | ||
224 | } | ||
225 | EXPORT_SYMBOL(get_user_pages_locked); | ||
226 | |||
227 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
228 | unsigned long start, unsigned long nr_pages, | ||
229 | int write, int force, struct page **pages, | ||
230 | unsigned int gup_flags) | ||
231 | { | ||
232 | long ret; | ||
233 | down_read(&mm->mmap_sem); | ||
234 | ret = get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
235 | pages, NULL); | ||
236 | up_read(&mm->mmap_sem); | ||
237 | return ret; | ||
238 | } | ||
239 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
240 | |||
241 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
242 | unsigned long start, unsigned long nr_pages, | ||
243 | int write, int force, struct page **pages) | ||
244 | { | ||
245 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
246 | force, pages, 0); | ||
247 | } | ||
248 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
249 | |||
217 | /** | 250 | /** |
218 | * follow_pfn - look up PFN at a user virtual address | 251 | * follow_pfn - look up PFN at a user virtual address |
219 | * @vma: memory mapping | 252 | * @vma: memory mapping |
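Usage sketch for the helpers added above, with the signatures exactly as defined in this hunk (variable names are illustrative): the _unlocked variant takes and drops mmap_sem internally, so the open-coded pattern

    down_read(&mm->mmap_sem);
    ret = get_user_pages(tsk, mm, start, nr_pages, write, force, pages, NULL);
    up_read(&mm->mmap_sem);

collapses to

    ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, force, pages);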
@@ -1895,7 +1928,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
1895 | */ | 1928 | */ |
1896 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1929 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
1897 | { | 1930 | { |
1898 | unsigned long free, allowed, reserve; | 1931 | long free, allowed, reserve; |
1899 | 1932 | ||
1900 | vm_acct_memory(pages); | 1933 | vm_acct_memory(pages); |
1901 | 1934 | ||
@@ -1959,7 +1992,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1959 | */ | 1992 | */ |
1960 | if (mm) { | 1993 | if (mm) { |
1961 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 1994 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
1962 | allowed -= min(mm->total_vm / 32, reserve); | 1995 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
1963 | } | 1996 | } |
1964 | 1997 | ||
1965 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 1998 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d503e9ce1c7b..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
169 | * The baseline for the badness score is the proportion of RAM that each | 169 | * The baseline for the badness score is the proportion of RAM that each |
170 | * task's rss, pagetable and swap space use. | 170 | * task's rss, pagetable and swap space use. |
171 | */ | 171 | */ |
172 | points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + | 172 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + |
173 | get_mm_counter(p->mm, MM_SWAPENTS); | 173 | atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); |
174 | task_unlock(p); | 174 | task_unlock(p); |
175 | 175 | ||
176 | /* | 176 | /* |
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
266 | * Don't allow any other task to have access to the reserves. | 266 | * Don't allow any other task to have access to the reserves. |
267 | */ | 267 | */ |
268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { |
269 | if (unlikely(frozen(task))) | ||
270 | __thaw_task(task); | ||
271 | if (!force_kill) | 269 | if (!force_kill) |
272 | return OOM_SCAN_ABORT; | 270 | return OOM_SCAN_ABORT; |
273 | } | 271 | } |
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
353 | struct task_struct *p; | 351 | struct task_struct *p; |
354 | struct task_struct *task; | 352 | struct task_struct *task; |
355 | 353 | ||
356 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); | 354 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); |
357 | rcu_read_lock(); | 355 | rcu_read_lock(); |
358 | for_each_process(p) { | 356 | for_each_process(p) { |
359 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | if (oom_unkillable_task(p, memcg, nodemask)) |
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
369 | continue; | 367 | continue; |
370 | } | 368 | } |
371 | 369 | ||
372 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", | 370 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", |
373 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 371 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
374 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 372 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
375 | atomic_long_read(&task->mm->nr_ptes), | 373 | atomic_long_read(&task->mm->nr_ptes), |
374 | mm_nr_pmds(task->mm), | ||
376 | get_mm_counter(task->mm, MM_SWAPENTS), | 375 | get_mm_counter(task->mm, MM_SWAPENTS), |
377 | task->signal->oom_score_adj, task->comm); | 376 | task->signal->oom_score_adj, task->comm); |
378 | task_unlock(task); | 377 | task_unlock(task); |
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
400 | } | 399 | } |
401 | 400 | ||
402 | /* | 401 | /* |
403 | * Number of OOM killer invocations (including memcg OOM killer). | 402 | * Number of OOM victims in flight |
404 | * Primarily used by PM freezer to check for potential races with | ||
405 | * OOM killed frozen task. | ||
406 | */ | 403 | */ |
407 | static atomic_t oom_kills = ATOMIC_INIT(0); | 404 | static atomic_t oom_victims = ATOMIC_INIT(0); |
405 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | ||
408 | 406 | ||
409 | int oom_kills_count(void) | 407 | bool oom_killer_disabled __read_mostly; |
408 | static DECLARE_RWSEM(oom_sem); | ||
409 | |||
410 | /** | ||
411 | * mark_tsk_oom_victim - marks the given task as an OOM victim. | ||
412 | * @tsk: task to mark | ||
413 | * | ||
414 | * Has to be called with oom_sem taken for read, and never after the | ||
415 | * OOM killer has been disabled. | ||
416 | */ | ||
417 | void mark_tsk_oom_victim(struct task_struct *tsk) | ||
410 | { | 418 | { |
411 | return atomic_read(&oom_kills); | 419 | WARN_ON(oom_killer_disabled); |
420 | /* OOM killer might race with memcg OOM */ | ||
421 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) | ||
422 | return; | ||
423 | /* | ||
424 | * Make sure that the task is woken up from uninterruptible sleep | ||
425 | * if it is frozen because OOM killer wouldn't be able to free | ||
426 | * any memory and livelock. freezing_slow_path will tell the freezer | ||
427 | * that TIF_MEMDIE tasks should be ignored. | ||
428 | */ | ||
429 | __thaw_task(tsk); | ||
430 | atomic_inc(&oom_victims); | ||
431 | } | ||
432 | |||
433 | /** | ||
434 | * unmark_oom_victim - unmarks the current task as OOM victim. | ||
435 | * | ||
436 | * Wakes up all waiters in oom_killer_disable() | ||
437 | */ | ||
438 | void unmark_oom_victim(void) | ||
439 | { | ||
440 | if (!test_and_clear_thread_flag(TIF_MEMDIE)) | ||
441 | return; | ||
442 | |||
443 | down_read(&oom_sem); | ||
444 | /* | ||
445 | * There is no need to signal the last oom_victim if there | ||
446 | * is nobody who cares. | ||
447 | */ | ||
448 | if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) | ||
449 | wake_up_all(&oom_victims_wait); | ||
450 | up_read(&oom_sem); | ||
451 | } | ||
452 | |||
453 | /** | ||
454 | * oom_killer_disable - disable OOM killer | ||
455 | * | ||
456 | * Forces all page allocations to fail rather than trigger the OOM killer. | ||
457 | * Will block and wait until all OOM victims are killed. | ||
458 | * | ||
459 | * The function cannot be called when there are runnable user tasks because | ||
460 | * userspace would see unexpected allocation failures as a result. Any new | ||
461 | * use of this function should be discussed with the MM people. | ||
462 | * | ||
463 | * Returns true if successful and false if the OOM killer cannot be | ||
464 | * disabled. | ||
465 | */ | ||
466 | bool oom_killer_disable(void) | ||
467 | { | ||
468 | /* | ||
469 | * Make sure to not race with an ongoing OOM killer | ||
470 | * and that the current is not the victim. | ||
471 | */ | ||
472 | down_write(&oom_sem); | ||
473 | if (test_thread_flag(TIF_MEMDIE)) { | ||
474 | up_write(&oom_sem); | ||
475 | return false; | ||
476 | } | ||
477 | |||
478 | oom_killer_disabled = true; | ||
479 | up_write(&oom_sem); | ||
480 | |||
481 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | ||
482 | |||
483 | return true; | ||
412 | } | 484 | } |
413 | 485 | ||
414 | void note_oom_kill(void) | 486 | /** |
487 | * oom_killer_enable - enable OOM killer | ||
488 | */ | ||
489 | void oom_killer_enable(void) | ||
415 | { | 490 | { |
416 | atomic_inc(&oom_kills); | 491 | down_write(&oom_sem); |
492 | oom_killer_disabled = false; | ||
493 | up_write(&oom_sem); | ||
417 | } | 494 | } |
418 | 495 | ||
419 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 496 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
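A caller-side sketch of the disable/enable protocol introduced above; the caller shown is hypothetical (in this series the intended user is the PM freezer), and only oom_killer_disable()/oom_killer_enable() come from the patch:

    /* Quiesce the OOM killer and wait for in-flight victims to exit. */
    if (!oom_killer_disable())
            return -EBUSY;          /* current is itself an OOM victim */

    /* ... user tasks quiesced; allocations now fail instead of OOM killing ... */

    oom_killer_enable();            /* restore normal OOM behaviour */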
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
438 | * If the task is already exiting, don't alarm the sysadmin or kill | 515 | * If the task is already exiting, don't alarm the sysadmin or kill |
439 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 516 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
440 | */ | 517 | */ |
441 | if (task_will_free_mem(p)) { | 518 | task_lock(p); |
442 | set_tsk_thread_flag(p, TIF_MEMDIE); | 519 | if (p->mm && task_will_free_mem(p)) { |
520 | mark_tsk_oom_victim(p); | ||
521 | task_unlock(p); | ||
443 | put_task_struct(p); | 522 | put_task_struct(p); |
444 | return; | 523 | return; |
445 | } | 524 | } |
525 | task_unlock(p); | ||
446 | 526 | ||
447 | if (__ratelimit(&oom_rs)) | 527 | if (__ratelimit(&oom_rs)) |
448 | dump_header(p, gfp_mask, order, memcg, nodemask); | 528 | dump_header(p, gfp_mask, order, memcg, nodemask); |
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
492 | 572 | ||
493 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 573 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
494 | mm = victim->mm; | 574 | mm = victim->mm; |
575 | mark_tsk_oom_victim(victim); | ||
495 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 576 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
496 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 577 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
497 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 578 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
522 | } | 603 | } |
523 | rcu_read_unlock(); | 604 | rcu_read_unlock(); |
524 | 605 | ||
525 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
526 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 606 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
527 | put_task_struct(victim); | 607 | put_task_struct(victim); |
528 | } | 608 | } |
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
611 | } | 691 | } |
612 | 692 | ||
613 | /** | 693 | /** |
614 | * out_of_memory - kill the "best" process when we run out of memory | 694 | * __out_of_memory - kill the "best" process when we run out of memory |
615 | * @zonelist: zonelist pointer | 695 | * @zonelist: zonelist pointer |
616 | * @gfp_mask: memory allocation flags | 696 | * @gfp_mask: memory allocation flags |
617 | * @order: amount of memory being requested as a power of 2 | 697 | * @order: amount of memory being requested as a power of 2 |
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
623 | * OR try to be smart about which process to kill. Note that we | 703 | * OR try to be smart about which process to kill. Note that we |
624 | * don't have to be perfect here, we just have to be good. | 704 | * don't have to be perfect here, we just have to be good. |
625 | */ | 705 | */ |
626 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 706 | static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
627 | int order, nodemask_t *nodemask, bool force_kill) | 707 | int order, nodemask_t *nodemask, bool force_kill) |
628 | { | 708 | { |
629 | const nodemask_t *mpol_mask; | 709 | const nodemask_t *mpol_mask; |
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
643 | * If current has a pending SIGKILL or is exiting, then automatically | 723 | * If current has a pending SIGKILL or is exiting, then automatically |
644 | * select it. The goal is to allow it to allocate so that it may | 724 | * select it. The goal is to allow it to allocate so that it may |
645 | * quickly exit and free its memory. | 725 | * quickly exit and free its memory. |
726 | * | ||
727 | * But don't select if current has already released its mm and cleared | ||
728 | * its TIF_MEMDIE flag in exit_mm(), otherwise an OOM livelock may occur. | ||
646 | */ | 729 | */ |
647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 730 | if (current->mm && |
648 | set_thread_flag(TIF_MEMDIE); | 731 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
732 | mark_tsk_oom_victim(current); | ||
649 | return; | 733 | return; |
650 | } | 734 | } |
651 | 735 | ||
@@ -688,6 +772,32 @@ out: | |||
688 | schedule_timeout_killable(1); | 772 | schedule_timeout_killable(1); |
689 | } | 773 | } |
690 | 774 | ||
775 | /** | ||
776 | * out_of_memory - tries to invoke the OOM killer. | ||
777 | * @zonelist: zonelist pointer | ||
778 | * @gfp_mask: memory allocation flags | ||
779 | * @order: amount of memory being requested as a power of 2 | ||
780 | * @nodemask: nodemask passed to page allocator | ||
781 | * @force_kill: true if a task must be killed, even if others are exiting | ||
782 | * | ||
783 | * Invokes __out_of_memory() and returns true, unless the OOM killer has | ||
784 | * been disabled by oom_killer_disable(), in which case it returns false. | ||
785 | */ | ||
786 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | ||
787 | int order, nodemask_t *nodemask, bool force_kill) | ||
788 | { | ||
789 | bool ret = false; | ||
790 | |||
791 | down_read(&oom_sem); | ||
792 | if (!oom_killer_disabled) { | ||
793 | __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); | ||
794 | ret = true; | ||
795 | } | ||
796 | up_read(&oom_sem); | ||
797 | |||
798 | return ret; | ||
799 | } | ||
800 | |||
691 | /* | 801 | /* |
692 | * The pagefault handler calls here because it is out of memory, so kill a | 802 | * The pagefault handler calls here because it is out of memory, so kill a |
693 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a | 803 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void) | |||
697 | { | 807 | { |
698 | struct zonelist *zonelist; | 808 | struct zonelist *zonelist; |
699 | 809 | ||
810 | down_read(&oom_sem); | ||
700 | if (mem_cgroup_oom_synchronize(true)) | 811 | if (mem_cgroup_oom_synchronize(true)) |
701 | return; | 812 | goto unlock; |
702 | 813 | ||
703 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); | 814 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); |
704 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { | 815 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { |
705 | out_of_memory(NULL, 0, 0, NULL, false); | 816 | if (!oom_killer_disabled) |
817 | __out_of_memory(NULL, 0, 0, NULL, false); | ||
818 | else | ||
819 | /* | ||
820 | * There shouldn't be any user tasks runnable while the | ||
821 | * OOM killer is disabled, so the current task has to | ||
822 | * be a racing OOM victim that oom_killer_disable() | ||
823 | * is waiting for. | ||
824 | */ | ||
825 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
826 | |||
706 | oom_zonelist_unlock(zonelist, GFP_KERNEL); | 827 | oom_zonelist_unlock(zonelist, GFP_KERNEL); |
707 | } | 828 | } |
829 | unlock: | ||
830 | up_read(&oom_sem); | ||
708 | } | 831 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..6a73e47e81c6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2168,9 +2168,12 @@ EXPORT_SYMBOL(account_page_redirty); | |||
2168 | */ | 2168 | */ |
2169 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | 2169 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
2170 | { | 2170 | { |
2171 | int ret; | ||
2172 | |||
2171 | wbc->pages_skipped++; | 2173 | wbc->pages_skipped++; |
2174 | ret = __set_page_dirty_nobuffers(page); | ||
2172 | account_page_redirty(page); | 2175 | account_page_redirty(page); |
2173 | return __set_page_dirty_nobuffers(page); | 2176 | return ret; |
2174 | } | 2177 | } |
2175 | EXPORT_SYMBOL(redirty_page_for_writepage); | 2178 | EXPORT_SYMBOL(redirty_page_for_writepage); |
2176 | 2179 | ||
@@ -2308,12 +2311,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); | |||
2308 | int test_clear_page_writeback(struct page *page) | 2311 | int test_clear_page_writeback(struct page *page) |
2309 | { | 2312 | { |
2310 | struct address_space *mapping = page_mapping(page); | 2313 | struct address_space *mapping = page_mapping(page); |
2311 | unsigned long memcg_flags; | ||
2312 | struct mem_cgroup *memcg; | 2314 | struct mem_cgroup *memcg; |
2313 | bool locked; | ||
2314 | int ret; | 2315 | int ret; |
2315 | 2316 | ||
2316 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2317 | memcg = mem_cgroup_begin_page_stat(page); |
2317 | if (mapping) { | 2318 | if (mapping) { |
2318 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2319 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2319 | unsigned long flags; | 2320 | unsigned long flags; |
@@ -2338,19 +2339,17 @@ int test_clear_page_writeback(struct page *page) | |||
2338 | dec_zone_page_state(page, NR_WRITEBACK); | 2339 | dec_zone_page_state(page, NR_WRITEBACK); |
2339 | inc_zone_page_state(page, NR_WRITTEN); | 2340 | inc_zone_page_state(page, NR_WRITTEN); |
2340 | } | 2341 | } |
2341 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2342 | mem_cgroup_end_page_stat(memcg); |
2342 | return ret; | 2343 | return ret; |
2343 | } | 2344 | } |
2344 | 2345 | ||
2345 | int __test_set_page_writeback(struct page *page, bool keep_write) | 2346 | int __test_set_page_writeback(struct page *page, bool keep_write) |
2346 | { | 2347 | { |
2347 | struct address_space *mapping = page_mapping(page); | 2348 | struct address_space *mapping = page_mapping(page); |
2348 | unsigned long memcg_flags; | ||
2349 | struct mem_cgroup *memcg; | 2349 | struct mem_cgroup *memcg; |
2350 | bool locked; | ||
2351 | int ret; | 2350 | int ret; |
2352 | 2351 | ||
2353 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2352 | memcg = mem_cgroup_begin_page_stat(page); |
2354 | if (mapping) { | 2353 | if (mapping) { |
2355 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2354 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2356 | unsigned long flags; | 2355 | unsigned long flags; |
@@ -2380,7 +2379,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2380 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2379 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
2381 | inc_zone_page_state(page, NR_WRITEBACK); | 2380 | inc_zone_page_state(page, NR_WRITEBACK); |
2382 | } | 2381 | } |
2383 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2382 | mem_cgroup_end_page_stat(memcg); |
2384 | return ret; | 2383 | return ret; |
2385 | 2384 | ||
2386 | } | 2385 | } |
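The simplified memcg page-state protocol after this change, shown as a standalone sketch; all three calls appear in the hunk above, only the framing is illustrative:

    struct mem_cgroup *memcg;

    memcg = mem_cgroup_begin_page_stat(page);       /* no more locked/flags outputs */
    mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
    mem_cgroup_end_page_stat(memcg);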
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f121050e8530..8d52ab18fe0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
244 | PB_migrate, PB_migrate_end); | 244 | PB_migrate, PB_migrate_end); |
245 | } | 245 | } |
246 | 246 | ||
247 | bool oom_killer_disabled __read_mostly; | ||
248 | |||
249 | #ifdef CONFIG_DEBUG_VM | 247 | #ifdef CONFIG_DEBUG_VM |
250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 248 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
251 | { | 249 | { |
@@ -381,36 +379,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
381 | } | 379 | } |
382 | } | 380 | } |
383 | 381 | ||
384 | /* update __split_huge_page_refcount if you change this function */ | ||
385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
386 | { | ||
387 | int i; | ||
388 | int nr_pages = 1 << order; | ||
389 | int bad = 0; | ||
390 | |||
391 | if (unlikely(compound_order(page) != order)) { | ||
392 | bad_page(page, "wrong compound order", 0); | ||
393 | bad++; | ||
394 | } | ||
395 | |||
396 | __ClearPageHead(page); | ||
397 | |||
398 | for (i = 1; i < nr_pages; i++) { | ||
399 | struct page *p = page + i; | ||
400 | |||
401 | if (unlikely(!PageTail(p))) { | ||
402 | bad_page(page, "PageTail not set", 0); | ||
403 | bad++; | ||
404 | } else if (unlikely(p->first_page != page)) { | ||
405 | bad_page(page, "first_page not consistent", 0); | ||
406 | bad++; | ||
407 | } | ||
408 | __ClearPageTail(p); | ||
409 | } | ||
410 | |||
411 | return bad; | ||
412 | } | ||
413 | |||
414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 382 | static inline void prep_zero_page(struct page *page, unsigned int order, |
415 | gfp_t gfp_flags) | 383 | gfp_t gfp_flags) |
416 | { | 384 | { |
@@ -613,10 +581,7 @@ static inline void __free_one_page(struct page *page, | |||
613 | int max_order = MAX_ORDER; | 581 | int max_order = MAX_ORDER; |
614 | 582 | ||
615 | VM_BUG_ON(!zone_is_initialized(zone)); | 583 | VM_BUG_ON(!zone_is_initialized(zone)); |
616 | 584 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
617 | if (unlikely(PageCompound(page))) | ||
618 | if (unlikely(destroy_compound_page(page, order))) | ||
619 | return; | ||
620 | 585 | ||
621 | VM_BUG_ON(migratetype == -1); | 586 | VM_BUG_ON(migratetype == -1); |
622 | if (is_migrate_isolate(migratetype)) { | 587 | if (is_migrate_isolate(migratetype)) { |
@@ -797,21 +762,40 @@ static void free_one_page(struct zone *zone, | |||
797 | spin_unlock(&zone->lock); | 762 | spin_unlock(&zone->lock); |
798 | } | 763 | } |
799 | 764 | ||
765 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
766 | { | ||
767 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
768 | return 0; | ||
769 | if (unlikely(!PageTail(page))) { | ||
770 | bad_page(page, "PageTail not set", 0); | ||
771 | return 1; | ||
772 | } | ||
773 | if (unlikely(page->first_page != head_page)) { | ||
774 | bad_page(page, "first_page not consistent", 0); | ||
775 | return 1; | ||
776 | } | ||
777 | return 0; | ||
778 | } | ||
779 | |||
800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 780 | static bool free_pages_prepare(struct page *page, unsigned int order) |
801 | { | 781 | { |
802 | int i; | 782 | bool compound = PageCompound(page); |
803 | int bad = 0; | 783 | int i, bad = 0; |
804 | 784 | ||
805 | VM_BUG_ON_PAGE(PageTail(page), page); | 785 | VM_BUG_ON_PAGE(PageTail(page), page); |
806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 786 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
807 | 787 | ||
808 | trace_mm_page_free(page, order); | 788 | trace_mm_page_free(page, order); |
809 | kmemcheck_free_shadow(page, order); | 789 | kmemcheck_free_shadow(page, order); |
810 | 790 | ||
811 | if (PageAnon(page)) | 791 | if (PageAnon(page)) |
812 | page->mapping = NULL; | 792 | page->mapping = NULL; |
813 | for (i = 0; i < (1 << order); i++) | 793 | bad += free_pages_check(page); |
794 | for (i = 1; i < (1 << order); i++) { | ||
795 | if (compound) | ||
796 | bad += free_tail_pages_check(page, page + i); | ||
814 | bad += free_pages_check(page + i); | 797 | bad += free_pages_check(page + i); |
798 | } | ||
815 | if (bad) | 799 | if (bad) |
816 | return false; | 800 | return false; |
817 | 801 | ||
@@ -970,7 +954,8 @@ static inline int check_new_page(struct page *page) | |||
970 | return 0; | 954 | return 0; |
971 | } | 955 | } |
972 | 956 | ||
973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 957 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
958 | int alloc_flags) | ||
974 | { | 959 | { |
975 | int i; | 960 | int i; |
976 | 961 | ||
@@ -994,6 +979,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
994 | 979 | ||
995 | set_page_owner(page, order, gfp_flags); | 980 | set_page_owner(page, order, gfp_flags); |
996 | 981 | ||
982 | /* | ||
983 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
984 | * allocate the page. The expectation is that the caller is taking | ||
985 | * steps that will free more memory. The caller should avoid the page | ||
986 | * being used for !PFMEMALLOC purposes. | ||
987 | */ | ||
988 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
989 | |||
997 | return 0; | 990 | return 0; |
998 | } | 991 | } |
999 | 992 | ||
@@ -1130,39 +1123,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
1130 | } | 1123 | } |
1131 | 1124 | ||
1132 | /* | 1125 | /* |
1133 | * If breaking a large block of pages, move all free pages to the preferred | 1126 | * When we are falling back to another migratetype during allocation, try to |
1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1127 | * steal extra free pages from the same pageblocks to satisfy further |
1135 | * more aggressive about taking ownership of free pages. | 1128 | * allocations, instead of polluting multiple pageblocks. |
1136 | * | 1129 | * |
1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1130 | * If we are stealing a relatively large buddy page, it is likely there will |
1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1131 | * be more free pages in the pageblock, so try to steal them all. For |
1139 | * to be allocated from MIGRATE_CMA areas. | 1132 | * reclaimable and unmovable allocations, we steal regardless of page size, |
1133 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
1134 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
1135 | * pageblocks. | ||
1140 | * | 1136 | * |
1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1137 | * If we claim more than half of the pageblock, change pageblock's migratetype |
1142 | * if it was unchanged). | 1138 | * as well. |
1143 | */ | 1139 | */ |
1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1140 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
1145 | int start_type, int fallback_type) | 1141 | int start_type, int fallback_type) |
1146 | { | 1142 | { |
1147 | int current_order = page_order(page); | 1143 | int current_order = page_order(page); |
1148 | 1144 | ||
1149 | /* | ||
1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
1152 | * is set to CMA so it is returned to the correct freelist in case | ||
1153 | * the page ends up being not actually allocated from the pcp lists. | ||
1154 | */ | ||
1155 | if (is_migrate_cma(fallback_type)) | ||
1156 | return fallback_type; | ||
1157 | |||
1158 | /* Take ownership for orders >= pageblock_order */ | 1145 | /* Take ownership for orders >= pageblock_order */ |
1159 | if (current_order >= pageblock_order) { | 1146 | if (current_order >= pageblock_order) { |
1160 | change_pageblock_range(page, current_order, start_type); | 1147 | change_pageblock_range(page, current_order, start_type); |
1161 | return start_type; | 1148 | return; |
1162 | } | 1149 | } |
1163 | 1150 | ||
1164 | if (current_order >= pageblock_order / 2 || | 1151 | if (current_order >= pageblock_order / 2 || |
1165 | start_type == MIGRATE_RECLAIMABLE || | 1152 | start_type == MIGRATE_RECLAIMABLE || |
1153 | start_type == MIGRATE_UNMOVABLE || | ||
1166 | page_group_by_mobility_disabled) { | 1154 | page_group_by_mobility_disabled) { |
1167 | int pages; | 1155 | int pages; |
1168 | 1156 | ||
@@ -1170,15 +1158,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1170 | 1158 | ||
1171 | /* Claim the whole block if over half of it is free */ | 1159 | /* Claim the whole block if over half of it is free */ |
1172 | if (pages >= (1 << (pageblock_order-1)) || | 1160 | if (pages >= (1 << (pageblock_order-1)) || |
1173 | page_group_by_mobility_disabled) { | 1161 | page_group_by_mobility_disabled) |
1174 | |||
1175 | set_pageblock_migratetype(page, start_type); | 1162 | set_pageblock_migratetype(page, start_type); |
1176 | return start_type; | ||
1177 | } | ||
1178 | |||
1179 | } | 1163 | } |
1180 | |||
1181 | return fallback_type; | ||
1182 | } | 1164 | } |
1183 | 1165 | ||
1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1166 | /* Remove an element from the buddy allocator from the fallback list */ |
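A worked example of the thresholds in try_to_steal_freepages() above, assuming pageblock_order == 9 (512-page, 2MB pageblocks with 4KB pages); other configurations shift the numbers accordingly:

    /* current_order >= 9           -> change the whole pageblock range to start_type
     * current_order >= 9/2 == 4    -> move every free page in the block; RECLAIMABLE
     *                                 and UNMOVABLE requests do this at any order
     * pages moved >= 1 << 8 == 256 -> at least half of the 512 pages are ours, so
     *                                 the pageblock's migratetype is switched too
     */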
@@ -1188,14 +1170,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1188 | struct free_area *area; | 1170 | struct free_area *area; |
1189 | unsigned int current_order; | 1171 | unsigned int current_order; |
1190 | struct page *page; | 1172 | struct page *page; |
1191 | int migratetype, new_type, i; | ||
1192 | 1173 | ||
1193 | /* Find the largest possible block of pages in the other list */ | 1174 | /* Find the largest possible block of pages in the other list */ |
1194 | for (current_order = MAX_ORDER-1; | 1175 | for (current_order = MAX_ORDER-1; |
1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1176 | current_order >= order && current_order <= MAX_ORDER-1; |
1196 | --current_order) { | 1177 | --current_order) { |
1178 | int i; | ||
1197 | for (i = 0;; i++) { | 1179 | for (i = 0;; i++) { |
1198 | migratetype = fallbacks[start_migratetype][i]; | 1180 | int migratetype = fallbacks[start_migratetype][i]; |
1181 | int buddy_type = start_migratetype; | ||
1199 | 1182 | ||
1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1183 | /* MIGRATE_RESERVE handled later if necessary */ |
1201 | if (migratetype == MIGRATE_RESERVE) | 1184 | if (migratetype == MIGRATE_RESERVE) |
@@ -1209,25 +1192,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1209 | struct page, lru); | 1192 | struct page, lru); |
1210 | area->nr_free--; | 1193 | area->nr_free--; |
1211 | 1194 | ||
1212 | new_type = try_to_steal_freepages(zone, page, | 1195 | if (!is_migrate_cma(migratetype)) { |
1213 | start_migratetype, | 1196 | try_to_steal_freepages(zone, page, |
1214 | migratetype); | 1197 | start_migratetype, |
1198 | migratetype); | ||
1199 | } else { | ||
1200 | /* | ||
1201 | * When borrowing from MIGRATE_CMA, we need to | ||
1202 | * release the excess buddy pages to CMA | ||
1203 | * itself, and we do not try to steal extra | ||
1204 | * free pages. | ||
1205 | */ | ||
1206 | buddy_type = migratetype; | ||
1207 | } | ||
1215 | 1208 | ||
1216 | /* Remove the page from the freelists */ | 1209 | /* Remove the page from the freelists */ |
1217 | list_del(&page->lru); | 1210 | list_del(&page->lru); |
1218 | rmv_page_order(page); | 1211 | rmv_page_order(page); |
1219 | 1212 | ||
1220 | expand(zone, page, order, current_order, area, | 1213 | expand(zone, page, order, current_order, area, |
1221 | new_type); | 1214 | buddy_type); |
1222 | /* The freepage_migratetype may differ from pageblock's | 1215 | |
1216 | /* | ||
1217 | * The freepage_migratetype may differ from pageblock's | ||
1223 | * migratetype depending on the decisions in | 1218 | * migratetype depending on the decisions in |
1224 | * try_to_steal_freepages. This is OK as long as it does | 1219 | * try_to_steal_freepages(). This is OK as long as it |
1225 | * not differ for MIGRATE_CMA type. | 1220 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
1221 | * we need to make sure unallocated pages flushed from | ||
1222 | * pcp lists are returned to the correct freelist. | ||
1226 | */ | 1223 | */ |
1227 | set_freepage_migratetype(page, new_type); | 1224 | set_freepage_migratetype(page, buddy_type); |
1228 | 1225 | ||
1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1226 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1230 | start_migratetype, migratetype, new_type); | 1227 | start_migratetype, migratetype); |
1231 | 1228 | ||
1232 | return page; | 1229 | return page; |
1233 | } | 1230 | } |
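Decision summary for the CMA special case above, restated as a sketch:

    /* fallback is MIGRATE_CMA   -> buddy_type stays MIGRATE_CMA, no stealing,
     *                              so excess buddies return to the CMA freelist
     * any other fallback        -> buddy_type = start_migratetype, and
     *                              try_to_steal_freepages() may also claim the
     *                              pageblock itself
     */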
@@ -1642,9 +1639,7 @@ int split_free_page(struct page *page) | |||
1642 | } | 1639 | } |
1643 | 1640 | ||
1644 | /* | 1641 | /* |
1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1642 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
1647 | * or two. | ||
1648 | */ | 1643 | */ |
1649 | static inline | 1644 | static inline |
1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1645 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
@@ -1655,7 +1650,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1655 | struct page *page; | 1650 | struct page *page; |
1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1651 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1657 | 1652 | ||
1658 | again: | ||
1659 | if (likely(order == 0)) { | 1653 | if (likely(order == 0)) { |
1660 | struct per_cpu_pages *pcp; | 1654 | struct per_cpu_pages *pcp; |
1661 | struct list_head *list; | 1655 | struct list_head *list; |
@@ -1711,8 +1705,6 @@ again: | |||
1711 | local_irq_restore(flags); | 1705 | local_irq_restore(flags); |
1712 | 1706 | ||
1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1707 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
1714 | if (prep_new_page(page, order, gfp_flags)) | ||
1715 | goto again; | ||
1716 | return page; | 1708 | return page; |
1717 | 1709 | ||
1718 | failed: | 1710 | failed: |
@@ -2033,10 +2025,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
2033 | * a page. | 2025 | * a page. |
2034 | */ | 2026 | */ |
2035 | static struct page * | 2027 | static struct page * |
2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2028 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2029 | const struct alloc_context *ac) |
2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
2039 | { | 2030 | { |
2031 | struct zonelist *zonelist = ac->zonelist; | ||
2040 | struct zoneref *z; | 2032 | struct zoneref *z; |
2041 | struct page *page = NULL; | 2033 | struct page *page = NULL; |
2042 | struct zone *zone; | 2034 | struct zone *zone; |
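For reference, a sketch of the alloc_context this refactor threads through the slow path; the field list is inferred from the ac-> accesses in the surrounding hunks rather than copied from the upstream header:

    struct alloc_context {
            struct zonelist *zonelist;
            nodemask_t *nodemask;
            struct zone *preferred_zone;
            int classzone_idx;
            int migratetype;
            enum zone_type high_zoneidx;
    };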
@@ -2055,8 +2047,8 @@ zonelist_scan: | |||
2055 | * Scan zonelist, looking for a zone with enough free. | 2047 | * Scan zonelist, looking for a zone with enough free. |
2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2048 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
2057 | */ | 2049 | */ |
2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2050 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
2059 | high_zoneidx, nodemask) { | 2051 | ac->nodemask) { |
2060 | unsigned long mark; | 2052 | unsigned long mark; |
2061 | 2053 | ||
2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2054 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
@@ -2073,7 +2065,7 @@ zonelist_scan: | |||
2073 | * time the page has in memory before being reclaimed. | 2065 | * time the page has in memory before being reclaimed. |
2074 | */ | 2066 | */ |
2075 | if (alloc_flags & ALLOC_FAIR) { | 2067 | if (alloc_flags & ALLOC_FAIR) { |
2076 | if (!zone_local(preferred_zone, zone)) | 2068 | if (!zone_local(ac->preferred_zone, zone)) |
2077 | break; | 2069 | break; |
2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2070 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
2079 | nr_fair_skipped++; | 2071 | nr_fair_skipped++; |
@@ -2111,7 +2103,7 @@ zonelist_scan: | |||
2111 | 2103 | ||
2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2104 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
2113 | if (!zone_watermark_ok(zone, order, mark, | 2105 | if (!zone_watermark_ok(zone, order, mark, |
2114 | classzone_idx, alloc_flags)) { | 2106 | ac->classzone_idx, alloc_flags)) { |
2115 | int ret; | 2107 | int ret; |
2116 | 2108 | ||
2117 | /* Checked here to keep the fast path fast */ | 2109 | /* Checked here to keep the fast path fast */ |
@@ -2132,7 +2124,7 @@ zonelist_scan: | |||
2132 | } | 2124 | } |
2133 | 2125 | ||
2134 | if (zone_reclaim_mode == 0 || | 2126 | if (zone_reclaim_mode == 0 || |
2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2127 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
2136 | goto this_zone_full; | 2128 | goto this_zone_full; |
2137 | 2129 | ||
2138 | /* | 2130 | /* |
@@ -2154,7 +2146,7 @@ zonelist_scan: | |||
2154 | default: | 2146 | default: |
2155 | /* did we reclaim enough */ | 2147 | /* did we reclaim enough */ |
2156 | if (zone_watermark_ok(zone, order, mark, | 2148 | if (zone_watermark_ok(zone, order, mark, |
2157 | classzone_idx, alloc_flags)) | 2149 | ac->classzone_idx, alloc_flags)) |
2158 | goto try_this_zone; | 2150 | goto try_this_zone; |
2159 | 2151 | ||
2160 | /* | 2152 | /* |
@@ -2175,27 +2167,18 @@ zonelist_scan: | |||
2175 | } | 2167 | } |
2176 | 2168 | ||
2177 | try_this_zone: | 2169 | try_this_zone: |
2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2170 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
2179 | gfp_mask, migratetype); | 2171 | gfp_mask, ac->migratetype); |
2180 | if (page) | 2172 | if (page) { |
2181 | break; | 2173 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
2174 | goto try_this_zone; | ||
2175 | return page; | ||
2176 | } | ||
2182 | this_zone_full: | 2177 | this_zone_full: |
2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2178 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2184 | zlc_mark_zone_full(zonelist, z); | 2179 | zlc_mark_zone_full(zonelist, z); |
2185 | } | 2180 | } |
2186 | 2181 | ||
2187 | if (page) { | ||
2188 | /* | ||
2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2190 | * necessary to allocate the page. The expectation is | ||
2191 | * that the caller is taking steps that will free more | ||
2192 | * memory. The caller should avoid the page being used | ||
2193 | * for !PFMEMALLOC purposes. | ||
2194 | */ | ||
2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
2196 | return page; | ||
2197 | } | ||
2198 | |||
2199 | /* | 2182 | /* |
2200 | * The first pass makes sure allocations are spread fairly within the | 2183 | * The first pass makes sure allocations are spread fairly within the |
2201 | * local node. However, the local node might have free pages left | 2184 | * local node. However, the local node might have free pages left |
@@ -2208,7 +2191,7 @@ this_zone_full: | |||
2208 | alloc_flags &= ~ALLOC_FAIR; | 2191 | alloc_flags &= ~ALLOC_FAIR; |
2209 | if (nr_fair_skipped) { | 2192 | if (nr_fair_skipped) { |
2210 | zonelist_rescan = true; | 2193 | zonelist_rescan = true; |
2211 | reset_alloc_batches(preferred_zone); | 2194 | reset_alloc_batches(ac->preferred_zone); |
2212 | } | 2195 | } |
2213 | if (nr_online_nodes > 1) | 2196 | if (nr_online_nodes > 1) |
2214 | zonelist_rescan = true; | 2197 | zonelist_rescan = true; |
@@ -2330,44 +2313,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
2330 | 2313 | ||
2331 | static inline struct page * | 2314 | static inline struct page * |
2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2315 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2316 | const struct alloc_context *ac, unsigned long *did_some_progress) |
2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2335 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2336 | { | 2317 | { |
2337 | struct page *page; | 2318 | struct page *page; |
2338 | 2319 | ||
2339 | *did_some_progress = 0; | 2320 | *did_some_progress = 0; |
2340 | 2321 | ||
2341 | if (oom_killer_disabled) | ||
2342 | return NULL; | ||
2343 | |||
2344 | /* | 2322 | /* |
2345 | * Acquire the per-zone oom lock for each zone. If that | 2323 | * Acquire the per-zone oom lock for each zone. If that |
2346 | * fails, somebody else is making progress for us. | 2324 | * fails, somebody else is making progress for us. |
2347 | */ | 2325 | */ |
2348 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | 2326 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
2349 | *did_some_progress = 1; | 2327 | *did_some_progress = 1; |
2350 | schedule_timeout_uninterruptible(1); | 2328 | schedule_timeout_uninterruptible(1); |
2351 | return NULL; | 2329 | return NULL; |
2352 | } | 2330 | } |
2353 | 2331 | ||
2354 | /* | 2332 | /* |
2355 | * PM-freezer should be notified that there might be an OOM killer on | ||
2356 | * its way to kill and wake somebody up. This is too early and we might | ||
2357 | * end up not killing anything but false positives are acceptable. | ||
2358 | * See freeze_processes. | ||
2359 | */ | ||
2360 | note_oom_kill(); | ||
2361 | |||
2362 | /* | ||
2363 | * Go through the zonelist yet one more time, keep very high watermark | 2333 | * Go through the zonelist yet one more time, keep very high watermark |
2364 | * here, this is only to catch a parallel oom killing, we must fail if | 2334 | * here, this is only to catch a parallel oom killing, we must fail if |
2365 | * we're still under heavy pressure. | 2335 | * we're still under heavy pressure. |
2366 | */ | 2336 | */ |
2367 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2337 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
2368 | order, zonelist, high_zoneidx, | 2338 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
2369 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
2370 | preferred_zone, classzone_idx, migratetype); | ||
2371 | if (page) | 2339 | if (page) |
2372 | goto out; | 2340 | goto out; |
2373 | 2341 | ||
@@ -2379,7 +2347,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2379 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2347 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2380 | goto out; | 2348 | goto out; |
2381 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2349 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2382 | if (high_zoneidx < ZONE_NORMAL) | 2350 | if (ac->high_zoneidx < ZONE_NORMAL) |
2383 | goto out; | 2351 | goto out; |
2384 | /* The OOM killer does not compensate for light reclaim */ | 2352 | /* The OOM killer does not compensate for light reclaim */ |
2385 | if (!(gfp_mask & __GFP_FS)) | 2353 | if (!(gfp_mask & __GFP_FS)) |
@@ -2395,10 +2363,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2395 | goto out; | 2363 | goto out; |
2396 | } | 2364 | } |
2397 | /* Exhausted what can be done so it's blamo time */ | 2365 | /* Exhausted what can be done so it's blamo time */ |
2398 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2366 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
2399 | *did_some_progress = 1; | 2367 | *did_some_progress = 1; |
2400 | out: | 2368 | out: |
2401 | oom_zonelist_unlock(zonelist, gfp_mask); | 2369 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
2402 | return page; | 2370 | return page; |
2403 | } | 2371 | } |
2404 | 2372 | ||
@@ -2406,10 +2374,9 @@ out: | |||
2406 | /* Try memory compaction for high-order allocations before reclaim */ | 2374 | /* Try memory compaction for high-order allocations before reclaim */ |
2407 | static struct page * | 2375 | static struct page * |
2408 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2376 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2409 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2377 | int alloc_flags, const struct alloc_context *ac, |
2410 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2378 | enum migrate_mode mode, int *contended_compaction, |
2411 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2379 | bool *deferred_compaction) |
2412 | int *contended_compaction, bool *deferred_compaction) | ||
2413 | { | 2380 | { |
2414 | unsigned long compact_result; | 2381 | unsigned long compact_result; |
2415 | struct page *page; | 2382 | struct page *page; |
@@ -2418,10 +2385,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2418 | return NULL; | 2385 | return NULL; |
2419 | 2386 | ||
2420 | current->flags |= PF_MEMALLOC; | 2387 | current->flags |= PF_MEMALLOC; |
2421 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2388 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
2422 | nodemask, mode, | 2389 | mode, contended_compaction); |
2423 | contended_compaction, | ||
2424 | alloc_flags, classzone_idx); | ||
2425 | current->flags &= ~PF_MEMALLOC; | 2390 | current->flags &= ~PF_MEMALLOC; |
2426 | 2391 | ||
2427 | switch (compact_result) { | 2392 | switch (compact_result) { |
@@ -2440,10 +2405,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2440 | */ | 2405 | */ |
2441 | count_vm_event(COMPACTSTALL); | 2406 | count_vm_event(COMPACTSTALL); |
2442 | 2407 | ||
2443 | page = get_page_from_freelist(gfp_mask, nodemask, | 2408 | page = get_page_from_freelist(gfp_mask, order, |
2444 | order, zonelist, high_zoneidx, | 2409 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2445 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2446 | preferred_zone, classzone_idx, migratetype); | ||
2447 | 2410 | ||
2448 | if (page) { | 2411 | if (page) { |
2449 | struct zone *zone = page_zone(page); | 2412 | struct zone *zone = page_zone(page); |
@@ -2467,10 +2430,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2467 | #else | 2430 | #else |
2468 | static inline struct page * | 2431 | static inline struct page * |
2469 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2432 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2470 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2433 | int alloc_flags, const struct alloc_context *ac, |
2471 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2434 | enum migrate_mode mode, int *contended_compaction, |
2472 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2435 | bool *deferred_compaction) |
2473 | int *contended_compaction, bool *deferred_compaction) | ||
2474 | { | 2436 | { |
2475 | return NULL; | 2437 | return NULL; |
2476 | } | 2438 | } |
@@ -2478,8 +2440,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2478 | 2440 | ||
2479 | /* Perform direct synchronous page reclaim */ | 2441 | /* Perform direct synchronous page reclaim */ |
2480 | static int | 2442 | static int |
2481 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2443 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
2482 | nodemask_t *nodemask) | 2444 | const struct alloc_context *ac) |
2483 | { | 2445 | { |
2484 | struct reclaim_state reclaim_state; | 2446 | struct reclaim_state reclaim_state; |
2485 | int progress; | 2447 | int progress; |
@@ -2493,7 +2455,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2493 | reclaim_state.reclaimed_slab = 0; | 2455 | reclaim_state.reclaimed_slab = 0; |
2494 | current->reclaim_state = &reclaim_state; | 2456 | current->reclaim_state = &reclaim_state; |
2495 | 2457 | ||
2496 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2458 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
2459 | ac->nodemask); | ||
2497 | 2460 | ||
2498 | current->reclaim_state = NULL; | 2461 | current->reclaim_state = NULL; |
2499 | lockdep_clear_current_reclaim_state(); | 2462 | lockdep_clear_current_reclaim_state(); |
@@ -2507,28 +2470,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2507 | /* The really slow allocator path where we enter direct reclaim */ | 2470 | /* The really slow allocator path where we enter direct reclaim */ |
2508 | static inline struct page * | 2471 | static inline struct page * |
2509 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2472 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2510 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2473 | int alloc_flags, const struct alloc_context *ac, |
2511 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2474 | unsigned long *did_some_progress) |
2512 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2513 | { | 2475 | { |
2514 | struct page *page = NULL; | 2476 | struct page *page = NULL; |
2515 | bool drained = false; | 2477 | bool drained = false; |
2516 | 2478 | ||
2517 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2479 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
2518 | nodemask); | ||
2519 | if (unlikely(!(*did_some_progress))) | 2480 | if (unlikely(!(*did_some_progress))) |
2520 | return NULL; | 2481 | return NULL; |
2521 | 2482 | ||
2522 | /* After successful reclaim, reconsider all zones for allocation */ | 2483 | /* After successful reclaim, reconsider all zones for allocation */ |
2523 | if (IS_ENABLED(CONFIG_NUMA)) | 2484 | if (IS_ENABLED(CONFIG_NUMA)) |
2524 | zlc_clear_zones_full(zonelist); | 2485 | zlc_clear_zones_full(ac->zonelist); |
2525 | 2486 | ||
2526 | retry: | 2487 | retry: |
2527 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2488 | page = get_page_from_freelist(gfp_mask, order, |
2528 | zonelist, high_zoneidx, | 2489 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2529 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2530 | preferred_zone, classzone_idx, | ||
2531 | migratetype); | ||
2532 | 2490 | ||
2533 | /* | 2491 | /* |
2534 | * If an allocation failed after direct reclaim, it could be because | 2492 | * If an allocation failed after direct reclaim, it could be because |
@@ -2549,36 +2507,30 @@ retry: | |||
2549 | */ | 2507 | */ |
2550 | static inline struct page * | 2508 | static inline struct page * |
2551 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2509 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2552 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2510 | const struct alloc_context *ac) |
2553 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2554 | int classzone_idx, int migratetype) | ||
2555 | { | 2511 | { |
2556 | struct page *page; | 2512 | struct page *page; |
2557 | 2513 | ||
2558 | do { | 2514 | do { |
2559 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2515 | page = get_page_from_freelist(gfp_mask, order, |
2560 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2516 | ALLOC_NO_WATERMARKS, ac); |
2561 | preferred_zone, classzone_idx, migratetype); | ||
2562 | 2517 | ||
2563 | if (!page && gfp_mask & __GFP_NOFAIL) | 2518 | if (!page && gfp_mask & __GFP_NOFAIL) |
2564 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2519 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
2520 | HZ/50); | ||
2565 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2521 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2566 | 2522 | ||
2567 | return page; | 2523 | return page; |
2568 | } | 2524 | } |
2569 | 2525 | ||
2570 | static void wake_all_kswapds(unsigned int order, | 2526 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
2571 | struct zonelist *zonelist, | ||
2572 | enum zone_type high_zoneidx, | ||
2573 | struct zone *preferred_zone, | ||
2574 | nodemask_t *nodemask) | ||
2575 | { | 2527 | { |
2576 | struct zoneref *z; | 2528 | struct zoneref *z; |
2577 | struct zone *zone; | 2529 | struct zone *zone; |
2578 | 2530 | ||
2579 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2531 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
2580 | high_zoneidx, nodemask) | 2532 | ac->high_zoneidx, ac->nodemask) |
2581 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2533 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
2582 | } | 2534 | } |
2583 | 2535 | ||
2584 | static inline int | 2536 | static inline int |
@@ -2637,9 +2589,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
2637 | 2589 | ||
2638 | static inline struct page * | 2590 | static inline struct page * |
2639 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2591 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2640 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2592 | struct alloc_context *ac) |
2641 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2642 | int classzone_idx, int migratetype) | ||
2643 | { | 2593 | { |
2644 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2594 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2645 | struct page *page = NULL; | 2595 | struct page *page = NULL; |
@@ -2675,8 +2625,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2675 | 2625 | ||
2676 | retry: | 2626 | retry: |
2677 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2627 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2678 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2628 | wake_all_kswapds(order, ac); |
2679 | preferred_zone, nodemask); | ||
2680 | 2629 | ||
2681 | /* | 2630 | /* |
2682 | * OK, we're below the kswapd watermark and have kicked background | 2631 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2689,17 +2638,16 @@ retry: | |||
2689 | * Find the true preferred zone if the allocation is unconstrained by | 2638 | * Find the true preferred zone if the allocation is unconstrained by |
2690 | * cpusets. | 2639 | * cpusets. |
2691 | */ | 2640 | */ |
2692 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2641 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
2693 | struct zoneref *preferred_zoneref; | 2642 | struct zoneref *preferred_zoneref; |
2694 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2643 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
2695 | NULL, &preferred_zone); | 2644 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
2696 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2645 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2697 | } | 2646 | } |
2698 | 2647 | ||
2699 | /* This is the last chance, in general, before the goto nopage. */ | 2648 | /* This is the last chance, in general, before the goto nopage. */ |
2700 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2649 | page = get_page_from_freelist(gfp_mask, order, |
2701 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2650 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2702 | preferred_zone, classzone_idx, migratetype); | ||
2703 | if (page) | 2651 | if (page) |
2704 | goto got_pg; | 2652 | goto got_pg; |
2705 | 2653 | ||
@@ -2710,11 +2658,10 @@ retry: | |||
2710 | * the allocation is high priority and these type of | 2658 | * the allocation is high priority and these type of |
2711 | * allocations are system rather than user orientated | 2659 | * allocations are system rather than user orientated |
2712 | */ | 2660 | */ |
2713 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2661 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
2662 | |||
2663 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
2714 | 2664 | ||
2715 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
2716 | zonelist, high_zoneidx, nodemask, | ||
2717 | preferred_zone, classzone_idx, migratetype); | ||
2718 | if (page) { | 2665 | if (page) { |
2719 | goto got_pg; | 2666 | goto got_pg; |
2720 | } | 2667 | } |
@@ -2743,11 +2690,9 @@ retry: | |||
2743 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2690 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2744 | * attempts after direct reclaim are synchronous | 2691 | * attempts after direct reclaim are synchronous |
2745 | */ | 2692 | */ |
2746 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2693 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
2747 | high_zoneidx, nodemask, alloc_flags, | 2694 | migration_mode, |
2748 | preferred_zone, | 2695 | &contended_compaction, |
2749 | classzone_idx, migratetype, | ||
2750 | migration_mode, &contended_compaction, | ||
2751 | &deferred_compaction); | 2696 | &deferred_compaction); |
2752 | if (page) | 2697 | if (page) |
2753 | goto got_pg; | 2698 | goto got_pg; |
@@ -2793,12 +2738,8 @@ retry: | |||
2793 | migration_mode = MIGRATE_SYNC_LIGHT; | 2738 | migration_mode = MIGRATE_SYNC_LIGHT; |
2794 | 2739 | ||
2795 | /* Try direct reclaim and then allocating */ | 2740 | /* Try direct reclaim and then allocating */ |
2796 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2741 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
2797 | zonelist, high_zoneidx, | 2742 | &did_some_progress); |
2798 | nodemask, | ||
2799 | alloc_flags, preferred_zone, | ||
2800 | classzone_idx, migratetype, | ||
2801 | &did_some_progress); | ||
2802 | if (page) | 2743 | if (page) |
2803 | goto got_pg; | 2744 | goto got_pg; |
2804 | 2745 | ||
@@ -2812,17 +2753,15 @@ retry: | |||
2812 | * start OOM killing tasks. | 2753 | * start OOM killing tasks. |
2813 | */ | 2754 | */ |
2814 | if (!did_some_progress) { | 2755 | if (!did_some_progress) { |
2815 | page = __alloc_pages_may_oom(gfp_mask, order, zonelist, | 2756 | page = __alloc_pages_may_oom(gfp_mask, order, ac, |
2816 | high_zoneidx, nodemask, | 2757 | &did_some_progress); |
2817 | preferred_zone, classzone_idx, | ||
2818 | migratetype,&did_some_progress); | ||
2819 | if (page) | 2758 | if (page) |
2820 | goto got_pg; | 2759 | goto got_pg; |
2821 | if (!did_some_progress) | 2760 | if (!did_some_progress) |
2822 | goto nopage; | 2761 | goto nopage; |
2823 | } | 2762 | } |
2824 | /* Wait for some write requests to complete then retry */ | 2763 | /* Wait for some write requests to complete then retry */ |
2825 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2764 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
2826 | goto retry; | 2765 | goto retry; |
2827 | } else { | 2766 | } else { |
2828 | /* | 2767 | /* |
@@ -2830,11 +2769,9 @@ retry: | |||
2830 | * direct reclaim and reclaim/compaction depends on compaction | 2769 | * direct reclaim and reclaim/compaction depends on compaction |
2831 | * being called after reclaim so call directly if necessary | 2770 | * being called after reclaim so call directly if necessary |
2832 | */ | 2771 | */ |
2833 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2772 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2834 | high_zoneidx, nodemask, alloc_flags, | 2773 | alloc_flags, ac, migration_mode, |
2835 | preferred_zone, | 2774 | &contended_compaction, |
2836 | classzone_idx, migratetype, | ||
2837 | migration_mode, &contended_compaction, | ||
2838 | &deferred_compaction); | 2775 | &deferred_compaction); |
2839 | if (page) | 2776 | if (page) |
2840 | goto got_pg; | 2777 | goto got_pg; |
@@ -2842,11 +2779,7 @@ retry: | |||
2842 | 2779 | ||
2843 | nopage: | 2780 | nopage: |
2844 | warn_alloc_failed(gfp_mask, order, NULL); | 2781 | warn_alloc_failed(gfp_mask, order, NULL); |
2845 | return page; | ||
2846 | got_pg: | 2782 | got_pg: |
2847 | if (kmemcheck_enabled) | ||
2848 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2849 | |||
2850 | return page; | 2783 | return page; |
2851 | } | 2784 | } |
2852 | 2785 | ||
@@ -2857,14 +2790,16 @@ struct page * | |||
2857 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2790 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2858 | struct zonelist *zonelist, nodemask_t *nodemask) | 2791 | struct zonelist *zonelist, nodemask_t *nodemask) |
2859 | { | 2792 | { |
2860 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
2861 | struct zone *preferred_zone; | ||
2862 | struct zoneref *preferred_zoneref; | 2793 | struct zoneref *preferred_zoneref; |
2863 | struct page *page = NULL; | 2794 | struct page *page = NULL; |
2864 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
2865 | unsigned int cpuset_mems_cookie; | 2795 | unsigned int cpuset_mems_cookie; |
2866 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2796 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2867 | int classzone_idx; | 2797 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
2798 | struct alloc_context ac = { | ||
2799 | .high_zoneidx = gfp_zone(gfp_mask), | ||
2800 | .nodemask = nodemask, | ||
2801 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
2802 | }; | ||
2868 | 2803 | ||
2869 | gfp_mask &= gfp_allowed_mask; | 2804 | gfp_mask &= gfp_allowed_mask; |
2870 | 2805 | ||
@@ -2883,37 +2818,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2883 | if (unlikely(!zonelist->_zonerefs->zone)) | 2818 | if (unlikely(!zonelist->_zonerefs->zone)) |
2884 | return NULL; | 2819 | return NULL; |
2885 | 2820 | ||
2886 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2821 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
2887 | alloc_flags |= ALLOC_CMA; | 2822 | alloc_flags |= ALLOC_CMA; |
2888 | 2823 | ||
2889 | retry_cpuset: | 2824 | retry_cpuset: |
2890 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2825 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2891 | 2826 | ||
2827 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
2828 | ac.zonelist = zonelist; | ||
2892 | /* The preferred zone is used for statistics later */ | 2829 | /* The preferred zone is used for statistics later */ |
2893 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2830 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
2894 | nodemask ? : &cpuset_current_mems_allowed, | 2831 | ac.nodemask ? : &cpuset_current_mems_allowed, |
2895 | &preferred_zone); | 2832 | &ac.preferred_zone); |
2896 | if (!preferred_zone) | 2833 | if (!ac.preferred_zone) |
2897 | goto out; | 2834 | goto out; |
2898 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2835 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2899 | 2836 | ||
2900 | /* First allocation attempt */ | 2837 | /* First allocation attempt */ |
2901 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2838 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
2902 | zonelist, high_zoneidx, alloc_flags, | 2839 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
2903 | preferred_zone, classzone_idx, migratetype); | ||
2904 | if (unlikely(!page)) { | 2840 | if (unlikely(!page)) { |
2905 | /* | 2841 | /* |
2906 | * Runtime PM, block IO and its error handling path | 2842 | * Runtime PM, block IO and its error handling path |
2907 | * can deadlock because I/O on the device might not | 2843 | * can deadlock because I/O on the device might not |
2908 | * complete. | 2844 | * complete. |
2909 | */ | 2845 | */ |
2910 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2846 | alloc_mask = memalloc_noio_flags(gfp_mask); |
2911 | page = __alloc_pages_slowpath(gfp_mask, order, | 2847 | |
2912 | zonelist, high_zoneidx, nodemask, | 2848 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
2913 | preferred_zone, classzone_idx, migratetype); | ||
2914 | } | 2849 | } |
2915 | 2850 | ||
2916 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2851 | if (kmemcheck_enabled && page) |
2852 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2853 | |||
2854 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
2917 | 2855 | ||
2918 | out: | 2856 | out: |
2919 | /* | 2857 | /* |
@@ -5047,8 +4985,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5047 | pgdat->node_start_pfn = node_start_pfn; | 4985 | pgdat->node_start_pfn = node_start_pfn; |
5048 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4986 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
5049 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4987 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
5050 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 4988 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5051 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 4989 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
5052 | #endif | 4990 | #endif |
5053 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 4991 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5054 | zones_size, zholes_size); | 4992 | zones_size, zholes_size); |
@@ -5420,9 +5358,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5420 | arch_zone_highest_possible_pfn[i]) | 5358 | arch_zone_highest_possible_pfn[i]) |
5421 | pr_cont("empty\n"); | 5359 | pr_cont("empty\n"); |
5422 | else | 5360 | else |
5423 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5361 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
5424 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5362 | (u64)arch_zone_lowest_possible_pfn[i] |
5425 | (arch_zone_highest_possible_pfn[i] | 5363 | << PAGE_SHIFT, |
5364 | ((u64)arch_zone_highest_possible_pfn[i] | ||
5426 | << PAGE_SHIFT) - 1); | 5365 | << PAGE_SHIFT) - 1); |
5427 | } | 5366 | } |
5428 | 5367 | ||
@@ -5430,15 +5369,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5430 | pr_info("Movable zone start for each node\n"); | 5369 | pr_info("Movable zone start for each node\n"); |
5431 | for (i = 0; i < MAX_NUMNODES; i++) { | 5370 | for (i = 0; i < MAX_NUMNODES; i++) { |
5432 | if (zone_movable_pfn[i]) | 5371 | if (zone_movable_pfn[i]) |
5433 | pr_info(" Node %d: %#010lx\n", i, | 5372 | pr_info(" Node %d: %#018Lx\n", i, |
5434 | zone_movable_pfn[i] << PAGE_SHIFT); | 5373 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
5435 | } | 5374 | } |
5436 | 5375 | ||
5437 | /* Print out the early node map */ | 5376 | /* Print out the early node map */ |
5438 | pr_info("Early memory node ranges\n"); | 5377 | pr_info("Early memory node ranges\n"); |
5439 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5378 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5440 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5379 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
5441 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5380 | (u64)start_pfn << PAGE_SHIFT, |
5381 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
5442 | 5382 | ||
5443 | /* Initialise every node */ | 5383 | /* Initialise every node */ |
5444 | mminit_verify_pageflags_layout(); | 5384 | mminit_verify_pageflags_layout(); |
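The mm/page_alloc.c hunks above fold the recurring zonelist/nodemask/preferred_zone/classzone_idx/migratetype parameter lists of the slow-path helpers into a single struct alloc_context that __alloc_pages_nodemask() fills once and passes down by pointer. A minimal sketch of such a context bundle, reconstructed only from the fields these hunks reference (the actual declaration lives outside this section, so field order and the comments are assumptions):

	struct alloc_context {
		struct zonelist *zonelist;	/* zones to try, in preference order */
		nodemask_t *nodemask;		/* allowed nodes; NULL means all */
		struct zone *preferred_zone;	/* first eligible zone, used for statistics */
		int classzone_idx;		/* zonelist index of the preferred zone */
		int migratetype;		/* derived from the gfp flags */
		enum zone_type high_zoneidx;	/* highest zone the gfp mask permits */
	};

Passing one pointer instead of half a dozen scalars shortens every call in the slow path and gives helpers such as __alloc_pages_slowpath() a single place to update shared state like ac->zonelist or ac->preferred_zone.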
diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c | |||
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) | |||
166 | /** | 166 | /** |
167 | * page_counter_memparse - memparse() for page counter limits | 167 | * page_counter_memparse - memparse() for page counter limits |
168 | * @buf: string to parse | 168 | * @buf: string to parse |
169 | * @max: string meaning maximum possible value | ||
169 | * @nr_pages: returns the result in number of pages | 170 | * @nr_pages: returns the result in number of pages |
170 | * | 171 | * |
171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | 172 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be |
172 | * limited to %PAGE_COUNTER_MAX. | 173 | * limited to %PAGE_COUNTER_MAX. |
173 | */ | 174 | */ |
174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | 175 | int page_counter_memparse(const char *buf, const char *max, |
176 | unsigned long *nr_pages) | ||
175 | { | 177 | { |
176 | char unlimited[] = "-1"; | ||
177 | char *end; | 178 | char *end; |
178 | u64 bytes; | 179 | u64 bytes; |
179 | 180 | ||
180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | 181 | if (!strcmp(buf, max)) { |
181 | *nr_pages = PAGE_COUNTER_MAX; | 182 | *nr_pages = PAGE_COUNTER_MAX; |
182 | return 0; | 183 | return 0; |
183 | } | 184 | } |
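The page_counter.c hunk above lets each interface choose its own spelling for "no limit" while sharing the parser; the tcp_memcontrol.c hunk further down passes "-1", for example. A minimal sketch of a caller of the new two-argument signature (the write helper itself is hypothetical; only page_counter_memparse() and page_counter_limit() come from this file):

	static int example_limit_write(struct page_counter *counter, const char *buf)
	{
		unsigned long nr_pages;
		int err;

		/* "-1" maps to PAGE_COUNTER_MAX here; other callers may pass a different sentinel */
		err = page_counter_memparse(buf, "-1", &nr_pages);
		if (err)
			return err;

		return page_counter_limit(counter, nr_pages);
	}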
diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) | |||
59 | 59 | ||
60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) |
61 | { | 61 | { |
62 | struct page_ext *page_ext; | 62 | struct page_ext *page_ext = lookup_page_ext(page); |
63 | struct stack_trace *trace; | 63 | struct stack_trace trace = { |
64 | 64 | .nr_entries = 0, | |
65 | page_ext = lookup_page_ext(page); | 65 | .max_entries = ARRAY_SIZE(page_ext->trace_entries), |
66 | .entries = &page_ext->trace_entries[0], | ||
67 | .skip = 3, | ||
68 | }; | ||
66 | 69 | ||
67 | trace = &page_ext->trace; | 70 | save_stack_trace(&trace); |
68 | trace->nr_entries = 0; | ||
69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
70 | trace->entries = &page_ext->trace_entries[0]; | ||
71 | trace->skip = 3; | ||
72 | save_stack_trace(&page_ext->trace); | ||
73 | 71 | ||
74 | page_ext->order = order; | 72 | page_ext->order = order; |
75 | page_ext->gfp_mask = gfp_mask; | 73 | page_ext->gfp_mask = gfp_mask; |
74 | page_ext->nr_entries = trace.nr_entries; | ||
76 | 75 | ||
77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 76 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
78 | } | 77 | } |
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
84 | int ret; | 83 | int ret; |
85 | int pageblock_mt, page_mt; | 84 | int pageblock_mt, page_mt; |
86 | char *kbuf; | 85 | char *kbuf; |
86 | struct stack_trace trace = { | ||
87 | .nr_entries = page_ext->nr_entries, | ||
88 | .entries = &page_ext->trace_entries[0], | ||
89 | }; | ||
87 | 90 | ||
88 | kbuf = kmalloc(count, GFP_KERNEL); | 91 | kbuf = kmalloc(count, GFP_KERNEL); |
89 | if (!kbuf) | 92 | if (!kbuf) |
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
121 | if (ret >= count) | 124 | if (ret >= count) |
122 | goto err; | 125 | goto err; |
123 | 126 | ||
124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | 127 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); |
125 | &page_ext->trace, 0); | ||
126 | if (ret >= count) | 128 | if (ret >= count) |
127 | goto err; | 129 | goto err; |
128 | 130 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..75c1f2878519 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
35 | do { | 35 | do { |
36 | again: | 36 | again: |
37 | next = pmd_addr_end(addr, end); | 37 | next = pmd_addr_end(addr, end); |
38 | if (pmd_none(*pmd)) { | 38 | if (pmd_none(*pmd) || !walk->vma) { |
39 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
40 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
41 | if (err) | 41 | if (err) |
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_trans_unstable(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
86 | break; | 86 | break; |
87 | continue; | 87 | continue; |
88 | } | 88 | } |
89 | if (walk->pud_entry) | 89 | if (walk->pmd_entry || walk->pte_entry) |
90 | err = walk->pud_entry(pud, addr, next, walk); | ||
91 | if (!err && (walk->pmd_entry || walk->pte_entry)) | ||
92 | err = walk_pmd_range(pud, addr, next, walk); | 90 | err = walk_pmd_range(pud, addr, next, walk); |
93 | if (err) | 91 | if (err) |
94 | break; | 92 | break; |
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
97 | return err; | 95 | return err; |
98 | } | 96 | } |
99 | 97 | ||
98 | static int walk_pgd_range(unsigned long addr, unsigned long end, | ||
99 | struct mm_walk *walk) | ||
100 | { | ||
101 | pgd_t *pgd; | ||
102 | unsigned long next; | ||
103 | int err = 0; | ||
104 | |||
105 | pgd = pgd_offset(walk->mm, addr); | ||
106 | do { | ||
107 | next = pgd_addr_end(addr, end); | ||
108 | if (pgd_none_or_clear_bad(pgd)) { | ||
109 | if (walk->pte_hole) | ||
110 | err = walk->pte_hole(addr, next, walk); | ||
111 | if (err) | ||
112 | break; | ||
113 | continue; | ||
114 | } | ||
115 | if (walk->pmd_entry || walk->pte_entry) | ||
116 | err = walk_pud_range(pgd, addr, next, walk); | ||
117 | if (err) | ||
118 | break; | ||
119 | } while (pgd++, addr = next, addr != end); | ||
120 | |||
121 | return err; | ||
122 | } | ||
123 | |||
100 | #ifdef CONFIG_HUGETLB_PAGE | 124 | #ifdef CONFIG_HUGETLB_PAGE |
101 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | 125 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, |
102 | unsigned long end) | 126 | unsigned long end) |
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | |||
105 | return boundary < end ? boundary : end; | 129 | return boundary < end ? boundary : end; |
106 | } | 130 | } |
107 | 131 | ||
108 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 132 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
109 | unsigned long addr, unsigned long end, | ||
110 | struct mm_walk *walk) | 133 | struct mm_walk *walk) |
111 | { | 134 | { |
135 | struct vm_area_struct *vma = walk->vma; | ||
112 | struct hstate *h = hstate_vma(vma); | 136 | struct hstate *h = hstate_vma(vma); |
113 | unsigned long next; | 137 | unsigned long next; |
114 | unsigned long hmask = huge_page_mask(h); | 138 | unsigned long hmask = huge_page_mask(h); |
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
121 | if (pte && walk->hugetlb_entry) | 145 | if (pte && walk->hugetlb_entry) |
122 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | 146 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); |
123 | if (err) | 147 | if (err) |
124 | return err; | 148 | break; |
125 | } while (addr = next, addr != end); | 149 | } while (addr = next, addr != end); |
126 | 150 | ||
127 | return 0; | 151 | return err; |
128 | } | 152 | } |
129 | 153 | ||
130 | #else /* CONFIG_HUGETLB_PAGE */ | 154 | #else /* CONFIG_HUGETLB_PAGE */ |
131 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 155 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
132 | unsigned long addr, unsigned long end, | ||
133 | struct mm_walk *walk) | 156 | struct mm_walk *walk) |
134 | { | 157 | { |
135 | return 0; | 158 | return 0; |
@@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
137 | 160 | ||
138 | #endif /* CONFIG_HUGETLB_PAGE */ | 161 | #endif /* CONFIG_HUGETLB_PAGE */ |
139 | 162 | ||
163 | /* | ||
164 | * Decide whether we really walk over the current vma on [@start, @end) | ||
165 | * or skip it via the returned value. Return 0 if we do walk over the | ||
166 | * current vma, and return 1 if we skip the vma. A negative value means | ||
167 | * an error, in which case we abort the current walk. | ||
168 | */ | ||
169 | static int walk_page_test(unsigned long start, unsigned long end, | ||
170 | struct mm_walk *walk) | ||
171 | { | ||
172 | struct vm_area_struct *vma = walk->vma; | ||
173 | |||
174 | if (walk->test_walk) | ||
175 | return walk->test_walk(start, end, walk); | ||
176 | |||
177 | /* | ||
178 | * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP | ||
179 | * range, so we don't walk over it as we do for normal vmas. However, | ||
180 | * some callers are interested in handling hole ranges and they don't | ||
181 | * want to just ignore any single address range. Such users certainly | ||
182 | * define their ->pte_hole() callbacks, so let's delegate them to handle | ||
183 | * vma(VM_PFNMAP). | ||
184 | */ | ||
185 | if (vma->vm_flags & VM_PFNMAP) { | ||
186 | int err = 1; | ||
187 | if (walk->pte_hole) | ||
188 | err = walk->pte_hole(start, end, walk); | ||
189 | return err ? err : 1; | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | static int __walk_page_range(unsigned long start, unsigned long end, | ||
195 | struct mm_walk *walk) | ||
196 | { | ||
197 | int err = 0; | ||
198 | struct vm_area_struct *vma = walk->vma; | ||
199 | |||
200 | if (vma && is_vm_hugetlb_page(vma)) { | ||
201 | if (walk->hugetlb_entry) | ||
202 | err = walk_hugetlb_range(start, end, walk); | ||
203 | } else | ||
204 | err = walk_pgd_range(start, end, walk); | ||
140 | 205 | ||
206 | return err; | ||
207 | } | ||
141 | 208 | ||
142 | /** | 209 | /** |
143 | * walk_page_range - walk a memory map's page tables with a callback | 210 | * walk_page_range - walk page table with caller specific callbacks |
144 | * @addr: starting address | ||
145 | * @end: ending address | ||
146 | * @walk: set of callbacks to invoke for each level of the tree | ||
147 | * | 211 | * |
148 | * Recursively walk the page table for the memory area in a VMA, | 212 | * Recursively walk the page table tree of the process represented by @walk->mm |
149 | * calling supplied callbacks. Callbacks are called in-order (first | 213 | * within the virtual address range [@start, @end). During walking, we can do |
150 | * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, | 214 | * some caller-specific work for each entry, by setting up pmd_entry(), |
151 | * etc.). If lower-level callbacks are omitted, walking depth is reduced. | 215 | * pte_entry(), and/or hugetlb_entry(). If you don't provide some of these |
216 | * callbacks, the associated entries/pages are just ignored. | ||
217 | * The return values of these callbacks are commonly defined like below: | ||
218 | * - 0 : succeeded in handling the current entry; if the end address has | ||
219 | * not been reached yet, continue the walk. | ||
220 | * - >0 : succeeded in handling the current entry; return to the caller | ||
221 | * with a caller-specific value. | ||
222 | * - <0 : failed to handle the current entry; return to the caller | ||
223 | * with an error code. | ||
152 | * | 224 | * |
153 | * Each callback receives an entry pointer and the start and end of the | 225 | * Before starting to walk the page table, some callers want to check whether |
154 | * associated range, and a copy of the original mm_walk for access to | 226 | * they really want to walk over the current vma, typically by checking |
155 | * the ->private or ->mm fields. | 227 | * its vm_flags. walk_page_test() and @walk->test_walk() are used for this |
228 | * purpose. | ||
156 | * | 229 | * |
157 | * Usually no locks are taken, but splitting transparent huge page may | 230 | * struct mm_walk keeps current values of some common data like vma and pmd, |
158 | * take page table lock. And the bottom level iterator will map PTE | 231 | * which are useful for access from the callbacks. If you want to pass some |
159 | * directories from highmem if necessary. | 232 | * caller-specific data to callbacks, @walk->private should be helpful. |
160 | * | 233 | * |
161 | * If any callback returns a non-zero value, the walk is aborted and | 234 | * Locking: |
162 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 235 | * Callers of walk_page_range() and walk_page_vma() should hold |
163 | * | 236 | * @walk->mm->mmap_sem, because these functions traverse the vma list and/or |
164 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | 237 | * access vma's data. |
165 | * is !NULL. | ||
166 | */ | 238 | */ |
167 | int walk_page_range(unsigned long addr, unsigned long end, | 239 | int walk_page_range(unsigned long start, unsigned long end, |
168 | struct mm_walk *walk) | 240 | struct mm_walk *walk) |
169 | { | 241 | { |
170 | pgd_t *pgd; | ||
171 | unsigned long next; | ||
172 | int err = 0; | 242 | int err = 0; |
243 | unsigned long next; | ||
244 | struct vm_area_struct *vma; | ||
173 | 245 | ||
174 | if (addr >= end) | 246 | if (start >= end) |
175 | return err; | 247 | return -EINVAL; |
176 | 248 | ||
177 | if (!walk->mm) | 249 | if (!walk->mm) |
178 | return -EINVAL; | 250 | return -EINVAL; |
179 | 251 | ||
180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); | 252 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
181 | 253 | ||
182 | pgd = pgd_offset(walk->mm, addr); | 254 | vma = find_vma(walk->mm, start); |
183 | do { | 255 | do { |
184 | struct vm_area_struct *vma = NULL; | 256 | if (!vma) { /* after the last vma */ |
185 | 257 | walk->vma = NULL; | |
186 | next = pgd_addr_end(addr, end); | 258 | next = end; |
259 | } else if (start < vma->vm_start) { /* outside vma */ | ||
260 | walk->vma = NULL; | ||
261 | next = min(end, vma->vm_start); | ||
262 | } else { /* inside vma */ | ||
263 | walk->vma = vma; | ||
264 | next = min(end, vma->vm_end); | ||
265 | vma = vma->vm_next; | ||
187 | 266 | ||
188 | /* | 267 | err = walk_page_test(start, next, walk); |
189 | * This function was not intended to be vma based. | 268 | if (err > 0) |
190 | * But there are vma special cases to be handled: | ||
191 | * - hugetlb vma's | ||
192 | * - VM_PFNMAP vma's | ||
193 | */ | ||
194 | vma = find_vma(walk->mm, addr); | ||
195 | if (vma) { | ||
196 | /* | ||
197 | * There are no page structures backing a VM_PFNMAP | ||
198 | * range, so do not allow split_huge_page_pmd(). | ||
199 | */ | ||
200 | if ((vma->vm_start <= addr) && | ||
201 | (vma->vm_flags & VM_PFNMAP)) { | ||
202 | if (walk->pte_hole) | ||
203 | err = walk->pte_hole(addr, next, walk); | ||
204 | if (err) | ||
205 | break; | ||
206 | pgd = pgd_offset(walk->mm, next); | ||
207 | continue; | ||
208 | } | ||
209 | /* | ||
210 | * Handle hugetlb vma individually because pagetable | ||
211 | * walk for the hugetlb page is dependent on the | ||
212 | * architecture and we can't handled it in the same | ||
213 | * manner as non-huge pages. | ||
214 | */ | ||
215 | if (walk->hugetlb_entry && (vma->vm_start <= addr) && | ||
216 | is_vm_hugetlb_page(vma)) { | ||
217 | if (vma->vm_end < next) | ||
218 | next = vma->vm_end; | ||
219 | /* | ||
220 | * Hugepage is very tightly coupled with vma, | ||
221 | * so walk through hugetlb entries within a | ||
222 | * given vma. | ||
223 | */ | ||
224 | err = walk_hugetlb_range(vma, addr, next, walk); | ||
225 | if (err) | ||
226 | break; | ||
227 | pgd = pgd_offset(walk->mm, next); | ||
228 | continue; | 269 | continue; |
229 | } | 270 | if (err < 0) |
230 | } | ||
231 | |||
232 | if (pgd_none_or_clear_bad(pgd)) { | ||
233 | if (walk->pte_hole) | ||
234 | err = walk->pte_hole(addr, next, walk); | ||
235 | if (err) | ||
236 | break; | 271 | break; |
237 | pgd++; | ||
238 | continue; | ||
239 | } | 272 | } |
240 | if (walk->pgd_entry) | 273 | if (walk->vma || walk->pte_hole) |
241 | err = walk->pgd_entry(pgd, addr, next, walk); | 274 | err = __walk_page_range(start, next, walk); |
242 | if (!err && | ||
243 | (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) | ||
244 | err = walk_pud_range(pgd, addr, next, walk); | ||
245 | if (err) | 275 | if (err) |
246 | break; | 276 | break; |
247 | pgd++; | 277 | } while (start = next, start < end); |
248 | } while (addr = next, addr < end); | ||
249 | |||
250 | return err; | 278 | return err; |
251 | } | 279 | } |
280 | |||
281 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) | ||
282 | { | ||
283 | int err; | ||
284 | |||
285 | if (!walk->mm) | ||
286 | return -EINVAL; | ||
287 | |||
288 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
289 | VM_BUG_ON(!vma); | ||
290 | walk->vma = vma; | ||
291 | err = walk_page_test(vma->vm_start, vma->vm_end, walk); | ||
292 | if (err > 0) | ||
293 | return 0; | ||
294 | if (err < 0) | ||
295 | return err; | ||
296 | return __walk_page_range(vma->vm_start, vma->vm_end, walk); | ||
297 | } | ||
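As the reworked walk_page_range() kernel-doc above explains, users now fill in struct mm_walk with only the callbacks they need and let the core handle vma lookup plus the hugetlb and VM_PFNMAP special cases. A minimal sketch of a walker that counts present PTEs in one vma, assuming the pte_entry() callback signature of this kernel generation (the function and counter names are invented for illustration):

	#include <linux/mm.h>

	static int count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *nr_present = walk->private;

		if (pte_present(*pte))
			(*nr_present)++;
		return 0;	/* 0 means: keep walking */
	}

	static unsigned long count_present_ptes(struct vm_area_struct *vma)
	{
		unsigned long nr_present = 0;
		struct mm_walk walk = {
			.pte_entry	= count_pte,
			.mm		= vma->vm_mm,
			.private	= &nr_present,
		};

		/* Per the Locking note above, the caller must hold mmap_sem. */
		walk_page_vma(vma, &walk);
		return nr_present;
	}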
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5077afcd9e11..b1597690530c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
99 | size_t bytes; | 99 | size_t bytes; |
100 | 100 | ||
101 | /* Get the pages we're interested in */ | 101 | /* Get the pages we're interested in */ |
102 | down_read(&mm->mmap_sem); | 102 | pages = get_user_pages_unlocked(task, mm, pa, pages, |
103 | pages = get_user_pages(task, mm, pa, pages, | 103 | vm_write, 0, process_pages); |
104 | vm_write, 0, process_pages, NULL); | ||
105 | up_read(&mm->mmap_sem); | ||
106 | |||
107 | if (pages <= 0) | 104 | if (pages <= 0) |
108 | return -EFAULT; | 105 | return -EFAULT; |
109 | 106 | ||
@@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, | |||
1085 | void page_add_file_rmap(struct page *page) | 1085 | void page_add_file_rmap(struct page *page) |
1086 | { | 1086 | { |
1087 | struct mem_cgroup *memcg; | 1087 | struct mem_cgroup *memcg; |
1088 | unsigned long flags; | ||
1089 | bool locked; | ||
1090 | 1088 | ||
1091 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1089 | memcg = mem_cgroup_begin_page_stat(page); |
1092 | if (atomic_inc_and_test(&page->_mapcount)) { | 1090 | if (atomic_inc_and_test(&page->_mapcount)) { |
1093 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1091 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1094 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1092 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
1095 | } | 1093 | } |
1096 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1094 | mem_cgroup_end_page_stat(memcg); |
1097 | } | 1095 | } |
1098 | 1096 | ||
1099 | static void page_remove_file_rmap(struct page *page) | 1097 | static void page_remove_file_rmap(struct page *page) |
1100 | { | 1098 | { |
1101 | struct mem_cgroup *memcg; | 1099 | struct mem_cgroup *memcg; |
1102 | unsigned long flags; | ||
1103 | bool locked; | ||
1104 | 1100 | ||
1105 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1101 | memcg = mem_cgroup_begin_page_stat(page); |
1106 | 1102 | ||
1107 | /* page still mapped by someone else? */ | 1103 | /* page still mapped by someone else? */ |
1108 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1104 | if (!atomic_add_negative(-1, &page->_mapcount)) |
@@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) | |||
1123 | if (unlikely(PageMlocked(page))) | 1119 | if (unlikely(PageMlocked(page))) |
1124 | clear_page_mlock(page); | 1120 | clear_page_mlock(page); |
1125 | out: | 1121 | out: |
1126 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1122 | mem_cgroup_end_page_stat(memcg); |
1127 | } | 1123 | } |
1128 | 1124 | ||
1129 | /** | 1125 | /** |
diff --git a/mm/shmem.c b/mm/shmem.c index b3e403181981..864c878401e6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1131,7 +1131,7 @@ repeat: | |||
1131 | * truncated or holepunched since swap was confirmed. | 1131 | * truncated or holepunched since swap was confirmed. |
1132 | * shmem_undo_range() will have done some of the | 1132 | * shmem_undo_range() will have done some of the |
1133 | * unaccounting, now delete_from_swap_cache() will do | 1133 | * unaccounting, now delete_from_swap_cache() will do |
1134 | * the rest (including mem_cgroup_uncharge_swapcache). | 1134 | * the rest. |
1135 | * Reset swap.val? No, leave it so "failed" goes back to | 1135 | * Reset swap.val? No, leave it so "failed" goes back to |
1136 | * "repeat": reading a hole and writing should succeed. | 1136 | * "repeat": reading a hole and writing should succeed. |
1137 | */ | 1137 | */ |
@@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start, | |||
240 | int nr_pages, int write, struct page **pages) | 240 | int nr_pages, int write, struct page **pages) |
241 | { | 241 | { |
242 | struct mm_struct *mm = current->mm; | 242 | struct mm_struct *mm = current->mm; |
243 | int ret; | 243 | return get_user_pages_unlocked(current, mm, start, nr_pages, |
244 | 244 | write, 0, pages); | |
245 | down_read(&mm->mmap_sem); | ||
246 | ret = get_user_pages(current, mm, start, nr_pages, | ||
247 | write, 0, pages, NULL); | ||
248 | up_read(&mm->mmap_sem); | ||
249 | |||
250 | return ret; | ||
251 | } | 245 | } |
252 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 246 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
253 | 247 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index dcd90c891d8e..8e645ee52045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -91,6 +91,9 @@ struct scan_control { | |||
91 | /* Can pages be swapped as part of reclaim? */ | 91 | /* Can pages be swapped as part of reclaim? */ |
92 | unsigned int may_swap:1; | 92 | unsigned int may_swap:1; |
93 | 93 | ||
94 | /* Can cgroups be reclaimed below their normal consumption range? */ | ||
95 | unsigned int may_thrash:1; | ||
96 | |||
94 | unsigned int hibernation_mode:1; | 97 | unsigned int hibernation_mode:1; |
95 | 98 | ||
96 | /* One of the zones is ready for compaction */ | 99 | /* One of the zones is ready for compaction */ |
@@ -1903,8 +1906,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, | |||
1903 | * latencies, so it's better to scan a minimum amount there as | 1906 | * latencies, so it's better to scan a minimum amount there as |
1904 | * well. | 1907 | * well. |
1905 | */ | 1908 | */ |
1906 | if (current_is_kswapd() && !zone_reclaimable(zone)) | 1909 | if (current_is_kswapd()) { |
1907 | force_scan = true; | 1910 | if (!zone_reclaimable(zone)) |
1911 | force_scan = true; | ||
1912 | if (!mem_cgroup_lruvec_online(lruvec)) | ||
1913 | force_scan = true; | ||
1914 | } | ||
1908 | if (!global_reclaim(sc)) | 1915 | if (!global_reclaim(sc)) |
1909 | force_scan = true; | 1916 | force_scan = true; |
1910 | 1917 | ||
@@ -2290,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2290 | struct lruvec *lruvec; | 2297 | struct lruvec *lruvec; |
2291 | int swappiness; | 2298 | int swappiness; |
2292 | 2299 | ||
2300 | if (mem_cgroup_low(root, memcg)) { | ||
2301 | if (!sc->may_thrash) | ||
2302 | continue; | ||
2303 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | ||
2304 | } | ||
2305 | |||
2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2306 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2294 | swappiness = mem_cgroup_swappiness(memcg); | 2307 | swappiness = mem_cgroup_swappiness(memcg); |
2295 | 2308 | ||
@@ -2311,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2311 | mem_cgroup_iter_break(root, memcg); | 2324 | mem_cgroup_iter_break(root, memcg); |
2312 | break; | 2325 | break; |
2313 | } | 2326 | } |
2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2327 | } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); |
2315 | } while (memcg); | ||
2316 | 2328 | ||
2317 | /* | 2329 | /* |
2318 | * Shrink the slab caches in the same proportion that | 2330 | * Shrink the slab caches in the same proportion that |
@@ -2515,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2515 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2527 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
2516 | struct scan_control *sc) | 2528 | struct scan_control *sc) |
2517 | { | 2529 | { |
2530 | int initial_priority = sc->priority; | ||
2518 | unsigned long total_scanned = 0; | 2531 | unsigned long total_scanned = 0; |
2519 | unsigned long writeback_threshold; | 2532 | unsigned long writeback_threshold; |
2520 | bool zones_reclaimable; | 2533 | bool zones_reclaimable; |
2521 | 2534 | retry: | |
2522 | delayacct_freepages_start(); | 2535 | delayacct_freepages_start(); |
2523 | 2536 | ||
2524 | if (global_reclaim(sc)) | 2537 | if (global_reclaim(sc)) |
@@ -2568,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2568 | if (sc->compaction_ready) | 2581 | if (sc->compaction_ready) |
2569 | return 1; | 2582 | return 1; |
2570 | 2583 | ||
2584 | /* Untapped cgroup reserves? Don't OOM, retry. */ | ||
2585 | if (!sc->may_thrash) { | ||
2586 | sc->priority = initial_priority; | ||
2587 | sc->may_thrash = 1; | ||
2588 | goto retry; | ||
2589 | } | ||
2590 | |||
2571 | /* Any of the zones still reclaimable? Don't OOM. */ | 2591 | /* Any of the zones still reclaimable? Don't OOM. */ |
2572 | if (zones_reclaimable) | 2592 | if (zones_reclaimable) |
2573 | return 1; | 2593 | return 1; |
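The retry block above ties memory.low into the reclaim path: a first full cycle skips every cgroup that is still within its protected range, and only if that cycle makes no progress does the scan restart at the initial priority with sc->may_thrash set, so protected groups become eligible before the OOM path is taken. A self-contained model of just that control flow (the struct and reclaim_cycle() are illustrative stand-ins, not kernel code):

	#include <stdbool.h>

	struct scan_control_model {
		int priority;
		unsigned int may_thrash:1;
	};

	/* Stand-in for one full priority cycle; true means some progress was made. */
	static bool reclaim_cycle(struct scan_control_model *sc)
	{
		(void)sc;
		return false;	/* pretend only protected groups had reclaimable pages */
	}

	static bool do_try_to_free_pages_model(struct scan_control_model *sc)
	{
		int initial_priority = sc->priority;
	retry:
		if (reclaim_cycle(sc))
			return true;

		/* Untapped cgroup reserves? Don't declare OOM, retry with may_thrash set. */
		if (!sc->may_thrash) {
			sc->priority = initial_priority;
			sc->may_thrash = 1;
			goto retry;
		}
		return false;	/* nothing reclaimable anywhere */
	}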
@@ -3175,7 +3195,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3175 | */ | 3195 | */ |
3176 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3196 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
3177 | pfmemalloc_watermark_ok(pgdat)) | 3197 | pfmemalloc_watermark_ok(pgdat)) |
3178 | wake_up(&pgdat->pfmemalloc_wait); | 3198 | wake_up_all(&pgdat->pfmemalloc_wait); |
3179 | 3199 | ||
3180 | /* | 3200 | /* |
3181 | * Fragmentation may mean that the system cannot be rebalanced | 3201 | * Fragmentation may mean that the system cannot be rebalanced |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9943e5fd74e6..4f5cd974e11a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1437,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w) | |||
1437 | if (need_update(cpu) && | 1437 | if (need_update(cpu) && |
1438 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | 1438 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) |
1439 | 1439 | ||
1440 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | 1440 | schedule_delayed_work_on(cpu, |
1441 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | 1441 | &per_cpu(vmstat_work, cpu), 0); |
1442 | 1442 | ||
1443 | put_online_cpus(); | 1443 | put_online_cpus(); |
1444 | 1444 | ||
@@ -1452,7 +1452,7 @@ static void __init start_shepherd_timer(void) | |||
1452 | int cpu; | 1452 | int cpu; |
1453 | 1453 | ||
1454 | for_each_possible_cpu(cpu) | 1454 | for_each_possible_cpu(cpu) |
1455 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | 1455 | INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), |
1456 | vmstat_update); | 1456 | vmstat_update); |
1457 | 1457 | ||
1458 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | 1458 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) |
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 555013034f7a..096d91447e06 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c | |||
@@ -23,17 +23,15 @@ struct page **ceph_get_direct_page_vector(const void __user *data, | |||
23 | if (!pages) | 23 | if (!pages) |
24 | return ERR_PTR(-ENOMEM); | 24 | return ERR_PTR(-ENOMEM); |
25 | 25 | ||
26 | down_read(¤t->mm->mmap_sem); | ||
27 | while (got < num_pages) { | 26 | while (got < num_pages) { |
28 | rc = get_user_pages(current, current->mm, | 27 | rc = get_user_pages_unlocked(current, current->mm, |
29 | (unsigned long)data + ((unsigned long)got * PAGE_SIZE), | 28 | (unsigned long)data + ((unsigned long)got * PAGE_SIZE), |
30 | num_pages - got, write_page, 0, pages + got, NULL); | 29 | num_pages - got, write_page, 0, pages + got); |
31 | if (rc < 0) | 30 | if (rc < 0) |
32 | break; | 31 | break; |
33 | BUG_ON(rc == 0); | 32 | BUG_ON(rc == 0); |
34 | got += rc; | 33 | got += rc; |
35 | } | 34 | } |
36 | up_read(¤t->mm->mmap_sem); | ||
37 | if (rc < 0) | 35 | if (rc < 0) |
38 | goto fail; | 36 | goto fail; |
39 | return pages; | 37 | return pages; |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 272327134a1b..c2a75c6957a1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, | |||
120 | switch (of_cft(of)->private) { | 120 | switch (of_cft(of)->private) { |
121 | case RES_LIMIT: | 121 | case RES_LIMIT: |
122 | /* see memcontrol.c */ | 122 | /* see memcontrol.c */ |
123 | ret = page_counter_memparse(buf, &nr_pages); | 123 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
124 | if (ret) | 124 | if (ret) |
125 | break; | 125 | break; |
126 | mutex_lock(&tcp_limit_mutex); | 126 | mutex_lock(&tcp_limit_mutex); |
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 264fbc297e0b..8bdf16b8ba60 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c | |||
@@ -133,6 +133,7 @@ static const char * const page_flag_names[] = { | |||
133 | [KPF_KSM] = "x:ksm", | 133 | [KPF_KSM] = "x:ksm", |
134 | [KPF_THP] = "t:thp", | 134 | [KPF_THP] = "t:thp", |
135 | [KPF_BALLOON] = "o:balloon", | 135 | [KPF_BALLOON] = "o:balloon", |
136 | [KPF_ZERO_PAGE] = "z:zero_page", | ||
136 | 137 | ||
137 | [KPF_RESERVED] = "r:reserved", | 138 | [KPF_RESERVED] = "r:reserved", |
138 | [KPF_MLOCKED] = "m:mlocked", | 139 | [KPF_MLOCKED] = "m:mlocked", |
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 5ff7f7f2689a..44660aee335f 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
@@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) | |||
80 | 80 | ||
81 | might_sleep(); | 81 | might_sleep(); |
82 | 82 | ||
83 | kvm_get_user_page_io(NULL, mm, addr, 1, NULL); | 83 | get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); |
84 | kvm_async_page_present_sync(vcpu, apf); | 84 | kvm_async_page_present_sync(vcpu, apf); |
85 | 85 | ||
86 | spin_lock(&vcpu->async_pf.lock); | 86 | spin_lock(&vcpu->async_pf.lock); |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..458b9b14b15c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -1128,43 +1128,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, | |||
1128 | return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); | 1128 | return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); |
1129 | } | 1129 | } |
1130 | 1130 | ||
1131 | int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, | ||
1132 | unsigned long addr, bool write_fault, | ||
1133 | struct page **pagep) | ||
1134 | { | ||
1135 | int npages; | ||
1136 | int locked = 1; | ||
1137 | int flags = FOLL_TOUCH | FOLL_HWPOISON | | ||
1138 | (pagep ? FOLL_GET : 0) | | ||
1139 | (write_fault ? FOLL_WRITE : 0); | ||
1140 | |||
1141 | /* | ||
1142 | * If retrying the fault, we get here *not* having allowed the filemap | ||
1143 | * to wait on the page lock. We should now allow waiting on the IO with | ||
1144 | * the mmap semaphore released. | ||
1145 | */ | ||
1146 | down_read(&mm->mmap_sem); | ||
1147 | npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, | ||
1148 | &locked); | ||
1149 | if (!locked) { | ||
1150 | VM_BUG_ON(npages); | ||
1151 | |||
1152 | if (!pagep) | ||
1153 | return 0; | ||
1154 | |||
1155 | /* | ||
1156 | * The previous call has now waited on the IO. Now we can | ||
1157 | * retry and complete. Pass TRIED to ensure we do not re | ||
1158 | * schedule async IO (see e.g. filemap_fault). | ||
1159 | */ | ||
1160 | down_read(&mm->mmap_sem); | ||
1161 | npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, | ||
1162 | pagep, NULL, NULL); | ||
1163 | } | ||
1164 | up_read(&mm->mmap_sem); | ||
1165 | return npages; | ||
1166 | } | ||
1167 | |||
1168 | static inline int check_user_page_hwpoison(unsigned long addr) | 1131 | static inline int check_user_page_hwpoison(unsigned long addr) |
1169 | { | 1132 | { |
1170 | int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; | 1133 | int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; |
@@ -1227,15 +1190,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | |||
1227 | npages = get_user_page_nowait(current, current->mm, | 1190 | npages = get_user_page_nowait(current, current->mm, |
1228 | addr, write_fault, page); | 1191 | addr, write_fault, page); |
1229 | up_read(¤t->mm->mmap_sem); | 1192 | up_read(¤t->mm->mmap_sem); |
1230 | } else { | 1193 | } else |
1231 | /* | 1194 | npages = __get_user_pages_unlocked(current, current->mm, addr, 1, |
1232 | * By now we have tried gup_fast, and possibly async_pf, and we | 1195 | write_fault, 0, page, |
1233 | * are certainly not atomic. Time to retry the gup, allowing | 1196 | FOLL_TOUCH|FOLL_HWPOISON); |
1234 | * mmap semaphore to be relinquished in the case of IO. | ||
1235 | */ | ||
1236 | npages = kvm_get_user_page_io(current, current->mm, addr, | ||
1237 | write_fault, page); | ||
1238 | } | ||
1239 | if (npages != 1) | 1197 | if (npages != 1) |
1240 | return npages; | 1198 | return npages; |
1241 | 1199 | ||