aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 21:23:28 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 21:23:28 -0500
commit59d53737a8640482995fea13c6e2c0fd016115d6 (patch)
tree3423eb92315865d76cb8d488513bfef6ab9251d0
parentd3f180ea1a44aecba1b0dab2a253428e77f906bf (diff)
parent8138a67a5557ffea3a21dfd6f037842d4e748513 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second set of updates from Andrew Morton: "More of MM" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (83 commits) mm/nommu.c: fix arithmetic overflow in __vm_enough_memory() mm/mmap.c: fix arithmetic overflow in __vm_enough_memory() vmstat: Reduce time interval to stat update on idle cpu mm/page_owner.c: remove unnecessary stack_trace field Documentation/filesystems/proc.txt: describe /proc/<pid>/map_files mm: incorporate read-only pages into transparent huge pages vmstat: do not use deferrable delayed work for vmstat_update mm: more aggressive page stealing for UNMOVABLE allocations mm: always steal split buddies in fallback allocations mm: when stealing freepages, also take pages created by splitting buddy page mincore: apply page table walker on do_mincore() mm: /proc/pid/clear_refs: avoid split_huge_page() mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP) mempolicy: apply page table walker on queue_pages_range() arch/powerpc/mm/subpage-prot.c: use walk->vma and walk_page_vma() memcg: cleanup preparation for page table walk numa_maps: remove numa_maps->vma numa_maps: fix typo in gather_hugetbl_stats pagemap: use walk->vma instead of calling find_vma() clear_refs: remove clear_refs_private->vma and introduce clear_refs_test_walk() ...
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt79
-rw-r--r--Documentation/filesystems/proc.txt23
-rw-r--r--Documentation/sysctl/vm.txt12
-rw-r--r--Documentation/vm/pagemap.txt8
-rw-r--r--arch/alpha/include/asm/pgtable.h2
-rw-r--r--arch/arc/include/asm/pgtable.h2
-rw-r--r--arch/arm/include/asm/pgtable-2level.h2
-rw-r--r--arch/arm/include/asm/pgtable-nommu.h2
-rw-r--r--arch/arm/mm/hugetlbpage.c6
-rw-r--r--arch/arm/mm/pgd.c4
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/mm/hugetlbpage.c6
-rw-r--r--arch/avr32/include/asm/pgtable.h2
-rw-r--r--arch/cris/include/asm/pgtable.h2
-rw-r--r--arch/frv/include/asm/pgtable.h2
-rw-r--r--arch/hexagon/include/asm/pgtable.h2
-rw-r--r--arch/ia64/include/asm/pgtable.h2
-rw-r--r--arch/ia64/mm/hugetlbpage.c6
-rw-r--r--arch/m32r/include/asm/pgtable.h2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h2
-rw-r--r--arch/metag/mm/hugetlbpage.c6
-rw-r--r--arch/microblaze/include/asm/pgtable.h4
-rw-r--r--arch/mips/include/asm/pgtable-32.h2
-rw-r--r--arch/mips/mm/gup.c8
-rw-r--r--arch/mips/mm/hugetlbpage.c18
-rw-r--r--arch/mn10300/include/asm/pgtable.h2
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/include/asm/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc32.h2
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h2
-rw-r--r--arch/powerpc/mm/hugetlbpage.c8
-rw-r--r--arch/powerpc/mm/subpage-prot.c6
-rw-r--r--arch/s390/include/asm/pgtable.h2
-rw-r--r--arch/s390/mm/gup.c6
-rw-r--r--arch/s390/mm/hugetlbpage.c20
-rw-r--r--arch/score/include/asm/pgtable.h2
-rw-r--r--arch/sh/include/asm/pgtable.h2
-rw-r--r--arch/sh/mm/gup.c6
-rw-r--r--arch/sh/mm/hugetlbpage.c12
-rw-r--r--arch/sparc/include/asm/pgtable_32.h5
-rw-r--r--arch/sparc/include/asm/pgtable_64.h2
-rw-r--r--arch/sparc/mm/gup.c6
-rw-r--r--arch/sparc/mm/hugetlbpage.c12
-rw-r--r--arch/tile/include/asm/pgtable.h2
-rw-r--r--arch/tile/mm/hugetlbpage.c28
-rw-r--r--arch/um/include/asm/pgtable-2level.h2
-rw-r--r--arch/um/include/asm/pgtable-3level.h2
-rw-r--r--arch/unicore32/mm/pgd.c3
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/mm/gup.c9
-rw-r--r--arch/x86/mm/hugetlbpage.c20
-rw-r--r--arch/x86/mm/pgtable.c14
-rw-r--r--arch/xtensa/include/asm/pgtable.h2
-rw-r--r--drivers/media/pci/ivtv/ivtv-udma.c6
-rw-r--r--drivers/scsi/st.c7
-rw-r--r--drivers/staging/android/lowmemorykiller.c7
-rw-r--r--drivers/tty/sysrq.c23
-rw-r--r--drivers/video/fbdev/pvr2fb.c6
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/proc/page.c16
-rw-r--r--fs/proc/task_mmu.c218
-rw-r--r--include/asm-generic/4level-fixup.h1
-rw-r--r--include/linux/compaction.h86
-rw-r--r--include/linux/gfp.h12
-rw-r--r--include/linux/huge_mm.h12
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/kvm_host.h11
-rw-r--r--include/linux/memcontrol.h50
-rw-r--r--include/linux/mm.h69
-rw-r--r--include/linux/mm_types.h11
-rw-r--r--include/linux/mmzone.h15
-rw-r--r--include/linux/oom.h18
-rw-r--r--include/linux/page_counter.h3
-rw-r--r--include/linux/page_ext.h2
-rw-r--r--include/linux/swap.h15
-rw-r--r--include/linux/swapops.h4
-rw-r--r--include/trace/events/compaction.h209
-rw-r--r--include/trace/events/kmem.h7
-rw-r--r--include/uapi/linux/kernel-page-flags.h1
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/power/process.c75
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c156
-rw-r--r--mm/debug.c3
-rw-r--r--mm/gup.c228
-rw-r--r--mm/huge_memory.c106
-rw-r--r--mm/hugetlb.c158
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h22
-rw-r--r--mm/memcontrol.c702
-rw-r--r--mm/memory.c15
-rw-r--r--mm/mempolicy.c277
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/mincore.c166
-rw-r--r--mm/mmap.c7
-rw-r--r--mm/mmzone.c4
-rw-r--r--mm/nommu.c37
-rw-r--r--mm/oom_kill.c169
-rw-r--r--mm/page-writeback.c17
-rw-r--r--mm/page_alloc.c432
-rw-r--r--mm/page_counter.c7
-rw-r--r--mm/page_owner.c26
-rw-r--r--mm/pagewalk.c238
-rw-r--r--mm/process_vm_access.c7
-rw-r--r--mm/rmap.c12
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmscan.c32
-rw-r--r--mm/vmstat.c6
-rw-r--r--net/ceph/pagevec.c6
-rw-r--r--net/ipv4/tcp_memcontrol.c2
-rw-r--r--tools/vm/page-types.c1
-rw-r--r--virt/kvm/async_pf.c2
-rw-r--r--virt/kvm/kvm_main.c50
116 files changed, 2491 insertions, 1717 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 4f4563277864..71daa35ec2d9 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -327,6 +327,85 @@ supported and the interface files "release_agent" and
327- use_hierarchy is on by default and the cgroup file for the flag is 327- use_hierarchy is on by default and the cgroup file for the flag is
328 not created. 328 not created.
329 329
330- The original lower boundary, the soft limit, is defined as a limit
331 that is per default unset. As a result, the set of cgroups that
332 global reclaim prefers is opt-in, rather than opt-out. The costs
333 for optimizing these mostly negative lookups are so high that the
334 implementation, despite its enormous size, does not even provide the
335 basic desirable behavior. First off, the soft limit has no
336 hierarchical meaning. All configured groups are organized in a
337 global rbtree and treated like equal peers, regardless of where they
338 are located in the hierarchy. This makes subtree delegation
339 impossible. Second, the soft limit reclaim pass is so aggressive
340 that it not just introduces high allocation latencies into the
341 system, but also impacts system performance due to overreclaim, to
342 the point where the feature becomes self-defeating.
343
344 The memory.low boundary on the other hand is a top-down allocated
345 reserve. A cgroup enjoys reclaim protection when it and all its
346 ancestors are below their low boundaries, which makes delegation of
347 subtrees possible. Secondly, new cgroups have no reserve per
348 default and in the common case most cgroups are eligible for the
349 preferred reclaim pass. This allows the new low boundary to be
350 efficiently implemented with just a minor addition to the generic
351 reclaim code, without the need for out-of-band data structures and
352 reclaim passes. Because the generic reclaim code considers all
353 cgroups except for the ones running low in the preferred first
354 reclaim pass, overreclaim of individual groups is eliminated as
355 well, resulting in much better overall workload performance.
356
357- The original high boundary, the hard limit, is defined as a strict
358 limit that can not budge, even if the OOM killer has to be called.
359 But this generally goes against the goal of making the most out of
360 the available memory. The memory consumption of workloads varies
361 during runtime, and that requires users to overcommit. But doing
362 that with a strict upper limit requires either a fairly accurate
363 prediction of the working set size or adding slack to the limit.
364 Since working set size estimation is hard and error prone, and
365 getting it wrong results in OOM kills, most users tend to err on the
366 side of a looser limit and end up wasting precious resources.
367
368 The memory.high boundary on the other hand can be set much more
369 conservatively. When hit, it throttles allocations by forcing them
370 into direct reclaim to work off the excess, but it never invokes the
371 OOM killer. As a result, a high boundary that is chosen too
372 aggressively will not terminate the processes, but instead it will
373 lead to gradual performance degradation. The user can monitor this
374 and make corrections until the minimal memory footprint that still
375 gives acceptable performance is found.
376
377 In extreme cases, with many concurrent allocations and a complete
378 breakdown of reclaim progress within the group, the high boundary
379 can be exceeded. But even then it's mostly better to satisfy the
380 allocation from the slack available in other groups or the rest of
381 the system than to kill the group. Otherwise, memory.max is there
382 to limit this type of spillover and ultimately contain buggy or even
383 malicious applications.
384
385- The original control file names are unwieldy and inconsistent in
386 many different ways. For example, the upper boundary hit count is
387 exported in the memory.failcnt file, but an OOM event count has to
388 be manually counted by listening to memory.oom_control events, and
389 lower boundary / soft limit events have to be counted by first
390 setting a threshold for that value and then counting those events.
391 Also, usage and limit files encode their units in the filename.
392 That makes the filenames very long, even though this is not
393 information that a user needs to be reminded of every time they type
394 out those names.
395
396 To address these naming issues, as well as to signal clearly that
397 the new interface carries a new configuration model, the naming
398 conventions in it necessarily differ from the old interface.
399
400- The original limit files indicate the state of an unset limit with a
401 Very High Number, and a configured limit can be unset by echoing -1
402 into those files. But that very high number is implementation and
403 architecture dependent and not very descriptive. And while -1 can
404 be understood as an underflow into the highest possible value, -2 or
405 -10M etc. do not work, so it's not consistent.
406
407 memory.low, memory.high, and memory.max will use the string
408 "infinity" to indicate and set the highest possible value.
330 409
3315. Planned Changes 4105. Planned Changes
332 411
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 79b3cc821e7b..cf8fc2f0b34b 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -42,6 +42,7 @@ Table of Contents
42 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm 42 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
43 3.7 /proc/<pid>/task/<tid>/children - Information about task children 43 3.7 /proc/<pid>/task/<tid>/children - Information about task children
44 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 44 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
45 3.9 /proc/<pid>/map_files - Information about memory mapped files
45 46
46 4 Configuring procfs 47 4 Configuring procfs
47 4.1 Mount options 48 4.1 Mount options
@@ -1763,6 +1764,28 @@ pair provide additional information particular to the objects they represent.
1763 with TIMER_ABSTIME option which will be shown in 'settime flags', but 'it_value' 1764 with TIMER_ABSTIME option which will be shown in 'settime flags', but 'it_value'
1764 still exhibits timer's remaining time. 1765 still exhibits timer's remaining time.
1765 1766
17673.9 /proc/<pid>/map_files - Information about memory mapped files
1768---------------------------------------------------------------------
1769This directory contains symbolic links which represent memory mapped files
1770the process is maintaining. Example output:
1771
1772 | lr-------- 1 root root 64 Jan 27 11:24 333c600000-333c620000 -> /usr/lib64/ld-2.18.so
1773 | lr-------- 1 root root 64 Jan 27 11:24 333c81f000-333c820000 -> /usr/lib64/ld-2.18.so
1774 | lr-------- 1 root root 64 Jan 27 11:24 333c820000-333c821000 -> /usr/lib64/ld-2.18.so
1775 | ...
1776 | lr-------- 1 root root 64 Jan 27 11:24 35d0421000-35d0422000 -> /usr/lib64/libselinux.so.1
1777 | lr-------- 1 root root 64 Jan 27 11:24 400000-41a000 -> /usr/bin/ls
1778
1779The name of a link represents the virtual memory bounds of a mapping, i.e.
1780vm_area_struct::vm_start-vm_area_struct::vm_end.
1781
1782The main purpose of the map_files is to retrieve a set of memory mapped
1783files in a fast way instead of parsing /proc/<pid>/maps or
1784/proc/<pid>/smaps, both of which contain many more records. At the same
1785time one can open(2) mappings from the listings of two processes and
1786compare their inode numbers to figure out which anonymous memory areas
1787are actually shared.
1788
1766------------------------------------------------------------------------------ 1789------------------------------------------------------------------------------
1767Configuring procfs 1790Configuring procfs
1768------------------------------------------------------------------------------ 1791------------------------------------------------------------------------------
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index de3afef76837..902b4574acfb 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -555,12 +555,12 @@ this is causing problems for your system/application.
555 555
556oom_dump_tasks 556oom_dump_tasks
557 557
558Enables a system-wide task dump (excluding kernel threads) to be 558Enables a system-wide task dump (excluding kernel threads) to be produced
559produced when the kernel performs an OOM-killing and includes such 559when the kernel performs an OOM-killing and includes such information as
560information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, 560pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
561oom_score_adj score, and name. This is helpful to determine why the 561score, and name. This is helpful to determine why the OOM killer was
562OOM killer was invoked, to identify the rogue task that caused it, 562invoked, to identify the rogue task that caused it, and to determine why
563and to determine why the OOM killer chose the task it did to kill. 563the OOM killer chose the task it did to kill.
564 564
565If this is set to zero, this information is suppressed. On very 565If this is set to zero, this information is suppressed. On very
566large systems with thousands of tasks it may not be feasible to dump 566large systems with thousands of tasks it may not be feasible to dump
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 5948e455c4d2..6fbd55ef6b45 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -62,6 +62,8 @@ There are three components to pagemap:
62 20. NOPAGE 62 20. NOPAGE
63 21. KSM 63 21. KSM
64 22. THP 64 22. THP
65 23. BALLOON
66 24. ZERO_PAGE
65 67
66Short descriptions to the page flags: 68Short descriptions to the page flags:
67 69
@@ -102,6 +104,12 @@ Short descriptions to the page flags:
10222. THP 10422. THP
103 contiguous pages which construct transparent hugepages 105 contiguous pages which construct transparent hugepages
104 106
10723. BALLOON
108 balloon compaction page
109
11024. ZERO_PAGE
111 zero page for pfn_zero or huge_zero page
112
105 [IO related page flags] 113 [IO related page flags]
106 1. ERROR IO error occurred 114 1. ERROR IO error occurred
107 3. UPTODATE page has up-to-date data 115 3. UPTODATE page has up-to-date data
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index fce22cf88ee9..a9a119592372 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -45,7 +45,7 @@ struct vm_area_struct;
45#define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) 45#define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3))
46#define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) 46#define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3))
47#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) 47#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
48#define FIRST_USER_ADDRESS 0 48#define FIRST_USER_ADDRESS 0UL
49 49
50/* Number of pointers that fit on a page: this will go away. */ 50/* Number of pointers that fit on a page: this will go away. */
51#define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) 51#define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3))
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index bdc8ccaf390d..ffed3b2cf313 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -211,7 +211,7 @@
211 * No special requirements for lowest virtual address we permit any user space 211 * No special requirements for lowest virtual address we permit any user space
212 * mapping to be mapped at. 212 * mapping to be mapped at.
213 */ 213 */
214#define FIRST_USER_ADDRESS 0 214#define FIRST_USER_ADDRESS 0UL
215 215
216 216
217/**************************************************************** 217/****************************************************************
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index bcc5e300413f..bfd662e49a25 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -10,6 +10,8 @@
10#ifndef _ASM_PGTABLE_2LEVEL_H 10#ifndef _ASM_PGTABLE_2LEVEL_H
11#define _ASM_PGTABLE_2LEVEL_H 11#define _ASM_PGTABLE_2LEVEL_H
12 12
13#define __PAGETABLE_PMD_FOLDED
14
13/* 15/*
14 * Hardware-wise, we have a two level page table structure, where the first 16 * Hardware-wise, we have a two level page table structure, where the first
15 * level has 4096 entries, and the second level has 256 entries. Each entry 17 * level has 4096 entries, and the second level has 256 entries. Each entry
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index c35e53ee6663..add094d09e3e 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -85,7 +85,7 @@ extern unsigned int kobjsize(const void *objp);
85#define VMALLOC_START 0UL 85#define VMALLOC_START 0UL
86#define VMALLOC_END 0xffffffffUL 86#define VMALLOC_END 0xffffffffUL
87 87
88#define FIRST_USER_ADDRESS (0) 88#define FIRST_USER_ADDRESS 0UL
89 89
90#include <asm-generic/pgtable.h> 90#include <asm-generic/pgtable.h>
91 91
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index 66781bf34077..c72412415093 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -36,12 +36,6 @@
36 * of type casting from pmd_t * to pte_t *. 36 * of type casting from pmd_t * to pte_t *.
37 */ 37 */
38 38
39struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
40 int write)
41{
42 return ERR_PTR(-EINVAL);
43}
44
45int pud_huge(pud_t pud) 39int pud_huge(pud_t pud)
46{ 40{
47 return 0; 41 return 0;
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 249379535be2..a3681f11dd9f 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -97,6 +97,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
97 97
98no_pte: 98no_pte:
99 pmd_free(mm, new_pmd); 99 pmd_free(mm, new_pmd);
100 mm_dec_nr_pmds(mm);
100no_pmd: 101no_pmd:
101 pud_free(mm, new_pud); 102 pud_free(mm, new_pud);
102no_pud: 103no_pud:
@@ -130,9 +131,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base)
130 pte = pmd_pgtable(*pmd); 131 pte = pmd_pgtable(*pmd);
131 pmd_clear(pmd); 132 pmd_clear(pmd);
132 pte_free(mm, pte); 133 pte_free(mm, pte);
134 atomic_long_dec(&mm->nr_ptes);
133no_pmd: 135no_pmd:
134 pud_clear(pud); 136 pud_clear(pud);
135 pmd_free(mm, pmd); 137 pmd_free(mm, pmd);
138 mm_dec_nr_pmds(mm);
136no_pud: 139no_pud:
137 pgd_clear(pgd); 140 pgd_clear(pgd);
138 pud_free(mm, pud); 141 pud_free(mm, pud);
@@ -152,6 +155,7 @@ no_pgd:
152 pmd = pmd_offset(pud, 0); 155 pmd = pmd_offset(pud, 0);
153 pud_clear(pud); 156 pud_clear(pud);
154 pmd_free(mm, pmd); 157 pmd_free(mm, pmd);
158 mm_dec_nr_pmds(mm);
155 pgd_clear(pgd); 159 pgd_clear(pgd);
156 pud_free(mm, pud); 160 pud_free(mm, pud);
157 } 161 }
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index cf1d9c86f20a..16449c535e50 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -45,7 +45,7 @@
45 45
46#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) 46#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K))
47 47
48#define FIRST_USER_ADDRESS 0 48#define FIRST_USER_ADDRESS 0UL
49 49
50#ifndef __ASSEMBLY__ 50#ifndef __ASSEMBLY__
51extern void __pte_error(const char *file, int line, unsigned long val); 51extern void __pte_error(const char *file, int line, unsigned long val);
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 023747bf4dd7..2de9d2e59d96 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -38,12 +38,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
38} 38}
39#endif 39#endif
40 40
41struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
42 int write)
43{
44 return ERR_PTR(-EINVAL);
45}
46
47int pmd_huge(pmd_t pmd) 41int pmd_huge(pmd_t pmd)
48{ 42{
49 return !(pmd_val(pmd) & PMD_TABLE_BIT); 43 return !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h
index ac7a817e2126..35800664076e 100644
--- a/arch/avr32/include/asm/pgtable.h
+++ b/arch/avr32/include/asm/pgtable.h
@@ -30,7 +30,7 @@
30#define PGDIR_MASK (~(PGDIR_SIZE-1)) 30#define PGDIR_MASK (~(PGDIR_SIZE-1))
31 31
32#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) 32#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
33#define FIRST_USER_ADDRESS 0 33#define FIRST_USER_ADDRESS 0UL
34 34
35#ifndef __ASSEMBLY__ 35#ifndef __ASSEMBLY__
36extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; 36extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h
index e824257971c4..ceefc314d64d 100644
--- a/arch/cris/include/asm/pgtable.h
+++ b/arch/cris/include/asm/pgtable.h
@@ -67,7 +67,7 @@ extern void paging_init(void);
67 */ 67 */
68 68
69#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 69#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
70#define FIRST_USER_ADDRESS 0 70#define FIRST_USER_ADDRESS 0UL
71 71
72/* zero page used for uninitialized stuff */ 72/* zero page used for uninitialized stuff */
73#ifndef __ASSEMBLY__ 73#ifndef __ASSEMBLY__
diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h
index c49699d5902d..93bcf2abd1a1 100644
--- a/arch/frv/include/asm/pgtable.h
+++ b/arch/frv/include/asm/pgtable.h
@@ -140,7 +140,7 @@ extern unsigned long empty_zero_page;
140#define PTRS_PER_PTE 4096 140#define PTRS_PER_PTE 4096
141 141
142#define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE) 142#define USER_PGDS_IN_LAST_PML4 (TASK_SIZE / PGDIR_SIZE)
143#define FIRST_USER_ADDRESS 0 143#define FIRST_USER_ADDRESS 0UL
144 144
145#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) 145#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
146#define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) 146#define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS)
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 6e35e71d2aea..49eab8136ec3 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -171,7 +171,7 @@ extern unsigned long _dflt_cache_att;
171extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ 171extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */
172 172
173/* Seems to be zero even in architectures where the zero page is firewalled? */ 173/* Seems to be zero even in architectures where the zero page is firewalled? */
174#define FIRST_USER_ADDRESS 0 174#define FIRST_USER_ADDRESS 0UL
175#define pte_special(pte) 0 175#define pte_special(pte) 0
176#define pte_mkspecial(pte) (pte) 176#define pte_mkspecial(pte) (pte)
177 177
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 2f07bb3dda91..7b6f8801df57 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -127,7 +127,7 @@
127#define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT 127#define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT
128#define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) 128#define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT)
129#define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ 129#define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */
130#define FIRST_USER_ADDRESS 0 130#define FIRST_USER_ADDRESS 0UL
131 131
132/* 132/*
133 * All the normal masks have the "page accessed" bits on, as any time 133 * All the normal masks have the "page accessed" bits on, as any time
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 76069c18ee42..52b7604b5215 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -114,12 +114,6 @@ int pud_huge(pud_t pud)
114 return 0; 114 return 0;
115} 115}
116 116
117struct page *
118follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
119{
120 return NULL;
121}
122
123void hugetlb_free_pgd_range(struct mmu_gather *tlb, 117void hugetlb_free_pgd_range(struct mmu_gather *tlb,
124 unsigned long addr, unsigned long end, 118 unsigned long addr, unsigned long end,
125 unsigned long floor, unsigned long ceiling) 119 unsigned long floor, unsigned long ceiling)
diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h
index 050f7a686e3d..8c1fb902a9ce 100644
--- a/arch/m32r/include/asm/pgtable.h
+++ b/arch/m32r/include/asm/pgtable.h
@@ -53,7 +53,7 @@ extern unsigned long empty_zero_page[1024];
53#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 53#define PGDIR_MASK (~(PGDIR_SIZE - 1))
54 54
55#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) 55#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
56#define FIRST_USER_ADDRESS 0 56#define FIRST_USER_ADDRESS 0UL
57 57
58#ifndef __ASSEMBLY__ 58#ifndef __ASSEMBLY__
59/* Just any arbitrary offset to the start of the vmalloc VM area: the 59/* Just any arbitrary offset to the start of the vmalloc VM area: the
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index 9f5abbda1ea7..28a145bfbb71 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -66,7 +66,7 @@
66#define PTRS_PER_PGD 128 66#define PTRS_PER_PGD 128
67#endif 67#endif
68#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 68#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
69#define FIRST_USER_ADDRESS 0 69#define FIRST_USER_ADDRESS 0UL
70 70
71/* Virtual address region for use by kernel_map() */ 71/* Virtual address region for use by kernel_map() */
72#ifdef CONFIG_SUN3 72#ifdef CONFIG_SUN3
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 3c32075d2945..7ca80ac42ed5 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -94,12 +94,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
94 return 0; 94 return 0;
95} 95}
96 96
97struct page *follow_huge_addr(struct mm_struct *mm,
98 unsigned long address, int write)
99{
100 return ERR_PTR(-EINVAL);
101}
102
103int pmd_huge(pmd_t pmd) 97int pmd_huge(pmd_t pmd)
104{ 98{
105 return pmd_page_shift(pmd) > PAGE_SHIFT; 99 return pmd_page_shift(pmd) > PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 91b9b46fbb5d..e53b8532353c 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -61,6 +61,8 @@ extern int mem_init_done;
61 61
62#include <asm-generic/4level-fixup.h> 62#include <asm-generic/4level-fixup.h>
63 63
64#define __PAGETABLE_PMD_FOLDED
65
64#ifdef __KERNEL__ 66#ifdef __KERNEL__
65#ifndef __ASSEMBLY__ 67#ifndef __ASSEMBLY__
66 68
@@ -70,7 +72,7 @@ extern int mem_init_done;
70#include <asm/mmu.h> 72#include <asm/mmu.h>
71#include <asm/page.h> 73#include <asm/page.h>
72 74
73#define FIRST_USER_ADDRESS 0 75#define FIRST_USER_ADDRESS 0UL
74 76
75extern unsigned long va_to_phys(unsigned long address); 77extern unsigned long va_to_phys(unsigned long address);
76extern pte_t *va_to_pte(unsigned long address); 78extern pte_t *va_to_pte(unsigned long address);
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 16aa9f23e17b..a6be006b6f75 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -57,7 +57,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1,
57#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) 57#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t))
58 58
59#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) 59#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE)
60#define FIRST_USER_ADDRESS 0 60#define FIRST_USER_ADDRESS 0UL
61 61
62#define VMALLOC_START MAP_BASE 62#define VMALLOC_START MAP_BASE
63 63
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 70795a67a276..349995d19c7f 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -301,11 +301,9 @@ slow_irqon:
301 start += nr << PAGE_SHIFT; 301 start += nr << PAGE_SHIFT;
302 pages += nr; 302 pages += nr;
303 303
304 down_read(&mm->mmap_sem); 304 ret = get_user_pages_unlocked(current, mm, start,
305 ret = get_user_pages(current, mm, start, 305 (end - start) >> PAGE_SHIFT,
306 (end - start) >> PAGE_SHIFT, 306 write, 0, pages);
307 write, 0, pages, NULL);
308 up_read(&mm->mmap_sem);
309 307
310 /* Have to be a bit careful with return values */ 308 /* Have to be a bit careful with return values */
311 if (nr > 0) { 309 if (nr > 0) {
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 4ec8ee10d371..06e0f421b41b 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -68,12 +68,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
68 return 0; 68 return 0;
69} 69}
70 70
71struct page *
72follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
73{
74 return ERR_PTR(-EINVAL);
75}
76
77int pmd_huge(pmd_t pmd) 71int pmd_huge(pmd_t pmd)
78{ 72{
79 return (pmd_val(pmd) & _PAGE_HUGE) != 0; 73 return (pmd_val(pmd) & _PAGE_HUGE) != 0;
@@ -83,15 +77,3 @@ int pud_huge(pud_t pud)
83{ 77{
84 return (pud_val(pud) & _PAGE_HUGE) != 0; 78 return (pud_val(pud) & _PAGE_HUGE) != 0;
85} 79}
86
87struct page *
88follow_huge_pmd(struct mm_struct *mm, unsigned long address,
89 pmd_t *pmd, int write)
90{
91 struct page *page;
92
93 page = pte_page(*(pte_t *)pmd);
94 if (page)
95 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
96 return page;
97}
diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h
index 629181ae111e..afab728ab65e 100644
--- a/arch/mn10300/include/asm/pgtable.h
+++ b/arch/mn10300/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern void paging_init(void);
65#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 65#define PGDIR_MASK (~(PGDIR_SIZE - 1))
66 66
67#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) 67#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
68#define FIRST_USER_ADDRESS 0 68#define FIRST_USER_ADDRESS 0UL
69 69
70#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) 70#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
71#define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS) 71#define KERNEL_PGD_PTRS (PTRS_PER_PGD - USER_PGD_PTRS)
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 7b292e3a3138..a213e8c9aad0 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -24,7 +24,7 @@
24#include <asm/pgtable-bits.h> 24#include <asm/pgtable-bits.h>
25#include <asm-generic/pgtable-nopmd.h> 25#include <asm-generic/pgtable-nopmd.h>
26 26
27#define FIRST_USER_ADDRESS 0 27#define FIRST_USER_ADDRESS 0UL
28 28
29#define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE 29#define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
30#define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) 30#define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 18994ccb1185..69c7df0e1420 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -77,7 +77,7 @@ extern void paging_init(void);
77 */ 77 */
78 78
79#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 79#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
80#define FIRST_USER_ADDRESS 0 80#define FIRST_USER_ADDRESS 0UL
81 81
82/* 82/*
83 * Kernels own virtual memory area. 83 * Kernels own virtual memory area.
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 1d49a4a7749b..8c966b2270aa 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -134,7 +134,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long);
134 * pgd entries used up by user/kernel: 134 * pgd entries used up by user/kernel:
135 */ 135 */
136 136
137#define FIRST_USER_ADDRESS 0 137#define FIRST_USER_ADDRESS 0UL
138 138
139/* NB: The tlb miss handlers make certain assumptions about the order */ 139/* NB: The tlb miss handlers make certain assumptions about the order */
140/* of the following bits, so be careful (One example, bits 25-31 */ 140/* of the following bits, so be careful (One example, bits 25-31 */
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 26ce0ab0a9e4..14bdcbd31670 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -45,7 +45,7 @@ extern int icache_44x_need_flush;
45#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) 45#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT))
46 46
47#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) 47#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
48#define FIRST_USER_ADDRESS 0 48#define FIRST_USER_ADDRESS 0UL
49 49
50#define pte_ERROR(e) \ 50#define pte_ERROR(e) \
51 pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ 51 pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index b9dcc936e2d1..d46532ccc386 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -12,7 +12,7 @@
12#endif 12#endif
13#include <asm/barrier.h> 13#include <asm/barrier.h>
14 14
15#define FIRST_USER_ADDRESS 0 15#define FIRST_USER_ADDRESS 0UL
16 16
17/* 17/*
18 * Size of EA range mapped by our pagetables. 18 * Size of EA range mapped by our pagetables.
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 5ff4e07d920a..cf0464f4284f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -714,6 +714,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
714 return NULL; 714 return NULL;
715} 715}
716 716
717struct page *
718follow_huge_pud(struct mm_struct *mm, unsigned long address,
719 pud_t *pud, int write)
720{
721 BUG();
722 return NULL;
723}
724
717static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 725static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
718 unsigned long sz) 726 unsigned long sz)
719{ 727{
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 6c0b1f5f8d2c..fa9fb5b4c66c 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -134,7 +134,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, 134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk) 135 unsigned long end, struct mm_walk *walk)
136{ 136{
137 struct vm_area_struct *vma = walk->private; 137 struct vm_area_struct *vma = walk->vma;
138 split_huge_page_pmd(vma, addr, pmd); 138 split_huge_page_pmd(vma, addr, pmd);
139 return 0; 139 return 0;
140} 140}
@@ -163,9 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
163 if (vma->vm_start >= (addr + len)) 163 if (vma->vm_start >= (addr + len))
164 break; 164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE; 165 vma->vm_flags |= VM_NOHUGEPAGE;
166 subpage_proto_walk.private = vma; 166 walk_page_vma(vma, &subpage_proto_walk);
167 walk_page_range(vma->vm_start, vma->vm_end,
168 &subpage_proto_walk);
169 vma = vma->vm_next; 167 vma = vma->vm_next;
170 } 168 }
171} 169}
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0441ec24ae87..fbb5ee3ae57c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -99,7 +99,7 @@ extern unsigned long zero_page_mask;
99#endif /* CONFIG_64BIT */ 99#endif /* CONFIG_64BIT */
100#define PTRS_PER_PGD 2048 100#define PTRS_PER_PGD 2048
101 101
102#define FIRST_USER_ADDRESS 0 102#define FIRST_USER_ADDRESS 0UL
103 103
104#define pte_ERROR(e) \ 104#define pte_ERROR(e) \
105 printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) 105 printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 639fce464008..5c586c78ca8d 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -235,10 +235,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
235 /* Try to get the remaining pages with get_user_pages */ 235 /* Try to get the remaining pages with get_user_pages */
236 start += nr << PAGE_SHIFT; 236 start += nr << PAGE_SHIFT;
237 pages += nr; 237 pages += nr;
238 down_read(&mm->mmap_sem); 238 ret = get_user_pages_unlocked(current, mm, start,
239 ret = get_user_pages(current, mm, start, 239 nr_pages - nr, write, 0, pages);
240 nr_pages - nr, write, 0, pages, NULL);
241 up_read(&mm->mmap_sem);
242 /* Have to be a bit careful with return values */ 240 /* Have to be a bit careful with return values */
243 if (nr > 0) 241 if (nr > 0)
244 ret = (ret < 0) ? nr : ret + nr; 242 ret = (ret < 0) ? nr : ret + nr;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 3c80d2e38f03..210ffede0153 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -192,12 +192,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
192 return 0; 192 return 0;
193} 193}
194 194
195struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
196 int write)
197{
198 return ERR_PTR(-EINVAL);
199}
200
201int pmd_huge(pmd_t pmd) 195int pmd_huge(pmd_t pmd)
202{ 196{
203 if (!MACHINE_HAS_HPAGE) 197 if (!MACHINE_HAS_HPAGE)
@@ -210,17 +204,3 @@ int pud_huge(pud_t pud)
210{ 204{
211 return 0; 205 return 0;
212} 206}
213
214struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
215 pmd_t *pmdp, int write)
216{
217 struct page *page;
218
219 if (!MACHINE_HAS_HPAGE)
220 return NULL;
221
222 page = pmd_page(*pmdp);
223 if (page)
224 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
225 return page;
226}
diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h
index 5170ffdea643..0553e5cd5985 100644
--- a/arch/score/include/asm/pgtable.h
+++ b/arch/score/include/asm/pgtable.h
@@ -27,7 +27,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)];
27#define PTRS_PER_PTE 1024 27#define PTRS_PER_PTE 1024
28 28
29#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) 29#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE)
30#define FIRST_USER_ADDRESS 0 30#define FIRST_USER_ADDRESS 0UL
31 31
32#define VMALLOC_START (0xc0000000UL) 32#define VMALLOC_START (0xc0000000UL)
33 33
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index cf434c64408d..89c513a982fc 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -62,7 +62,7 @@ static inline unsigned long long neff_sign_extend(unsigned long val)
62/* Entries per level */ 62/* Entries per level */
63#define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) 63#define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE))
64 64
65#define FIRST_USER_ADDRESS 0 65#define FIRST_USER_ADDRESS 0UL
66 66
67#define PHYS_ADDR_MASK29 0x1fffffff 67#define PHYS_ADDR_MASK29 0x1fffffff
68#define PHYS_ADDR_MASK32 0xffffffff 68#define PHYS_ADDR_MASK32 0xffffffff
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 37458f38b220..e15f52a17b6c 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -257,10 +257,8 @@ slow_irqon:
257 start += nr << PAGE_SHIFT; 257 start += nr << PAGE_SHIFT;
258 pages += nr; 258 pages += nr;
259 259
260 down_read(&mm->mmap_sem); 260 ret = get_user_pages_unlocked(current, mm, start,
261 ret = get_user_pages(current, mm, start, 261 (end - start) >> PAGE_SHIFT, write, 0, pages);
262 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
263 up_read(&mm->mmap_sem);
264 262
265 /* Have to be a bit careful with return values */ 263 /* Have to be a bit careful with return values */
266 if (nr > 0) { 264 if (nr > 0) {
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index d7762349ea48..534bc978af8a 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -67,12 +67,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
67 return 0; 67 return 0;
68} 68}
69 69
70struct page *follow_huge_addr(struct mm_struct *mm,
71 unsigned long address, int write)
72{
73 return ERR_PTR(-EINVAL);
74}
75
76int pmd_huge(pmd_t pmd) 70int pmd_huge(pmd_t pmd)
77{ 71{
78 return 0; 72 return 0;
@@ -82,9 +76,3 @@ int pud_huge(pud_t pud)
82{ 76{
83 return 0; 77 return 0;
84} 78}
85
86struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
87 pmd_t *pmd, int write)
88{
89 return NULL;
90}
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index b2f7dc46a7d1..f06b36a00a3b 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -44,7 +44,7 @@ unsigned long __init bootmem_init(unsigned long *pages_avail);
44#define PTRS_PER_PMD SRMMU_PTRS_PER_PMD 44#define PTRS_PER_PMD SRMMU_PTRS_PER_PMD
45#define PTRS_PER_PGD SRMMU_PTRS_PER_PGD 45#define PTRS_PER_PGD SRMMU_PTRS_PER_PGD
46#define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE 46#define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE
47#define FIRST_USER_ADDRESS 0 47#define FIRST_USER_ADDRESS 0UL
48#define PTE_SIZE (PTRS_PER_PTE*4) 48#define PTE_SIZE (PTRS_PER_PTE*4)
49 49
50#define PAGE_NONE SRMMU_PAGE_NONE 50#define PAGE_NONE SRMMU_PAGE_NONE
@@ -102,7 +102,8 @@ extern unsigned long empty_zero_page;
102 */ 102 */
103static inline unsigned long srmmu_swap(unsigned long *addr, unsigned long value) 103static inline unsigned long srmmu_swap(unsigned long *addr, unsigned long value)
104{ 104{
105 __asm__ __volatile__("swap [%2], %0" : "=&r" (value) : "0" (value), "r" (addr)); 105 __asm__ __volatile__("swap [%2], %0" :
106 "=&r" (value) : "0" (value), "r" (addr) : "memory");
106 return value; 107 return value;
107} 108}
108 109
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2ac7873ad6fd..dc165ebdf05a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -93,7 +93,7 @@ bool kern_addr_valid(unsigned long addr);
93#define PTRS_PER_PGD (1UL << PGDIR_BITS) 93#define PTRS_PER_PGD (1UL << PGDIR_BITS)
94 94
95/* Kernel has a separate 44bit address space. */ 95/* Kernel has a separate 44bit address space. */
96#define FIRST_USER_ADDRESS 0 96#define FIRST_USER_ADDRESS 0UL
97 97
98#define pmd_ERROR(e) \ 98#define pmd_ERROR(e) \
99 pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ 99 pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index ae6ce383d4df..2e5c4fc2daa9 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -249,10 +249,8 @@ slow:
249 start += nr << PAGE_SHIFT; 249 start += nr << PAGE_SHIFT;
250 pages += nr; 250 pages += nr;
251 251
252 down_read(&mm->mmap_sem); 252 ret = get_user_pages_unlocked(current, mm, start,
253 ret = get_user_pages(current, mm, start, 253 (end - start) >> PAGE_SHIFT, write, 0, pages);
254 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
255 up_read(&mm->mmap_sem);
256 254
257 /* Have to be a bit careful with return values */ 255 /* Have to be a bit careful with return values */
258 if (nr > 0) { 256 if (nr > 0) {
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index d329537739c6..4242eab12e10 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -215,12 +215,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
215 return entry; 215 return entry;
216} 216}
217 217
218struct page *follow_huge_addr(struct mm_struct *mm,
219 unsigned long address, int write)
220{
221 return ERR_PTR(-EINVAL);
222}
223
224int pmd_huge(pmd_t pmd) 218int pmd_huge(pmd_t pmd)
225{ 219{
226 return 0; 220 return 0;
@@ -230,9 +224,3 @@ int pud_huge(pud_t pud)
230{ 224{
231 return 0; 225 return 0;
232} 226}
233
234struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
235 pmd_t *pmd, int write)
236{
237 return NULL;
238}
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index bc75b6ef2e79..95a4f19d16c5 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -67,7 +67,7 @@ extern void pgtable_cache_init(void);
67extern void paging_init(void); 67extern void paging_init(void);
68extern void set_page_homes(void); 68extern void set_page_homes(void);
69 69
70#define FIRST_USER_ADDRESS 0 70#define FIRST_USER_ADDRESS 0UL
71 71
72#define _PAGE_PRESENT HV_PTE_PRESENT 72#define _PAGE_PRESENT HV_PTE_PRESENT
73#define _PAGE_HUGE_PAGE HV_PTE_PAGE 73#define _PAGE_HUGE_PAGE HV_PTE_PAGE
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 3270e0019266..8416240c322c 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -150,12 +150,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
150 return NULL; 150 return NULL;
151} 151}
152 152
153struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
154 int write)
155{
156 return ERR_PTR(-EINVAL);
157}
158
159int pmd_huge(pmd_t pmd) 153int pmd_huge(pmd_t pmd)
160{ 154{
161 return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); 155 return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE);
@@ -166,28 +160,6 @@ int pud_huge(pud_t pud)
166 return !!(pud_val(pud) & _PAGE_HUGE_PAGE); 160 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
167} 161}
168 162
169struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
170 pmd_t *pmd, int write)
171{
172 struct page *page;
173
174 page = pte_page(*(pte_t *)pmd);
175 if (page)
176 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
177 return page;
178}
179
180struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
181 pud_t *pud, int write)
182{
183 struct page *page;
184
185 page = pte_page(*(pte_t *)pud);
186 if (page)
187 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
188 return page;
189}
190
191int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 163int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
192{ 164{
193 return 0; 165 return 0;
diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h
index 7afe86035fa7..cfbe59752469 100644
--- a/arch/um/include/asm/pgtable-2level.h
+++ b/arch/um/include/asm/pgtable-2level.h
@@ -23,7 +23,7 @@
23#define PTRS_PER_PTE 1024 23#define PTRS_PER_PTE 1024
24#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) 24#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
25#define PTRS_PER_PGD 1024 25#define PTRS_PER_PGD 1024
26#define FIRST_USER_ADDRESS 0 26#define FIRST_USER_ADDRESS 0UL
27 27
28#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
29 printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ 29 printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 344c559c0a17..2b4274e7c095 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -41,7 +41,7 @@
41#endif 41#endif
42 42
43#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) 43#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
44#define FIRST_USER_ADDRESS 0 44#define FIRST_USER_ADDRESS 0UL
45 45
46#define pte_ERROR(e) \ 46#define pte_ERROR(e) \
47 printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ 47 printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index 08b8d4295e70..2ade20d8eab3 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -69,6 +69,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
69 69
70no_pte: 70no_pte:
71 pmd_free(mm, new_pmd); 71 pmd_free(mm, new_pmd);
72 mm_dec_nr_pmds(mm);
72no_pmd: 73no_pmd:
73 free_pages((unsigned long)new_pgd, 0); 74 free_pages((unsigned long)new_pgd, 0);
74no_pgd: 75no_pgd:
@@ -96,7 +97,9 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
96 pte = pmd_pgtable(*pmd); 97 pte = pmd_pgtable(*pmd);
97 pmd_clear(pmd); 98 pmd_clear(pmd);
98 pte_free(mm, pte); 99 pte_free(mm, pte);
100 atomic_long_dec(&mm->nr_ptes);
99 pmd_free(mm, pmd); 101 pmd_free(mm, pmd);
102 mm_dec_nr_pmds(mm);
100free: 103free:
101 free_pages((unsigned long) pgd, 0); 104 free_pages((unsigned long) pgd, 0);
102} 105}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 5185a4f599ec..3e0230c94cff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -4,7 +4,7 @@
4#include <linux/const.h> 4#include <linux/const.h>
5#include <asm/page_types.h> 5#include <asm/page_types.h>
6 6
7#define FIRST_USER_ADDRESS 0 7#define FIRST_USER_ADDRESS 0UL
8 8
9#define _PAGE_BIT_PRESENT 0 /* is present */ 9#define _PAGE_BIT_PRESENT 0 /* is present */
10#define _PAGE_BIT_RW 1 /* writeable */ 10#define _PAGE_BIT_RW 1 /* writeable */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d7547824e763..89df70e0caa6 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -172,7 +172,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
172 */ 172 */
173 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 173 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
174 return 0; 174 return 0;
175 if (unlikely(pmd_large(pmd))) { 175 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
176 /* 176 /*
177 * NUMA hinting faults need to be handled in the GUP 177 * NUMA hinting faults need to be handled in the GUP
178 * slowpath for accounting purposes and so that they 178 * slowpath for accounting purposes and so that they
@@ -388,10 +388,9 @@ slow_irqon:
388 start += nr << PAGE_SHIFT; 388 start += nr << PAGE_SHIFT;
389 pages += nr; 389 pages += nr;
390 390
391 down_read(&mm->mmap_sem); 391 ret = get_user_pages_unlocked(current, mm, start,
392 ret = get_user_pages(current, mm, start, 392 (end - start) >> PAGE_SHIFT,
393 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); 393 write, 0, pages);
394 up_read(&mm->mmap_sem);
395 394
396 /* Have to be a bit careful with return values */ 395 /* Have to be a bit careful with return values */
397 if (nr > 0) { 396 if (nr > 0) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index bca0aa3a003f..42982b26e32b 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -52,23 +52,17 @@ int pud_huge(pud_t pud)
52 return 0; 52 return 0;
53} 53}
54 54
55struct page *
56follow_huge_pmd(struct mm_struct *mm, unsigned long address,
57 pmd_t *pmd, int write)
58{
59 return NULL;
60}
61#else 55#else
62 56
63struct page * 57/*
64follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 58 * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
65{ 59 * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
66 return ERR_PTR(-EINVAL); 60 * Otherwise, returns 0.
67} 61 */
68
69int pmd_huge(pmd_t pmd) 62int pmd_huge(pmd_t pmd)
70{ 63{
71 return !!(pmd_val(pmd) & _PAGE_PSE); 64 return !pmd_none(pmd) &&
65 (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
72} 66}
73 67
74int pud_huge(pud_t pud) 68int pud_huge(pud_t pud)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..7b22adaad4f1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
190 190
191#endif /* CONFIG_X86_PAE */ 191#endif /* CONFIG_X86_PAE */
192 192
193static void free_pmds(pmd_t *pmds[]) 193static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
194{ 194{
195 int i; 195 int i;
196 196
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[])
198 if (pmds[i]) { 198 if (pmds[i]) {
199 pgtable_pmd_page_dtor(virt_to_page(pmds[i])); 199 pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
200 free_page((unsigned long)pmds[i]); 200 free_page((unsigned long)pmds[i]);
201 mm_dec_nr_pmds(mm);
201 } 202 }
202} 203}
203 204
204static int preallocate_pmds(pmd_t *pmds[]) 205static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
205{ 206{
206 int i; 207 int i;
207 bool failed = false; 208 bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
215 pmd = NULL; 216 pmd = NULL;
216 failed = true; 217 failed = true;
217 } 218 }
219 if (pmd)
220 mm_inc_nr_pmds(mm);
218 pmds[i] = pmd; 221 pmds[i] = pmd;
219 } 222 }
220 223
221 if (failed) { 224 if (failed) {
222 free_pmds(pmds); 225 free_pmds(mm, pmds);
223 return -ENOMEM; 226 return -ENOMEM;
224 } 227 }
225 228
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
246 249
247 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); 250 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
248 pmd_free(mm, pmd); 251 pmd_free(mm, pmd);
252 mm_dec_nr_pmds(mm);
249 } 253 }
250 } 254 }
251} 255}
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
283 287
284 mm->pgd = pgd; 288 mm->pgd = pgd;
285 289
286 if (preallocate_pmds(pmds) != 0) 290 if (preallocate_pmds(mm, pmds) != 0)
287 goto out_free_pgd; 291 goto out_free_pgd;
288 292
289 if (paravirt_pgd_alloc(mm) != 0) 293 if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
304 return pgd; 308 return pgd;
305 309
306out_free_pmds: 310out_free_pmds:
307 free_pmds(pmds); 311 free_pmds(mm, pmds);
308out_free_pgd: 312out_free_pgd:
309 free_page((unsigned long)pgd); 313 free_page((unsigned long)pgd);
310out: 314out:
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 01b80dce9d65..a5e929a10c20 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -57,7 +57,7 @@
57#define PTRS_PER_PGD 1024 57#define PTRS_PER_PGD 1024
58#define PGD_ORDER 0 58#define PGD_ORDER 0
59#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 59#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
60#define FIRST_USER_ADDRESS 0 60#define FIRST_USER_ADDRESS 0UL
61#define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) 61#define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT)
62 62
63/* 63/*
diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c
index bee2329e0b2e..24152accc66c 100644
--- a/drivers/media/pci/ivtv/ivtv-udma.c
+++ b/drivers/media/pci/ivtv/ivtv-udma.c
@@ -124,10 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
124 } 124 }
125 125
126 /* Get user pages for DMA Xfer */ 126 /* Get user pages for DMA Xfer */
127 down_read(&current->mm->mmap_sem); 127 err = get_user_pages_unlocked(current, current->mm,
128 err = get_user_pages(current, current->mm, 128 user_dma.uaddr, user_dma.page_count, 0, 1, dma->map);
129 user_dma.uaddr, user_dma.page_count, 0, 1, dma->map, NULL);
130 up_read(&current->mm->mmap_sem);
131 129
132 if (user_dma.page_count != err) { 130 if (user_dma.page_count != err) {
133 IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", 131 IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 128d3b55bdd9..9a1c34205254 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4551,18 +4551,15 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
4551 return -ENOMEM; 4551 return -ENOMEM;
4552 4552
4553 /* Try to fault in all of the necessary pages */ 4553 /* Try to fault in all of the necessary pages */
4554 down_read(&current->mm->mmap_sem);
4555 /* rw==READ means read from drive, write into memory area */ 4554 /* rw==READ means read from drive, write into memory area */
4556 res = get_user_pages( 4555 res = get_user_pages_unlocked(
4557 current, 4556 current,
4558 current->mm, 4557 current->mm,
4559 uaddr, 4558 uaddr,
4560 nr_pages, 4559 nr_pages,
4561 rw == READ, 4560 rw == READ,
4562 0, /* don't force */ 4561 0, /* don't force */
4563 pages, 4562 pages);
4564 NULL);
4565 up_read(&current->mm->mmap_sem);
4566 4563
4567 /* Errors and no page mapped should return here */ 4564 /* Errors and no page mapped should return here */
4568 if (res < nr_pages) 4565 if (res < nr_pages)
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index b545d3d1da3e..feafa172b155 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
160 selected->pid, selected->comm, 160 selected->pid, selected->comm,
161 selected_oom_score_adj, selected_tasksize); 161 selected_oom_score_adj, selected_tasksize);
162 lowmem_deathpending_timeout = jiffies + HZ; 162 lowmem_deathpending_timeout = jiffies + HZ;
163 set_tsk_thread_flag(selected, TIF_MEMDIE); 163 /*
164 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
165 * infrastructure. There is no real reason why the selected
166 * task should have access to the memory reserves.
167 */
168 mark_tsk_oom_victim(selected);
164 send_sig(SIGKILL, selected, 0); 169 send_sig(SIGKILL, selected, 0);
165 rem += selected_tasksize; 170 rem += selected_tasksize;
166 } 171 }
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 42bad18c66c9..259a4d5a4e8f 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -90,7 +90,7 @@ static void sysrq_handle_loglevel(int key)
90 90
91 i = key - '0'; 91 i = key - '0';
92 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; 92 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
93 printk("Loglevel set to %d\n", i); 93 pr_info("Loglevel set to %d\n", i);
94 console_loglevel = i; 94 console_loglevel = i;
95} 95}
96static struct sysrq_key_op sysrq_loglevel_op = { 96static struct sysrq_key_op sysrq_loglevel_op = {
@@ -220,7 +220,7 @@ static void showacpu(void *dummy)
220 return; 220 return;
221 221
222 spin_lock_irqsave(&show_lock, flags); 222 spin_lock_irqsave(&show_lock, flags);
223 printk(KERN_INFO "CPU%d:\n", smp_processor_id()); 223 pr_info("CPU%d:\n", smp_processor_id());
224 show_stack(NULL, NULL); 224 show_stack(NULL, NULL);
225 spin_unlock_irqrestore(&show_lock, flags); 225 spin_unlock_irqrestore(&show_lock, flags);
226} 226}
@@ -243,7 +243,7 @@ static void sysrq_handle_showallcpus(int key)
243 struct pt_regs *regs = get_irq_regs(); 243 struct pt_regs *regs = get_irq_regs();
244 244
245 if (regs) { 245 if (regs) {
246 printk(KERN_INFO "CPU%d:\n", smp_processor_id()); 246 pr_info("CPU%d:\n", smp_processor_id());
247 show_regs(regs); 247 show_regs(regs);
248 } 248 }
249 schedule_work(&sysrq_showallcpus); 249 schedule_work(&sysrq_showallcpus);
@@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = {
355 355
356static void moom_callback(struct work_struct *ignored) 356static void moom_callback(struct work_struct *ignored)
357{ 357{
358 out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, 358 if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
359 0, NULL, true); 359 GFP_KERNEL, 0, NULL, true))
360 pr_info("OOM request ignored because killer is disabled\n");
360} 361}
361 362
362static DECLARE_WORK(moom_work, moom_callback); 363static DECLARE_WORK(moom_work, moom_callback);
@@ -522,7 +523,7 @@ void __handle_sysrq(int key, bool check_mask)
522 */ 523 */
523 orig_log_level = console_loglevel; 524 orig_log_level = console_loglevel;
524 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; 525 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
525 printk(KERN_INFO "SysRq : "); 526 pr_info("SysRq : ");
526 527
527 op_p = __sysrq_get_key_op(key); 528 op_p = __sysrq_get_key_op(key);
528 if (op_p) { 529 if (op_p) {
@@ -531,14 +532,14 @@ void __handle_sysrq(int key, bool check_mask)
531 * should not) and is the invoked operation enabled? 532 * should not) and is the invoked operation enabled?
532 */ 533 */
533 if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { 534 if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
534 printk("%s\n", op_p->action_msg); 535 pr_cont("%s\n", op_p->action_msg);
535 console_loglevel = orig_log_level; 536 console_loglevel = orig_log_level;
536 op_p->handler(key); 537 op_p->handler(key);
537 } else { 538 } else {
538 printk("This sysrq operation is disabled.\n"); 539 pr_cont("This sysrq operation is disabled.\n");
539 } 540 }
540 } else { 541 } else {
541 printk("HELP : "); 542 pr_cont("HELP : ");
542 /* Only print the help msg once per handler */ 543 /* Only print the help msg once per handler */
543 for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { 544 for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
544 if (sysrq_key_table[i]) { 545 if (sysrq_key_table[i]) {
@@ -549,10 +550,10 @@ void __handle_sysrq(int key, bool check_mask)
549 ; 550 ;
550 if (j != i) 551 if (j != i)
551 continue; 552 continue;
552 printk("%s ", sysrq_key_table[i]->help_msg); 553 pr_cont("%s ", sysrq_key_table[i]->help_msg);
553 } 554 }
554 } 555 }
555 printk("\n"); 556 pr_cont("\n");
556 console_loglevel = orig_log_level; 557 console_loglevel = orig_log_level;
557 } 558 }
558 rcu_read_unlock(); 559 rcu_read_unlock();
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index 7c74f58fc101..0e24eb9c219c 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,10 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
686 if (!pages) 686 if (!pages)
687 return -ENOMEM; 687 return -ENOMEM;
688 688
689 down_read(&current->mm->mmap_sem); 689 ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf,
690 ret = get_user_pages(current, current->mm, (unsigned long)buf, 690 nr_pages, WRITE, 0, pages);
691 nr_pages, WRITE, 0, pages, NULL);
692 up_read(&current->mm->mmap_sem);
693 691
694 if (ret < nr_pages) { 692 if (ret < nr_pages) {
695 nr_pages = ret; 693 nr_pages = ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790dbae3343c..c73df6a7c9b6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1407 while (index <= end_index) { 1407 while (index <= end_index) {
1408 page = find_get_page(inode->i_mapping, index); 1408 page = find_get_page(inode->i_mapping, index);
1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1410 account_page_redirty(page);
1411 __set_page_dirty_nobuffers(page); 1410 __set_page_dirty_nobuffers(page);
1411 account_page_redirty(page);
1412 page_cache_release(page); 1412 page_cache_release(page);
1413 index++; 1413 index++;
1414 } 1414 }
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
5#include <linux/ksm.h> 5#include <linux/ksm.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/huge_mm.h>
8#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 10#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 123 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 124 */
124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 else if (PageTransCompound(page)) {
125 PageAnon(compound_head(page)))) 126 struct page *head = compound_head(page);
126 u |= 1 << KPF_THP; 127
128 if (PageLRU(head) || PageAnon(head))
129 u |= 1 << KPF_THP;
130 else if (is_huge_zero_page(head)) {
131 u |= 1 << KPF_ZERO_PAGE;
132 u |= 1 << KPF_THP;
133 }
134 } else if (is_zero_pfn(page_to_pfn(page)))
135 u |= 1 << KPF_ZERO_PAGE;
136
127 137
128 /* 138 /*
129 * Caveats on high order pages: page->_count will only be set 139 * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6396f88c6687..0e36c1e49fe3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
21 21
22void task_mem(struct seq_file *m, struct mm_struct *mm) 22void task_mem(struct seq_file *m, struct mm_struct *mm)
23{ 23{
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap, ptes, pmds;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
45 seq_printf(m, 47 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 48 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 49 "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmExe:\t%8lu kB\n" 56 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 57 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 58 "VmPTE:\t%8lu kB\n"
59 "VmPMD:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 60 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 61 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 62 total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
63 total_rss << (PAGE_SHIFT-10), 66 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 67 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 68 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 69 ptes >> 10,
67 atomic_long_read(&mm->nr_ptes)) >> 10, 70 pmds >> 10,
68 swap << (PAGE_SHIFT-10)); 71 swap << (PAGE_SHIFT-10));
69} 72}
70 73
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
433 436
434#ifdef CONFIG_PROC_PAGE_MONITOR 437#ifdef CONFIG_PROC_PAGE_MONITOR
435struct mem_size_stats { 438struct mem_size_stats {
436 struct vm_area_struct *vma;
437 unsigned long resident; 439 unsigned long resident;
438 unsigned long shared_clean; 440 unsigned long shared_clean;
439 unsigned long shared_dirty; 441 unsigned long shared_dirty;
@@ -482,7 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
482 struct mm_walk *walk) 484 struct mm_walk *walk)
483{ 485{
484 struct mem_size_stats *mss = walk->private; 486 struct mem_size_stats *mss = walk->private;
485 struct vm_area_struct *vma = mss->vma; 487 struct vm_area_struct *vma = walk->vma;
486 struct page *page = NULL; 488 struct page *page = NULL;
487 489
488 if (pte_present(*pte)) { 490 if (pte_present(*pte)) {
@@ -506,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
506 struct mm_walk *walk) 508 struct mm_walk *walk)
507{ 509{
508 struct mem_size_stats *mss = walk->private; 510 struct mem_size_stats *mss = walk->private;
509 struct vm_area_struct *vma = mss->vma; 511 struct vm_area_struct *vma = walk->vma;
510 struct page *page; 512 struct page *page;
511 513
512 /* FOLL_DUMP will return -EFAULT on huge zero page */ 514 /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -527,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
527static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 529static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
528 struct mm_walk *walk) 530 struct mm_walk *walk)
529{ 531{
530 struct mem_size_stats *mss = walk->private; 532 struct vm_area_struct *vma = walk->vma;
531 struct vm_area_struct *vma = mss->vma;
532 pte_t *pte; 533 pte_t *pte;
533 spinlock_t *ptl; 534 spinlock_t *ptl;
534 535
@@ -620,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
620 }; 621 };
621 622
622 memset(&mss, 0, sizeof mss); 623 memset(&mss, 0, sizeof mss);
623 mss.vma = vma;
624 /* mmap_sem is held in m_start */ 624 /* mmap_sem is held in m_start */
625 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 625 walk_page_vma(vma, &smaps_walk);
626 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
627 626
628 show_map_vma(m, vma, is_pid); 627 show_map_vma(m, vma, is_pid);
629 628
@@ -737,14 +736,13 @@ enum clear_refs_types {
737}; 736};
738 737
739struct clear_refs_private { 738struct clear_refs_private {
740 struct vm_area_struct *vma;
741 enum clear_refs_types type; 739 enum clear_refs_types type;
742}; 740};
743 741
742#ifdef CONFIG_MEM_SOFT_DIRTY
744static inline void clear_soft_dirty(struct vm_area_struct *vma, 743static inline void clear_soft_dirty(struct vm_area_struct *vma,
745 unsigned long addr, pte_t *pte) 744 unsigned long addr, pte_t *pte)
746{ 745{
747#ifdef CONFIG_MEM_SOFT_DIRTY
748 /* 746 /*
749 * The soft-dirty tracker uses #PF-s to catch writes 747 * The soft-dirty tracker uses #PF-s to catch writes
750 * to pages, so write-protect the pte as well. See the 748 * to pages, so write-protect the pte as well. See the
@@ -761,19 +759,60 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
761 } 759 }
762 760
763 set_pte_at(vma->vm_mm, addr, pte, ptent); 761 set_pte_at(vma->vm_mm, addr, pte, ptent);
764#endif
765} 762}
766 763
764static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
765 unsigned long addr, pmd_t *pmdp)
766{
767 pmd_t pmd = *pmdp;
768
769 pmd = pmd_wrprotect(pmd);
770 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
771
772 if (vma->vm_flags & VM_SOFTDIRTY)
773 vma->vm_flags &= ~VM_SOFTDIRTY;
774
775 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
776}
777
778#else
779
780static inline void clear_soft_dirty(struct vm_area_struct *vma,
781 unsigned long addr, pte_t *pte)
782{
783}
784
785static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
786 unsigned long addr, pmd_t *pmdp)
787{
788}
789#endif
790
767static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 791static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
768 unsigned long end, struct mm_walk *walk) 792 unsigned long end, struct mm_walk *walk)
769{ 793{
770 struct clear_refs_private *cp = walk->private; 794 struct clear_refs_private *cp = walk->private;
771 struct vm_area_struct *vma = cp->vma; 795 struct vm_area_struct *vma = walk->vma;
772 pte_t *pte, ptent; 796 pte_t *pte, ptent;
773 spinlock_t *ptl; 797 spinlock_t *ptl;
774 struct page *page; 798 struct page *page;
775 799
776 split_huge_page_pmd(vma, addr, pmd); 800 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
801 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
802 clear_soft_dirty_pmd(vma, addr, pmd);
803 goto out;
804 }
805
806 page = pmd_page(*pmd);
807
808 /* Clear accessed and referenced bits. */
809 pmdp_test_and_clear_young(vma, addr, pmd);
810 ClearPageReferenced(page);
811out:
812 spin_unlock(ptl);
813 return 0;
814 }
815
777 if (pmd_trans_unstable(pmd)) 816 if (pmd_trans_unstable(pmd))
778 return 0; 817 return 0;
779 818
@@ -802,6 +841,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
802 return 0; 841 return 0;
803} 842}
804 843
844static int clear_refs_test_walk(unsigned long start, unsigned long end,
845 struct mm_walk *walk)
846{
847 struct clear_refs_private *cp = walk->private;
848 struct vm_area_struct *vma = walk->vma;
849
850 if (vma->vm_flags & VM_PFNMAP)
851 return 1;
852
853 /*
854 * Writing 1 to /proc/pid/clear_refs affects all pages.
855 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
856 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
857 * Writing 4 to /proc/pid/clear_refs affects all pages.
858 */
859 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
860 return 1;
861 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
862 return 1;
863 return 0;
864}
865
805static ssize_t clear_refs_write(struct file *file, const char __user *buf, 866static ssize_t clear_refs_write(struct file *file, const char __user *buf,
806 size_t count, loff_t *ppos) 867 size_t count, loff_t *ppos)
807{ 868{
@@ -842,6 +903,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
842 }; 903 };
843 struct mm_walk clear_refs_walk = { 904 struct mm_walk clear_refs_walk = {
844 .pmd_entry = clear_refs_pte_range, 905 .pmd_entry = clear_refs_pte_range,
906 .test_walk = clear_refs_test_walk,
845 .mm = mm, 907 .mm = mm,
846 .private = &cp, 908 .private = &cp,
847 }; 909 };
@@ -861,28 +923,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
861 } 923 }
862 mmu_notifier_invalidate_range_start(mm, 0, -1); 924 mmu_notifier_invalidate_range_start(mm, 0, -1);
863 } 925 }
864 for (vma = mm->mmap; vma; vma = vma->vm_next) { 926 walk_page_range(0, ~0UL, &clear_refs_walk);
865 cp.vma = vma;
866 if (is_vm_hugetlb_page(vma))
867 continue;
868 /*
869 * Writing 1 to /proc/pid/clear_refs affects all pages.
870 *
871 * Writing 2 to /proc/pid/clear_refs only affects
872 * Anonymous pages.
873 *
874 * Writing 3 to /proc/pid/clear_refs only affects file
875 * mapped pages.
876 *
877 * Writing 4 to /proc/pid/clear_refs affects all pages.
878 */
879 if (type == CLEAR_REFS_ANON && vma->vm_file)
880 continue;
881 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
882 continue;
883 walk_page_range(vma->vm_start, vma->vm_end,
884 &clear_refs_walk);
885 }
886 if (type == CLEAR_REFS_SOFT_DIRTY) 927 if (type == CLEAR_REFS_SOFT_DIRTY)
887 mmu_notifier_invalidate_range_end(mm, 0, -1); 928 mmu_notifier_invalidate_range_end(mm, 0, -1);
888 flush_tlb_mm(mm); 929 flush_tlb_mm(mm);
@@ -1050,15 +1091,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
1050static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1091static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1051 struct mm_walk *walk) 1092 struct mm_walk *walk)
1052{ 1093{
1053 struct vm_area_struct *vma; 1094 struct vm_area_struct *vma = walk->vma;
1054 struct pagemapread *pm = walk->private; 1095 struct pagemapread *pm = walk->private;
1055 spinlock_t *ptl; 1096 spinlock_t *ptl;
1056 pte_t *pte; 1097 pte_t *pte, *orig_pte;
1057 int err = 0; 1098 int err = 0;
1058 1099
1059 /* find the first VMA at or above 'addr' */ 1100 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1060 vma = find_vma(walk->mm, addr);
1061 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1062 int pmd_flags2; 1101 int pmd_flags2;
1063 1102
1064 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1103 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1084,51 +1123,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1084 if (pmd_trans_unstable(pmd)) 1123 if (pmd_trans_unstable(pmd))
1085 return 0; 1124 return 0;
1086 1125
1087 while (1) { 1126 /*
1088 /* End of address space hole, which we mark as non-present. */ 1127 * We can assume that @vma always points to a valid one and @end never
1089 unsigned long hole_end; 1128 * goes beyond vma->vm_end.
1090 1129 */
1091 if (vma) 1130 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1092 hole_end = min(end, vma->vm_start); 1131 for (; addr < end; pte++, addr += PAGE_SIZE) {
1093 else 1132 pagemap_entry_t pme;
1094 hole_end = end;
1095
1096 for (; addr < hole_end; addr += PAGE_SIZE) {
1097 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1098
1099 err = add_to_pagemap(addr, &pme, pm);
1100 if (err)
1101 return err;
1102 }
1103
1104 if (!vma || vma->vm_start >= end)
1105 break;
1106 /*
1107 * We can't possibly be in a hugetlb VMA. In general,
1108 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1109 * the pmd_entry can only be called on addresses in a
1110 * hugetlb if the walk starts in a non-hugetlb VMA and
1111 * spans a hugepage VMA. Since pagemap_read walks are
1112 * PMD-sized and PMD-aligned, this will never be true.
1113 */
1114 BUG_ON(is_vm_hugetlb_page(vma));
1115
1116 /* Addresses in the VMA. */
1117 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1118 pagemap_entry_t pme;
1119 pte = pte_offset_map(pmd, addr);
1120 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1121 pte_unmap(pte);
1122 err = add_to_pagemap(addr, &pme, pm);
1123 if (err)
1124 return err;
1125 }
1126 1133
1127 if (addr == end) 1134 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1135 err = add_to_pagemap(addr, &pme, pm);
1136 if (err)
1128 break; 1137 break;
1129
1130 vma = find_vma(walk->mm, addr);
1131 } 1138 }
1139 pte_unmap_unlock(orig_pte, ptl);
1132 1140
1133 cond_resched(); 1141 cond_resched();
1134 1142
@@ -1154,15 +1162,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1154 struct mm_walk *walk) 1162 struct mm_walk *walk)
1155{ 1163{
1156 struct pagemapread *pm = walk->private; 1164 struct pagemapread *pm = walk->private;
1157 struct vm_area_struct *vma; 1165 struct vm_area_struct *vma = walk->vma;
1158 int err = 0; 1166 int err = 0;
1159 int flags2; 1167 int flags2;
1160 pagemap_entry_t pme; 1168 pagemap_entry_t pme;
1161 1169
1162 vma = find_vma(walk->mm, addr); 1170 if (vma->vm_flags & VM_SOFTDIRTY)
1163 WARN_ON_ONCE(!vma);
1164
1165 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1166 flags2 = __PM_SOFT_DIRTY; 1171 flags2 = __PM_SOFT_DIRTY;
1167 else 1172 else
1168 flags2 = 0; 1173 flags2 = 0;
@@ -1322,7 +1327,6 @@ const struct file_operations proc_pagemap_operations = {
1322#ifdef CONFIG_NUMA 1327#ifdef CONFIG_NUMA
1323 1328
1324struct numa_maps { 1329struct numa_maps {
1325 struct vm_area_struct *vma;
1326 unsigned long pages; 1330 unsigned long pages;
1327 unsigned long anon; 1331 unsigned long anon;
1328 unsigned long active; 1332 unsigned long active;
@@ -1391,18 +1395,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1391static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1395static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1392 unsigned long end, struct mm_walk *walk) 1396 unsigned long end, struct mm_walk *walk)
1393{ 1397{
1394 struct numa_maps *md; 1398 struct numa_maps *md = walk->private;
1399 struct vm_area_struct *vma = walk->vma;
1395 spinlock_t *ptl; 1400 spinlock_t *ptl;
1396 pte_t *orig_pte; 1401 pte_t *orig_pte;
1397 pte_t *pte; 1402 pte_t *pte;
1398 1403
1399 md = walk->private; 1404 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1400
1401 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1402 pte_t huge_pte = *(pte_t *)pmd; 1405 pte_t huge_pte = *(pte_t *)pmd;
1403 struct page *page; 1406 struct page *page;
1404 1407
1405 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1408 page = can_gather_numa_stats(huge_pte, vma, addr);
1406 if (page) 1409 if (page)
1407 gather_stats(page, md, pte_dirty(huge_pte), 1410 gather_stats(page, md, pte_dirty(huge_pte),
1408 HPAGE_PMD_SIZE/PAGE_SIZE); 1411 HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1414,7 +1417,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1414 return 0; 1417 return 0;
1415 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1418 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1416 do { 1419 do {
1417 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1420 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1418 if (!page) 1421 if (!page)
1419 continue; 1422 continue;
1420 gather_stats(page, md, pte_dirty(*pte), 1); 1423 gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1424,7 +1427,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1424 return 0; 1427 return 0;
1425} 1428}
1426#ifdef CONFIG_HUGETLB_PAGE 1429#ifdef CONFIG_HUGETLB_PAGE
1427static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1430static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1428 unsigned long addr, unsigned long end, struct mm_walk *walk) 1431 unsigned long addr, unsigned long end, struct mm_walk *walk)
1429{ 1432{
1430 struct numa_maps *md; 1433 struct numa_maps *md;
@@ -1443,7 +1446,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1443} 1446}
1444 1447
1445#else 1448#else
1446static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1449static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1447 unsigned long addr, unsigned long end, struct mm_walk *walk) 1450 unsigned long addr, unsigned long end, struct mm_walk *walk)
1448{ 1451{
1449 return 0; 1452 return 0;
@@ -1461,7 +1464,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1461 struct numa_maps *md = &numa_priv->md; 1464 struct numa_maps *md = &numa_priv->md;
1462 struct file *file = vma->vm_file; 1465 struct file *file = vma->vm_file;
1463 struct mm_struct *mm = vma->vm_mm; 1466 struct mm_struct *mm = vma->vm_mm;
1464 struct mm_walk walk = {}; 1467 struct mm_walk walk = {
1468 .hugetlb_entry = gather_hugetlb_stats,
1469 .pmd_entry = gather_pte_stats,
1470 .private = md,
1471 .mm = mm,
1472 };
1465 struct mempolicy *pol; 1473 struct mempolicy *pol;
1466 char buffer[64]; 1474 char buffer[64];
1467 int nid; 1475 int nid;
@@ -1472,13 +1480,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1472 /* Ensure we start with an empty set of numa_maps statistics. */ 1480 /* Ensure we start with an empty set of numa_maps statistics. */
1473 memset(md, 0, sizeof(*md)); 1481 memset(md, 0, sizeof(*md));
1474 1482
1475 md->vma = vma;
1476
1477 walk.hugetlb_entry = gather_hugetbl_stats;
1478 walk.pmd_entry = gather_pte_stats;
1479 walk.private = md;
1480 walk.mm = mm;
1481
1482 pol = __get_vma_policy(vma, vma->vm_start); 1483 pol = __get_vma_policy(vma, vma->vm_start);
1483 if (pol) { 1484 if (pol) {
1484 mpol_to_str(buffer, sizeof(buffer), pol); 1485 mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1512,7 +1513,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1512 if (is_vm_hugetlb_page(vma)) 1513 if (is_vm_hugetlb_page(vma))
1513 seq_puts(m, " huge"); 1514 seq_puts(m, " huge");
1514 1515
1515 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1516 /* mmap_sem is held by m_start */
1517 walk_page_vma(vma, &walk);
1516 1518
1517 if (!md->pages) 1519 if (!md->pages)
1518 goto out; 1520 goto out;
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index 77ff547730af..5bdab6bffd23 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -4,6 +4,7 @@
4#define __ARCH_HAS_4LEVEL_HACK 4#define __ARCH_HAS_4LEVEL_HACK
5#define __PAGETABLE_PUD_FOLDED 5#define __PAGETABLE_PUD_FOLDED
6 6
7#define PUD_SHIFT PGDIR_SHIFT
7#define PUD_SIZE PGDIR_SIZE 8#define PUD_SIZE PGDIR_SIZE
8#define PUD_MASK PGDIR_MASK 9#define PUD_MASK PGDIR_MASK
9#define PTRS_PER_PUD 1 10#define PTRS_PER_PUD 1
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 3238ffa33f68..a014559e4a49 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -12,6 +12,10 @@
12#define COMPACT_PARTIAL 3 12#define COMPACT_PARTIAL 3
13/* The full zone was compacted */ 13/* The full zone was compacted */
14#define COMPACT_COMPLETE 4 14#define COMPACT_COMPLETE 4
15/* For more detailed tracepoint output */
16#define COMPACT_NO_SUITABLE_PAGE 5
17#define COMPACT_NOT_SUITABLE_ZONE 6
18/* When adding new state, please change compaction_status_string, too */
15 19
16/* Used to signal whether compaction detected need_sched() or lock contention */ 20/* Used to signal whether compaction detected need_sched() or lock contention */
17/* No contention detected */ 21/* No contention detected */
@@ -21,6 +25,8 @@
21/* Zone lock or lru_lock was contended in async compaction */ 25/* Zone lock or lru_lock was contended in async compaction */
22#define COMPACT_CONTENDED_LOCK 2 26#define COMPACT_CONTENDED_LOCK 2
23 27
28struct alloc_context; /* in mm/internal.h */
29
24#ifdef CONFIG_COMPACTION 30#ifdef CONFIG_COMPACTION
25extern int sysctl_compact_memory; 31extern int sysctl_compact_memory;
26extern int sysctl_compaction_handler(struct ctl_table *table, int write, 32extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -30,81 +36,25 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
30 void __user *buffer, size_t *length, loff_t *ppos); 36 void __user *buffer, size_t *length, loff_t *ppos);
31 37
32extern int fragmentation_index(struct zone *zone, unsigned int order); 38extern int fragmentation_index(struct zone *zone, unsigned int order);
33extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 39extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
34 int order, gfp_t gfp_mask, nodemask_t *mask, 40 int alloc_flags, const struct alloc_context *ac,
35 enum migrate_mode mode, int *contended, 41 enum migrate_mode mode, int *contended);
36 int alloc_flags, int classzone_idx);
37extern void compact_pgdat(pg_data_t *pgdat, int order); 42extern void compact_pgdat(pg_data_t *pgdat, int order);
38extern void reset_isolation_suitable(pg_data_t *pgdat); 43extern void reset_isolation_suitable(pg_data_t *pgdat);
39extern unsigned long compaction_suitable(struct zone *zone, int order, 44extern unsigned long compaction_suitable(struct zone *zone, int order,
40 int alloc_flags, int classzone_idx); 45 int alloc_flags, int classzone_idx);
41 46
42/* Do not skip compaction more than 64 times */ 47extern void defer_compaction(struct zone *zone, int order);
43#define COMPACT_MAX_DEFER_SHIFT 6 48extern bool compaction_deferred(struct zone *zone, int order);
44 49extern void compaction_defer_reset(struct zone *zone, int order,
45/* 50 bool alloc_success);
46 * Compaction is deferred when compaction fails to result in a page 51extern bool compaction_restarting(struct zone *zone, int order);
47 * allocation success. 1 << compact_defer_limit compactions are skipped up
48 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
49 */
50static inline void defer_compaction(struct zone *zone, int order)
51{
52 zone->compact_considered = 0;
53 zone->compact_defer_shift++;
54
55 if (order < zone->compact_order_failed)
56 zone->compact_order_failed = order;
57
58 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
59 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
60}
61
62/* Returns true if compaction should be skipped this time */
63static inline bool compaction_deferred(struct zone *zone, int order)
64{
65 unsigned long defer_limit = 1UL << zone->compact_defer_shift;
66
67 if (order < zone->compact_order_failed)
68 return false;
69
70 /* Avoid possible overflow */
71 if (++zone->compact_considered > defer_limit)
72 zone->compact_considered = defer_limit;
73
74 return zone->compact_considered < defer_limit;
75}
76
77/*
78 * Update defer tracking counters after successful compaction of given order,
79 * which means an allocation either succeeded (alloc_success == true) or is
80 * expected to succeed.
81 */
82static inline void compaction_defer_reset(struct zone *zone, int order,
83 bool alloc_success)
84{
85 if (alloc_success) {
86 zone->compact_considered = 0;
87 zone->compact_defer_shift = 0;
88 }
89 if (order >= zone->compact_order_failed)
90 zone->compact_order_failed = order + 1;
91}
92
93/* Returns true if restarting compaction after many failures */
94static inline bool compaction_restarting(struct zone *zone, int order)
95{
96 if (order < zone->compact_order_failed)
97 return false;
98
99 return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
100 zone->compact_considered >= 1UL << zone->compact_defer_shift;
101}
102 52
103#else 53#else
104static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 54static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
105 int order, gfp_t gfp_mask, nodemask_t *nodemask, 55 unsigned int order, int alloc_flags,
106 enum migrate_mode mode, int *contended, 56 const struct alloc_context *ac,
107 int alloc_flags, int classzone_idx) 57 enum migrate_mode mode, int *contended)
108{ 58{
109 return COMPACT_CONTINUE; 59 return COMPACT_CONTINUE;
110} 60}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b840e3b2770d..51bd1e72a917 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -334,18 +334,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
334} 334}
335extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, 335extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
336 struct vm_area_struct *vma, unsigned long addr, 336 struct vm_area_struct *vma, unsigned long addr,
337 int node); 337 int node, bool hugepage);
338#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
339 alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
338#else 340#else
339#define alloc_pages(gfp_mask, order) \ 341#define alloc_pages(gfp_mask, order) \
340 alloc_pages_node(numa_node_id(), gfp_mask, order) 342 alloc_pages_node(numa_node_id(), gfp_mask, order)
341#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ 343#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
344 alloc_pages(gfp_mask, order)
345#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
342 alloc_pages(gfp_mask, order) 346 alloc_pages(gfp_mask, order)
343#endif 347#endif
344#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 348#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
345#define alloc_page_vma(gfp_mask, vma, addr) \ 349#define alloc_page_vma(gfp_mask, vma, addr) \
346 alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) 350 alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
347#define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 351#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
348 alloc_pages_vma(gfp_mask, 0, vma, addr, node) 352 alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
349 353
350extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); 354extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
351extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, 355extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ad9051bab267..f10b20f05159 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -157,6 +157,13 @@ static inline int hpage_nr_pages(struct page *page)
157extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 157extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
158 unsigned long addr, pmd_t pmd, pmd_t *pmdp); 158 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
159 159
160extern struct page *huge_zero_page;
161
162static inline bool is_huge_zero_page(struct page *page)
163{
164 return ACCESS_ONCE(huge_zero_page) == page;
165}
166
160#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 167#else /* CONFIG_TRANSPARENT_HUGEPAGE */
161#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) 168#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
162#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) 169#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -206,6 +213,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str
206 return 0; 213 return 0;
207} 214}
208 215
216static inline bool is_huge_zero_page(struct page *page)
217{
218 return false;
219}
220
209#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 221#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
210 222
211#endif /* _LINUX_HUGE_MM_H */ 223#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d7856359920..7b5785032049 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
99struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, 99struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
100 int write); 100 int write);
101struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 101struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
102 pmd_t *pmd, int write); 102 pmd_t *pmd, int flags);
103struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, 103struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
104 pud_t *pud, int write); 104 pud_t *pud, int flags);
105int pmd_huge(pmd_t pmd); 105int pmd_huge(pmd_t pmd);
106int pud_huge(pud_t pmd); 106int pud_huge(pud_t pmd);
107unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 107unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
133static inline void hugetlb_show_meminfo(void) 133static inline void hugetlb_show_meminfo(void)
134{ 134{
135} 135}
136#define follow_huge_pmd(mm, addr, pmd, write) NULL 136#define follow_huge_pmd(mm, addr, pmd, flags) NULL
137#define follow_huge_pud(mm, addr, pud, write) NULL 137#define follow_huge_pud(mm, addr, pud, flags) NULL
138#define prepare_hugepage_range(file, addr, len) (-EINVAL) 138#define prepare_hugepage_range(file, addr, len) (-EINVAL)
139#define pmd_huge(x) 0 139#define pmd_huge(x) 0
140#define pud_huge(x) 0 140#define pud_huge(x) 0
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 26f106022c88..d189ee098aa2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -200,17 +200,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
200int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); 200int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
201#endif 201#endif
202 202
203/*
204 * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
205 * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
206 * controls whether we retry the gup one more time to completion in that case.
207 * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
208 * handler.
209 */
210int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
211 unsigned long addr, bool write_fault,
212 struct page **pagep);
213
214enum { 203enum {
215 OUTSIDE_GUEST_MODE, 204 OUTSIDE_GUEST_MODE,
216 IN_GUEST_MODE, 205 IN_GUEST_MODE,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fb212e1d700d..6cfd934c7c9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie {
52 unsigned int generation; 52 unsigned int generation;
53}; 53};
54 54
55enum mem_cgroup_events_index {
56 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
57 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
58 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
59 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
60 MEM_CGROUP_EVENTS_NSTATS,
61 /* default hierarchy events */
62 MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
63 MEMCG_HIGH,
64 MEMCG_MAX,
65 MEMCG_OOM,
66 MEMCG_NR_EVENTS,
67};
68
55#ifdef CONFIG_MEMCG 69#ifdef CONFIG_MEMCG
70void mem_cgroup_events(struct mem_cgroup *memcg,
71 enum mem_cgroup_events_index idx,
72 unsigned int nr);
73
74bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
75
56int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 76int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
57 gfp_t gfp_mask, struct mem_cgroup **memcgp); 77 gfp_t gfp_mask, struct mem_cgroup **memcgp);
58void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 78void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -102,6 +122,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
102 * For memory reclaim. 122 * For memory reclaim.
103 */ 123 */
104int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); 124int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
125bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
105int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); 126int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
106unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); 127unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
107void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); 128void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
@@ -138,12 +159,10 @@ static inline bool mem_cgroup_disabled(void)
138 return false; 159 return false;
139} 160}
140 161
141struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, 162struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
142 unsigned long *flags);
143void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
144 unsigned long *flags);
145void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 163void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
146 enum mem_cgroup_stat_index idx, int val); 164 enum mem_cgroup_stat_index idx, int val);
165void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
147 166
148static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, 167static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
149 enum mem_cgroup_stat_index idx) 168 enum mem_cgroup_stat_index idx)
@@ -176,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head);
176#else /* CONFIG_MEMCG */ 195#else /* CONFIG_MEMCG */
177struct mem_cgroup; 196struct mem_cgroup;
178 197
198static inline void mem_cgroup_events(struct mem_cgroup *memcg,
199 enum mem_cgroup_events_index idx,
200 unsigned int nr)
201{
202}
203
204static inline bool mem_cgroup_low(struct mem_cgroup *root,
205 struct mem_cgroup *memcg)
206{
207 return false;
208}
209
179static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 210static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
180 gfp_t gfp_mask, 211 gfp_t gfp_mask,
181 struct mem_cgroup **memcgp) 212 struct mem_cgroup **memcgp)
@@ -268,6 +299,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
268 return 1; 299 return 1;
269} 300}
270 301
302static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
303{
304 return true;
305}
306
271static inline unsigned long 307static inline unsigned long
272mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 308mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
273{ 309{
@@ -285,14 +321,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
285{ 321{
286} 322}
287 323
288static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 324static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
289 bool *locked, unsigned long *flags)
290{ 325{
291 return NULL; 326 return NULL;
292} 327}
293 328
294static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, 329static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
295 bool *locked, unsigned long *flags)
296{ 330{
297} 331}
298 332
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 65db4aee738a..a4d24f3c5430 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -484,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page)
484 484
485static inline int page_mapcount(struct page *page) 485static inline int page_mapcount(struct page *page)
486{ 486{
487 return atomic_read(&(page)->_mapcount) + 1; 487 VM_BUG_ON_PAGE(PageSlab(page), page);
488 return atomic_read(&page->_mapcount) + 1;
488} 489}
489 490
490static inline int page_count(struct page *page) 491static inline int page_count(struct page *page)
@@ -627,29 +628,28 @@ int split_free_page(struct page *page);
627 * prototype for that function and accessor functions. 628 * prototype for that function and accessor functions.
628 * These are _only_ valid on the head of a PG_compound page. 629 * These are _only_ valid on the head of a PG_compound page.
629 */ 630 */
630typedef void compound_page_dtor(struct page *);
631 631
632static inline void set_compound_page_dtor(struct page *page, 632static inline void set_compound_page_dtor(struct page *page,
633 compound_page_dtor *dtor) 633 compound_page_dtor *dtor)
634{ 634{
635 page[1].lru.next = (void *)dtor; 635 page[1].compound_dtor = dtor;
636} 636}
637 637
638static inline compound_page_dtor *get_compound_page_dtor(struct page *page) 638static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
639{ 639{
640 return (compound_page_dtor *)page[1].lru.next; 640 return page[1].compound_dtor;
641} 641}
642 642
643static inline int compound_order(struct page *page) 643static inline int compound_order(struct page *page)
644{ 644{
645 if (!PageHead(page)) 645 if (!PageHead(page))
646 return 0; 646 return 0;
647 return (unsigned long)page[1].lru.prev; 647 return page[1].compound_order;
648} 648}
649 649
650static inline void set_compound_order(struct page *page, unsigned long order) 650static inline void set_compound_order(struct page *page, unsigned long order)
651{ 651{
652 page[1].lru.prev = (void *)order; 652 page[1].compound_order = order;
653} 653}
654 654
655#ifdef CONFIG_MMU 655#ifdef CONFIG_MMU
@@ -1164,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1164 1164
1165/** 1165/**
1166 * mm_walk - callbacks for walk_page_range 1166 * mm_walk - callbacks for walk_page_range
1167 * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
1168 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
1169 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry 1167 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
1170 * this handler is required to be able to handle 1168 * this handler is required to be able to handle
1171 * pmd_trans_huge() pmds. They may simply choose to 1169 * pmd_trans_huge() pmds. They may simply choose to
@@ -1173,16 +1171,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1173 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry 1171 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
1174 * @pte_hole: if set, called for each hole at all levels 1172 * @pte_hole: if set, called for each hole at all levels
1175 * @hugetlb_entry: if set, called for each hugetlb entry 1173 * @hugetlb_entry: if set, called for each hugetlb entry
1176 * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry 1174 * @test_walk: caller specific callback function to determine whether
1177 * is used. 1175 * we walk over the current vma or not. A positive returned
1176 * value means "do page table walk over the current vma,"
1177 * and a negative one means "abort current page table walk
1178 * right now." 0 means "skip the current vma."
1179 * @mm: mm_struct representing the target process of page table walk
1180 * @vma: vma currently walked (NULL if walking outside vmas)
1181 * @private: private data for callbacks' usage
1178 * 1182 *
1179 * (see walk_page_range for more details) 1183 * (see the comment on walk_page_range() for more details)
1180 */ 1184 */
1181struct mm_walk { 1185struct mm_walk {
1182 int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
1183 unsigned long next, struct mm_walk *walk);
1184 int (*pud_entry)(pud_t *pud, unsigned long addr,
1185 unsigned long next, struct mm_walk *walk);
1186 int (*pmd_entry)(pmd_t *pmd, unsigned long addr, 1186 int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
1187 unsigned long next, struct mm_walk *walk); 1187 unsigned long next, struct mm_walk *walk);
1188 int (*pte_entry)(pte_t *pte, unsigned long addr, 1188 int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1192,12 +1192,16 @@ struct mm_walk {
1192 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, 1192 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
1193 unsigned long addr, unsigned long next, 1193 unsigned long addr, unsigned long next,
1194 struct mm_walk *walk); 1194 struct mm_walk *walk);
1195 int (*test_walk)(unsigned long addr, unsigned long next,
1196 struct mm_walk *walk);
1195 struct mm_struct *mm; 1197 struct mm_struct *mm;
1198 struct vm_area_struct *vma;
1196 void *private; 1199 void *private;
1197}; 1200};
1198 1201
1199int walk_page_range(unsigned long addr, unsigned long end, 1202int walk_page_range(unsigned long addr, unsigned long end,
1200 struct mm_walk *walk); 1203 struct mm_walk *walk);
1204int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
1201void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 1205void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
1202 unsigned long end, unsigned long floor, unsigned long ceiling); 1206 unsigned long end, unsigned long floor, unsigned long ceiling);
1203int copy_page_range(struct mm_struct *dst, struct mm_struct *src, 1207int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -1261,6 +1265,17 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1261 unsigned long start, unsigned long nr_pages, 1265 unsigned long start, unsigned long nr_pages,
1262 int write, int force, struct page **pages, 1266 int write, int force, struct page **pages,
1263 struct vm_area_struct **vmas); 1267 struct vm_area_struct **vmas);
1268long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
1269 unsigned long start, unsigned long nr_pages,
1270 int write, int force, struct page **pages,
1271 int *locked);
1272long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
1273 unsigned long start, unsigned long nr_pages,
1274 int write, int force, struct page **pages,
1275 unsigned int gup_flags);
1276long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
1277 unsigned long start, unsigned long nr_pages,
1278 int write, int force, struct page **pages);
1264int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1279int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1265 struct page **pages); 1280 struct page **pages);
1266struct kvec; 1281struct kvec;
@@ -1438,8 +1453,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1438{ 1453{
1439 return 0; 1454 return 0;
1440} 1455}
1456
1457static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1458{
1459 return 0;
1460}
1461
1462static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
1463static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
1464
1441#else 1465#else
1442int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1466int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1467
1468static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1469{
1470 return atomic_long_read(&mm->nr_pmds);
1471}
1472
1473static inline void mm_inc_nr_pmds(struct mm_struct *mm)
1474{
1475 atomic_long_inc(&mm->nr_pmds);
1476}
1477
1478static inline void mm_dec_nr_pmds(struct mm_struct *mm)
1479{
1480 atomic_long_dec(&mm->nr_pmds);
1481}
1443#endif 1482#endif
1444 1483
1445int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 1484int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 07c8bd3f7b48..199a03aab8dc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -28,6 +28,8 @@ struct mem_cgroup;
28 IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) 28 IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
29#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) 29#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
30 30
31typedef void compound_page_dtor(struct page *);
32
31/* 33/*
32 * Each physical page in the system has a struct page associated with 34 * Each physical page in the system has a struct page associated with
33 * it to keep track of whatever it is we are using the page for at the 35 * it to keep track of whatever it is we are using the page for at the
@@ -142,6 +144,12 @@ struct page {
142 struct rcu_head rcu_head; /* Used by SLAB 144 struct rcu_head rcu_head; /* Used by SLAB
143 * when destroying via RCU 145 * when destroying via RCU
144 */ 146 */
147 /* First tail page of compound page */
148 struct {
149 compound_page_dtor *compound_dtor;
150 unsigned long compound_order;
151 };
152
145#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS 153#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
146 pgtable_t pmd_huge_pte; /* protected by page->ptl */ 154 pgtable_t pmd_huge_pte; /* protected by page->ptl */
147#endif 155#endif
@@ -355,7 +363,8 @@ struct mm_struct {
355 pgd_t * pgd; 363 pgd_t * pgd;
356 atomic_t mm_users; /* How many users with user space? */ 364 atomic_t mm_users; /* How many users with user space? */
357 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ 365 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
358 atomic_long_t nr_ptes; /* Page table pages */ 366 atomic_long_t nr_ptes; /* PTE page table pages */
367 atomic_long_t nr_pmds; /* PMD page table pages */
359 int map_count; /* number of VMAs */ 368 int map_count; /* number of VMAs */
360 369
361 spinlock_t page_table_lock; /* Protects page tables and some counters */ 370 spinlock_t page_table_lock; /* Protects page tables and some counters */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f0856d14b21..f279d9c158cd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -426,7 +426,7 @@ struct zone {
426 const char *name; 426 const char *name;
427 427
428 /* 428 /*
429 * Number of MIGRATE_RESEVE page block. To maintain for just 429 * Number of MIGRATE_RESERVE page block. To maintain for just
430 * optimization. Protected by zone->lock. 430 * optimization. Protected by zone->lock.
431 */ 431 */
432 int nr_migrate_reserve_block; 432 int nr_migrate_reserve_block;
@@ -970,7 +970,6 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
970 * @z - The cursor used as a starting point for the search 970 * @z - The cursor used as a starting point for the search
971 * @highest_zoneidx - The zone index of the highest zone to return 971 * @highest_zoneidx - The zone index of the highest zone to return
972 * @nodes - An optional nodemask to filter the zonelist with 972 * @nodes - An optional nodemask to filter the zonelist with
973 * @zone - The first suitable zone found is returned via this parameter
974 * 973 *
975 * This function returns the next zone at or below a given zone index that is 974 * This function returns the next zone at or below a given zone index that is
976 * within the allowed nodemask using a cursor as the starting point for the 975 * within the allowed nodemask using a cursor as the starting point for the
@@ -980,8 +979,7 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
980 */ 979 */
981struct zoneref *next_zones_zonelist(struct zoneref *z, 980struct zoneref *next_zones_zonelist(struct zoneref *z,
982 enum zone_type highest_zoneidx, 981 enum zone_type highest_zoneidx,
983 nodemask_t *nodes, 982 nodemask_t *nodes);
984 struct zone **zone);
985 983
986/** 984/**
987 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist 985 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
@@ -1000,8 +998,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
1000 nodemask_t *nodes, 998 nodemask_t *nodes,
1001 struct zone **zone) 999 struct zone **zone)
1002{ 1000{
1003 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, 1001 struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
1004 zone); 1002 highest_zoneidx, nodes);
1003 *zone = zonelist_zone(z);
1004 return z;
1005} 1005}
1006 1006
1007/** 1007/**
@@ -1018,7 +1018,8 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
1018#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ 1018#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1019 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ 1019 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
1020 zone; \ 1020 zone; \
1021 z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ 1021 z = next_zones_zonelist(++z, highidx, nodemask), \
1022 zone = zonelist_zone(z)) \
1022 1023
1023/** 1024/**
1024 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index 1025 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 76200984d1e2..d5771bed59c9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p)
47 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); 47 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
48} 48}
49 49
50extern void mark_tsk_oom_victim(struct task_struct *tsk);
51
52extern void unmark_oom_victim(void);
53
50extern unsigned long oom_badness(struct task_struct *p, 54extern unsigned long oom_badness(struct task_struct *p,
51 struct mem_cgroup *memcg, const nodemask_t *nodemask, 55 struct mem_cgroup *memcg, const nodemask_t *nodemask,
52 unsigned long totalpages); 56 unsigned long totalpages);
@@ -68,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
68 unsigned long totalpages, const nodemask_t *nodemask, 72 unsigned long totalpages, const nodemask_t *nodemask,
69 bool force_kill); 73 bool force_kill);
70 74
71extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 75extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
72 int order, nodemask_t *mask, bool force_kill); 76 int order, nodemask_t *mask, bool force_kill);
73extern int register_oom_notifier(struct notifier_block *nb); 77extern int register_oom_notifier(struct notifier_block *nb);
74extern int unregister_oom_notifier(struct notifier_block *nb); 78extern int unregister_oom_notifier(struct notifier_block *nb);
75 79
76extern bool oom_killer_disabled; 80extern bool oom_killer_disabled;
77 81extern bool oom_killer_disable(void);
78static inline void oom_killer_disable(void) 82extern void oom_killer_enable(void);
79{
80 oom_killer_disabled = true;
81}
82
83static inline void oom_killer_enable(void)
84{
85 oom_killer_disabled = false;
86}
87 83
88extern struct task_struct *find_lock_task_mm(struct task_struct *p); 84extern struct task_struct *find_lock_task_mm(struct task_struct *p);
89 85
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 955421575d16..17fa4f8de3a6 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -41,7 +41,8 @@ int page_counter_try_charge(struct page_counter *counter,
41 struct page_counter **fail); 41 struct page_counter **fail);
42void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); 42void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
43int page_counter_limit(struct page_counter *counter, unsigned long limit); 43int page_counter_limit(struct page_counter *counter, unsigned long limit);
44int page_counter_memparse(const char *buf, unsigned long *nr_pages); 44int page_counter_memparse(const char *buf, const char *max,
45 unsigned long *nr_pages);
45 46
46static inline void page_counter_reset_watermark(struct page_counter *counter) 47static inline void page_counter_reset_watermark(struct page_counter *counter)
47{ 48{
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index d2a2c84c72d0..c42981cd99aa 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -40,7 +40,7 @@ struct page_ext {
40#ifdef CONFIG_PAGE_OWNER 40#ifdef CONFIG_PAGE_OWNER
41 unsigned int order; 41 unsigned int order;
42 gfp_t gfp_mask; 42 gfp_t gfp_mask;
43 struct stack_trace trace; 43 unsigned int nr_entries;
44 unsigned long trace_entries[8]; 44 unsigned long trace_entries[8];
45#endif 45#endif
46}; 46};
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 34e8b60ab973..7067eca501e2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -437,16 +437,6 @@ extern int reuse_swap_page(struct page *);
437extern int try_to_free_swap(struct page *); 437extern int try_to_free_swap(struct page *);
438struct backing_dev_info; 438struct backing_dev_info;
439 439
440#ifdef CONFIG_MEMCG
441extern void
442mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
443#else
444static inline void
445mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
446{
447}
448#endif
449
450#else /* CONFIG_SWAP */ 440#else /* CONFIG_SWAP */
451 441
452#define swap_address_space(entry) (NULL) 442#define swap_address_space(entry) (NULL)
@@ -547,11 +537,6 @@ static inline swp_entry_t get_swap_page(void)
547 return entry; 537 return entry;
548} 538}
549 539
550static inline void
551mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
552{
553}
554
555#endif /* CONFIG_SWAP */ 540#endif /* CONFIG_SWAP */
556#endif /* __KERNEL__*/ 541#endif /* __KERNEL__*/
557#endif /* _LINUX_SWAP_H */ 542#endif /* _LINUX_SWAP_H */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 50cbc876be56..831a3168ab35 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
135 *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); 135 *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
136} 136}
137 137
138extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
139 spinlock_t *ptl);
138extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 140extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
139 unsigned long address); 141 unsigned long address);
140extern void migration_entry_wait_huge(struct vm_area_struct *vma, 142extern void migration_entry_wait_huge(struct vm_area_struct *vma,
@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp)
148} 150}
149#define migration_entry_to_page(swp) NULL 151#define migration_entry_to_page(swp) NULL
150static inline void make_migration_entry_read(swp_entry_t *entryp) { } 152static inline void make_migration_entry_read(swp_entry_t *entryp) { }
153static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
154 spinlock_t *ptl) { }
151static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 155static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
152 unsigned long address) { } 156 unsigned long address) { }
153static inline void migration_entry_wait_huge(struct vm_area_struct *vma, 157static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index c6814b917bdf..9a6a3fe0fb51 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -11,39 +11,55 @@
11 11
12DECLARE_EVENT_CLASS(mm_compaction_isolate_template, 12DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
13 13
14 TP_PROTO(unsigned long nr_scanned, 14 TP_PROTO(
15 unsigned long start_pfn,
16 unsigned long end_pfn,
17 unsigned long nr_scanned,
15 unsigned long nr_taken), 18 unsigned long nr_taken),
16 19
17 TP_ARGS(nr_scanned, nr_taken), 20 TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken),
18 21
19 TP_STRUCT__entry( 22 TP_STRUCT__entry(
23 __field(unsigned long, start_pfn)
24 __field(unsigned long, end_pfn)
20 __field(unsigned long, nr_scanned) 25 __field(unsigned long, nr_scanned)
21 __field(unsigned long, nr_taken) 26 __field(unsigned long, nr_taken)
22 ), 27 ),
23 28
24 TP_fast_assign( 29 TP_fast_assign(
30 __entry->start_pfn = start_pfn;
31 __entry->end_pfn = end_pfn;
25 __entry->nr_scanned = nr_scanned; 32 __entry->nr_scanned = nr_scanned;
26 __entry->nr_taken = nr_taken; 33 __entry->nr_taken = nr_taken;
27 ), 34 ),
28 35
29 TP_printk("nr_scanned=%lu nr_taken=%lu", 36 TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu",
37 __entry->start_pfn,
38 __entry->end_pfn,
30 __entry->nr_scanned, 39 __entry->nr_scanned,
31 __entry->nr_taken) 40 __entry->nr_taken)
32); 41);
33 42
34DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, 43DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
35 44
36 TP_PROTO(unsigned long nr_scanned, 45 TP_PROTO(
46 unsigned long start_pfn,
47 unsigned long end_pfn,
48 unsigned long nr_scanned,
37 unsigned long nr_taken), 49 unsigned long nr_taken),
38 50
39 TP_ARGS(nr_scanned, nr_taken) 51 TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
40); 52);
41 53
42DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, 54DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
43 TP_PROTO(unsigned long nr_scanned, 55
56 TP_PROTO(
57 unsigned long start_pfn,
58 unsigned long end_pfn,
59 unsigned long nr_scanned,
44 unsigned long nr_taken), 60 unsigned long nr_taken),
45 61
46 TP_ARGS(nr_scanned, nr_taken) 62 TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
47); 63);
48 64
49TRACE_EVENT(mm_compaction_migratepages, 65TRACE_EVENT(mm_compaction_migratepages,
@@ -85,47 +101,198 @@ TRACE_EVENT(mm_compaction_migratepages,
85); 101);
86 102
87TRACE_EVENT(mm_compaction_begin, 103TRACE_EVENT(mm_compaction_begin,
88 TP_PROTO(unsigned long zone_start, unsigned long migrate_start, 104 TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
89 unsigned long free_start, unsigned long zone_end), 105 unsigned long free_pfn, unsigned long zone_end, bool sync),
90 106
91 TP_ARGS(zone_start, migrate_start, free_start, zone_end), 107 TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync),
92 108
93 TP_STRUCT__entry( 109 TP_STRUCT__entry(
94 __field(unsigned long, zone_start) 110 __field(unsigned long, zone_start)
95 __field(unsigned long, migrate_start) 111 __field(unsigned long, migrate_pfn)
96 __field(unsigned long, free_start) 112 __field(unsigned long, free_pfn)
97 __field(unsigned long, zone_end) 113 __field(unsigned long, zone_end)
114 __field(bool, sync)
98 ), 115 ),
99 116
100 TP_fast_assign( 117 TP_fast_assign(
101 __entry->zone_start = zone_start; 118 __entry->zone_start = zone_start;
102 __entry->migrate_start = migrate_start; 119 __entry->migrate_pfn = migrate_pfn;
103 __entry->free_start = free_start; 120 __entry->free_pfn = free_pfn;
104 __entry->zone_end = zone_end; 121 __entry->zone_end = zone_end;
122 __entry->sync = sync;
105 ), 123 ),
106 124
107 TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", 125 TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s",
108 __entry->zone_start, 126 __entry->zone_start,
109 __entry->migrate_start, 127 __entry->migrate_pfn,
110 __entry->free_start, 128 __entry->free_pfn,
111 __entry->zone_end) 129 __entry->zone_end,
130 __entry->sync ? "sync" : "async")
112); 131);
113 132
114TRACE_EVENT(mm_compaction_end, 133TRACE_EVENT(mm_compaction_end,
115 TP_PROTO(int status), 134 TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
135 unsigned long free_pfn, unsigned long zone_end, bool sync,
136 int status),
116 137
117 TP_ARGS(status), 138 TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status),
118 139
119 TP_STRUCT__entry( 140 TP_STRUCT__entry(
141 __field(unsigned long, zone_start)
142 __field(unsigned long, migrate_pfn)
143 __field(unsigned long, free_pfn)
144 __field(unsigned long, zone_end)
145 __field(bool, sync)
120 __field(int, status) 146 __field(int, status)
121 ), 147 ),
122 148
123 TP_fast_assign( 149 TP_fast_assign(
150 __entry->zone_start = zone_start;
151 __entry->migrate_pfn = migrate_pfn;
152 __entry->free_pfn = free_pfn;
153 __entry->zone_end = zone_end;
154 __entry->sync = sync;
124 __entry->status = status; 155 __entry->status = status;
125 ), 156 ),
126 157
127 TP_printk("status=%d", __entry->status) 158 TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s",
159 __entry->zone_start,
160 __entry->migrate_pfn,
161 __entry->free_pfn,
162 __entry->zone_end,
163 __entry->sync ? "sync" : "async",
164 compaction_status_string[__entry->status])
165);
166
167TRACE_EVENT(mm_compaction_try_to_compact_pages,
168
169 TP_PROTO(
170 int order,
171 gfp_t gfp_mask,
172 enum migrate_mode mode),
173
174 TP_ARGS(order, gfp_mask, mode),
175
176 TP_STRUCT__entry(
177 __field(int, order)
178 __field(gfp_t, gfp_mask)
179 __field(enum migrate_mode, mode)
180 ),
181
182 TP_fast_assign(
183 __entry->order = order;
184 __entry->gfp_mask = gfp_mask;
185 __entry->mode = mode;
186 ),
187
188 TP_printk("order=%d gfp_mask=0x%x mode=%d",
189 __entry->order,
190 __entry->gfp_mask,
191 (int)__entry->mode)
192);
193
194DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
195
196 TP_PROTO(struct zone *zone,
197 int order,
198 int ret),
199
200 TP_ARGS(zone, order, ret),
201
202 TP_STRUCT__entry(
203 __field(int, nid)
204 __field(char *, name)
205 __field(int, order)
206 __field(int, ret)
207 ),
208
209 TP_fast_assign(
210 __entry->nid = zone_to_nid(zone);
211 __entry->name = (char *)zone->name;
212 __entry->order = order;
213 __entry->ret = ret;
214 ),
215
216 TP_printk("node=%d zone=%-8s order=%d ret=%s",
217 __entry->nid,
218 __entry->name,
219 __entry->order,
220 compaction_status_string[__entry->ret])
221);
222
223DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
224
225 TP_PROTO(struct zone *zone,
226 int order,
227 int ret),
228
229 TP_ARGS(zone, order, ret)
230);
231
232DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable,
233
234 TP_PROTO(struct zone *zone,
235 int order,
236 int ret),
237
238 TP_ARGS(zone, order, ret)
239);
240
241#ifdef CONFIG_COMPACTION
242DECLARE_EVENT_CLASS(mm_compaction_defer_template,
243
244 TP_PROTO(struct zone *zone, int order),
245
246 TP_ARGS(zone, order),
247
248 TP_STRUCT__entry(
249 __field(int, nid)
250 __field(char *, name)
251 __field(int, order)
252 __field(unsigned int, considered)
253 __field(unsigned int, defer_shift)
254 __field(int, order_failed)
255 ),
256
257 TP_fast_assign(
258 __entry->nid = zone_to_nid(zone);
259 __entry->name = (char *)zone->name;
260 __entry->order = order;
261 __entry->considered = zone->compact_considered;
262 __entry->defer_shift = zone->compact_defer_shift;
263 __entry->order_failed = zone->compact_order_failed;
264 ),
265
266 TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
267 __entry->nid,
268 __entry->name,
269 __entry->order,
270 __entry->order_failed,
271 __entry->considered,
272 1UL << __entry->defer_shift)
273);
274
275DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred,
276
277 TP_PROTO(struct zone *zone, int order),
278
279 TP_ARGS(zone, order)
280);
281
282DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction,
283
284 TP_PROTO(struct zone *zone, int order),
285
286 TP_ARGS(zone, order)
287);
288
289DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
290
291 TP_PROTO(struct zone *zone, int order),
292
293 TP_ARGS(zone, order)
128); 294);
295#endif
129 296
130#endif /* _TRACE_COMPACTION_H */ 297#endif /* _TRACE_COMPACTION_H */
131 298
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index aece1346ceb7..4ad10baecd4d 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -268,11 +268,11 @@ TRACE_EVENT(mm_page_alloc_extfrag,
268 268
269 TP_PROTO(struct page *page, 269 TP_PROTO(struct page *page,
270 int alloc_order, int fallback_order, 270 int alloc_order, int fallback_order,
271 int alloc_migratetype, int fallback_migratetype, int new_migratetype), 271 int alloc_migratetype, int fallback_migratetype),
272 272
273 TP_ARGS(page, 273 TP_ARGS(page,
274 alloc_order, fallback_order, 274 alloc_order, fallback_order,
275 alloc_migratetype, fallback_migratetype, new_migratetype), 275 alloc_migratetype, fallback_migratetype),
276 276
277 TP_STRUCT__entry( 277 TP_STRUCT__entry(
278 __field( struct page *, page ) 278 __field( struct page *, page )
@@ -289,7 +289,8 @@ TRACE_EVENT(mm_page_alloc_extfrag,
289 __entry->fallback_order = fallback_order; 289 __entry->fallback_order = fallback_order;
290 __entry->alloc_migratetype = alloc_migratetype; 290 __entry->alloc_migratetype = alloc_migratetype;
291 __entry->fallback_migratetype = fallback_migratetype; 291 __entry->fallback_migratetype = fallback_migratetype;
292 __entry->change_ownership = (new_migratetype == alloc_migratetype); 292 __entry->change_ownership = (alloc_migratetype ==
293 get_pageblock_migratetype(page));
293 ), 294 ),
294 295
295 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", 296 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 2f96d233c980..a6c4962e5d46 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -32,6 +32,7 @@
32#define KPF_KSM 21 32#define KPF_KSM 21
33#define KPF_THP 22 33#define KPF_THP 22
34#define KPF_BALLOON 23 34#define KPF_BALLOON 23
35#define KPF_ZERO_PAGE 24
35 36
36 37
37#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ 38#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55475ee..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
435 task_unlock(tsk); 435 task_unlock(tsk);
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 clear_thread_flag(TIF_MEMDIE); 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim();
439} 440}
440 441
441static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index b379d9abddc7..66e19c251581 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
555 INIT_LIST_HEAD(&mm->mmlist); 555 INIT_LIST_HEAD(&mm->mmlist);
556 mm->core_state = NULL; 556 mm->core_state = NULL;
557 atomic_long_set(&mm->nr_ptes, 0); 557 atomic_long_set(&mm->nr_ptes, 0);
558#ifndef __PAGETABLE_PMD_FOLDED
559 atomic_long_set(&mm->nr_pmds, 0);
560#endif
558 mm->map_count = 0; 561 mm->map_count = 0;
559 mm->locked_vm = 0; 562 mm->locked_vm = 0;
560 mm->pinned_vm = 0; 563 mm->pinned_vm = 0;
@@ -603,6 +606,14 @@ static void check_mm(struct mm_struct *mm)
603 printk(KERN_ALERT "BUG: Bad rss-counter state " 606 printk(KERN_ALERT "BUG: Bad rss-counter state "
604 "mm:%p idx:%d val:%ld\n", mm, i, x); 607 "mm:%p idx:%d val:%ld\n", mm, i, x);
605 } 608 }
609
610 if (atomic_long_read(&mm->nr_ptes))
611 pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
612 atomic_long_read(&mm->nr_ptes));
613 if (mm_nr_pmds(mm))
614 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
615 mm_nr_pmds(mm));
616
606#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 617#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
607 VM_BUG_ON_MM(mm->pmd_huge_pte, mm); 618 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
608#endif 619#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
84 elapsed_msecs = elapsed_msecs64; 84 elapsed_msecs = elapsed_msecs64;
85 85
86 if (todo) { 86 if (todo) {
87 printk("\n"); 87 pr_cont("\n");
88 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " 88 pr_err("Freezing of tasks %s after %d.%03d seconds "
89 "(%d tasks refusing to freeze, wq_busy=%d):\n", 89 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 wakeup ? "aborted" : "failed", 90 wakeup ? "aborted" : "failed",
91 elapsed_msecs / 1000, elapsed_msecs % 1000, 91 elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
104 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, 104 pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
105 elapsed_msecs % 1000); 105 elapsed_msecs % 1000);
106 } 106 }
107 107
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
135/** 111/**
136 * freeze_processes - Signal user space processes to enter the refrigerator. 112 * freeze_processes - Signal user space processes to enter the refrigerator.
137 * The current thread will not be frozen. The same process that calls 113 * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
142int freeze_processes(void) 118int freeze_processes(void)
143{ 119{
144 int error; 120 int error;
145 int oom_kills_saved;
146 121
147 error = __usermodehelper_disable(UMH_FREEZING); 122 error = __usermodehelper_disable(UMH_FREEZING);
148 if (error) 123 if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
155 atomic_inc(&system_freezing_cnt); 130 atomic_inc(&system_freezing_cnt);
156 131
157 pm_wakeup_clear(); 132 pm_wakeup_clear();
158 printk("Freezing user space processes ... "); 133 pr_info("Freezing user space processes ... ");
159 pm_freezing = true; 134 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
161 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
162 if (!error) { 136 if (!error) {
163 __usermodehelper_set_disable_depth(UMH_DISABLED); 137 __usermodehelper_set_disable_depth(UMH_DISABLED);
164 oom_killer_disable(); 138 pr_cont("done.");
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
179 } 139 }
180 printk("\n"); 140 pr_cont("\n");
181 BUG_ON(in_atomic()); 141 BUG_ON(in_atomic());
182 142
143 /*
144 * Now that the whole userspace is frozen we need to disable
145 * the OOM killer to disallow any further interference with
146 * killable tasks.
147 */
148 if (!error && !oom_killer_disable())
149 error = -EBUSY;
150
183 if (error) 151 if (error)
184 thaw_processes(); 152 thaw_processes();
185 return error; 153 return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
197{ 165{
198 int error; 166 int error;
199 167
200 printk("Freezing remaining freezable tasks ... "); 168 pr_info("Freezing remaining freezable tasks ... ");
169
201 pm_nosig_freezing = true; 170 pm_nosig_freezing = true;
202 error = try_to_freeze_tasks(false); 171 error = try_to_freeze_tasks(false);
203 if (!error) 172 if (!error)
204 printk("done."); 173 pr_cont("done.");
205 174
206 printk("\n"); 175 pr_cont("\n");
207 BUG_ON(in_atomic()); 176 BUG_ON(in_atomic());
208 177
209 if (error) 178 if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
224 193
225 oom_killer_enable(); 194 oom_killer_enable();
226 195
227 printk("Restarting tasks ... "); 196 pr_info("Restarting tasks ... ");
228 197
229 __usermodehelper_set_disable_depth(UMH_FREEZING); 198 __usermodehelper_set_disable_depth(UMH_FREEZING);
230 thaw_workqueues(); 199 thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
243 usermodehelper_enable(); 212 usermodehelper_enable();
244 213
245 schedule(); 214 schedule();
246 printk("done.\n"); 215 pr_cont("done.\n");
247 trace_suspend_resume(TPS("thaw_processes"), 0, false); 216 trace_suspend_resume(TPS("thaw_processes"), 0, false);
248} 217}
249 218
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
252 struct task_struct *g, *p; 221 struct task_struct *g, *p;
253 222
254 pm_nosig_freezing = false; 223 pm_nosig_freezing = false;
255 printk("Restarting kernel threads ... "); 224 pr_info("Restarting kernel threads ... ");
256 225
257 thaw_workqueues(); 226 thaw_workqueues();
258 227
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
264 read_unlock(&tasklist_lock); 233 read_unlock(&tasklist_lock);
265 234
266 schedule(); 235 schedule();
267 printk("done.\n"); 236 pr_cont("done.\n");
268} 237}
diff --git a/mm/cma.c b/mm/cma.c
index a85ae28709a3..75016fd1de90 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
199 cma->order_per_bit = order_per_bit; 199 cma->order_per_bit = order_per_bit;
200 *res_cma = cma; 200 *res_cma = cma;
201 cma_area_count++; 201 cma_area_count++;
202 totalcma_pages += (size / PAGE_SIZE);
202 203
203 return 0; 204 return 0;
204} 205}
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base,
337 if (ret) 338 if (ret)
338 goto err; 339 goto err;
339 340
340 totalcma_pages += (size / PAGE_SIZE);
341 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, 341 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
342 &base); 342 &base);
343 return 0; 343 return 0;
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..b68736c8a1ce 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -34,6 +34,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
34#endif 34#endif
35 35
36#if defined CONFIG_COMPACTION || defined CONFIG_CMA 36#if defined CONFIG_COMPACTION || defined CONFIG_CMA
37#ifdef CONFIG_TRACEPOINTS
38static const char *const compaction_status_string[] = {
39 "deferred",
40 "skipped",
41 "continue",
42 "partial",
43 "complete",
44 "no_suitable_page",
45 "not_suitable_zone",
46};
47#endif
37 48
38#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
39#include <trace/events/compaction.h> 50#include <trace/events/compaction.h>
@@ -113,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
113} 124}
114 125
115#ifdef CONFIG_COMPACTION 126#ifdef CONFIG_COMPACTION
127
128/* Do not skip compaction more than 64 times */
129#define COMPACT_MAX_DEFER_SHIFT 6
130
131/*
132 * Compaction is deferred when compaction fails to result in a page
133 * allocation success. 1 << compact_defer_shift compactions are skipped up
134 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
135 */
136void defer_compaction(struct zone *zone, int order)
137{
138 zone->compact_considered = 0;
139 zone->compact_defer_shift++;
140
141 if (order < zone->compact_order_failed)
142 zone->compact_order_failed = order;
143
144 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
145 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
146
147 trace_mm_compaction_defer_compaction(zone, order);
148}
149
150/* Returns true if compaction should be skipped this time */
151bool compaction_deferred(struct zone *zone, int order)
152{
153 unsigned long defer_limit = 1UL << zone->compact_defer_shift;
154
155 if (order < zone->compact_order_failed)
156 return false;
157
158 /* Avoid possible overflow */
159 if (++zone->compact_considered > defer_limit)
160 zone->compact_considered = defer_limit;
161
162 if (zone->compact_considered >= defer_limit)
163 return false;
164
165 trace_mm_compaction_deferred(zone, order);
166
167 return true;
168}
169
170/*
171 * Update defer tracking counters after successful compaction of given order,
172 * which means an allocation either succeeded (alloc_success == true) or is
173 * expected to succeed.
174 */
175void compaction_defer_reset(struct zone *zone, int order,
176 bool alloc_success)
177{
178 if (alloc_success) {
179 zone->compact_considered = 0;
180 zone->compact_defer_shift = 0;
181 }
182 if (order >= zone->compact_order_failed)
183 zone->compact_order_failed = order + 1;
184
185 trace_mm_compaction_defer_reset(zone, order);
186}
187
188/* Returns true if restarting compaction after many failures */
189bool compaction_restarting(struct zone *zone, int order)
190{
191 if (order < zone->compact_order_failed)
192 return false;
193
194 return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
195 zone->compact_considered >= 1UL << zone->compact_defer_shift;
196}
197
116/* Returns true if the pageblock should be scanned for pages to isolate. */ 198/* Returns true if the pageblock should be scanned for pages to isolate. */
117static inline bool isolation_suitable(struct compact_control *cc, 199static inline bool isolation_suitable(struct compact_control *cc,
118 struct page *page) 200 struct page *page)
@@ -421,11 +503,12 @@ isolate_fail:
421 503
422 } 504 }
423 505
506 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
507 nr_scanned, total_isolated);
508
424 /* Record how far we have got within the block */ 509 /* Record how far we have got within the block */
425 *start_pfn = blockpfn; 510 *start_pfn = blockpfn;
426 511
427 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
428
429 /* 512 /*
430 * If strict isolation is requested by CMA then check that all the 513 * If strict isolation is requested by CMA then check that all the
431 * pages requested were isolated. If there were any failures, 0 is 514 * pages requested were isolated. If there were any failures, 0 is
@@ -581,6 +664,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
581 unsigned long flags = 0; 664 unsigned long flags = 0;
582 bool locked = false; 665 bool locked = false;
583 struct page *page = NULL, *valid_page = NULL; 666 struct page *page = NULL, *valid_page = NULL;
667 unsigned long start_pfn = low_pfn;
584 668
585 /* 669 /*
586 * Ensure that there are not too many pages isolated from the LRU 670 * Ensure that there are not too many pages isolated from the LRU
@@ -741,7 +825,8 @@ isolate_success:
741 if (low_pfn == end_pfn) 825 if (low_pfn == end_pfn)
742 update_pageblock_skip(cc, valid_page, nr_isolated, true); 826 update_pageblock_skip(cc, valid_page, nr_isolated, true);
743 827
744 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 828 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
829 nr_scanned, nr_isolated);
745 830
746 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); 831 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
747 if (nr_isolated) 832 if (nr_isolated)
@@ -1037,7 +1122,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1037 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1122 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1038} 1123}
1039 1124
1040static int compact_finished(struct zone *zone, struct compact_control *cc, 1125static int __compact_finished(struct zone *zone, struct compact_control *cc,
1041 const int migratetype) 1126 const int migratetype)
1042{ 1127{
1043 unsigned int order; 1128 unsigned int order;
@@ -1092,7 +1177,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1092 return COMPACT_PARTIAL; 1177 return COMPACT_PARTIAL;
1093 } 1178 }
1094 1179
1095 return COMPACT_CONTINUE; 1180 return COMPACT_NO_SUITABLE_PAGE;
1181}
1182
1183static int compact_finished(struct zone *zone, struct compact_control *cc,
1184 const int migratetype)
1185{
1186 int ret;
1187
1188 ret = __compact_finished(zone, cc, migratetype);
1189 trace_mm_compaction_finished(zone, cc->order, ret);
1190 if (ret == COMPACT_NO_SUITABLE_PAGE)
1191 ret = COMPACT_CONTINUE;
1192
1193 return ret;
1096} 1194}
1097 1195
1098/* 1196/*
@@ -1102,7 +1200,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1102 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1200 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1103 * COMPACT_CONTINUE - If compaction should run now 1201 * COMPACT_CONTINUE - If compaction should run now
1104 */ 1202 */
1105unsigned long compaction_suitable(struct zone *zone, int order, 1203static unsigned long __compaction_suitable(struct zone *zone, int order,
1106 int alloc_flags, int classzone_idx) 1204 int alloc_flags, int classzone_idx)
1107{ 1205{
1108 int fragindex; 1206 int fragindex;
@@ -1146,11 +1244,24 @@ unsigned long compaction_suitable(struct zone *zone, int order,
1146 */ 1244 */
1147 fragindex = fragmentation_index(zone, order); 1245 fragindex = fragmentation_index(zone, order);
1148 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1246 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1149 return COMPACT_SKIPPED; 1247 return COMPACT_NOT_SUITABLE_ZONE;
1150 1248
1151 return COMPACT_CONTINUE; 1249 return COMPACT_CONTINUE;
1152} 1250}
1153 1251
1252unsigned long compaction_suitable(struct zone *zone, int order,
1253 int alloc_flags, int classzone_idx)
1254{
1255 unsigned long ret;
1256
1257 ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
1258 trace_mm_compaction_suitable(zone, order, ret);
1259 if (ret == COMPACT_NOT_SUITABLE_ZONE)
1260 ret = COMPACT_SKIPPED;
1261
1262 return ret;
1263}
1264
1154static int compact_zone(struct zone *zone, struct compact_control *cc) 1265static int compact_zone(struct zone *zone, struct compact_control *cc)
1155{ 1266{
1156 int ret; 1267 int ret;
@@ -1197,7 +1308,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1197 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1308 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1198 } 1309 }
1199 1310
1200 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); 1311 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1312 cc->free_pfn, end_pfn, sync);
1201 1313
1202 migrate_prep_local(); 1314 migrate_prep_local();
1203 1315
@@ -1299,7 +1411,8 @@ out:
1299 zone->compact_cached_free_pfn = free_pfn; 1411 zone->compact_cached_free_pfn = free_pfn;
1300 } 1412 }
1301 1413
1302 trace_mm_compaction_end(ret); 1414 trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1415 cc->free_pfn, end_pfn, sync, ret);
1303 1416
1304 return ret; 1417 return ret;
1305} 1418}
@@ -1335,22 +1448,20 @@ int sysctl_extfrag_threshold = 500;
1335 1448
1336/** 1449/**
1337 * try_to_compact_pages - Direct compact to satisfy a high-order allocation 1450 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1338 * @zonelist: The zonelist used for the current allocation
1339 * @order: The order of the current allocation
1340 * @gfp_mask: The GFP mask of the current allocation 1451 * @gfp_mask: The GFP mask of the current allocation
1341 * @nodemask: The allowed nodes to allocate from 1452 * @order: The order of the current allocation
1453 * @alloc_flags: The allocation flags of the current allocation
1454 * @ac: The context of current allocation
1342 * @mode: The migration mode for async, sync light, or sync migration 1455 * @mode: The migration mode for async, sync light, or sync migration
1343 * @contended: Return value that determines if compaction was aborted due to 1456 * @contended: Return value that determines if compaction was aborted due to
1344 * need_resched() or lock contention 1457 * need_resched() or lock contention
1345 * 1458 *
1346 * This is the main entry point for direct page compaction. 1459 * This is the main entry point for direct page compaction.
1347 */ 1460 */
1348unsigned long try_to_compact_pages(struct zonelist *zonelist, 1461unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1349 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1462 int alloc_flags, const struct alloc_context *ac,
1350 enum migrate_mode mode, int *contended, 1463 enum migrate_mode mode, int *contended)
1351 int alloc_flags, int classzone_idx)
1352{ 1464{
1353 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1354 int may_enter_fs = gfp_mask & __GFP_FS; 1465 int may_enter_fs = gfp_mask & __GFP_FS;
1355 int may_perform_io = gfp_mask & __GFP_IO; 1466 int may_perform_io = gfp_mask & __GFP_IO;
1356 struct zoneref *z; 1467 struct zoneref *z;
@@ -1364,9 +1475,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1364 if (!order || !may_enter_fs || !may_perform_io) 1475 if (!order || !may_enter_fs || !may_perform_io)
1365 return COMPACT_SKIPPED; 1476 return COMPACT_SKIPPED;
1366 1477
1478 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
1479
1367 /* Compact each zone in the list */ 1480 /* Compact each zone in the list */
1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1481 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1369 nodemask) { 1482 ac->nodemask) {
1370 int status; 1483 int status;
1371 int zone_contended; 1484 int zone_contended;
1372 1485
@@ -1374,7 +1487,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1374 continue; 1487 continue;
1375 1488
1376 status = compact_zone_order(zone, order, gfp_mask, mode, 1489 status = compact_zone_order(zone, order, gfp_mask, mode,
1377 &zone_contended, alloc_flags, classzone_idx); 1490 &zone_contended, alloc_flags,
1491 ac->classzone_idx);
1378 rc = max(status, rc); 1492 rc = max(status, rc);
1379 /* 1493 /*
1380 * It takes at least one zone that wasn't lock contended 1494 * It takes at least one zone that wasn't lock contended
@@ -1384,7 +1498,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1384 1498
1385 /* If a normal allocation would succeed, stop compacting */ 1499 /* If a normal allocation would succeed, stop compacting */
1386 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 1500 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1387 classzone_idx, alloc_flags)) { 1501 ac->classzone_idx, alloc_flags)) {
1388 /* 1502 /*
1389 * We think the allocation will succeed in this zone, 1503 * We think the allocation will succeed in this zone,
1390 * but it is not certain, hence the false. The caller 1504 * but it is not certain, hence the false. The caller
diff --git a/mm/debug.c b/mm/debug.c
index d69cb5a7ba9a..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
173 "get_unmapped_area %p\n" 173 "get_unmapped_area %p\n"
174#endif 174#endif
175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
176 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" 176 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" 178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
179 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 179 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
206 mm->pgd, atomic_read(&mm->mm_users), 206 mm->pgd, atomic_read(&mm->mm_users),
207 atomic_read(&mm->mm_count), 207 atomic_read(&mm->mm_count),
208 atomic_long_read((atomic_long_t *)&mm->nr_ptes), 208 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
209 mm_nr_pmds((struct mm_struct *)mm),
209 mm->map_count, 210 mm->map_count,
210 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, 211 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
211 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, 212 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/gup.c b/mm/gup.c
index 12bc2bc33da7..c2da1163986a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
167 if (pud_none(*pud)) 167 if (pud_none(*pud))
168 return no_page_table(vma, flags); 168 return no_page_table(vma, flags);
169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
170 if (flags & FOLL_GET) 170 page = follow_huge_pud(mm, address, pud, flags);
171 return NULL; 171 if (page)
172 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 172 return page;
173 return page; 173 return no_page_table(vma, flags);
174 } 174 }
175 if (unlikely(pud_bad(*pud))) 175 if (unlikely(pud_bad(*pud)))
176 return no_page_table(vma, flags); 176 return no_page_table(vma, flags);
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
179 if (pmd_none(*pmd)) 179 if (pmd_none(*pmd))
180 return no_page_table(vma, flags); 180 return no_page_table(vma, flags);
181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
182 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 182 page = follow_huge_pmd(mm, address, pmd, flags);
183 if (flags & FOLL_GET) { 183 if (page)
184 /* 184 return page;
185 * Refcount on tail pages are not well-defined and 185 return no_page_table(vma, flags);
186 * shouldn't be taken. The caller should handle a NULL
187 * return when trying to follow tail pages.
188 */
189 if (PageHead(page))
190 get_page(page);
191 else
192 page = NULL;
193 }
194 return page;
195 } 186 }
196 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
197 return no_page_table(vma, flags); 188 return no_page_table(vma, flags);
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
584 return 0; 575 return 0;
585} 576}
586 577
578static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
579 struct mm_struct *mm,
580 unsigned long start,
581 unsigned long nr_pages,
582 int write, int force,
583 struct page **pages,
584 struct vm_area_struct **vmas,
585 int *locked, bool notify_drop,
586 unsigned int flags)
587{
588 long ret, pages_done;
589 bool lock_dropped;
590
591 if (locked) {
592 /* if VM_FAULT_RETRY can be returned, vmas become invalid */
593 BUG_ON(vmas);
594 /* check caller initialized locked */
595 BUG_ON(*locked != 1);
596 }
597
598 if (pages)
599 flags |= FOLL_GET;
600 if (write)
601 flags |= FOLL_WRITE;
602 if (force)
603 flags |= FOLL_FORCE;
604
605 pages_done = 0;
606 lock_dropped = false;
607 for (;;) {
608 ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
609 vmas, locked);
610 if (!locked)
611 /* VM_FAULT_RETRY couldn't trigger, bypass */
612 return ret;
613
614 /* VM_FAULT_RETRY cannot return errors */
615 if (!*locked) {
616 BUG_ON(ret < 0);
617 BUG_ON(ret >= nr_pages);
618 }
619
620 if (!pages)
621 /* If it's a prefault don't insist harder */
622 return ret;
623
624 if (ret > 0) {
625 nr_pages -= ret;
626 pages_done += ret;
627 if (!nr_pages)
628 break;
629 }
630 if (*locked) {
631 /* VM_FAULT_RETRY didn't trigger */
632 if (!pages_done)
633 pages_done = ret;
634 break;
635 }
636 /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
637 pages += ret;
638 start += ret << PAGE_SHIFT;
639
640 /*
641 * Repeat on the address that fired VM_FAULT_RETRY
642 * without FAULT_FLAG_ALLOW_RETRY but with
643 * FAULT_FLAG_TRIED.
644 */
645 *locked = 1;
646 lock_dropped = true;
647 down_read(&mm->mmap_sem);
648 ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
649 pages, NULL, NULL);
650 if (ret != 1) {
651 BUG_ON(ret > 1);
652 if (!pages_done)
653 pages_done = ret;
654 break;
655 }
656 nr_pages--;
657 pages_done++;
658 if (!nr_pages)
659 break;
660 pages++;
661 start += PAGE_SIZE;
662 }
663 if (notify_drop && lock_dropped && *locked) {
664 /*
665 * We must let the caller know we temporarily dropped the lock
666 * and so the critical section protected by it was lost.
667 */
668 up_read(&mm->mmap_sem);
669 *locked = 0;
670 }
671 return pages_done;
672}
673
674/*
675 * We can leverage the VM_FAULT_RETRY functionality in the page fault
676 * paths better by using either get_user_pages_locked() or
677 * get_user_pages_unlocked().
678 *
679 * get_user_pages_locked() is suitable to replace the form:
680 *
681 * down_read(&mm->mmap_sem);
682 * do_something()
683 * get_user_pages(tsk, mm, ..., pages, NULL);
684 * up_read(&mm->mmap_sem);
685 *
686 * to:
687 *
688 * int locked = 1;
689 * down_read(&mm->mmap_sem);
690 * do_something()
691 * get_user_pages_locked(tsk, mm, ..., pages, &locked);
692 * if (locked)
693 * up_read(&mm->mmap_sem);
694 */
695long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
696 unsigned long start, unsigned long nr_pages,
697 int write, int force, struct page **pages,
698 int *locked)
699{
700 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
701 pages, NULL, locked, true, FOLL_TOUCH);
702}
703EXPORT_SYMBOL(get_user_pages_locked);
704
705/*
706 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows
707 * passing additional gup_flags as last parameter (like FOLL_HWPOISON).
708 *
709 * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
710 * caller if required (just like with __get_user_pages). "FOLL_GET",
711 * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
712 * according to the parameters "pages", "write", "force"
713 * respectively.
714 */
715__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
716 unsigned long start, unsigned long nr_pages,
717 int write, int force, struct page **pages,
718 unsigned int gup_flags)
719{
720 long ret;
721 int locked = 1;
722 down_read(&mm->mmap_sem);
723 ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
724 pages, NULL, &locked, false, gup_flags);
725 if (locked)
726 up_read(&mm->mmap_sem);
727 return ret;
728}
729EXPORT_SYMBOL(__get_user_pages_unlocked);
730
731/*
732 * get_user_pages_unlocked() is suitable to replace the form:
733 *
734 * down_read(&mm->mmap_sem);
735 * get_user_pages(tsk, mm, ..., pages, NULL);
736 * up_read(&mm->mmap_sem);
737 *
738 * with:
739 *
740 * get_user_pages_unlocked(tsk, mm, ..., pages);
741 *
742 * It is functionally equivalent to get_user_pages_fast so
743 * get_user_pages_fast should be used instead, if the two parameters
744 * "tsk" and "mm" are respectively equal to current and current->mm
745 * and "force" does not need to be set to 1 (get_user_pages_fast has
746 * no "force" parameter).
747 */
748long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
749 unsigned long start, unsigned long nr_pages,
750 int write, int force, struct page **pages)
751{
752 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
753 force, pages, FOLL_TOUCH);
754}
755EXPORT_SYMBOL(get_user_pages_unlocked);
756
587/* 757/*
588 * get_user_pages() - pin user pages in memory 758 * get_user_pages() - pin user pages in memory
589 * @tsk: the task_struct to use for page fault accounting, or 759 * @tsk: the task_struct to use for page fault accounting, or
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
633 * use the correct cache flushing APIs. 803 * use the correct cache flushing APIs.
634 * 804 *
635 * See also get_user_pages_fast, for performance critical applications. 805 * See also get_user_pages_fast, for performance critical applications.
806 *
807 * get_user_pages should be phased out in favor of
808 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
809 * should use get_user_pages because it cannot pass
810 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
636 */ 811 */
637long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 812long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
638 unsigned long start, unsigned long nr_pages, int write, 813 unsigned long start, unsigned long nr_pages, int write,
639 int force, struct page **pages, struct vm_area_struct **vmas) 814 int force, struct page **pages, struct vm_area_struct **vmas)
640{ 815{
641 int flags = FOLL_TOUCH; 816 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
642 817 pages, vmas, NULL, false, FOLL_TOUCH);
643 if (pages)
644 flags |= FOLL_GET;
645 if (write)
646 flags |= FOLL_WRITE;
647 if (force)
648 flags |= FOLL_FORCE;
649
650 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
651 NULL);
652} 818}
653EXPORT_SYMBOL(get_user_pages); 819EXPORT_SYMBOL(get_user_pages);
654 820
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1077 start += nr << PAGE_SHIFT; 1243 start += nr << PAGE_SHIFT;
1078 pages += nr; 1244 pages += nr;
1079 1245
1080 down_read(&mm->mmap_sem); 1246 ret = get_user_pages_unlocked(current, mm, start,
1081 ret = get_user_pages(current, mm, start, 1247 nr_pages - nr, write, 0, pages);
1082 nr_pages - nr, write, 0, pages, NULL);
1083 up_read(&mm->mmap_sem);
1084 1248
1085 /* Have to be a bit careful with return values */ 1249 /* Have to be a bit careful with return values */
1086 if (nr > 0) { 1250 if (nr > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..cb7be110cad3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
171} 171}
172 172
173static atomic_t huge_zero_refcount; 173static atomic_t huge_zero_refcount;
174static struct page *huge_zero_page __read_mostly; 174struct page *huge_zero_page __read_mostly;
175
176static inline bool is_huge_zero_page(struct page *page)
177{
178 return ACCESS_ONCE(huge_zero_page) == page;
179}
180 175
181static inline bool is_huge_zero_pmd(pmd_t pmd) 176static inline bool is_huge_zero_pmd(pmd_t pmd)
182{ 177{
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
766 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; 761 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
767} 762}
768 763
769static inline struct page *alloc_hugepage_vma(int defrag,
770 struct vm_area_struct *vma,
771 unsigned long haddr, int nd,
772 gfp_t extra_gfp)
773{
774 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
775 HPAGE_PMD_ORDER, vma, haddr, nd);
776}
777
778/* Caller must hold page table lock. */ 764/* Caller must hold page table lock. */
779static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 765static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
780 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 766 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
795 unsigned long address, pmd_t *pmd, 781 unsigned long address, pmd_t *pmd,
796 unsigned int flags) 782 unsigned int flags)
797{ 783{
784 gfp_t gfp;
798 struct page *page; 785 struct page *page;
799 unsigned long haddr = address & HPAGE_PMD_MASK; 786 unsigned long haddr = address & HPAGE_PMD_MASK;
800 787
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
829 } 816 }
830 return 0; 817 return 0;
831 } 818 }
832 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 819 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
833 vma, haddr, numa_node_id(), 0); 820 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
834 if (unlikely(!page)) { 821 if (unlikely(!page)) {
835 count_vm_event(THP_FAULT_FALLBACK); 822 count_vm_event(THP_FAULT_FALLBACK);
836 return VM_FAULT_FALLBACK; 823 return VM_FAULT_FALLBACK;
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1118 spin_unlock(ptl); 1105 spin_unlock(ptl);
1119alloc: 1106alloc:
1120 if (transparent_hugepage_enabled(vma) && 1107 if (transparent_hugepage_enabled(vma) &&
1121 !transparent_hugepage_debug_cow()) 1108 !transparent_hugepage_debug_cow()) {
1122 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1109 gfp_t gfp;
1123 vma, haddr, numa_node_id(), 0); 1110
1124 else 1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else
1125 new_page = NULL; 1114 new_page = NULL;
1126 1115
1127 if (unlikely(!new_page)) { 1116 if (unlikely(!new_page)) {
@@ -1423,26 +1412,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1423 return ret; 1412 return ret;
1424} 1413}
1425 1414
1426int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1427 unsigned long addr, unsigned long end,
1428 unsigned char *vec)
1429{
1430 spinlock_t *ptl;
1431 int ret = 0;
1432
1433 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1434 /*
1435 * All logical pages in the range are present
1436 * if backed by a huge page.
1437 */
1438 spin_unlock(ptl);
1439 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1440 ret = 1;
1441 }
1442
1443 return ret;
1444}
1445
1446int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1415int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1447 unsigned long old_addr, 1416 unsigned long old_addr,
1448 unsigned long new_addr, unsigned long old_end, 1417 unsigned long new_addr, unsigned long old_end,
@@ -2148,7 +2117,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2148{ 2117{
2149 struct page *page; 2118 struct page *page;
2150 pte_t *_pte; 2119 pte_t *_pte;
2151 int referenced = 0, none = 0; 2120 int none = 0;
2121 bool referenced = false, writable = false;
2152 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2122 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2153 _pte++, address += PAGE_SIZE) { 2123 _pte++, address += PAGE_SIZE) {
2154 pte_t pteval = *_pte; 2124 pte_t pteval = *_pte;
@@ -2158,7 +2128,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2158 else 2128 else
2159 goto out; 2129 goto out;
2160 } 2130 }
2161 if (!pte_present(pteval) || !pte_write(pteval)) 2131 if (!pte_present(pteval))
2162 goto out; 2132 goto out;
2163 page = vm_normal_page(vma, address, pteval); 2133 page = vm_normal_page(vma, address, pteval);
2164 if (unlikely(!page)) 2134 if (unlikely(!page))
@@ -2168,9 +2138,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2168 VM_BUG_ON_PAGE(!PageAnon(page), page); 2138 VM_BUG_ON_PAGE(!PageAnon(page), page);
2169 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 2139 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
2170 2140
2171 /* cannot use mapcount: can't collapse if there's a gup pin */
2172 if (page_count(page) != 1)
2173 goto out;
2174 /* 2141 /*
2175 * We can do it before isolate_lru_page because the 2142 * We can do it before isolate_lru_page because the
2176 * page can't be freed from under us. NOTE: PG_lock 2143 * page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2146,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2179 */ 2146 */
2180 if (!trylock_page(page)) 2147 if (!trylock_page(page))
2181 goto out; 2148 goto out;
2149
2150 /*
2151 * cannot use mapcount: can't collapse if there's a gup pin.
2152 * The page must only be referenced by the scanned process
2153 * and page swap cache.
2154 */
2155 if (page_count(page) != 1 + !!PageSwapCache(page)) {
2156 unlock_page(page);
2157 goto out;
2158 }
2159 if (pte_write(pteval)) {
2160 writable = true;
2161 } else {
2162 if (PageSwapCache(page) && !reuse_swap_page(page)) {
2163 unlock_page(page);
2164 goto out;
2165 }
2166 /*
2167 * Page is not in the swap cache. It can be collapsed
2168 * into a THP.
2169 */
2170 }
2171
2182 /* 2172 /*
2183 * Isolate the page to avoid collapsing an hugepage 2173 * Isolate the page to avoid collapsing an hugepage
2184 * currently in use by the VM. 2174 * currently in use by the VM.
@@ -2195,9 +2185,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2195 /* If there is no mapped pte young don't collapse the page */ 2185 /* If there is no mapped pte young don't collapse the page */
2196 if (pte_young(pteval) || PageReferenced(page) || 2186 if (pte_young(pteval) || PageReferenced(page) ||
2197 mmu_notifier_test_young(vma->vm_mm, address)) 2187 mmu_notifier_test_young(vma->vm_mm, address))
2198 referenced = 1; 2188 referenced = true;
2199 } 2189 }
2200 if (likely(referenced)) 2190 if (likely(referenced && writable))
2201 return 1; 2191 return 1;
2202out: 2192out:
2203 release_pte_pages(pte, _pte); 2193 release_pte_pages(pte, _pte);
@@ -2550,11 +2540,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2550{ 2540{
2551 pmd_t *pmd; 2541 pmd_t *pmd;
2552 pte_t *pte, *_pte; 2542 pte_t *pte, *_pte;
2553 int ret = 0, referenced = 0, none = 0; 2543 int ret = 0, none = 0;
2554 struct page *page; 2544 struct page *page;
2555 unsigned long _address; 2545 unsigned long _address;
2556 spinlock_t *ptl; 2546 spinlock_t *ptl;
2557 int node = NUMA_NO_NODE; 2547 int node = NUMA_NO_NODE;
2548 bool writable = false, referenced = false;
2558 2549
2559 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2550 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2560 2551
@@ -2573,8 +2564,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2573 else 2564 else
2574 goto out_unmap; 2565 goto out_unmap;
2575 } 2566 }
2576 if (!pte_present(pteval) || !pte_write(pteval)) 2567 if (!pte_present(pteval))
2577 goto out_unmap; 2568 goto out_unmap;
2569 if (pte_write(pteval))
2570 writable = true;
2571
2578 page = vm_normal_page(vma, _address, pteval); 2572 page = vm_normal_page(vma, _address, pteval);
2579 if (unlikely(!page)) 2573 if (unlikely(!page))
2580 goto out_unmap; 2574 goto out_unmap;
@@ -2591,14 +2585,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2591 VM_BUG_ON_PAGE(PageCompound(page), page); 2585 VM_BUG_ON_PAGE(PageCompound(page), page);
2592 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2586 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2593 goto out_unmap; 2587 goto out_unmap;
2594 /* cannot use mapcount: can't collapse if there's a gup pin */ 2588 /*
2595 if (page_count(page) != 1) 2589 * cannot use mapcount: can't collapse if there's a gup pin.
2590 * The page must only be referenced by the scanned process
2591 * and page swap cache.
2592 */
2593 if (page_count(page) != 1 + !!PageSwapCache(page))
2596 goto out_unmap; 2594 goto out_unmap;
2597 if (pte_young(pteval) || PageReferenced(page) || 2595 if (pte_young(pteval) || PageReferenced(page) ||
2598 mmu_notifier_test_young(vma->vm_mm, address)) 2596 mmu_notifier_test_young(vma->vm_mm, address))
2599 referenced = 1; 2597 referenced = true;
2600 } 2598 }
2601 if (referenced) 2599 if (referenced && writable)
2602 ret = 1; 2600 ret = 1;
2603out_unmap: 2601out_unmap:
2604 pte_unmap_unlock(pte, ptl); 2602 pte_unmap_unlock(pte, ptl);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be0e5d0db5ec..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2657,9 +2657,10 @@ again:
2657 goto unlock; 2657 goto unlock;
2658 2658
2659 /* 2659 /*
2660 * HWPoisoned hugepage is already unmapped and dropped reference 2660 * Migrating hugepage or HWPoisoned hugepage is already
2661 * unmapped and its refcount is dropped, so just clear pte here.
2661 */ 2662 */
2662 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2663 if (unlikely(!pte_present(pte))) {
2663 huge_pte_clear(mm, address, ptep); 2664 huge_pte_clear(mm, address, ptep);
2664 goto unlock; 2665 goto unlock;
2665 } 2666 }
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3134 struct page *pagecache_page = NULL; 3135 struct page *pagecache_page = NULL;
3135 struct hstate *h = hstate_vma(vma); 3136 struct hstate *h = hstate_vma(vma);
3136 struct address_space *mapping; 3137 struct address_space *mapping;
3138 int need_wait_lock = 0;
3137 3139
3138 address &= huge_page_mask(h); 3140 address &= huge_page_mask(h);
3139 3141
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3172 ret = 0; 3174 ret = 0;
3173 3175
3174 /* 3176 /*
3177 * entry could be a migration/hwpoison entry at this point, so this
3178 * check prevents the kernel from going below assuming that we have
3179 * an active hugepage in pagecache. This goto expects the 2nd page fault,
3180 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
3181 * handle it.
3182 */
3183 if (!pte_present(entry))
3184 goto out_mutex;
3185
3186 /*
3175 * If we are going to COW the mapping later, we examine the pending 3187 * If we are going to COW the mapping later, we examine the pending
3176 * reservations for this page now. This will ensure that any 3188 * reservations for this page now. This will ensure that any
3177 * allocations necessary to record that reservation occur outside the 3189 * allocations necessary to record that reservation occur outside the
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3190 vma, address); 3202 vma, address);
3191 } 3203 }
3192 3204
3205 ptl = huge_pte_lock(h, mm, ptep);
3206
3207 /* Check for a racing update before calling hugetlb_cow */
3208 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3209 goto out_ptl;
3210
3193 /* 3211 /*
3194 * hugetlb_cow() requires page locks of pte_page(entry) and 3212 * hugetlb_cow() requires page locks of pte_page(entry) and
3195 * pagecache_page, so here we need take the former one 3213 * pagecache_page, so here we need take the former one
3196 * when page != pagecache_page or !pagecache_page. 3214 * when page != pagecache_page or !pagecache_page.
3197 * Note that locking order is always pagecache_page -> page,
3198 * so no worry about deadlock.
3199 */ 3215 */
3200 page = pte_page(entry); 3216 page = pte_page(entry);
3201 get_page(page);
3202 if (page != pagecache_page) 3217 if (page != pagecache_page)
3203 lock_page(page); 3218 if (!trylock_page(page)) {
3204 3219 need_wait_lock = 1;
3205 ptl = huge_pte_lockptr(h, mm, ptep); 3220 goto out_ptl;
3206 spin_lock(ptl); 3221 }
3207 /* Check for a racing update before calling hugetlb_cow */
3208 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3209 goto out_ptl;
3210 3222
3223 get_page(page);
3211 3224
3212 if (flags & FAULT_FLAG_WRITE) { 3225 if (flags & FAULT_FLAG_WRITE) {
3213 if (!huge_pte_write(entry)) { 3226 if (!huge_pte_write(entry)) {
3214 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3227 ret = hugetlb_cow(mm, vma, address, ptep, entry,
3215 pagecache_page, ptl); 3228 pagecache_page, ptl);
3216 goto out_ptl; 3229 goto out_put_page;
3217 } 3230 }
3218 entry = huge_pte_mkdirty(entry); 3231 entry = huge_pte_mkdirty(entry);
3219 } 3232 }
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3221 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3234 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3222 flags & FAULT_FLAG_WRITE)) 3235 flags & FAULT_FLAG_WRITE))
3223 update_mmu_cache(vma, address, ptep); 3236 update_mmu_cache(vma, address, ptep);
3224 3237out_put_page:
3238 if (page != pagecache_page)
3239 unlock_page(page);
3240 put_page(page);
3225out_ptl: 3241out_ptl:
3226 spin_unlock(ptl); 3242 spin_unlock(ptl);
3227 3243
@@ -3229,12 +3245,17 @@ out_ptl:
3229 unlock_page(pagecache_page); 3245 unlock_page(pagecache_page);
3230 put_page(pagecache_page); 3246 put_page(pagecache_page);
3231 } 3247 }
3232 if (page != pagecache_page)
3233 unlock_page(page);
3234 put_page(page);
3235
3236out_mutex: 3248out_mutex:
3237 mutex_unlock(&htlb_fault_mutex_table[hash]); 3249 mutex_unlock(&htlb_fault_mutex_table[hash]);
3250 /*
3251 * Generally it's safe to hold refcount during waiting page lock. But
3252 * here we just wait to defer the next page fault to avoid busy loop and
3253 * the page is not used after unlocked before returning from the current
3254 * page fault. So we are safe from accessing freed page, even if we wait
3255 * here without taking refcount.
3256 */
3257 if (need_wait_lock)
3258 wait_on_page_locked(page);
3238 return ret; 3259 return ret;
3239} 3260}
3240 3261
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3364 spin_unlock(ptl); 3385 spin_unlock(ptl);
3365 continue; 3386 continue;
3366 } 3387 }
3367 if (!huge_pte_none(huge_ptep_get(ptep))) { 3388 pte = huge_ptep_get(ptep);
3389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3390 spin_unlock(ptl);
3391 continue;
3392 }
3393 if (unlikely(is_hugetlb_entry_migration(pte))) {
3394 swp_entry_t entry = pte_to_swp_entry(pte);
3395
3396 if (is_write_migration_entry(entry)) {
3397 pte_t newpte;
3398
3399 make_migration_entry_read(&entry);
3400 newpte = swp_entry_to_pte(entry);
3401 set_huge_pte_at(mm, address, ptep, newpte);
3402 pages++;
3403 }
3404 spin_unlock(ptl);
3405 continue;
3406 }
3407 if (!huge_pte_none(pte)) {
3368 pte = huge_ptep_get_and_clear(mm, address, ptep); 3408 pte = huge_ptep_get_and_clear(mm, address, ptep);
3369 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 3409 pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3370 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3410 pte = arch_make_huge_pte(pte, vma, NULL, 0);
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3558 if (saddr) { 3598 if (saddr) {
3559 spte = huge_pte_offset(svma->vm_mm, saddr); 3599 spte = huge_pte_offset(svma->vm_mm, saddr);
3560 if (spte) { 3600 if (spte) {
3601 mm_inc_nr_pmds(mm);
3561 get_page(virt_to_page(spte)); 3602 get_page(virt_to_page(spte));
3562 break; 3603 break;
3563 } 3604 }
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3569 3610
3570 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3611 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3571 spin_lock(ptl); 3612 spin_lock(ptl);
3572 if (pud_none(*pud)) 3613 if (pud_none(*pud)) {
3573 pud_populate(mm, pud, 3614 pud_populate(mm, pud,
3574 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3615 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3575 else 3616 } else {
3576 put_page(virt_to_page(spte)); 3617 put_page(virt_to_page(spte));
3618 mm_inc_nr_pmds(mm);
3619 }
3577 spin_unlock(ptl); 3620 spin_unlock(ptl);
3578out: 3621out:
3579 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3622 pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3604 3647
3605 pud_clear(pud); 3648 pud_clear(pud);
3606 put_page(virt_to_page(ptep)); 3649 put_page(virt_to_page(ptep));
3650 mm_dec_nr_pmds(mm);
3607 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3651 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
3608 return 1; 3652 return 1;
3609} 3653}
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
3660 return (pte_t *) pmd; 3704 return (pte_t *) pmd;
3661} 3705}
3662 3706
3663struct page * 3707#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3664follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3665 pmd_t *pmd, int write)
3666{
3667 struct page *page;
3668 3708
3669 page = pte_page(*(pte_t *)pmd); 3709/*
3670 if (page) 3710 * These functions are overwritable if your architecture needs its own
3671 page += ((address & ~PMD_MASK) >> PAGE_SHIFT); 3711 * behavior.
3672 return page; 3712 */
3713struct page * __weak
3714follow_huge_addr(struct mm_struct *mm, unsigned long address,
3715 int write)
3716{
3717 return ERR_PTR(-EINVAL);
3673} 3718}
3674 3719
3675struct page * 3720struct page * __weak
3676follow_huge_pud(struct mm_struct *mm, unsigned long address, 3721follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3677 pud_t *pud, int write) 3722 pmd_t *pmd, int flags)
3678{ 3723{
3679 struct page *page; 3724 struct page *page = NULL;
3680 3725 spinlock_t *ptl;
3681 page = pte_page(*(pte_t *)pud); 3726retry:
3682 if (page) 3727 ptl = pmd_lockptr(mm, pmd);
3683 page += ((address & ~PUD_MASK) >> PAGE_SHIFT); 3728 spin_lock(ptl);
3729 /*
3730 * make sure that the address range covered by this pmd is not
3731 * unmapped from other threads.
3732 */
3733 if (!pmd_huge(*pmd))
3734 goto out;
3735 if (pmd_present(*pmd)) {
3736 page = pte_page(*(pte_t *)pmd) +
3737 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3738 if (flags & FOLL_GET)
3739 get_page(page);
3740 } else {
3741 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3742 spin_unlock(ptl);
3743 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
3744 goto retry;
3745 }
3746 /*
3747 * hwpoisoned entry is treated as no_page_table in
3748 * follow_page_mask().
3749 */
3750 }
3751out:
3752 spin_unlock(ptl);
3684 return page; 3753 return page;
3685} 3754}
3686 3755
3687#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3688
3689/* Can be overriden by architectures */
3690struct page * __weak 3756struct page * __weak
3691follow_huge_pud(struct mm_struct *mm, unsigned long address, 3757follow_huge_pud(struct mm_struct *mm, unsigned long address,
3692 pud_t *pud, int write) 3758 pud_t *pud, int flags)
3693{ 3759{
3694 BUG(); 3760 if (flags & FOLL_GET)
3695 return NULL; 3761 return NULL;
3696}
3697 3762
3698#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3763 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
3764}
3699 3765
3700#ifdef CONFIG_MEMORY_FAILURE 3766#ifdef CONFIG_MEMORY_FAILURE
3701 3767
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 037e1c00a5b7..6e0057439a46 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
279 return -EINVAL; 279 return -EINVAL;
280 280
281 buf = strstrip(buf); 281 buf = strstrip(buf);
282 ret = page_counter_memparse(buf, &nr_pages); 282 ret = page_counter_memparse(buf, "-1", &nr_pages);
283 if (ret) 283 if (ret)
284 return ret; 284 return ret;
285 285
diff --git a/mm/internal.h b/mm/internal.h
index efad241f7014..c4d6c9b43491 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
110 */ 110 */
111 111
112/* 112/*
113 * Structure for holding the mostly immutable allocation parameters passed
114 * between functions involved in allocations, including the alloc_pages*
115 * family of functions.
116 *
117 * nodemask, migratetype and high_zoneidx are initialized only once in
118 * __alloc_pages_nodemask() and then never change.
119 *
120 * zonelist, preferred_zone and classzone_idx are set first in
121 * __alloc_pages_nodemask() for the fast path, and might be later changed
122 * in __alloc_pages_slowpath(). All other functions pass the whole structure
123 * by a const pointer.
124 */
125struct alloc_context {
126 struct zonelist *zonelist;
127 nodemask_t *nodemask;
128 struct zone *preferred_zone;
129 int classzone_idx;
130 int migratetype;
131 enum zone_type high_zoneidx;
132};
133
134/*
113 * Locate the struct page for both the matching buddy in our 135 * Locate the struct page for both the matching buddy in our
114 * pair (buddy1) and the combined O(n+1) page they form (page). 136 * pair (buddy1) and the combined O(n+1) page they form (page).
115 * 137 *
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3f8a4f52a0c..095c1f96fbec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75/* Whether the swap controller is active */
75#ifdef CONFIG_MEMCG_SWAP 76#ifdef CONFIG_MEMCG_SWAP
76/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77int do_swap_account __read_mostly; 77int do_swap_account __read_mostly;
78
79/* for remember boot option*/
80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1;
82#else
83static int really_do_swap_account __initdata;
84#endif
85
86#else 78#else
87#define do_swap_account 0 79#define do_swap_account 0
88#endif 80#endif
89 81
90
91static const char * const mem_cgroup_stat_names[] = { 82static const char * const mem_cgroup_stat_names[] = {
92 "cache", 83 "cache",
93 "rss", 84 "rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
97 "swap", 88 "swap",
98}; 89};
99 90
100enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS,
106};
107
108static const char * const mem_cgroup_events_names[] = { 91static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 92 "pgpgin",
110 "pgpgout", 93 "pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
138 121
139struct mem_cgroup_stat_cpu { 122struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 123 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 124 unsigned long events[MEMCG_NR_EVENTS];
142 unsigned long nr_page_events; 125 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 126 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 127};
@@ -284,6 +267,10 @@ struct mem_cgroup {
284 struct page_counter memsw; 267 struct page_counter memsw;
285 struct page_counter kmem; 268 struct page_counter kmem;
286 269
270 /* Normal memory consumption range */
271 unsigned long low;
272 unsigned long high;
273
287 unsigned long soft_limit; 274 unsigned long soft_limit;
288 275
289 /* vmpressure notifications */ 276 /* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
325 /* 312 /*
326 * set > 0 if pages under this cgroup are moving to other cgroup. 313 * set > 0 if pages under this cgroup are moving to other cgroup.
327 */ 314 */
328 atomic_t moving_account; 315 atomic_t moving_account;
329 /* taken only while moving_account > 0 */ 316 /* taken only while moving_account > 0 */
330 spinlock_t move_lock; 317 spinlock_t move_lock;
318 struct task_struct *move_lock_task;
319 unsigned long move_lock_flags;
331 /* 320 /*
332 * percpu counter. 321 * percpu counter.
333 */ 322 */
@@ -371,21 +360,18 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
371 360
372/* Stuffs for move charges at task migration. */ 361/* Stuffs for move charges at task migration. */
373/* 362/*
374 * Types of charges to be moved. "move_charge_at_immitgrate" and 363 * Types of charges to be moved.
375 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
376 */ 364 */
377enum move_type { 365#define MOVE_ANON 0x1U
378 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 366#define MOVE_FILE 0x2U
379 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 367#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
380 NR_MOVE_TYPE,
381};
382 368
383/* "mc" and its members are protected by cgroup_mutex */ 369/* "mc" and its members are protected by cgroup_mutex */
384static struct move_charge_struct { 370static struct move_charge_struct {
385 spinlock_t lock; /* for from, to */ 371 spinlock_t lock; /* for from, to */
386 struct mem_cgroup *from; 372 struct mem_cgroup *from;
387 struct mem_cgroup *to; 373 struct mem_cgroup *to;
388 unsigned long immigrate_flags; 374 unsigned long flags;
389 unsigned long precharge; 375 unsigned long precharge;
390 unsigned long moved_charge; 376 unsigned long moved_charge;
391 unsigned long moved_swap; 377 unsigned long moved_swap;
@@ -396,16 +382,6 @@ static struct move_charge_struct {
396 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 382 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
397}; 383};
398 384
399static bool move_anon(void)
400{
401 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
402}
403
404static bool move_file(void)
405{
406 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
407}
408
409/* 385/*
410 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 386 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
411 * limit reclaim to prevent infinite loops, if they ever occur. 387 * limit reclaim to prevent infinite loops, if they ever occur.
@@ -1365,6 +1341,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1365 return inactive * inactive_ratio < active; 1341 return inactive * inactive_ratio < active;
1366} 1342}
1367 1343
1344bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1345{
1346 struct mem_cgroup_per_zone *mz;
1347 struct mem_cgroup *memcg;
1348
1349 if (mem_cgroup_disabled())
1350 return true;
1351
1352 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1353 memcg = mz->memcg;
1354
1355 return !!(memcg->css.flags & CSS_ONLINE);
1356}
1357
1368#define mem_cgroup_from_counter(counter, member) \ 1358#define mem_cgroup_from_counter(counter, member) \
1369 container_of(counter, struct mem_cgroup, member) 1359 container_of(counter, struct mem_cgroup, member)
1370 1360
@@ -1557,7 +1547,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1557 * quickly exit and free its memory. 1547 * quickly exit and free its memory.
1558 */ 1548 */
1559 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1549 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1560 set_thread_flag(TIF_MEMDIE); 1550 mark_tsk_oom_victim(current);
1561 return; 1551 return;
1562 } 1552 }
1563 1553
@@ -1931,7 +1921,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1931 if (!memcg) 1921 if (!memcg)
1932 return false; 1922 return false;
1933 1923
1934 if (!handle) 1924 if (!handle || oom_killer_disabled)
1935 goto cleanup; 1925 goto cleanup;
1936 1926
1937 owait.memcg = memcg; 1927 owait.memcg = memcg;
@@ -1977,34 +1967,33 @@ cleanup:
1977/** 1967/**
1978 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1968 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
1979 * @page: page that is going to change accounted state 1969 * @page: page that is going to change accounted state
1980 * @locked: &memcg->move_lock slowpath was taken
1981 * @flags: IRQ-state flags for &memcg->move_lock
1982 * 1970 *
1983 * This function must mark the beginning of an accounted page state 1971 * This function must mark the beginning of an accounted page state
1984 * change to prevent double accounting when the page is concurrently 1972 * change to prevent double accounting when the page is concurrently
1985 * being moved to another memcg: 1973 * being moved to another memcg:
1986 * 1974 *
1987 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1975 * memcg = mem_cgroup_begin_page_stat(page);
1988 * if (TestClearPageState(page)) 1976 * if (TestClearPageState(page))
1989 * mem_cgroup_update_page_stat(memcg, state, -1); 1977 * mem_cgroup_update_page_stat(memcg, state, -1);
1990 * mem_cgroup_end_page_stat(memcg, locked, flags); 1978 * mem_cgroup_end_page_stat(memcg);
1991 *
1992 * The RCU lock is held throughout the transaction. The fast path can
1993 * get away without acquiring the memcg->move_lock (@locked is false)
1994 * because page moving starts with an RCU grace period.
1995 *
1996 * The RCU lock also protects the memcg from being freed when the page
1997 * state that is going to change is the only thing preventing the page
1998 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
1999 * which allows migration to go ahead and uncharge the page before the
2000 * account transaction might be complete.
2001 */ 1979 */
2002struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 1980struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
2003 bool *locked,
2004 unsigned long *flags)
2005{ 1981{
2006 struct mem_cgroup *memcg; 1982 struct mem_cgroup *memcg;
1983 unsigned long flags;
2007 1984
1985 /*
1986 * The RCU lock is held throughout the transaction. The fast
1987 * path can get away without acquiring the memcg->move_lock
1988 * because page moving starts with an RCU grace period.
1989 *
1990 * The RCU lock also protects the memcg from being freed when
1991 * the page state that is going to change is the only thing
1992 * preventing the page from being uncharged.
1993 * E.g. end-writeback clearing PageWriteback(), which allows
1994 * migration to go ahead and uncharge the page before the
1995 * account transaction might be complete.
1996 */
2008 rcu_read_lock(); 1997 rcu_read_lock();
2009 1998
2010 if (mem_cgroup_disabled()) 1999 if (mem_cgroup_disabled())
@@ -2014,16 +2003,22 @@ again:
2014 if (unlikely(!memcg)) 2003 if (unlikely(!memcg))
2015 return NULL; 2004 return NULL;
2016 2005
2017 *locked = false;
2018 if (atomic_read(&memcg->moving_account) <= 0) 2006 if (atomic_read(&memcg->moving_account) <= 0)
2019 return memcg; 2007 return memcg;
2020 2008
2021 spin_lock_irqsave(&memcg->move_lock, *flags); 2009 spin_lock_irqsave(&memcg->move_lock, flags);
2022 if (memcg != page->mem_cgroup) { 2010 if (memcg != page->mem_cgroup) {
2023 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2011 spin_unlock_irqrestore(&memcg->move_lock, flags);
2024 goto again; 2012 goto again;
2025 } 2013 }
2026 *locked = true; 2014
2015 /*
2016 * When charge migration first begins, we can have locked and
2017 * unlocked page stat updates happening concurrently. Track
2018 * the task who has the lock for mem_cgroup_end_page_stat().
2019 */
2020 memcg->move_lock_task = current;
2021 memcg->move_lock_flags = flags;
2027 2022
2028 return memcg; 2023 return memcg;
2029} 2024}
@@ -2031,14 +2026,17 @@ again:
2031/** 2026/**
2032 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2027 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2033 * @memcg: the memcg that was accounted against 2028 * @memcg: the memcg that was accounted against
2034 * @locked: value received from mem_cgroup_begin_page_stat()
2035 * @flags: value received from mem_cgroup_begin_page_stat()
2036 */ 2029 */
2037void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2030void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2038 unsigned long *flags)
2039{ 2031{
2040 if (memcg && *locked) 2032 if (memcg && memcg->move_lock_task == current) {
2041 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2033 unsigned long flags = memcg->move_lock_flags;
2034
2035 memcg->move_lock_task = NULL;
2036 memcg->move_lock_flags = 0;
2037
2038 spin_unlock_irqrestore(&memcg->move_lock, flags);
2039 }
2042 2040
2043 rcu_read_unlock(); 2041 rcu_read_unlock();
2044} 2042}
@@ -2131,17 +2129,6 @@ static void drain_local_stock(struct work_struct *dummy)
2131 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2129 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2132} 2130}
2133 2131
2134static void __init memcg_stock_init(void)
2135{
2136 int cpu;
2137
2138 for_each_possible_cpu(cpu) {
2139 struct memcg_stock_pcp *stock =
2140 &per_cpu(memcg_stock, cpu);
2141 INIT_WORK(&stock->work, drain_local_stock);
2142 }
2143}
2144
2145/* 2132/*
2146 * Cache charges(val) to local per_cpu area. 2133 * Cache charges(val) to local per_cpu area.
2147 * This will be consumed by consume_stock() function, later. 2134 * This will be consumed by consume_stock() function, later.
@@ -2291,6 +2278,8 @@ retry:
2291 if (!(gfp_mask & __GFP_WAIT)) 2278 if (!(gfp_mask & __GFP_WAIT))
2292 goto nomem; 2279 goto nomem;
2293 2280
2281 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
2282
2294 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2283 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2295 gfp_mask, may_swap); 2284 gfp_mask, may_swap);
2296 2285
@@ -2332,6 +2321,8 @@ retry:
2332 if (fatal_signal_pending(current)) 2321 if (fatal_signal_pending(current))
2333 goto bypass; 2322 goto bypass;
2334 2323
2324 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2325
2335 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2326 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2336nomem: 2327nomem:
2337 if (!(gfp_mask & __GFP_NOFAIL)) 2328 if (!(gfp_mask & __GFP_NOFAIL))
@@ -2343,6 +2334,16 @@ done_restock:
2343 css_get_many(&memcg->css, batch); 2334 css_get_many(&memcg->css, batch);
2344 if (batch > nr_pages) 2335 if (batch > nr_pages)
2345 refill_stock(memcg, batch - nr_pages); 2336 refill_stock(memcg, batch - nr_pages);
2337 /*
2338 * If the hierarchy is above the normal consumption range,
2339 * make the charging task trim their excess contribution.
2340 */
2341 do {
2342 if (page_counter_read(&memcg->memory) <= memcg->high)
2343 continue;
2344 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
2345 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2346 } while ((memcg = parent_mem_cgroup(memcg)));
2346done: 2347done:
2347 return ret; 2348 return ret;
2348} 2349}
@@ -3390,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3390 int ret; 3391 int ret;
3391 3392
3392 buf = strstrip(buf); 3393 buf = strstrip(buf);
3393 ret = page_counter_memparse(buf, &nr_pages); 3394 ret = page_counter_memparse(buf, "-1", &nr_pages);
3394 if (ret) 3395 if (ret)
3395 return ret; 3396 return ret;
3396 3397
@@ -3466,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3466{ 3467{
3467 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3468 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3468 3469
3469 if (val >= (1 << NR_MOVE_TYPE)) 3470 if (val & ~MOVE_MASK)
3470 return -EINVAL; 3471 return -EINVAL;
3471 3472
3472 /* 3473 /*
@@ -3544,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3544 struct mem_cgroup *mi; 3545 struct mem_cgroup *mi;
3545 unsigned int i; 3546 unsigned int i;
3546 3547
3548 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3549 MEM_CGROUP_STAT_NSTATS);
3550 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3551 MEM_CGROUP_EVENTS_NSTATS);
3547 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3552 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3548 3553
3549 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3554 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3758,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3758 unsigned long usage; 3763 unsigned long usage;
3759 int i, size, ret; 3764 int i, size, ret;
3760 3765
3761 ret = page_counter_memparse(args, &threshold); 3766 ret = page_counter_memparse(args, "-1", &threshold);
3762 if (ret) 3767 if (ret)
3763 return ret; 3768 return ret;
3764 3769
@@ -4248,7 +4253,7 @@ out_kfree:
4248 return ret; 4253 return ret;
4249} 4254}
4250 4255
4251static struct cftype mem_cgroup_files[] = { 4256static struct cftype mem_cgroup_legacy_files[] = {
4252 { 4257 {
4253 .name = "usage_in_bytes", 4258 .name = "usage_in_bytes",
4254 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4259 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4359,34 +4364,6 @@ static struct cftype mem_cgroup_files[] = {
4359 { }, /* terminate */ 4364 { }, /* terminate */
4360}; 4365};
4361 4366
#ifdef CONFIG_MEMCG_SWAP
/*
 * Control files backed by the _MEMSWAP counter.  Each entry selects the
 * counter attribute through MEMFILE_PRIVATE(); all of them are read via
 * mem_cgroup_read_u64(), the limit is written via mem_cgroup_write(), and
 * the max-usage and failcnt files accept writes through mem_cgroup_reset().
 */
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.read_u64 = mem_cgroup_read_u64,
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.read_u64 = mem_cgroup_read_u64,
		.write = mem_cgroup_reset,
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
	},
	{
		.name = "memsw.limit_in_bytes",
		.read_u64 = mem_cgroup_read_u64,
		.write = mem_cgroup_write,
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
	},
	{
		.name = "memsw.failcnt",
		.read_u64 = mem_cgroup_read_u64,
		.write = mem_cgroup_reset,
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
	},
	{ },	/* terminate */
};
#endif
4390static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4367static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4391{ 4368{
4392 struct mem_cgroup_per_node *pn; 4369 struct mem_cgroup_per_node *pn;
@@ -4482,29 +4459,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4482} 4459}
4483EXPORT_SYMBOL(parent_mem_cgroup); 4460EXPORT_SYMBOL(parent_mem_cgroup);
4484 4461
4485static void __init mem_cgroup_soft_limit_tree_init(void)
4486{
4487 struct mem_cgroup_tree_per_node *rtpn;
4488 struct mem_cgroup_tree_per_zone *rtpz;
4489 int tmp, node, zone;
4490
4491 for_each_node(node) {
4492 tmp = node;
4493 if (!node_state(node, N_NORMAL_MEMORY))
4494 tmp = -1;
4495 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4496 BUG_ON(!rtpn);
4497
4498 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4499
4500 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4501 rtpz = &rtpn->rb_tree_per_zone[zone];
4502 rtpz->rb_root = RB_ROOT;
4503 spin_lock_init(&rtpz->lock);
4504 }
4505 }
4506}
4507
4508static struct cgroup_subsys_state * __ref 4462static struct cgroup_subsys_state * __ref
4509mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4463mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4510{ 4464{
@@ -4524,6 +4478,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4524 if (parent_css == NULL) { 4478 if (parent_css == NULL) {
4525 root_mem_cgroup = memcg; 4479 root_mem_cgroup = memcg;
4526 page_counter_init(&memcg->memory, NULL); 4480 page_counter_init(&memcg->memory, NULL);
4481 memcg->high = PAGE_COUNTER_MAX;
4527 memcg->soft_limit = PAGE_COUNTER_MAX; 4482 memcg->soft_limit = PAGE_COUNTER_MAX;
4528 page_counter_init(&memcg->memsw, NULL); 4483 page_counter_init(&memcg->memsw, NULL);
4529 page_counter_init(&memcg->kmem, NULL); 4484 page_counter_init(&memcg->kmem, NULL);
@@ -4569,6 +4524,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4569 4524
4570 if (parent->use_hierarchy) { 4525 if (parent->use_hierarchy) {
4571 page_counter_init(&memcg->memory, &parent->memory); 4526 page_counter_init(&memcg->memory, &parent->memory);
4527 memcg->high = PAGE_COUNTER_MAX;
4572 memcg->soft_limit = PAGE_COUNTER_MAX; 4528 memcg->soft_limit = PAGE_COUNTER_MAX;
4573 page_counter_init(&memcg->memsw, &parent->memsw); 4529 page_counter_init(&memcg->memsw, &parent->memsw);
4574 page_counter_init(&memcg->kmem, &parent->kmem); 4530 page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4579,6 +4535,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4579 */ 4535 */
4580 } else { 4536 } else {
4581 page_counter_init(&memcg->memory, NULL); 4537 page_counter_init(&memcg->memory, NULL);
4538 memcg->high = PAGE_COUNTER_MAX;
4582 memcg->soft_limit = PAGE_COUNTER_MAX; 4539 memcg->soft_limit = PAGE_COUNTER_MAX;
4583 page_counter_init(&memcg->memsw, NULL); 4540 page_counter_init(&memcg->memsw, NULL);
4584 page_counter_init(&memcg->kmem, NULL); 4541 page_counter_init(&memcg->kmem, NULL);
@@ -4654,6 +4611,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4654 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4611 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4655 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4612 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4656 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4613 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4614 memcg->low = 0;
4615 memcg->high = PAGE_COUNTER_MAX;
4657 memcg->soft_limit = PAGE_COUNTER_MAX; 4616 memcg->soft_limit = PAGE_COUNTER_MAX;
4658} 4617}
4659 4618
@@ -4730,12 +4689,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4730 if (!page || !page_mapped(page)) 4689 if (!page || !page_mapped(page))
4731 return NULL; 4690 return NULL;
4732 if (PageAnon(page)) { 4691 if (PageAnon(page)) {
4733 /* we don't move shared anon */ 4692 if (!(mc.flags & MOVE_ANON))
4734 if (!move_anon())
4735 return NULL; 4693 return NULL;
4736 } else if (!move_file()) 4694 } else {
4737 /* we ignore mapcount for file pages */ 4695 if (!(mc.flags & MOVE_FILE))
4738 return NULL; 4696 return NULL;
4697 }
4739 if (!get_page_unless_zero(page)) 4698 if (!get_page_unless_zero(page))
4740 return NULL; 4699 return NULL;
4741 4700
@@ -4749,7 +4708,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4749 struct page *page = NULL; 4708 struct page *page = NULL;
4750 swp_entry_t ent = pte_to_swp_entry(ptent); 4709 swp_entry_t ent = pte_to_swp_entry(ptent);
4751 4710
4752 if (!move_anon() || non_swap_entry(ent)) 4711 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4753 return NULL; 4712 return NULL;
4754 /* 4713 /*
4755 * Because lookup_swap_cache() updates some statistics counter, 4714 * Because lookup_swap_cache() updates some statistics counter,
@@ -4778,7 +4737,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4778 4737
4779 if (!vma->vm_file) /* anonymous vma */ 4738 if (!vma->vm_file) /* anonymous vma */
4780 return NULL; 4739 return NULL;
4781 if (!move_file()) 4740 if (!(mc.flags & MOVE_FILE))
4782 return NULL; 4741 return NULL;
4783 4742
4784 mapping = vma->vm_file->f_mapping; 4743 mapping = vma->vm_file->f_mapping;
@@ -4857,7 +4816,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4857 4816
4858 page = pmd_page(pmd); 4817 page = pmd_page(pmd);
4859 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4818 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4860 if (!move_anon()) 4819 if (!(mc.flags & MOVE_ANON))
4861 return ret; 4820 return ret;
4862 if (page->mem_cgroup == mc.from) { 4821 if (page->mem_cgroup == mc.from) {
4863 ret = MC_TARGET_PAGE; 4822 ret = MC_TARGET_PAGE;
@@ -4880,7 +4839,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4880 unsigned long addr, unsigned long end, 4839 unsigned long addr, unsigned long end,
4881 struct mm_walk *walk) 4840 struct mm_walk *walk)
4882{ 4841{
4883 struct vm_area_struct *vma = walk->private; 4842 struct vm_area_struct *vma = walk->vma;
4884 pte_t *pte; 4843 pte_t *pte;
4885 spinlock_t *ptl; 4844 spinlock_t *ptl;
4886 4845
@@ -4906,20 +4865,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4906static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4865static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4907{ 4866{
4908 unsigned long precharge; 4867 unsigned long precharge;
4909 struct vm_area_struct *vma;
4910 4868
4869 struct mm_walk mem_cgroup_count_precharge_walk = {
4870 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4871 .mm = mm,
4872 };
4911 down_read(&mm->mmap_sem); 4873 down_read(&mm->mmap_sem);
4912 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4874 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
4913 struct mm_walk mem_cgroup_count_precharge_walk = {
4914 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4915 .mm = mm,
4916 .private = vma,
4917 };
4918 if (is_vm_hugetlb_page(vma))
4919 continue;
4920 walk_page_range(vma->vm_start, vma->vm_end,
4921 &mem_cgroup_count_precharge_walk);
4922 }
4923 up_read(&mm->mmap_sem); 4875 up_read(&mm->mmap_sem);
4924 4876
4925 precharge = mc.precharge; 4877 precharge = mc.precharge;
@@ -4999,15 +4951,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
4999 struct task_struct *p = cgroup_taskset_first(tset); 4951 struct task_struct *p = cgroup_taskset_first(tset);
5000 int ret = 0; 4952 int ret = 0;
5001 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4953 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5002 unsigned long move_charge_at_immigrate; 4954 unsigned long move_flags;
5003 4955
5004 /* 4956 /*
5005 * We are now committed to this value whatever it is. Changes in this 4957 * We are now committed to this value whatever it is. Changes in this
5006 * tunable will only affect upcoming migrations, not the current one. 4958 * tunable will only affect upcoming migrations, not the current one.
5007 * So we need to save it, and keep it going. 4959 * So we need to save it, and keep it going.
5008 */ 4960 */
5009 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 4961 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
5010 if (move_charge_at_immigrate) { 4962 if (move_flags) {
5011 struct mm_struct *mm; 4963 struct mm_struct *mm;
5012 struct mem_cgroup *from = mem_cgroup_from_task(p); 4964 struct mem_cgroup *from = mem_cgroup_from_task(p);
5013 4965
@@ -5027,7 +4979,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5027 spin_lock(&mc.lock); 4979 spin_lock(&mc.lock);
5028 mc.from = from; 4980 mc.from = from;
5029 mc.to = memcg; 4981 mc.to = memcg;
5030 mc.immigrate_flags = move_charge_at_immigrate; 4982 mc.flags = move_flags;
5031 spin_unlock(&mc.lock); 4983 spin_unlock(&mc.lock);
5032 /* We set mc.moving_task later */ 4984 /* We set mc.moving_task later */
5033 4985
@@ -5052,7 +5004,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5052 struct mm_walk *walk) 5004 struct mm_walk *walk)
5053{ 5005{
5054 int ret = 0; 5006 int ret = 0;
5055 struct vm_area_struct *vma = walk->private; 5007 struct vm_area_struct *vma = walk->vma;
5056 pte_t *pte; 5008 pte_t *pte;
5057 spinlock_t *ptl; 5009 spinlock_t *ptl;
5058 enum mc_target_type target_type; 5010 enum mc_target_type target_type;
@@ -5148,7 +5100,10 @@ put: /* get_mctgt_type() gets the page */
5148 5100
5149static void mem_cgroup_move_charge(struct mm_struct *mm) 5101static void mem_cgroup_move_charge(struct mm_struct *mm)
5150{ 5102{
5151 struct vm_area_struct *vma; 5103 struct mm_walk mem_cgroup_move_charge_walk = {
5104 .pmd_entry = mem_cgroup_move_charge_pte_range,
5105 .mm = mm,
5106 };
5152 5107
5153 lru_add_drain_all(); 5108 lru_add_drain_all();
5154 /* 5109 /*
@@ -5171,24 +5126,11 @@ retry:
5171 cond_resched(); 5126 cond_resched();
5172 goto retry; 5127 goto retry;
5173 } 5128 }
5174 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5129 /*
5175 int ret; 5130 * When we have consumed all precharges and failed in doing
5176 struct mm_walk mem_cgroup_move_charge_walk = { 5131 * additional charge, the page walk just aborts.
5177 .pmd_entry = mem_cgroup_move_charge_pte_range, 5132 */
5178 .mm = mm, 5133 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
5179 .private = vma,
5180 };
5181 if (is_vm_hugetlb_page(vma))
5182 continue;
5183 ret = walk_page_range(vma->vm_start, vma->vm_end,
5184 &mem_cgroup_move_charge_walk);
5185 if (ret)
5186 /*
5187 * means we have consumed all precharges and failed in
5188 * doing additional charge. Just abandon here.
5189 */
5190 break;
5191 }
5192 up_read(&mm->mmap_sem); 5134 up_read(&mm->mmap_sem);
5193 atomic_dec(&mc.from->moving_account); 5135 atomic_dec(&mc.from->moving_account);
5194} 5136}
@@ -5239,118 +5181,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5239 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5181 mem_cgroup_from_css(root_css)->use_hierarchy = true;
5240} 5182}
5241 5183
5242struct cgroup_subsys memory_cgrp_subsys = { 5184static u64 memory_current_read(struct cgroup_subsys_state *css,
5243 .css_alloc = mem_cgroup_css_alloc, 5185 struct cftype *cft)
5244 .css_online = mem_cgroup_css_online, 5186{
5245 .css_offline = mem_cgroup_css_offline, 5187 return mem_cgroup_usage(mem_cgroup_from_css(css), false);
5246 .css_free = mem_cgroup_css_free, 5188}
5247 .css_reset = mem_cgroup_css_reset,
5248 .can_attach = mem_cgroup_can_attach,
5249 .cancel_attach = mem_cgroup_cancel_attach,
5250 .attach = mem_cgroup_move_task,
5251 .bind = mem_cgroup_bind,
5252 .legacy_cftypes = mem_cgroup_files,
5253 .early_init = 0,
5254};
5255 5189
5256#ifdef CONFIG_MEMCG_SWAP 5190static int memory_low_show(struct seq_file *m, void *v)
5257static int __init enable_swap_account(char *s)
5258{ 5191{
5259 if (!strcmp(s, "1")) 5192 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5260 really_do_swap_account = 1; 5193 unsigned long low = ACCESS_ONCE(memcg->low);
5261 else if (!strcmp(s, "0")) 5194
5262 really_do_swap_account = 0; 5195 if (low == PAGE_COUNTER_MAX)
5263 return 1; 5196 seq_puts(m, "infinity\n");
5197 else
5198 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5199
5200 return 0;
5264} 5201}
5265__setup("swapaccount=", enable_swap_account);
5266 5202
5267static void __init memsw_file_init(void) 5203static ssize_t memory_low_write(struct kernfs_open_file *of,
5204 char *buf, size_t nbytes, loff_t off)
5268{ 5205{
5269 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 5206 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5270 memsw_cgroup_files)); 5207 unsigned long low;
5208 int err;
5209
5210 buf = strstrip(buf);
5211 err = page_counter_memparse(buf, "infinity", &low);
5212 if (err)
5213 return err;
5214
5215 memcg->low = low;
5216
5217 return nbytes;
5271} 5218}
5272 5219
5273static void __init enable_swap_cgroup(void) 5220static int memory_high_show(struct seq_file *m, void *v)
5274{ 5221{
5275 if (!mem_cgroup_disabled() && really_do_swap_account) { 5222 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5276 do_swap_account = 1; 5223 unsigned long high = ACCESS_ONCE(memcg->high);
5277 memsw_file_init(); 5224
5278 } 5225 if (high == PAGE_COUNTER_MAX)
5226 seq_puts(m, "infinity\n");
5227 else
5228 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5229
5230 return 0;
5279} 5231}
5280 5232
5281#else 5233static ssize_t memory_high_write(struct kernfs_open_file *of,
5282static void __init enable_swap_cgroup(void) 5234 char *buf, size_t nbytes, loff_t off)
5283{ 5235{
5236 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5237 unsigned long high;
5238 int err;
5239
5240 buf = strstrip(buf);
5241 err = page_counter_memparse(buf, "infinity", &high);
5242 if (err)
5243 return err;
5244
5245 memcg->high = high;
5246
5247 return nbytes;
5284} 5248}
5285#endif
5286 5249
5287#ifdef CONFIG_MEMCG_SWAP 5250static int memory_max_show(struct seq_file *m, void *v)
5288/**
5289 * mem_cgroup_swapout - transfer a memsw charge to swap
5290 * @page: page whose memsw charge to transfer
5291 * @entry: swap entry to move the charge to
5292 *
5293 * Transfer the memsw charge of @page to @entry.
5294 */
5295void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5296{ 5251{
5297 struct mem_cgroup *memcg; 5252 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5298 unsigned short oldid; 5253 unsigned long max = ACCESS_ONCE(memcg->memory.limit);
5299 5254
5300 VM_BUG_ON_PAGE(PageLRU(page), page); 5255 if (max == PAGE_COUNTER_MAX)
5301 VM_BUG_ON_PAGE(page_count(page), page); 5256 seq_puts(m, "infinity\n");
5257 else
5258 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5302 5259
5303 if (!do_swap_account) 5260 return 0;
5304 return; 5261}
5305 5262
5306 memcg = page->mem_cgroup; 5263static ssize_t memory_max_write(struct kernfs_open_file *of,
5264 char *buf, size_t nbytes, loff_t off)
5265{
5266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5267 unsigned long max;
5268 int err;
5307 5269
5308 /* Readahead page, never charged */ 5270 buf = strstrip(buf);
5309 if (!memcg) 5271 err = page_counter_memparse(buf, "infinity", &max);
5310 return; 5272 if (err)
5273 return err;
5311 5274
5312 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5275 err = mem_cgroup_resize_limit(memcg, max);
5313 VM_BUG_ON_PAGE(oldid, page); 5276 if (err)
5314 mem_cgroup_swap_statistics(memcg, true); 5277 return err;
5315 5278
5316 page->mem_cgroup = NULL; 5279 return nbytes;
5280}
5317 5281
5318 if (!mem_cgroup_is_root(memcg)) 5282static int memory_events_show(struct seq_file *m, void *v)
5319 page_counter_uncharge(&memcg->memory, 1); 5283{
5284 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5320 5285
5321 /* XXX: caller holds IRQ-safe mapping->tree_lock */ 5286 seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5322 VM_BUG_ON(!irqs_disabled()); 5287 seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5288 seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5289 seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5323 5290
5324 mem_cgroup_charge_statistics(memcg, page, -1); 5291 return 0;
5325 memcg_check_events(memcg, page); 5292}
5293
5294static struct cftype memory_files[] = {
5295 {
5296 .name = "current",
5297 .read_u64 = memory_current_read,
5298 },
5299 {
5300 .name = "low",
5301 .flags = CFTYPE_NOT_ON_ROOT,
5302 .seq_show = memory_low_show,
5303 .write = memory_low_write,
5304 },
5305 {
5306 .name = "high",
5307 .flags = CFTYPE_NOT_ON_ROOT,
5308 .seq_show = memory_high_show,
5309 .write = memory_high_write,
5310 },
5311 {
5312 .name = "max",
5313 .flags = CFTYPE_NOT_ON_ROOT,
5314 .seq_show = memory_max_show,
5315 .write = memory_max_write,
5316 },
5317 {
5318 .name = "events",
5319 .flags = CFTYPE_NOT_ON_ROOT,
5320 .seq_show = memory_events_show,
5321 },
5322 { } /* terminate */
5323};
5324
5325struct cgroup_subsys memory_cgrp_subsys = {
5326 .css_alloc = mem_cgroup_css_alloc,
5327 .css_online = mem_cgroup_css_online,
5328 .css_offline = mem_cgroup_css_offline,
5329 .css_free = mem_cgroup_css_free,
5330 .css_reset = mem_cgroup_css_reset,
5331 .can_attach = mem_cgroup_can_attach,
5332 .cancel_attach = mem_cgroup_cancel_attach,
5333 .attach = mem_cgroup_move_task,
5334 .bind = mem_cgroup_bind,
5335 .dfl_cftypes = memory_files,
5336 .legacy_cftypes = mem_cgroup_legacy_files,
5337 .early_init = 0,
5338};
5339
5340/**
5341 * mem_cgroup_events - count memory events against a cgroup
5342 * @memcg: the memory cgroup
5343 * @idx: the event index
5344 * @nr: the number of events to account for
5345 */
5346void mem_cgroup_events(struct mem_cgroup *memcg,
5347 enum mem_cgroup_events_index idx,
5348 unsigned int nr)
5349{
5350 this_cpu_add(memcg->stat->events[idx], nr);
5326} 5351}
5327 5352
5328/** 5353/**
5329 * mem_cgroup_uncharge_swap - uncharge a swap entry 5354 * mem_cgroup_low - check if memory consumption is below the normal range
5330 * @entry: swap entry to uncharge 5355 * @root: the highest ancestor to consider
5356 * @memcg: the memory cgroup to check
5331 * 5357 *
5332 * Drop the memsw charge associated with @entry. 5358 * Returns %true if memory consumption of @memcg, and that of all
5359 * configurable ancestors up to @root, is below the normal range.
5333 */ 5360 */
5334void mem_cgroup_uncharge_swap(swp_entry_t entry) 5361bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5335{ 5362{
5336 struct mem_cgroup *memcg; 5363 if (mem_cgroup_disabled())
5337 unsigned short id; 5364 return false;
5338 5365
5339 if (!do_swap_account) 5366 /*
5340 return; 5367 * The toplevel group doesn't have a configurable range, so
5368 * it's never low when looked at directly, and it is not
5369 * considered an ancestor when assessing the hierarchy.
5370 */
5341 5371
5342 id = swap_cgroup_record(entry, 0); 5372 if (memcg == root_mem_cgroup)
5343 rcu_read_lock(); 5373 return false;
5344 memcg = mem_cgroup_lookup(id); 5374
5345 if (memcg) { 5375 if (page_counter_read(&memcg->memory) > memcg->low)
5346 if (!mem_cgroup_is_root(memcg)) 5376 return false;
5347 page_counter_uncharge(&memcg->memsw, 1); 5377
5348 mem_cgroup_swap_statistics(memcg, false); 5378 while (memcg != root) {
5349 css_put(&memcg->css); 5379 memcg = parent_mem_cgroup(memcg);
5380
5381 if (memcg == root_mem_cgroup)
5382 break;
5383
5384 if (page_counter_read(&memcg->memory) > memcg->low)
5385 return false;
5350 } 5386 }
5351 rcu_read_unlock(); 5387 return true;
5352} 5388}
5353#endif
5354 5389
5355/** 5390/**
5356 * mem_cgroup_try_charge - try charging a page 5391 * mem_cgroup_try_charge - try charging a page
@@ -5684,10 +5719,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5684 */ 5719 */
5685static int __init mem_cgroup_init(void) 5720static int __init mem_cgroup_init(void)
5686{ 5721{
5722 int cpu, node;
5723
5687 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5724 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5688 enable_swap_cgroup(); 5725
5689 mem_cgroup_soft_limit_tree_init(); 5726 for_each_possible_cpu(cpu)
5690 memcg_stock_init(); 5727 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5728 drain_local_stock);
5729
5730 for_each_node(node) {
5731 struct mem_cgroup_tree_per_node *rtpn;
5732 int zone;
5733
5734 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5735 node_online(node) ? node : NUMA_NO_NODE);
5736
5737 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5738 struct mem_cgroup_tree_per_zone *rtpz;
5739
5740 rtpz = &rtpn->rb_tree_per_zone[zone];
5741 rtpz->rb_root = RB_ROOT;
5742 spin_lock_init(&rtpz->lock);
5743 }
5744 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5745 }
5746
5691 return 0; 5747 return 0;
5692} 5748}
5693subsys_initcall(mem_cgroup_init); 5749subsys_initcall(mem_cgroup_init);
5750
5751#ifdef CONFIG_MEMCG_SWAP
5752/**
5753 * mem_cgroup_swapout - transfer a memsw charge to swap
5754 * @page: page whose memsw charge to transfer
5755 * @entry: swap entry to move the charge to
5756 *
5757 * Transfer the memsw charge of @page to @entry.
5758 */
5759void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5760{
5761 struct mem_cgroup *memcg;
5762 unsigned short oldid;
5763
5764 VM_BUG_ON_PAGE(PageLRU(page), page);
5765 VM_BUG_ON_PAGE(page_count(page), page);
5766
5767 if (!do_swap_account)
5768 return;
5769
5770 memcg = page->mem_cgroup;
5771
5772 /* Readahead page, never charged */
5773 if (!memcg)
5774 return;
5775
5776 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5777 VM_BUG_ON_PAGE(oldid, page);
5778 mem_cgroup_swap_statistics(memcg, true);
5779
5780 page->mem_cgroup = NULL;
5781
5782 if (!mem_cgroup_is_root(memcg))
5783 page_counter_uncharge(&memcg->memory, 1);
5784
5785 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5786 VM_BUG_ON(!irqs_disabled());
5787
5788 mem_cgroup_charge_statistics(memcg, page, -1);
5789 memcg_check_events(memcg, page);
5790}
5791
5792/**
5793 * mem_cgroup_uncharge_swap - uncharge a swap entry
5794 * @entry: swap entry to uncharge
5795 *
5796 * Drop the memsw charge associated with @entry.
5797 */
5798void mem_cgroup_uncharge_swap(swp_entry_t entry)
5799{
5800 struct mem_cgroup *memcg;
5801 unsigned short id;
5802
5803 if (!do_swap_account)
5804 return;
5805
5806 id = swap_cgroup_record(entry, 0);
5807 rcu_read_lock();
5808 memcg = mem_cgroup_lookup(id);
5809 if (memcg) {
5810 if (!mem_cgroup_is_root(memcg))
5811 page_counter_uncharge(&memcg->memsw, 1);
5812 mem_cgroup_swap_statistics(memcg, false);
5813 css_put(&memcg->css);
5814 }
5815 rcu_read_unlock();
5816}
5817
5818/* for remember boot option*/
5819#ifdef CONFIG_MEMCG_SWAP_ENABLED
5820static int really_do_swap_account __initdata = 1;
5821#else
5822static int really_do_swap_account __initdata;
5823#endif
5824
5825static int __init enable_swap_account(char *s)
5826{
5827 if (!strcmp(s, "1"))
5828 really_do_swap_account = 1;
5829 else if (!strcmp(s, "0"))
5830 really_do_swap_account = 0;
5831 return 1;
5832}
5833__setup("swapaccount=", enable_swap_account);
5834
5835static struct cftype memsw_cgroup_files[] = {
5836 {
5837 .name = "memsw.usage_in_bytes",
5838 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5839 .read_u64 = mem_cgroup_read_u64,
5840 },
5841 {
5842 .name = "memsw.max_usage_in_bytes",
5843 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5844 .write = mem_cgroup_reset,
5845 .read_u64 = mem_cgroup_read_u64,
5846 },
5847 {
5848 .name = "memsw.limit_in_bytes",
5849 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5850 .write = mem_cgroup_write,
5851 .read_u64 = mem_cgroup_read_u64,
5852 },
5853 {
5854 .name = "memsw.failcnt",
5855 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5856 .write = mem_cgroup_reset,
5857 .read_u64 = mem_cgroup_read_u64,
5858 },
5859 { }, /* terminate */
5860};
5861
5862static int __init mem_cgroup_swap_init(void)
5863{
5864 if (!mem_cgroup_disabled() && really_do_swap_account) {
5865 do_swap_account = 1;
5866 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5867 memsw_cgroup_files));
5868 }
5869 return 0;
5870}
5871subsys_initcall(mem_cgroup_swap_init);
5872
5873#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
428 pmd = pmd_offset(pud, start); 428 pmd = pmd_offset(pud, start);
429 pud_clear(pud); 429 pud_clear(pud);
430 pmd_free_tlb(tlb, pmd, start); 430 pmd_free_tlb(tlb, pmd, start);
431 mm_dec_nr_pmds(tlb->mm);
431} 432}
432 433
433static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 434static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3322 3323
3323 spin_lock(&mm->page_table_lock); 3324 spin_lock(&mm->page_table_lock);
3324#ifndef __ARCH_HAS_4LEVEL_HACK 3325#ifndef __ARCH_HAS_4LEVEL_HACK
3325 if (pud_present(*pud)) /* Another has populated it */ 3326 if (!pud_present(*pud)) {
3326 pmd_free(mm, new); 3327 mm_inc_nr_pmds(mm);
3327 else
3328 pud_populate(mm, pud, new); 3328 pud_populate(mm, pud, new);
3329#else 3329 } else /* Another has populated it */
3330 if (pgd_present(*pud)) /* Another has populated it */
3331 pmd_free(mm, new); 3330 pmd_free(mm, new);
3332 else 3331#else
3332 if (!pgd_present(*pud)) {
3333 mm_inc_nr_pmds(mm);
3333 pgd_populate(mm, pud, new); 3334 pgd_populate(mm, pud, new);
3335 } else /* Another has populated it */
3336 pmd_free(mm, new);
3334#endif /* __ARCH_HAS_4LEVEL_HACK */ 3337#endif /* __ARCH_HAS_4LEVEL_HACK */
3335 spin_unlock(&mm->page_table_lock); 3338 spin_unlock(&mm->page_table_lock);
3336 return 0; 3339 return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
471static void migrate_page_add(struct page *page, struct list_head *pagelist, 471static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 unsigned long flags); 472 unsigned long flags);
473 473
474struct queue_pages {
475 struct list_head *pagelist;
476 unsigned long flags;
477 nodemask_t *nmask;
478 struct vm_area_struct *prev;
479};
480
474/* 481/*
475 * Scan through pages checking if pages follow certain conditions, 482 * Scan through pages checking if pages follow certain conditions,
476 * and move them to the pagelist if they do. 483 * and move them to the pagelist if they do.
477 */ 484 */
478static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 485static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
479 unsigned long addr, unsigned long end, 486 unsigned long end, struct mm_walk *walk)
480 const nodemask_t *nodes, unsigned long flags,
481 void *private)
482{ 487{
483 pte_t *orig_pte; 488 struct vm_area_struct *vma = walk->vma;
489 struct page *page;
490 struct queue_pages *qp = walk->private;
491 unsigned long flags = qp->flags;
492 int nid;
484 pte_t *pte; 493 pte_t *pte;
485 spinlock_t *ptl; 494 spinlock_t *ptl;
486 495
487 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 496 split_huge_page_pmd(vma, addr, pmd);
488 do { 497 if (pmd_trans_unstable(pmd))
489 struct page *page; 498 return 0;
490 int nid;
491 499
500 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501 for (; addr != end; pte++, addr += PAGE_SIZE) {
492 if (!pte_present(*pte)) 502 if (!pte_present(*pte))
493 continue; 503 continue;
494 page = vm_normal_page(vma, addr, *pte); 504 page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
501 if (PageReserved(page)) 511 if (PageReserved(page))
502 continue; 512 continue;
503 nid = page_to_nid(page); 513 nid = page_to_nid(page);
504 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 514 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
505 continue; 515 continue;
506 516
507 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 517 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
508 migrate_page_add(page, private, flags); 518 migrate_page_add(page, qp->pagelist, flags);
509 else 519 }
510 break; 520 pte_unmap_unlock(pte - 1, ptl);
511 } while (pte++, addr += PAGE_SIZE, addr != end); 521 cond_resched();
512 pte_unmap_unlock(orig_pte, ptl); 522 return 0;
513 return addr != end;
514} 523}
515 524
516static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, 525static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
517 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, 526 unsigned long addr, unsigned long end,
518 void *private) 527 struct mm_walk *walk)
519{ 528{
520#ifdef CONFIG_HUGETLB_PAGE 529#ifdef CONFIG_HUGETLB_PAGE
530 struct queue_pages *qp = walk->private;
531 unsigned long flags = qp->flags;
521 int nid; 532 int nid;
522 struct page *page; 533 struct page *page;
523 spinlock_t *ptl; 534 spinlock_t *ptl;
524 pte_t entry; 535 pte_t entry;
525 536
526 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); 537 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
527 entry = huge_ptep_get((pte_t *)pmd); 538 entry = huge_ptep_get(pte);
528 if (!pte_present(entry)) 539 if (!pte_present(entry))
529 goto unlock; 540 goto unlock;
530 page = pte_page(entry); 541 page = pte_page(entry);
531 nid = page_to_nid(page); 542 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 543 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
533 goto unlock; 544 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 545 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) || 546 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 547 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private); 548 isolate_huge_page(page, qp->pagelist);
538unlock: 549unlock:
539 spin_unlock(ptl); 550 spin_unlock(ptl);
540#else 551#else
541 BUG(); 552 BUG();
542#endif 553#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
546 unsigned long addr, unsigned long end,
547 const nodemask_t *nodes, unsigned long flags,
548 void *private)
549{
550 pmd_t *pmd;
551 unsigned long next;
552
553 pmd = pmd_offset(pud, addr);
554 do {
555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
563 split_huge_page_pmd(vma, addr, pmd);
564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
565 continue;
566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
567 flags, private))
568 return -EIO;
569 } while (pmd++, addr = next, addr != end);
570 return 0;
571}
572
573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
574 unsigned long addr, unsigned long end,
575 const nodemask_t *nodes, unsigned long flags,
576 void *private)
577{
578 pud_t *pud;
579 unsigned long next;
580
581 pud = pud_offset(pgd, addr);
582 do {
583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
586 if (pud_none_or_clear_bad(pud))
587 continue;
588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
589 flags, private))
590 return -EIO;
591 } while (pud++, addr = next, addr != end);
592 return 0;
593}
594
595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
596 unsigned long addr, unsigned long end,
597 const nodemask_t *nodes, unsigned long flags,
598 void *private)
599{
600 pgd_t *pgd;
601 unsigned long next;
602
603 pgd = pgd_offset(vma->vm_mm, addr);
604 do {
605 next = pgd_addr_end(addr, end);
606 if (pgd_none_or_clear_bad(pgd))
607 continue;
608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
609 flags, private))
610 return -EIO;
611 } while (pgd++, addr = next, addr != end);
612 return 0; 554 return 0;
613} 555}
614 556
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
641} 583}
642#endif /* CONFIG_NUMA_BALANCING */ 584#endif /* CONFIG_NUMA_BALANCING */
643 585
586static int queue_pages_test_walk(unsigned long start, unsigned long end,
587 struct mm_walk *walk)
588{
589 struct vm_area_struct *vma = walk->vma;
590 struct queue_pages *qp = walk->private;
591 unsigned long endvma = vma->vm_end;
592 unsigned long flags = qp->flags;
593
594 if (vma->vm_flags & VM_PFNMAP)
595 return 1;
596
597 if (endvma > end)
598 endvma = end;
599 if (vma->vm_start > start)
600 start = vma->vm_start;
601
602 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
603 if (!vma->vm_next && vma->vm_end < end)
604 return -EFAULT;
605 if (qp->prev && qp->prev->vm_end < vma->vm_start)
606 return -EFAULT;
607 }
608
609 qp->prev = vma;
610
611 if (vma->vm_flags & VM_PFNMAP)
612 return 1;
613
614 if (flags & MPOL_MF_LAZY) {
615 /* Similar to task_numa_work, skip inaccessible VMAs */
616 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
617 change_prot_numa(vma, start, endvma);
618 return 1;
619 }
620
621 if ((flags & MPOL_MF_STRICT) ||
622 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
623 vma_migratable(vma)))
624 /* queue pages from current vma */
625 return 0;
626 return 1;
627}
628
644/* 629/*
645 * Walk through page tables and collect pages to be migrated. 630 * Walk through page tables and collect pages to be migrated.
646 * 631 *
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
650 */ 635 */
651static int 636static int
652queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 637queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
653 const nodemask_t *nodes, unsigned long flags, void *private) 638 nodemask_t *nodes, unsigned long flags,
654{ 639 struct list_head *pagelist)
655 int err = 0; 640{
656 struct vm_area_struct *vma, *prev; 641 struct queue_pages qp = {
657 642 .pagelist = pagelist,
658 vma = find_vma(mm, start); 643 .flags = flags,
659 if (!vma) 644 .nmask = nodes,
660 return -EFAULT; 645 .prev = NULL,
661 prev = NULL; 646 };
662 for (; vma && vma->vm_start < end; vma = vma->vm_next) { 647 struct mm_walk queue_pages_walk = {
663 unsigned long endvma = vma->vm_end; 648 .hugetlb_entry = queue_pages_hugetlb,
664 649 .pmd_entry = queue_pages_pte_range,
665 if (endvma > end) 650 .test_walk = queue_pages_test_walk,
666 endvma = end; 651 .mm = mm,
667 if (vma->vm_start > start) 652 .private = &qp,
668 start = vma->vm_start; 653 };
669 654
670 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 655 return walk_page_range(start, end, &queue_pages_walk);
671 if (!vma->vm_next && vma->vm_end < end)
672 return -EFAULT;
673 if (prev && prev->vm_end < vma->vm_start)
674 return -EFAULT;
675 }
676
677 if (flags & MPOL_MF_LAZY) {
678 /* Similar to task_numa_work, skip inaccessible VMAs */
679 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
680 change_prot_numa(vma, start, endvma);
681 goto next;
682 }
683
684 if ((flags & MPOL_MF_STRICT) ||
685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 vma_migratable(vma))) {
687
688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 flags, private);
690 if (err)
691 break;
692 }
693next:
694 prev = vma;
695 }
696 return err;
697} 656}
698 657
699/* 658/*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1988 * @order:Order of the GFP allocation. 1947 * @order:Order of the GFP allocation.
1989 * @vma: Pointer to VMA or NULL if not available. 1948 * @vma: Pointer to VMA or NULL if not available.
1990 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1949 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1950 * @node: Which node to prefer for allocation (modulo policy).
1951 * @hugepage: for hugepages try only the preferred node if possible
1991 * 1952 *
1992 * This function allocates a page from the kernel page pool and applies 1953 * This function allocates a page from the kernel page pool and applies
1993 * a NUMA policy associated with the VMA or the current process. 1954 * a NUMA policy associated with the VMA or the current process.
1994 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 1955 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1995 * mm_struct of the VMA to prevent it from going away. Should be used for 1956 * mm_struct of the VMA to prevent it from going away. Should be used for
1996 * all allocations for pages that will be mapped into 1957 * all allocations for pages that will be mapped into user space. Returns
1997 * user space. Returns NULL when no page can be allocated. 1958 * NULL when no page can be allocated.
1998 *
1999 * Should be called with the mm_sem of the vma hold.
2000 */ 1959 */
2001struct page * 1960struct page *
2002alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1961alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2003 unsigned long addr, int node) 1962 unsigned long addr, int node, bool hugepage)
2004{ 1963{
2005 struct mempolicy *pol; 1964 struct mempolicy *pol;
2006 struct page *page; 1965 struct page *page;
2007 unsigned int cpuset_mems_cookie; 1966 unsigned int cpuset_mems_cookie;
1967 struct zonelist *zl;
1968 nodemask_t *nmask;
2008 1969
2009retry_cpuset: 1970retry_cpuset:
2010 pol = get_vma_policy(vma, addr); 1971 pol = get_vma_policy(vma, addr);
2011 cpuset_mems_cookie = read_mems_allowed_begin(); 1972 cpuset_mems_cookie = read_mems_allowed_begin();
2012 1973
2013 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1974 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
1975 pol->mode != MPOL_INTERLEAVE)) {
1976 /*
1977 * For hugepage allocation and non-interleave policy which
1978 * allows the current node, we only try to allocate from the
1979 * current node and don't fall back to other nodes, as the
1980 * cost of remote accesses would likely offset THP benefits.
1981 *
1982 * If the policy is interleave, or does not allow the current
1983 * node in its nodemask, we allocate the standard way.
1984 */
1985 nmask = policy_nodemask(gfp, pol);
1986 if (!nmask || node_isset(node, *nmask)) {
1987 mpol_cond_put(pol);
1988 page = alloc_pages_exact_node(node, gfp, order);
1989 goto out;
1990 }
1991 }
1992
1993 if (pol->mode == MPOL_INTERLEAVE) {
2014 unsigned nid; 1994 unsigned nid;
2015 1995
2016 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1996 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2017 mpol_cond_put(pol); 1997 mpol_cond_put(pol);
2018 page = alloc_page_interleave(gfp, order, nid); 1998 page = alloc_page_interleave(gfp, order, nid);
2019 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 1999 goto out;
2020 goto retry_cpuset;
2021
2022 return page;
2023 } 2000 }
2024 page = __alloc_pages_nodemask(gfp, order, 2001
2025 policy_zonelist(gfp, pol, node), 2002 nmask = policy_nodemask(gfp, pol);
2026 policy_nodemask(gfp, pol)); 2003 zl = policy_zonelist(gfp, pol, node);
2027 mpol_cond_put(pol); 2004 mpol_cond_put(pol);
2005 page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2006out:
2028 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2007 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2029 goto retry_cpuset; 2008 goto retry_cpuset;
2030 return page; 2009 return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6e284bcca8bb..f98067e5d353 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
197 * get to the page and wait until migration is finished. 197 * get to the page and wait until migration is finished.
198 * When we return from this function the fault will be retried. 198 * When we return from this function the fault will be retried.
199 */ 199 */
200static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 200void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
201 spinlock_t *ptl) 201 spinlock_t *ptl)
202{ 202{
203 pte_t pte; 203 pte_t pte;
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1236 goto put_and_set; 1236 goto put_and_set;
1237 1237
1238 if (PageHuge(page)) { 1238 if (PageHuge(page)) {
1239 isolate_huge_page(page, &pagelist); 1239 if (PageHead(page))
1240 isolate_huge_page(page, &pagelist);
1240 goto put_and_set; 1241 goto put_and_set;
1241 } 1242 }
1242 1243
diff --git a/mm/mincore.c b/mm/mincore.c
index 46527c023e0c..be25efde64a4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,38 +19,25 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static void mincore_hugetlb_page_range(struct vm_area_struct *vma, 22static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
23 unsigned long addr, unsigned long end, 23 unsigned long end, struct mm_walk *walk)
24 unsigned char *vec)
25{ 24{
26#ifdef CONFIG_HUGETLB_PAGE 25#ifdef CONFIG_HUGETLB_PAGE
27 struct hstate *h; 26 unsigned char present;
27 unsigned char *vec = walk->private;
28 28
29 h = hstate_vma(vma); 29 /*
30 while (1) { 30 * Hugepages under user process are always in RAM and never
31 unsigned char present; 31 * swapped out, but theoretically it needs to be checked.
32 pte_t *ptep; 32 */
33 /* 33 present = pte && !huge_pte_none(huge_ptep_get(pte));
34 * Huge pages are always in RAM for now, but 34 for (; addr != end; vec++, addr += PAGE_SIZE)
35 * theoretically it needs to be checked. 35 *vec = present;
36 */ 36 walk->private = vec;
37 ptep = huge_pte_offset(current->mm,
38 addr & huge_page_mask(h));
39 present = ptep && !huge_pte_none(huge_ptep_get(ptep));
40 while (1) {
41 *vec = present;
42 vec++;
43 addr += PAGE_SIZE;
44 if (addr == end)
45 return;
46 /* check hugepage border */
47 if (!(addr & ~huge_page_mask(h)))
48 break;
49 }
50 }
51#else 37#else
52 BUG(); 38 BUG();
53#endif 39#endif
40 return 0;
54} 41}
55 42
56/* 43/*
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
94 return present; 81 return present;
95} 82}
96 83
97static void mincore_unmapped_range(struct vm_area_struct *vma, 84static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
98 unsigned long addr, unsigned long end, 85 struct vm_area_struct *vma, unsigned char *vec)
99 unsigned char *vec)
100{ 86{
101 unsigned long nr = (end - addr) >> PAGE_SHIFT; 87 unsigned long nr = (end - addr) >> PAGE_SHIFT;
102 int i; 88 int i;
@@ -111,23 +97,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
111 for (i = 0; i < nr; i++) 97 for (i = 0; i < nr; i++)
112 vec[i] = 0; 98 vec[i] = 0;
113 } 99 }
100 return nr;
101}
102
103static int mincore_unmapped_range(unsigned long addr, unsigned long end,
104 struct mm_walk *walk)
105{
106 walk->private += __mincore_unmapped_range(addr, end,
107 walk->vma, walk->private);
108 return 0;
114} 109}
115 110
116static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 111static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
117 unsigned long addr, unsigned long end, 112 struct mm_walk *walk)
118 unsigned char *vec)
119{ 113{
120 unsigned long next;
121 spinlock_t *ptl; 114 spinlock_t *ptl;
115 struct vm_area_struct *vma = walk->vma;
122 pte_t *ptep; 116 pte_t *ptep;
117 unsigned char *vec = walk->private;
118 int nr = (end - addr) >> PAGE_SHIFT;
119
120 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
121 memset(vec, 1, nr);
122 spin_unlock(ptl);
123 goto out;
124 }
125
126 if (pmd_trans_unstable(pmd)) {
127 __mincore_unmapped_range(addr, end, vma, vec);
128 goto out;
129 }
123 130
124 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 131 ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
125 do { 132 for (; addr != end; ptep++, addr += PAGE_SIZE) {
126 pte_t pte = *ptep; 133 pte_t pte = *ptep;
127 134
128 next = addr + PAGE_SIZE;
129 if (pte_none(pte)) 135 if (pte_none(pte))
130 mincore_unmapped_range(vma, addr, next, vec); 136 __mincore_unmapped_range(addr, addr + PAGE_SIZE,
137 vma, vec);
131 else if (pte_present(pte)) 138 else if (pte_present(pte))
132 *vec = 1; 139 *vec = 1;
133 else { /* pte is a swap entry */ 140 else { /* pte is a swap entry */
@@ -150,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
150 } 157 }
151 } 158 }
152 vec++; 159 vec++;
153 } while (ptep++, addr = next, addr != end); 160 }
154 pte_unmap_unlock(ptep - 1, ptl); 161 pte_unmap_unlock(ptep - 1, ptl);
155} 162out:
156 163 walk->private += nr;
157static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, 164 cond_resched();
158 unsigned long addr, unsigned long end, 165 return 0;
159 unsigned char *vec)
160{
161 unsigned long next;
162 pmd_t *pmd;
163
164 pmd = pmd_offset(pud, addr);
165 do {
166 next = pmd_addr_end(addr, end);
167 if (pmd_trans_huge(*pmd)) {
168 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
169 vec += (next - addr) >> PAGE_SHIFT;
170 continue;
171 }
172 /* fall through */
173 }
174 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
175 mincore_unmapped_range(vma, addr, next, vec);
176 else
177 mincore_pte_range(vma, pmd, addr, next, vec);
178 vec += (next - addr) >> PAGE_SHIFT;
179 } while (pmd++, addr = next, addr != end);
180}
181
182static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
183 unsigned long addr, unsigned long end,
184 unsigned char *vec)
185{
186 unsigned long next;
187 pud_t *pud;
188
189 pud = pud_offset(pgd, addr);
190 do {
191 next = pud_addr_end(addr, end);
192 if (pud_none_or_clear_bad(pud))
193 mincore_unmapped_range(vma, addr, next, vec);
194 else
195 mincore_pmd_range(vma, pud, addr, next, vec);
196 vec += (next - addr) >> PAGE_SHIFT;
197 } while (pud++, addr = next, addr != end);
198}
199
200static void mincore_page_range(struct vm_area_struct *vma,
201 unsigned long addr, unsigned long end,
202 unsigned char *vec)
203{
204 unsigned long next;
205 pgd_t *pgd;
206
207 pgd = pgd_offset(vma->vm_mm, addr);
208 do {
209 next = pgd_addr_end(addr, end);
210 if (pgd_none_or_clear_bad(pgd))
211 mincore_unmapped_range(vma, addr, next, vec);
212 else
213 mincore_pud_range(vma, pgd, addr, next, vec);
214 vec += (next - addr) >> PAGE_SHIFT;
215 } while (pgd++, addr = next, addr != end);
216} 166}
217 167
218/* 168/*
@@ -224,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
224{ 174{
225 struct vm_area_struct *vma; 175 struct vm_area_struct *vma;
226 unsigned long end; 176 unsigned long end;
177 int err;
178 struct mm_walk mincore_walk = {
179 .pmd_entry = mincore_pte_range,
180 .pte_hole = mincore_unmapped_range,
181 .hugetlb_entry = mincore_hugetlb,
182 .private = vec,
183 };
227 184
228 vma = find_vma(current->mm, addr); 185 vma = find_vma(current->mm, addr);
229 if (!vma || addr < vma->vm_start) 186 if (!vma || addr < vma->vm_start)
230 return -ENOMEM; 187 return -ENOMEM;
231 188 mincore_walk.mm = vma->vm_mm;
232 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); 189 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
233 190 err = walk_page_range(addr, end, &mincore_walk);
234 if (is_vm_hugetlb_page(vma)) 191 if (err < 0)
235 mincore_hugetlb_page_range(vma, addr, end, vec); 192 return err;
236 else
237 mincore_page_range(vma, addr, end, vec);
238
239 return (end - addr) >> PAGE_SHIFT; 193 return (end - addr) >> PAGE_SHIFT;
240} 194}
241 195
diff --git a/mm/mmap.c b/mm/mmap.c
index 14d84666e8ba..da9990acc08b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
152 */ 152 */
153int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 153int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
154{ 154{
155 unsigned long free, allowed, reserve; 155 long free, allowed, reserve;
156 156
157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < 157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
158 -(s64)vm_committed_as_batch * num_online_cpus(), 158 -(s64)vm_committed_as_batch * num_online_cpus(),
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
220 */ 220 */
221 if (mm) { 221 if (mm) {
222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
223 allowed -= min(mm->total_vm / 32, reserve); 223 allowed -= min_t(long, mm->total_vm / 32, reserve);
224 } 224 }
225 225
226 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 226 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@ -2851,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm)
2851 vma = remove_vma(vma); 2851 vma = remove_vma(vma);
2852 } 2852 }
2853 vm_unacct_memory(nr_accounted); 2853 vm_unacct_memory(nr_accounted);
2854
2855 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2856 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2857} 2854}
2858 2855
2859/* Insert vm structure into process list sorted by address 2856/* Insert vm structure into process list sorted by address
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8556db..7d87ebb0d632 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
54/* Returns the next zone at or below highest_zoneidx in a zonelist */ 54/* Returns the next zone at or below highest_zoneidx in a zonelist */
55struct zoneref *next_zones_zonelist(struct zoneref *z, 55struct zoneref *next_zones_zonelist(struct zoneref *z,
56 enum zone_type highest_zoneidx, 56 enum zone_type highest_zoneidx,
57 nodemask_t *nodes, 57 nodemask_t *nodes)
58 struct zone **zone)
59{ 58{
60 /* 59 /*
61 * Find the next suitable zone to use for the allocation. 60 * Find the next suitable zone to use for the allocation.
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 68 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 69 z++;
71 70
72 *zone = zonelist_zone(z);
73 return z; 71 return z;
74} 72}
75 73
diff --git a/mm/nommu.c b/mm/nommu.c
index 541bed64e348..1a19fb3b0463 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
214} 214}
215EXPORT_SYMBOL(get_user_pages); 215EXPORT_SYMBOL(get_user_pages);
216 216
217long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
218 unsigned long start, unsigned long nr_pages,
219 int write, int force, struct page **pages,
220 int *locked)
221{
222 return get_user_pages(tsk, mm, start, nr_pages, write, force,
223 pages, NULL);
224}
225EXPORT_SYMBOL(get_user_pages_locked);
226
227long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
228 unsigned long start, unsigned long nr_pages,
229 int write, int force, struct page **pages,
230 unsigned int gup_flags)
231{
232 long ret;
233 down_read(&mm->mmap_sem);
234 ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
235 pages, NULL);
236 up_read(&mm->mmap_sem);
237 return ret;
238}
239EXPORT_SYMBOL(__get_user_pages_unlocked);
240
241long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
242 unsigned long start, unsigned long nr_pages,
243 int write, int force, struct page **pages)
244{
245 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
246 force, pages, 0);
247}
248EXPORT_SYMBOL(get_user_pages_unlocked);
249
217/** 250/**
218 * follow_pfn - look up PFN at a user virtual address 251 * follow_pfn - look up PFN at a user virtual address
219 * @vma: memory mapping 252 * @vma: memory mapping
@@ -1895,7 +1928,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
1895 */ 1928 */
1896int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1929int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1897{ 1930{
1898 unsigned long free, allowed, reserve; 1931 long free, allowed, reserve;
1899 1932
1900 vm_acct_memory(pages); 1933 vm_acct_memory(pages);
1901 1934
@@ -1959,7 +1992,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1959 */ 1992 */
1960 if (mm) { 1993 if (mm) {
1961 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 1994 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
1962 allowed -= min(mm->total_vm / 32, reserve); 1995 allowed -= min_t(long, mm->total_vm / 32, reserve);
1963 } 1996 }
1964 1997
1965 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1998 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d503e9ce1c7b..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
169 * The baseline for the badness score is the proportion of RAM that each 169 * The baseline for the badness score is the proportion of RAM that each
170 * task's rss, pagetable and swap space use. 170 * task's rss, pagetable and swap space use.
171 */ 171 */
172 points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + 172 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
173 get_mm_counter(p->mm, MM_SWAPENTS); 173 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
174 task_unlock(p); 174 task_unlock(p);
175 175
176 /* 176 /*
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
266 * Don't allow any other task to have access to the reserves. 266 * Don't allow any other task to have access to the reserves.
267 */ 267 */
268 if (test_tsk_thread_flag(task, TIF_MEMDIE)) { 268 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
269 if (unlikely(frozen(task)))
270 __thaw_task(task);
271 if (!force_kill) 269 if (!force_kill)
272 return OOM_SCAN_ABORT; 270 return OOM_SCAN_ABORT;
273 } 271 }
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
353 struct task_struct *p; 351 struct task_struct *p;
354 struct task_struct *task; 352 struct task_struct *task;
355 353
356 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); 354 pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
357 rcu_read_lock(); 355 rcu_read_lock();
358 for_each_process(p) { 356 for_each_process(p) {
359 if (oom_unkillable_task(p, memcg, nodemask)) 357 if (oom_unkillable_task(p, memcg, nodemask))
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
369 continue; 367 continue;
370 } 368 }
371 369
372 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", 370 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
373 task->pid, from_kuid(&init_user_ns, task_uid(task)), 371 task->pid, from_kuid(&init_user_ns, task_uid(task)),
374 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 372 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
375 atomic_long_read(&task->mm->nr_ptes), 373 atomic_long_read(&task->mm->nr_ptes),
374 mm_nr_pmds(task->mm),
376 get_mm_counter(task->mm, MM_SWAPENTS), 375 get_mm_counter(task->mm, MM_SWAPENTS),
377 task->signal->oom_score_adj, task->comm); 376 task->signal->oom_score_adj, task->comm);
378 task_unlock(task); 377 task_unlock(task);
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
400} 399}
401 400
402/* 401/*
403 * Number of OOM killer invocations (including memcg OOM killer). 402 * Number of OOM victims in flight
404 * Primarily used by PM freezer to check for potential races with
405 * OOM killed frozen task.
406 */ 403 */
407static atomic_t oom_kills = ATOMIC_INIT(0); 404static atomic_t oom_victims = ATOMIC_INIT(0);
405static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
408 406
409int oom_kills_count(void) 407bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem);
409
410/**
411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark
413 *
414 * Has to be called with oom_sem taken for read and never after
415 * oom has been disabled already.
416 */
417void mark_tsk_oom_victim(struct task_struct *tsk)
410{ 418{
411 return atomic_read(&oom_kills); 419 WARN_ON(oom_killer_disabled);
420 /* OOM killer might race with memcg OOM */
421 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
422 return;
423 /*
424 * Make sure that the task is woken up from uninterruptible sleep
425 * if it is frozen because OOM killer wouldn't be able to free
426 * any memory and livelock. freezing_slow_path will tell the freezer
427 * that TIF_MEMDIE tasks should be ignored.
428 */
429 __thaw_task(tsk);
430 atomic_inc(&oom_victims);
431}
432
433/**
434 * unmark_oom_victim - unmarks the current task as OOM victim.
435 *
436 * Wakes up all waiters in oom_killer_disable()
437 */
438void unmark_oom_victim(void)
439{
440 if (!test_and_clear_thread_flag(TIF_MEMDIE))
441 return;
442
443 down_read(&oom_sem);
444 /*
445 * There is no need to signal the last oom_victim if there
446 * is nobody who cares.
447 */
448 if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
449 wake_up_all(&oom_victims_wait);
450 up_read(&oom_sem);
451}
452
453/**
454 * oom_killer_disable - disable OOM killer
455 *
456 * Forces all page allocations to fail rather than trigger OOM killer.
457 * Will block and wait until all OOM victims are killed.
458 *
459 * The function cannot be called when there are runnable user tasks because
460 * the userspace would see unexpected allocation failures as a result. Any
461 * new usage of this function should be discussed with MM people.
462 *
463 * Returns true if successful and false if the OOM killer cannot be
464 * disabled.
465 */
466bool oom_killer_disable(void)
467{
468 /*
469 * Make sure to not race with an ongoing OOM killer
470 * and that the current is not the victim.
471 */
472 down_write(&oom_sem);
473 if (test_thread_flag(TIF_MEMDIE)) {
474 up_write(&oom_sem);
475 return false;
476 }
477
478 oom_killer_disabled = true;
479 up_write(&oom_sem);
480
481 wait_event(oom_victims_wait, !atomic_read(&oom_victims));
482
483 return true;
412} 484}
413 485
414void note_oom_kill(void) 486/**
487 * oom_killer_enable - enable OOM killer
488 */
489void oom_killer_enable(void)
415{ 490{
416 atomic_inc(&oom_kills); 491 down_write(&oom_sem);
492 oom_killer_disabled = false;
493 up_write(&oom_sem);
417} 494}
418 495
419#define K(x) ((x) << (PAGE_SHIFT-10)) 496#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438 * If the task is already exiting, don't alarm the sysadmin or kill 515 * If the task is already exiting, don't alarm the sysadmin or kill
439 * its children or threads, just set TIF_MEMDIE so it can die quickly 516 * its children or threads, just set TIF_MEMDIE so it can die quickly
440 */ 517 */
441 if (task_will_free_mem(p)) { 518 task_lock(p);
442 set_tsk_thread_flag(p, TIF_MEMDIE); 519 if (p->mm && task_will_free_mem(p)) {
520 mark_tsk_oom_victim(p);
521 task_unlock(p);
443 put_task_struct(p); 522 put_task_struct(p);
444 return; 523 return;
445 } 524 }
525 task_unlock(p);
446 526
447 if (__ratelimit(&oom_rs)) 527 if (__ratelimit(&oom_rs))
448 dump_header(p, gfp_mask, order, memcg, nodemask); 528 dump_header(p, gfp_mask, order, memcg, nodemask);
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
492 572
493 /* mm cannot safely be dereferenced after task_unlock(victim) */ 573 /* mm cannot safely be dereferenced after task_unlock(victim) */
494 mm = victim->mm; 574 mm = victim->mm;
575 mark_tsk_oom_victim(victim);
495 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 576 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
496 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 577 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
497 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 578 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
522 } 603 }
523 rcu_read_unlock(); 604 rcu_read_unlock();
524 605
525 set_tsk_thread_flag(victim, TIF_MEMDIE);
526 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 606 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
527 put_task_struct(victim); 607 put_task_struct(victim);
528} 608}
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
611} 691}
612 692
613/** 693/**
614 * out_of_memory - kill the "best" process when we run out of memory 694 * __out_of_memory - kill the "best" process when we run out of memory
615 * @zonelist: zonelist pointer 695 * @zonelist: zonelist pointer
616 * @gfp_mask: memory allocation flags 696 * @gfp_mask: memory allocation flags
617 * @order: amount of memory being requested as a power of 2 697 * @order: amount of memory being requested as a power of 2
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
623 * OR try to be smart about which process to kill. Note that we 703 * OR try to be smart about which process to kill. Note that we
624 * don't have to be perfect here, we just have to be good. 704 * don't have to be perfect here, we just have to be good.
625 */ 705 */
626void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 706static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
627 int order, nodemask_t *nodemask, bool force_kill) 707 int order, nodemask_t *nodemask, bool force_kill)
628{ 708{
629 const nodemask_t *mpol_mask; 709 const nodemask_t *mpol_mask;
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
643 * If current has a pending SIGKILL or is exiting, then automatically 723 * If current has a pending SIGKILL or is exiting, then automatically
644 * select it. The goal is to allow it to allocate so that it may 724 * select it. The goal is to allow it to allocate so that it may
645 * quickly exit and free its memory. 725 * quickly exit and free its memory.
726 *
727 * But don't select if current has already released its mm and cleared
728 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
646 */ 729 */
647 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 730 if (current->mm &&
648 set_thread_flag(TIF_MEMDIE); 731 (fatal_signal_pending(current) || task_will_free_mem(current))) {
732 mark_tsk_oom_victim(current);
649 return; 733 return;
650 } 734 }
651 735
@@ -688,6 +772,32 @@ out:
688 schedule_timeout_killable(1); 772 schedule_timeout_killable(1);
689} 773}
690 774
775/**
776 * out_of_memory - tries to invoke OOM killer.
777 * @zonelist: zonelist pointer
778 * @gfp_mask: memory allocation flags
779 * @order: amount of memory being requested as a power of 2
780 * @nodemask: nodemask passed to page allocator
781 * @force_kill: true if a task must be killed, even if others are exiting
782 *
783 * Invokes __out_of_memory() and returns true unless the OOM killer has been
784 * disabled by oom_killer_disable(), in which case it returns false.
785 */
786bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
787 int order, nodemask_t *nodemask, bool force_kill)
788{
789 bool ret = false;
790
791 down_read(&oom_sem);
792 if (!oom_killer_disabled) {
793 __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
794 ret = true;
795 }
796 up_read(&oom_sem);
797
798 return ret;
799}
800
691/* 801/*
692 * The pagefault handler calls here because it is out of memory, so kill a 802 * The pagefault handler calls here because it is out of memory, so kill a
693 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a 803 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void)
697{ 807{
698 struct zonelist *zonelist; 808 struct zonelist *zonelist;
699 809
810 down_read(&oom_sem);
700 if (mem_cgroup_oom_synchronize(true)) 811 if (mem_cgroup_oom_synchronize(true))
701 return; 812 goto unlock;
702 813
703 zonelist = node_zonelist(first_memory_node, GFP_KERNEL); 814 zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
704 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { 815 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
705 out_of_memory(NULL, 0, 0, NULL, false); 816 if (!oom_killer_disabled)
817 __out_of_memory(NULL, 0, 0, NULL, false);
818 else
819 /*
820 * There shouldn't be any user tasks runnable while the
821 * OOM killer is disabled so the current task has to
822 * be a racing OOM victim for which oom_killer_disable()
823 * is waiting.
824 */
825 WARN_ON(test_thread_flag(TIF_MEMDIE));
826
706 oom_zonelist_unlock(zonelist, GFP_KERNEL); 827 oom_zonelist_unlock(zonelist, GFP_KERNEL);
707 } 828 }
829unlock:
830 up_read(&oom_sem);
708} 831}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f4335238e33..6a73e47e81c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2168,9 +2168,12 @@ EXPORT_SYMBOL(account_page_redirty);
2168 */ 2168 */
2169int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 2169int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
2170{ 2170{
2171 int ret;
2172
2171 wbc->pages_skipped++; 2173 wbc->pages_skipped++;
2174 ret = __set_page_dirty_nobuffers(page);
2172 account_page_redirty(page); 2175 account_page_redirty(page);
2173 return __set_page_dirty_nobuffers(page); 2176 return ret;
2174} 2177}
2175EXPORT_SYMBOL(redirty_page_for_writepage); 2178EXPORT_SYMBOL(redirty_page_for_writepage);
2176 2179
@@ -2308,12 +2311,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2308int test_clear_page_writeback(struct page *page) 2311int test_clear_page_writeback(struct page *page)
2309{ 2312{
2310 struct address_space *mapping = page_mapping(page); 2313 struct address_space *mapping = page_mapping(page);
2311 unsigned long memcg_flags;
2312 struct mem_cgroup *memcg; 2314 struct mem_cgroup *memcg;
2313 bool locked;
2314 int ret; 2315 int ret;
2315 2316
2316 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); 2317 memcg = mem_cgroup_begin_page_stat(page);
2317 if (mapping) { 2318 if (mapping) {
2318 struct backing_dev_info *bdi = mapping->backing_dev_info; 2319 struct backing_dev_info *bdi = mapping->backing_dev_info;
2319 unsigned long flags; 2320 unsigned long flags;
@@ -2338,19 +2339,17 @@ int test_clear_page_writeback(struct page *page)
2338 dec_zone_page_state(page, NR_WRITEBACK); 2339 dec_zone_page_state(page, NR_WRITEBACK);
2339 inc_zone_page_state(page, NR_WRITTEN); 2340 inc_zone_page_state(page, NR_WRITTEN);
2340 } 2341 }
2341 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); 2342 mem_cgroup_end_page_stat(memcg);
2342 return ret; 2343 return ret;
2343} 2344}
2344 2345
2345int __test_set_page_writeback(struct page *page, bool keep_write) 2346int __test_set_page_writeback(struct page *page, bool keep_write)
2346{ 2347{
2347 struct address_space *mapping = page_mapping(page); 2348 struct address_space *mapping = page_mapping(page);
2348 unsigned long memcg_flags;
2349 struct mem_cgroup *memcg; 2349 struct mem_cgroup *memcg;
2350 bool locked;
2351 int ret; 2350 int ret;
2352 2351
2353 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); 2352 memcg = mem_cgroup_begin_page_stat(page);
2354 if (mapping) { 2353 if (mapping) {
2355 struct backing_dev_info *bdi = mapping->backing_dev_info; 2354 struct backing_dev_info *bdi = mapping->backing_dev_info;
2356 unsigned long flags; 2355 unsigned long flags;
@@ -2380,7 +2379,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2380 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2379 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2381 inc_zone_page_state(page, NR_WRITEBACK); 2380 inc_zone_page_state(page, NR_WRITEBACK);
2382 } 2381 }
2383 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); 2382 mem_cgroup_end_page_stat(memcg);
2384 return ret; 2383 return ret;
2385 2384
2386} 2385}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f121050e8530..8d52ab18fe0d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 244 PB_migrate, PB_migrate_end);
245} 245}
246 246
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 247#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 248static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 249{
@@ -381,36 +379,6 @@ void prep_compound_page(struct page *page, unsigned long order)
381 } 379 }
382} 380}
383 381
384/* update __split_huge_page_refcount if you change this function */
385static int destroy_compound_page(struct page *page, unsigned long order)
386{
387 int i;
388 int nr_pages = 1 << order;
389 int bad = 0;
390
391 if (unlikely(compound_order(page) != order)) {
392 bad_page(page, "wrong compound order", 0);
393 bad++;
394 }
395
396 __ClearPageHead(page);
397
398 for (i = 1; i < nr_pages; i++) {
399 struct page *p = page + i;
400
401 if (unlikely(!PageTail(p))) {
402 bad_page(page, "PageTail not set", 0);
403 bad++;
404 } else if (unlikely(p->first_page != page)) {
405 bad_page(page, "first_page not consistent", 0);
406 bad++;
407 }
408 __ClearPageTail(p);
409 }
410
411 return bad;
412}
413
414static inline void prep_zero_page(struct page *page, unsigned int order, 382static inline void prep_zero_page(struct page *page, unsigned int order,
415 gfp_t gfp_flags) 383 gfp_t gfp_flags)
416{ 384{
@@ -613,10 +581,7 @@ static inline void __free_one_page(struct page *page,
613 int max_order = MAX_ORDER; 581 int max_order = MAX_ORDER;
614 582
615 VM_BUG_ON(!zone_is_initialized(zone)); 583 VM_BUG_ON(!zone_is_initialized(zone));
616 584 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
617 if (unlikely(PageCompound(page)))
618 if (unlikely(destroy_compound_page(page, order)))
619 return;
620 585
621 VM_BUG_ON(migratetype == -1); 586 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) { 587 if (is_migrate_isolate(migratetype)) {
@@ -797,21 +762,40 @@ static void free_one_page(struct zone *zone,
797 spin_unlock(&zone->lock); 762 spin_unlock(&zone->lock);
798} 763}
799 764
765static int free_tail_pages_check(struct page *head_page, struct page *page)
766{
767 if (!IS_ENABLED(CONFIG_DEBUG_VM))
768 return 0;
769 if (unlikely(!PageTail(page))) {
770 bad_page(page, "PageTail not set", 0);
771 return 1;
772 }
773 if (unlikely(page->first_page != head_page)) {
774 bad_page(page, "first_page not consistent", 0);
775 return 1;
776 }
777 return 0;
778}
779
800static bool free_pages_prepare(struct page *page, unsigned int order) 780static bool free_pages_prepare(struct page *page, unsigned int order)
801{ 781{
802 int i; 782 bool compound = PageCompound(page);
803 int bad = 0; 783 int i, bad = 0;
804 784
805 VM_BUG_ON_PAGE(PageTail(page), page); 785 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); 786 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
807 787
808 trace_mm_page_free(page, order); 788 trace_mm_page_free(page, order);
809 kmemcheck_free_shadow(page, order); 789 kmemcheck_free_shadow(page, order);
810 790
811 if (PageAnon(page)) 791 if (PageAnon(page))
812 page->mapping = NULL; 792 page->mapping = NULL;
813 for (i = 0; i < (1 << order); i++) 793 bad += free_pages_check(page);
794 for (i = 1; i < (1 << order); i++) {
795 if (compound)
796 bad += free_tail_pages_check(page, page + i);
814 bad += free_pages_check(page + i); 797 bad += free_pages_check(page + i);
798 }
815 if (bad) 799 if (bad)
816 return false; 800 return false;
817 801
@@ -970,7 +954,8 @@ static inline int check_new_page(struct page *page)
970 return 0; 954 return 0;
971} 955}
972 956
973static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 957static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
958 int alloc_flags)
974{ 959{
975 int i; 960 int i;
976 961
@@ -994,6 +979,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
994 979
995 set_page_owner(page, order, gfp_flags); 980 set_page_owner(page, order, gfp_flags);
996 981
982 /*
983 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
984 * allocate the page. The expectation is that the caller is taking
985 * steps that will free more memory. The caller should avoid the page
986 * being used for !PFMEMALLOC purposes.
987 */
988 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
989
997 return 0; 990 return 0;
998} 991}
999 992
@@ -1130,39 +1123,34 @@ static void change_pageblock_range(struct page *pageblock_page,
1130} 1123}
1131 1124
1132/* 1125/*
1133 * If breaking a large block of pages, move all free pages to the preferred 1126 * When we are falling back to another migratetype during allocation, try to
1134 * allocation list. If falling back for a reclaimable kernel allocation, be 1127 * steal extra free pages from the same pageblocks to satisfy further
1135 * more aggressive about taking ownership of free pages. 1128 * allocations, instead of polluting multiple pageblocks.
1136 * 1129 *
1137 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1130 * If we are stealing a relatively large buddy page, it is likely there will
1138 * nor move CMA pages to different free lists. We don't want unmovable pages 1131 * be more free pages in the pageblock, so try to steal them all. For
1139 * to be allocated from MIGRATE_CMA areas. 1132 * reclaimable and unmovable allocations, we steal regardless of page size,
1133 * as fragmentation caused by those allocations polluting movable pageblocks
1134 * is worse than movable allocations stealing from unmovable and reclaimable
1135 * pageblocks.
1140 * 1136 *
1141 * Returns the new migratetype of the pageblock (or the same old migratetype 1137 * If we claim more than half of the pageblock, change pageblock's migratetype
1142 * if it was unchanged). 1138 * as well.
1143 */ 1139 */
1144static int try_to_steal_freepages(struct zone *zone, struct page *page, 1140static void try_to_steal_freepages(struct zone *zone, struct page *page,
1145 int start_type, int fallback_type) 1141 int start_type, int fallback_type)
1146{ 1142{
1147 int current_order = page_order(page); 1143 int current_order = page_order(page);
1148 1144
1149 /*
1150 * When borrowing from MIGRATE_CMA, we need to release the excess
1151 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1152 * is set to CMA so it is returned to the correct freelist in case
1153 * the page ends up being not actually allocated from the pcp lists.
1154 */
1155 if (is_migrate_cma(fallback_type))
1156 return fallback_type;
1157
1158 /* Take ownership for orders >= pageblock_order */ 1145 /* Take ownership for orders >= pageblock_order */
1159 if (current_order >= pageblock_order) { 1146 if (current_order >= pageblock_order) {
1160 change_pageblock_range(page, current_order, start_type); 1147 change_pageblock_range(page, current_order, start_type);
1161 return start_type; 1148 return;
1162 } 1149 }
1163 1150
1164 if (current_order >= pageblock_order / 2 || 1151 if (current_order >= pageblock_order / 2 ||
1165 start_type == MIGRATE_RECLAIMABLE || 1152 start_type == MIGRATE_RECLAIMABLE ||
1153 start_type == MIGRATE_UNMOVABLE ||
1166 page_group_by_mobility_disabled) { 1154 page_group_by_mobility_disabled) {
1167 int pages; 1155 int pages;
1168 1156
@@ -1170,15 +1158,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1170 1158
1171 /* Claim the whole block if over half of it is free */ 1159 /* Claim the whole block if over half of it is free */
1172 if (pages >= (1 << (pageblock_order-1)) || 1160 if (pages >= (1 << (pageblock_order-1)) ||
1173 page_group_by_mobility_disabled) { 1161 page_group_by_mobility_disabled)
1174
1175 set_pageblock_migratetype(page, start_type); 1162 set_pageblock_migratetype(page, start_type);
1176 return start_type;
1177 }
1178
1179 } 1163 }
1180
1181 return fallback_type;
1182} 1164}
1183 1165
1184/* Remove an element from the buddy allocator from the fallback list */ 1166/* Remove an element from the buddy allocator from the fallback list */
@@ -1188,14 +1170,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1188 struct free_area *area; 1170 struct free_area *area;
1189 unsigned int current_order; 1171 unsigned int current_order;
1190 struct page *page; 1172 struct page *page;
1191 int migratetype, new_type, i;
1192 1173
1193 /* Find the largest possible block of pages in the other list */ 1174 /* Find the largest possible block of pages in the other list */
1194 for (current_order = MAX_ORDER-1; 1175 for (current_order = MAX_ORDER-1;
1195 current_order >= order && current_order <= MAX_ORDER-1; 1176 current_order >= order && current_order <= MAX_ORDER-1;
1196 --current_order) { 1177 --current_order) {
1178 int i;
1197 for (i = 0;; i++) { 1179 for (i = 0;; i++) {
1198 migratetype = fallbacks[start_migratetype][i]; 1180 int migratetype = fallbacks[start_migratetype][i];
1181 int buddy_type = start_migratetype;
1199 1182
1200 /* MIGRATE_RESERVE handled later if necessary */ 1183 /* MIGRATE_RESERVE handled later if necessary */
1201 if (migratetype == MIGRATE_RESERVE) 1184 if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1192,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1209 struct page, lru); 1192 struct page, lru);
1210 area->nr_free--; 1193 area->nr_free--;
1211 1194
1212 new_type = try_to_steal_freepages(zone, page, 1195 if (!is_migrate_cma(migratetype)) {
1213 start_migratetype, 1196 try_to_steal_freepages(zone, page,
1214 migratetype); 1197 start_migratetype,
1198 migratetype);
1199 } else {
1200 /*
1201 * When borrowing from MIGRATE_CMA, we need to
1202 * release the excess buddy pages to CMA
1203 * itself, and we do not try to steal extra
1204 * free pages.
1205 */
1206 buddy_type = migratetype;
1207 }
1215 1208
1216 /* Remove the page from the freelists */ 1209 /* Remove the page from the freelists */
1217 list_del(&page->lru); 1210 list_del(&page->lru);
1218 rmv_page_order(page); 1211 rmv_page_order(page);
1219 1212
1220 expand(zone, page, order, current_order, area, 1213 expand(zone, page, order, current_order, area,
1221 new_type); 1214 buddy_type);
1222 /* The freepage_migratetype may differ from pageblock's 1215
1216 /*
1217 * The freepage_migratetype may differ from pageblock's
1223 * migratetype depending on the decisions in 1218 * migratetype depending on the decisions in
1224 * try_to_steal_freepages. This is OK as long as it does 1219 * try_to_steal_freepages(). This is OK as long as it
1225 * not differ for MIGRATE_CMA type. 1220 * does not differ for MIGRATE_CMA pageblocks. For CMA
1221 * we need to make sure unallocated pages flushed from
1222 * pcp lists are returned to the correct freelist.
1226 */ 1223 */
1227 set_freepage_migratetype(page, new_type); 1224 set_freepage_migratetype(page, buddy_type);
1228 1225
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1226 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype, new_type); 1227 start_migratetype, migratetype);
1231 1228
1232 return page; 1229 return page;
1233 } 1230 }
@@ -1642,9 +1639,7 @@ int split_free_page(struct page *page)
1642} 1639}
1643 1640
1644/* 1641/*
1645 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1642 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1646 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1647 * or two.
1648 */ 1643 */
1649static inline 1644static inline
1650struct page *buffered_rmqueue(struct zone *preferred_zone, 1645struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1650,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1655 struct page *page; 1650 struct page *page;
1656 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1651 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1657 1652
1658again:
1659 if (likely(order == 0)) { 1653 if (likely(order == 0)) {
1660 struct per_cpu_pages *pcp; 1654 struct per_cpu_pages *pcp;
1661 struct list_head *list; 1655 struct list_head *list;
@@ -1711,8 +1705,6 @@ again:
1711 local_irq_restore(flags); 1705 local_irq_restore(flags);
1712 1706
1713 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1707 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1714 if (prep_new_page(page, order, gfp_flags))
1715 goto again;
1716 return page; 1708 return page;
1717 1709
1718failed: 1710failed:
@@ -2033,10 +2025,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
2033 * a page. 2025 * a page.
2034 */ 2026 */
2035static struct page * 2027static struct page *
2036get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 2028get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2037 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 2029 const struct alloc_context *ac)
2038 struct zone *preferred_zone, int classzone_idx, int migratetype)
2039{ 2030{
2031 struct zonelist *zonelist = ac->zonelist;
2040 struct zoneref *z; 2032 struct zoneref *z;
2041 struct page *page = NULL; 2033 struct page *page = NULL;
2042 struct zone *zone; 2034 struct zone *zone;
@@ -2055,8 +2047,8 @@ zonelist_scan:
2055 * Scan zonelist, looking for a zone with enough free. 2047 * Scan zonelist, looking for a zone with enough free.
2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2048 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2057 */ 2049 */
2058 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2050 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2059 high_zoneidx, nodemask) { 2051 ac->nodemask) {
2060 unsigned long mark; 2052 unsigned long mark;
2061 2053
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2054 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2065,7 @@ zonelist_scan:
2073 * time the page has in memory before being reclaimed. 2065 * time the page has in memory before being reclaimed.
2074 */ 2066 */
2075 if (alloc_flags & ALLOC_FAIR) { 2067 if (alloc_flags & ALLOC_FAIR) {
2076 if (!zone_local(preferred_zone, zone)) 2068 if (!zone_local(ac->preferred_zone, zone))
2077 break; 2069 break;
2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2070 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2079 nr_fair_skipped++; 2071 nr_fair_skipped++;
@@ -2111,7 +2103,7 @@ zonelist_scan:
2111 2103
2112 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2104 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2113 if (!zone_watermark_ok(zone, order, mark, 2105 if (!zone_watermark_ok(zone, order, mark,
2114 classzone_idx, alloc_flags)) { 2106 ac->classzone_idx, alloc_flags)) {
2115 int ret; 2107 int ret;
2116 2108
2117 /* Checked here to keep the fast path fast */ 2109 /* Checked here to keep the fast path fast */
@@ -2132,7 +2124,7 @@ zonelist_scan:
2132 } 2124 }
2133 2125
2134 if (zone_reclaim_mode == 0 || 2126 if (zone_reclaim_mode == 0 ||
2135 !zone_allows_reclaim(preferred_zone, zone)) 2127 !zone_allows_reclaim(ac->preferred_zone, zone))
2136 goto this_zone_full; 2128 goto this_zone_full;
2137 2129
2138 /* 2130 /*
@@ -2154,7 +2146,7 @@ zonelist_scan:
2154 default: 2146 default:
2155 /* did we reclaim enough */ 2147 /* did we reclaim enough */
2156 if (zone_watermark_ok(zone, order, mark, 2148 if (zone_watermark_ok(zone, order, mark,
2157 classzone_idx, alloc_flags)) 2149 ac->classzone_idx, alloc_flags))
2158 goto try_this_zone; 2150 goto try_this_zone;
2159 2151
2160 /* 2152 /*
@@ -2175,27 +2167,18 @@ zonelist_scan:
2175 } 2167 }
2176 2168
2177try_this_zone: 2169try_this_zone:
2178 page = buffered_rmqueue(preferred_zone, zone, order, 2170 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2179 gfp_mask, migratetype); 2171 gfp_mask, ac->migratetype);
2180 if (page) 2172 if (page) {
2181 break; 2173 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2174 goto try_this_zone;
2175 return page;
2176 }
2182this_zone_full: 2177this_zone_full:
2183 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2178 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2184 zlc_mark_zone_full(zonelist, z); 2179 zlc_mark_zone_full(zonelist, z);
2185 } 2180 }
2186 2181
2187 if (page) {
2188 /*
2189 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2190 * necessary to allocate the page. The expectation is
2191 * that the caller is taking steps that will free more
2192 * memory. The caller should avoid the page being used
2193 * for !PFMEMALLOC purposes.
2194 */
2195 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2196 return page;
2197 }
2198
2199 /* 2182 /*
2200 * The first pass makes sure allocations are spread fairly within the 2183 * The first pass makes sure allocations are spread fairly within the
2201 * local node. However, the local node might have free pages left 2184 * local node. However, the local node might have free pages left
@@ -2208,7 +2191,7 @@ this_zone_full:
2208 alloc_flags &= ~ALLOC_FAIR; 2191 alloc_flags &= ~ALLOC_FAIR;
2209 if (nr_fair_skipped) { 2192 if (nr_fair_skipped) {
2210 zonelist_rescan = true; 2193 zonelist_rescan = true;
2211 reset_alloc_batches(preferred_zone); 2194 reset_alloc_batches(ac->preferred_zone);
2212 } 2195 }
2213 if (nr_online_nodes > 1) 2196 if (nr_online_nodes > 1)
2214 zonelist_rescan = true; 2197 zonelist_rescan = true;
@@ -2330,44 +2313,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2330 2313
2331static inline struct page * 2314static inline struct page *
2332__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2315__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 struct zonelist *zonelist, enum zone_type high_zoneidx, 2316 const struct alloc_context *ac, unsigned long *did_some_progress)
2334 nodemask_t *nodemask, struct zone *preferred_zone,
2335 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2336{ 2317{
2337 struct page *page; 2318 struct page *page;
2338 2319
2339 *did_some_progress = 0; 2320 *did_some_progress = 0;
2340 2321
2341 if (oom_killer_disabled)
2342 return NULL;
2343
2344 /* 2322 /*
2345 * Acquire the per-zone oom lock for each zone. If that 2323 * Acquire the per-zone oom lock for each zone. If that
2346 * fails, somebody else is making progress for us. 2324 * fails, somebody else is making progress for us.
2347 */ 2325 */
2348 if (!oom_zonelist_trylock(zonelist, gfp_mask)) { 2326 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
2349 *did_some_progress = 1; 2327 *did_some_progress = 1;
2350 schedule_timeout_uninterruptible(1); 2328 schedule_timeout_uninterruptible(1);
2351 return NULL; 2329 return NULL;
2352 } 2330 }
2353 2331
2354 /* 2332 /*
2355 * PM-freezer should be notified that there might be an OOM killer on
2356 * its way to kill and wake somebody up. This is too early and we might
2357 * end up not killing anything but false positives are acceptable.
2358 * See freeze_processes.
2359 */
2360 note_oom_kill();
2361
2362 /*
2363 * Go through the zonelist yet one more time, keep very high watermark 2333 * Go through the zonelist yet one more time, keep very high watermark
2364 * here, this is only to catch a parallel oom killing, we must fail if 2334 * here, this is only to catch a parallel oom killing, we must fail if
2365 * we're still under heavy pressure. 2335 * we're still under heavy pressure.
2366 */ 2336 */
2367 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2337 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2368 order, zonelist, high_zoneidx, 2338 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
2369 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2370 preferred_zone, classzone_idx, migratetype);
2371 if (page) 2339 if (page)
2372 goto out; 2340 goto out;
2373 2341
@@ -2379,7 +2347,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2379 if (order > PAGE_ALLOC_COSTLY_ORDER) 2347 if (order > PAGE_ALLOC_COSTLY_ORDER)
2380 goto out; 2348 goto out;
2381 /* The OOM killer does not needlessly kill tasks for lowmem */ 2349 /* The OOM killer does not needlessly kill tasks for lowmem */
2382 if (high_zoneidx < ZONE_NORMAL) 2350 if (ac->high_zoneidx < ZONE_NORMAL)
2383 goto out; 2351 goto out;
2384 /* The OOM killer does not compensate for light reclaim */ 2352 /* The OOM killer does not compensate for light reclaim */
2385 if (!(gfp_mask & __GFP_FS)) 2353 if (!(gfp_mask & __GFP_FS))
@@ -2395,10 +2363,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2395 goto out; 2363 goto out;
2396 } 2364 }
2397 /* Exhausted what can be done so it's blamo time */ 2365 /* Exhausted what can be done so it's blamo time */
2398 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2366 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2399 *did_some_progress = 1; 2367 *did_some_progress = 1;
2400out: 2368out:
2401 oom_zonelist_unlock(zonelist, gfp_mask); 2369 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2402 return page; 2370 return page;
2403} 2371}
2404 2372
@@ -2406,10 +2374,9 @@ out:
2406/* Try memory compaction for high-order allocations before reclaim */ 2374/* Try memory compaction for high-order allocations before reclaim */
2407static struct page * 2375static struct page *
2408__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2376__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2409 struct zonelist *zonelist, enum zone_type high_zoneidx, 2377 int alloc_flags, const struct alloc_context *ac,
2410 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2378 enum migrate_mode mode, int *contended_compaction,
2411 int classzone_idx, int migratetype, enum migrate_mode mode, 2379 bool *deferred_compaction)
2412 int *contended_compaction, bool *deferred_compaction)
2413{ 2380{
2414 unsigned long compact_result; 2381 unsigned long compact_result;
2415 struct page *page; 2382 struct page *page;
@@ -2418,10 +2385,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2418 return NULL; 2385 return NULL;
2419 2386
2420 current->flags |= PF_MEMALLOC; 2387 current->flags |= PF_MEMALLOC;
2421 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2388 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2422 nodemask, mode, 2389 mode, contended_compaction);
2423 contended_compaction,
2424 alloc_flags, classzone_idx);
2425 current->flags &= ~PF_MEMALLOC; 2390 current->flags &= ~PF_MEMALLOC;
2426 2391
2427 switch (compact_result) { 2392 switch (compact_result) {
@@ -2440,10 +2405,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2440 */ 2405 */
2441 count_vm_event(COMPACTSTALL); 2406 count_vm_event(COMPACTSTALL);
2442 2407
2443 page = get_page_from_freelist(gfp_mask, nodemask, 2408 page = get_page_from_freelist(gfp_mask, order,
2444 order, zonelist, high_zoneidx, 2409 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2445 alloc_flags & ~ALLOC_NO_WATERMARKS,
2446 preferred_zone, classzone_idx, migratetype);
2447 2410
2448 if (page) { 2411 if (page) {
2449 struct zone *zone = page_zone(page); 2412 struct zone *zone = page_zone(page);
@@ -2467,10 +2430,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2467#else 2430#else
2468static inline struct page * 2431static inline struct page *
2469__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2432__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2470 struct zonelist *zonelist, enum zone_type high_zoneidx, 2433 int alloc_flags, const struct alloc_context *ac,
2471 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2434 enum migrate_mode mode, int *contended_compaction,
2472 int classzone_idx, int migratetype, enum migrate_mode mode, 2435 bool *deferred_compaction)
2473 int *contended_compaction, bool *deferred_compaction)
2474{ 2436{
2475 return NULL; 2437 return NULL;
2476} 2438}
@@ -2478,8 +2440,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2478 2440
2479/* Perform direct synchronous page reclaim */ 2441/* Perform direct synchronous page reclaim */
2480static int 2442static int
2481__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2443__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2482 nodemask_t *nodemask) 2444 const struct alloc_context *ac)
2483{ 2445{
2484 struct reclaim_state reclaim_state; 2446 struct reclaim_state reclaim_state;
2485 int progress; 2447 int progress;
@@ -2493,7 +2455,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2493 reclaim_state.reclaimed_slab = 0; 2455 reclaim_state.reclaimed_slab = 0;
2494 current->reclaim_state = &reclaim_state; 2456 current->reclaim_state = &reclaim_state;
2495 2457
2496 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2458 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2459 ac->nodemask);
2497 2460
2498 current->reclaim_state = NULL; 2461 current->reclaim_state = NULL;
2499 lockdep_clear_current_reclaim_state(); 2462 lockdep_clear_current_reclaim_state();
@@ -2507,28 +2470,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2507/* The really slow allocator path where we enter direct reclaim */ 2470/* The really slow allocator path where we enter direct reclaim */
2508static inline struct page * 2471static inline struct page *
2509__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2472__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2510 struct zonelist *zonelist, enum zone_type high_zoneidx, 2473 int alloc_flags, const struct alloc_context *ac,
2511 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2474 unsigned long *did_some_progress)
2512 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2513{ 2475{
2514 struct page *page = NULL; 2476 struct page *page = NULL;
2515 bool drained = false; 2477 bool drained = false;
2516 2478
2517 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2479 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
2518 nodemask);
2519 if (unlikely(!(*did_some_progress))) 2480 if (unlikely(!(*did_some_progress)))
2520 return NULL; 2481 return NULL;
2521 2482
2522 /* After successful reclaim, reconsider all zones for allocation */ 2483 /* After successful reclaim, reconsider all zones for allocation */
2523 if (IS_ENABLED(CONFIG_NUMA)) 2484 if (IS_ENABLED(CONFIG_NUMA))
2524 zlc_clear_zones_full(zonelist); 2485 zlc_clear_zones_full(ac->zonelist);
2525 2486
2526retry: 2487retry:
2527 page = get_page_from_freelist(gfp_mask, nodemask, order, 2488 page = get_page_from_freelist(gfp_mask, order,
2528 zonelist, high_zoneidx, 2489 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2529 alloc_flags & ~ALLOC_NO_WATERMARKS,
2530 preferred_zone, classzone_idx,
2531 migratetype);
2532 2490
2533 /* 2491 /*
2534 * If an allocation failed after direct reclaim, it could be because 2492 * If an allocation failed after direct reclaim, it could be because
@@ -2549,36 +2507,30 @@ retry:
2549 */ 2507 */
2550static inline struct page * 2508static inline struct page *
2551__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2509__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2552 struct zonelist *zonelist, enum zone_type high_zoneidx, 2510 const struct alloc_context *ac)
2553 nodemask_t *nodemask, struct zone *preferred_zone,
2554 int classzone_idx, int migratetype)
2555{ 2511{
2556 struct page *page; 2512 struct page *page;
2557 2513
2558 do { 2514 do {
2559 page = get_page_from_freelist(gfp_mask, nodemask, order, 2515 page = get_page_from_freelist(gfp_mask, order,
2560 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2516 ALLOC_NO_WATERMARKS, ac);
2561 preferred_zone, classzone_idx, migratetype);
2562 2517
2563 if (!page && gfp_mask & __GFP_NOFAIL) 2518 if (!page && gfp_mask & __GFP_NOFAIL)
2564 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2519 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2520 HZ/50);
2565 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2521 } while (!page && (gfp_mask & __GFP_NOFAIL));
2566 2522
2567 return page; 2523 return page;
2568} 2524}
2569 2525
2570static void wake_all_kswapds(unsigned int order, 2526static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2571 struct zonelist *zonelist,
2572 enum zone_type high_zoneidx,
2573 struct zone *preferred_zone,
2574 nodemask_t *nodemask)
2575{ 2527{
2576 struct zoneref *z; 2528 struct zoneref *z;
2577 struct zone *zone; 2529 struct zone *zone;
2578 2530
2579 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2531 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2580 high_zoneidx, nodemask) 2532 ac->high_zoneidx, ac->nodemask)
2581 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2533 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
2582} 2534}
2583 2535
2584static inline int 2536static inline int
@@ -2637,9 +2589,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2637 2589
2638static inline struct page * 2590static inline struct page *
2639__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2591__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2640 struct zonelist *zonelist, enum zone_type high_zoneidx, 2592 struct alloc_context *ac)
2641 nodemask_t *nodemask, struct zone *preferred_zone,
2642 int classzone_idx, int migratetype)
2643{ 2593{
2644 const gfp_t wait = gfp_mask & __GFP_WAIT; 2594 const gfp_t wait = gfp_mask & __GFP_WAIT;
2645 struct page *page = NULL; 2595 struct page *page = NULL;
@@ -2675,8 +2625,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2675 2625
2676retry: 2626retry:
2677 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2627 if (!(gfp_mask & __GFP_NO_KSWAPD))
2678 wake_all_kswapds(order, zonelist, high_zoneidx, 2628 wake_all_kswapds(order, ac);
2679 preferred_zone, nodemask);
2680 2629
2681 /* 2630 /*
2682 * OK, we're below the kswapd watermark and have kicked background 2631 * OK, we're below the kswapd watermark and have kicked background
@@ -2689,17 +2638,16 @@ retry:
2689 * Find the true preferred zone if the allocation is unconstrained by 2638 * Find the true preferred zone if the allocation is unconstrained by
2690 * cpusets. 2639 * cpusets.
2691 */ 2640 */
2692 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2641 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
2693 struct zoneref *preferred_zoneref; 2642 struct zoneref *preferred_zoneref;
2694 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2643 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2695 NULL, &preferred_zone); 2644 ac->high_zoneidx, NULL, &ac->preferred_zone);
2696 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2645 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
2697 } 2646 }
2698 2647
2699 /* This is the last chance, in general, before the goto nopage. */ 2648 /* This is the last chance, in general, before the goto nopage. */
2700 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2649 page = get_page_from_freelist(gfp_mask, order,
2701 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2650 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2702 preferred_zone, classzone_idx, migratetype);
2703 if (page) 2651 if (page)
2704 goto got_pg; 2652 goto got_pg;
2705 2653
@@ -2710,11 +2658,10 @@ retry:
2710 * the allocation is high priority and these type of 2658 * the allocation is high priority and these type of
2711 * allocations are system rather than user orientated 2659 * allocations are system rather than user orientated
2712 */ 2660 */
2713 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2661 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
2662
2663 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2714 2664
2715 page = __alloc_pages_high_priority(gfp_mask, order,
2716 zonelist, high_zoneidx, nodemask,
2717 preferred_zone, classzone_idx, migratetype);
2718 if (page) { 2665 if (page) {
2719 goto got_pg; 2666 goto got_pg;
2720 } 2667 }
@@ -2743,11 +2690,9 @@ retry:
2743 * Try direct compaction. The first pass is asynchronous. Subsequent 2690 * Try direct compaction. The first pass is asynchronous. Subsequent
2744 * attempts after direct reclaim are synchronous 2691 * attempts after direct reclaim are synchronous
2745 */ 2692 */
2746 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2693 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2747 high_zoneidx, nodemask, alloc_flags, 2694 migration_mode,
2748 preferred_zone, 2695 &contended_compaction,
2749 classzone_idx, migratetype,
2750 migration_mode, &contended_compaction,
2751 &deferred_compaction); 2696 &deferred_compaction);
2752 if (page) 2697 if (page)
2753 goto got_pg; 2698 goto got_pg;
@@ -2793,12 +2738,8 @@ retry:
2793 migration_mode = MIGRATE_SYNC_LIGHT; 2738 migration_mode = MIGRATE_SYNC_LIGHT;
2794 2739
2795 /* Try direct reclaim and then allocating */ 2740 /* Try direct reclaim and then allocating */
2796 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2741 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2797 zonelist, high_zoneidx, 2742 &did_some_progress);
2798 nodemask,
2799 alloc_flags, preferred_zone,
2800 classzone_idx, migratetype,
2801 &did_some_progress);
2802 if (page) 2743 if (page)
2803 goto got_pg; 2744 goto got_pg;
2804 2745
@@ -2812,17 +2753,15 @@ retry:
2812 * start OOM killing tasks. 2753 * start OOM killing tasks.
2813 */ 2754 */
2814 if (!did_some_progress) { 2755 if (!did_some_progress) {
2815 page = __alloc_pages_may_oom(gfp_mask, order, zonelist, 2756 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2816 high_zoneidx, nodemask, 2757 &did_some_progress);
2817 preferred_zone, classzone_idx,
2818 migratetype,&did_some_progress);
2819 if (page) 2758 if (page)
2820 goto got_pg; 2759 goto got_pg;
2821 if (!did_some_progress) 2760 if (!did_some_progress)
2822 goto nopage; 2761 goto nopage;
2823 } 2762 }
2824 /* Wait for some write requests to complete then retry */ 2763 /* Wait for some write requests to complete then retry */
2825 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2764 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2826 goto retry; 2765 goto retry;
2827 } else { 2766 } else {
2828 /* 2767 /*
@@ -2830,11 +2769,9 @@ retry:
2830 * direct reclaim and reclaim/compaction depends on compaction 2769 * direct reclaim and reclaim/compaction depends on compaction
2831 * being called after reclaim so call directly if necessary 2770 * being called after reclaim so call directly if necessary
2832 */ 2771 */
2833 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2772 page = __alloc_pages_direct_compact(gfp_mask, order,
2834 high_zoneidx, nodemask, alloc_flags, 2773 alloc_flags, ac, migration_mode,
2835 preferred_zone, 2774 &contended_compaction,
2836 classzone_idx, migratetype,
2837 migration_mode, &contended_compaction,
2838 &deferred_compaction); 2775 &deferred_compaction);
2839 if (page) 2776 if (page)
2840 goto got_pg; 2777 goto got_pg;
@@ -2842,11 +2779,7 @@ retry:
2842 2779
2843nopage: 2780nopage:
2844 warn_alloc_failed(gfp_mask, order, NULL); 2781 warn_alloc_failed(gfp_mask, order, NULL);
2845 return page;
2846got_pg: 2782got_pg:
2847 if (kmemcheck_enabled)
2848 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2849
2850 return page; 2783 return page;
2851} 2784}
2852 2785
@@ -2857,14 +2790,16 @@ struct page *
2857__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2790__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2858 struct zonelist *zonelist, nodemask_t *nodemask) 2791 struct zonelist *zonelist, nodemask_t *nodemask)
2859{ 2792{
2860 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2861 struct zone *preferred_zone;
2862 struct zoneref *preferred_zoneref; 2793 struct zoneref *preferred_zoneref;
2863 struct page *page = NULL; 2794 struct page *page = NULL;
2864 int migratetype = gfpflags_to_migratetype(gfp_mask);
2865 unsigned int cpuset_mems_cookie; 2795 unsigned int cpuset_mems_cookie;
2866 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2796 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2867 int classzone_idx; 2797 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
2798 struct alloc_context ac = {
2799 .high_zoneidx = gfp_zone(gfp_mask),
2800 .nodemask = nodemask,
2801 .migratetype = gfpflags_to_migratetype(gfp_mask),
2802 };
2868 2803
2869 gfp_mask &= gfp_allowed_mask; 2804 gfp_mask &= gfp_allowed_mask;
2870 2805
@@ -2883,37 +2818,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2883 if (unlikely(!zonelist->_zonerefs->zone)) 2818 if (unlikely(!zonelist->_zonerefs->zone))
2884 return NULL; 2819 return NULL;
2885 2820
2886 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) 2821 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
2887 alloc_flags |= ALLOC_CMA; 2822 alloc_flags |= ALLOC_CMA;
2888 2823
2889retry_cpuset: 2824retry_cpuset:
2890 cpuset_mems_cookie = read_mems_allowed_begin(); 2825 cpuset_mems_cookie = read_mems_allowed_begin();
2891 2826
2827 /* We set it here, as __alloc_pages_slowpath might have changed it */
2828 ac.zonelist = zonelist;
2892 /* The preferred zone is used for statistics later */ 2829 /* The preferred zone is used for statistics later */
2893 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2830 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
2894 nodemask ? : &cpuset_current_mems_allowed, 2831 ac.nodemask ? : &cpuset_current_mems_allowed,
2895 &preferred_zone); 2832 &ac.preferred_zone);
2896 if (!preferred_zone) 2833 if (!ac.preferred_zone)
2897 goto out; 2834 goto out;
2898 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2835 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
2899 2836
2900 /* First allocation attempt */ 2837 /* First allocation attempt */
2901 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2838 alloc_mask = gfp_mask|__GFP_HARDWALL;
2902 zonelist, high_zoneidx, alloc_flags, 2839 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
2903 preferred_zone, classzone_idx, migratetype);
2904 if (unlikely(!page)) { 2840 if (unlikely(!page)) {
2905 /* 2841 /*
2906 * Runtime PM, block IO and its error handling path 2842 * Runtime PM, block IO and its error handling path
2907 * can deadlock because I/O on the device might not 2843 * can deadlock because I/O on the device might not
2908 * complete. 2844 * complete.
2909 */ 2845 */
2910 gfp_mask = memalloc_noio_flags(gfp_mask); 2846 alloc_mask = memalloc_noio_flags(gfp_mask);
2911 page = __alloc_pages_slowpath(gfp_mask, order, 2847
2912 zonelist, high_zoneidx, nodemask, 2848 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
2913 preferred_zone, classzone_idx, migratetype);
2914 } 2849 }
2915 2850
2916 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2851 if (kmemcheck_enabled && page)
2852 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2853
2854 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
2917 2855
2918out: 2856out:
2919 /* 2857 /*
@@ -5047,8 +4985,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5047 pgdat->node_start_pfn = node_start_pfn; 4985 pgdat->node_start_pfn = node_start_pfn;
5048#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4986#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5049 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4987 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5050 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, 4988 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5051 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); 4989 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
5052#endif 4990#endif
5053 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4991 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5054 zones_size, zholes_size); 4992 zones_size, zholes_size);
@@ -5420,9 +5358,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5420 arch_zone_highest_possible_pfn[i]) 5358 arch_zone_highest_possible_pfn[i])
5421 pr_cont("empty\n"); 5359 pr_cont("empty\n");
5422 else 5360 else
5423 pr_cont("[mem %0#10lx-%0#10lx]\n", 5361 pr_cont("[mem %#018Lx-%#018Lx]\n",
5424 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5362 (u64)arch_zone_lowest_possible_pfn[i]
5425 (arch_zone_highest_possible_pfn[i] 5363 << PAGE_SHIFT,
5364 ((u64)arch_zone_highest_possible_pfn[i]
5426 << PAGE_SHIFT) - 1); 5365 << PAGE_SHIFT) - 1);
5427 } 5366 }
5428 5367
@@ -5430,15 +5369,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5430 pr_info("Movable zone start for each node\n"); 5369 pr_info("Movable zone start for each node\n");
5431 for (i = 0; i < MAX_NUMNODES; i++) { 5370 for (i = 0; i < MAX_NUMNODES; i++) {
5432 if (zone_movable_pfn[i]) 5371 if (zone_movable_pfn[i])
5433 pr_info(" Node %d: %#010lx\n", i, 5372 pr_info(" Node %d: %#018Lx\n", i,
5434 zone_movable_pfn[i] << PAGE_SHIFT); 5373 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
5435 } 5374 }
5436 5375
5437 /* Print out the early node map */ 5376 /* Print out the early node map */
5438 pr_info("Early memory node ranges\n"); 5377 pr_info("Early memory node ranges\n");
5439 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5378 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5440 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5379 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5441 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5380 (u64)start_pfn << PAGE_SHIFT,
5381 ((u64)end_pfn << PAGE_SHIFT) - 1);
5442 5382
5443 /* Initialise every node */ 5383 /* Initialise every node */
5444 mminit_verify_pageflags_layout(); 5384 mminit_verify_pageflags_layout();
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a009574fbba9..11b4beda14ba 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
166/** 166/**
167 * page_counter_memparse - memparse() for page counter limits 167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse 168 * @buf: string to parse
169 * @max: string meaning maximum possible value
169 * @nr_pages: returns the result in number of pages 170 * @nr_pages: returns the result in number of pages
170 * 171 *
171 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be 172 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
172 * limited to %PAGE_COUNTER_MAX. 173 * limited to %PAGE_COUNTER_MAX.
173 */ 174 */
174int page_counter_memparse(const char *buf, unsigned long *nr_pages) 175int page_counter_memparse(const char *buf, const char *max,
176 unsigned long *nr_pages)
175{ 177{
176 char unlimited[] = "-1";
177 char *end; 178 char *end;
178 u64 bytes; 179 u64 bytes;
179 180
180 if (!strncmp(buf, unlimited, sizeof(unlimited))) { 181 if (!strcmp(buf, max)) {
181 *nr_pages = PAGE_COUNTER_MAX; 182 *nr_pages = PAGE_COUNTER_MAX;
182 return 0; 183 return 0;
183 } 184 }
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9ab4a9b5bc09..0993f5f36b01 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order)
59 59
60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) 60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
61{ 61{
62 struct page_ext *page_ext; 62 struct page_ext *page_ext = lookup_page_ext(page);
63 struct stack_trace *trace; 63 struct stack_trace trace = {
64 64 .nr_entries = 0,
65 page_ext = lookup_page_ext(page); 65 .max_entries = ARRAY_SIZE(page_ext->trace_entries),
66 .entries = &page_ext->trace_entries[0],
67 .skip = 3,
68 };
66 69
67 trace = &page_ext->trace; 70 save_stack_trace(&trace);
68 trace->nr_entries = 0;
69 trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
70 trace->entries = &page_ext->trace_entries[0];
71 trace->skip = 3;
72 save_stack_trace(&page_ext->trace);
73 71
74 page_ext->order = order; 72 page_ext->order = order;
75 page_ext->gfp_mask = gfp_mask; 73 page_ext->gfp_mask = gfp_mask;
74 page_ext->nr_entries = trace.nr_entries;
76 75
77 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 76 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
78} 77}
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
84 int ret; 83 int ret;
85 int pageblock_mt, page_mt; 84 int pageblock_mt, page_mt;
86 char *kbuf; 85 char *kbuf;
86 struct stack_trace trace = {
87 .nr_entries = page_ext->nr_entries,
88 .entries = &page_ext->trace_entries[0],
89 };
87 90
88 kbuf = kmalloc(count, GFP_KERNEL); 91 kbuf = kmalloc(count, GFP_KERNEL);
89 if (!kbuf) 92 if (!kbuf)
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
121 if (ret >= count) 124 if (ret >= count)
122 goto err; 125 goto err;
123 126
124 ret += snprint_stack_trace(kbuf + ret, count - ret, 127 ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
125 &page_ext->trace, 0);
126 if (ret >= count) 128 if (ret >= count)
127 goto err; 129 goto err;
128 130
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b264bda46e1b..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
35 do { 35 do {
36again: 36again:
37 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
38 if (pmd_none(*pmd)) { 38 if (pmd_none(*pmd) || !walk->vma) {
39 if (walk->pte_hole) 39 if (walk->pte_hole)
40 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
41 if (err) 41 if (err)
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd_mm(walk->mm, addr, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_trans_unstable(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
86 break; 86 break;
87 continue; 87 continue;
88 } 88 }
89 if (walk->pud_entry) 89 if (walk->pmd_entry || walk->pte_entry)
90 err = walk->pud_entry(pud, addr, next, walk);
91 if (!err && (walk->pmd_entry || walk->pte_entry))
92 err = walk_pmd_range(pud, addr, next, walk); 90 err = walk_pmd_range(pud, addr, next, walk);
93 if (err) 91 if (err)
94 break; 92 break;
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
97 return err; 95 return err;
98} 96}
99 97
98static int walk_pgd_range(unsigned long addr, unsigned long end,
99 struct mm_walk *walk)
100{
101 pgd_t *pgd;
102 unsigned long next;
103 int err = 0;
104
105 pgd = pgd_offset(walk->mm, addr);
106 do {
107 next = pgd_addr_end(addr, end);
108 if (pgd_none_or_clear_bad(pgd)) {
109 if (walk->pte_hole)
110 err = walk->pte_hole(addr, next, walk);
111 if (err)
112 break;
113 continue;
114 }
115 if (walk->pmd_entry || walk->pte_entry)
116 err = walk_pud_range(pgd, addr, next, walk);
117 if (err)
118 break;
119 } while (pgd++, addr = next, addr != end);
120
121 return err;
122}
123
100#ifdef CONFIG_HUGETLB_PAGE 124#ifdef CONFIG_HUGETLB_PAGE
101static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, 125static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
102 unsigned long end) 126 unsigned long end)
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
105 return boundary < end ? boundary : end; 129 return boundary < end ? boundary : end;
106} 130}
107 131
108static int walk_hugetlb_range(struct vm_area_struct *vma, 132static int walk_hugetlb_range(unsigned long addr, unsigned long end,
109 unsigned long addr, unsigned long end,
110 struct mm_walk *walk) 133 struct mm_walk *walk)
111{ 134{
135 struct vm_area_struct *vma = walk->vma;
112 struct hstate *h = hstate_vma(vma); 136 struct hstate *h = hstate_vma(vma);
113 unsigned long next; 137 unsigned long next;
114 unsigned long hmask = huge_page_mask(h); 138 unsigned long hmask = huge_page_mask(h);
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
121 if (pte && walk->hugetlb_entry) 145 if (pte && walk->hugetlb_entry)
122 err = walk->hugetlb_entry(pte, hmask, addr, next, walk); 146 err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
123 if (err) 147 if (err)
124 return err; 148 break;
125 } while (addr = next, addr != end); 149 } while (addr = next, addr != end);
126 150
127 return 0; 151 return err;
128} 152}
129 153
130#else /* CONFIG_HUGETLB_PAGE */ 154#else /* CONFIG_HUGETLB_PAGE */
131static int walk_hugetlb_range(struct vm_area_struct *vma, 155static int walk_hugetlb_range(unsigned long addr, unsigned long end,
132 unsigned long addr, unsigned long end,
133 struct mm_walk *walk) 156 struct mm_walk *walk)
134{ 157{
135 return 0; 158 return 0;
@@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
137 160
138#endif /* CONFIG_HUGETLB_PAGE */ 161#endif /* CONFIG_HUGETLB_PAGE */
139 162
163/*
164 * Decide whether we really walk over the current vma on [@start, @end)
165 * or skip it via the returned value. Return 0 if we do walk over the
166 * current vma, and return 1 if we skip the vma. Negative values means
167 * error, where we abort the current walk.
168 */
169static int walk_page_test(unsigned long start, unsigned long end,
170 struct mm_walk *walk)
171{
172 struct vm_area_struct *vma = walk->vma;
173
174 if (walk->test_walk)
175 return walk->test_walk(start, end, walk);
176
177 /*
178 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
179 * range, so we don't walk over it as we do for normal vmas. However,
180 * Some callers are interested in handling hole range and they don't
181 * want to just ignore any single address range. Such users certainly
182 * define their ->pte_hole() callbacks, so let's delegate them to handle
183 * vma(VM_PFNMAP).
184 */
185 if (vma->vm_flags & VM_PFNMAP) {
186 int err = 1;
187 if (walk->pte_hole)
188 err = walk->pte_hole(start, end, walk);
189 return err ? err : 1;
190 }
191 return 0;
192}
193
194static int __walk_page_range(unsigned long start, unsigned long end,
195 struct mm_walk *walk)
196{
197 int err = 0;
198 struct vm_area_struct *vma = walk->vma;
199
200 if (vma && is_vm_hugetlb_page(vma)) {
201 if (walk->hugetlb_entry)
202 err = walk_hugetlb_range(start, end, walk);
203 } else
204 err = walk_pgd_range(start, end, walk);
140 205
206 return err;
207}
141 208
142/** 209/**
143 * walk_page_range - walk a memory map's page tables with a callback 210 * walk_page_range - walk page table with caller specific callbacks
144 * @addr: starting address
145 * @end: ending address
146 * @walk: set of callbacks to invoke for each level of the tree
147 * 211 *
148 * Recursively walk the page table for the memory area in a VMA, 212 * Recursively walk the page table tree of the process represented by @walk->mm
149 * calling supplied callbacks. Callbacks are called in-order (first 213 * within the virtual address range [@start, @end). During walking, we can do
150 * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, 214 * some caller-specific works for each entry, by setting up pmd_entry(),
151 * etc.). If lower-level callbacks are omitted, walking depth is reduced. 215 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
216 * callbacks, the associated entries/pages are just ignored.
217 * The return values of these callbacks are commonly defined like below:
218 * - 0 : succeeded to handle the current entry, and if you don't reach the
219 * end address yet, continue to walk.
220 * - >0 : succeeded to handle the current entry, and return to the caller
221 * with caller specific value.
222 * - <0 : failed to handle the current entry, and return to the caller
223 * with error code.
152 * 224 *
153 * Each callback receives an entry pointer and the start and end of the 225 * Before starting to walk page table, some callers want to check whether
154 * associated range, and a copy of the original mm_walk for access to 226 * they really want to walk over the current vma, typically by checking
155 * the ->private or ->mm fields. 227 * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
228 * purpose.
156 * 229 *
157 * Usually no locks are taken, but splitting transparent huge page may 230 * struct mm_walk keeps current values of some common data like vma and pmd,
158 * take page table lock. And the bottom level iterator will map PTE 231 * which are useful for the access from callbacks. If you want to pass some
159 * directories from highmem if necessary. 232 * caller-specific data to callbacks, @walk->private should be helpful.
160 * 233 *
161 * If any callback returns a non-zero value, the walk is aborted and 234 * Locking:
162 * the return value is propagated back to the caller. Otherwise 0 is returned. 235 * Callers of walk_page_range() and walk_page_vma() should hold
163 * 236 * @walk->mm->mmap_sem, because these function traverse vma list and/or
164 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry 237 * access to vma's data.
165 * is !NULL.
166 */ 238 */
167int walk_page_range(unsigned long addr, unsigned long end, 239int walk_page_range(unsigned long start, unsigned long end,
168 struct mm_walk *walk) 240 struct mm_walk *walk)
169{ 241{
170 pgd_t *pgd;
171 unsigned long next;
172 int err = 0; 242 int err = 0;
243 unsigned long next;
244 struct vm_area_struct *vma;
173 245
174 if (addr >= end) 246 if (start >= end)
175 return err; 247 return -EINVAL;
176 248
177 if (!walk->mm) 249 if (!walk->mm)
178 return -EINVAL; 250 return -EINVAL;
179 251
180 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); 252 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
181 253
182 pgd = pgd_offset(walk->mm, addr); 254 vma = find_vma(walk->mm, start);
183 do { 255 do {
184 struct vm_area_struct *vma = NULL; 256 if (!vma) { /* after the last vma */
185 257 walk->vma = NULL;
186 next = pgd_addr_end(addr, end); 258 next = end;
259 } else if (start < vma->vm_start) { /* outside vma */
260 walk->vma = NULL;
261 next = min(end, vma->vm_start);
262 } else { /* inside vma */
263 walk->vma = vma;
264 next = min(end, vma->vm_end);
265 vma = vma->vm_next;
187 266
188 /* 267 err = walk_page_test(start, next, walk);
189 * This function was not intended to be vma based. 268 if (err > 0)
190 * But there are vma special cases to be handled:
191 * - hugetlb vma's
192 * - VM_PFNMAP vma's
193 */
194 vma = find_vma(walk->mm, addr);
195 if (vma) {
196 /*
197 * There are no page structures backing a VM_PFNMAP
198 * range, so do not allow split_huge_page_pmd().
199 */
200 if ((vma->vm_start <= addr) &&
201 (vma->vm_flags & VM_PFNMAP)) {
202 if (walk->pte_hole)
203 err = walk->pte_hole(addr, next, walk);
204 if (err)
205 break;
206 pgd = pgd_offset(walk->mm, next);
207 continue;
208 }
209 /*
210 * Handle hugetlb vma individually because pagetable
211 * walk for the hugetlb page is dependent on the
212 * architecture and we can't handled it in the same
213 * manner as non-huge pages.
214 */
215 if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
216 is_vm_hugetlb_page(vma)) {
217 if (vma->vm_end < next)
218 next = vma->vm_end;
219 /*
220 * Hugepage is very tightly coupled with vma,
221 * so walk through hugetlb entries within a
222 * given vma.
223 */
224 err = walk_hugetlb_range(vma, addr, next, walk);
225 if (err)
226 break;
227 pgd = pgd_offset(walk->mm, next);
228 continue; 269 continue;
229 } 270 if (err < 0)
230 }
231
232 if (pgd_none_or_clear_bad(pgd)) {
233 if (walk->pte_hole)
234 err = walk->pte_hole(addr, next, walk);
235 if (err)
236 break; 271 break;
237 pgd++;
238 continue;
239 } 272 }
240 if (walk->pgd_entry) 273 if (walk->vma || walk->pte_hole)
241 err = walk->pgd_entry(pgd, addr, next, walk); 274 err = __walk_page_range(start, next, walk);
242 if (!err &&
243 (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
244 err = walk_pud_range(pgd, addr, next, walk);
245 if (err) 275 if (err)
246 break; 276 break;
247 pgd++; 277 } while (start = next, start < end);
248 } while (addr = next, addr < end);
249
250 return err; 278 return err;
251} 279}
280
281int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
282{
283 int err;
284
285 if (!walk->mm)
286 return -EINVAL;
287
288 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
289 VM_BUG_ON(!vma);
290 walk->vma = vma;
291 err = walk_page_test(vma->vm_start, vma->vm_end, walk);
292 if (err > 0)
293 return 0;
294 if (err < 0)
295 return err;
296 return __walk_page_range(vma->vm_start, vma->vm_end, walk);
297}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afcd9e11..b1597690530c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
99 size_t bytes; 99 size_t bytes;
100 100
101 /* Get the pages we're interested in */ 101 /* Get the pages we're interested in */
102 down_read(&mm->mmap_sem); 102 pages = get_user_pages_unlocked(task, mm, pa, pages,
103 pages = get_user_pages(task, mm, pa, pages, 103 vm_write, 0, process_pages);
104 vm_write, 0, process_pages, NULL);
105 up_read(&mm->mmap_sem);
106
107 if (pages <= 0) 104 if (pages <= 0)
108 return -EFAULT; 105 return -EFAULT;
109 106
diff --git a/mm/rmap.c b/mm/rmap.c
index 70b32498d4f2..5e3e09081164 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page,
1085void page_add_file_rmap(struct page *page) 1085void page_add_file_rmap(struct page *page)
1086{ 1086{
1087 struct mem_cgroup *memcg; 1087 struct mem_cgroup *memcg;
1088 unsigned long flags;
1089 bool locked;
1090 1088
1091 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1089 memcg = mem_cgroup_begin_page_stat(page);
1092 if (atomic_inc_and_test(&page->_mapcount)) { 1090 if (atomic_inc_and_test(&page->_mapcount)) {
1093 __inc_zone_page_state(page, NR_FILE_MAPPED); 1091 __inc_zone_page_state(page, NR_FILE_MAPPED);
1094 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1092 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1095 } 1093 }
1096 mem_cgroup_end_page_stat(memcg, &locked, &flags); 1094 mem_cgroup_end_page_stat(memcg);
1097} 1095}
1098 1096
1099static void page_remove_file_rmap(struct page *page) 1097static void page_remove_file_rmap(struct page *page)
1100{ 1098{
1101 struct mem_cgroup *memcg; 1099 struct mem_cgroup *memcg;
1102 unsigned long flags;
1103 bool locked;
1104 1100
1105 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1101 memcg = mem_cgroup_begin_page_stat(page);
1106 1102
1107 /* page still mapped by someone else? */ 1103 /* page still mapped by someone else? */
1108 if (!atomic_add_negative(-1, &page->_mapcount)) 1104 if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page)
1123 if (unlikely(PageMlocked(page))) 1119 if (unlikely(PageMlocked(page)))
1124 clear_page_mlock(page); 1120 clear_page_mlock(page);
1125out: 1121out:
1126 mem_cgroup_end_page_stat(memcg, &locked, &flags); 1122 mem_cgroup_end_page_stat(memcg);
1127} 1123}
1128 1124
1129/** 1125/**
diff --git a/mm/shmem.c b/mm/shmem.c
index b3e403181981..864c878401e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,7 +1131,7 @@ repeat:
1131 * truncated or holepunched since swap was confirmed. 1131 * truncated or holepunched since swap was confirmed.
1132 * shmem_undo_range() will have done some of the 1132 * shmem_undo_range() will have done some of the
1133 * unaccounting, now delete_from_swap_cache() will do 1133 * unaccounting, now delete_from_swap_cache() will do
1134 * the rest (including mem_cgroup_uncharge_swapcache). 1134 * the rest.
1135 * Reset swap.val? No, leave it so "failed" goes back to 1135 * Reset swap.val? No, leave it so "failed" goes back to
1136 * "repeat": reading a hole and writing should succeed. 1136 * "repeat": reading a hole and writing should succeed.
1137 */ 1137 */
diff --git a/mm/util.c b/mm/util.c
index fec39d4509a9..f3ef639c4857 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start,
240 int nr_pages, int write, struct page **pages) 240 int nr_pages, int write, struct page **pages)
241{ 241{
242 struct mm_struct *mm = current->mm; 242 struct mm_struct *mm = current->mm;
243 int ret; 243 return get_user_pages_unlocked(current, mm, start, nr_pages,
244 244 write, 0, pages);
245 down_read(&mm->mmap_sem);
246 ret = get_user_pages(current, mm, start, nr_pages,
247 write, 0, pages, NULL);
248 up_read(&mm->mmap_sem);
249
250 return ret;
251} 245}
252EXPORT_SYMBOL_GPL(get_user_pages_fast); 246EXPORT_SYMBOL_GPL(get_user_pages_fast);
253 247
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcd90c891d8e..8e645ee52045 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -91,6 +91,9 @@ struct scan_control {
91 /* Can pages be swapped as part of reclaim? */ 91 /* Can pages be swapped as part of reclaim? */
92 unsigned int may_swap:1; 92 unsigned int may_swap:1;
93 93
94 /* Can cgroups be reclaimed below their normal consumption range? */
95 unsigned int may_thrash:1;
96
94 unsigned int hibernation_mode:1; 97 unsigned int hibernation_mode:1;
95 98
96 /* One of the zones is ready for compaction */ 99 /* One of the zones is ready for compaction */
@@ -1903,8 +1906,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
1903 * latencies, so it's better to scan a minimum amount there as 1906 * latencies, so it's better to scan a minimum amount there as
1904 * well. 1907 * well.
1905 */ 1908 */
1906 if (current_is_kswapd() && !zone_reclaimable(zone)) 1909 if (current_is_kswapd()) {
1907 force_scan = true; 1910 if (!zone_reclaimable(zone))
1911 force_scan = true;
1912 if (!mem_cgroup_lruvec_online(lruvec))
1913 force_scan = true;
1914 }
1908 if (!global_reclaim(sc)) 1915 if (!global_reclaim(sc))
1909 force_scan = true; 1916 force_scan = true;
1910 1917
@@ -2290,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2290 struct lruvec *lruvec; 2297 struct lruvec *lruvec;
2291 int swappiness; 2298 int swappiness;
2292 2299
2300 if (mem_cgroup_low(root, memcg)) {
2301 if (!sc->may_thrash)
2302 continue;
2303 mem_cgroup_events(memcg, MEMCG_LOW, 1);
2304 }
2305
2293 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2306 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2294 swappiness = mem_cgroup_swappiness(memcg); 2307 swappiness = mem_cgroup_swappiness(memcg);
2295 2308
@@ -2311,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2311 mem_cgroup_iter_break(root, memcg); 2324 mem_cgroup_iter_break(root, memcg);
2312 break; 2325 break;
2313 } 2326 }
2314 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2327 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2315 } while (memcg);
2316 2328
2317 /* 2329 /*
2318 * Shrink the slab caches in the same proportion that 2330 * Shrink the slab caches in the same proportion that
@@ -2515,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2515static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2527static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2516 struct scan_control *sc) 2528 struct scan_control *sc)
2517{ 2529{
2530 int initial_priority = sc->priority;
2518 unsigned long total_scanned = 0; 2531 unsigned long total_scanned = 0;
2519 unsigned long writeback_threshold; 2532 unsigned long writeback_threshold;
2520 bool zones_reclaimable; 2533 bool zones_reclaimable;
2521 2534retry:
2522 delayacct_freepages_start(); 2535 delayacct_freepages_start();
2523 2536
2524 if (global_reclaim(sc)) 2537 if (global_reclaim(sc))
@@ -2568,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2568 if (sc->compaction_ready) 2581 if (sc->compaction_ready)
2569 return 1; 2582 return 1;
2570 2583
2584 /* Untapped cgroup reserves? Don't OOM, retry. */
2585 if (!sc->may_thrash) {
2586 sc->priority = initial_priority;
2587 sc->may_thrash = 1;
2588 goto retry;
2589 }
2590
2571 /* Any of the zones still reclaimable? Don't OOM. */ 2591 /* Any of the zones still reclaimable? Don't OOM. */
2572 if (zones_reclaimable) 2592 if (zones_reclaimable)
2573 return 1; 2593 return 1;
@@ -3175,7 +3195,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3175 */ 3195 */
3176 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 3196 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3177 pfmemalloc_watermark_ok(pgdat)) 3197 pfmemalloc_watermark_ok(pgdat))
3178 wake_up(&pgdat->pfmemalloc_wait); 3198 wake_up_all(&pgdat->pfmemalloc_wait);
3179 3199
3180 /* 3200 /*
3181 * Fragmentation may mean that the system cannot be rebalanced 3201 * Fragmentation may mean that the system cannot be rebalanced
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9943e5fd74e6..4f5cd974e11a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1437,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w)
1437 if (need_update(cpu) && 1437 if (need_update(cpu) &&
1438 cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) 1438 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
1439 1439
1440 schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), 1440 schedule_delayed_work_on(cpu,
1441 __round_jiffies_relative(sysctl_stat_interval, cpu)); 1441 &per_cpu(vmstat_work, cpu), 0);
1442 1442
1443 put_online_cpus(); 1443 put_online_cpus();
1444 1444
@@ -1452,7 +1452,7 @@ static void __init start_shepherd_timer(void)
1452 int cpu; 1452 int cpu;
1453 1453
1454 for_each_possible_cpu(cpu) 1454 for_each_possible_cpu(cpu)
1455 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), 1455 INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
1456 vmstat_update); 1456 vmstat_update);
1457 1457
1458 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) 1458 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 555013034f7a..096d91447e06 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -23,17 +23,15 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
23 if (!pages) 23 if (!pages)
24 return ERR_PTR(-ENOMEM); 24 return ERR_PTR(-ENOMEM);
25 25
26 down_read(&current->mm->mmap_sem);
27 while (got < num_pages) { 26 while (got < num_pages) {
28 rc = get_user_pages(current, current->mm, 27 rc = get_user_pages_unlocked(current, current->mm,
29 (unsigned long)data + ((unsigned long)got * PAGE_SIZE), 28 (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
30 num_pages - got, write_page, 0, pages + got, NULL); 29 num_pages - got, write_page, 0, pages + got);
31 if (rc < 0) 30 if (rc < 0)
32 break; 31 break;
33 BUG_ON(rc == 0); 32 BUG_ON(rc == 0);
34 got += rc; 33 got += rc;
35 } 34 }
36 up_read(&current->mm->mmap_sem);
37 if (rc < 0) 35 if (rc < 0)
38 goto fail; 36 goto fail;
39 return pages; 37 return pages;
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 272327134a1b..c2a75c6957a1 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
120 switch (of_cft(of)->private) { 120 switch (of_cft(of)->private) {
121 case RES_LIMIT: 121 case RES_LIMIT:
122 /* see memcontrol.c */ 122 /* see memcontrol.c */
123 ret = page_counter_memparse(buf, &nr_pages); 123 ret = page_counter_memparse(buf, "-1", &nr_pages);
124 if (ret) 124 if (ret)
125 break; 125 break;
126 mutex_lock(&tcp_limit_mutex); 126 mutex_lock(&tcp_limit_mutex);
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 264fbc297e0b..8bdf16b8ba60 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,6 +133,7 @@ static const char * const page_flag_names[] = {
133 [KPF_KSM] = "x:ksm", 133 [KPF_KSM] = "x:ksm",
134 [KPF_THP] = "t:thp", 134 [KPF_THP] = "t:thp",
135 [KPF_BALLOON] = "o:balloon", 135 [KPF_BALLOON] = "o:balloon",
136 [KPF_ZERO_PAGE] = "z:zero_page",
136 137
137 [KPF_RESERVED] = "r:reserved", 138 [KPF_RESERVED] = "r:reserved",
138 [KPF_MLOCKED] = "m:mlocked", 139 [KPF_MLOCKED] = "m:mlocked",
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 5ff7f7f2689a..44660aee335f 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work)
80 80
81 might_sleep(); 81 might_sleep();
82 82
83 kvm_get_user_page_io(NULL, mm, addr, 1, NULL); 83 get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
84 kvm_async_page_present_sync(vcpu, apf); 84 kvm_async_page_present_sync(vcpu, apf);
85 85
86 spin_lock(&vcpu->async_pf.lock); 86 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cc6e2e19982..458b9b14b15c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1128,43 +1128,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1128 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1128 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1129} 1129}
1130 1130
1131int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
1132 unsigned long addr, bool write_fault,
1133 struct page **pagep)
1134{
1135 int npages;
1136 int locked = 1;
1137 int flags = FOLL_TOUCH | FOLL_HWPOISON |
1138 (pagep ? FOLL_GET : 0) |
1139 (write_fault ? FOLL_WRITE : 0);
1140
1141 /*
1142 * If retrying the fault, we get here *not* having allowed the filemap
1143 * to wait on the page lock. We should now allow waiting on the IO with
1144 * the mmap semaphore released.
1145 */
1146 down_read(&mm->mmap_sem);
1147 npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
1148 &locked);
1149 if (!locked) {
1150 VM_BUG_ON(npages);
1151
1152 if (!pagep)
1153 return 0;
1154
1155 /*
1156 * The previous call has now waited on the IO. Now we can
1157 * retry and complete. Pass TRIED to ensure we do not re
1158 * schedule async IO (see e.g. filemap_fault).
1159 */
1160 down_read(&mm->mmap_sem);
1161 npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
1162 pagep, NULL, NULL);
1163 }
1164 up_read(&mm->mmap_sem);
1165 return npages;
1166}
1167
1168static inline int check_user_page_hwpoison(unsigned long addr) 1131static inline int check_user_page_hwpoison(unsigned long addr)
1169{ 1132{
1170 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1133 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1227,15 +1190,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1227 npages = get_user_page_nowait(current, current->mm, 1190 npages = get_user_page_nowait(current, current->mm,
1228 addr, write_fault, page); 1191 addr, write_fault, page);
1229 up_read(&current->mm->mmap_sem); 1192 up_read(&current->mm->mmap_sem);
1230 } else { 1193 } else
1231 /* 1194 npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
1232 * By now we have tried gup_fast, and possibly async_pf, and we 1195 write_fault, 0, page,
1233 * are certainly not atomic. Time to retry the gup, allowing 1196 FOLL_TOUCH|FOLL_HWPOISON);
1234 * mmap semaphore to be relinquished in the case of IO.
1235 */
1236 npages = kvm_get_user_page_io(current, current->mm, addr,
1237 write_fault, page);
1238 }
1239 if (npages != 1) 1197 if (npages != 1)
1240 return npages; 1198 return npages;
1241 1199