author    Linus Torvalds <torvalds@linux-foundation.org>    2012-12-11 21:05:37 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2012-12-11 21:05:37 -0500
commit    608ff1a210ab0e8b969399039bf8e18693605910 (patch)
tree      faea7bb1764461c73d0953089bd5439d91733a03
parent    414a6750e59b0b687034764c464e9ddecac0f7a6 (diff)
parent    74d42d8fe146e870c52bde3b1c692f86cc8ff844 (diff)
Merge branch 'akpm' (Andrew's patchbomb)
Merge misc updates from Andrew Morton:
 "About half of most of MM. Going very early this time due to
  uncertainty over the coreautounifiednumasched things. I'll send the
  other half of most of MM tomorrow. The rest of MM awaits a slab merge
  from Pekka."

* emailed patches from Andrew Morton: (71 commits)
  memory_hotplug: ensure every online node has NORMAL memory
  memory_hotplug: handle empty zone when online_movable/online_kernel
  mm, memory-hotplug: dynamic configure movable memory and portion memory
  drivers/base/node.c: cleanup node_state_attr[]
  bootmem: fix wrong call parameter for free_bootmem()
  avr32, kconfig: remove HAVE_ARCH_BOOTMEM
  mm: cma: remove watermark hacks
  mm: cma: skip watermarks check for already isolated blocks in split_free_page()
  mm, oom: fix race when specifying a thread as the oom origin
  mm, oom: change type of oom_score_adj to short
  mm: cleanup register_node()
  mm, mempolicy: remove duplicate code
  mm/vmscan.c: try_to_freeze() returns boolean
  mm: introduce putback_movable_pages()
  virtio_balloon: introduce migration primitives to balloon pages
  mm: introduce compaction and migration for ballooned pages
  mm: introduce a common interface for balloon pages mobility
  mm: redefine address_space.assoc_mapping
  mm: adjust address_space_operations.migratepage() return code
  arch/sparc/kernel/sys_sparc_64.c: s/COLOUR/COLOR/
  ...
-rw-r--r--  Documentation/cgroups/memory.txt | 6
-rw-r--r--  Documentation/memory-hotplug.txt | 19
-rw-r--r--  arch/alpha/include/asm/mman.h | 11
-rw-r--r--  arch/arm/mm/mmap.c | 132
-rw-r--r--  arch/avr32/Kconfig | 3
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 11
-rw-r--r--  arch/mips/mm/mmap.c | 111
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 11
-rw-r--r--  arch/powerpc/kernel/sysfs.c | 4
-rw-r--r--  arch/powerpc/platforms/cell/celleb_pci.c | 4
-rw-r--r--  arch/s390/include/asm/page.h | 3
-rw-r--r--  arch/sh/mm/mmap.c | 137
-rw-r--r--  arch/sparc/kernel/sys_sparc_32.c | 27
-rw-r--r--  arch/sparc/kernel/sys_sparc_64.c | 150
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 124
-rw-r--r--  arch/tile/mm/hugetlbpage.c | 139
-rw-r--r--  arch/x86/include/asm/elf.h | 6
-rw-r--r--  arch/x86/include/asm/mman.h | 3
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 151
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 130
-rw-r--r--  arch/x86/vdso/vma.c | 2
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 11
-rw-r--r--  drivers/base/memory.c | 42
-rw-r--r--  drivers/base/node.c | 78
-rw-r--r--  drivers/macintosh/smu.c | 2
-rw-r--r--  drivers/staging/android/lowmemorykiller.c | 16
-rw-r--r--  drivers/virtio/virtio_balloon.c | 151
-rw-r--r--  fs/btrfs/disk-io.c | 8
-rw-r--r--  fs/btrfs/file.c | 3
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/buffer.c | 12
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 109
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/nilfs2/page.c | 2
-rw-r--r--  fs/ocfs2/file.c | 5
-rw-r--r--  fs/proc/base.c | 10
-rw-r--r--  fs/splice.c | 5
-rw-r--r--  include/linux/balloon_compaction.h | 272
-rw-r--r--  include/linux/bootmem.h | 4
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/gfp.h | 2
-rw-r--r--  include/linux/huge_mm.h | 4
-rw-r--r--  include/linux/hugetlb.h | 7
-rw-r--r--  include/linux/kernel.h | 14
-rw-r--r--  include/linux/memory.h | 1
-rw-r--r--  include/linux/memory_hotplug.h | 13
-rw-r--r--  include/linux/migrate.h | 19
-rw-r--r--  include/linux/mm.h | 31
-rw-r--r--  include/linux/mm_types.h | 19
-rw-r--r--  include/linux/mmzone.h | 9
-rw-r--r--  include/linux/node.h | 3
-rw-r--r--  include/linux/oom.h | 21
-rw-r--r--  include/linux/page-isolation.h | 10
-rw-r--r--  include/linux/pagemap.h | 16
-rw-r--r--  include/linux/sched.h | 7
-rw-r--r--  include/linux/shm.h | 15
-rw-r--r--  include/linux/types.h | 1
-rw-r--r--  include/linux/writeback.h | 9
-rw-r--r--  include/trace/events/oom.h | 4
-rw-r--r--  include/trace/events/task.h | 8
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 11
-rw-r--r--  include/uapi/asm-generic/mman.h | 2
-rw-r--r--  ipc/shm.c | 3
-rw-r--r--  lib/cpumask.c | 2
-rw-r--r--  mm/Kconfig | 15
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/balloon_compaction.c | 302
-rw-r--r--  mm/bootmem.c | 20
-rw-r--r--  mm/compaction.c | 27
-rw-r--r--  mm/dmapool.c | 24
-rw-r--r--  mm/highmem.c | 29
-rw-r--r--  mm/huge_memory.c | 174
-rw-r--r--  mm/hugetlb.c | 4
-rw-r--r--  mm/internal.h | 5
-rw-r--r--  mm/ksm.c | 21
-rw-r--r--  mm/memcontrol.c | 4
-rw-r--r--  mm/memory-failure.c | 28
-rw-r--r--  mm/memory.c | 8
-rw-r--r--  mm/memory_hotplug.c | 332
-rw-r--r--  mm/mempolicy.c | 21
-rw-r--r--  mm/migrate.c | 99
-rw-r--r--  mm/mmap.c | 513
-rw-r--r--  mm/oom_kill.c | 86
-rw-r--r--  mm/page-writeback.c | 11
-rw-r--r--  mm/page_alloc.c | 176
-rw-r--r--  mm/page_cgroup.c | 3
-rw-r--r--  mm/page_isolation.c | 27
-rw-r--r--  mm/rmap.c | 56
-rw-r--r--  mm/slub.c | 4
-rw-r--r--  mm/sparse.c | 25
-rw-r--r--  mm/swapfile.c | 31
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 26
-rw-r--r--  tools/testing/selftests/vm/Makefile | 4
-rw-r--r--  tools/testing/selftests/vm/thuge-gen.c | 254
96 files changed, 2792 insertions, 1697 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 71c4da413444..a25cb3fafeba 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller
 3. Each page has a pointer to the page_cgroup, which in turn knows the
    cgroup it belongs to
 
-The accounting is done as follows: mem_cgroup_charge() is invoked to set up
-the necessary data structures and check if the cgroup that is being charged
-is over its limit. If it is, then reclaim is invoked on the cgroup.
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
 More details can be found in the reclaim section of this document.
 If everything goes well, a page meta-data-structure called page_cgroup is
 updated. page_cgroup has its own LRU on cgroup.
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6d0c2519cf47..c6f993d491b5 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -161,7 +161,8 @@ a recent addition and not present on older kernels.
                 in the memory block.
 'state'       : read-write
                 at read:  contains online/offline state of memory.
-                at write: user can specify "online", "offline" command
+                at write: user can specify "online_kernel",
+                "online_movable", "online", "offline" command
                 which will be performed on al sections in the block.
 'phys_device' : read-only: designed to show the name of physical memory
                 device. This is not well implemented now.
@@ -255,6 +256,17 @@ For onlining, you have to write "online" to the section's state file as:
 
 % echo online > /sys/devices/system/memory/memoryXXX/state
 
+This onlining will not change the ZONE type of the target memory section,
+If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+
+% echo online_movable > /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+
+And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+
+% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+
 After this, section memoryXXX's state will be 'online' and the amount of
 available memory will be increased.
 
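The same state transitions can also be driven from a program rather than a shell. A minimal userspace sketch (the block number memory260 is made up for illustration; pick a real directory under /sys/devices/system/memory/ on the target machine):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/memory/memory260/state";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* equivalent to: echo online_movable > .../state */
	if (fputs("online_movable", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}
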
@@ -377,15 +389,18 @@ The third argument is passed by pointer of struct memory_notify.
 struct memory_notify {
        unsigned long start_pfn;
        unsigned long nr_pages;
+       int status_change_nid_normal;
        int status_change_nid;
 }
 
 start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
+status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
 status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
-If status_changed_nid >= 0, callback should create/discard structures for the
+If status_changed_nid* >= 0, callback should create/discard structures for the
 node if necessary.
 
 --------------
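For notifier authors, a minimal sketch of the kind of callback the text above describes. register_memory_notifier(), struct memory_notify and the MEM_* actions are the existing memory-hotplug notifier API; the callback body and names are illustrative only:

#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_mem_callback(struct notifier_block *nb,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_ONLINE:
		/* >= 0: this node just gained its first ZONE_NORMAL memory */
		if (mn->status_change_nid_normal >= 0)
			pr_info("node %d now has normal memory\n",
				mn->status_change_nid_normal);
		break;
	case MEM_OFFLINE:
		/* >= 0: this node just lost all of its memory */
		if (mn->status_change_nid >= 0)
			pr_info("node %d lost all memory\n",
				mn->status_change_nid);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
	.notifier_call = example_mem_callback,
};

/* somewhere in init code: register_memory_notifier(&example_mem_nb); */
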
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index cbeb3616a28e..0086b472bc2b 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -63,4 +63,15 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all power of twos.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT	26
+#define MAP_HUGE_MASK	0x3f
+
 #endif /* __ALPHA_MMAN_H__ */
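To make the encoding concrete: the value stored in bits [26:31] is just log2 of the requested huge page size, so 2 MB maps to 21 and 1 GB to 30. A small standalone sketch of building the flag bits by hand; the constants mirror the definitions added above, and the MAP_HUGE_2MB/MAP_HUGE_1GB macros added later in this series are exactly these values pre-shifted:

#include <stdio.h>

#define MAP_HUGE_SHIFT	26
#define MAP_HUGE_MASK	0x3f

/* Encode a power-of-two huge page size into the mmap() flag bits. */
static unsigned long map_huge_flag(unsigned long huge_page_size)
{
	unsigned long log2sz = __builtin_ctzl(huge_page_size);

	return (log2sz & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
}

int main(void)
{
	printf("2MB -> %#lx\n", map_huge_flag(2UL << 20));	/* 21 << 26 */
	printf("1GB -> %#lx\n", map_huge_flag(1UL << 30));	/* 30 << 26 */
	return 0;
}
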
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 89f2b7f7b042..10062ceadd1c 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -11,18 +11,6 @@
11#include <linux/random.h> 11#include <linux/random.h>
12#include <asm/cachetype.h> 12#include <asm/cachetype.h>
13 13
14static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
15 unsigned long pgoff)
16{
17 unsigned long base = addr & ~(SHMLBA-1);
18 unsigned long off = (pgoff << PAGE_SHIFT) & (SHMLBA-1);
19
20 if (base + off <= addr)
21 return base + off;
22
23 return base - off;
24}
25
26#define COLOUR_ALIGN(addr,pgoff) \ 14#define COLOUR_ALIGN(addr,pgoff) \
27 ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ 15 ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \
28 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) 16 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
@@ -69,9 +57,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
69{ 57{
70 struct mm_struct *mm = current->mm; 58 struct mm_struct *mm = current->mm;
71 struct vm_area_struct *vma; 59 struct vm_area_struct *vma;
72 unsigned long start_addr;
73 int do_align = 0; 60 int do_align = 0;
74 int aliasing = cache_is_vipt_aliasing(); 61 int aliasing = cache_is_vipt_aliasing();
62 struct vm_unmapped_area_info info;
75 63
76 /* 64 /*
77 * We only need to do colour alignment if either the I or D 65 * We only need to do colour alignment if either the I or D
@@ -104,46 +92,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
104 (!vma || addr + len <= vma->vm_start)) 92 (!vma || addr + len <= vma->vm_start))
105 return addr; 93 return addr;
106 } 94 }
107 if (len > mm->cached_hole_size) {
108 start_addr = addr = mm->free_area_cache;
109 } else {
110 start_addr = addr = mm->mmap_base;
111 mm->cached_hole_size = 0;
112 }
113 95
114full_search: 96 info.flags = 0;
115 if (do_align) 97 info.length = len;
116 addr = COLOUR_ALIGN(addr, pgoff); 98 info.low_limit = mm->mmap_base;
117 else 99 info.high_limit = TASK_SIZE;
118 addr = PAGE_ALIGN(addr); 100 info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
119 101 info.align_offset = pgoff << PAGE_SHIFT;
120 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 102 return vm_unmapped_area(&info);
121 /* At this point: (!vma || addr < vma->vm_end). */
122 if (TASK_SIZE - len < addr) {
123 /*
124 * Start a new search - just in case we missed
125 * some holes.
126 */
127 if (start_addr != TASK_UNMAPPED_BASE) {
128 start_addr = addr = TASK_UNMAPPED_BASE;
129 mm->cached_hole_size = 0;
130 goto full_search;
131 }
132 return -ENOMEM;
133 }
134 if (!vma || addr + len <= vma->vm_start) {
135 /*
136 * Remember the place where we stopped the search:
137 */
138 mm->free_area_cache = addr + len;
139 return addr;
140 }
141 if (addr + mm->cached_hole_size < vma->vm_start)
142 mm->cached_hole_size = vma->vm_start - addr;
143 addr = vma->vm_end;
144 if (do_align)
145 addr = COLOUR_ALIGN(addr, pgoff);
146 }
147} 103}
148 104
149unsigned long 105unsigned long
@@ -156,6 +112,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
156 unsigned long addr = addr0; 112 unsigned long addr = addr0;
157 int do_align = 0; 113 int do_align = 0;
158 int aliasing = cache_is_vipt_aliasing(); 114 int aliasing = cache_is_vipt_aliasing();
115 struct vm_unmapped_area_info info;
159 116
160 /* 117 /*
161 * We only need to do colour alignment if either the I or D 118 * We only need to do colour alignment if either the I or D
@@ -187,70 +144,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
187 return addr; 144 return addr;
188 } 145 }
189 146
190 /* check if free_area_cache is useful for us */ 147 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
191 if (len <= mm->cached_hole_size) { 148 info.length = len;
192 mm->cached_hole_size = 0; 149 info.low_limit = PAGE_SIZE;
193 mm->free_area_cache = mm->mmap_base; 150 info.high_limit = mm->mmap_base;
194 } 151 info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
195 152 info.align_offset = pgoff << PAGE_SHIFT;
196 /* either no address requested or can't fit in requested address hole */ 153 addr = vm_unmapped_area(&info);
197 addr = mm->free_area_cache;
198 if (do_align) {
199 unsigned long base = COLOUR_ALIGN_DOWN(addr - len, pgoff);
200 addr = base + len;
201 }
202
203 /* make sure it can fit in the remaining address space */
204 if (addr > len) {
205 vma = find_vma(mm, addr-len);
206 if (!vma || addr <= vma->vm_start)
207 /* remember the address as a hint for next time */
208 return (mm->free_area_cache = addr-len);
209 }
210
211 if (mm->mmap_base < len)
212 goto bottomup;
213
214 addr = mm->mmap_base - len;
215 if (do_align)
216 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
217
218 do {
219 /*
220 * Lookup failure means no vma is above this address,
221 * else if new region fits below vma->vm_start,
222 * return with success:
223 */
224 vma = find_vma(mm, addr);
225 if (!vma || addr+len <= vma->vm_start)
226 /* remember the address as a hint for next time */
227 return (mm->free_area_cache = addr);
228 154
229 /* remember the largest hole we saw so far */
230 if (addr + mm->cached_hole_size < vma->vm_start)
231 mm->cached_hole_size = vma->vm_start - addr;
232
233 /* try just below the current vma->vm_start */
234 addr = vma->vm_start - len;
235 if (do_align)
236 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
237 } while (len < vma->vm_start);
238
239bottomup:
240 /* 155 /*
241 * A failed mmap() very likely causes application failure, 156 * A failed mmap() very likely causes application failure,
242 * so fall back to the bottom-up function here. This scenario 157 * so fall back to the bottom-up function here. This scenario
243 * can happen with large stack limits and large mmap() 158 * can happen with large stack limits and large mmap()
244 * allocations. 159 * allocations.
245 */ 160 */
246 mm->cached_hole_size = ~0UL; 161 if (addr & ~PAGE_MASK) {
247 mm->free_area_cache = TASK_UNMAPPED_BASE; 162 VM_BUG_ON(addr != -ENOMEM);
248 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 163 info.flags = 0;
249 /* 164 info.low_limit = mm->mmap_base;
250 * Restore the topdown base: 165 info.high_limit = TASK_SIZE;
251 */ 166 addr = vm_unmapped_area(&info);
252 mm->free_area_cache = mm->mmap_base; 167 }
253 mm->cached_hole_size = ~0UL;
254 168
255 return addr; 169 return addr;
256} 170}
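The same conversion repeats for every architecture in this series: instead of walking VMAs by hand and caching free_area_cache/cached_hole_size, each arch_get_unmapped_area variant now just describes its search window and colouring constraints and lets the common allocator do the walk. Condensed from the new ARM code above (a schematic sketch of the pattern, not an additional API):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/shm.h>

static unsigned long example_unmapped_area_topdown(unsigned long len,
						   unsigned long pgoff)
{
	struct vm_unmapped_area_info info;
	unsigned long addr;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;	/* search high -> low */
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = current->mm->mmap_base;
	/* keep the mapping's cache colour: same offset within SHMLBA */
	info.align_mask = PAGE_MASK & (SHMLBA - 1);
	info.align_offset = pgoff << PAGE_SHIFT;
	addr = vm_unmapped_area(&info);

	/* a non-page-aligned result is an errno; retry bottom-up */
	if (addr & ~PAGE_MASK) {
		info.flags = 0;
		info.low_limit = current->mm->mmap_base;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}
	return addr;
}
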
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 06e73bf665e9..c2bbc9a72222 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -193,9 +193,6 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
 	def_bool y
 
-config HAVE_ARCH_BOOTMEM
-	def_bool n
-
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool n
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 46d3da0d4b92..9a936ac9a942 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -87,4 +87,15 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all power of twos.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT	26
+#define MAP_HUGE_MASK	0x3f
+
 #endif /* _ASM_MMAN_H */
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 302d779d5b0d..d9be7540a6be 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -45,18 +45,6 @@ static unsigned long mmap_base(unsigned long rnd)
45 return PAGE_ALIGN(TASK_SIZE - gap - rnd); 45 return PAGE_ALIGN(TASK_SIZE - gap - rnd);
46} 46}
47 47
48static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
49 unsigned long pgoff)
50{
51 unsigned long base = addr & ~shm_align_mask;
52 unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
53
54 if (base + off <= addr)
55 return base + off;
56
57 return base - off;
58}
59
60#define COLOUR_ALIGN(addr, pgoff) \ 48#define COLOUR_ALIGN(addr, pgoff) \
61 ((((addr) + shm_align_mask) & ~shm_align_mask) + \ 49 ((((addr) + shm_align_mask) & ~shm_align_mask) + \
62 (((pgoff) << PAGE_SHIFT) & shm_align_mask)) 50 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -71,6 +59,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
71 struct vm_area_struct *vma; 59 struct vm_area_struct *vma;
72 unsigned long addr = addr0; 60 unsigned long addr = addr0;
73 int do_color_align; 61 int do_color_align;
62 struct vm_unmapped_area_info info;
74 63
75 if (unlikely(len > TASK_SIZE)) 64 if (unlikely(len > TASK_SIZE))
76 return -ENOMEM; 65 return -ENOMEM;
@@ -107,97 +96,31 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
107 return addr; 96 return addr;
108 } 97 }
109 98
110 if (dir == UP) { 99 info.length = len;
111 addr = mm->mmap_base; 100 info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
112 if (do_color_align) 101 info.align_offset = pgoff << PAGE_SHIFT;
113 addr = COLOUR_ALIGN(addr, pgoff);
114 else
115 addr = PAGE_ALIGN(addr);
116 102
117 for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { 103 if (dir == DOWN) {
118 /* At this point: (!vma || addr < vma->vm_end). */ 104 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
119 if (TASK_SIZE - len < addr) 105 info.low_limit = PAGE_SIZE;
120 return -ENOMEM; 106 info.high_limit = mm->mmap_base;
121 if (!vma || addr + len <= vma->vm_start) 107 addr = vm_unmapped_area(&info);
122 return addr; 108
123 addr = vma->vm_end; 109 if (!(addr & ~PAGE_MASK))
124 if (do_color_align) 110 return addr;
125 addr = COLOUR_ALIGN(addr, pgoff);
126 }
127 } else {
128 /* check if free_area_cache is useful for us */
129 if (len <= mm->cached_hole_size) {
130 mm->cached_hole_size = 0;
131 mm->free_area_cache = mm->mmap_base;
132 }
133 111
134 /*
135 * either no address requested, or the mapping can't fit into
136 * the requested address hole
137 */
138 addr = mm->free_area_cache;
139 if (do_color_align) {
140 unsigned long base =
141 COLOUR_ALIGN_DOWN(addr - len, pgoff);
142 addr = base + len;
143 }
144
145 /* make sure it can fit in the remaining address space */
146 if (likely(addr > len)) {
147 vma = find_vma(mm, addr - len);
148 if (!vma || addr <= vma->vm_start) {
149 /* cache the address as a hint for next time */
150 return mm->free_area_cache = addr - len;
151 }
152 }
153
154 if (unlikely(mm->mmap_base < len))
155 goto bottomup;
156
157 addr = mm->mmap_base - len;
158 if (do_color_align)
159 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
160
161 do {
162 /*
163 * Lookup failure means no vma is above this address,
164 * else if new region fits below vma->vm_start,
165 * return with success:
166 */
167 vma = find_vma(mm, addr);
168 if (likely(!vma || addr + len <= vma->vm_start)) {
169 /* cache the address as a hint for next time */
170 return mm->free_area_cache = addr;
171 }
172
173 /* remember the largest hole we saw so far */
174 if (addr + mm->cached_hole_size < vma->vm_start)
175 mm->cached_hole_size = vma->vm_start - addr;
176
177 /* try just below the current vma->vm_start */
178 addr = vma->vm_start - len;
179 if (do_color_align)
180 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
181 } while (likely(len < vma->vm_start));
182
183bottomup:
184 /* 112 /*
185 * A failed mmap() very likely causes application failure, 113 * A failed mmap() very likely causes application failure,
186 * so fall back to the bottom-up function here. This scenario 114 * so fall back to the bottom-up function here. This scenario
187 * can happen with large stack limits and large mmap() 115 * can happen with large stack limits and large mmap()
188 * allocations. 116 * allocations.
189 */ 117 */
190 mm->cached_hole_size = ~0UL;
191 mm->free_area_cache = TASK_UNMAPPED_BASE;
192 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
193 /*
194 * Restore the topdown base:
195 */
196 mm->free_area_cache = mm->mmap_base;
197 mm->cached_hole_size = ~0UL;
198
199 return addr;
200 } 118 }
119
120 info.flags = 0;
121 info.low_limit = mm->mmap_base;
122 info.high_limit = TASK_SIZE;
123 return vm_unmapped_area(&info);
201} 124}
202 125
203unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, 126unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 12219ebce869..294d251ca7b2 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,4 +70,15 @@
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all power of twos.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT	26
+#define MAP_HUGE_MASK	0x3f
+
 #endif /* __PARISC_MMAN_H__ */
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index cf357a059ddb..3ce1f864c2d3 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -607,7 +607,7 @@ static void register_nodes(void)
 
 int sysfs_add_device_to_node(struct device *dev, int nid)
 {
-	struct node *node = &node_devices[nid];
+	struct node *node = node_devices[nid];
 	return sysfs_create_link(&node->dev.kobj, &dev->kobj,
 				 kobject_name(&dev->kobj));
 }
@@ -615,7 +615,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
 
 void sysfs_remove_device_from_node(struct device *dev, int nid)
 {
-	struct node *node = &node_devices[nid];
+	struct node *node = node_devices[nid];
 	sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
diff --git a/arch/powerpc/platforms/cell/celleb_pci.c b/arch/powerpc/platforms/cell/celleb_pci.c
index abc8af43ea7c..173568140a32 100644
--- a/arch/powerpc/platforms/cell/celleb_pci.c
+++ b/arch/powerpc/platforms/cell/celleb_pci.c
@@ -401,11 +401,11 @@ error:
 	} else {
 		if (config && *config) {
 			size = 256;
-			free_bootmem((unsigned long)(*config), size);
+			free_bootmem(__pa(*config), size);
 		}
 		if (res && *res) {
 			size = sizeof(struct celleb_pci_resource);
-			free_bootmem((unsigned long)(*res), size);
+			free_bootmem(__pa(*res), size);
 		}
 	}
 
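This hunk is the "bootmem: fix wrong call parameter for free_bootmem()" change from the series: free_bootmem() takes a physical address, while alloc_bootmem() hands back a kernel virtual one, so the pointer has to be converted with __pa() on the way back. In sketch form (the size and function name are illustrative):

#include <linux/bootmem.h>

static void __init bootmem_roundtrip_example(void)
{
	void *buf = alloc_bootmem(256);	/* kernel virtual address */

	/* ... early-boot use of buf ... */

	/* free_bootmem() wants a physical address plus a size in bytes */
	free_bootmem(__pa(buf), 256);
}
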
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 6d5367060a56..39faa4ac9660 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -158,6 +158,9 @@ static inline int page_reset_referenced(unsigned long addr)
  * race against modification of the referenced bit. This function
  * should therefore only be called if it is not mapped in any
  * address space.
+ *
+ * Note that the bit gets set whenever page content is changed. That means
+ * also when the page is modified by DMA or from inside the kernel.
  */
 #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
 static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped)
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 80bf494ddbcb..6777177807c2 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -30,25 +30,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
30 return base + off; 30 return base + off;
31} 31}
32 32
33static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
34 unsigned long pgoff)
35{
36 unsigned long base = addr & ~shm_align_mask;
37 unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
38
39 if (base + off <= addr)
40 return base + off;
41
42 return base - off;
43}
44
45unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, 33unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
46 unsigned long len, unsigned long pgoff, unsigned long flags) 34 unsigned long len, unsigned long pgoff, unsigned long flags)
47{ 35{
48 struct mm_struct *mm = current->mm; 36 struct mm_struct *mm = current->mm;
49 struct vm_area_struct *vma; 37 struct vm_area_struct *vma;
50 unsigned long start_addr;
51 int do_colour_align; 38 int do_colour_align;
39 struct vm_unmapped_area_info info;
52 40
53 if (flags & MAP_FIXED) { 41 if (flags & MAP_FIXED) {
54 /* We do not accept a shared mapping if it would violate 42 /* We do not accept a shared mapping if it would violate
@@ -79,47 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
79 return addr; 67 return addr;
80 } 68 }
81 69
82 if (len > mm->cached_hole_size) { 70 info.flags = 0;
83 start_addr = addr = mm->free_area_cache; 71 info.length = len;
84 } else { 72 info.low_limit = TASK_UNMAPPED_BASE;
85 mm->cached_hole_size = 0; 73 info.high_limit = TASK_SIZE;
86 start_addr = addr = TASK_UNMAPPED_BASE; 74 info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0;
87 } 75 info.align_offset = pgoff << PAGE_SHIFT;
88 76 return vm_unmapped_area(&info);
89full_search:
90 if (do_colour_align)
91 addr = COLOUR_ALIGN(addr, pgoff);
92 else
93 addr = PAGE_ALIGN(mm->free_area_cache);
94
95 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
96 /* At this point: (!vma || addr < vma->vm_end). */
97 if (unlikely(TASK_SIZE - len < addr)) {
98 /*
99 * Start a new search - just in case we missed
100 * some holes.
101 */
102 if (start_addr != TASK_UNMAPPED_BASE) {
103 start_addr = addr = TASK_UNMAPPED_BASE;
104 mm->cached_hole_size = 0;
105 goto full_search;
106 }
107 return -ENOMEM;
108 }
109 if (likely(!vma || addr + len <= vma->vm_start)) {
110 /*
111 * Remember the place where we stopped the search:
112 */
113 mm->free_area_cache = addr + len;
114 return addr;
115 }
116 if (addr + mm->cached_hole_size < vma->vm_start)
117 mm->cached_hole_size = vma->vm_start - addr;
118
119 addr = vma->vm_end;
120 if (do_colour_align)
121 addr = COLOUR_ALIGN(addr, pgoff);
122 }
123} 77}
124 78
125unsigned long 79unsigned long
@@ -131,6 +85,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
131 struct mm_struct *mm = current->mm; 85 struct mm_struct *mm = current->mm;
132 unsigned long addr = addr0; 86 unsigned long addr = addr0;
133 int do_colour_align; 87 int do_colour_align;
88 struct vm_unmapped_area_info info;
134 89
135 if (flags & MAP_FIXED) { 90 if (flags & MAP_FIXED) {
136 /* We do not accept a shared mapping if it would violate 91 /* We do not accept a shared mapping if it would violate
@@ -162,73 +117,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
162 return addr; 117 return addr;
163 } 118 }
164 119
165 /* check if free_area_cache is useful for us */ 120 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
166 if (len <= mm->cached_hole_size) { 121 info.length = len;
167 mm->cached_hole_size = 0; 122 info.low_limit = PAGE_SIZE;
168 mm->free_area_cache = mm->mmap_base; 123 info.high_limit = mm->mmap_base;
169 } 124 info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0;
170 125 info.align_offset = pgoff << PAGE_SHIFT;
171 /* either no address requested or can't fit in requested address hole */ 126 addr = vm_unmapped_area(&info);
172 addr = mm->free_area_cache;
173 if (do_colour_align) {
174 unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
175 127
176 addr = base + len;
177 }
178
179 /* make sure it can fit in the remaining address space */
180 if (likely(addr > len)) {
181 vma = find_vma(mm, addr-len);
182 if (!vma || addr <= vma->vm_start) {
183 /* remember the address as a hint for next time */
184 return (mm->free_area_cache = addr-len);
185 }
186 }
187
188 if (unlikely(mm->mmap_base < len))
189 goto bottomup;
190
191 addr = mm->mmap_base-len;
192 if (do_colour_align)
193 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
194
195 do {
196 /*
197 * Lookup failure means no vma is above this address,
198 * else if new region fits below vma->vm_start,
199 * return with success:
200 */
201 vma = find_vma(mm, addr);
202 if (likely(!vma || addr+len <= vma->vm_start)) {
203 /* remember the address as a hint for next time */
204 return (mm->free_area_cache = addr);
205 }
206
207 /* remember the largest hole we saw so far */
208 if (addr + mm->cached_hole_size < vma->vm_start)
209 mm->cached_hole_size = vma->vm_start - addr;
210
211 /* try just below the current vma->vm_start */
212 addr = vma->vm_start-len;
213 if (do_colour_align)
214 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
215 } while (likely(len < vma->vm_start));
216
217bottomup:
218 /* 128 /*
219 * A failed mmap() very likely causes application failure, 129 * A failed mmap() very likely causes application failure,
220 * so fall back to the bottom-up function here. This scenario 130 * so fall back to the bottom-up function here. This scenario
221 * can happen with large stack limits and large mmap() 131 * can happen with large stack limits and large mmap()
222 * allocations. 132 * allocations.
223 */ 133 */
224 mm->cached_hole_size = ~0UL; 134 if (addr & ~PAGE_MASK) {
225 mm->free_area_cache = TASK_UNMAPPED_BASE; 135 VM_BUG_ON(addr != -ENOMEM);
226 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 136 info.flags = 0;
227 /* 137 info.low_limit = TASK_UNMAPPED_BASE;
228 * Restore the topdown base: 138 info.high_limit = TASK_SIZE;
229 */ 139 addr = vm_unmapped_area(&info);
230 mm->free_area_cache = mm->mmap_base; 140 }
231 mm->cached_hole_size = ~0UL;
232 141
233 return addr; 142 return addr;
234} 143}
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 0c9b31b22e07..57277c830151 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -34,11 +34,9 @@ asmlinkage unsigned long sys_getpagesize(void)
 	return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */
 }
 
-#define COLOUR_ALIGN(addr)	(((addr)+SHMLBA-1)&~(SHMLBA-1))
-
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
-	struct vm_area_struct * vmm;
+	struct vm_unmapped_area_info info;
 
 	if (flags & MAP_FIXED) {
 		/* We do not accept a shared mapping if it would violate
@@ -56,21 +54,14 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 	if (!addr)
 		addr = TASK_UNMAPPED_BASE;
 
-	if (flags & MAP_SHARED)
-		addr = COLOUR_ALIGN(addr);
-	else
-		addr = PAGE_ALIGN(addr);
-
-	for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
-		/* At this point: (!vmm || addr < vmm->vm_end). */
-		if (TASK_SIZE - PAGE_SIZE - len < addr)
-			return -ENOMEM;
-		if (!vmm || addr + len <= vmm->vm_start)
-			return addr;
-		addr = vmm->vm_end;
-		if (flags & MAP_SHARED)
-			addr = COLOUR_ALIGN(addr);
-	}
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = addr;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = (flags & MAP_SHARED) ?
+		(PAGE_MASK & (SHMLBA - 1)) : 0;
+	info.align_offset = pgoff << PAGE_SHIFT;
+	return vm_unmapped_area(&info);
 }
 
 /*
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 878ef3d5fec5..97309c0ec533 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -75,7 +75,7 @@ static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
75 * the spitfire/niagara VA-hole. 75 * the spitfire/niagara VA-hole.
76 */ 76 */
77 77
78static inline unsigned long COLOUR_ALIGN(unsigned long addr, 78static inline unsigned long COLOR_ALIGN(unsigned long addr,
79 unsigned long pgoff) 79 unsigned long pgoff)
80{ 80{
81 unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1); 81 unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1);
@@ -84,24 +84,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
84 return base + off; 84 return base + off;
85} 85}
86 86
87static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
88 unsigned long pgoff)
89{
90 unsigned long base = addr & ~(SHMLBA-1);
91 unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
92
93 if (base + off <= addr)
94 return base + off;
95 return base - off;
96}
97
98unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) 87unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
99{ 88{
100 struct mm_struct *mm = current->mm; 89 struct mm_struct *mm = current->mm;
101 struct vm_area_struct * vma; 90 struct vm_area_struct * vma;
102 unsigned long task_size = TASK_SIZE; 91 unsigned long task_size = TASK_SIZE;
103 unsigned long start_addr;
104 int do_color_align; 92 int do_color_align;
93 struct vm_unmapped_area_info info;
105 94
106 if (flags & MAP_FIXED) { 95 if (flags & MAP_FIXED) {
107 /* We do not accept a shared mapping if it would violate 96 /* We do not accept a shared mapping if it would violate
@@ -124,7 +113,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
124 113
125 if (addr) { 114 if (addr) {
126 if (do_color_align) 115 if (do_color_align)
127 addr = COLOUR_ALIGN(addr, pgoff); 116 addr = COLOR_ALIGN(addr, pgoff);
128 else 117 else
129 addr = PAGE_ALIGN(addr); 118 addr = PAGE_ALIGN(addr);
130 119
@@ -134,50 +123,22 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
134 return addr; 123 return addr;
135 } 124 }
136 125
137 if (len > mm->cached_hole_size) { 126 info.flags = 0;
138 start_addr = addr = mm->free_area_cache; 127 info.length = len;
139 } else { 128 info.low_limit = TASK_UNMAPPED_BASE;
140 start_addr = addr = TASK_UNMAPPED_BASE; 129 info.high_limit = min(task_size, VA_EXCLUDE_START);
141 mm->cached_hole_size = 0; 130 info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
131 info.align_offset = pgoff << PAGE_SHIFT;
132 addr = vm_unmapped_area(&info);
133
134 if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
135 VM_BUG_ON(addr != -ENOMEM);
136 info.low_limit = VA_EXCLUDE_END;
137 info.high_limit = task_size;
138 addr = vm_unmapped_area(&info);
142 } 139 }
143 140
144 task_size -= len; 141 return addr;
145
146full_search:
147 if (do_color_align)
148 addr = COLOUR_ALIGN(addr, pgoff);
149 else
150 addr = PAGE_ALIGN(addr);
151
152 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
153 /* At this point: (!vma || addr < vma->vm_end). */
154 if (addr < VA_EXCLUDE_START &&
155 (addr + len) >= VA_EXCLUDE_START) {
156 addr = VA_EXCLUDE_END;
157 vma = find_vma(mm, VA_EXCLUDE_END);
158 }
159 if (unlikely(task_size < addr)) {
160 if (start_addr != TASK_UNMAPPED_BASE) {
161 start_addr = addr = TASK_UNMAPPED_BASE;
162 mm->cached_hole_size = 0;
163 goto full_search;
164 }
165 return -ENOMEM;
166 }
167 if (likely(!vma || addr + len <= vma->vm_start)) {
168 /*
169 * Remember the place where we stopped the search:
170 */
171 mm->free_area_cache = addr + len;
172 return addr;
173 }
174 if (addr + mm->cached_hole_size < vma->vm_start)
175 mm->cached_hole_size = vma->vm_start - addr;
176
177 addr = vma->vm_end;
178 if (do_color_align)
179 addr = COLOUR_ALIGN(addr, pgoff);
180 }
181} 142}
182 143
183unsigned long 144unsigned long
@@ -190,6 +151,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
190 unsigned long task_size = STACK_TOP32; 151 unsigned long task_size = STACK_TOP32;
191 unsigned long addr = addr0; 152 unsigned long addr = addr0;
192 int do_color_align; 153 int do_color_align;
154 struct vm_unmapped_area_info info;
193 155
194 /* This should only ever run for 32-bit processes. */ 156 /* This should only ever run for 32-bit processes. */
195 BUG_ON(!test_thread_flag(TIF_32BIT)); 157 BUG_ON(!test_thread_flag(TIF_32BIT));
@@ -214,7 +176,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
214 /* requesting a specific address */ 176 /* requesting a specific address */
215 if (addr) { 177 if (addr) {
216 if (do_color_align) 178 if (do_color_align)
217 addr = COLOUR_ALIGN(addr, pgoff); 179 addr = COLOR_ALIGN(addr, pgoff);
218 else 180 else
219 addr = PAGE_ALIGN(addr); 181 addr = PAGE_ALIGN(addr);
220 182
@@ -224,73 +186,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
224 return addr; 186 return addr;
225 } 187 }
226 188
227 /* check if free_area_cache is useful for us */ 189 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
228 if (len <= mm->cached_hole_size) { 190 info.length = len;
229 mm->cached_hole_size = 0; 191 info.low_limit = PAGE_SIZE;
230 mm->free_area_cache = mm->mmap_base; 192 info.high_limit = mm->mmap_base;
231 } 193 info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
232 194 info.align_offset = pgoff << PAGE_SHIFT;
233 /* either no address requested or can't fit in requested address hole */ 195 addr = vm_unmapped_area(&info);
234 addr = mm->free_area_cache;
235 if (do_color_align) {
236 unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
237 196
238 addr = base + len;
239 }
240
241 /* make sure it can fit in the remaining address space */
242 if (likely(addr > len)) {
243 vma = find_vma(mm, addr-len);
244 if (!vma || addr <= vma->vm_start) {
245 /* remember the address as a hint for next time */
246 return (mm->free_area_cache = addr-len);
247 }
248 }
249
250 if (unlikely(mm->mmap_base < len))
251 goto bottomup;
252
253 addr = mm->mmap_base-len;
254 if (do_color_align)
255 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
256
257 do {
258 /*
259 * Lookup failure means no vma is above this address,
260 * else if new region fits below vma->vm_start,
261 * return with success:
262 */
263 vma = find_vma(mm, addr);
264 if (likely(!vma || addr+len <= vma->vm_start)) {
265 /* remember the address as a hint for next time */
266 return (mm->free_area_cache = addr);
267 }
268
269 /* remember the largest hole we saw so far */
270 if (addr + mm->cached_hole_size < vma->vm_start)
271 mm->cached_hole_size = vma->vm_start - addr;
272
273 /* try just below the current vma->vm_start */
274 addr = vma->vm_start-len;
275 if (do_color_align)
276 addr = COLOUR_ALIGN_DOWN(addr, pgoff);
277 } while (likely(len < vma->vm_start));
278
279bottomup:
280 /* 197 /*
281 * A failed mmap() very likely causes application failure, 198 * A failed mmap() very likely causes application failure,
282 * so fall back to the bottom-up function here. This scenario 199 * so fall back to the bottom-up function here. This scenario
283 * can happen with large stack limits and large mmap() 200 * can happen with large stack limits and large mmap()
284 * allocations. 201 * allocations.
285 */ 202 */
286 mm->cached_hole_size = ~0UL; 203 if (addr & ~PAGE_MASK) {
287 mm->free_area_cache = TASK_UNMAPPED_BASE; 204 VM_BUG_ON(addr != -ENOMEM);
288 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 205 info.flags = 0;
289 /* 206 info.low_limit = TASK_UNMAPPED_BASE;
290 * Restore the topdown base: 207 info.high_limit = STACK_TOP32;
291 */ 208 addr = vm_unmapped_area(&info);
292 mm->free_area_cache = mm->mmap_base; 209 }
293 mm->cached_hole_size = ~0UL;
294 210
295 return addr; 211 return addr;
296} 212}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index f76f83d5ac63..d2b59441ebdd 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -30,55 +30,28 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
30 unsigned long pgoff, 30 unsigned long pgoff,
31 unsigned long flags) 31 unsigned long flags)
32{ 32{
33 struct mm_struct *mm = current->mm;
34 struct vm_area_struct * vma;
35 unsigned long task_size = TASK_SIZE; 33 unsigned long task_size = TASK_SIZE;
36 unsigned long start_addr; 34 struct vm_unmapped_area_info info;
37 35
38 if (test_thread_flag(TIF_32BIT)) 36 if (test_thread_flag(TIF_32BIT))
39 task_size = STACK_TOP32; 37 task_size = STACK_TOP32;
40 if (unlikely(len >= VA_EXCLUDE_START))
41 return -ENOMEM;
42 38
43 if (len > mm->cached_hole_size) { 39 info.flags = 0;
44 start_addr = addr = mm->free_area_cache; 40 info.length = len;
45 } else { 41 info.low_limit = TASK_UNMAPPED_BASE;
46 start_addr = addr = TASK_UNMAPPED_BASE; 42 info.high_limit = min(task_size, VA_EXCLUDE_START);
47 mm->cached_hole_size = 0; 43 info.align_mask = PAGE_MASK & ~HPAGE_MASK;
44 info.align_offset = 0;
45 addr = vm_unmapped_area(&info);
46
47 if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
48 VM_BUG_ON(addr != -ENOMEM);
49 info.low_limit = VA_EXCLUDE_END;
50 info.high_limit = task_size;
51 addr = vm_unmapped_area(&info);
48 } 52 }
49 53
50 task_size -= len; 54 return addr;
51
52full_search:
53 addr = ALIGN(addr, HPAGE_SIZE);
54
55 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
56 /* At this point: (!vma || addr < vma->vm_end). */
57 if (addr < VA_EXCLUDE_START &&
58 (addr + len) >= VA_EXCLUDE_START) {
59 addr = VA_EXCLUDE_END;
60 vma = find_vma(mm, VA_EXCLUDE_END);
61 }
62 if (unlikely(task_size < addr)) {
63 if (start_addr != TASK_UNMAPPED_BASE) {
64 start_addr = addr = TASK_UNMAPPED_BASE;
65 mm->cached_hole_size = 0;
66 goto full_search;
67 }
68 return -ENOMEM;
69 }
70 if (likely(!vma || addr + len <= vma->vm_start)) {
71 /*
72 * Remember the place where we stopped the search:
73 */
74 mm->free_area_cache = addr + len;
75 return addr;
76 }
77 if (addr + mm->cached_hole_size < vma->vm_start)
78 mm->cached_hole_size = vma->vm_start - addr;
79
80 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
81 }
82} 55}
83 56
84static unsigned long 57static unsigned long
@@ -87,71 +60,34 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
87 const unsigned long pgoff, 60 const unsigned long pgoff,
88 const unsigned long flags) 61 const unsigned long flags)
89{ 62{
90 struct vm_area_struct *vma;
91 struct mm_struct *mm = current->mm; 63 struct mm_struct *mm = current->mm;
92 unsigned long addr = addr0; 64 unsigned long addr = addr0;
65 struct vm_unmapped_area_info info;
93 66
94 /* This should only ever run for 32-bit processes. */ 67 /* This should only ever run for 32-bit processes. */
95 BUG_ON(!test_thread_flag(TIF_32BIT)); 68 BUG_ON(!test_thread_flag(TIF_32BIT));
96 69
97 /* check if free_area_cache is useful for us */ 70 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
98 if (len <= mm->cached_hole_size) { 71 info.length = len;
99 mm->cached_hole_size = 0; 72 info.low_limit = PAGE_SIZE;
100 mm->free_area_cache = mm->mmap_base; 73 info.high_limit = mm->mmap_base;
101 } 74 info.align_mask = PAGE_MASK & ~HPAGE_MASK;
102 75 info.align_offset = 0;
103 /* either no address requested or can't fit in requested address hole */ 76 addr = vm_unmapped_area(&info);
104 addr = mm->free_area_cache & HPAGE_MASK;
105
106 /* make sure it can fit in the remaining address space */
107 if (likely(addr > len)) {
108 vma = find_vma(mm, addr-len);
109 if (!vma || addr <= vma->vm_start) {
110 /* remember the address as a hint for next time */
111 return (mm->free_area_cache = addr-len);
112 }
113 }
114
115 if (unlikely(mm->mmap_base < len))
116 goto bottomup;
117
118 addr = (mm->mmap_base-len) & HPAGE_MASK;
119
120 do {
121 /*
122 * Lookup failure means no vma is above this address,
123 * else if new region fits below vma->vm_start,
124 * return with success:
125 */
126 vma = find_vma(mm, addr);
127 if (likely(!vma || addr+len <= vma->vm_start)) {
128 /* remember the address as a hint for next time */
129 return (mm->free_area_cache = addr);
130 }
131
132 /* remember the largest hole we saw so far */
133 if (addr + mm->cached_hole_size < vma->vm_start)
134 mm->cached_hole_size = vma->vm_start - addr;
135
136 /* try just below the current vma->vm_start */
137 addr = (vma->vm_start-len) & HPAGE_MASK;
138 } while (likely(len < vma->vm_start));
139 77
140bottomup:
141 /* 78 /*
142 * A failed mmap() very likely causes application failure, 79 * A failed mmap() very likely causes application failure,
143 * so fall back to the bottom-up function here. This scenario 80 * so fall back to the bottom-up function here. This scenario
144 * can happen with large stack limits and large mmap() 81 * can happen with large stack limits and large mmap()
145 * allocations. 82 * allocations.
146 */ 83 */
147 mm->cached_hole_size = ~0UL; 84 if (addr & ~PAGE_MASK) {
148 mm->free_area_cache = TASK_UNMAPPED_BASE; 85 VM_BUG_ON(addr != -ENOMEM);
149 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 86 info.flags = 0;
150 /* 87 info.low_limit = TASK_UNMAPPED_BASE;
151 * Restore the topdown base: 88 info.high_limit = STACK_TOP32;
152 */ 89 addr = vm_unmapped_area(&info);
153 mm->free_area_cache = mm->mmap_base; 90 }
154 mm->cached_hole_size = ~0UL;
155 91
156 return addr; 92 return addr;
157} 93}
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 812e2d037972..650ccff8378c 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -231,42 +231,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
231 unsigned long pgoff, unsigned long flags) 231 unsigned long pgoff, unsigned long flags)
232{ 232{
233 struct hstate *h = hstate_file(file); 233 struct hstate *h = hstate_file(file);
234 struct mm_struct *mm = current->mm; 234 struct vm_unmapped_area_info info;
235 struct vm_area_struct *vma; 235
236 unsigned long start_addr; 236 info.flags = 0;
237 237 info.length = len;
238 if (len > mm->cached_hole_size) { 238 info.low_limit = TASK_UNMAPPED_BASE;
239 start_addr = mm->free_area_cache; 239 info.high_limit = TASK_SIZE;
240 } else { 240 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
241 start_addr = TASK_UNMAPPED_BASE; 241 info.align_offset = 0;
242 mm->cached_hole_size = 0; 242 return vm_unmapped_area(&info);
243 }
244
245full_search:
246 addr = ALIGN(start_addr, huge_page_size(h));
247
248 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
249 /* At this point: (!vma || addr < vma->vm_end). */
250 if (TASK_SIZE - len < addr) {
251 /*
252 * Start a new search - just in case we missed
253 * some holes.
254 */
255 if (start_addr != TASK_UNMAPPED_BASE) {
256 start_addr = TASK_UNMAPPED_BASE;
257 mm->cached_hole_size = 0;
258 goto full_search;
259 }
260 return -ENOMEM;
261 }
262 if (!vma || addr + len <= vma->vm_start) {
263 mm->free_area_cache = addr + len;
264 return addr;
265 }
266 if (addr + mm->cached_hole_size < vma->vm_start)
267 mm->cached_hole_size = vma->vm_start - addr;
268 addr = ALIGN(vma->vm_end, huge_page_size(h));
269 }
270} 243}
271 244
272static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 245static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -274,92 +247,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
274 unsigned long pgoff, unsigned long flags) 247 unsigned long pgoff, unsigned long flags)
275{ 248{
276 struct hstate *h = hstate_file(file); 249 struct hstate *h = hstate_file(file);
277 struct mm_struct *mm = current->mm; 250 struct vm_unmapped_area_info info;
278 struct vm_area_struct *vma, *prev_vma; 251 unsigned long addr;
279 unsigned long base = mm->mmap_base, addr = addr0;
280 unsigned long largest_hole = mm->cached_hole_size;
281 int first_time = 1;
282
283 /* don't allow allocations above current base */
284 if (mm->free_area_cache > base)
285 mm->free_area_cache = base;
286
287 if (len <= largest_hole) {
288 largest_hole = 0;
289 mm->free_area_cache = base;
290 }
291try_again:
292 /* make sure it can fit in the remaining address space */
293 if (mm->free_area_cache < len)
294 goto fail;
295
296 /* either no address requested or can't fit in requested address hole */
297 addr = (mm->free_area_cache - len) & huge_page_mask(h);
298 do {
299 /*
300 * Lookup failure means no vma is above this address,
301 * i.e. return with success:
302 */
303 vma = find_vma_prev(mm, addr, &prev_vma);
304 if (!vma) {
305 return addr;
306 break;
307 }
308
309 /*
310 * new region fits between prev_vma->vm_end and
311 * vma->vm_start, use it:
312 */
313 if (addr + len <= vma->vm_start &&
314 (!prev_vma || (addr >= prev_vma->vm_end))) {
315 /* remember the address as a hint for next time */
316 mm->cached_hole_size = largest_hole;
317 mm->free_area_cache = addr;
318 return addr;
319 } else {
320 /* pull free_area_cache down to the first hole */
321 if (mm->free_area_cache == vma->vm_end) {
322 mm->free_area_cache = vma->vm_start;
323 mm->cached_hole_size = largest_hole;
324 }
325 }
326 252
327 /* remember the largest hole we saw so far */ 253 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
328 if (addr + largest_hole < vma->vm_start) 254 info.length = len;
329 largest_hole = vma->vm_start - addr; 255 info.low_limit = PAGE_SIZE;
256 info.high_limit = current->mm->mmap_base;
257 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
258 info.align_offset = 0;
259 addr = vm_unmapped_area(&info);
330 260
331 /* try just below the current vma->vm_start */
332 addr = (vma->vm_start - len) & huge_page_mask(h);
333
334 } while (len <= vma->vm_start);
335
336fail:
337 /*
338 * if hint left us with no space for the requested
339 * mapping then try again:
340 */
341 if (first_time) {
342 mm->free_area_cache = base;
343 largest_hole = 0;
344 first_time = 0;
345 goto try_again;
346 }
347 /* 261 /*
348 * A failed mmap() very likely causes application failure, 262 * A failed mmap() very likely causes application failure,
349 * so fall back to the bottom-up function here. This scenario 263 * so fall back to the bottom-up function here. This scenario
350 * can happen with large stack limits and large mmap() 264 * can happen with large stack limits and large mmap()
351 * allocations. 265 * allocations.
352 */ 266 */
353 mm->free_area_cache = TASK_UNMAPPED_BASE; 267 if (addr & ~PAGE_MASK) {
354 mm->cached_hole_size = ~0UL; 268 VM_BUG_ON(addr != -ENOMEM);
355 addr = hugetlb_get_unmapped_area_bottomup(file, addr0, 269 info.flags = 0;
356 len, pgoff, flags); 270 info.low_limit = TASK_UNMAPPED_BASE;
357 271 info.high_limit = TASK_SIZE;
358 /* 272 addr = vm_unmapped_area(&info);
359 * Restore the topdown base: 273 }
360 */
361 mm->free_area_cache = base;
362 mm->cached_hole_size = ~0UL;
363 274
364 return addr; 275 return addr;
365} 276}
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 5939f44fe0c0..9c999c1674fa 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void)
 	return 0;
 }
 
-/* The first two values are special, do not change. See align_addr() */
+/* Do not change the values. See get_align_mask() */
 enum align_flags {
 	ALIGN_VA_32	= BIT(0),
 	ALIGN_VA_64	= BIT(1),
-	ALIGN_VDSO	= BIT(2),
-	ALIGN_TOPDOWN	= BIT(3),
 };
 
 struct va_alignment {
@@ -368,5 +366,5 @@ struct va_alignment {
 } ____cacheline_aligned;
 
 extern struct va_alignment va_align;
-extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
+extern unsigned long align_vdso_addr(unsigned long);
 #endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 593e51d4643f..513b05f15bb4 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -3,6 +3,9 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
+#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+
 #include <asm-generic/mman.h>
 
 #endif /* _ASM_X86_MMAN_H */
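These two macros are the encoding from the mman.h comments earlier in this patch, pre-shifted for the two x86 huge page sizes (21 = log2(2 MB), 30 = log2(1 GB), i.e. 0x54000000 and 0x78000000). A hedged userspace example of asking for 1 GB pages explicitly; it assumes a reserved 1 GB hugetlb pool and guards the macro in case the libc headers do not define it yet:

#define _GNU_SOURCE
#include <sys/mman.h>

#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB	(30 << 26)	/* matches the kernel definition above */
#endif

void *map_1gb_huge_page(void)
{
	/* needs hugepagesz=1G hugepages=N (or the sysfs equivalent) set up */
	return mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
		    -1, 0);
}
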
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index b4d3c3927dd8..97ef74b88e0f 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,37 +21,23 @@
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
- *
- * @flags denotes the allocation direction - bottomup or topdown -
- * or vDSO; see call sites below.
  */
-unsigned long align_addr(unsigned long addr, struct file *filp,
-			 enum align_flags flags)
+static unsigned long get_align_mask(void)
 {
-	unsigned long tmp_addr;
-
 	/* handle 32- and 64-bit case with a single conditional */
 	if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
-		return addr;
+		return 0;
 
 	if (!(current->flags & PF_RANDOMIZE))
-		return addr;
-
-	if (!((flags & ALIGN_VDSO) || filp))
-		return addr;
-
-	tmp_addr = addr;
-
-	/*
-	 * We need an address which is <= than the original
-	 * one only when in topdown direction.
-	 */
-	if (!(flags & ALIGN_TOPDOWN))
-		tmp_addr += va_align.mask;
+		return 0;
 
-	tmp_addr &= ~va_align.mask;
+	return va_align.mask;
+}
 
-	return tmp_addr;
+unsigned long align_vdso_addr(unsigned long addr)
+{
+	unsigned long align_mask = get_align_mask();
+	return (addr + align_mask) & ~align_mask;
 }
 
 static int __init control_va_addr_alignment(char *str)
@@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
126{ 112{
127 struct mm_struct *mm = current->mm; 113 struct mm_struct *mm = current->mm;
128 struct vm_area_struct *vma; 114 struct vm_area_struct *vma;
129 unsigned long start_addr; 115 struct vm_unmapped_area_info info;
130 unsigned long begin, end; 116 unsigned long begin, end;
131 117
132 if (flags & MAP_FIXED) 118 if (flags & MAP_FIXED)
@@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
144 (!vma || addr + len <= vma->vm_start)) 130 (!vma || addr + len <= vma->vm_start))
145 return addr; 131 return addr;
146 } 132 }
147 if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
148 && len <= mm->cached_hole_size) {
149 mm->cached_hole_size = 0;
150 mm->free_area_cache = begin;
151 }
152 addr = mm->free_area_cache;
153 if (addr < begin)
154 addr = begin;
155 start_addr = addr;
156
157full_search:
158
159 addr = align_addr(addr, filp, 0);
160
161 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
162 /* At this point: (!vma || addr < vma->vm_end). */
163 if (end - len < addr) {
164 /*
165 * Start a new search - just in case we missed
166 * some holes.
167 */
168 if (start_addr != begin) {
169 start_addr = addr = begin;
170 mm->cached_hole_size = 0;
171 goto full_search;
172 }
173 return -ENOMEM;
174 }
175 if (!vma || addr + len <= vma->vm_start) {
176 /*
177 * Remember the place where we stopped the search:
178 */
179 mm->free_area_cache = addr + len;
180 return addr;
181 }
182 if (addr + mm->cached_hole_size < vma->vm_start)
183 mm->cached_hole_size = vma->vm_start - addr;
184 133
185 addr = vma->vm_end; 134 info.flags = 0;
186 addr = align_addr(addr, filp, 0); 135 info.length = len;
187 } 136 info.low_limit = begin;
137 info.high_limit = end;
138 info.align_mask = filp ? get_align_mask() : 0;
139 info.align_offset = pgoff << PAGE_SHIFT;
140 return vm_unmapped_area(&info);
188} 141}
189 142
190
191unsigned long 143unsigned long
192arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 144arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
193 const unsigned long len, const unsigned long pgoff, 145 const unsigned long len, const unsigned long pgoff,
@@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195{ 147{
196 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
197 struct mm_struct *mm = current->mm; 149 struct mm_struct *mm = current->mm;
198 unsigned long addr = addr0, start_addr; 150 unsigned long addr = addr0;
151 struct vm_unmapped_area_info info;
199 152
200 /* requested length too big for entire address space */ 153 /* requested length too big for entire address space */
201 if (len > TASK_SIZE) 154 if (len > TASK_SIZE)
@@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
217 return addr; 170 return addr;
218 } 171 }
219 172
220 /* check if free_area_cache is useful for us */ 173 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
221 if (len <= mm->cached_hole_size) { 174 info.length = len;
222 mm->cached_hole_size = 0; 175 info.low_limit = PAGE_SIZE;
223 mm->free_area_cache = mm->mmap_base; 176 info.high_limit = mm->mmap_base;
224 } 177 info.align_mask = filp ? get_align_mask() : 0;
225 178 info.align_offset = pgoff << PAGE_SHIFT;
226try_again: 179 addr = vm_unmapped_area(&info);
227 /* either no address requested or can't fit in requested address hole */ 180 if (!(addr & ~PAGE_MASK))
228 start_addr = addr = mm->free_area_cache; 181 return addr;
229 182 VM_BUG_ON(addr != -ENOMEM);
230 if (addr < len)
231 goto fail;
232
233 addr -= len;
234 do {
235 addr = align_addr(addr, filp, ALIGN_TOPDOWN);
236
237 /*
238 * Lookup failure means no vma is above this address,
239 * else if new region fits below vma->vm_start,
240 * return with success:
241 */
242 vma = find_vma(mm, addr);
243 if (!vma || addr+len <= vma->vm_start)
244 /* remember the address as a hint for next time */
245 return mm->free_area_cache = addr;
246
247 /* remember the largest hole we saw so far */
248 if (addr + mm->cached_hole_size < vma->vm_start)
249 mm->cached_hole_size = vma->vm_start - addr;
250
251 /* try just below the current vma->vm_start */
252 addr = vma->vm_start-len;
253 } while (len < vma->vm_start);
254
255fail:
256 /*
257 * if hint left us with no space for the requested
258 * mapping then try again:
259 */
260 if (start_addr != mm->mmap_base) {
261 mm->free_area_cache = mm->mmap_base;
262 mm->cached_hole_size = 0;
263 goto try_again;
264 }
265 183
266bottomup: 184bottomup:
267 /* 185 /*
@@ -270,14 +188,5 @@ bottomup:
270 * can happen with large stack limits and large mmap() 188 * can happen with large stack limits and large mmap()
271 * allocations. 189 * allocations.
272 */ 190 */
273 mm->cached_hole_size = ~0UL; 191 return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
274 mm->free_area_cache = TASK_UNMAPPED_BASE;
275 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
276 /*
277 * Restore the topdown base:
278 */
279 mm->free_area_cache = mm->mmap_base;
280 mm->cached_hole_size = ~0UL;
281
282 return addr;
283} 192}
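
[editor's sketch, not part of the commit] The hunks above replace the hand-rolled free_area_cache walks with a single vm_unmapped_area(&info) call. The userspace model below is only an assumption-laden illustration of the contract the bottom-up case expresses -- find the lowest gap of `length` bytes between low_limit and high_limit whose start satisfies the align_mask/align_offset constraint -- and not the kernel's augmented-rbtree implementation.

#include <stdio.h>

struct range { unsigned long start, end; };     /* existing mappings, sorted */

/* smallest value >= addr that is congruent to offset modulo (mask + 1) */
static unsigned long align_next(unsigned long addr,
                                unsigned long mask, unsigned long offset)
{
        return addr + ((offset - addr) & mask);
}

static unsigned long find_area(const struct range *vmas, int n,
                               unsigned long length,
                               unsigned long low, unsigned long high,
                               unsigned long mask, unsigned long offset)
{
        unsigned long addr = align_next(low, mask, offset);
        int i;

        for (i = 0; i <= n; i++) {
                unsigned long gap_end = (i < n) ? vmas[i].start : high;

                if (addr + length <= gap_end)
                        return addr;            /* lowest gap that fits */
                if (i < n)
                        addr = align_next(vmas[i].end, mask, offset);
        }
        return (unsigned long)-12;              /* -ENOMEM, as in the kernel */
}

int main(void)
{
        struct range vmas[] = { { 0x10000, 0x30000 }, { 0x40000, 0x48000 } };
        unsigned long addr = find_area(vmas, 2, 0x8000, 0x10000, 0x100000,
                                       0xffff /* 64K alignment */, 0);

        printf("placed at %#lx\n", addr);       /* 0x30000 for this layout */
        return 0;
}
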
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 937bff5cdaa7..ae1aa71d0115 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
274 unsigned long pgoff, unsigned long flags) 274 unsigned long pgoff, unsigned long flags)
275{ 275{
276 struct hstate *h = hstate_file(file); 276 struct hstate *h = hstate_file(file);
277 struct mm_struct *mm = current->mm; 277 struct vm_unmapped_area_info info;
278 struct vm_area_struct *vma; 278
279 unsigned long start_addr; 279 info.flags = 0;
280 280 info.length = len;
281 if (len > mm->cached_hole_size) { 281 info.low_limit = TASK_UNMAPPED_BASE;
282 start_addr = mm->free_area_cache; 282 info.high_limit = TASK_SIZE;
283 } else { 283 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
284 start_addr = TASK_UNMAPPED_BASE; 284 info.align_offset = 0;
285 mm->cached_hole_size = 0; 285 return vm_unmapped_area(&info);
286 }
287
288full_search:
289 addr = ALIGN(start_addr, huge_page_size(h));
290
291 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
292 /* At this point: (!vma || addr < vma->vm_end). */
293 if (TASK_SIZE - len < addr) {
294 /*
295 * Start a new search - just in case we missed
296 * some holes.
297 */
298 if (start_addr != TASK_UNMAPPED_BASE) {
299 start_addr = TASK_UNMAPPED_BASE;
300 mm->cached_hole_size = 0;
301 goto full_search;
302 }
303 return -ENOMEM;
304 }
305 if (!vma || addr + len <= vma->vm_start) {
306 mm->free_area_cache = addr + len;
307 return addr;
308 }
309 if (addr + mm->cached_hole_size < vma->vm_start)
310 mm->cached_hole_size = vma->vm_start - addr;
311 addr = ALIGN(vma->vm_end, huge_page_size(h));
312 }
313} 286}
314 287
315static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 288static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
317 unsigned long pgoff, unsigned long flags) 290 unsigned long pgoff, unsigned long flags)
318{ 291{
319 struct hstate *h = hstate_file(file); 292 struct hstate *h = hstate_file(file);
320 struct mm_struct *mm = current->mm; 293 struct vm_unmapped_area_info info;
321 struct vm_area_struct *vma; 294 unsigned long addr;
322 unsigned long base = mm->mmap_base;
323 unsigned long addr = addr0;
324 unsigned long largest_hole = mm->cached_hole_size;
325 unsigned long start_addr;
326
327 /* don't allow allocations above current base */
328 if (mm->free_area_cache > base)
329 mm->free_area_cache = base;
330
331 if (len <= largest_hole) {
332 largest_hole = 0;
333 mm->free_area_cache = base;
334 }
335try_again:
336 start_addr = mm->free_area_cache;
337
338 /* make sure it can fit in the remaining address space */
339 if (mm->free_area_cache < len)
340 goto fail;
341
342 /* either no address requested or can't fit in requested address hole */
343 addr = (mm->free_area_cache - len) & huge_page_mask(h);
344 do {
345 /*
346 * Lookup failure means no vma is above this address,
347 * i.e. return with success:
348 */
349 vma = find_vma(mm, addr);
350 if (!vma)
351 return addr;
352 295
353 if (addr + len <= vma->vm_start) { 296 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
354 /* remember the address as a hint for next time */ 297 info.length = len;
355 mm->cached_hole_size = largest_hole; 298 info.low_limit = PAGE_SIZE;
356 return (mm->free_area_cache = addr); 299 info.high_limit = current->mm->mmap_base;
357 } else if (mm->free_area_cache == vma->vm_end) { 300 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
358 /* pull free_area_cache down to the first hole */ 301 info.align_offset = 0;
359 mm->free_area_cache = vma->vm_start; 302 addr = vm_unmapped_area(&info);
360 mm->cached_hole_size = largest_hole;
361 }
362 303
363 /* remember the largest hole we saw so far */
364 if (addr + largest_hole < vma->vm_start)
365 largest_hole = vma->vm_start - addr;
366
367 /* try just below the current vma->vm_start */
368 addr = (vma->vm_start - len) & huge_page_mask(h);
369 } while (len <= vma->vm_start);
370
371fail:
372 /*
373 * if hint left us with no space for the requested
374 * mapping then try again:
375 */
376 if (start_addr != base) {
377 mm->free_area_cache = base;
378 largest_hole = 0;
379 goto try_again;
380 }
381 /* 304 /*
382 * A failed mmap() very likely causes application failure, 305 * A failed mmap() very likely causes application failure,
383 * so fall back to the bottom-up function here. This scenario 306 * so fall back to the bottom-up function here. This scenario
384 * can happen with large stack limits and large mmap() 307 * can happen with large stack limits and large mmap()
385 * allocations. 308 * allocations.
386 */ 309 */
387 mm->free_area_cache = TASK_UNMAPPED_BASE; 310 if (addr & ~PAGE_MASK) {
388 mm->cached_hole_size = ~0UL; 311 VM_BUG_ON(addr != -ENOMEM);
389 addr = hugetlb_get_unmapped_area_bottomup(file, addr0, 312 info.flags = 0;
390 len, pgoff, flags); 313 info.low_limit = TASK_UNMAPPED_BASE;
391 314 info.high_limit = TASK_SIZE;
392 /* 315 addr = vm_unmapped_area(&info);
393 * Restore the topdown base: 316 }
394 */
395 mm->free_area_cache = base;
396 mm->cached_hole_size = ~0UL;
397 317
398 return addr; 318 return addr;
399} 319}
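
[editor's sketch, not part of the commit] The fallback path above tests `addr & ~PAGE_MASK` to tell an error apart from a result. Successful searches return page-aligned addresses, while error codes are small negative values cast to unsigned long, so any non-zero low bits mean "error, not an address". A tiny standalone check (4096-byte page size assumed for the demo):

#include <assert.h>

#define DEMO_PAGE_SIZE  4096UL
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))
#define DEMO_ENOMEM     12

int main(void)
{
        unsigned long good = 0x7f0000000000UL;            /* page aligned */
        unsigned long bad  = (unsigned long)-DEMO_ENOMEM; /* error cookie */

        assert((good & ~DEMO_PAGE_MASK) == 0);  /* looks like an address */
        assert((bad  & ~DEMO_PAGE_MASK) != 0);  /* looks like -ENOMEM */
        return 0;
}
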
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 00aaf047b39f..431e87544411 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
141 * unaligned here as a result of stack start randomization. 141 * unaligned here as a result of stack start randomization.
142 */ 142 */
143 addr = PAGE_ALIGN(addr); 143 addr = PAGE_ALIGN(addr);
144 addr = align_addr(addr, NULL, ALIGN_VDSO); 144 addr = align_vdso_addr(addr);
145 145
146 return addr; 146 return addr;
147} 147}
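
[editor's sketch, not part of the commit] The rounding idiom behind the new align_vdso_addr(), shown in isolation: `(addr + mask) & ~mask` rounds up to the next multiple of mask + 1, whereas the removed ALIGN_TOPDOWN path in align_addr() only masked the low bits off (rounded down). The mask values below are arbitrary demo choices.

#include <assert.h>

static unsigned long align_up(unsigned long addr, unsigned long mask)
{
        return (addr + mask) & ~mask;   /* what align_vdso_addr() does */
}

static unsigned long align_down(unsigned long addr, unsigned long mask)
{
        return addr & ~mask;            /* the old ALIGN_TOPDOWN behaviour */
}

int main(void)
{
        assert(align_up(0x12345, 0xfff)   == 0x13000);
        assert(align_down(0x12345, 0xfff) == 0x12000);
        assert(align_up(0x13000, 0xfff)   == 0x13000);  /* already aligned */
        return 0;
}
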
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 25bc6c1309c3..00eed6786d7e 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -93,4 +93,15 @@
93/* compatibility flags */ 93/* compatibility flags */
94#define MAP_FILE 0 94#define MAP_FILE 0
95 95
96/*
97 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
98 * This gives us 6 bits, which is enough until someone invents 128 bit address
99 * spaces.
100 *
101 * Assume these are all powers of two. 
102 * When 0 use the default page size.
103 */
104#define MAP_HUGE_SHIFT 26
105#define MAP_HUGE_MASK 0x3f
106
96#endif /* _XTENSA_MMAN_H */ 107#endif /* _XTENSA_MMAN_H */
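
[editor's sketch, not part of the commit] The comment added above describes the encoding: bits [26:31] of the mmap() flags carry log2 of the requested huge page size, with zero meaning "use the default size". The arithmetic as a standalone sketch:

#include <stdio.h>

#define MAP_HUGE_SHIFT  26
#define MAP_HUGE_MASK   0x3f

static unsigned long encode_huge_size(unsigned int log2_size)
{
        return (unsigned long)(log2_size & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
}

static unsigned int decode_huge_size_log2(unsigned long flags)
{
        return (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
}

int main(void)
{
        unsigned long flags = encode_huge_size(30);     /* 2^30 = 1GB pages */

        printf("flag bits %#lx encode a %lu byte page size\n",
               flags, 1UL << decode_huge_size_log2(flags));
        return 0;
}
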
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c88216a503..987604d56c83 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,13 @@ void unregister_memory_isolate_notifier(struct notifier_block *nb)
70} 70}
71EXPORT_SYMBOL(unregister_memory_isolate_notifier); 71EXPORT_SYMBOL(unregister_memory_isolate_notifier);
72 72
73static void memory_block_release(struct device *dev)
74{
75 struct memory_block *mem = container_of(dev, struct memory_block, dev);
76
77 kfree(mem);
78}
79
73/* 80/*
74 * register_memory - Setup a sysfs device for a memory block 81 * register_memory - Setup a sysfs device for a memory block
75 */ 82 */
@@ -80,6 +87,7 @@ int register_memory(struct memory_block *memory)
80 87
81 memory->dev.bus = &memory_subsys; 88 memory->dev.bus = &memory_subsys;
82 memory->dev.id = memory->start_section_nr / sections_per_block; 89 memory->dev.id = memory->start_section_nr / sections_per_block;
90 memory->dev.release = memory_block_release;
83 91
84 error = device_register(&memory->dev); 92 error = device_register(&memory->dev);
85 return error; 93 return error;
@@ -246,7 +254,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn,
246 * OK to have direct references to sparsemem variables in here. 254 * OK to have direct references to sparsemem variables in here.
247 */ 255 */
248static int 256static int
249memory_block_action(unsigned long phys_index, unsigned long action) 257memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
250{ 258{
251 unsigned long start_pfn; 259 unsigned long start_pfn;
252 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 260 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
@@ -261,7 +269,7 @@ memory_block_action(unsigned long phys_index, unsigned long action)
261 if (!pages_correctly_reserved(start_pfn, nr_pages)) 269 if (!pages_correctly_reserved(start_pfn, nr_pages))
262 return -EBUSY; 270 return -EBUSY;
263 271
264 ret = online_pages(start_pfn, nr_pages); 272 ret = online_pages(start_pfn, nr_pages, online_type);
265 break; 273 break;
266 case MEM_OFFLINE: 274 case MEM_OFFLINE:
267 ret = offline_pages(start_pfn, nr_pages); 275 ret = offline_pages(start_pfn, nr_pages);
@@ -276,7 +284,8 @@ memory_block_action(unsigned long phys_index, unsigned long action)
276} 284}
277 285
278static int __memory_block_change_state(struct memory_block *mem, 286static int __memory_block_change_state(struct memory_block *mem,
279 unsigned long to_state, unsigned long from_state_req) 287 unsigned long to_state, unsigned long from_state_req,
288 int online_type)
280{ 289{
281 int ret = 0; 290 int ret = 0;
282 291
@@ -288,7 +297,7 @@ static int __memory_block_change_state(struct memory_block *mem,
288 if (to_state == MEM_OFFLINE) 297 if (to_state == MEM_OFFLINE)
289 mem->state = MEM_GOING_OFFLINE; 298 mem->state = MEM_GOING_OFFLINE;
290 299
291 ret = memory_block_action(mem->start_section_nr, to_state); 300 ret = memory_block_action(mem->start_section_nr, to_state, online_type);
292 301
293 if (ret) { 302 if (ret) {
294 mem->state = from_state_req; 303 mem->state = from_state_req;
@@ -311,12 +320,14 @@ out:
311} 320}
312 321
313static int memory_block_change_state(struct memory_block *mem, 322static int memory_block_change_state(struct memory_block *mem,
314 unsigned long to_state, unsigned long from_state_req) 323 unsigned long to_state, unsigned long from_state_req,
324 int online_type)
315{ 325{
316 int ret; 326 int ret;
317 327
318 mutex_lock(&mem->state_mutex); 328 mutex_lock(&mem->state_mutex);
319 ret = __memory_block_change_state(mem, to_state, from_state_req); 329 ret = __memory_block_change_state(mem, to_state, from_state_req,
330 online_type);
320 mutex_unlock(&mem->state_mutex); 331 mutex_unlock(&mem->state_mutex);
321 332
322 return ret; 333 return ret;
@@ -330,10 +341,18 @@ store_mem_state(struct device *dev,
330 341
331 mem = container_of(dev, struct memory_block, dev); 342 mem = container_of(dev, struct memory_block, dev);
332 343
333 if (!strncmp(buf, "online", min((int)count, 6))) 344 if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
334 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 345 ret = memory_block_change_state(mem, MEM_ONLINE,
335 else if(!strncmp(buf, "offline", min((int)count, 7))) 346 MEM_OFFLINE, ONLINE_KERNEL);
336 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 347 else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
348 ret = memory_block_change_state(mem, MEM_ONLINE,
349 MEM_OFFLINE, ONLINE_MOVABLE);
350 else if (!strncmp(buf, "online", min_t(int, count, 6)))
351 ret = memory_block_change_state(mem, MEM_ONLINE,
352 MEM_OFFLINE, ONLINE_KEEP);
353 else if(!strncmp(buf, "offline", min_t(int, count, 7)))
354 ret = memory_block_change_state(mem, MEM_OFFLINE,
355 MEM_ONLINE, -1);
337 356
338 if (ret) 357 if (ret)
339 return ret; 358 return ret;
@@ -635,7 +654,6 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section,
635 mem_remove_simple_file(mem, phys_device); 654 mem_remove_simple_file(mem, phys_device);
636 mem_remove_simple_file(mem, removable); 655 mem_remove_simple_file(mem, removable);
637 unregister_memory(mem); 656 unregister_memory(mem);
638 kfree(mem);
639 } else 657 } else
640 kobject_put(&mem->dev.kobj); 658 kobject_put(&mem->dev.kobj);
641 659
@@ -669,7 +687,7 @@ int offline_memory_block(struct memory_block *mem)
669 687
670 mutex_lock(&mem->state_mutex); 688 mutex_lock(&mem->state_mutex);
671 if (mem->state != MEM_OFFLINE) 689 if (mem->state != MEM_OFFLINE)
672 ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 690 ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
673 mutex_unlock(&mem->state_mutex); 691 mutex_unlock(&mem->state_mutex);
674 692
675 return ret; 693 return ret;
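
[editor's sketch, not part of the commit] After this change, store_mem_state() accepts "online_kernel" and "online_movable" in addition to "online" and "offline". A userspace sketch of driving it through sysfs; the block number (memory32) is made up for illustration.

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/devices/system/memory/memory32/state";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        /* ask for this block to be onlined into ZONE_MOVABLE */
        if (fputs("online_movable", f) == EOF)
                perror("write");
        fclose(f);
        return 0;
}
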
diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177216f1..294e31626210 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -252,6 +252,24 @@ static inline void hugetlb_register_node(struct node *node) {}
252static inline void hugetlb_unregister_node(struct node *node) {} 252static inline void hugetlb_unregister_node(struct node *node) {}
253#endif 253#endif
254 254
255static void node_device_release(struct device *dev)
256{
257 struct node *node = to_node(dev);
258
259#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
260 /*
261 * We schedule the work only when a memory section is
262 * onlined/offlined on this node. When we come here,
263 * all the memory on this node has been offlined,
264 * so we won't enqueue new work to this work.
265 *
266 * The work is using node->node_work, so we should
267 * flush work before freeing the memory.
268 */
269 flush_work(&node->node_work);
270#endif
271 kfree(node);
272}
255 273
256/* 274/*
257 * register_node - Setup a sysfs device for a node. 275 * register_node - Setup a sysfs device for a node.
@@ -259,12 +277,13 @@ static inline void hugetlb_unregister_node(struct node *node) {}
259 * 277 *
260 * Initialize and register the node device. 278 * Initialize and register the node device.
261 */ 279 */
262int register_node(struct node *node, int num, struct node *parent) 280static int register_node(struct node *node, int num, struct node *parent)
263{ 281{
264 int error; 282 int error;
265 283
266 node->dev.id = num; 284 node->dev.id = num;
267 node->dev.bus = &node_subsys; 285 node->dev.bus = &node_subsys;
286 node->dev.release = node_device_release;
268 error = device_register(&node->dev); 287 error = device_register(&node->dev);
269 288
270 if (!error){ 289 if (!error){
@@ -306,7 +325,7 @@ void unregister_node(struct node *node)
306 device_unregister(&node->dev); 325 device_unregister(&node->dev);
307} 326}
308 327
309struct node node_devices[MAX_NUMNODES]; 328struct node *node_devices[MAX_NUMNODES];
310 329
311/* 330/*
312 * register cpu under node 331 * register cpu under node
@@ -323,15 +342,15 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid)
323 if (!obj) 342 if (!obj)
324 return 0; 343 return 0;
325 344
326 ret = sysfs_create_link(&node_devices[nid].dev.kobj, 345 ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
327 &obj->kobj, 346 &obj->kobj,
328 kobject_name(&obj->kobj)); 347 kobject_name(&obj->kobj));
329 if (ret) 348 if (ret)
330 return ret; 349 return ret;
331 350
332 return sysfs_create_link(&obj->kobj, 351 return sysfs_create_link(&obj->kobj,
333 &node_devices[nid].dev.kobj, 352 &node_devices[nid]->dev.kobj,
334 kobject_name(&node_devices[nid].dev.kobj)); 353 kobject_name(&node_devices[nid]->dev.kobj));
335} 354}
336 355
337int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) 356int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
@@ -345,10 +364,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
345 if (!obj) 364 if (!obj)
346 return 0; 365 return 0;
347 366
348 sysfs_remove_link(&node_devices[nid].dev.kobj, 367 sysfs_remove_link(&node_devices[nid]->dev.kobj,
349 kobject_name(&obj->kobj)); 368 kobject_name(&obj->kobj));
350 sysfs_remove_link(&obj->kobj, 369 sysfs_remove_link(&obj->kobj,
351 kobject_name(&node_devices[nid].dev.kobj)); 370 kobject_name(&node_devices[nid]->dev.kobj));
352 371
353 return 0; 372 return 0;
354} 373}
@@ -390,15 +409,15 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
390 continue; 409 continue;
391 if (page_nid != nid) 410 if (page_nid != nid)
392 continue; 411 continue;
393 ret = sysfs_create_link_nowarn(&node_devices[nid].dev.kobj, 412 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
394 &mem_blk->dev.kobj, 413 &mem_blk->dev.kobj,
395 kobject_name(&mem_blk->dev.kobj)); 414 kobject_name(&mem_blk->dev.kobj));
396 if (ret) 415 if (ret)
397 return ret; 416 return ret;
398 417
399 return sysfs_create_link_nowarn(&mem_blk->dev.kobj, 418 return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
400 &node_devices[nid].dev.kobj, 419 &node_devices[nid]->dev.kobj,
401 kobject_name(&node_devices[nid].dev.kobj)); 420 kobject_name(&node_devices[nid]->dev.kobj));
402 } 421 }
403 /* mem section does not span the specified node */ 422 /* mem section does not span the specified node */
404 return 0; 423 return 0;
@@ -431,10 +450,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
431 continue; 450 continue;
432 if (node_test_and_set(nid, *unlinked_nodes)) 451 if (node_test_and_set(nid, *unlinked_nodes))
433 continue; 452 continue;
434 sysfs_remove_link(&node_devices[nid].dev.kobj, 453 sysfs_remove_link(&node_devices[nid]->dev.kobj,
435 kobject_name(&mem_blk->dev.kobj)); 454 kobject_name(&mem_blk->dev.kobj));
436 sysfs_remove_link(&mem_blk->dev.kobj, 455 sysfs_remove_link(&mem_blk->dev.kobj,
437 kobject_name(&node_devices[nid].dev.kobj)); 456 kobject_name(&node_devices[nid]->dev.kobj));
438 } 457 }
439 NODEMASK_FREE(unlinked_nodes); 458 NODEMASK_FREE(unlinked_nodes);
440 return 0; 459 return 0;
@@ -500,7 +519,7 @@ static void node_hugetlb_work(struct work_struct *work)
500 519
501static void init_node_hugetlb_work(int nid) 520static void init_node_hugetlb_work(int nid)
502{ 521{
503 INIT_WORK(&node_devices[nid].node_work, node_hugetlb_work); 522 INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
504} 523}
505 524
506static int node_memory_callback(struct notifier_block *self, 525static int node_memory_callback(struct notifier_block *self,
@@ -517,7 +536,7 @@ static int node_memory_callback(struct notifier_block *self,
517 * when transitioning to/from memoryless state. 536 * when transitioning to/from memoryless state.
518 */ 537 */
519 if (nid != NUMA_NO_NODE) 538 if (nid != NUMA_NO_NODE)
520 schedule_work(&node_devices[nid].node_work); 539 schedule_work(&node_devices[nid]->node_work);
521 break; 540 break;
522 541
523 case MEM_GOING_ONLINE: 542 case MEM_GOING_ONLINE:
@@ -558,9 +577,13 @@ int register_one_node(int nid)
558 struct node *parent = NULL; 577 struct node *parent = NULL;
559 578
560 if (p_node != nid) 579 if (p_node != nid)
561 parent = &node_devices[p_node]; 580 parent = node_devices[p_node];
581
582 node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
583 if (!node_devices[nid])
584 return -ENOMEM;
562 585
563 error = register_node(&node_devices[nid], nid, parent); 586 error = register_node(node_devices[nid], nid, parent);
564 587
565 /* link cpu under this node */ 588 /* link cpu under this node */
566 for_each_present_cpu(cpu) { 589 for_each_present_cpu(cpu) {
@@ -581,7 +604,8 @@ int register_one_node(int nid)
581 604
582void unregister_one_node(int nid) 605void unregister_one_node(int nid)
583{ 606{
584 unregister_node(&node_devices[nid]); 607 unregister_node(node_devices[nid]);
608 node_devices[nid] = NULL;
585} 609}
586 610
587/* 611/*
@@ -614,23 +638,23 @@ static ssize_t show_node_state(struct device *dev,
614 { __ATTR(name, 0444, show_node_state, NULL), state } 638 { __ATTR(name, 0444, show_node_state, NULL), state }
615 639
616static struct node_attr node_state_attr[] = { 640static struct node_attr node_state_attr[] = {
617 _NODE_ATTR(possible, N_POSSIBLE), 641 [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
618 _NODE_ATTR(online, N_ONLINE), 642 [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
619 _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY), 643 [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
620 _NODE_ATTR(has_cpu, N_CPU),
621#ifdef CONFIG_HIGHMEM 644#ifdef CONFIG_HIGHMEM
622 _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), 645 [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
623#endif 646#endif
647 [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
624}; 648};
625 649
626static struct attribute *node_state_attrs[] = { 650static struct attribute *node_state_attrs[] = {
627 &node_state_attr[0].attr.attr, 651 &node_state_attr[N_POSSIBLE].attr.attr,
628 &node_state_attr[1].attr.attr, 652 &node_state_attr[N_ONLINE].attr.attr,
629 &node_state_attr[2].attr.attr, 653 &node_state_attr[N_NORMAL_MEMORY].attr.attr,
630 &node_state_attr[3].attr.attr,
631#ifdef CONFIG_HIGHMEM 654#ifdef CONFIG_HIGHMEM
632 &node_state_attr[4].attr.attr, 655 &node_state_attr[N_HIGH_MEMORY].attr.attr,
633#endif 656#endif
657 &node_state_attr[N_CPU].attr.attr,
634 NULL 658 NULL
635}; 659};
636 660
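
[editor's sketch, not part of the commit] The node_state_attr[] cleanup above switches to designated initializers keyed by the node_states enum, so the attribute slots stay correct when entries such as N_HIGH_MEMORY are compiled out or reordered. A standalone illustration of the pattern; the _DEMO names are invented for the example.

#include <stdio.h>

enum node_state_demo {
        N_POSSIBLE_DEMO,
        N_ONLINE_DEMO,
        N_NORMAL_MEMORY_DEMO,
        N_CPU_DEMO,
        NR_NODE_STATES_DEMO,
};

/* designated initializers: slot index follows the enum, not source order */
static const char * const state_name[NR_NODE_STATES_DEMO] = {
        [N_POSSIBLE_DEMO]      = "possible",
        [N_ONLINE_DEMO]        = "online",
        [N_NORMAL_MEMORY_DEMO] = "has_normal_memory",
        [N_CPU_DEMO]           = "has_cpu",
};

int main(void)
{
        int i;

        for (i = 0; i < NR_NODE_STATES_DEMO; i++)
                printf("%d -> %s\n", i, state_name[i]);
        return 0;
}
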
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 7d5a6b40b31c..196368009001 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -565,7 +565,7 @@ fail_msg_node:
565fail_db_node: 565fail_db_node:
566 of_node_put(smu->db_node); 566 of_node_put(smu->db_node);
567fail_bootmem: 567fail_bootmem:
568 free_bootmem((unsigned long)smu, sizeof(struct smu_device)); 568 free_bootmem(__pa(smu), sizeof(struct smu_device));
569 smu = NULL; 569 smu = NULL;
570fail_np: 570fail_np:
571 of_node_put(np); 571 of_node_put(np);
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index b91e4bc332a7..3b91b0fd4de3 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -40,7 +40,7 @@
40#include <linux/notifier.h> 40#include <linux/notifier.h>
41 41
42static uint32_t lowmem_debug_level = 2; 42static uint32_t lowmem_debug_level = 2;
43static int lowmem_adj[6] = { 43static short lowmem_adj[6] = {
44 0, 44 0,
45 1, 45 1,
46 6, 46 6,
@@ -70,9 +70,9 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
70 int rem = 0; 70 int rem = 0;
71 int tasksize; 71 int tasksize;
72 int i; 72 int i;
73 int min_score_adj = OOM_SCORE_ADJ_MAX + 1; 73 short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
74 int selected_tasksize = 0; 74 int selected_tasksize = 0;
75 int selected_oom_score_adj; 75 short selected_oom_score_adj;
76 int array_size = ARRAY_SIZE(lowmem_adj); 76 int array_size = ARRAY_SIZE(lowmem_adj);
77 int other_free = global_page_state(NR_FREE_PAGES); 77 int other_free = global_page_state(NR_FREE_PAGES);
78 int other_file = global_page_state(NR_FILE_PAGES) - 78 int other_file = global_page_state(NR_FILE_PAGES) -
@@ -90,7 +90,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
90 } 90 }
91 } 91 }
92 if (sc->nr_to_scan > 0) 92 if (sc->nr_to_scan > 0)
93 lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %d\n", 93 lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
94 sc->nr_to_scan, sc->gfp_mask, other_free, 94 sc->nr_to_scan, sc->gfp_mask, other_free,
95 other_file, min_score_adj); 95 other_file, min_score_adj);
96 rem = global_page_state(NR_ACTIVE_ANON) + 96 rem = global_page_state(NR_ACTIVE_ANON) +
@@ -107,7 +107,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
107 rcu_read_lock(); 107 rcu_read_lock();
108 for_each_process(tsk) { 108 for_each_process(tsk) {
109 struct task_struct *p; 109 struct task_struct *p;
110 int oom_score_adj; 110 short oom_score_adj;
111 111
112 if (tsk->flags & PF_KTHREAD) 112 if (tsk->flags & PF_KTHREAD)
113 continue; 113 continue;
@@ -141,11 +141,11 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
141 selected = p; 141 selected = p;
142 selected_tasksize = tasksize; 142 selected_tasksize = tasksize;
143 selected_oom_score_adj = oom_score_adj; 143 selected_oom_score_adj = oom_score_adj;
144 lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n", 144 lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n",
145 p->pid, p->comm, oom_score_adj, tasksize); 145 p->pid, p->comm, oom_score_adj, tasksize);
146 } 146 }
147 if (selected) { 147 if (selected) {
148 lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n", 148 lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n",
149 selected->pid, selected->comm, 149 selected->pid, selected->comm,
150 selected_oom_score_adj, selected_tasksize); 150 selected_oom_score_adj, selected_tasksize);
151 lowmem_deathpending_timeout = jiffies + HZ; 151 lowmem_deathpending_timeout = jiffies + HZ;
@@ -176,7 +176,7 @@ static void __exit lowmem_exit(void)
176} 176}
177 177
178module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR); 178module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
179module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size, 179module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
180 S_IRUGO | S_IWUSR); 180 S_IRUGO | S_IWUSR);
181module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, 181module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
182 S_IRUGO | S_IWUSR); 182 S_IRUGO | S_IWUSR);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0908e6044333..2a70558b36ea 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -27,13 +27,15 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/balloon_compaction.h>
30 31
31/* 32/*
32 * Balloon device works in 4K page units. So each page is pointed to by 33 * Balloon device works in 4K page units. So each page is pointed to by
33 * multiple balloon pages. All memory counters in this driver are in balloon 34 * multiple balloon pages. All memory counters in this driver are in balloon
34 * page units. 35 * page units.
35 */ 36 */
36#define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) 37#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
38#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
37 39
38struct virtio_balloon 40struct virtio_balloon
39{ 41{
@@ -52,15 +54,19 @@ struct virtio_balloon
52 /* Number of balloon pages we've told the Host we're not using. */ 54 /* Number of balloon pages we've told the Host we're not using. */
53 unsigned int num_pages; 55 unsigned int num_pages;
54 /* 56 /*
55 * The pages we've told the Host we're not using. 57 * The pages we've told the Host we're not using are enqueued
58 * at vb_dev_info->pages list.
56 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE 59 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
57 * to num_pages above. 60 * to num_pages above.
58 */ 61 */
59 struct list_head pages; 62 struct balloon_dev_info *vb_dev_info;
63
64 /* Synchronize access/update to this struct virtio_balloon elements */
65 struct mutex balloon_lock;
60 66
61 /* The array of pfns we tell the Host about. */ 67 /* The array of pfns we tell the Host about. */
62 unsigned int num_pfns; 68 unsigned int num_pfns;
63 u32 pfns[256]; 69 u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
64 70
65 /* Memory statistics */ 71 /* Memory statistics */
66 int need_stats_update; 72 int need_stats_update;
@@ -122,18 +128,21 @@ static void set_page_pfns(u32 pfns[], struct page *page)
122 128
123static void fill_balloon(struct virtio_balloon *vb, size_t num) 129static void fill_balloon(struct virtio_balloon *vb, size_t num)
124{ 130{
131 struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
132
125 /* We can only do one array worth at a time. */ 133 /* We can only do one array worth at a time. */
126 num = min(num, ARRAY_SIZE(vb->pfns)); 134 num = min(num, ARRAY_SIZE(vb->pfns));
127 135
136 mutex_lock(&vb->balloon_lock);
128 for (vb->num_pfns = 0; vb->num_pfns < num; 137 for (vb->num_pfns = 0; vb->num_pfns < num;
129 vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { 138 vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
130 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY | 139 struct page *page = balloon_page_enqueue(vb_dev_info);
131 __GFP_NOMEMALLOC | __GFP_NOWARN); 140
132 if (!page) { 141 if (!page) {
133 if (printk_ratelimit()) 142 if (printk_ratelimit())
134 dev_printk(KERN_INFO, &vb->vdev->dev, 143 dev_printk(KERN_INFO, &vb->vdev->dev,
135 "Out of puff! Can't get %zu pages\n", 144 "Out of puff! Can't get %u pages\n",
136 num); 145 VIRTIO_BALLOON_PAGES_PER_PAGE);
137 /* Sleep for at least 1/5 of a second before retry. */ 146 /* Sleep for at least 1/5 of a second before retry. */
138 msleep(200); 147 msleep(200);
139 break; 148 break;
@@ -141,14 +150,12 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
141 set_page_pfns(vb->pfns + vb->num_pfns, page); 150 set_page_pfns(vb->pfns + vb->num_pfns, page);
142 vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; 151 vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
143 totalram_pages--; 152 totalram_pages--;
144 list_add(&page->lru, &vb->pages);
145 } 153 }
146 154
147 /* Didn't get any? Oh well. */ 155 /* Did we get any? */
148 if (vb->num_pfns == 0) 156 if (vb->num_pfns != 0)
149 return; 157 tell_host(vb, vb->inflate_vq);
150 158 mutex_unlock(&vb->balloon_lock);
151 tell_host(vb, vb->inflate_vq);
152} 159}
153 160
154static void release_pages_by_pfn(const u32 pfns[], unsigned int num) 161static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
@@ -157,7 +164,7 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
157 164
158 /* Find pfns pointing at start of each page, get pages and free them. */ 165 /* Find pfns pointing at start of each page, get pages and free them. */
159 for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { 166 for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
160 __free_page(balloon_pfn_to_page(pfns[i])); 167 balloon_page_free(balloon_pfn_to_page(pfns[i]));
161 totalram_pages++; 168 totalram_pages++;
162 } 169 }
163} 170}
@@ -165,14 +172,17 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
165static void leak_balloon(struct virtio_balloon *vb, size_t num) 172static void leak_balloon(struct virtio_balloon *vb, size_t num)
166{ 173{
167 struct page *page; 174 struct page *page;
175 struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
168 176
169 /* We can only do one array worth at a time. */ 177 /* We can only do one array worth at a time. */
170 num = min(num, ARRAY_SIZE(vb->pfns)); 178 num = min(num, ARRAY_SIZE(vb->pfns));
171 179
180 mutex_lock(&vb->balloon_lock);
172 for (vb->num_pfns = 0; vb->num_pfns < num; 181 for (vb->num_pfns = 0; vb->num_pfns < num;
173 vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { 182 vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
174 page = list_first_entry(&vb->pages, struct page, lru); 183 page = balloon_page_dequeue(vb_dev_info);
175 list_del(&page->lru); 184 if (!page)
185 break;
176 set_page_pfns(vb->pfns + vb->num_pfns, page); 186 set_page_pfns(vb->pfns + vb->num_pfns, page);
177 vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; 187 vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
178 } 188 }
@@ -183,6 +193,7 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
183 * is true, we *have* to do it in this order 193 * is true, we *have* to do it in this order
184 */ 194 */
185 tell_host(vb, vb->deflate_vq); 195 tell_host(vb, vb->deflate_vq);
196 mutex_unlock(&vb->balloon_lock);
186 release_pages_by_pfn(vb->pfns, vb->num_pfns); 197 release_pages_by_pfn(vb->pfns, vb->num_pfns);
187} 198}
188 199
@@ -339,9 +350,84 @@ static int init_vqs(struct virtio_balloon *vb)
339 return 0; 350 return 0;
340} 351}
341 352
353static const struct address_space_operations virtio_balloon_aops;
354#ifdef CONFIG_BALLOON_COMPACTION
355/*
356 * virtballoon_migratepage - perform the balloon page migration on behalf of
357 * a compaction thread. (called under page lock) 
358 * @mapping: the page->mapping which will be assigned to the new migrated page.
359 * @newpage: page that will replace the isolated page after migration finishes.
360 * @page : the isolated (old) page that is about to be migrated to newpage.
361 * @mode : compaction mode -- not used for balloon page migration.
362 *
363 * After a ballooned page gets isolated by compaction procedures, this is the
364 * function that performs the page migration on behalf of a compaction thread. 
365 * The page migration for virtio balloon is done in a simple swap fashion which
366 * follows these two macro steps:
367 * 1) insert newpage into vb->pages list and update the host about it;
368 * 2) update the host about the old page removed from vb->pages list;
369 *
370 * This function performs the balloon page migration task. 
371 * Called through balloon_mapping->a_ops->migratepage
372 */
373int virtballoon_migratepage(struct address_space *mapping,
374 struct page *newpage, struct page *page, enum migrate_mode mode)
375{
376 struct balloon_dev_info *vb_dev_info = balloon_page_device(page);
377 struct virtio_balloon *vb;
378 unsigned long flags;
379
380 BUG_ON(!vb_dev_info);
381
382 vb = vb_dev_info->balloon_device;
383
384 /*
385 * In order to avoid lock contention while migrating pages concurrently
386 * to leak_balloon() or fill_balloon() we just give up the balloon_lock
387 * this turn, as it is easier to retry the page migration later.
388 * This also prevents fill_balloon() getting stuck into a mutex
389 * recursion in the case it ends up triggering memory compaction
390 * while it is attempting to inflate the balloon. 
391 */
392 if (!mutex_trylock(&vb->balloon_lock))
393 return -EAGAIN;
394
395 /* balloon's page migration 1st step -- inflate "newpage" */
396 spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
397 balloon_page_insert(newpage, mapping, &vb_dev_info->pages);
398 vb_dev_info->isolated_pages--;
399 spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
400 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
401 set_page_pfns(vb->pfns, newpage);
402 tell_host(vb, vb->inflate_vq);
403
404 /*
405 * balloon's page migration 2nd step -- deflate "page"
406 *
407 * It's safe to delete page->lru here because this page is at
408 * an isolated migration list, and this step is expected to happen here
409 */
410 balloon_page_delete(page);
411 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
412 set_page_pfns(vb->pfns, page);
413 tell_host(vb, vb->deflate_vq);
414
415 mutex_unlock(&vb->balloon_lock);
416
417 return MIGRATEPAGE_BALLOON_SUCCESS;
418}
419
420/* define the balloon_mapping->a_ops callback to allow balloon page migration */
421static const struct address_space_operations virtio_balloon_aops = {
422 .migratepage = virtballoon_migratepage,
423};
424#endif /* CONFIG_BALLOON_COMPACTION */
425
342static int virtballoon_probe(struct virtio_device *vdev) 426static int virtballoon_probe(struct virtio_device *vdev)
343{ 427{
344 struct virtio_balloon *vb; 428 struct virtio_balloon *vb;
429 struct address_space *vb_mapping;
430 struct balloon_dev_info *vb_devinfo;
345 int err; 431 int err;
346 432
347 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); 433 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
@@ -350,16 +436,37 @@ static int virtballoon_probe(struct virtio_device *vdev)
350 goto out; 436 goto out;
351 } 437 }
352 438
353 INIT_LIST_HEAD(&vb->pages);
354 vb->num_pages = 0; 439 vb->num_pages = 0;
440 mutex_init(&vb->balloon_lock);
355 init_waitqueue_head(&vb->config_change); 441 init_waitqueue_head(&vb->config_change);
356 init_waitqueue_head(&vb->acked); 442 init_waitqueue_head(&vb->acked);
357 vb->vdev = vdev; 443 vb->vdev = vdev;
358 vb->need_stats_update = 0; 444 vb->need_stats_update = 0;
359 445
446 vb_devinfo = balloon_devinfo_alloc(vb);
447 if (IS_ERR(vb_devinfo)) {
448 err = PTR_ERR(vb_devinfo);
449 goto out_free_vb;
450 }
451
452 vb_mapping = balloon_mapping_alloc(vb_devinfo,
453 (balloon_compaction_check()) ?
454 &virtio_balloon_aops : NULL);
455 if (IS_ERR(vb_mapping)) {
456 /*
457 * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP
458 * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off.
459 */
460 err = PTR_ERR(vb_mapping);
461 if (err != -EOPNOTSUPP)
462 goto out_free_vb_devinfo;
463 }
464
465 vb->vb_dev_info = vb_devinfo;
466
360 err = init_vqs(vb); 467 err = init_vqs(vb);
361 if (err) 468 if (err)
362 goto out_free_vb; 469 goto out_free_vb_mapping;
363 470
364 vb->thread = kthread_run(balloon, vb, "vballoon"); 471 vb->thread = kthread_run(balloon, vb, "vballoon");
365 if (IS_ERR(vb->thread)) { 472 if (IS_ERR(vb->thread)) {
@@ -371,6 +478,10 @@ static int virtballoon_probe(struct virtio_device *vdev)
371 478
372out_del_vqs: 479out_del_vqs:
373 vdev->config->del_vqs(vdev); 480 vdev->config->del_vqs(vdev);
481out_free_vb_mapping:
482 balloon_mapping_free(vb_mapping);
483out_free_vb_devinfo:
484 balloon_devinfo_free(vb_devinfo);
374out_free_vb: 485out_free_vb:
375 kfree(vb); 486 kfree(vb);
376out: 487out:
@@ -396,6 +507,8 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev)
396 507
397 kthread_stop(vb->thread); 508 kthread_stop(vb->thread);
398 remove_common(vb); 509 remove_common(vb);
510 balloon_mapping_free(vb->vb_dev_info->mapping);
511 balloon_devinfo_free(vb->vb_dev_info);
399 kfree(vb); 512 kfree(vb);
400} 513}
401 514
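
[editor's sketch, not part of the commit] virtballoon_migratepage() above takes the balloon_lock with mutex_trylock() and returns -EAGAIN when it is busy -- for instance when fill_balloon() holds it and its allocation is what kicked compaction off -- so the migration is simply retried later instead of sleeping on the lock. The same trylock-and-defer shape, modelled with pthreads (build with -pthread); this is an illustration, not the kernel code.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t balloon_lock = PTHREAD_MUTEX_INITIALIZER;

static int migrate_one_page(void)
{
        if (pthread_mutex_trylock(&balloon_lock) != 0)
                return -EAGAIN;         /* lock busy: ask caller to retry */

        /* ... swap the page between lists and tell the host ... */

        pthread_mutex_unlock(&balloon_lock);
        return 0;
}

int main(void)
{
        printf("uncontended: %d\n", migrate_one_page());

        pthread_mutex_lock(&balloon_lock);
        printf("contended:   %d (-EAGAIN == %d)\n",
               migrate_one_page(), -EAGAIN);
        pthread_mutex_unlock(&balloon_lock);
        return 0;
}
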
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..22a0439e5a86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3416 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3417
3418 if (num_dirty > thresh) { 3418 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3419 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3420 root->fs_info->btree_inode->i_mapping);
3421 } 3421 }
3422 return; 3422 return;
3423} 3423}
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3437 num_dirty = root->fs_info->dirty_metadata_bytes; 3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3438
3439 if (num_dirty > thresh) { 3439 if (num_dirty > thresh) {
3440 balance_dirty_pages_ratelimited_nr( 3440 balance_dirty_pages_ratelimited(
3441 root->fs_info->btree_inode->i_mapping, 1); 3441 root->fs_info->btree_inode->i_mapping);
3442 } 3442 }
3443 return; 3443 return;
3444} 3444}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..a8ee75cb96ee 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1346 1346
1347 cond_resched(); 1347 cond_resched();
1348 1348
1349 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1349 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 dirty_pages);
1351 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1352 btrfs_btree_balance_dirty(root, 1); 1351 btrfs_btree_balance_dirty(root, 1);
1353 1352
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..5b3429ab8ec1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1225 } 1225 }
1226 1226
1227 defrag_count += ret; 1227 defrag_count += ret;
1228 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1228 balance_dirty_pages_ratelimited(inode->i_mapping);
1229 mutex_unlock(&inode->i_mutex); 1229 mutex_unlock(&inode->i_mutex);
1230 1230
1231 if (newer_than) { 1231 if (newer_than) {
diff --git a/fs/buffer.c b/fs/buffer.c
index ec0aca8ba6bf..6e9ed48064fc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -555,7 +555,7 @@ void emergency_thaw_all(void)
555 */ 555 */
556int sync_mapping_buffers(struct address_space *mapping) 556int sync_mapping_buffers(struct address_space *mapping)
557{ 557{
558 struct address_space *buffer_mapping = mapping->assoc_mapping; 558 struct address_space *buffer_mapping = mapping->private_data;
559 559
560 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 560 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
561 return 0; 561 return 0;
@@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
588 struct address_space *buffer_mapping = bh->b_page->mapping; 588 struct address_space *buffer_mapping = bh->b_page->mapping;
589 589
590 mark_buffer_dirty(bh); 590 mark_buffer_dirty(bh);
591 if (!mapping->assoc_mapping) { 591 if (!mapping->private_data) {
592 mapping->assoc_mapping = buffer_mapping; 592 mapping->private_data = buffer_mapping;
593 } else { 593 } else {
594 BUG_ON(mapping->assoc_mapping != buffer_mapping); 594 BUG_ON(mapping->private_data != buffer_mapping);
595 } 595 }
596 if (!bh->b_assoc_map) { 596 if (!bh->b_assoc_map) {
597 spin_lock(&buffer_mapping->private_lock); 597 spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode)
788 if (inode_has_buffers(inode)) { 788 if (inode_has_buffers(inode)) {
789 struct address_space *mapping = &inode->i_data; 789 struct address_space *mapping = &inode->i_data;
790 struct list_head *list = &mapping->private_list; 790 struct list_head *list = &mapping->private_list;
791 struct address_space *buffer_mapping = mapping->assoc_mapping; 791 struct address_space *buffer_mapping = mapping->private_data;
792 792
793 spin_lock(&buffer_mapping->private_lock); 793 spin_lock(&buffer_mapping->private_lock);
794 while (!list_empty(list)) 794 while (!list_empty(list))
@@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode)
811 if (inode_has_buffers(inode)) { 811 if (inode_has_buffers(inode)) {
812 struct address_space *mapping = &inode->i_data; 812 struct address_space *mapping = &inode->i_data;
813 struct list_head *list = &mapping->private_list; 813 struct list_head *list = &mapping->private_list;
814 struct address_space *buffer_mapping = mapping->assoc_mapping; 814 struct address_space *buffer_mapping = mapping->private_data;
815 815
816 spin_lock(&buffer_mapping->private_lock); 816 spin_lock(&buffer_mapping->private_lock);
817 while (!list_empty(list)) { 817 while (!list_empty(list)) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..0f22d09f358d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
768 mapping->host = s->s_bdev->bd_inode; 768 mapping->host = s->s_bdev->bd_inode;
769 mapping->flags = 0; 769 mapping->flags = 0;
770 mapping_set_gfp_mask(mapping, GFP_NOFS); 770 mapping_set_gfp_mask(mapping, GFP_NOFS);
771 mapping->assoc_mapping = NULL; 771 mapping->private_data = NULL;
772 mapping->backing_dev_info = s->s_bdi; 772 mapping->backing_dev_info = s->s_bdi;
773 mapping->writeback_index = 0; 773 mapping->writeback_index = 0;
774 } 774 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..4a55f35a6ced 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
151{ 151{
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 unsigned long start_addr;
155 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
173 return addr; 173 return addr;
174 } 174 }
175 175
176 if (len > mm->cached_hole_size) 176 info.flags = 0;
177 start_addr = mm->free_area_cache; 177 info.length = len;
178 else { 178 info.low_limit = TASK_UNMAPPED_BASE;
179 start_addr = TASK_UNMAPPED_BASE; 179 info.high_limit = TASK_SIZE;
180 mm->cached_hole_size = 0; 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 } 181 info.align_offset = 0;
182 182 return vm_unmapped_area(&info);
183full_search:
184 addr = ALIGN(start_addr, huge_page_size(h));
185
186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
187 /* At this point: (!vma || addr < vma->vm_end). */
188 if (TASK_SIZE - len < addr) {
189 /*
190 * Start a new search - just in case we missed
191 * some holes.
192 */
193 if (start_addr != TASK_UNMAPPED_BASE) {
194 start_addr = TASK_UNMAPPED_BASE;
195 mm->cached_hole_size = 0;
196 goto full_search;
197 }
198 return -ENOMEM;
199 }
200
201 if (!vma || addr + len <= vma->vm_start) {
202 mm->free_area_cache = addr + len;
203 return addr;
204 }
205 if (addr + mm->cached_hole_size < vma->vm_start)
206 mm->cached_hole_size = vma->vm_start - addr;
207 addr = ALIGN(vma->vm_end, huge_page_size(h));
208 }
209} 183}
210#endif 184#endif
211 185
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
608 int rc; 582 int rc;
609 583
610 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
611 if (rc) 585 if (rc != MIGRATEPAGE_SUCCESS)
612 return rc; 586 return rc;
613 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
614 588
615 return 0; 589 return MIGRATEPAGE_SUCCESS;
616} 590}
617 591
618static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
923 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
924}; 898};
925 899
926static struct vfsmount *hugetlbfs_vfsmount; 900static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
927 901
928static int can_do_hugetlb_shm(void) 902static int can_do_hugetlb_shm(void)
929{ 903{
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 906 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
933} 907}
934 908
909static int get_hstate_idx(int page_size_log)
910{
911 struct hstate *h;
912
913 if (!page_size_log)
914 return default_hstate_idx;
915 h = size_to_hstate(1 << page_size_log);
916 if (!h)
917 return -1;
918 return h - hstates;
919}
920
935struct file *hugetlb_file_setup(const char *name, unsigned long addr, 921struct file *hugetlb_file_setup(const char *name, unsigned long addr,
936 size_t size, vm_flags_t acctflag, 922 size_t size, vm_flags_t acctflag,
937 struct user_struct **user, int creat_flags) 923 struct user_struct **user,
924 int creat_flags, int page_size_log)
938{ 925{
939 int error = -ENOMEM; 926 int error = -ENOMEM;
940 struct file *file; 927 struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
944 struct qstr quick_string; 931 struct qstr quick_string;
945 struct hstate *hstate; 932 struct hstate *hstate;
946 unsigned long num_pages; 933 unsigned long num_pages;
934 int hstate_idx;
935
936 hstate_idx = get_hstate_idx(page_size_log);
937 if (hstate_idx < 0)
938 return ERR_PTR(-ENODEV);
947 939
948 *user = NULL; 940 *user = NULL;
949 if (!hugetlbfs_vfsmount) 941 if (!hugetlbfs_vfsmount[hstate_idx])
950 return ERR_PTR(-ENOENT); 942 return ERR_PTR(-ENOENT);
951 943
952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 944 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
963 } 955 }
964 } 956 }
965 957
966 root = hugetlbfs_vfsmount->mnt_root; 958 root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
967 quick_string.name = name; 959 quick_string.name = name;
968 quick_string.len = strlen(quick_string.name); 960 quick_string.len = strlen(quick_string.name);
969 quick_string.hash = 0; 961 quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
971 if (!path.dentry) 963 if (!path.dentry)
972 goto out_shm_unlock; 964 goto out_shm_unlock;
973 965
974 path.mnt = mntget(hugetlbfs_vfsmount); 966 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
975 error = -ENOSPC; 967 error = -ENOSPC;
976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 968 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
977 if (!inode) 969 if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
1011 1003
1012static int __init init_hugetlbfs_fs(void) 1004static int __init init_hugetlbfs_fs(void)
1013{ 1005{
1006 struct hstate *h;
1014 int error; 1007 int error;
1015 struct vfsmount *vfsmount; 1008 int i;
1016 1009
1017 error = bdi_init(&hugetlbfs_backing_dev_info); 1010 error = bdi_init(&hugetlbfs_backing_dev_info);
1018 if (error) 1011 if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
1029 if (error) 1022 if (error)
1030 goto out; 1023 goto out;
1031 1024
1032 vfsmount = kern_mount(&hugetlbfs_fs_type); 1025 i = 0;
1026 for_each_hstate(h) {
1027 char buf[50];
1028 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1033 1029
1034 if (!IS_ERR(vfsmount)) { 1030 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1035 hugetlbfs_vfsmount = vfsmount; 1031 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1036 return 0; 1032 buf);
1037 }
1038 1033
1039 error = PTR_ERR(vfsmount); 1034 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1035 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1036 "page size %uK", ps_kb);
1037 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1038 hugetlbfs_vfsmount[i] = NULL;
1039 }
1040 i++;
1041 }
1042 /* Non default hstates are optional */
1043 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1044 return 0;
1040 1045
1041 out: 1046 out:
1042 kmem_cache_destroy(hugetlbfs_inode_cachep); 1047 kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
1047 1052
1048static void __exit exit_hugetlbfs_fs(void) 1053static void __exit exit_hugetlbfs_fs(void)
1049{ 1054{
1055 struct hstate *h;
1056 int i;
1057
1058
1050 /* 1059 /*
1051 * Make sure all delayed rcu free inodes are flushed before we 1060 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache. 1061 * destroy cache.
1053 */ 1062 */
1054 rcu_barrier(); 1063 rcu_barrier();
1055 kmem_cache_destroy(hugetlbfs_inode_cachep); 1064 kmem_cache_destroy(hugetlbfs_inode_cachep);
1056 kern_unmount(hugetlbfs_vfsmount); 1065 i = 0;
1066 for_each_hstate(h)
1067 kern_unmount(hugetlbfs_vfsmount[i++]);
1057 unregister_filesystem(&hugetlbfs_fs_type); 1068 unregister_filesystem(&hugetlbfs_fs_type);
1058 bdi_destroy(&hugetlbfs_backing_dev_info); 1069 bdi_destroy(&hugetlbfs_backing_dev_info);
1059} 1070}
diff --git a/fs/inode.c b/fs/inode.c
index 64999f144153..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->host = inode; 165 mapping->host = inode;
166 mapping->flags = 0; 166 mapping->flags = 0;
167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
168 mapping->assoc_mapping = NULL; 168 mapping->private_data = NULL;
169 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
170 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
171 171
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
431 mapping->host = inode; 431 mapping->host = inode;
432 mapping->flags = 0; 432 mapping->flags = 0;
433 mapping_set_gfp_mask(mapping, GFP_NOFS); 433 mapping_set_gfp_mask(mapping, GFP_NOFS);
434 mapping->assoc_mapping = NULL; 434 mapping->private_data = NULL;
435 mapping->backing_dev_info = bdi; 435 mapping->backing_dev_info = bdi;
436 mapping->a_ops = &empty_aops; 436 mapping->a_ops = &empty_aops;
437} 437}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..dda089804942 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2513 ret = sd.num_spliced; 2513 ret = sd.num_spliced;
2514 2514
2515 if (ret > 0) { 2515 if (ret > 0) {
2516 unsigned long nr_pages;
2517 int err; 2516 int err;
2518 2517
2519 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2520
2521 err = generic_write_sync(out, *ppos, ret); 2518 err = generic_write_sync(out, *ppos, ret);
2522 if (err) 2519 if (err)
2523 ret = err; 2520 ret = err;
2524 else 2521 else
2525 *ppos += ret; 2522 *ppos += ret;
2526 2523
2527 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2524 balance_dirty_pages_ratelimited(mapping);
2528 } 2525 }
2529 2526
2530 return ret; 2527 return ret;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e28356a959a..aa63d25157b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
985{ 985{
986 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 986 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
987 char buffer[PROC_NUMBUF]; 987 char buffer[PROC_NUMBUF];
988 int oom_score_adj = OOM_SCORE_ADJ_MIN; 988 short oom_score_adj = OOM_SCORE_ADJ_MIN;
989 unsigned long flags; 989 unsigned long flags;
990 size_t len; 990 size_t len;
991 991
@@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
996 unlock_task_sighand(task, &flags); 996 unlock_task_sighand(task, &flags);
997 } 997 }
998 put_task_struct(task); 998 put_task_struct(task);
999 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); 999 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1000 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1000 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1001} 1001}
1002 1002
@@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1043 goto err_task_lock; 1043 goto err_task_lock;
1044 } 1044 }
1045 1045
1046 if (oom_score_adj < task->signal->oom_score_adj_min && 1046 if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
1047 !capable(CAP_SYS_RESOURCE)) { 1047 !capable(CAP_SYS_RESOURCE)) {
1048 err = -EACCES; 1048 err = -EACCES;
1049 goto err_sighand; 1049 goto err_sighand;
1050 } 1050 }
1051 1051
1052 task->signal->oom_score_adj = oom_score_adj; 1052 task->signal->oom_score_adj = (short)oom_score_adj;
1053 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1053 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1054 task->signal->oom_score_adj_min = oom_score_adj; 1054 task->signal->oom_score_adj_min = (short)oom_score_adj;
1055 trace_oom_score_adj_update(task); 1055 trace_oom_score_adj_update(task);
1056 1056
1057err_sighand: 1057err_sighand:
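
The valid range itself does not change: oom_score_adj still spans OOM_SCORE_ADJ_MIN..OOM_SCORE_ADJ_MAX (-1000..1000), which is why a 16-bit short is wide enough and only the in-kernel storage type shrinks. A minimal userspace sketch, unaffected by the type change, of setting the value through procfs:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/oom_score_adj", "w");

            if (!f)
                    return 1;
            /* any value in [-1000, 1000]; the kernel now stores it as a short */
            fprintf(f, "%d\n", 500);
            return fclose(f) ? 1 : 0;
    }
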
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..8890604e3fcd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1024 ret = sd.num_spliced; 1024 ret = sd.num_spliced;
1025 1025
1026 if (ret > 0) { 1026 if (ret > 0) {
1027 unsigned long nr_pages;
1028 int err; 1027 int err;
1029 1028
1030 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1031
1032 err = generic_write_sync(out, *ppos, ret); 1029 err = generic_write_sync(out, *ppos, ret);
1033 if (err) 1030 if (err)
1034 ret = err; 1031 ret = err;
1035 else 1032 else
1036 *ppos += ret; 1033 *ppos += ret;
1037 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1034 balance_dirty_pages_ratelimited(mapping);
1038 } 1035 }
1039 sb_end_write(inode->i_sb); 1036 sb_end_write(inode->i_sb);
1040 1037
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
new file mode 100644
index 000000000000..f7f1d7169b11
--- /dev/null
+++ b/include/linux/balloon_compaction.h
@@ -0,0 +1,272 @@
1/*
2 * include/linux/balloon_compaction.h
3 *
4 * Common interface definitions for making balloon pages movable by compaction.
5 *
6 * Although it is perfectly possible to migrate ballooned pages, they are a
7 * special corner case for compaction scans because balloon pages are not
8 * enlisted on any LRU list like the other pages we do compact / migrate.
9 *
10 * As the page isolation scan a compaction thread performs is a lockless
11 * procedure (from a page standpoint), it can race while performing balloon
12 * page compaction. In order to sort out these racy scenarios and safely
13 * perform balloon page compaction and migration we must always follow
14 * these three simple rules:
15 *
16 * i. when updating a balloon's page ->mapping element, strictly do it under
17 * the following lock order, independently of the far superior
18 * locking scheme (lru_lock, balloon_lock):
19 * +-page_lock(page);
20 * +--spin_lock_irq(&b_dev_info->pages_lock);
21 * ... page->mapping updates here ...
22 *
23 * ii. before isolating or dequeueing a balloon page from the balloon device
24 * pages list, the page reference counter must be raised by one and the
25 * extra refcount must be dropped when the page is enqueued back into
26 * the balloon device page list, thus a balloon page keeps its reference
27 * counter raised only while it is under our special handling;
28 *
29 * iii. after the lockless scan step has selected a potential balloon page for
30 * isolation, re-test the page->mapping flags and the page ref counter
31 * under the proper page lock, to ensure isolating a valid balloon page
32 * (not yet isolated, nor under release procedure)
33 *
34 * The functions provided by this interface are meant to help cope with
35 * the aforementioned balloon page corner case, as well as to ensure the simple
36 * set of exposed rules is satisfied while we are dealing with balloon page
37 * compaction / migration.
38 *
39 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
40 */
41#ifndef _LINUX_BALLOON_COMPACTION_H
42#define _LINUX_BALLOON_COMPACTION_H
43#include <linux/pagemap.h>
44#include <linux/page-flags.h>
45#include <linux/migrate.h>
46#include <linux/gfp.h>
47#include <linux/err.h>
48
49/*
50 * Balloon device information descriptor.
51 * This struct is used to allow the common balloon compaction interface
52 * procedures to find the proper balloon device holding the memory pages they
53 * have to handle for page compaction / migration, and it also serves the
54 * balloon driver as a page book-keeper for its registered balloon devices.
55 */
56struct balloon_dev_info {
57 void *balloon_device; /* balloon device descriptor */
58 struct address_space *mapping; /* balloon special page->mapping */
59 unsigned long isolated_pages; /* # of isolated pages for migration */
60 spinlock_t pages_lock; /* Protection to pages list */
61 struct list_head pages; /* Pages enqueued & handled to Host */
62};
63
64extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
65extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
66extern struct balloon_dev_info *balloon_devinfo_alloc(
67 void *balloon_dev_descriptor);
68
69static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info)
70{
71 kfree(b_dev_info);
72}
73
74/*
75 * balloon_page_free - release a balloon page back to the page free lists
76 * @page: ballooned page to be set free
77 *
78 * This function must be used to properly free an isolated/dequeued balloon
79 * page at the end of a successful page migration, or at the balloon driver's
80 * page release procedure.
81 */
82static inline void balloon_page_free(struct page *page)
83{
84 /*
85 * Balloon pages always get an extra refcount before being isolated
86 * and before being dequeued, to help sort out fortuitous collisions
87 * between a thread attempting to isolate and another thread attempting
88 * to release the very same balloon page.
89 *
90 * Before we hand the page back to Buddy, let's drop its extra refcnt.
91 */
92 put_page(page);
93 __free_page(page);
94}
95
96#ifdef CONFIG_BALLOON_COMPACTION
97extern bool balloon_page_isolate(struct page *page);
98extern void balloon_page_putback(struct page *page);
99extern int balloon_page_migrate(struct page *newpage,
100 struct page *page, enum migrate_mode mode);
101extern struct address_space
102*balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
103 const struct address_space_operations *a_ops);
104
105static inline void balloon_mapping_free(struct address_space *balloon_mapping)
106{
107 kfree(balloon_mapping);
108}
109
110/*
111 * page_flags_cleared - helper to perform balloon @page ->flags tests.
112 *
113 * As balloon pages are obtained from buddy and we do not play with page->flags
114 * at driver level (exception made when we get the page lock for compaction),
115 * we can safely identify a ballooned page by checking if the
116 * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also
117 * helps us skip ballooned pages that are locked for compaction or release, thus
118 * mitigating their racy check at balloon_page_movable()
119 */
120static inline bool page_flags_cleared(struct page *page)
121{
122 return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
123}
124
125/*
126 * __is_movable_balloon_page - helper to perform @page mapping->flags tests
127 */
128static inline bool __is_movable_balloon_page(struct page *page)
129{
130 struct address_space *mapping = page->mapping;
131 return mapping_balloon(mapping);
132}
133
134/*
135 * balloon_page_movable - test page->mapping->flags to identify balloon pages
136 * that can be moved by compaction/migration.
137 *
138 * This function is used at core compaction's page isolation scheme, therefore
139 * most pages exposed to it are not enlisted as balloon pages and so, to avoid
140 * undesired side effects like racing against __free_pages(), we cannot afford
141 * holding the page locked while testing page->mapping->flags here.
142 *
143 * As we might return false positives in the case of a balloon page being just
144 * released under us, the page->mapping->flags need to be re-tested later,
145 * under the proper page lock, at the functions that will be coping with the
146 * balloon page case.
147 */
148static inline bool balloon_page_movable(struct page *page)
149{
150 /*
151 * Before dereferencing and testing mapping->flags, let's make sure
152 * this is not a page that uses ->mapping in a different way
153 */
154 if (page_flags_cleared(page) && !page_mapped(page) &&
155 page_count(page) == 1)
156 return __is_movable_balloon_page(page);
157
158 return false;
159}
160
161/*
162 * balloon_page_insert - insert a page into the balloon's page list and make
163 * the page->mapping assignment accordingly.
164 * @page : page to be assigned as a 'balloon page'
165 * @mapping : allocated special 'balloon_mapping'
166 * @head : balloon's device page list head
167 *
168 * Caller must ensure the page is locked and the spin_lock protecting balloon
169 * pages list is held before inserting a page into the balloon device.
170 */
171static inline void balloon_page_insert(struct page *page,
172 struct address_space *mapping,
173 struct list_head *head)
174{
175 page->mapping = mapping;
176 list_add(&page->lru, head);
177}
178
179/*
180 * balloon_page_delete - delete a page from balloon's page list and clear
181 * the page->mapping assignment accordingly.
182 * @page : page to be released from balloon's page list
183 *
184 * Caller must ensure the page is locked and the spin_lock protecting balloon
185 * pages list is held before deleting a page from the balloon device.
186 */
187static inline void balloon_page_delete(struct page *page)
188{
189 page->mapping = NULL;
190 list_del(&page->lru);
191}
192
193/*
194 * balloon_page_device - get the b_dev_info descriptor for the balloon device
195 * that enqueues the given page.
196 */
197static inline struct balloon_dev_info *balloon_page_device(struct page *page)
198{
199 struct address_space *mapping = page->mapping;
200 if (likely(mapping))
201 return mapping->private_data;
202
203 return NULL;
204}
205
206static inline gfp_t balloon_mapping_gfp_mask(void)
207{
208 return GFP_HIGHUSER_MOVABLE;
209}
210
211static inline bool balloon_compaction_check(void)
212{
213 return true;
214}
215
216#else /* !CONFIG_BALLOON_COMPACTION */
217
218static inline void *balloon_mapping_alloc(void *balloon_device,
219 const struct address_space_operations *a_ops)
220{
221 return ERR_PTR(-EOPNOTSUPP);
222}
223
224static inline void balloon_mapping_free(struct address_space *balloon_mapping)
225{
226 return;
227}
228
229static inline void balloon_page_insert(struct page *page,
230 struct address_space *mapping,
231 struct list_head *head)
232{
233 list_add(&page->lru, head);
234}
235
236static inline void balloon_page_delete(struct page *page)
237{
238 list_del(&page->lru);
239}
240
241static inline bool balloon_page_movable(struct page *page)
242{
243 return false;
244}
245
246static inline bool balloon_page_isolate(struct page *page)
247{
248 return false;
249}
250
251static inline void balloon_page_putback(struct page *page)
252{
253 return;
254}
255
256static inline int balloon_page_migrate(struct page *newpage,
257 struct page *page, enum migrate_mode mode)
258{
259 return 0;
260}
261
262static inline gfp_t balloon_mapping_gfp_mask(void)
263{
264 return GFP_HIGHUSER;
265}
266
267static inline bool balloon_compaction_check(void)
268{
269 return false;
270}
271#endif /* CONFIG_BALLOON_COMPACTION */
272#endif /* _LINUX_BALLOON_COMPACTION_H */
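
Taken together, a balloon driver is expected to wire this interface up roughly as in the sketch below. This is an illustrative outline under assumed names (example_*), not the virtio_balloon implementation; a real driver notifies the host and copies state where the comments say so.

    #include <linux/err.h>
    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/balloon_compaction.h>

    /* driver callback installed via a_ops->migratepage (signature as in fs.h) */
    static int example_migratepage(struct address_space *mapping,
                                   struct page *newpage, struct page *page,
                                   enum migrate_mode mode)
    {
            /* driver-specific: tell the host about newpage, move state over, ... */
            return MIGRATEPAGE_BALLOON_SUCCESS;
    }

    static const struct address_space_operations example_balloon_aops = {
            .migratepage = example_migratepage,
    };

    static struct balloon_dev_info *example_devinfo;

    static int example_balloon_init(void *dev)
    {
            struct address_space *mapping;

            example_devinfo = balloon_devinfo_alloc(dev);
            if (IS_ERR(example_devinfo))
                    return PTR_ERR(example_devinfo);

            mapping = balloon_mapping_alloc(example_devinfo, &example_balloon_aops);
            if (IS_ERR(mapping)) {
                    balloon_devinfo_free(example_devinfo);
                    return PTR_ERR(mapping);
            }
            return 0;
    }

    /* inflate: take one page away from the guest */
    static int example_inflate_one(void)
    {
            return balloon_page_enqueue(example_devinfo) ? 0 : -ENOMEM;
    }

    /* deflate: hand one page back to the guest */
    static void example_deflate_one(void)
    {
            struct page *page = balloon_page_dequeue(example_devinfo);

            if (page)
                    balloon_page_free(page);
    }
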
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 6d6795d46a75..7b74452c5317 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -51,8 +51,8 @@ extern unsigned long free_all_bootmem(void);
51extern void free_bootmem_node(pg_data_t *pgdat, 51extern void free_bootmem_node(pg_data_t *pgdat,
52 unsigned long addr, 52 unsigned long addr,
53 unsigned long size); 53 unsigned long size);
54extern void free_bootmem(unsigned long addr, unsigned long size); 54extern void free_bootmem(unsigned long physaddr, unsigned long size);
55extern void free_bootmem_late(unsigned long addr, unsigned long size); 55extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
56 56
57/* 57/*
58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, 58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75fe9a134803..408fb1e77a0a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -418,7 +418,7 @@ struct address_space {
418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */ 418 struct backing_dev_info *backing_dev_info; /* device readahead, etc */
419 spinlock_t private_lock; /* for use by the address_space */ 419 spinlock_t private_lock; /* for use by the address_space */
420 struct list_head private_list; /* ditto */ 420 struct list_head private_list; /* ditto */
421 struct address_space *assoc_mapping; /* ditto */ 421 void *private_data; /* ditto */
422} __attribute__((aligned(sizeof(long)))); 422} __attribute__((aligned(sizeof(long))));
423 /* 423 /*
424 * On most architectures that alignment is already the case; but 424 * On most architectures that alignment is already the case; but
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d0a79678f169..31e8041274f6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -266,7 +266,7 @@ static inline enum zone_type gfp_zone(gfp_t flags)
266 266
267static inline int gfp_zonelist(gfp_t flags) 267static inline int gfp_zonelist(gfp_t flags)
268{ 268{
269 if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE)) 269 if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
270 return 1; 270 return 1;
271 271
272 return 0; 272 return 0;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b31cb7da0346..1af477552459 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -8,6 +8,10 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
10 struct vm_area_struct *vma); 10 struct vm_area_struct *vma);
11extern void huge_pmd_set_accessed(struct mm_struct *mm,
12 struct vm_area_struct *vma,
13 unsigned long address, pmd_t *pmd,
14 pmd_t orig_pmd, int dirty);
11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 15extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
12 unsigned long address, pmd_t *pmd, 16 unsigned long address, pmd_t *pmd,
13 pmd_t orig_pmd); 17 pmd_t orig_pmd);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 225164842ab6..3e7fa1acf09c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -183,7 +183,8 @@ extern const struct file_operations hugetlbfs_file_operations;
183extern const struct vm_operations_struct hugetlb_vm_ops; 183extern const struct vm_operations_struct hugetlb_vm_ops;
184struct file *hugetlb_file_setup(const char *name, unsigned long addr, 184struct file *hugetlb_file_setup(const char *name, unsigned long addr,
185 size_t size, vm_flags_t acct, 185 size_t size, vm_flags_t acct,
186 struct user_struct **user, int creat_flags); 186 struct user_struct **user, int creat_flags,
187 int page_size_log);
187 188
188static inline int is_file_hugepages(struct file *file) 189static inline int is_file_hugepages(struct file *file)
189{ 190{
@@ -195,12 +196,14 @@ static inline int is_file_hugepages(struct file *file)
195 return 0; 196 return 0;
196} 197}
197 198
199
198#else /* !CONFIG_HUGETLBFS */ 200#else /* !CONFIG_HUGETLBFS */
199 201
200#define is_file_hugepages(file) 0 202#define is_file_hugepages(file) 0
201static inline struct file * 203static inline struct file *
202hugetlb_file_setup(const char *name, unsigned long addr, size_t size, 204hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
203 vm_flags_t acctflag, struct user_struct **user, int creat_flags) 205 vm_flags_t acctflag, struct user_struct **user, int creat_flags,
206 int page_size_log)
204{ 207{
205 return ERR_PTR(-ENOSYS); 208 return ERR_PTR(-ENOSYS);
206} 209}
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7d8dfc7392f1..dd9900cabf89 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -687,20 +687,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
687/* Trap pasters of __FUNCTION__ at compile-time */ 687/* Trap pasters of __FUNCTION__ at compile-time */
688#define __FUNCTION__ (__func__) 688#define __FUNCTION__ (__func__)
689 689
690/* This helps us to avoid #ifdef CONFIG_NUMA */
691#ifdef CONFIG_NUMA
692#define NUMA_BUILD 1
693#else
694#define NUMA_BUILD 0
695#endif
696
697/* This helps us avoid #ifdef CONFIG_COMPACTION */
698#ifdef CONFIG_COMPACTION
699#define COMPACTION_BUILD 1
700#else
701#define COMPACTION_BUILD 0
702#endif
703
704/* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */ 690/* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */
705#ifdef CONFIG_SYMBOL_PREFIX 691#ifdef CONFIG_SYMBOL_PREFIX
706#define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX 692#define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX
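
With the NUMA_BUILD and COMPACTION_BUILD wrappers gone, call sites test the config symbol with IS_ENABLED() directly, as the gfp.h hunk above already shows; the condition is still a compile-time constant, so dead branches are discarded exactly as before. A hedged sketch of the same conversion for a former COMPACTION_BUILD user (example_should_try_compaction() is a made-up helper):

    #include <linux/types.h>
    #include <linux/kconfig.h>
    #include <linux/mmzone.h>

    /* the IS_ENABLED() idiom that replaces the removed COMPACTION_BUILD macro */
    static inline bool example_should_try_compaction(unsigned int order)
    {
            return IS_ENABLED(CONFIG_COMPACTION) && order > PAGE_ALLOC_COSTLY_ORDER;
    }
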
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ff9a9f8e0ed9..a09216d0dcc7 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
53struct memory_notify { 53struct memory_notify {
54 unsigned long start_pfn; 54 unsigned long start_pfn;
55 unsigned long nr_pages; 55 unsigned long nr_pages;
56 int status_change_nid_normal;
56 int status_change_nid; 57 int status_change_nid;
57}; 58};
58 59
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 95573ec4ee6c..4a45c4e50025 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -26,6 +26,13 @@ enum {
26 MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, 26 MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
27}; 27};
28 28
29/* Types to control the zone type of onlined memory */
30enum {
31 ONLINE_KEEP,
32 ONLINE_KERNEL,
33 ONLINE_MOVABLE,
34};
35
29/* 36/*
30 * pgdat resizing functions 37 * pgdat resizing functions
31 */ 38 */
@@ -46,6 +53,10 @@ void pgdat_resize_init(struct pglist_data *pgdat)
46} 53}
47/* 54/*
48 * Zone resizing functions 55 * Zone resizing functions
56 *
57 * Note: any attempt to resize a zone should have pgdat_resize_lock() and
58 * zone_span_writelock() both held. This ensures the size of a zone
59 * can't be changed while pgdat_resize_lock() is held.
49 */ 60 */
50static inline unsigned zone_span_seqbegin(struct zone *zone) 61static inline unsigned zone_span_seqbegin(struct zone *zone)
51{ 62{
@@ -71,7 +82,7 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
71extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); 82extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
72extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); 83extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
73/* VM interface that may be used by firmware interface */ 84/* VM interface that may be used by firmware interface */
74extern int online_pages(unsigned long, unsigned long); 85extern int online_pages(unsigned long, unsigned long, int);
75extern void __offline_isolated_pages(unsigned long, unsigned long); 86extern void __offline_isolated_pages(unsigned long, unsigned long);
76 87
77typedef void (*online_page_callback_t)(struct page *page); 88typedef void (*online_page_callback_t)(struct page *page);
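
The new int argument is one of the ONLINE_* values above; the sysfs "state" handler in drivers/base/memory.c (also touched by this series) is the expected source of these values, mapping "online_kernel" / "online_movable" writes onto them. A minimal sketch of a caller, with example_online_block() as a hypothetical wrapper:

    #include <linux/memory_hotplug.h>

    /* online a block into ZONE_MOVABLE so it can be offlined again later */
    static int example_online_block(unsigned long start_pfn, unsigned long nr_pages)
    {
            return online_pages(start_pfn, nr_pages, ONLINE_MOVABLE);
    }
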
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ce7e6671968b..0b5865c61efd 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,9 +7,27 @@
7 7
8typedef struct page *new_page_t(struct page *, unsigned long private, int **); 8typedef struct page *new_page_t(struct page *, unsigned long private, int **);
9 9
10/*
11 * Return values from address_space_operations.migratepage():
12 * - negative errno on page migration failure;
13 * - zero on page migration success;
14 *
15 * The balloon page migration introduces this special case where a 'distinct'
16 * return code is used to flag a successful page migration to unmap_and_move().
17 * This approach is necessary because page migration can race against balloon
18 * deflation procedure, and for such case we could introduce a nasty page leak
19 * if a successfully migrated balloon page gets released concurrently with
20 * migration's unmap_and_move() wrap-up steps.
21 */
22#define MIGRATEPAGE_SUCCESS 0
23#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page
24 * successful migration case.
25 */
26
10#ifdef CONFIG_MIGRATION 27#ifdef CONFIG_MIGRATION
11 28
12extern void putback_lru_pages(struct list_head *l); 29extern void putback_lru_pages(struct list_head *l);
30extern void putback_movable_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 31extern int migrate_page(struct address_space *,
14 struct page *, struct page *, enum migrate_mode); 32 struct page *, struct page *, enum migrate_mode);
15extern int migrate_pages(struct list_head *l, new_page_t x, 33extern int migrate_pages(struct list_head *l, new_page_t x,
@@ -33,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
33#else 51#else
34 52
35static inline void putback_lru_pages(struct list_head *l) {} 53static inline void putback_lru_pages(struct list_head *l) {}
54static inline void putback_movable_pages(struct list_head *l) {}
36static inline int migrate_pages(struct list_head *l, new_page_t x, 55static inline int migrate_pages(struct list_head *l, new_page_t x,
37 unsigned long private, bool offlining, 56 unsigned long private, bool offlining,
38 enum migrate_mode mode) { return -ENOSYS; } 57 enum migrate_mode mode) { return -ENOSYS; }
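
In practice the distinction only matters to the migration core: a negative value means failure, MIGRATEPAGE_SUCCESS means a normal migration, and MIGRATEPAGE_BALLOON_SUCCESS tells unmap_and_move() that the balloon machinery will release the old page itself. A hedged sketch of the caller-side interpretation (not the actual unmap_and_move() code):

    #include <linux/migrate.h>

    static void example_handle_migratepage_rc(int rc)
    {
            if (rc < 0) {
                    /* negative errno: migration failed, the old page stays put */
            } else if (rc == MIGRATEPAGE_BALLOON_SUCCESS) {
                    /* balloon page migrated; the balloon code frees the old page,
                     * so the usual putback / free path is skipped */
            } else {
                    /* MIGRATEPAGE_SUCCESS: ordinary successful migration */
            }
    }
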
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e6fe91..4af4f0b1be4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1456,6 +1456,37 @@ extern unsigned long vm_mmap(struct file *, unsigned long,
1456 unsigned long, unsigned long, 1456 unsigned long, unsigned long,
1457 unsigned long, unsigned long); 1457 unsigned long, unsigned long);
1458 1458
1459struct vm_unmapped_area_info {
1460#define VM_UNMAPPED_AREA_TOPDOWN 1
1461 unsigned long flags;
1462 unsigned long length;
1463 unsigned long low_limit;
1464 unsigned long high_limit;
1465 unsigned long align_mask;
1466 unsigned long align_offset;
1467};
1468
1469extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
1470extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
1471
1472/*
1473 * Search for an unmapped address range.
1474 *
1475 * We are looking for a range that:
1476 * - does not intersect with any VMA;
1477 * - is contained within the [low_limit, high_limit) interval;
1478 * - is at least the desired size.
1479 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
1480 */
1481static inline unsigned long
1482vm_unmapped_area(struct vm_unmapped_area_info *info)
1483{
1484 if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))
1485 return unmapped_area(info);
1486 else
1487 return unmapped_area_topdown(info);
1488}
1489
1459/* truncate.c */ 1490/* truncate.c */
1460extern void truncate_inode_pages(struct address_space *, loff_t); 1491extern void truncate_inode_pages(struct address_space *, loff_t);
1461extern void truncate_inode_pages_range(struct address_space *, 1492extern void truncate_inode_pages_range(struct address_space *,
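
An architecture's get_unmapped_area() implementation now fills in the descriptor and calls the helper instead of walking the VMA list by hand. A minimal bottom-up sketch; the limits and the zero alignment are illustrative, not any particular architecture's policy:

    #include <linux/mm.h>
    #include <linux/sched.h>

    static unsigned long example_get_unmapped_area(unsigned long len)
    {
            struct vm_unmapped_area_info info;

            info.flags = 0;                      /* 0 = bottom-up search */
            info.length = len;
            info.low_limit = TASK_UNMAPPED_BASE; /* arch-specific in practice */
            info.high_limit = TASK_SIZE;
            info.align_mask = 0;                 /* no cache-colouring constraint */
            info.align_offset = 0;
            return vm_unmapped_area(&info);
    }
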
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..7ade2731b5d6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -224,7 +224,8 @@ struct vm_region {
224 * library, the executable area etc). 224 * library, the executable area etc).
225 */ 225 */
226struct vm_area_struct { 226struct vm_area_struct {
227 struct mm_struct * vm_mm; /* The address space we belong to. */ 227 /* The first cache line has the info for VMA tree walking. */
228
228 unsigned long vm_start; /* Our start address within vm_mm. */ 229 unsigned long vm_start; /* Our start address within vm_mm. */
229 unsigned long vm_end; /* The first byte after our end address 230 unsigned long vm_end; /* The first byte after our end address
230 within vm_mm. */ 231 within vm_mm. */
@@ -232,11 +233,22 @@ struct vm_area_struct {
232 /* linked list of VM areas per task, sorted by address */ 233 /* linked list of VM areas per task, sorted by address */
233 struct vm_area_struct *vm_next, *vm_prev; 234 struct vm_area_struct *vm_next, *vm_prev;
234 235
236 struct rb_node vm_rb;
237
238 /*
239 * Largest free memory gap in bytes to the left of this VMA.
240 * Either between this VMA and vma->vm_prev, or between one of the
241 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
242 * get_unmapped_area find a free area of the right size.
243 */
244 unsigned long rb_subtree_gap;
245
246 /* Second cache line starts here. */
247
248 struct mm_struct *vm_mm; /* The address space we belong to. */
235 pgprot_t vm_page_prot; /* Access permissions of this VMA. */ 249 pgprot_t vm_page_prot; /* Access permissions of this VMA. */
236 unsigned long vm_flags; /* Flags, see mm.h. */ 250 unsigned long vm_flags; /* Flags, see mm.h. */
237 251
238 struct rb_node vm_rb;
239
240 /* 252 /*
241 * For areas with an address space and backing store, 253 * For areas with an address space and backing store,
242 * linkage into the address_space->i_mmap interval tree, or 254 * linkage into the address_space->i_mmap interval tree, or
@@ -322,6 +334,7 @@ struct mm_struct {
322 unsigned long task_size; /* size of task vm space */ 334 unsigned long task_size; /* size of task vm space */
323 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ 335 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
324 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ 336 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
337 unsigned long highest_vm_end; /* highest vma end address */
325 pgd_t * pgd; 338 pgd_t * pgd;
326 atomic_t mm_users; /* How many users with user space? */ 339 atomic_t mm_users; /* How many users with user space? */
327 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ 340 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
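
rb_subtree_gap caches, for every node of the VMA rbtree, the largest free gap anywhere below that node, so get_unmapped_area() can skip whole subtrees that cannot possibly hold a hole of the requested size. A hedged sketch of the per-node value the augmented rbtree maintains (simplified from what mm/mmap.c does in this series):

    #include <linux/mm_types.h>
    #include <linux/rbtree.h>

    static unsigned long example_compute_subtree_gap(struct vm_area_struct *vma)
    {
            /* gap between this VMA and its predecessor in the address space */
            unsigned long max = vma->vm_start;
            unsigned long subtree_gap;

            if (vma->vm_prev)
                    max -= vma->vm_prev->vm_end;

            /* fold in the cached maxima of both children, if any */
            if (vma->vm_rb.rb_left) {
                    subtree_gap = rb_entry(vma->vm_rb.rb_left,
                                    struct vm_area_struct, vm_rb)->rb_subtree_gap;
                    if (subtree_gap > max)
                            max = subtree_gap;
            }
            if (vma->vm_rb.rb_right) {
                    subtree_gap = rb_entry(vma->vm_rb.rb_right,
                                    struct vm_area_struct, vm_rb)->rb_subtree_gap;
                    if (subtree_gap > max)
                            max = subtree_gap;
            }
            return max;
    }
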
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a23923ba8263..0c0b1d608a69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,10 +63,8 @@ enum {
63 63
64#ifdef CONFIG_CMA 64#ifdef CONFIG_CMA
65# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) 65# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
66# define cma_wmark_pages(zone) zone->min_cma_pages
67#else 66#else
68# define is_migrate_cma(migratetype) false 67# define is_migrate_cma(migratetype) false
69# define cma_wmark_pages(zone) 0
70#endif 68#endif
71 69
72#define for_each_migratetype_order(order, type) \ 70#define for_each_migratetype_order(order, type) \
@@ -383,13 +381,6 @@ struct zone {
383 /* see spanned/present_pages for more description */ 381 /* see spanned/present_pages for more description */
384 seqlock_t span_seqlock; 382 seqlock_t span_seqlock;
385#endif 383#endif
386#ifdef CONFIG_CMA
387 /*
388 * CMA needs to increase watermark levels during the allocation
389 * process to make sure that the system is not starved.
390 */
391 unsigned long min_cma_pages;
392#endif
393 struct free_area free_area[MAX_ORDER]; 384 struct free_area free_area[MAX_ORDER];
394 385
395#ifndef CONFIG_SPARSEMEM 386#ifndef CONFIG_SPARSEMEM
diff --git a/include/linux/node.h b/include/linux/node.h
index 624e53cecc02..2115ad5d6f19 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -27,10 +27,9 @@ struct node {
27}; 27};
28 28
29struct memory_block; 29struct memory_block;
30extern struct node node_devices[]; 30extern struct node *node_devices[];
31typedef void (*node_registration_func_t)(struct node *); 31typedef void (*node_registration_func_t)(struct node *);
32 32
33extern int register_node(struct node *, int, struct node *);
34extern void unregister_node(struct node *node); 33extern void unregister_node(struct node *node);
35#ifdef CONFIG_NUMA 34#ifdef CONFIG_NUMA
36extern int register_one_node(int nid); 35extern int register_one_node(int nid);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index fb9826847b89..da60007075b5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -29,8 +29,23 @@ enum oom_scan_t {
29 OOM_SCAN_SELECT, /* always select this thread first */ 29 OOM_SCAN_SELECT, /* always select this thread first */
30}; 30};
31 31
32extern void compare_swap_oom_score_adj(int old_val, int new_val); 32/* Thread is the potential origin of an oom condition; kill first on oom */
33extern int test_set_oom_score_adj(int new_val); 33#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
34
35static inline void set_current_oom_origin(void)
36{
37 current->signal->oom_flags |= OOM_FLAG_ORIGIN;
38}
39
40static inline void clear_current_oom_origin(void)
41{
42 current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
43}
44
45static inline bool oom_task_origin(const struct task_struct *p)
46{
47 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
48}
34 49
35extern unsigned long oom_badness(struct task_struct *p, 50extern unsigned long oom_badness(struct task_struct *p,
36 struct mem_cgroup *memcg, const nodemask_t *nodemask, 51 struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -49,8 +64,6 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
49extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, 64extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
50 unsigned long totalpages, const nodemask_t *nodemask, 65 unsigned long totalpages, const nodemask_t *nodemask,
51 bool force_kill); 66 bool force_kill);
52extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
53 int order);
54 67
55extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 68extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
56 int order, nodemask_t *mask, bool force_kill); 69 int order, nodemask_t *mask, bool force_kill);
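
The flag replaces the old trick of temporarily forcing oom_score_adj to its maximum, so there is no saved value to restore and no window in which two such callers can clobber each other. A hedged sketch of how a call site (swapoff and KSM are the intended users) brackets its memory-hungry phase; example_drain_memory() is a placeholder:

    #include <linux/sched.h>
    #include <linux/oom.h>

    /* placeholder for the real memory-hungry work, e.g. try_to_unuse() */
    static int example_drain_memory(void)
    {
            return 0;
    }

    static int example_memory_hungry_phase(void)
    {
            int err;

            set_current_oom_origin();       /* prefer killing this task on OOM */
            err = example_drain_memory();
            clear_current_oom_origin();
            return err;
    }
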
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 76a9539cfd3f..a92061e08d48 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -2,7 +2,8 @@
2#define __LINUX_PAGEISOLATION_H 2#define __LINUX_PAGEISOLATION_H
3 3
4 4
5bool has_unmovable_pages(struct zone *zone, struct page *page, int count); 5bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6 bool skip_hwpoisoned_pages);
6void set_pageblock_migratetype(struct page *page, int migratetype); 7void set_pageblock_migratetype(struct page *page, int migratetype);
7int move_freepages_block(struct zone *zone, struct page *page, 8int move_freepages_block(struct zone *zone, struct page *page,
8 int migratetype); 9 int migratetype);
@@ -21,7 +22,7 @@ int move_freepages(struct zone *zone,
21 */ 22 */
22int 23int
23start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 24start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
24 unsigned migratetype); 25 unsigned migratetype, bool skip_hwpoisoned_pages);
25 26
26/* 27/*
27 * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. 28 * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -34,12 +35,13 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
34/* 35/*
35 * Test all pages in [start_pfn, end_pfn) are isolated or not. 36 * Test all pages in [start_pfn, end_pfn) are isolated or not.
36 */ 37 */
37int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); 38int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
39 bool skip_hwpoisoned_pages);
38 40
39/* 41/*
40 * Internal functions. Changes pageblock's migrate type. 42 * Internal functions. Changes pageblock's migrate type.
41 */ 43 */
42int set_migratetype_isolate(struct page *page); 44int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
43void unset_migratetype_isolate(struct page *page, unsigned migratetype); 45void unset_migratetype_isolate(struct page *page, unsigned migratetype);
44struct page *alloc_migrate_target(struct page *page, unsigned long private, 46struct page *alloc_migrate_target(struct page *page, unsigned long private,
45 int **resultp); 47 int **resultp);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e42c762f0dc7..6da609d14c15 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -24,6 +24,7 @@ enum mapping_flags {
24 AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ 24 AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */
25 AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ 25 AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
26 AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ 26 AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
27 AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */
27}; 28};
28 29
29static inline void mapping_set_error(struct address_space *mapping, int error) 30static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -53,6 +54,21 @@ static inline int mapping_unevictable(struct address_space *mapping)
53 return !!mapping; 54 return !!mapping;
54} 55}
55 56
57static inline void mapping_set_balloon(struct address_space *mapping)
58{
59 set_bit(AS_BALLOON_MAP, &mapping->flags);
60}
61
62static inline void mapping_clear_balloon(struct address_space *mapping)
63{
64 clear_bit(AS_BALLOON_MAP, &mapping->flags);
65}
66
67static inline int mapping_balloon(struct address_space *mapping)
68{
69 return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags);
70}
71
56static inline gfp_t mapping_gfp_mask(struct address_space * mapping) 72static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
57{ 73{
58 return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; 74 return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..3e387df065fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -631,9 +631,10 @@ struct signal_struct {
631 struct rw_semaphore group_rwsem; 631 struct rw_semaphore group_rwsem;
632#endif 632#endif
633 633
634 int oom_score_adj; /* OOM kill score adjustment */ 634 oom_flags_t oom_flags;
635 int oom_score_adj_min; /* OOM kill score adjustment minimum value. 635 short oom_score_adj; /* OOM kill score adjustment */
636 * Only settable by CAP_SYS_RESOURCE. */ 636 short oom_score_adj_min; /* OOM kill score adjustment min value.
637 * Only settable by CAP_SYS_RESOURCE. */
637 638
638 struct mutex cred_guard_mutex; /* guard against foreign influences on 639 struct mutex cred_guard_mutex; /* guard against foreign influences on
639 * credential calculations 640 * credential calculations
diff --git a/include/linux/shm.h b/include/linux/shm.h
index bcf8a6a3ec00..429c1995d756 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -29,6 +29,21 @@ struct shmid_kernel /* private to the kernel */
29#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ 29#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
30#define SHM_NORESERVE 010000 /* don't check for reservations */ 30#define SHM_NORESERVE 010000 /* don't check for reservations */
31 31
32/* Bits [26:31] are reserved */
33
34/*
35 * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
36 * This gives us 6 bits, which is enough until someone invents 128 bit address
37 * spaces.
38 *
39 * Assume these are all powers of two.
40 * When 0 use the default page size.
41 */
42#define SHM_HUGE_SHIFT 26
43#define SHM_HUGE_MASK 0x3f
44#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
45#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
46
32#ifdef CONFIG_SYSVIPC 47#ifdef CONFIG_SYSVIPC
33long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, 48long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
34 unsigned long shmlba); 49 unsigned long shmlba);
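
From userspace the encoding is simply the base-2 log of the desired page size shifted into bits [26:31] of shmflg. A minimal sketch requesting a 2 MB huge page segment; the local #defines mirror the values added above, in case the libc headers do not carry them yet:

    #include <sys/ipc.h>
    #include <sys/shm.h>

    #ifndef SHM_HUGETLB
    #define SHM_HUGETLB    04000
    #endif
    #define SHM_HUGE_SHIFT 26
    #define SHM_HUGE_2MB   (21 << SHM_HUGE_SHIFT)  /* log2(2 MB) = 21 */

    int main(void)
    {
            /* 4 MB segment backed by 2 MB huge pages; shmget() fails if that
             * page size is not configured on the running system */
            int id = shmget(IPC_PRIVATE, 4UL << 20,
                            IPC_CREAT | SHM_HUGETLB | SHM_HUGE_2MB | 0600);

            return id < 0 ? 1 : 0;
    }
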
diff --git a/include/linux/types.h b/include/linux/types.h
index 1cc0e4b9a048..4d118ba11349 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
156#endif 156#endif
157typedef unsigned __bitwise__ gfp_t; 157typedef unsigned __bitwise__ gfp_t;
158typedef unsigned __bitwise__ fmode_t; 158typedef unsigned __bitwise__ fmode_t;
159typedef unsigned __bitwise__ oom_flags_t;
159 160
160#ifdef CONFIG_PHYS_ADDR_T_64BIT 161#ifdef CONFIG_PHYS_ADDR_T_64BIT
161typedef u64 phys_addr_t; 162typedef u64 phys_addr_t;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 50c3e8fa06a8..b82a83aba311 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -161,14 +161,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
161 unsigned long start_time); 161 unsigned long start_time);
162 162
163void page_writeback_init(void); 163void page_writeback_init(void);
164void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 164void balance_dirty_pages_ratelimited(struct address_space *mapping);
165 unsigned long nr_pages_dirtied);
166
167static inline void
168balance_dirty_pages_ratelimited(struct address_space *mapping)
169{
170 balance_dirty_pages_ratelimited_nr(mapping, 1);
171}
172 165
173typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, 166typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
174 void *data); 167 void *data);
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index dd4ba3b92002..1e974983757e 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -14,7 +14,7 @@ TRACE_EVENT(oom_score_adj_update,
14 TP_STRUCT__entry( 14 TP_STRUCT__entry(
15 __field( pid_t, pid) 15 __field( pid_t, pid)
16 __array( char, comm, TASK_COMM_LEN ) 16 __array( char, comm, TASK_COMM_LEN )
17 __field( int, oom_score_adj) 17 __field( short, oom_score_adj)
18 ), 18 ),
19 19
20 TP_fast_assign( 20 TP_fast_assign(
@@ -23,7 +23,7 @@ TRACE_EVENT(oom_score_adj_update,
23 __entry->oom_score_adj = task->signal->oom_score_adj; 23 __entry->oom_score_adj = task->signal->oom_score_adj;
24 ), 24 ),
25 25
26 TP_printk("pid=%d comm=%s oom_score_adj=%d", 26 TP_printk("pid=%d comm=%s oom_score_adj=%hd",
27 __entry->pid, __entry->comm, __entry->oom_score_adj) 27 __entry->pid, __entry->comm, __entry->oom_score_adj)
28); 28);
29 29
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index b53add02e929..102a646e1996 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -15,7 +15,7 @@ TRACE_EVENT(task_newtask,
15 __field( pid_t, pid) 15 __field( pid_t, pid)
16 __array( char, comm, TASK_COMM_LEN) 16 __array( char, comm, TASK_COMM_LEN)
17 __field( unsigned long, clone_flags) 17 __field( unsigned long, clone_flags)
18 __field( int, oom_score_adj) 18 __field( short, oom_score_adj)
19 ), 19 ),
20 20
21 TP_fast_assign( 21 TP_fast_assign(
@@ -25,7 +25,7 @@ TRACE_EVENT(task_newtask,
25 __entry->oom_score_adj = task->signal->oom_score_adj; 25 __entry->oom_score_adj = task->signal->oom_score_adj;
26 ), 26 ),
27 27
28 TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", 28 TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
29 __entry->pid, __entry->comm, 29 __entry->pid, __entry->comm,
30 __entry->clone_flags, __entry->oom_score_adj) 30 __entry->clone_flags, __entry->oom_score_adj)
31); 31);
@@ -40,7 +40,7 @@ TRACE_EVENT(task_rename,
40 __field( pid_t, pid) 40 __field( pid_t, pid)
41 __array( char, oldcomm, TASK_COMM_LEN) 41 __array( char, oldcomm, TASK_COMM_LEN)
42 __array( char, newcomm, TASK_COMM_LEN) 42 __array( char, newcomm, TASK_COMM_LEN)
43 __field( int, oom_score_adj) 43 __field( short, oom_score_adj)
44 ), 44 ),
45 45
46 TP_fast_assign( 46 TP_fast_assign(
@@ -50,7 +50,7 @@ TRACE_EVENT(task_rename,
50 __entry->oom_score_adj = task->signal->oom_score_adj; 50 __entry->oom_score_adj = task->signal->oom_score_adj;
51 ), 51 ),
52 52
53 TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", 53 TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd",
54 __entry->pid, __entry->oldcomm, 54 __entry->pid, __entry->oldcomm,
55 __entry->newcomm, __entry->oom_score_adj) 55 __entry->newcomm, __entry->oom_score_adj)
56); 56);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index d030d2c2647a..4164529a94f9 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -55,4 +55,15 @@
55/* compatibility flags */ 55/* compatibility flags */
56#define MAP_FILE 0 56#define MAP_FILE 0
57 57
58/*
59 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
60 * This gives us 6 bits, which is enough until someone invents 128 bit address
61 * spaces.
62 *
63 * Assume these are all powers of two.
64 * When 0 use the default page size.
65 */
66#define MAP_HUGE_SHIFT 26
67#define MAP_HUGE_MASK 0x3f
68
58#endif /* __ASM_GENERIC_MMAN_COMMON_H */ 69#endif /* __ASM_GENERIC_MMAN_COMMON_H */
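
mmap() uses the same bits-[26:31] encoding together with MAP_HUGETLB. A minimal sketch mapping 2 MB huge pages; MAP_HUGE_2MB is derived locally from the encoding, since this patch only defines the shift and the mask:

    #define _GNU_SOURCE
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB    0x40000
    #endif
    #define MAP_HUGE_SHIFT 26
    #define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)  /* log2(2 MB) = 21 */

    int main(void)
    {
            void *p = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                           -1, 0);

            return p == MAP_FAILED ? 1 : 0;
    }
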
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 32c8bd6a196d..e9fe6fd2a074 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,6 +13,8 @@
13#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ 13#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
14#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ 14#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
15 15
16/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
17
16#define MCL_CURRENT 1 /* lock all current mappings */ 18#define MCL_CURRENT 1 /* lock all current mappings */
17#define MCL_FUTURE 2 /* lock all future mappings */ 19#define MCL_FUTURE 2 /* lock all future mappings */
18 20
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9f73c9..4fa6d8fee730 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -495,7 +495,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
495 if (shmflg & SHM_NORESERVE) 495 if (shmflg & SHM_NORESERVE)
496 acctflag = VM_NORESERVE; 496 acctflag = VM_NORESERVE;
497 file = hugetlb_file_setup(name, 0, size, acctflag, 497 file = hugetlb_file_setup(name, 0, size, acctflag,
498 &shp->mlock_user, HUGETLB_SHMFS_INODE); 498 &shp->mlock_user, HUGETLB_SHMFS_INODE,
499 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
499 } else { 500 } else {
500 /* 501 /*
501 * Do not allow no accounting for OVERCOMMIT_NEVER, even 502 * Do not allow no accounting for OVERCOMMIT_NEVER, even
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 402a54ac35cb..d327b87c99b7 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var);
161 */ 161 */
162void __init free_bootmem_cpumask_var(cpumask_var_t mask) 162void __init free_bootmem_cpumask_var(cpumask_var_t mask)
163{ 163{
164 free_bootmem((unsigned long)mask, cpumask_size()); 164 free_bootmem(__pa(mask), cpumask_size());
165} 165}
166#endif 166#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..e6651c5de14f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -188,6 +188,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 188 default "4"
189 189
190# 190#
191# support for memory balloon compaction
192config BALLOON_COMPACTION
193 bool "Allow for balloon memory compaction/migration"
194 def_bool y
195 depends on COMPACTION && VIRTIO_BALLOON
196 help
197 Memory fragmentation introduced by ballooning can significantly reduce
198 the number of 2MB contiguous memory blocks that can be
199 used within a guest, thus imposing performance penalties associated
200 with the reduced number of transparent huge pages that could be used
201 by the guest workload. Allowing compaction & migration of memory
202 pages enlisted as part of memory balloon devices avoids this
203 scenario and helps improve memory defragmentation.
204
205#
191# support for memory compaction 206# support for memory compaction
192config COMPACTION 207config COMPACTION
193 bool "Allow for memory compaction" 208 bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
42 * @b_dev_info: balloon device descriptor where we will insert a new page
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL in the case we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
73 * its address to allow the driver to release the page.
74 * @b_dev_info: balloon device descriptor where we will grab a page from.
75 *
76 * Driver must call it to properly de-allocate a previous enlisted balloon page
77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
79 * NULL in the case we find the balloon's page list temporarily empty because
80 * compaction has isolated its pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
91 * Block others from accessing the 'page' while we get around to
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
118 * list is empty and there are no isolated pages, then something
119 * has gone off track and some balloon pages are lost.
120 * BUG() here, otherwise the balloon driver may get stuck into
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
220 * raise its refcount, preventing __free_pages() from doing its job,
221 * the put_page() at the end of this block will take care of
222 * releasing this page, thus avoiding a nasty leakage.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
 233 * the balloon driver, let's be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
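For orientation, here is a minimal sketch of how a balloon driver might hook into the interface above: allocate the special mapping, point its ->migratepage at a driver callback, and honor the ERR_PTR() convention on failure. The my_* names are invented for illustration, header and locking details are omitted, and the MIGRATEPAGE_BALLOON_SUCCESS return value is assumed from the mm/migrate.c hunks later in this series (the real consumer is the virtio_balloon work elsewhere in this patchbomb).

/* Illustrative sketch only -- not part of this patch. */
static int my_balloon_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	/* copy the ballooned page's state to newpage and tell the host ... */
	return MIGRATEPAGE_BALLOON_SUCCESS;	/* assumed from this series */
}

static const struct address_space_operations my_balloon_aops = {
	.migratepage = my_balloon_migratepage,
};

static int my_balloon_init(struct balloon_dev_info *b_dev_info)
{
	struct address_space *mapping;

	mapping = balloon_mapping_alloc(b_dev_info, &my_balloon_aops);
	if (IS_ERR(mapping))
		return PTR_ERR(mapping);
	/* pages later enlisted on b_dev_info->pages carry this ->mapping */
	return 0;
}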
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..ecc45958ac0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
150 * @addr: starting address of the range 150 * @addr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 377
378/** 378/**
379 * free_bootmem - mark a page range as usable 379 * free_bootmem - mark a page range as usable
380 * @addr: starting address of the range 380 * @addr: starting physical address of the range
381 * @size: size of the range in bytes 381 * @size: size of the range in bytes
382 * 382 *
383 * Partial pages will be considered reserved and left as they are. 383 * Partial pages will be considered reserved and left as they are.
384 * 384 *
385 * The range must be contiguous but may span node boundaries. 385 * The range must be contiguous but may span node boundaries.
386 */ 386 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 387void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 388{
389 unsigned long start, end; 389 unsigned long start, end;
390 390
391 kmemleak_free_part(__va(addr), size); 391 kmemleak_free_part(__va(physaddr), size);
392 392
393 start = PFN_UP(addr); 393 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 394 end = PFN_DOWN(physaddr + size);
395 395
396 mark_bootmem(start, end, 0, 0); 396 mark_bootmem(start, end, 0, 0);
397} 397}
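With the parameter now explicitly named physaddr, a short usage sketch of the rounding implied by the PFN_UP()/PFN_DOWN() pair above; the address and size are hypothetical and chosen only to show that partial pages stay reserved:

/* Sketch: only whole pages inside the range reach the page allocator. */
unsigned long physaddr = 0x100000;		/* hypothetical, page aligned */
unsigned long size = 5 * PAGE_SIZE + 123;

free_bootmem(physaddr, size);	/* frees 5 whole pages; the trailing
				   123 bytes leave their page reserved */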
diff --git a/mm/compaction.c b/mm/compaction.c
index 694eaabaaebd..d24dd2d7bad4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 20#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 566 goto next_pageblock;
566 } 567 }
567 568
568 /* Check may be lockless but that's ok as we recheck later */ 569 /*
569 if (!PageLRU(page)) 570 * Check may be lockless but that's ok as we recheck later.
 571 * It's possible to migrate LRU pages and balloon pages;
 572 * skip any other type of page.
573 */
574 if (!PageLRU(page)) {
575 if (unlikely(balloon_page_movable(page))) {
576 if (locked && balloon_page_isolate(page)) {
577 /* Successfully isolated */
578 cc->finished_update_migrate = true;
579 list_add(&page->lru, migratelist);
580 cc->nr_migratepages++;
581 nr_isolated++;
582 goto check_compact_cluster;
583 }
584 }
570 continue; 585 continue;
586 }
571 587
572 /* 588 /*
573 * PageLRU is set. lru_lock normally excludes isolation 589 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 637 cc->nr_migratepages++;
622 nr_isolated++; 638 nr_isolated++;
623 639
640check_compact_cluster:
624 /* Avoid isolating too much */ 641 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 642 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 643 ++low_pfn;
@@ -986,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
986 switch (isolate_migratepages(zone, cc)) { 1003 switch (isolate_migratepages(zone, cc)) {
987 case ISOLATE_ABORT: 1004 case ISOLATE_ABORT:
988 ret = COMPACT_PARTIAL; 1005 ret = COMPACT_PARTIAL;
989 putback_lru_pages(&cc->migratepages); 1006 putback_movable_pages(&cc->migratepages);
990 cc->nr_migratepages = 0; 1007 cc->nr_migratepages = 0;
991 goto out; 1008 goto out;
992 case ISOLATE_NONE: 1009 case ISOLATE_NONE:
@@ -1009,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1009 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1026 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1010 nr_remaining); 1027 nr_remaining);
1011 1028
1012 /* Release LRU pages not migrated */ 1029 /* Release isolated pages not migrated */
1013 if (err) { 1030 if (err) {
1014 putback_lru_pages(&cc->migratepages); 1031 putback_movable_pages(&cc->migratepages);
1015 cc->nr_migratepages = 0; 1032 cc->nr_migratepages = 0;
1016 if (err == -ENOMEM) { 1033 if (err == -ENOMEM) {
1017 ret = COMPACT_PARTIAL; 1034 ret = COMPACT_PARTIAL;
diff --git a/mm/dmapool.c b/mm/dmapool.c
index da1b0f0b8709..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
332 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
333 *handle = offset + page->dma; 333 *handle = offset + page->dma;
334#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
 344 "dma_pool_alloc %s, %p (corrupted)\n",
345 pool->name, retval);
346 else
 347 pr_err("dma_pool_alloc %s, %p (corrupted)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
335 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
336#endif 360#endif
337 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
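The new DMAPOOL_DEBUG hunk is easier to follow with the poison lifecycle spelled out. The free-side half is not visible in this excerpt, so the summary below is an assumption inferred from the constants used above rather than a quote of mm/dmapool.c:

/*
 * Assumed poison lifecycle behind the corruption check:
 *
 *   dma_pool_free():   data[0..3]      = offset of the next free block
 *                      data[4..size-1] = POOL_POISON_FREED
 *   dma_pool_alloc():  verify data[4..size-1] still reads POOL_POISON_FREED,
 *                      warn and hex-dump otherwise, then memset the whole
 *                      block to POOL_POISON_ALLOCATED (as in the hunk above).
 *
 * Any stray byte past the embedded freelist offset therefore indicates a
 * write to the block after it was freed.
 */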
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..d999077431df 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 137 * So no dangers, even with speculative execution.
138 */ 138 */
139 page = pte_page(pkmap_page_table[i]); 139 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 140 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 141
143 set_page_address(page, NULL); 142 set_page_address(page, NULL);
144 need_flush = 1; 143 need_flush = 1;
@@ -324,11 +323,7 @@ struct page_address_map {
324 struct list_head list; 323 struct list_head list;
325}; 324};
326 325
327/* 326static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 327
333/* 328/*
334 * Hash table bucket 329 * Hash table bucket
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual)
393 388
394 pas = page_slot(page); 389 pas = page_slot(page);
395 if (virtual) { /* Add */ 390 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 391 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 392 pam->page = page;
405 pam->virtual = virtual; 393 pam->virtual = virtual;
406 394
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 401 if (pam->page == page) {
414 list_del(&pam->list); 402 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 403 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 404 goto done;
420 } 405 }
421 } 406 }
@@ -425,20 +410,14 @@ done:
425 return; 410 return;
426} 411}
427 412
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 413void __init page_address_init(void)
431{ 414{
432 int i; 415 int i;
433 416
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 417 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 418 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 419 spin_lock_init(&page_address_htable[i].lock);
440 } 420 }
441 spin_lock_init(&pool_lock);
442} 421}
443 422
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
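The highmem cleanups above rely on PKMAP_ADDR() and PKMAP_NR() being exact inverses over the persistent-kmap window, which is what allows page_address_maps[] to be indexed directly by kmap slot instead of going through the old freelist. The macro bodies below are the typical (x86-style) definitions, shown only for illustration; the authoritative versions are per architecture:

/*
 * Typical definitions (illustrative):
 *   #define PKMAP_ADDR(nr)	(PKMAP_BASE + ((nr) << PAGE_SHIFT))
 *   #define PKMAP_NR(virt)	(((virt) - PKMAP_BASE) >> PAGE_SHIFT)
 *
 * For any slot i in [0, LAST_PKMAP): PKMAP_NR(PKMAP_ADDR(i)) == i, so
 *	pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
 * picks a unique, stable slot per mapped address, and kmap_to_page() and
 * flush_all_zero_pkmaps() can use the macros instead of open-coding the
 * index arithmetic.
 */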
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..5f902e20e8c0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
606 return pmd; 606 return pmd;
607} 607}
608 608
609static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
610{
611 pmd_t entry;
612 entry = mk_pmd(page, vma->vm_page_prot);
613 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
614 entry = pmd_mkhuge(entry);
615 return entry;
616}
617
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 618static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 619 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 620 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 638 pte_free(mm, pgtable);
630 } else { 639 } else {
631 pmd_t entry; 640 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 641 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 642 /*
636 * The spinlocking to take the lru_lock inside 643 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 644 * page_add_new_anon_rmap() acts as a full memory
@@ -777,6 +784,28 @@ out:
777 return ret; 784 return ret;
778} 785}
779 786
787void huge_pmd_set_accessed(struct mm_struct *mm,
788 struct vm_area_struct *vma,
789 unsigned long address,
790 pmd_t *pmd, pmd_t orig_pmd,
791 int dirty)
792{
793 pmd_t entry;
794 unsigned long haddr;
795
796 spin_lock(&mm->page_table_lock);
797 if (unlikely(!pmd_same(*pmd, orig_pmd)))
798 goto unlock;
799
800 entry = pmd_mkyoung(orig_pmd);
801 haddr = address & HPAGE_PMD_MASK;
802 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
803 update_mmu_cache_pmd(vma, address, pmd);
804
805unlock:
806 spin_unlock(&mm->page_table_lock);
807}
808
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 809static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 810 struct vm_area_struct *vma,
782 unsigned long address, 811 unsigned long address,
@@ -951,9 +980,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
951 } else { 980 } else {
952 pmd_t entry; 981 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
954 entry = mk_pmd(new_page, vma->vm_page_prot); 983 entry = mk_huge_pmd(new_page, vma);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 984 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 985 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 986 set_pmd_at(mm, haddr, pmd, entry);
@@ -1146,22 +1173,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1173 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1174 enum page_check_address_pmd_flag flag)
1148{ 1175{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1176 pmd_t *pmd, *ret = NULL;
1152 1177
1153 if (address & ~HPAGE_PMD_MASK) 1178 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1179 goto out;
1155 1180
1156 pgd = pgd_offset(mm, address); 1181 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1182 if (!pmd)
1158 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out; 1183 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1184 if (pmd_none(*pmd))
1166 goto out; 1185 goto out;
1167 if (pmd_page(*pmd) != page) 1186 if (pmd_page(*pmd) != page)
@@ -1701,64 +1720,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 1720 }
1702} 1721}
1703 1722
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 1723static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 1724 unsigned long address,
1711 pte_t *pte) 1725 pte_t *pte)
1712{ 1726{
1713 struct page *page; 1727 struct page *page;
1714 pte_t *_pte; 1728 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 1729 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 1730 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 1731 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 1732 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 1733 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 1734 if (++none <= khugepaged_max_ptes_none)
1721 continue; 1735 continue;
1722 else { 1736 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 1737 goto out;
1725 }
1726 } 1738 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 1739 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 1740 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 1741 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 1742 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 1743 goto out;
1735 } 1744
1736 VM_BUG_ON(PageCompound(page)); 1745 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 1746 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 1747 VM_BUG_ON(!PageSwapBacked(page));
1739 1748
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 1749 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 1750 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 1751 goto out;
1744 }
1745 /* 1752 /*
1746 * We can do it before isolate_lru_page because the 1753 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 1754 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 1755 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 1756 * when invoked from the VM.
1750 */ 1757 */
1751 if (!trylock_page(page)) { 1758 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 1759 goto out;
1754 }
1755 /* 1760 /*
1756 * Isolate the page to avoid collapsing an hugepage 1761 * Isolate the page to avoid collapsing an hugepage
1757 * currently in use by the VM. 1762 * currently in use by the VM.
1758 */ 1763 */
1759 if (isolate_lru_page(page)) { 1764 if (isolate_lru_page(page)) {
1760 unlock_page(page); 1765 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 1766 goto out;
1763 } 1767 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 1768 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +1775,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 1775 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 1776 referenced = 1;
1773 } 1777 }
1774 if (unlikely(!referenced)) 1778 if (likely(referenced))
1775 release_all_pte_pages(pte); 1779 return 1;
1776 else
1777 isolated = 1;
1778out: 1780out:
1779 return isolated; 1781 release_pte_pages(pte, _pte);
1782 return 0;
1780} 1783}
1781 1784
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 1785static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +1921,26 @@ static struct page
1918} 1921}
1919#endif 1922#endif
1920 1923
1924static bool hugepage_vma_check(struct vm_area_struct *vma)
1925{
1926 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1927 (vma->vm_flags & VM_NOHUGEPAGE))
1928 return false;
1929
1930 if (!vma->anon_vma || vma->vm_ops)
1931 return false;
1932 if (is_vma_temporary_stack(vma))
1933 return false;
1934 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1935 return true;
1936}
1937
1921static void collapse_huge_page(struct mm_struct *mm, 1938static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 1939 unsigned long address,
1923 struct page **hpage, 1940 struct page **hpage,
1924 struct vm_area_struct *vma, 1941 struct vm_area_struct *vma,
1925 int node) 1942 int node)
1926{ 1943{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 1944 pmd_t *pmd, _pmd;
1930 pte_t *pte; 1945 pte_t *pte;
1931 pgtable_t pgtable; 1946 pgtable_t pgtable;
@@ -1960,28 +1975,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 1975 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 1976 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 1977 goto out;
1963 1978 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out;
1970 if (is_vma_temporary_stack(vma))
1971 goto out; 1979 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1980 pmd = mm_find_pmd(mm, address);
1973 1981 if (!pmd)
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out; 1982 goto out;
1977 1983 if (pmd_trans_huge(*pmd))
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out;
1981
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 1984 goto out;
1986 1985
1987 anon_vma_lock(vma->anon_vma); 1986 anon_vma_lock(vma->anon_vma);
@@ -2028,9 +2027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2027 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2028 pgtable = pmd_pgtable(_pmd);
2030 2029
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2030 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2031
2035 /* 2032 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2033 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2061,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2061 unsigned long address,
2065 struct page **hpage) 2062 struct page **hpage)
2066{ 2063{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2064 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2065 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2066 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2071,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2071
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2072 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2073
2079 pgd = pgd_offset(mm, address); 2074 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2075 if (!pmd)
2081 goto out;
2082
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out; 2076 goto out;
2086 2077 if (pmd_trans_huge(*pmd))
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2078 goto out;
2090 2079
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2080 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2182,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2182 progress++;
2194 break; 2183 break;
2195 } 2184 }
2196 2185 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2186skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2187 progress++;
2202 continue; 2188 continue;
2203 } 2189 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2190 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2191 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2192 if (hstart >= hend)
@@ -2379,22 +2359,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2379static void split_huge_page_address(struct mm_struct *mm, 2359static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2360 unsigned long address)
2381{ 2361{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2362 pmd_t *pmd;
2385 2363
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2364 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2365
2388 pgd = pgd_offset(mm, address); 2366 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2367 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2368 return;
2399 /* 2369 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2370 * Caller holds the mmap_sem write mode, so a huge pmd cannot
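Several sites in this file (and in mm/ksm.c and mm/migrate.c below) replace an open-coded pgd -> pud -> pmd walk with mm_find_pmd(). The helper itself lands in mm/rmap.c, which is outside this excerpt; the sketch below reconstructs the walk being consolidated from the removed lines and is not a quote of the new function. Note that callers which previously bailed out on pmd_trans_huge() now perform that check separately after the helper returns, as the hunks above show.

/* Rough shape of the consolidated walk (see mm/rmap.c for the real one). */
static pmd_t *mm_find_pmd_sketch(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		pmd = NULL;
out:
	return pmd;
}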
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..1ef2cd4ae3c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void)
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_HIGH_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..52d1fa957194 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..382d930a0bf1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
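The KSM hunk above replaces the old save/restore dance on oom_score_adj with a simple mark/clear pair around the unmerge. The general pattern, for any task that wants the OOM killer to prefer it while it performs a memory-hungry operation, looks roughly like this; do_expensive_work() is a stand-in and the helpers themselves are introduced elsewhere in this series:

set_current_oom_origin();	/* prefer current as the OOM victim */
err = do_expensive_work();	/* e.g. unmerge_and_remove_all_rmap_items() */
clear_current_oom_origin();	/* back to normal OOM scoring */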
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..cf6d0df4849c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1498 return limit;
1499} 1499}
1500 1500
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1501static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1502 int order)
1503{ 1503{
1504 struct mem_cgroup *iter; 1504 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1505 unsigned long chosen_points = 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8b20278be6a6..108c52fa60f6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..765377385632 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3537,8 +3537,9 @@ retry:
3537 3537
3538 barrier(); 3538 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3539 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3540 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3541
3542 if (dirty && !pmd_write(orig_pmd) &&
3542 !pmd_trans_splitting(orig_pmd)) { 3543 !pmd_trans_splitting(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3544 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3545 orig_pmd);
@@ -3550,6 +3551,9 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3551 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3552 goto retry;
3552 return ret; 3553 return ret;
3554 } else {
3555 huge_pmd_set_accessed(mm, vma, address, pmd,
3556 orig_pmd, dirty);
3553 } 3557 }
3554 return 0; 3558 return 0;
3555 } 3559 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..de9cb14ae753 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 205 zone_span_writelock(zone);
206 206
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 208 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 209 zone->zone_start_pfn = start_pfn;
210 210
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 214 zone_span_writeunlock(zone);
215} 215}
216 216
217static void resize_zone(struct zone *zone, unsigned long start_pfn,
218 unsigned long end_pfn)
219{
220 zone_span_writelock(zone);
221
222 if (end_pfn - start_pfn) {
223 zone->zone_start_pfn = start_pfn;
224 zone->spanned_pages = end_pfn - start_pfn;
225 } else {
226 /*
 227 * make it consistent with free_area_init_core():
 228 * if spanned_pages == 0, then keep start_pfn = 0
229 */
230 zone->zone_start_pfn = 0;
231 zone->spanned_pages = 0;
232 }
233
234 zone_span_writeunlock(zone);
235}
236
237static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
238 unsigned long end_pfn)
239{
240 enum zone_type zid = zone_idx(zone);
241 int nid = zone->zone_pgdat->node_id;
242 unsigned long pfn;
243
244 for (pfn = start_pfn; pfn < end_pfn; pfn++)
245 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
246}
247
248static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
249 unsigned long start_pfn, unsigned long end_pfn)
250{
251 int ret;
252 unsigned long flags;
253 unsigned long z1_start_pfn;
254
255 if (!z1->wait_table) {
256 ret = init_currently_empty_zone(z1, start_pfn,
257 end_pfn - start_pfn, MEMMAP_HOTPLUG);
258 if (ret)
259 return ret;
260 }
261
262 pgdat_resize_lock(z1->zone_pgdat, &flags);
263
264 /* can't move pfns which are higher than @z2 */
265 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
266 goto out_fail;
 267 /* the part being moved out must be at the leftmost of @z2 */
268 if (start_pfn > z2->zone_start_pfn)
269 goto out_fail;
 270 /* the range must overlap @z2 */
271 if (end_pfn <= z2->zone_start_pfn)
272 goto out_fail;
273
274 /* use start_pfn for z1's start_pfn if z1 is empty */
275 if (z1->spanned_pages)
276 z1_start_pfn = z1->zone_start_pfn;
277 else
278 z1_start_pfn = start_pfn;
279
280 resize_zone(z1, z1_start_pfn, end_pfn);
281 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
282
283 pgdat_resize_unlock(z1->zone_pgdat, &flags);
284
285 fix_zone_id(z1, start_pfn, end_pfn);
286
287 return 0;
288out_fail:
289 pgdat_resize_unlock(z1->zone_pgdat, &flags);
290 return -1;
291}
292
293static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
294 unsigned long start_pfn, unsigned long end_pfn)
295{
296 int ret;
297 unsigned long flags;
298 unsigned long z2_end_pfn;
299
300 if (!z2->wait_table) {
301 ret = init_currently_empty_zone(z2, start_pfn,
302 end_pfn - start_pfn, MEMMAP_HOTPLUG);
303 if (ret)
304 return ret;
305 }
306
307 pgdat_resize_lock(z1->zone_pgdat, &flags);
308
309 /* can't move pfns which are lower than @z1 */
310 if (z1->zone_start_pfn > start_pfn)
311 goto out_fail;
 312 /* the part being moved out must be at the rightmost of @z1 */
313 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
314 goto out_fail;
 315 /* the range must overlap @z1 */
316 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
317 goto out_fail;
318
319 /* use end_pfn for z2's end_pfn if z2 is empty */
320 if (z2->spanned_pages)
321 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
322 else
323 z2_end_pfn = end_pfn;
324
325 resize_zone(z1, z1->zone_start_pfn, start_pfn);
326 resize_zone(z2, start_pfn, z2_end_pfn);
327
328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
329
330 fix_zone_id(z2, start_pfn, end_pfn);
331
332 return 0;
333out_fail:
334 pgdat_resize_unlock(z1->zone_pgdat, &flags);
335 return -1;
336}
337
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 338static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 339 unsigned long end_pfn)
219{ 340{
220 unsigned long old_pgdat_end_pfn = 341 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 342 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 343
223 if (start_pfn < pgdat->node_start_pfn) 344 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 345 pgdat->node_start_pfn = start_pfn;
225 346
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 347 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +581,61 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 581 return 0;
461} 582}
462 583
584/* ensure every online node has NORMAL memory */
585static bool can_online_high_movable(struct zone *zone)
586{
587 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
588}
589
 590/* check which state of node_states will be changed when onlining memory */
591static void node_states_check_changes_online(unsigned long nr_pages,
592 struct zone *zone, struct memory_notify *arg)
593{
594 int nid = zone_to_nid(zone);
595 enum zone_type zone_last = ZONE_NORMAL;
596
597 /*
598 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
599 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
600 *
601 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
602 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
603 */
604 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
605 zone_last = ZONE_MOVABLE;
606
607 /*
 608 * if the memory to be onlined is in a zone of 0...zone_last, and
 609 * the zones of 0...zone_last don't have memory before onlining, we will
 610 * need to set the node in node_states[N_NORMAL_MEMORY] after
 611 * the memory is onlined.
612 */
613 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
614 arg->status_change_nid_normal = nid;
615 else
616 arg->status_change_nid_normal = -1;
617
618 /*
 619 * if the node doesn't have memory before onlining, we will need to
 620 * set the node in node_states[N_HIGH_MEMORY] after the memory
 621 * is onlined.
622 */
623 if (!node_state(nid, N_HIGH_MEMORY))
624 arg->status_change_nid = nid;
625 else
626 arg->status_change_nid = -1;
627}
628
629static void node_states_set_node(int node, struct memory_notify *arg)
630{
631 if (arg->status_change_nid_normal >= 0)
632 node_set_state(node, N_NORMAL_MEMORY);
633
634 node_set_state(node, N_HIGH_MEMORY);
635}
463 636
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 637
638int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 639{
466 unsigned long onlined_pages = 0; 640 unsigned long onlined_pages = 0;
467 struct zone *zone; 641 struct zone *zone;
@@ -471,13 +645,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 645 struct memory_notify arg;
472 646
473 lock_memory_hotplug(); 647 lock_memory_hotplug();
648 /*
649 * This doesn't need a lock to do pfn_to_page().
650 * The section can't be removed here because of the
651 * memory_block->state_mutex.
652 */
653 zone = page_zone(pfn_to_page(pfn));
654
655 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
656 !can_online_high_movable(zone)) {
657 unlock_memory_hotplug();
658 return -1;
659 }
660
661 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
662 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
663 unlock_memory_hotplug();
664 return -1;
665 }
666 }
667 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
668 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
669 unlock_memory_hotplug();
670 return -1;
671 }
672 }
673
 674 /* Previous code may have changed the zone of the pfn range */
675 zone = page_zone(pfn_to_page(pfn));
676
474 arg.start_pfn = pfn; 677 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 678 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 679 node_states_check_changes_online(nr_pages, zone, &arg);
477 680
478 nid = page_to_nid(pfn_to_page(pfn)); 681 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 682
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 683 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 684 ret = notifier_to_errno(ret);
@@ -487,23 +688,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 688 return ret;
488 } 689 }
489 /* 690 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 691 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 692 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 693 * So, zonelist must be updated after online.
499 */ 694 */
500 mutex_lock(&zonelists_mutex); 695 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 696 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 697 need_zonelists_rebuild = 1;
698 build_all_zonelists(NULL, zone);
699 }
503 700
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 701 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 702 online_pages_range);
506 if (ret) { 703 if (ret) {
704 if (need_zonelists_rebuild)
705 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 706 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 707 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 708 (unsigned long long) pfn << PAGE_SHIFT,
@@ -517,9 +716,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
517 zone->present_pages += onlined_pages; 716 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 717 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 718 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 719 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 720 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 721 build_all_zonelists(NULL, NULL);
523 else 722 else
524 zone_pcp_update(zone); 723 zone_pcp_update(zone);
525 } 724 }
@@ -847,7 +1046,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1046{
848 int ret; 1047 int ret;
849 long offlined = *(long *)data; 1048 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1049 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1050 offlined = nr_pages;
852 if (!ret) 1051 if (!ret)
853 *(long *)data += offlined; 1052 *(long *)data += offlined;
@@ -867,6 +1066,91 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1066 return offlined;
868} 1067}
869 1068
1069/* ensure the node has NORMAL memory if it is still online */
1070static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1071{
1072 struct pglist_data *pgdat = zone->zone_pgdat;
1073 unsigned long present_pages = 0;
1074 enum zone_type zt;
1075
1076 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1077 present_pages += pgdat->node_zones[zt].present_pages;
1078
1079 if (present_pages > nr_pages)
1080 return true;
1081
1082 present_pages = 0;
1083 for (; zt <= ZONE_MOVABLE; zt++)
1084 present_pages += pgdat->node_zones[zt].present_pages;
1085
1086 /*
1087 * we can't offline the last normal memory until all
1088 * higher memory is offlined.
1089 */
1090 return present_pages == 0;
1091}
1092
 1093/* check which state of node_states will be changed when offlining memory */
1094static void node_states_check_changes_offline(unsigned long nr_pages,
1095 struct zone *zone, struct memory_notify *arg)
1096{
1097 struct pglist_data *pgdat = zone->zone_pgdat;
1098 unsigned long present_pages = 0;
1099 enum zone_type zt, zone_last = ZONE_NORMAL;
1100
1101 /*
1102 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
1103 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
1104 *
1105 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
1106 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1107 */
1108 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
1109 zone_last = ZONE_MOVABLE;
1110
1111 /*
1112 * check whether node_states[N_NORMAL_MEMORY] will be changed.
 1113 * If the memory to be offlined is in a zone of 0...zone_last,
 1114 * and it is the last present memory there, 0...zone_last will
 1115 * become empty after offlining, thus we can determine that we will
1116 * need to clear the node from node_states[N_NORMAL_MEMORY].
1117 */
1118 for (zt = 0; zt <= zone_last; zt++)
1119 present_pages += pgdat->node_zones[zt].present_pages;
1120 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1121 arg->status_change_nid_normal = zone_to_nid(zone);
1122 else
1123 arg->status_change_nid_normal = -1;
1124
1125 /*
1126 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1127 */
1128 zone_last = ZONE_MOVABLE;
1129
1130 /*
 1131 * check whether node_states[N_HIGH_MEMORY] will be changed.
 1132 * If we try to offline the last present @nr_pages from the node,
 1133 * we can determine that we will need to clear the node from
1134 * node_states[N_HIGH_MEMORY].
1135 */
1136 for (; zt <= zone_last; zt++)
1137 present_pages += pgdat->node_zones[zt].present_pages;
1138 if (nr_pages >= present_pages)
1139 arg->status_change_nid = zone_to_nid(zone);
1140 else
1141 arg->status_change_nid = -1;
1142}
1143
1144static void node_states_clear_node(int node, struct memory_notify *arg)
1145{
1146 if (arg->status_change_nid_normal >= 0)
1147 node_clear_state(node, N_NORMAL_MEMORY);
1148
1149 if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) &&
1150 (arg->status_change_nid >= 0))
1151 node_clear_state(node, N_HIGH_MEMORY);
1152}
1153
870static int __ref __offline_pages(unsigned long start_pfn, 1154static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1155 unsigned long end_pfn, unsigned long timeout)
872{ 1156{
@@ -893,16 +1177,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1177 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1178 nr_pages = end_pfn - start_pfn;
895 1179
1180 ret = -EINVAL;
1181 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1182 goto out;
1183
896 /* set above range as isolated */ 1184 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1185 ret = start_isolate_page_range(start_pfn, end_pfn,
1186 MIGRATE_MOVABLE, true);
898 if (ret) 1187 if (ret)
899 goto out; 1188 goto out;
900 1189
901 arg.start_pfn = start_pfn; 1190 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1191 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1192 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1193
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1194 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1195 ret = notifier_to_errno(ret);
@@ -975,10 +1262,9 @@ repeat:
975 } else 1262 } else
976 zone_pcp_update(zone); 1263 zone_pcp_update(zone);
977 1264
978 if (!node_present_pages(node)) { 1265 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1266 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1267 kswapd_stop(node);
981 }
982 1268
983 vm_total_pages = nr_free_pagecache_pages(); 1269 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1270 writeback_set_ratelimit();
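To make the new ONLINE_KERNEL/ONLINE_MOVABLE paths above concrete, here is the boundary arithmetic for one hypothetical layout (all PFNs are invented for the example); it traces what move_pfn_range_left() does when the leftmost slice of ZONE_MOVABLE is onlined as kernel memory:

/*
 * Before (hypothetical, no HIGHMEM):
 *   ZONE_NORMAL : pfn [0x10000, 0x40000)
 *   ZONE_MOVABLE: pfn [0x40000, 0x80000)
 *
 * online_pages(0x40000, 0x8000, ONLINE_KERNEL) sees zone_idx == ZONE_MOVABLE
 * and calls move_pfn_range_left(zone - 1, zone, 0x40000, 0x48000), which
 * passes the three checks (the range ends inside ZONE_MOVABLE, starts no
 * later than its current start, and overlaps it) and then does:
 *
 *	resize_zone(ZONE_NORMAL,  0x10000, 0x48000);
 *	resize_zone(ZONE_MOVABLE, 0x48000, 0x80000);
 *	fix_zone_id(ZONE_NORMAL,  0x40000, 0x48000);	// relink struct pages
 *
 * A slice that does not start at or below the current ZONE_MOVABLE start
 * fails the start_pfn check, and online_pages() bails out with an error.
 */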
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ea600da8940..05b28361a39b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1907 unsigned long addr, int node)
1908{ 1908{
1909 struct mempolicy *pol; 1909 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1910 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1911 unsigned int cpuset_mems_cookie;
1913 1912
@@ -1926,23 +1925,11 @@ retry_cpuset:
1926 1925
1927 return page; 1926 return page;
1928 } 1927 }
1929 zl = policy_zonelist(gfp, pol, node); 1928 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 1929 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 1930 policy_nodemask(gfp, pol));
1931 if (unlikely(mpol_needs_cond_ref(pol)))
1932 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 1933 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 1934 goto retry_cpuset;
1948 return page; 1935 return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..3f675ca08279 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,7 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
@@ -79,7 +80,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 80 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 81 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 82 page_is_file_cache(page));
82 putback_lru_page(page); 83 putback_lru_page(page);
84 }
85}
86
87/*
88 * Put previously isolated pages back onto the appropriate lists
89 * from where they were once taken off for compaction/migration.
90 *
91 * This function shall be used instead of putback_lru_pages(),
92 * whenever the isolated pageset has been built by isolate_migratepages_range()
93 */
94void putback_movable_pages(struct list_head *l)
95{
96 struct page *page;
97 struct page *page2;
98
99 list_for_each_entry_safe(page, page2, l, lru) {
100 list_del(&page->lru);
101 dec_zone_page_state(page, NR_ISOLATED_ANON +
102 page_is_file_cache(page));
103 if (unlikely(balloon_page_movable(page)))
104 balloon_page_putback(page);
105 else
106 putback_lru_page(page);
83 } 107 }
84} 108}
85 109
@@ -91,8 +115,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 115{
92 struct mm_struct *mm = vma->vm_mm; 116 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 117 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 118 pmd_t *pmd;
97 pte_t *ptep, pte; 119 pte_t *ptep, pte;
98 spinlock_t *ptl; 120 spinlock_t *ptl;
@@ -103,19 +125,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 125 goto out;
104 ptl = &mm->page_table_lock; 126 ptl = &mm->page_table_lock;
105 } else { 127 } else {
106 pgd = pgd_offset(mm, addr); 128 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 129 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 130 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 131 if (pmd_trans_huge(*pmd))
116 goto out; 132 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 133
120 ptep = pte_offset_map(pmd, addr); 134 ptep = pte_offset_map(pmd, addr);
121 135
@@ -286,7 +300,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
286 /* Anonymous page without mapping */ 300 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 301 if (page_count(page) != 1)
288 return -EAGAIN; 302 return -EAGAIN;
289 return 0; 303 return MIGRATEPAGE_SUCCESS;
290 } 304 }
291 305
292 spin_lock_irq(&mapping->tree_lock); 306 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +370,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 370 }
357 spin_unlock_irq(&mapping->tree_lock); 371 spin_unlock_irq(&mapping->tree_lock);
358 372
359 return 0; 373 return MIGRATEPAGE_SUCCESS;
360} 374}
361 375
362/* 376/*
@@ -372,7 +386,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 386 if (!mapping) {
373 if (page_count(page) != 1) 387 if (page_count(page) != 1)
374 return -EAGAIN; 388 return -EAGAIN;
375 return 0; 389 return MIGRATEPAGE_SUCCESS;
376 } 390 }
377 391
378 spin_lock_irq(&mapping->tree_lock); 392 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +413,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 413 page_unfreeze_refs(page, expected_count - 1);
400 414
401 spin_unlock_irq(&mapping->tree_lock); 415 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 416 return MIGRATEPAGE_SUCCESS;
403} 417}
404 418
405/* 419/*
@@ -486,11 +500,11 @@ int migrate_page(struct address_space *mapping,
486 500
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 501 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 502
489 if (rc) 503 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 504 return rc;
491 505
492 migrate_page_copy(newpage, page); 506 migrate_page_copy(newpage, page);
493 return 0; 507 return MIGRATEPAGE_SUCCESS;
494} 508}
495EXPORT_SYMBOL(migrate_page); 509EXPORT_SYMBOL(migrate_page);
496 510
@@ -513,7 +527,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 527
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 528 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 529
516 if (rc) 530 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 531 return rc;
518 532
519 /* 533 /*
@@ -549,7 +563,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 563
550 } while (bh != head); 564 } while (bh != head);
551 565
552 return 0; 566 return MIGRATEPAGE_SUCCESS;
553} 567}
554EXPORT_SYMBOL(buffer_migrate_page); 568EXPORT_SYMBOL(buffer_migrate_page);
555#endif 569#endif
@@ -628,7 +642,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 642 *
629 * Return value: 643 * Return value:
630 * < 0 - error code 644 * < 0 - error code
631 * == 0 - success 645 * MIGRATEPAGE_SUCCESS - success
632 */ 646 */
633static int move_to_new_page(struct page *newpage, struct page *page, 647static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 648 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +679,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 679 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 680 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 681
668 if (rc) { 682 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 683 newpage->mapping = NULL;
670 } else { 684 } else {
671 if (remap_swapcache) 685 if (remap_swapcache)
@@ -778,6 +792,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 792 }
779 } 793 }
780 794
795 if (unlikely(balloon_page_movable(page))) {
796 /*
797 * A ballooned page does not need any special attention from
798 * physical to virtual reverse mapping procedures.
799 * Skip any attempt to unmap PTEs or to remap swap cache,
800 * in order to avoid burning cycles at rmap level, and perform
 801 * the page migration right away (protected by page lock).
802 */
803 rc = balloon_page_migrate(newpage, page, mode);
804 goto uncharge;
805 }
806
781 /* 807 /*
782 * Corner case handling: 808 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 809 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +840,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 840 put_anon_vma(anon_vma);
815 841
816uncharge: 842uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 843 mem_cgroup_end_migration(mem, page, newpage,
844 (rc == MIGRATEPAGE_SUCCESS ||
845 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 846unlock:
819 unlock_page(page); 847 unlock_page(page);
820out: 848out:
@@ -846,6 +874,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 874 goto out;
847 875
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 876 rc = __unmap_and_move(page, newpage, force, offlining, mode);
877
878 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
879 /*
880 * A ballooned page has been migrated already.
 881 * Now it's time to wrap up counters,
 882 * hand the page back to the buddy allocator and return.
883 */
884 dec_zone_page_state(page, NR_ISOLATED_ANON +
885 page_is_file_cache(page));
886 balloon_page_free(page);
887 return MIGRATEPAGE_SUCCESS;
888 }
849out: 889out:
850 if (rc != -EAGAIN) { 890 if (rc != -EAGAIN) {
851 /* 891 /*
@@ -987,7 +1027,7 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1027 case -EAGAIN:
988 retry++; 1028 retry++;
989 break; 1029 break;
990 case 0: 1030 case MIGRATEPAGE_SUCCESS:
991 break; 1031 break;
992 default: 1032 default:
993 /* Permanent failure */ 1033 /* Permanent failure */
@@ -996,15 +1036,12 @@ int migrate_pages(struct list_head *from,
996 } 1036 }
997 } 1037 }
998 } 1038 }
999 rc = 0; 1039 rc = nr_failed + retry;
1000out: 1040out:
1001 if (!swapwrite) 1041 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1042 current->flags &= ~PF_SWAPWRITE;
1003 1043
1004 if (rc) 1044 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1045}
1009 1046
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1047int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1061,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1024 /* try again */ 1061 /* try again */
1025 cond_resched(); 1062 cond_resched();
1026 break; 1063 break;
1027 case 0: 1064 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1065 goto out;
1029 default: 1066 default:
1030 rc = -EIO; 1067 rc = -EIO;
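
The mm/migrate.c hunks above replace the bare 0-on-success convention with the MIGRATEPAGE_SUCCESS symbol and add a separate MIGRATEPAGE_BALLOON_SUCCESS path for ballooned pages. The standalone userspace sketch below only models that return-code convention; the enum values and the toy_* helpers are illustrative assumptions, not the kernel's definitions.

/*
 * Illustrative userspace sketch (not kernel code) of the return-code
 * convention introduced above: callers compare against
 * MIGRATEPAGE_SUCCESS instead of 0.  The numeric values are assumed
 * for the sake of the example.
 */
#include <stdio.h>

enum migratepage_rc {
	MIGRATEPAGE_SUCCESS = 0,		/* assumed value */
	MIGRATEPAGE_BALLOON_SUCCESS = 1,	/* assumed value */
	MIGRATEPAGE_EAGAIN = -11,		/* stand-in for -EAGAIN */
};

/* Toy stand-in for migrate_page_move_mapping() */
static int toy_move_mapping(int extra_refs)
{
	return extra_refs ? MIGRATEPAGE_EAGAIN : MIGRATEPAGE_SUCCESS;
}

/* Toy stand-in for migrate_page(): only proceed on explicit success */
static int toy_migrate_page(int extra_refs)
{
	int rc = toy_move_mapping(extra_refs);

	if (rc != MIGRATEPAGE_SUCCESS)	/* the new-style check */
		return rc;
	/* ...copy page contents here... */
	return MIGRATEPAGE_SUCCESS;
}

int main(void)
{
	printf("no extra refs -> rc=%d\n", toy_migrate_page(0));
	printf("extra refs    -> rc=%d\n", toy_migrate_page(1));
	return 0;
}

The point of the named constant is that call sites such as migrate_page() and buffer_migrate_page() now test rc != MIGRATEPAGE_SUCCESS rather than treating any non-zero value as failure, which leaves room for positive status codes like the balloon case handled in unmap_and_move().
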
diff --git a/mm/mmap.c b/mm/mmap.c
index 7d416055f08c..f940062c8d4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -311,40 +312,88 @@ out:
311 return retval; 312 return retval;
312} 313}
313 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
314#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
315static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
316{ 338{
317 int i = 0, j; 339 int i = 0, j, bug = 0;
318 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
319 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
320 342
321 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
322 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
323 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
324 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
325 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
326 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
327 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
328 if (vma->vm_start > vma->vm_end) 352 bug = 1;
329 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
330 i++; 365 i++;
331 pn = nd; 366 pn = nd;
332 prev = vma->vm_start; 367 prev = vma->vm_start;
333 pend = vma->vm_end; 368 pend = vma->vm_end;
334 } 369 }
335 j = 0; 370 j = 0;
336 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
337 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
338 } 389 }
339 if (i != j)
340 printk("backwards %d, forwards %d\n", j, i), i = 0;
341 return i;
342} 390}
343 391
344void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
345{ 393{
346 int bug = 0; 394 int bug = 0;
347 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
348 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
349 while (vma) { 398 while (vma) {
350 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -352,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
352 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
353 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
354 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
355 vma = vma->vm_next; 405 vma = vma->vm_next;
356 i++; 406 i++;
357 } 407 }
358 if (i != mm->map_count) 408 if (i != mm->map_count) {
359 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
360 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
361 if (i != mm->map_count) 418 if (i != mm->map_count) {
362 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
363 BUG_ON(bug); 422 BUG_ON(bug);
364} 423}
365#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
366#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
367#endif 427#endif
368 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
441 * function that does exacltly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
369/* 471/*
370 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
371 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -435,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
435void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
436 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
437{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
 549 * update the vma's vm_rb parents' rb_subtree_gap values on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
438 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
439 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
440} 559}
441 560
442static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -512,12 +631,12 @@ static inline void
512__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
513 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
514{ 633{
515 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
516 635
517 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
518 if (next) 638 if (next)
519 next->vm_prev = prev; 639 next->vm_prev = prev;
520 rb_erase(&vma->vm_rb, &mm->mm_rb);
521 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
522 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
523} 642}
@@ -539,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
539 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
540 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
541 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
542 long adjust_next = 0; 662 long adjust_next = 0;
543 int remove_next = 0; 663 int remove_next = 0;
544 664
@@ -629,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
629 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
630 } 750 }
631 751
632 vma->vm_start = start; 752 if (start != vma->vm_start) {
633 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
634 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
635 if (adjust_next) { 761 if (adjust_next) {
636 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -659,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
659 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
660 */ 786 */
661 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
662 } 797 }
663 798
664 if (anon_vma) { 799 if (anon_vma) {
@@ -692,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
692 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
693 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
694 */ 829 */
695 if (remove_next == 2) { 830 next = vma->vm_next;
696 next = vma->vm_next; 831 if (remove_next == 2)
697 goto again; 832 goto again;
698 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
699 } 837 }
700 if (insert && file) 838 if (insert && file)
701 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1167,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1167 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1168 */ 1306 */
1169 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1170 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1171 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1172 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1173 return PTR_ERR(file); 1312 return PTR_ERR(file);
1174 } 1313 }
@@ -1414,6 +1553,206 @@ unacct_error:
1414 return error; 1553 return error;
1415} 1554}
1416 1555
1556unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1557{
1558 /*
1559 * We implement the search by looking for an rbtree node that
1560 * immediately follows a suitable gap. That is,
1561 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1562 * - gap_end = vma->vm_start >= info->low_limit + length;
1563 * - gap_end - gap_start >= length
1564 */
1565
1566 struct mm_struct *mm = current->mm;
1567 struct vm_area_struct *vma;
1568 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1569
1570 /* Adjust search length to account for worst case alignment overhead */
1571 length = info->length + info->align_mask;
1572 if (length < info->length)
1573 return -ENOMEM;
1574
1575 /* Adjust search limits by the desired length */
1576 if (info->high_limit < length)
1577 return -ENOMEM;
1578 high_limit = info->high_limit - length;
1579
1580 if (info->low_limit > high_limit)
1581 return -ENOMEM;
1582 low_limit = info->low_limit + length;
1583
1584 /* Check if rbtree root looks promising */
1585 if (RB_EMPTY_ROOT(&mm->mm_rb))
1586 goto check_highest;
1587 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1588 if (vma->rb_subtree_gap < length)
1589 goto check_highest;
1590
1591 while (true) {
1592 /* Visit left subtree if it looks promising */
1593 gap_end = vma->vm_start;
1594 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1595 struct vm_area_struct *left =
1596 rb_entry(vma->vm_rb.rb_left,
1597 struct vm_area_struct, vm_rb);
1598 if (left->rb_subtree_gap >= length) {
1599 vma = left;
1600 continue;
1601 }
1602 }
1603
1604 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1605check_current:
1606 /* Check if current node has a suitable gap */
1607 if (gap_start > high_limit)
1608 return -ENOMEM;
1609 if (gap_end >= low_limit && gap_end - gap_start >= length)
1610 goto found;
1611
1612 /* Visit right subtree if it looks promising */
1613 if (vma->vm_rb.rb_right) {
1614 struct vm_area_struct *right =
1615 rb_entry(vma->vm_rb.rb_right,
1616 struct vm_area_struct, vm_rb);
1617 if (right->rb_subtree_gap >= length) {
1618 vma = right;
1619 continue;
1620 }
1621 }
1622
1623 /* Go back up the rbtree to find next candidate node */
1624 while (true) {
1625 struct rb_node *prev = &vma->vm_rb;
1626 if (!rb_parent(prev))
1627 goto check_highest;
1628 vma = rb_entry(rb_parent(prev),
1629 struct vm_area_struct, vm_rb);
1630 if (prev == vma->vm_rb.rb_left) {
1631 gap_start = vma->vm_prev->vm_end;
1632 gap_end = vma->vm_start;
1633 goto check_current;
1634 }
1635 }
1636 }
1637
1638check_highest:
1639 /* Check highest gap, which does not precede any rbtree node */
1640 gap_start = mm->highest_vm_end;
1641 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1642 if (gap_start > high_limit)
1643 return -ENOMEM;
1644
1645found:
1646 /* We found a suitable gap. Clip it with the original low_limit. */
1647 if (gap_start < info->low_limit)
1648 gap_start = info->low_limit;
1649
1650 /* Adjust gap address to the desired alignment */
1651 gap_start += (info->align_offset - gap_start) & info->align_mask;
1652
1653 VM_BUG_ON(gap_start + info->length > info->high_limit);
1654 VM_BUG_ON(gap_start + info->length > gap_end);
1655 return gap_start;
1656}
1657
1658unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1659{
1660 struct mm_struct *mm = current->mm;
1661 struct vm_area_struct *vma;
1662 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1663
1664 /* Adjust search length to account for worst case alignment overhead */
1665 length = info->length + info->align_mask;
1666 if (length < info->length)
1667 return -ENOMEM;
1668
1669 /*
1670 * Adjust search limits by the desired length.
1671 * See implementation comment at top of unmapped_area().
1672 */
1673 gap_end = info->high_limit;
1674 if (gap_end < length)
1675 return -ENOMEM;
1676 high_limit = gap_end - length;
1677
1678 if (info->low_limit > high_limit)
1679 return -ENOMEM;
1680 low_limit = info->low_limit + length;
1681
1682 /* Check highest gap, which does not precede any rbtree node */
1683 gap_start = mm->highest_vm_end;
1684 if (gap_start <= high_limit)
1685 goto found_highest;
1686
1687 /* Check if rbtree root looks promising */
1688 if (RB_EMPTY_ROOT(&mm->mm_rb))
1689 return -ENOMEM;
1690 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1691 if (vma->rb_subtree_gap < length)
1692 return -ENOMEM;
1693
1694 while (true) {
1695 /* Visit right subtree if it looks promising */
1696 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1697 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1698 struct vm_area_struct *right =
1699 rb_entry(vma->vm_rb.rb_right,
1700 struct vm_area_struct, vm_rb);
1701 if (right->rb_subtree_gap >= length) {
1702 vma = right;
1703 continue;
1704 }
1705 }
1706
1707check_current:
1708 /* Check if current node has a suitable gap */
1709 gap_end = vma->vm_start;
1710 if (gap_end < low_limit)
1711 return -ENOMEM;
1712 if (gap_start <= high_limit && gap_end - gap_start >= length)
1713 goto found;
1714
1715 /* Visit left subtree if it looks promising */
1716 if (vma->vm_rb.rb_left) {
1717 struct vm_area_struct *left =
1718 rb_entry(vma->vm_rb.rb_left,
1719 struct vm_area_struct, vm_rb);
1720 if (left->rb_subtree_gap >= length) {
1721 vma = left;
1722 continue;
1723 }
1724 }
1725
1726 /* Go back up the rbtree to find next candidate node */
1727 while (true) {
1728 struct rb_node *prev = &vma->vm_rb;
1729 if (!rb_parent(prev))
1730 return -ENOMEM;
1731 vma = rb_entry(rb_parent(prev),
1732 struct vm_area_struct, vm_rb);
1733 if (prev == vma->vm_rb.rb_right) {
1734 gap_start = vma->vm_prev ?
1735 vma->vm_prev->vm_end : 0;
1736 goto check_current;
1737 }
1738 }
1739 }
1740
1741found:
1742 /* We found a suitable gap. Clip it with the original high_limit. */
1743 if (gap_end > info->high_limit)
1744 gap_end = info->high_limit;
1745
1746found_highest:
1747 /* Compute highest gap address at the desired alignment */
1748 gap_end -= info->length;
1749 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1750
1751 VM_BUG_ON(gap_end < info->low_limit);
1752 VM_BUG_ON(gap_end < gap_start);
1753 return gap_end;
1754}
1755
1417/* Get an address range which is currently unmapped. 1756/* Get an address range which is currently unmapped.
1418 * For shmat() with addr=0. 1757 * For shmat() with addr=0.
1419 * 1758 *
@@ -1432,7 +1771,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1432{ 1771{
1433 struct mm_struct *mm = current->mm; 1772 struct mm_struct *mm = current->mm;
1434 struct vm_area_struct *vma; 1773 struct vm_area_struct *vma;
1435 unsigned long start_addr; 1774 struct vm_unmapped_area_info info;
1436 1775
1437 if (len > TASK_SIZE) 1776 if (len > TASK_SIZE)
1438 return -ENOMEM; 1777 return -ENOMEM;
@@ -1447,40 +1786,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1447 (!vma || addr + len <= vma->vm_start)) 1786 (!vma || addr + len <= vma->vm_start))
1448 return addr; 1787 return addr;
1449 } 1788 }
1450 if (len > mm->cached_hole_size) {
1451 start_addr = addr = mm->free_area_cache;
1452 } else {
1453 start_addr = addr = TASK_UNMAPPED_BASE;
1454 mm->cached_hole_size = 0;
1455 }
1456 1789
1457full_search: 1790 info.flags = 0;
1458 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1791 info.length = len;
1459 /* At this point: (!vma || addr < vma->vm_end). */ 1792 info.low_limit = TASK_UNMAPPED_BASE;
1460 if (TASK_SIZE - len < addr) { 1793 info.high_limit = TASK_SIZE;
1461 /* 1794 info.align_mask = 0;
1462 * Start a new search - just in case we missed 1795 return vm_unmapped_area(&info);
1463 * some holes.
1464 */
1465 if (start_addr != TASK_UNMAPPED_BASE) {
1466 addr = TASK_UNMAPPED_BASE;
1467 start_addr = addr;
1468 mm->cached_hole_size = 0;
1469 goto full_search;
1470 }
1471 return -ENOMEM;
1472 }
1473 if (!vma || addr + len <= vma->vm_start) {
1474 /*
1475 * Remember the place where we stopped the search:
1476 */
1477 mm->free_area_cache = addr + len;
1478 return addr;
1479 }
1480 if (addr + mm->cached_hole_size < vma->vm_start)
1481 mm->cached_hole_size = vma->vm_start - addr;
1482 addr = vma->vm_end;
1483 }
1484} 1796}
1485#endif 1797#endif
1486 1798
@@ -1505,7 +1817,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1505{ 1817{
1506 struct vm_area_struct *vma; 1818 struct vm_area_struct *vma;
1507 struct mm_struct *mm = current->mm; 1819 struct mm_struct *mm = current->mm;
1508 unsigned long addr = addr0, start_addr; 1820 unsigned long addr = addr0;
1821 struct vm_unmapped_area_info info;
1509 1822
1510 /* requested length too big for entire address space */ 1823 /* requested length too big for entire address space */
1511 if (len > TASK_SIZE) 1824 if (len > TASK_SIZE)
@@ -1523,53 +1836,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1523 return addr; 1836 return addr;
1524 } 1837 }
1525 1838
1526 /* check if free_area_cache is useful for us */ 1839 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1527 if (len <= mm->cached_hole_size) { 1840 info.length = len;
1528 mm->cached_hole_size = 0; 1841 info.low_limit = PAGE_SIZE;
1529 mm->free_area_cache = mm->mmap_base; 1842 info.high_limit = mm->mmap_base;
1530 } 1843 info.align_mask = 0;
1531 1844 addr = vm_unmapped_area(&info);
1532try_again:
1533 /* either no address requested or can't fit in requested address hole */
1534 start_addr = addr = mm->free_area_cache;
1535
1536 if (addr < len)
1537 goto fail;
1538
1539 addr -= len;
1540 do {
1541 /*
1542 * Lookup failure means no vma is above this address,
1543 * else if new region fits below vma->vm_start,
1544 * return with success:
1545 */
1546 vma = find_vma(mm, addr);
1547 if (!vma || addr+len <= vma->vm_start)
1548 /* remember the address as a hint for next time */
1549 return (mm->free_area_cache = addr);
1550
1551 /* remember the largest hole we saw so far */
1552 if (addr + mm->cached_hole_size < vma->vm_start)
1553 mm->cached_hole_size = vma->vm_start - addr;
1554
1555 /* try just below the current vma->vm_start */
1556 addr = vma->vm_start-len;
1557 } while (len < vma->vm_start);
1558
1559fail:
1560 /*
1561 * if hint left us with no space for the requested
1562 * mapping then try again:
1563 *
1564 * Note: this is different with the case of bottomup
1565 * which does the fully line-search, but we use find_vma
1566 * here that causes some holes skipped.
1567 */
1568 if (start_addr != mm->mmap_base) {
1569 mm->free_area_cache = mm->mmap_base;
1570 mm->cached_hole_size = 0;
1571 goto try_again;
1572 }
1573 1845
1574 /* 1846 /*
1575 * A failed mmap() very likely causes application failure, 1847 * A failed mmap() very likely causes application failure,
@@ -1577,14 +1849,13 @@ fail:
1577 * can happen with large stack limits and large mmap() 1849 * can happen with large stack limits and large mmap()
1578 * allocations. 1850 * allocations.
1579 */ 1851 */
1580 mm->cached_hole_size = ~0UL; 1852 if (addr & ~PAGE_MASK) {
1581 mm->free_area_cache = TASK_UNMAPPED_BASE; 1853 VM_BUG_ON(addr != -ENOMEM);
1582 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1854 info.flags = 0;
1583 /* 1855 info.low_limit = TASK_UNMAPPED_BASE;
1584 * Restore the topdown base: 1856 info.high_limit = TASK_SIZE;
1585 */ 1857 addr = vm_unmapped_area(&info);
1586 mm->free_area_cache = mm->mmap_base; 1858 }
1587 mm->cached_hole_size = ~0UL;
1588 1859
1589 return addr; 1860 return addr;
1590} 1861}
@@ -1797,6 +2068,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1797 anon_vma_interval_tree_pre_update_vma(vma); 2068 anon_vma_interval_tree_pre_update_vma(vma);
1798 vma->vm_end = address; 2069 vma->vm_end = address;
1799 anon_vma_interval_tree_post_update_vma(vma); 2070 anon_vma_interval_tree_post_update_vma(vma);
2071 if (vma->vm_next)
2072 vma_gap_update(vma->vm_next);
2073 else
2074 vma->vm_mm->highest_vm_end = address;
1800 perf_event_mmap(vma); 2075 perf_event_mmap(vma);
1801 } 2076 }
1802 } 2077 }
@@ -1851,6 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma,
1851 vma->vm_start = address; 2126 vma->vm_start = address;
1852 vma->vm_pgoff -= grow; 2127 vma->vm_pgoff -= grow;
1853 anon_vma_interval_tree_post_update_vma(vma); 2128 anon_vma_interval_tree_post_update_vma(vma);
2129 vma_gap_update(vma);
1854 perf_event_mmap(vma); 2130 perf_event_mmap(vma);
1855 } 2131 }
1856 } 2132 }
@@ -1973,14 +2249,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1973 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2249 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1974 vma->vm_prev = NULL; 2250 vma->vm_prev = NULL;
1975 do { 2251 do {
1976 rb_erase(&vma->vm_rb, &mm->mm_rb); 2252 vma_rb_erase(vma, &mm->mm_rb);
1977 mm->map_count--; 2253 mm->map_count--;
1978 tail_vma = vma; 2254 tail_vma = vma;
1979 vma = vma->vm_next; 2255 vma = vma->vm_next;
1980 } while (vma && vma->vm_start < end); 2256 } while (vma && vma->vm_start < end);
1981 *insertion_point = vma; 2257 *insertion_point = vma;
1982 if (vma) 2258 if (vma) {
1983 vma->vm_prev = prev; 2259 vma->vm_prev = prev;
2260 vma_gap_update(vma);
2261 } else
2262 mm->highest_vm_end = prev ? prev->vm_end : 0;
1984 tail_vma->vm_next = NULL; 2263 tail_vma->vm_next = NULL;
1985 if (mm->unmap_area == arch_unmap_area) 2264 if (mm->unmap_area == arch_unmap_area)
1986 addr = prev ? prev->vm_end : mm->mmap_base; 2265 addr = prev ? prev->vm_end : mm->mmap_base;
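
The mm/mmap.c changes above maintain a per-node rb_subtree_gap (the largest free gap anywhere in a vma's subtree) so that vm_unmapped_area() can find a large-enough hole by pruning the augmented rbtree instead of the old free_area_cache linear walk. The sketch below is a plain linear-scan model of the same gap-search semantics over a sorted array; struct toy_vma and toy_unmapped_area() are invented names, and the rbtree acceleration itself is not modeled.

/*
 * Userspace model of the gap-search semantics implemented by
 * unmapped_area() above: find the lowest gap_start such that
 * [gap_start, gap_start + length) fits between existing mappings and
 * inside [low_limit, high_limit).  This is a plain linear scan over a
 * sorted array; the kernel gets the same answer by walking the
 * rb_subtree_gap-augmented rbtree instead of visiting every vma.
 */
#include <stdio.h>

struct toy_vma { unsigned long start, end; };	/* sorted, non-overlapping */

static unsigned long toy_unmapped_area(const struct toy_vma *v, int n,
				       unsigned long length,
				       unsigned long low_limit,
				       unsigned long high_limit)
{
	unsigned long gap_start = 0;
	int i;

	for (i = 0; i <= n; i++) {
		/* gap ends at the next vma's start, or at "infinity" */
		unsigned long gap_end = (i < n) ? v[i].start : ~0UL;

		if (gap_start < low_limit)
			gap_start = low_limit;
		if (gap_start + length <= high_limit &&
		    gap_start + length <= gap_end)
			return gap_start;	/* lowest suitable gap */

		if (i < n)
			gap_start = v[i].end;	/* next gap starts here */
	}
	return (unsigned long)-1;		/* model of -ENOMEM */
}

int main(void)
{
	struct toy_vma v[] = { {0x1000, 0x3000}, {0x5000, 0x6000} };

	/* prints "gap: 0x3000": the hole between the two mappings fits */
	printf("gap: %#lx\n",
	       toy_unmapped_area(v, 2, 0x1000, 0x2000, 0x10000));
	return 0;
}

With the augmented tree, a subtree whose rb_subtree_gap is smaller than the requested length can be skipped outright, which is what the "looks promising" checks on the left and right children in unmapped_area() and unmapped_area_topdown() do.
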
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..18f1ae2b45de 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspacing tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
@@ -706,11 +658,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 658 return;
707 659
708 /* 660 /*
709 * If current has a pending SIGKILL, then automatically select it. The 661 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 662 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 663 * quickly exit and free its memory.
712 */ 664 */
713 if (fatal_signal_pending(current)) { 665 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 666 set_thread_flag(TIF_MEMDIE);
715 return; 667 return;
716 } 668 }
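
The oom_kill.c hunks fold the old PF_EXITING special cases into a simpler decision: a task marked as the oom origin is selected immediately, and an exiting task aborts the scan unless the kill is forced or the task is being ptraced on exit. The sketch below only models that decision table in userspace; the toy_* names and enum are stand-ins, not the kernel's oom_scan_t values.

/*
 * Userspace model of the scan decision rewritten above in
 * oom_scan_process_thread().  The flags here are booleans standing in
 * for oom_task_origin(), PF_EXITING and PT_TRACE_EXIT.
 */
#include <stdio.h>
#include <stdbool.h>

enum toy_oom_scan { TOY_SCAN_OK, TOY_SCAN_SELECT, TOY_SCAN_ABORT };

struct toy_task {
	bool oom_origin;	/* marked to be killed first on oom */
	bool exiting;		/* PF_EXITING stand-in */
	bool ptrace_exit;	/* PT_TRACE_EXIT stand-in */
};

static enum toy_oom_scan toy_oom_scan(const struct toy_task *t, bool force_kill)
{
	if (t->oom_origin)
		return TOY_SCAN_SELECT;
	if (t->exiting && !force_kill && !t->ptrace_exit)
		return TOY_SCAN_ABORT;	/* let it finish exiting first */
	return TOY_SCAN_OK;
}

int main(void)
{
	struct toy_task exiting = { .exiting = true };
	struct toy_task origin = { .oom_origin = true };

	printf("exiting, no force -> %d\n", toy_oom_scan(&exiting, false));
	printf("exiting, forced   -> %d\n", toy_oom_scan(&exiting, true));
	printf("oom origin        -> %d\n", toy_oom_scan(&origin, false));
	return 0;
}
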
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..6f4271224493 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1069}
1070 1070
1071/* 1071/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1073 * will look to see if it needs to start dirty throttling.
1074 * 1074 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1437
1438/** 1438/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1439 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1440 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1441 *
1443 * Processes which are dirtying memory should call in here once for each page 1442 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1443 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1448 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1449 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1450 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1451void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1452{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1453 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1454 int ratelimit;
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1482 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1483 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1484 if (*p > 0 && current->nr_dirtied < ratelimit) {
1485 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1486 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1487 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1488 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1492 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1493 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1494}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1495EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1496
1498void throttle_vm_writeout(gfp_t gfp_mask) 1497void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1498{
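
The page-writeback.c change drops the nr_pages_dirtied argument, so callers now invoke balance_dirty_pages_ratelimited() once per newly dirtied page and the helper keeps its own per-task counter. Below is a minimal userspace model of that calling convention; the threshold value and the toy_* names are illustrative assumptions, not the kernel's ratelimiting policy.

/*
 * Userspace model of the idea behind balance_dirty_pages_ratelimited()
 * after the rename above: the caller passes no page count, it simply
 * calls once per dirtied page, and the helper decides from a per-task
 * counter and a threshold when to do the expensive balancing work.
 */
#include <stdio.h>

static unsigned long toy_nr_dirtied;		/* current->nr_dirtied stand-in */
static const unsigned long toy_ratelimit = 32;	/* assumed threshold */

static void toy_balance_dirty_pages(void)
{
	printf("balancing after %lu dirtied pages\n", toy_nr_dirtied);
	toy_nr_dirtied = 0;
}

/* Called once per newly dirtied page, mirroring the new calling style */
static void toy_balance_dirty_pages_ratelimited(void)
{
	if (++toy_nr_dirtied >= toy_ratelimit)
		toy_balance_dirty_pages();
}

int main(void)
{
	unsigned long page;

	for (page = 0; page < 100; page++)
		toy_balance_dirty_pages_ratelimited();
	return 0;
}
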
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..5a8d339d282a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 668 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 670 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 671 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
672 if (is_migrate_cma(mt))
673 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
674 }
672 } while (--to_free && --batch_free && !list_empty(list)); 675 } while (--to_free && --batch_free && !list_empty(list));
673 } 676 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 677 spin_unlock(&zone->lock);
676} 678}
677 679
@@ -1392,21 +1394,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1392 1394
1393 zone = page_zone(page); 1395 zone = page_zone(page);
1394 order = page_order(page); 1396 order = page_order(page);
1397 mt = get_pageblock_migratetype(page);
1395 1398
1396 /* Obey watermarks as if the page was being allocated */ 1399 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1400 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1401 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1402 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1403 return 0;
1404
1405 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1406 }
1400 1407
1401 /* Remove page from free list */ 1408 /* Remove page from free list */
1402 list_del(&page->lru); 1409 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1410 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1411 rmv_page_order(page);
1405 1412
1406 mt = get_pageblock_migratetype(page);
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1409
1410 if (alloc_order != order) 1413 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order, 1414 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype); 1415 &zone->free_area[order], migratetype);
@@ -1871,7 +1874,7 @@ zonelist_scan:
1871 */ 1874 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1875 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1876 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1877 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1878 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1879 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1880 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1920,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1920 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1921 goto try_this_zone;
1919 1922
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1923 if (IS_ENABLED(CONFIG_NUMA) &&
1924 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1925 /*
1922 * we do zlc_setup if there are multiple nodes 1926 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1927 * and before considering the first zone allowed
@@ -1936,7 +1940,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1940 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1941 * eligible zone has failed zone_reclaim recently.
1938 */ 1942 */
1939 if (NUMA_BUILD && zlc_active && 1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1944 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1945 continue;
1942 1946
@@ -1962,11 +1966,11 @@ try_this_zone:
1962 if (page) 1966 if (page)
1963 break; 1967 break;
1964this_zone_full: 1968this_zone_full:
1965 if (NUMA_BUILD) 1969 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1970 zlc_mark_zone_full(zonelist, z);
1967 } 1971 }
1968 1972
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1973 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1974 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1975 zlc_active = 0;
1972 goto zonelist_scan; 1976 goto zonelist_scan;
@@ -2266,7 +2270,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2270 return NULL;
2267 2271
2268 /* After successful reclaim, reconsider all zones for allocation */ 2272 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2273 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2274 zlc_clear_zones_full(zonelist);
2271 2275
2272retry: 2276retry:
@@ -2412,7 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2416 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2417 * over allocated.
2414 */ 2418 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2419 if (IS_ENABLED(CONFIG_NUMA) &&
2420 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2421 goto nopage;
2417 2422
2418restart: 2423restart:
@@ -2819,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
2819 2824
2820static inline void show_node(struct zone *zone) 2825static inline void show_node(struct zone *zone)
2821{ 2826{
2822 if (NUMA_BUILD) 2827 if (IS_ENABLED(CONFIG_NUMA))
2823 printk("Node %d ", zone_to_nid(zone)); 2828 printk("Node %d ", zone_to_nid(zone));
2824} 2829}
2825 2830
@@ -2877,6 +2882,31 @@ out:
2877 2882
2878#define K(x) ((x) << (PAGE_SHIFT-10)) 2883#define K(x) ((x) << (PAGE_SHIFT-10))
2879 2884
2885static void show_migration_types(unsigned char type)
2886{
2887 static const char types[MIGRATE_TYPES] = {
2888 [MIGRATE_UNMOVABLE] = 'U',
2889 [MIGRATE_RECLAIMABLE] = 'E',
2890 [MIGRATE_MOVABLE] = 'M',
2891 [MIGRATE_RESERVE] = 'R',
2892#ifdef CONFIG_CMA
2893 [MIGRATE_CMA] = 'C',
2894#endif
2895 [MIGRATE_ISOLATE] = 'I',
2896 };
2897 char tmp[MIGRATE_TYPES + 1];
2898 char *p = tmp;
2899 int i;
2900
2901 for (i = 0; i < MIGRATE_TYPES; i++) {
2902 if (type & (1 << i))
2903 *p++ = types[i];
2904 }
2905
2906 *p = '\0';
2907 printk("(%s) ", tmp);
2908}
2909
2880/* 2910/*
2881 * Show free area list (used inside shift_scroll-lock stuff) 2911 * Show free area list (used inside shift_scroll-lock stuff)
2882 * We also calculate the percentage fragmentation. We do this by counting the 2912 * We also calculate the percentage fragmentation. We do this by counting the
@@ -3005,6 +3035,7 @@ void show_free_areas(unsigned int filter)
3005 3035
3006 for_each_populated_zone(zone) { 3036 for_each_populated_zone(zone) {
3007 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3037 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3038 unsigned char types[MAX_ORDER];
3008 3039
3009 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3040 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3010 continue; 3041 continue;
@@ -3013,12 +3044,24 @@ void show_free_areas(unsigned int filter)
3013 3044
3014 spin_lock_irqsave(&zone->lock, flags); 3045 spin_lock_irqsave(&zone->lock, flags);
3015 for (order = 0; order < MAX_ORDER; order++) { 3046 for (order = 0; order < MAX_ORDER; order++) {
3016 nr[order] = zone->free_area[order].nr_free; 3047 struct free_area *area = &zone->free_area[order];
3048 int type;
3049
3050 nr[order] = area->nr_free;
3017 total += nr[order] << order; 3051 total += nr[order] << order;
3052
3053 types[order] = 0;
3054 for (type = 0; type < MIGRATE_TYPES; type++) {
3055 if (!list_empty(&area->free_list[type]))
3056 types[order] |= 1 << type;
3057 }
3018 } 3058 }
3019 spin_unlock_irqrestore(&zone->lock, flags); 3059 spin_unlock_irqrestore(&zone->lock, flags);
3020 for (order = 0; order < MAX_ORDER; order++) 3060 for (order = 0; order < MAX_ORDER; order++) {
3021 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3061 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3062 if (nr[order])
3063 show_migration_types(types[order]);
3064 }
3022 printk("= %lukB\n", K(total)); 3065 printk("= %lukB\n", K(total));
3023 } 3066 }
3024 3067
@@ -5175,10 +5218,6 @@ static void __setup_per_zone_wmarks(void)
5175 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5218 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5176 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5219 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5177 5220
5178 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5180 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5181
5182 setup_zone_migrate_reserve(zone); 5221 setup_zone_migrate_reserve(zone);
5183 spin_unlock_irqrestore(&zone->lock, flags); 5222 spin_unlock_irqrestore(&zone->lock, flags);
5184 } 5223 }
@@ -5576,7 +5615,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5576 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5615 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5577 * expect this function should be exact. 5616 * expect this function should be exact.
5578 */ 5617 */
5579bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5618bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5619 bool skip_hwpoisoned_pages)
5580{ 5620{
5581 unsigned long pfn, iter, found; 5621 unsigned long pfn, iter, found;
5582 int mt; 5622 int mt;
@@ -5611,6 +5651,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5611 continue; 5651 continue;
5612 } 5652 }
5613 5653
5654 /*
 5655 * The HWPoisoned page may not be in the buddy system, and
5656 * page_count() is not 0.
5657 */
5658 if (skip_hwpoisoned_pages && PageHWPoison(page))
5659 continue;
5660
5614 if (!PageLRU(page)) 5661 if (!PageLRU(page))
5615 found++; 5662 found++;
5616 /* 5663 /*
@@ -5653,7 +5700,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5653 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5700 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5654 return false; 5701 return false;
5655 5702
5656 return !has_unmovable_pages(zone, page, 0); 5703 return !has_unmovable_pages(zone, page, 0, true);
5657} 5704}
5658 5705
5659#ifdef CONFIG_CMA 5706#ifdef CONFIG_CMA
@@ -5711,58 +5758,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5711 0, false, MIGRATE_SYNC); 5758 0, false, MIGRATE_SYNC);
5712 } 5759 }
5713 5760
5714 putback_lru_pages(&cc->migratepages); 5761 putback_movable_pages(&cc->migratepages);
5715 return ret > 0 ? 0 : ret; 5762 return ret > 0 ? 0 : ret;
5716} 5763}
5717 5764
5718/*
5719 * Update zone's cma pages counter used for watermark level calculation.
5720 */
5721static inline void __update_cma_watermarks(struct zone *zone, int count)
5722{
5723 unsigned long flags;
5724 spin_lock_irqsave(&zone->lock, flags);
5725 zone->min_cma_pages += count;
5726 spin_unlock_irqrestore(&zone->lock, flags);
5727 setup_per_zone_wmarks();
5728}
5729
5730/*
5731 * Trigger memory pressure bump to reclaim some pages in order to be able to
5732 * allocate 'count' pages in single page units. Does similar work as
5733 *__alloc_pages_slowpath() function.
5734 */
5735static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5736{
5737 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5738 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5739 int did_some_progress = 0;
5740 int order = 1;
5741
5742 /*
5743 * Increase level of watermarks to force kswapd do his job
5744 * to stabilise at new watermark level.
5745 */
5746 __update_cma_watermarks(zone, count);
5747
5748 /* Obey watermarks as if the page was being allocated */
5749 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5750 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5751
5752 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5753 NULL);
5754 if (!did_some_progress) {
5755 /* Exhausted what can be done so it's blamo time */
5756 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5757 }
5758 }
5759
5760 /* Restore original watermark levels. */
5761 __update_cma_watermarks(zone, -count);
5762
5763 return count;
5764}
5765
5766/** 5765/**
5767 * alloc_contig_range() -- tries to allocate given range of pages 5766 * alloc_contig_range() -- tries to allocate given range of pages
5768 * @start: start PFN to allocate 5767 * @start: start PFN to allocate
@@ -5786,7 +5785,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5786int alloc_contig_range(unsigned long start, unsigned long end, 5785int alloc_contig_range(unsigned long start, unsigned long end,
5787 unsigned migratetype) 5786 unsigned migratetype)
5788{ 5787{
5789 struct zone *zone = page_zone(pfn_to_page(start));
5790 unsigned long outer_start, outer_end; 5788 unsigned long outer_start, outer_end;
5791 int ret = 0, order; 5789 int ret = 0, order;
5792 5790
@@ -5824,7 +5822,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5824 */ 5822 */
5825 5823
5826 ret = start_isolate_page_range(pfn_max_align_down(start), 5824 ret = start_isolate_page_range(pfn_max_align_down(start),
5827 pfn_max_align_up(end), migratetype); 5825 pfn_max_align_up(end), migratetype,
5826 false);
5828 if (ret) 5827 if (ret)
5829 return ret; 5828 return ret;
5830 5829
@@ -5863,18 +5862,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5863 } 5862 }
5864 5863
5865 /* Make sure the range is really isolated. */ 5864 /* Make sure the range is really isolated. */
5866 if (test_pages_isolated(outer_start, end)) { 5865 if (test_pages_isolated(outer_start, end, false)) {
5867 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5866 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5868 outer_start, end); 5867 outer_start, end);
5869 ret = -EBUSY; 5868 ret = -EBUSY;
5870 goto done; 5869 goto done;
5871 } 5870 }
5872 5871
5873 /*
5874 * Reclaim enough pages to make sure that contiguous allocation
5875 * will not starve the system.
5876 */
5877 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5878 5872
5879 /* Grab isolated pages from freelists. */ 5873 /* Grab isolated pages from freelists. */
5880 outer_end = isolate_freepages_range(&cc, outer_start, end); 5874 outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5932,7 +5926,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5932} 5926}
5933#endif 5927#endif
5934 5928
5935#ifdef CONFIG_MEMORY_HOTREMOVE
5936void zone_pcp_reset(struct zone *zone) 5929void zone_pcp_reset(struct zone *zone)
5937{ 5930{
5938 unsigned long flags; 5931 unsigned long flags;
@@ -5952,6 +5945,7 @@ void zone_pcp_reset(struct zone *zone)
5952 local_irq_restore(flags); 5945 local_irq_restore(flags);
5953} 5946}
5954 5947
5948#ifdef CONFIG_MEMORY_HOTREMOVE
5955/* 5949/*
5956 * All pages in the range must be isolated before calling this. 5950 * All pages in the range must be isolated before calling this.
5957 */ 5951 */
@@ -5978,6 +5972,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5978 continue; 5972 continue;
5979 } 5973 }
5980 page = pfn_to_page(pfn); 5974 page = pfn_to_page(pfn);
5975 /*
 5976 * The HWPoisoned page may not be in the buddy system, and
5977 * page_count() is not 0.
5978 */
5979 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
5980 pfn++;
5981 SetPageReserved(page);
5982 continue;
5983 }
5984
5981 BUG_ON(page_count(page)); 5985 BUG_ON(page_count(page));
5982 BUG_ON(!PageBuddy(page)); 5986 BUG_ON(!PageBuddy(page));
5983 order = page_order(page); 5987 order = page_order(page);
@@ -5988,8 +5992,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5988 list_del(&page->lru); 5992 list_del(&page->lru);
5989 rmv_page_order(page); 5993 rmv_page_order(page);
5990 zone->free_area[order].nr_free--; 5994 zone->free_area[order].nr_free--;
5991 __mod_zone_page_state(zone, NR_FREE_PAGES,
5992 - (1UL << order));
5993 for (i = 0; i < (1 << order); i++) 5995 for (i = 0; i < (1 << order); i++)
5994 SetPageReserved((page+i)); 5996 SetPageReserved((page+i));
5995 pfn += (1 << order); 5997 pfn += (1 << order);
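
show_free_areas() above now records, per order, a bitmask of migrate types whose free lists are non-empty, and show_migration_types() decodes that mask into one letter per type. The sketch below reproduces just that encode/decode step in userspace; the toy_mt enum is a local stand-in for the kernel's migrate type list, with the same letters as the hunk.

/*
 * Userspace sketch of the bitmask trick used by show_free_areas() /
 * show_migration_types() above: bit i of the mask is set when the free
 * list of migrate type i is non-empty, and the mask is decoded into
 * one letter per type.
 */
#include <stdio.h>

enum toy_mt { TOY_UNMOVABLE, TOY_RECLAIMABLE, TOY_MOVABLE,
	      TOY_RESERVE, TOY_CMA, TOY_ISOLATE, TOY_MT_MAX };

static void toy_show_migration_types(unsigned char mask)
{
	static const char letters[TOY_MT_MAX] = {
		[TOY_UNMOVABLE]   = 'U',
		[TOY_RECLAIMABLE] = 'E',
		[TOY_MOVABLE]     = 'M',
		[TOY_RESERVE]     = 'R',
		[TOY_CMA]         = 'C',
		[TOY_ISOLATE]     = 'I',
	};
	char buf[TOY_MT_MAX + 1];
	char *p = buf;
	int i;

	for (i = 0; i < TOY_MT_MAX; i++)
		if (mask & (1 << i))
			*p++ = letters[i];
	*p = '\0';
	printf("(%s) ", buf);
}

int main(void)
{
	/* e.g. movable and CMA free lists are non-empty for this order */
	unsigned char mask = (1 << TOY_MOVABLE) | (1 << TOY_CMA);

	toy_show_migration_types(mask);	/* prints "(MC) " */
	printf("\n");
	return 0;
}
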
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..44db00e253ed 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..9d2264ea4606 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype)
30 zone->nr_pageblock_isolate--; 30 zone->nr_pageblock_isolate--;
31} 31}
32 32
33int set_migratetype_isolate(struct page *page) 33int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
34{ 34{
35 struct zone *zone; 35 struct zone *zone;
36 unsigned long flags, pfn; 36 unsigned long flags, pfn;
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 67 * We just check MOVABLE pages.
68 */ 68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 69 if (!has_unmovable_pages(zone, page, arg.pages_found,
70 skip_hwpoisoned_pages))
70 ret = 0; 71 ret = 0;
71 72
72 /* 73 /*
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 135 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 136 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 137int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 138 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 139{
139 unsigned long pfn; 140 unsigned long pfn;
140 unsigned long undo_pfn; 141 unsigned long undo_pfn;
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 148 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 149 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 150 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 151 if (page &&
152 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 153 undo_pfn = pfn;
152 goto undo; 154 goto undo;
153 } 155 }
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 192 * Returns 1 if all pages in the range are isolated.
191 */ 193 */
192static int 194static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 195__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
196 bool skip_hwpoisoned_pages)
194{ 197{
195 struct page *page; 198 struct page *page;
196 199
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 223 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 224 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 225 pfn += 1;
226 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
227 /*
228 * The HWPoisoned page may not be in the buddy
229 * system, and its page_count() is not 0.
230 */
231 pfn++;
232 continue;
233 }
223 else 234 else
224 break; 235 break;
225 } 236 }
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 239 return 1;
229} 240}
230 241
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 242int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
243 bool skip_hwpoisoned_pages)
232{ 244{
233 unsigned long pfn, flags; 245 unsigned long pfn, flags;
234 struct page *page; 246 struct page *page;
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 263 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 264 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 265 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 266 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
267 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 268 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 269 return ret ? 0 : -EBUSY;
257} 270}
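
The page_isolation.c changes above only thread one extra flag through the API; existing callers keep the old behaviour by passing false. A hedged sketch of how a caller might drive the reworked functions (example_offline_range() and the surrounding flow are hypothetical; only the three exported functions and their new bool parameter come from the hunks above, and this is kernel-internal code, not buildable standalone):

/* Hypothetical caller: tolerate already-poisoned pages in the range
 * instead of letting them make the whole range look busy. */
static int example_offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        /* Mark every pageblock in the range MIGRATE_ISOLATE. */
        ret = start_isolate_page_range(start_pfn, end_pfn,
                                       MIGRATE_MOVABLE, true);
        if (ret)
                return ret;                     /* -EBUSY: could not isolate */

        /* ... migrate whatever is still in use out of the range ... */

        /* Returns 0 when everything left is free, isolated or HWPoisoned. */
        ret = test_pages_isolated(start_pfn, end_pfn, true);
        if (ret)
                undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        return ret;
}
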
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..cf7e99a87c32 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143 * and ramfs pages which have been modified since creation by read 1155 * optimization but also solves problems caused by the dirty flag in the
1144 * fault. 1156 * storage key getting set by a write from inside the kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1345,8 +1359,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1359 struct vm_area_struct *vma, struct page *check_page)
1346{ 1360{
1347 struct mm_struct *mm = vma->vm_mm; 1361 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1362 pmd_t *pmd;
1351 pte_t *pte; 1363 pte_t *pte;
1352 pte_t pteval; 1364 pte_t pteval;
@@ -1366,16 +1378,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1378 if (end > vma->vm_end)
1367 end = vma->vm_end; 1379 end = vma->vm_end;
1368 1380
1369 pgd = pgd_offset(mm, address); 1381 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1382 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1383 return ret;
1380 1384
1381 mmun_start = address; 1385 mmun_start = address;
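
The rmap.c hunks add mm_find_pmd() and convert two open-coded pgd/pud/pmd walks to it. A hedged sketch of the pattern callers end up with (example_lookup_pte() is hypothetical; mm_find_pmd() and pmd_trans_huge() are the interfaces used above, pte_offset_map_lock() is the usual pte-level helper; kernel-internal code, not buildable as a standalone program):

static pte_t *example_lookup_pte(struct mm_struct *mm, unsigned long address,
                                 spinlock_t **ptlp)
{
        pmd_t *pmd = mm_find_pmd(mm, address);  /* NULL if pgd/pud/pmd absent */

        if (!pmd || pmd_trans_huge(*pmd))       /* no pte level under a huge pmd */
                return NULL;

        return pte_offset_map_lock(mm, pmd, address, ptlp);
}
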
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..487f0bdd53c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3573 struct memory_notify *marg = arg;
3574 int offline_node; 3574 int offline_node;
3575 3575
3576 offline_node = marg->status_change_nid; 3576 offline_node = marg->status_change_nid_normal;
3577 3577
3578 /* 3578 /*
3579 * If the node still has available memory, we need kmem_cache_node 3579 * If the node still has available memory, we need kmem_cache_node
@@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3606 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3607 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3608 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3609 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3610 int ret = 0;
3611 3611
3612 /* 3612 /*
diff --git a/mm/sparse.c b/mm/sparse.c
index a83de2f72b30..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
758 goto out; 757 goto out;
759 } 758 }
760 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
761 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
762 763
763 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -771,6 +772,27 @@ out:
771 return ret; 772 return ret;
772} 773}
773 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
774void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
775{ 797{
776 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
784 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
785 } 807 }
786 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
787 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
788} 811}
789#endif 812#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7ed37675644..157bb116dec8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1679
1680 if (global_reclaim(sc)) { 1680 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1681 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1682 if (unlikely(file + free <= high_wmark_pages(zone))) {
1683 /*
1684 * If we have very few page cache pages, force-scan
1685 * anon pages.
1686 */
1685 fraction[0] = 1; 1687 fraction[0] = 1;
1686 fraction[1] = 0; 1688 fraction[1] = 0;
1687 denominator = 1; 1689 denominator = 1;
1688 goto out; 1690 goto out;
1691 } else if (!inactive_file_is_low_global(zone)) {
1692 /*
1693 * There is enough inactive page cache, do not
1694 * reclaim anything from the working set right now.
1695 */
1696 fraction[0] = 0;
1697 fraction[1] = 1;
1698 denominator = 1;
1699 goto out;
1689 } 1700 }
1690 } 1701 }
1691 1702
@@ -1752,7 +1763,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1763/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1764static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1765{
1755 if (COMPACTION_BUILD && sc->order && 1766 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1767 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1768 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1769 return true;
@@ -2005,7 +2016,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2016 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2017 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2019 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2020 /*
2010 * If we already have plenty of memory free for 2021 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2022 * compaction in this zone, don't free any more.
@@ -2421,7 +2432,8 @@ static bool zone_balanced(struct zone *zone, int order,
2421 balance_gap, classzone_idx, 0)) 2432 balance_gap, classzone_idx, 0))
2422 return false; 2433 return false;
2423 2434
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) 2435 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2436 !compaction_suitable(zone, order))
2425 return false; 2437 return false;
2426 2438
2427 return true; 2439 return true;
@@ -2684,7 +2696,7 @@ loop_again:
2684 * Do not reclaim more than needed for compaction. 2696 * Do not reclaim more than needed for compaction.
2685 */ 2697 */
2686 testorder = order; 2698 testorder = order;
2687 if (COMPACTION_BUILD && order && 2699 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2688 compaction_suitable(zone, order) != 2700 compaction_suitable(zone, order) !=
2689 COMPACT_SKIPPED) 2701 COMPACT_SKIPPED)
2690 testorder = 0; 2702 testorder = 0;
@@ -2951,7 +2963,7 @@ static int kswapd(void *p)
2951 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2963 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2952 balanced_classzone_idx = classzone_idx; 2964 balanced_classzone_idx = classzone_idx;
2953 for ( ; ; ) { 2965 for ( ; ; ) {
2954 int ret; 2966 bool ret;
2955 2967
2956 /* 2968 /*
2957 * If the last balance_pgdat was unsuccessful it's unlikely a 2969 * If the last balance_pgdat was unsuccessful it's unlikely a
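
The get_scan_count() hunk adds a second early exit under global reclaim: when the inactive file LRU is not low, nothing is reclaimed from the anon working set. A small standalone model of that decision, in plain userspace C with hypothetical names (not kernel code), just to make the three outcomes explicit:

#include <stdio.h>

enum balance { SCAN_ANON_ONLY, SCAN_FILE_ONLY, SCAN_BOTH };

static enum balance pick(unsigned long file, unsigned long free,
                         unsigned long high_wmark, int inactive_file_is_low)
{
        if (file + free <= high_wmark)
                return SCAN_ANON_ONLY;  /* fraction[0]=1, fraction[1]=0 */
        if (!inactive_file_is_low)
                return SCAN_FILE_ONLY;  /* new case: fraction[0]=0, fraction[1]=1 */
        return SCAN_BOTH;               /* fall through to the proportional math */
}

int main(void)
{
        printf("scarce page cache -> %d\n", pick(10, 10, 100, 1));   /* anon only */
        printf("ample inactive    -> %d\n", pick(500, 50, 100, 0));  /* file only */
        printf("otherwise         -> %d\n", pick(500, 50, 100, 1));  /* both */
        return 0;
}
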
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index b336b24aa6c0..7300d0702efe 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,9 +1,9 @@
1# Makefile for vm selftests 1# Makefile for vm selftests
2 2
3CC = $(CROSS_COMPILE)gcc 3CC = $(CROSS_COMPILE)gcc
4CFLAGS = -Wall -Wextra 4CFLAGS = -Wall
5 5
6all: hugepage-mmap hugepage-shm map_hugetlb 6all: hugepage-mmap hugepage-shm map_hugetlb thuge-gen
7%: %.c 7%: %.c
8 $(CC) $(CFLAGS) -o $@ $^ 8 $(CC) $(CFLAGS) -o $@ $^
9 9
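
The new selftest added below drives mmap() and shmget() with an explicit huge page size selector in the high flag bits. As a minimal companion, the self-contained program here maps and touches one huge page with plain MAP_HUGETLB and no size selector; it assumes a Linux system with a 2MB default huge page size and at least one huge page reserved (e.g. echo 20 > /proc/sys/vm/nr_hugepages):

#define _GNU_SOURCE 1
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000             /* older libc headers may lack it */
#endif

#define LEN (2UL * 1024 * 1024)         /* one 2MB huge page (assumption) */

int main(void)
{
        char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");    /* usually: no huge pages reserved */
                return 1;
        }
        memset(p, 0xff, LEN);                   /* fault the huge page in */
        printf("mapped and touched %lu bytes backed by huge pages\n", LEN);
        munmap(p, LEN);
        return 0;
}
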
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
new file mode 100644
index 000000000000..c87957295f74
--- /dev/null
+++ b/tools/testing/selftests/vm/thuge-gen.c
@@ -0,0 +1,254 @@
1/* Test selecting other page sizes for mmap/shmget.
2
3 Before running this, huge pages for each huge page size must have been
4 reserved.
5 For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
6 Also shmmax must be increased.
7 And you need to run as root to work around some weird permissions in shm.
8 And nothing using huge pages should run in parallel.
9 If the program aborts, you may need to clean up the shm segments by hand
10 with ipcrm -m, like this:
11 sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
12 (warning: this removes every such segment, including any that belong to someone else) */
13
14#define _GNU_SOURCE 1
15#include <sys/mman.h>
16#include <stdlib.h>
17#include <stdio.h>
18#include <sys/ipc.h>
19#include <sys/shm.h>
20#include <sys/stat.h>
21#include <glob.h>
22#include <assert.h>
23#include <unistd.h>
24#include <stdarg.h>
25#include <string.h>
26
27#define err(x) perror(x), exit(1)
28
29#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
30#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
31#define MAP_HUGE_SHIFT 26
32#define MAP_HUGE_MASK 0x3f
33#define MAP_HUGETLB 0x40000
34
35#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
36#define SHM_HUGE_SHIFT 26
37#define SHM_HUGE_MASK 0x3f
38#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
39#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
40
41#define NUM_PAGESIZES 5
42
43#define NUM_PAGES 4
44
45#define Dprintf(fmt...) // printf(fmt)
46
47unsigned long page_sizes[NUM_PAGESIZES];
48int num_page_sizes;
49
50int ilog2(unsigned long v)
51{
52 int l = 0;
53 while ((1UL << l) < v)
54 l++;
55 return l;
56}
57
58void find_pagesizes(void)
59{
60 glob_t g;
61 int i;
62 glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
63 assert(g.gl_pathc <= NUM_PAGESIZES);
64 for (i = 0; i < g.gl_pathc; i++) {
65 sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
66 &page_sizes[i]);
67 page_sizes[i] <<= 10;
68 printf("Found %luMB\n", page_sizes[i] >> 20);
69 }
70 num_page_sizes = g.gl_pathc;
71 globfree(&g);
72}
73
74unsigned long default_huge_page_size(void)
75{
76 unsigned long hps = 0;
77 char *line = NULL;
78 size_t linelen = 0;
79 FILE *f = fopen("/proc/meminfo", "r");
80 if (!f)
81 return 0;
82 while (getline(&line, &linelen, f) > 0) {
83 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
84 hps <<= 10;
85 break;
86 }
87 }
88 free(line);
89 return hps;
90}
91
92void show(unsigned long ps)
93{
94 char buf[100];
95 if (ps == getpagesize())
96 return;
97 printf("%luMB: ", ps >> 20);
98 fflush(stdout);
99 snprintf(buf, sizeof buf,
100 "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
101 ps >> 10);
102 system(buf);
103}
104
105unsigned long read_sysfs(int warn, char *fmt, ...)
106{
107 char *line = NULL;
108 size_t linelen = 0;
109 char buf[100];
110 FILE *f;
111 va_list ap;
112 unsigned long val = 0;
113
114 va_start(ap, fmt);
115 vsnprintf(buf, sizeof buf, fmt, ap);
116 va_end(ap);
117
118 f = fopen(buf, "r");
119 if (!f) {
120 if (warn)
121 printf("missing %s\n", buf);
122 return 0;
123 }
124 if (getline(&line, &linelen, f) > 0) {
125 sscanf(line, "%lu", &val);
126 }
127 fclose(f);
128 free(line);
129 return val;
130}
131
132unsigned long read_free(unsigned long ps)
133{
134 return read_sysfs(ps != getpagesize(),
135 "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
136 ps >> 10);
137}
138
139void test_mmap(unsigned long size, unsigned flags)
140{
141 char *map;
142 unsigned long before, after;
143 int err;
144
145 before = read_free(size);
146 map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
147 MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, 0, 0);
148
149 if (map == (char *)-1) err("mmap");
150 memset(map, 0xff, size*NUM_PAGES);
151 after = read_free(size);
152 Dprintf("before %lu after %lu diff %ld size %lu\n",
153 before, after, before - after, size);
154 assert(size == getpagesize() || (before - after) == NUM_PAGES);
155 show(size);
156 err = munmap(map, size * NUM_PAGES);
157 assert(!err);
158}
159
160void test_shmget(unsigned long size, unsigned flags)
161{
162 int id;
163 unsigned long before, after;
164 int err;
165
166 before = read_free(size);
167 id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
168 if (id < 0) err("shmget");
169
170 struct shm_info i;
171 if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
172 Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
173
174
175 Dprintf("id %d\n", id);
176 char *map = shmat(id, NULL, 0);
177 if (map == (char*)-1) err("shmat");
178
179 shmctl(id, IPC_RMID, NULL);
180
181 memset(map, 0xff, size*NUM_PAGES);
182 after = read_free(size);
183
184 Dprintf("before %lu after %lu diff %ld size %lu\n",
185 before, after, before - after, size);
186 assert(size == getpagesize() || (before - after) == NUM_PAGES);
187 show(size);
188 err = shmdt(map);
189 assert(!err);
190}
191
192void sanity_checks(void)
193{
194 int i;
195 unsigned long largest = getpagesize();
196
197 for (i = 0; i < num_page_sizes; i++) {
198 if (page_sizes[i] > largest)
199 largest = page_sizes[i];
200
201 if (read_free(page_sizes[i]) < NUM_PAGES) {
202 printf("Not enough huge pages for page size %lu MB, need %u\n",
203 page_sizes[i] >> 20,
204 NUM_PAGES);
205 exit(0);
206 }
207 }
208
209 if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
210 printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
211 exit(0);
212 }
213
214#if defined(__x86_64__)
215 if (largest != 1U<<30) {
216 printf("No GB pages available on x86-64\n"
217 "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
218 exit(0);
219 }
220#endif
221}
222
223int main(void)
224{
225 int i;
226 unsigned default_hps = default_huge_page_size();
227
228 find_pagesizes();
229
230 sanity_checks();
231
232 for (i = 0; i < num_page_sizes; i++) {
233 unsigned long ps = page_sizes[i];
234 int arg = ilog2(ps) << MAP_HUGE_SHIFT;
235 printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
236 test_mmap(ps, MAP_HUGETLB | arg);
237 }
238 printf("Testing default huge mmap\n");
239 test_mmap(default_hps, MAP_HUGETLB);
240
241 puts("Testing non-huge shmget");
242 test_shmget(getpagesize(), 0);
243
244 for (i = 0; i < num_page_sizes; i++) {
245 unsigned long ps = page_sizes[i];
246 int arg = ilog2(ps) << SHM_HUGE_SHIFT;
247 printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
248 test_shmget(ps, SHM_HUGETLB | arg);
249 }
250 puts("default huge shmget");
251 test_shmget(default_hps, SHM_HUGETLB);
252
253 return 0;
254}
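
The size selector the test builds in main() is simply log2(page size) placed in the six bits starting at MAP_HUGE_SHIFT (and SHM_HUGE_SHIFT for shmget). A tiny runnable check of that encoding, using only constants that already appear at the top of the test:

#include <stdio.h>

#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK  0x3f

static unsigned long huge_size_flag(unsigned long size)
{
        int log = 0;

        while ((1UL << log) < size)     /* same ilog2() the test open-codes */
                log++;
        return (unsigned long)(log & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
}

int main(void)
{
        printf("2MB -> 0x%lx\n", huge_size_flag(2UL << 20));  /* 21 << 26 == MAP_HUGE_2MB */
        printf("1GB -> 0x%lx\n", huge_size_flag(1UL << 30));  /* 30 << 26 == MAP_HUGE_1GB */
        return 0;
}
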