author		Jens Axboe <axboe@kernel.dk>	2012-07-30 03:03:10 -0400
committer	Jens Axboe <axboe@kernel.dk>	2012-07-30 03:03:10 -0400
commit		72ea1f74fcdf874cca6d2c0962379523bbd99e2c (patch)
tree		4c67be6c73356086ff44ef1b8b1c9479702689ca /mm
parent		b1af9be5ef77898c05667bb9dbf3b180d91d3292 (diff)
parent		a73ff3231df59a4b92ccd0dd4e73897c5822489b (diff)
Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'mm')
 mm/Kconfig          |  17
 mm/Makefile         |   1
 mm/bootmem.c        |   6
 mm/compaction.c     |   5
 mm/frontswap.c      | 314
 mm/madvise.c        |  18
 mm/memblock.c       | 115
 mm/memcontrol.c     |   6
 mm/memory.c         |  12
 mm/memory_hotplug.c |   2
 mm/mempolicy.c      |   2
 mm/nobootmem.c      |  40
 mm/nommu.c          |   2
 mm/oom_kill.c       |  21
 mm/page_alloc.c     |   7
 mm/page_cgroup.c    |   4
 mm/page_io.c        |  12
 mm/pagewalk.c       |   1
 mm/percpu-vm.c      |   1
 mm/shmem.c          | 248
 mm/sparse.c         |  20
 mm/swapfile.c       |  66
 mm/vmscan.c         |  12
 23 files changed, 666 insertions(+), 266 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
 	  in a negligible performance hit.
 
 	  If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+	bool "Enable frontswap to cache swap pages if tmem is present"
+	depends on SWAP
+	default n
+	help
+	  Frontswap is so named because it can be thought of as the opposite
+	  of a "backing" store for a swap device.  The data is stored into
+	  "transcendent memory", memory that is not directly accessible or
+	  addressable by the kernel and is of unknown and possibly
+	  time-varying size.  When space in transcendent memory is available,
+	  a significant swap I/O reduction may be achieved.  When none is
+	  available, all frontswap calls are reduced to a single pointer-
+	  compare-against-NULL resulting in a negligible performance hit
+	  and swap data is stored as normal on the matching swap device.
+
+	  If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
@@ -710,6 +710,10 @@ again:
 	if (ptr)
 		return ptr;
 
+	/* do not panic in alloc_bootmem_bdata() */
+	if (limit && goal + size > limit)
+		limit = 0;
+
 	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_lru_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
+			if (err == -ENOMEM) {
+				ret = COMPACT_PARTIAL;
+				goto out;
+			}
 		}
-
 	}
 
 out:
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success.  As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache.  In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increased control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured).  These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+	frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+	frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+	frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+	frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops old = frontswap_ops;
+
+	frontswap_ops = *ops;
+	frontswap_enabled = true;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+	frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	if (frontswap_enabled)
+		(*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset.  Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+	int ret = -1, dup = 0;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		dup = 1;
+	ret = (*frontswap_ops.store)(type, offset, page);
+	if (ret == 0) {
+		frontswap_set(sis, offset);
+		inc_frontswap_succ_stores();
+		if (!dup)
+			atomic_inc(&sis->frontswap_pages);
+	} else if (dup) {
+		/*
+		 * A failed dup always results in an automatic invalidate of
+		 * the (older) page from frontswap.
+		 */
+		frontswap_clear(sis, offset);
+		atomic_dec(&sis->frontswap_pages);
+		inc_frontswap_failed_stores();
+	} else
+		inc_frontswap_failed_stores();
+	if (frontswap_writethrough_enabled)
+		/* report failure so swap also writes to swap device */
+		ret = -1;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data.  Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+	int ret = -1;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		ret = (*frontswap_ops.load)(type, offset, page);
+	if (ret == 0)
+		inc_frontswap_loads();
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset)) {
+		(*frontswap_ops.invalidate_page)(type, offset);
+		atomic_dec(&sis->frontswap_pages);
+		frontswap_clear(sis, offset);
+		inc_frontswap_invalidates();
+	}
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	(*frontswap_ops.invalidate_area)(type);
+	atomic_set(&sis->frontswap_pages, 0);
+	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	int si_frontswap_pages;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, pages_to_unuse = 0;
+	int type;
+	bool locked = false;
+
+	/*
+	 * we don't want to hold swap_lock while doing a very
+	 * lengthy try_to_unuse, but swap_list may change
+	 * so restart scan from swap_list.head each time
+	 */
+	spin_lock(&swap_lock);
+	locked = true;
+	total_pages = 0;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		total_pages += atomic_read(&si->frontswap_pages);
+	}
+	if (total_pages <= target_pages)
+		goto out;
+	total_pages_to_unuse = total_pages - target_pages;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		si_frontswap_pages = atomic_read(&si->frontswap_pages);
+		if (total_pages_to_unuse < si_frontswap_pages)
+			pages = pages_to_unuse = total_pages_to_unuse;
+		else {
+			pages = si_frontswap_pages;
+			pages_to_unuse = 0; /* unuse all */
+		}
+		/* ensure there is enough RAM to fetch pages from frontswap */
+		if (security_vm_enough_memory_mm(current->mm, pages))
+			continue;
+		vm_unacct_memory(pages);
+		break;
+	}
+	if (type < 0)
+		goto out;
+	locked = false;
+	spin_unlock(&swap_lock);
+	try_to_unuse(type, true, pages_to_unuse);
+out:
+	if (locked)
+		spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices.  This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		totalpages += atomic_read(&si->frontswap_pages);
+	}
+	spin_unlock(&swap_lock);
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *root = debugfs_create_dir("frontswap", NULL);
+	if (root == NULL)
+		return -ENXIO;
+	debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+	debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+	debugfs_create_u64("failed_stores", S_IRUGO, root,
+				&frontswap_failed_stores);
+	debugfs_create_u64("invalidates", S_IRUGO,
+				root, &frontswap_invalidates);
+#endif
+	return 0;
+}
+
+module_init(init_frontswap);
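The frontend above is a thin ops-struct dispatcher: a backend registers a struct of function pointers, and until one is registered every call short-circuits on the enabled flag. A minimal standalone model of that dispatch pattern, in plain userspace C with invented names (not the kernel API, and with no page or swap machinery):

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the kernel's frontswap_ops dispatch. */
struct demo_ops {
	int (*store)(unsigned type, unsigned long offset, const void *page);
	int (*load)(unsigned type, unsigned long offset, void *page);
};

static struct demo_ops demo_ops;	/* zero-initialized: no backend yet */
static int demo_enabled;		/* mirrors frontswap_enabled */

/* Register a backend, returning the previous ops (as the kernel does). */
static struct demo_ops demo_register_ops(const struct demo_ops *ops)
{
	struct demo_ops old = demo_ops;

	demo_ops = *ops;
	demo_enabled = 1;
	return old;
}

static int demo_store(unsigned type, unsigned long offset, const void *page)
{
	/* Without a backend this is just a flag test: negligible cost. */
	if (!demo_enabled)
		return -1;
	return (*demo_ops.store)(type, offset, page);
}

/* A trivial one-slot backend for the demo. */
static char slot[4096];
static int slot_store(unsigned type, unsigned long offset, const void *page)
{
	(void)type; (void)offset;
	memcpy(slot, page, sizeof(slot));
	return 0;
}
static int slot_load(unsigned type, unsigned long offset, void *page)
{
	(void)type; (void)offset;
	memcpy(page, slot, sizeof(slot));
	return 0;
}

int main(void)
{
	char page[4096] = "swapped data";
	struct demo_ops ops = { .store = slot_store, .load = slot_load };

	printf("store before register: %d\n", demo_store(0, 1, page)); /* -1 */
	demo_register_ops(&ops);
	printf("store after register:  %d\n", demo_store(0, 1, page)); /*  0 */
	return 0;
}

Returning the previous ops from registration is what lets a second backend detect (and potentially chain to) an already-installed one.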
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 {
 	loff_t offset;
 	int error;
+	struct file *f;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
 
-	if (!vma->vm_file || !vma->vm_file->f_mapping
-		|| !vma->vm_file->f_mapping->host) {
+	f = vma->vm_file;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host) {
 		return -EINVAL;
 	}
 
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* filesystem's fallocate may need to take i_mutex */
+	/*
+	 * Filesystem's fallocate may need to take i_mutex.  We need to
+	 * explicitly grab a reference because the vma (and hence the
+	 * vma's reference to the file) can go away as soon as we drop
+	 * mmap_sem.
+	 */
+	get_file(f);
 	up_read(&current->mm->mmap_sem);
-	error = do_fallocate(vma->vm_file,
+	error = do_fallocate(f,
 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 				offset, end - start);
+	fput(f);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
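The madvise_remove() fix is an instance of a general rule: take your own reference on a refcounted object before dropping the lock that keeps it alive, and put the reference when done. A hedged userspace sketch of the same discipline, with a hand-rolled refcount standing in for struct file and a mutex standing in for mmap_sem (all names invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical refcounted object standing in for struct file. */
struct obj {
	int refcount;		/* protected by lock */
	const char *name;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o)
{
	if (--o->refcount == 0) {
		printf("freeing %s\n", o->name);
		free(o);
	}
}

/* Long operation that must run without holding the lock. */
static void long_operation(struct obj *o)
{
	printf("operating on %s\n", o->name);
}

static void use_obj(struct obj *o)
{
	pthread_mutex_lock(&lock);
	obj_get(o);			/* like get_file(f) before up_read() */
	pthread_mutex_unlock(&lock);	/* others may now drop their refs */

	long_operation(o);		/* like do_fallocate(f, ...) */

	pthread_mutex_lock(&lock);
	obj_put(o);			/* like fput(f) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	o->refcount = 1;
	o->name = "file";
	use_obj(o);
	pthread_mutex_lock(&lock);
	obj_put(o);			/* drop the original reference */
	pthread_mutex_unlock(&lock);
	return 0;
}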
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					   MAX_NUMNODES);
 }
 
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	return memblock_free(__pa(memblock.reserved.regions),
-		 sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	return memblock_reserve(__pa(memblock.reserved.regions),
-		 sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
 {
 	type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 	}
 }
 
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+					phys_addr_t *addr)
+{
+	if (memblock.reserved.regions == memblock_reserved_init_regions)
+		return 0;
+
+	*addr = __pa(memblock.reserved.regions);
+
+	return PAGE_ALIGN(sizeof(struct memblock_region) *
+			  memblock.reserved.max);
+}
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+						phys_addr_t new_area_start,
+						phys_addr_t new_area_size)
 {
 	struct memblock_region *new_array, *old_array;
+	phys_addr_t old_alloc_size, new_alloc_size;
 	phys_addr_t old_size, new_size, addr;
 	int use_slab = slab_is_available();
 	int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	/* Calculate new doubled size */
 	old_size = type->max * sizeof(struct memblock_region);
 	new_size = old_size << 1;
+	/*
+	 * We need to allocate the new array aligned to PAGE_SIZE,
+	 * so we can free it completely later.
+	 */
+	old_alloc_size = PAGE_ALIGN(old_size);
+	new_alloc_size = PAGE_ALIGN(new_size);
 
 	/* Retrieve the slab flag */
 	if (type == &memblock.memory)
@@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 		new_array = kmalloc(new_size, GFP_KERNEL);
 		addr = new_array ? __pa(new_array) : 0;
 	} else {
-		addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+		/* only exclude range when trying to double reserved.regions */
+		if (type != &memblock.reserved)
+			new_area_start = new_area_size = 0;
+
+		addr = memblock_find_in_range(new_area_start + new_area_size,
+						memblock.current_limit,
+						new_alloc_size, PAGE_SIZE);
+		if (!addr && new_area_size)
+			addr = memblock_find_in_range(0,
+					min(new_area_start, memblock.current_limit),
+					new_alloc_size, PAGE_SIZE);
+
 		new_array = addr ? __va(addr) : 0;
 	}
 	if (!addr) {
@@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 		kfree(old_array);
 	else if (old_array != memblock_memory_init_regions &&
 		 old_array != memblock_reserved_init_regions)
-		memblock_free(__pa(old_array), old_size);
+		memblock_free(__pa(old_array), old_alloc_size);
 
 	/* Reserve the new array if that comes from the memblock.
 	 * Otherwise, we needn't do it
 	 */
 	if (!use_slab)
-		BUG_ON(memblock_reserve(addr, new_size));
+		BUG_ON(memblock_reserve(addr, new_alloc_size));
 
 	/* Update slab flag */
 	*in_slab = use_slab;
@@ -399,7 +422,7 @@ repeat:
 	 */
 	if (!insert) {
 		while (type->cnt + nr_new > type->max)
-			if (memblock_double_array(type) < 0)
+			if (memblock_double_array(type, obase, size) < 0)
 				return -ENOMEM;
 		insert = true;
 		goto repeat;
@@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 
 	/* we'll create at most two more regions */
 	while (type->cnt + 2 > type->max)
-		if (memblock_double_array(type) < 0)
+		if (memblock_double_array(type, base, size) < 0)
 			return -ENOMEM;
 
 	for (i = 0; i < type->cnt; i++) {
@@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Find the first free area from *@idx which matches @nid, fill the out
  * parameters, and update *@idx for the next iteration.  The lower 32bit of
@@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
  * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Reverse of __next_free_mem_range().
  */
@@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 	return memblock_search(&memblock.memory, addr) != -1;
 }
 
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
 	int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 		memblock.memory.regions[idx].size) >= end;
 }
 
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
 	memblock_cap_size(base, &size);
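The PAGE_ALIGN rounding introduced above matters because free_low_memory_core_early() later returns the array's range to the page allocator in whole pages; allocating a page-aligned size guarantees nothing else shares the trailing page. A small runnable illustration of the rounding, assuming 4 KiB pages and a made-up region struct:

#include <stdio.h>

/* Userspace copy of the kernel's PAGE_ALIGN, assuming 4 KiB pages. */
#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

struct region { unsigned long base, size; };	/* stand-in for memblock_region */

int main(void)
{
	unsigned long max, old_size, new_size;

	for (max = 128; max <= 512; max <<= 1) {
		old_size = max * sizeof(struct region);
		new_size = old_size << 1;	/* doubled, as in memblock_double_array */
		printf("max=%lu raw=%lu aligned=%lu\n",
		       max, new_size, PAGE_ALIGN(new_size));
	}
	return 0;
}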
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 {
 	if (root_memcg == memcg)
 		return true;
-	if (!root_memcg->use_hierarchy)
+	if (!root_memcg->use_hierarchy || !memcg)
 		return false;
 	return css_is_ancestor(&memcg->css, &root_memcg->css);
 }
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
  *
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 
 /**
  * test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
  * @nid: the node ID to be checked.
  * @noswap : specify true here if the user wants file only information.
  *
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE) {
-				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+						__func__, addr, end,
+						vma->vm_start,
+						vma->vm_end);
+					BUG();
+				}
+#endif
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 				goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
 /**
  * zap_page_range - remove user pages in a given range
  * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
  * @size: number of bytes to zap
  * @details: details of nonlinear truncation or shared cache invalidation
  *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 		pgdat = hotadd_new_pgdat(nid, start);
 		ret = -ENOMEM;
 		if (!pgdat)
-			goto out;
+			goto error;
 		new_pgdat = 1;
 	}
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!list_empty(&pagelist)) {
 		nr_failed = migrate_pages(&pagelist, new_vma_page,
 						(unsigned long)vma,
-						false, true);
+						false, MIGRATE_SYNC);
 		if (nr_failed)
 			putback_lru_pages(&pagelist);
 	}
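The mempolicy one-liner is worth a close look: the last migrate_pages() argument changed type from bool to enum migrate_mode, so a literal true kept compiling but — once the enum had three values — selected value 1 rather than the strictest mode. A tiny C demonstration of the trap (enum values as I understand the migrate_mode of this era; treat them as an assumption):

#include <stdbool.h>
#include <stdio.h>

/* migrate_mode roughly as introduced around v3.3 (values assumed). */
enum migrate_mode {
	MIGRATE_ASYNC,		/* 0 */
	MIGRATE_SYNC_LIGHT,	/* 1 */
	MIGRATE_SYNC,		/* 2 */
};

static const char *mode_name(enum migrate_mode mode)
{
	switch (mode) {
	case MIGRATE_ASYNC:	 return "MIGRATE_ASYNC";
	case MIGRATE_SYNC_LIGHT: return "MIGRATE_SYNC_LIGHT";
	case MIGRATE_SYNC:	 return "MIGRATE_SYNC";
	}
	return "?";
}

int main(void)
{
	/* A bool 'true' silently converts to 1, i.e. SYNC_LIGHT, not SYNC. */
	enum migrate_mode from_bool = (bool)true;

	printf("true becomes %s\n", mode_name(from_bool));
	printf("intended:    %s\n", mode_name(MIGRATE_SYNC));
	return 0;
}

Spelling the enumerator out, as the patch does, makes the call site immune to future reordering of the enum.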
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 		__free_pages_bootmem(pfn_to_page(i), 0);
 }
 
+static unsigned long __init __free_memory_core(phys_addr_t start,
+				 phys_addr_t end)
+{
+	unsigned long start_pfn = PFN_UP(start);
+	unsigned long end_pfn = min_t(unsigned long,
+				      PFN_DOWN(end), max_low_pfn);
+
+	if (start_pfn > end_pfn)
+		return 0;
+
+	__free_pages_memory(start_pfn, end_pfn);
+
+	return end_pfn - start_pfn;
+}
+
 unsigned long __init free_low_memory_core_early(int nodeid)
 {
 	unsigned long count = 0;
-	phys_addr_t start, end;
+	phys_addr_t start, end, size;
 	u64 i;
 
-	/* free reserved array temporarily so that it's treated as free area */
-	memblock_free_reserved_regions();
-
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
-		unsigned long start_pfn = PFN_UP(start);
-		unsigned long end_pfn = min_t(unsigned long,
-					      PFN_DOWN(end), max_low_pfn);
-		if (start_pfn < end_pfn) {
-			__free_pages_memory(start_pfn, end_pfn);
-			count += end_pfn - start_pfn;
-		}
-	}
+	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+		count += __free_memory_core(start, end);
+
+	/* free range that is used for reserved array if we allocate it */
+	size = get_allocated_memblock_reserved_regions_info(&start);
+	if (size)
+		count += __free_memory_core(start, start + size);
 
-	/* put region array back? */
-	memblock_reserve_reserved_regions();
 	return count;
 }
131 139
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						unsigned long size,
 						unsigned long align,
 						unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
 	if (file)
 		fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
-	unsigned long points;
+	long points;
+	long adj;
 
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
-	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+	adj = p->signal->oom_score_adj;
+	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
 	}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30 * totalpages / 1000;
+		adj -= 30;
 
-	/*
-	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
-	 * either completely disable oom killing or always prefer a certain
-	 * task.
-	 */
-	points += p->signal->oom_score_adj * totalpages / 1000;
+	/* Normalize to oom_score_adj units */
+	adj *= totalpages / 1000;
+	points += adj;
 
 	/*
 	 * Never return 0 for an eligible task regardless of the root bonus and
 	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 	 */
-	return points ? points : 1;
+	return points > 0 ? points : 1;
 }
 
 /*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
  * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
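The oom_badness() rework keeps points signed so that a strongly negative oom_score_adj can no longer wrap an unsigned value into an enormous score, and it folds the root bonus into adj before normalizing. A runnable sketch of the new arithmetic with hypothetical numbers (not the kernel function itself):

#include <stdio.h>

/* Sketch of the reworked score arithmetic, with made-up inputs. */
static unsigned long badness(long points, long adj, int root,
			     unsigned long totalpages)
{
	if (root)
		adj -= 30;		/* root bonus, now in adj units */
	adj *= totalpages / 1000;	/* normalize to oom_score_adj units */
	points += adj;
	/* signed compare: a large negative adj can no longer wrap around */
	return points > 0 ? points : 1;
}

int main(void)
{
	unsigned long totalpages = 1000000;	/* hypothetical */

	/* Base points of 5000, oom_score_adj -500: heavily shielded -> 1. */
	printf("%lu\n", badness(5000, -500, 0, totalpages));
	/* Same base with adj 0 and the root bonus: still clamped -> 1. */
	printf("%lu\n", badness(5000, 0, 1, totalpages));
	/* adj +100 boosts the score -> 105000. */
	printf("%lu\n", badness(5000, 100, 0, totalpages));
	return 0;
}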
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5635,7 +5635,12 @@ static struct page *
 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
 			     int **resultp)
 {
-	return alloc_page(GFP_HIGHUSER_MOVABLE);
+	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
 }
 
 /* [start, end) must belong to a single zone. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 
 /**
  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
  * @old: old id
  * @new: new id
  *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 /**
  * swap_cgroup_record - record mem_cgroup for this swp_entry.
  * @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
  *
  * Returns old value at success, 0 at failure.
  * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/frontswap.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
+	if (frontswap_store(page) == 0) {
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageUptodate(page));
+	if (frontswap_load(page) == 0) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
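Both hooks follow the same shape: offer the page to frontswap first and only fall through to block I/O when the store or load is refused. A minimal userspace model of the write-side short-circuit (invented names, no bio or page structures):

#include <stdio.h>

/* Hypothetical stand-ins for the swap write path hook. */
static int frontswap_accepts;	/* toggled for the demo */

static int try_frontswap_store(const char *page)
{
	if (!frontswap_accepts || !page)
		return -1;
	return 0;
}

/* Mirrors swap_writepage(): try the in-memory path, else do real I/O. */
static void write_swap_page(const char *page)
{
	if (try_frontswap_store(page) == 0) {
		/* page is captured in memory; no block I/O issued */
		printf("stored via frontswap\n");
		return;
	}
	printf("submitted bio to swap device\n");
}

int main(void)
{
	write_swap_page("data");	/* backend refuses -> bio path */
	frontswap_accepts = 1;
	write_swap_page("data");	/* backend accepts -> no bio */
	return 0;
}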
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 
 /**
  * walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
  * @chunk: chunk to depopulate
  * @off: offset to the area to depopulate
  * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
 				   pgoff_t index, gfp_t gfp, void *expected)
 {
-	int error = 0;
+	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapBacked(page));
 
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
 	if (!expected)
-		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+						 page);
 	if (!error) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = index;
-
-		spin_lock_irq(&mapping->tree_lock);
-		if (!expected)
-			error = radix_tree_insert(&mapping->page_tree,
-							index, page);
-		else
-			error = shmem_radix_tree_replace(mapping, index,
-							 expected, page);
-		if (!error) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			__inc_zone_page_state(page, NR_SHMEM);
-			spin_unlock_irq(&mapping->tree_lock);
-		} else {
-			page->mapping = NULL;
-			spin_unlock_irq(&mapping->tree_lock);
-			page_cache_release(page);
-		}
-		if (!expected)
-			radix_tree_preload_end();
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
 	}
-	if (error)
-		mem_cgroup_uncharge_cache_page(page);
 	return error;
 }
 
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	mutex_lock(&shmem_swaplist_mutex);
 	/*
 	 * We needed to drop mutex to make that restrictive page
-	 * allocation; but the inode might already be freed by now,
-	 * and we cannot refer to inode or mapping or info to check.
-	 * However, we do hold page lock on the PageSwapCache page,
-	 * so can check if that still has our reference remaining.
+	 * allocation, but the inode might have been freed while we
+	 * dropped it: although a racing shmem_evict_inode() cannot
+	 * complete without emptying the radix_tree, our page lock
+	 * on this swapcache page is not enough to prevent that -
+	 * free_swap_and_cache() of our swap entry will only
+	 * trylock_page(), removing swap from radix_tree whatever.
+	 *
+	 * We must not proceed to shmem_add_to_page_cache() if the
+	 * inode has been freed, but of course we cannot rely on
+	 * inode or mapping or info to check that.  However, we can
+	 * safely check if our swap entry is still in use (and here
+	 * it can't have got reused for another page): if it's still
+	 * in use, then the inode cannot have been freed yet, and we
+	 * can safely proceed (if it's no longer in use, that tells
+	 * nothing about the inode, but we don't need to unuse swap).
 	 */
 	if (!page_swapcount(*pagep))
 		error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 
 	/*
 	 * There's a faint possibility that swap page was replaced before
-	 * caller locked it: it will come back later with the right page.
+	 * caller locked it: caller will come back later with the right page.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
 		goto out;
 
 	/*
@@ -995,21 +1015,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	newpage = shmem_alloc_page(gfp, info, index);
 	if (!newpage)
 		return -ENOMEM;
-	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
 
-	*pagep = newpage;
 	page_cache_get(newpage);
 	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
 
-	VM_BUG_ON(!PageLocked(oldpage));
 	__set_page_locked(newpage);
-	VM_BUG_ON(!PageUptodate(oldpage));
 	SetPageUptodate(newpage);
-	VM_BUG_ON(!PageSwapBacked(oldpage));
 	SetPageSwapBacked(newpage);
-	VM_BUG_ON(!swap_index);
 	set_page_private(newpage, swap_index);
-	VM_BUG_ON(!PageSwapCache(oldpage));
 	SetPageSwapCache(newpage);
 
 	/*
@@ -1019,13 +1033,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	spin_lock_irq(&swap_mapping->tree_lock);
 	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
 								   newpage);
-	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
 	spin_unlock_irq(&swap_mapping->tree_lock);
-	BUG_ON(error);
 
-	mem_cgroup_replace_page_cache(oldpage, newpage);
-	lru_cache_add_anon(newpage);
+	if (unlikely(error)) {
+		/*
+		 * Is this possible?  I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive.  Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
 
 	ClearPageSwapCache(oldpage);
 	set_page_private(oldpage, 0);
@@ -1033,7 +1058,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	unlock_page(oldpage);
 	page_cache_release(oldpage);
 	page_cache_release(oldpage);
-	return 0;
+	return error;
 }
 
 /*
@@ -1107,9 +1132,10 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
-		if (!PageSwapCache(page) || page->mapping) {
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    !shmem_confirm_swap(mapping, index, swap)) {
 			error = -EEXIST;	/* try again */
-			goto failed;
+			goto unlock;
 		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
@@ -1125,9 +1151,12 @@ repeat:
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -1164,11 +1193,18 @@ repeat:
 		__set_page_locked(page);
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
-			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, NULL);
 		if (error)
 			goto decused;
+		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		if (!error) {
+			error = shmem_add_to_page_cache(page, mapping, index,
+							gfp, NULL);
+			radix_tree_preload_end();
+		}
+		if (error) {
+			mem_cgroup_uncharge_cache_page(page);
+			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -1228,14 +1264,10 @@ decused:
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1247,7 +1279,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }
@@ -1675,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-					pgoff_t index, pgoff_t end, int origin)
-{
-	struct page *page;
-	struct pagevec pvec;
-	pgoff_t indices[PAGEVEC_SIZE];
-	bool done = false;
-	int i;
-
-	pagevec_init(&pvec, 0);
-	pvec.nr = 1;		/* start small: we may be there already */
-	while (!done) {
-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-					pvec.nr, pvec.pages, indices);
-		if (!pvec.nr) {
-			if (origin == SEEK_DATA)
-				index = end;
-			break;
-		}
-		for (i = 0; i < pvec.nr; i++, index++) {
-			if (index < indices[i]) {
-				if (origin == SEEK_HOLE) {
-					done = true;
-					break;
-				}
-				index = indices[i];
-			}
-			page = pvec.pages[i];
-			if (page && !radix_tree_exceptional_entry(page)) {
-				if (!PageUptodate(page))
-					page = NULL;
-			}
-			if (index >= end ||
-			    (page && origin == SEEK_DATA) ||
-			    (!page && origin == SEEK_HOLE)) {
-				done = true;
-				break;
-			}
-		}
-		shmem_deswap_pagevec(&pvec);
-		pagevec_release(&pvec);
-		pvec.nr = PAGEVEC_SIZE;
-		cond_resched();
-	}
-	return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
-	struct address_space *mapping;
-	struct inode *inode;
-	pgoff_t start, end;
-	loff_t new_offset;
-
-	if (origin != SEEK_DATA && origin != SEEK_HOLE)
-		return generic_file_llseek_size(file, offset, origin,
-							MAX_LFS_FILESIZE);
-	mapping = file->f_mapping;
-	inode = mapping->host;
-	mutex_lock(&inode->i_mutex);
-	/* We're holding i_mutex so we can access i_size directly */
-
-	if (offset < 0)
-		offset = -EINVAL;
-	else if (offset >= inode->i_size)
-		offset = -ENXIO;
-	else {
-		start = offset >> PAGE_CACHE_SHIFT;
-		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-		new_offset = shmem_seek_hole_data(mapping, start, end, origin);
-		new_offset <<= PAGE_CACHE_SHIFT;
-		if (new_offset > offset) {
-			if (new_offset < inode->i_size)
-				offset = new_offset;
-			else if (origin == SEEK_DATA)
-				offset = -ENXIO;
-			else
-				offset = inode->i_size;
-		}
-	}
-
-	if (offset >= 0 && offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
-	mutex_unlock(&inode->i_mutex);
-	return offset;
-}
-
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
@@ -2770,7 +2710,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
-	.llseek		= shmem_file_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= shmem_file_aio_read,
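The heart of the shmem changes is shmem_confirm_swap(): rather than inferring state from page flags, re-read the authoritative radix_tree slot and compare it against the exact swap entry you expect, retrying on mismatch. A standalone model of that confirm-then-proceed check, using a plain array and a tag bit in place of the radix_tree's exceptional entries (names invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS 16

/* Slot table standing in for the mapping's radix_tree. */
static void *slots[NSLOTS];

static void *swp_to_entry(unsigned long swap)
{
	/* low bit set marks "exceptional" (non-page) entries, as in shmem */
	return (void *)(uintptr_t)((swap << 1) | 1);
}

/* Re-check that slot 'index' still holds exactly the swap we expect. */
static bool confirm_swap(unsigned long index, unsigned long swap)
{
	return slots[index] == swp_to_entry(swap);
}

int main(void)
{
	slots[3] = swp_to_entry(42);
	printf("%d\n", confirm_swap(3, 42));	/* 1: safe to proceed */

	/* A racing thread brings the page back and reuses the slot. */
	slots[3] = swp_to_entry(99);
	printf("%d\n", confirm_swap(3, 42));	/* 0: caller must retry */
	return 0;
}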
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..c7bb952400c8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -275,8 +275,9 @@ static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
-	pg_data_t *host_pgdat;
-	unsigned long goal;
+	unsigned long goal, limit;
+	unsigned long *p;
+	int nid;
 	/*
 	 * A page may contain usemaps for other sections preventing the
 	 * page being freed and making a section unremovable while
@@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 288 * from the same section as the pgdat where possible to avoid
288 * this problem. 289 * this problem.
289 */ 290 */
290 goal = __pa(pgdat) & PAGE_SECTION_MASK; 291 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); 292 limit = goal + (1UL << PA_SECTION_SHIFT);
292 return __alloc_bootmem_node_nopanic(host_pgdat, size, 293 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
293 SMP_CACHE_BYTES, goal); 294again:
295 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
296 SMP_CACHE_BYTES, goal, limit);
297 if (!p && limit) {
298 limit = 0;
299 goto again;
300 }
301 return p;
294} 302}
295 303
296static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 304static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
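
Two fixes land in this sparse.c hunk: PAGE_SECTION_MASK is a mask on page frame numbers, so it must be shifted left by PAGE_SHIFT before it can mask the physical address returned by __pa(); and the bootmem allocation now retries with limit = 0, i.e. falls back to an unconstrained node allocation when nothing is free inside the pgdat's own section. A worked sketch of the masking, assuming the common x86_64 value SECTION_SIZE_BITS = 27 (128 MiB sections) and an invented address:

/* Assumed: PA_SECTION_SHIFT = 27; the physical address is made up.   */
unsigned long pa    = 0x123456789UL;           /* hypothetical __pa(pgdat)   */
unsigned long goal  = pa & ~((1UL << 27) - 1); /* 0x120000000: section start */
unsigned long limit = goal + (1UL << 27);      /* 0x128000000: section end   */
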
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
34 36
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 44static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 45static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 46
45static DEFINE_SPINLOCK(swap_lock); 47DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 48static unsigned int nr_swapfiles;
47long nr_swap_pages; 49long nr_swap_pages;
48long total_swap_pages; 50long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 55static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 56static const char Unused_offset[] = "Unused swap offset entry ";
55 57
56static struct swap_list_t swap_list = {-1, -1}; 58struct swap_list_t swap_list = {-1, -1};
57 59
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 60struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 61
60static DEFINE_MUTEX(swapon_mutex); 62static DEFINE_MUTEX(swapon_mutex);
61 63
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
559 if ((p->flags & SWP_BLKDEV) && 562 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify) 563 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 disk->fops->swap_slot_free_notify(p->bdev, offset);
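
The frontswap_invalidate_page() hook added here keeps the out-of-kernel copy coherent: once a swap slot is freed, whatever the transcendent-memory backend holds for that slot is stale. A hedged paraphrase of what the mm/frontswap.c side of this call does (helper names approximate, not verified against this tree):

/* Sketch: tell the backend to drop a freed slot, then clear its bit. */
static void invalidate_page_sketch(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];

        if (frontswap_test(sis, offset)) {
                frontswap_ops.invalidate_page(type, offset); /* backend drop */
                frontswap_clear(sis, offset);                /* bitmap clear */
        }
}
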
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
985} 988}
986 989
987/* 990/*
988 * Scan swap_map from current position to next entry still in use. 991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
992 * from current position to next entry still in use.
989 * Recycle to start on reaching the end, returning 0 when empty. 993 * Recycle to start on reaching the end, returning 0 when empty.
990 */ 994 */
991static unsigned int find_next_to_unuse(struct swap_info_struct *si, 995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
992 unsigned int prev) 996 unsigned int prev, bool frontswap)
993{ 997{
994 unsigned int max = si->max; 998 unsigned int max = si->max;
995 unsigned int i = prev; 999 unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1015 prev = 0; 1019 prev = 0;
1016 i = 1; 1020 i = 1;
1017 } 1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1018 count = si->swap_map[i]; 1028 count = si->swap_map[i];
1019 if (count && swap_count(count) != SWAP_MAP_BAD) 1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1020 break; 1030 break;
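
The frontswap_test() consulted in this scan is essentially a test_bit() against the bit-per-page map that swapon allocates below; a minimal sketch of the idea, assuming the frontswap_map_get() accessor seen elsewhere in this diff:

/* Sketch: a set bit means "this swap slot's page lives in frontswap". */
static inline bool frontswap_test_sketch(struct swap_info_struct *sis,
                                         pgoff_t offset)
{
        unsigned long *map = frontswap_map_get(sis);

        return map && test_bit(offset, map);
}
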
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1026 * We completely avoid races by reading each swap page in advance, 1036 * We completely avoid races by reading each swap page in advance,
1027 * and then search for the process using it. All the necessary 1037 * and then search for the process using it. All the necessary
1028 * page table adjustments can then be made atomically. 1038 * page table adjustments can then be made atomically.
1039 *
1040 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1041 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1029 */ 1042 */
1030static int try_to_unuse(unsigned int type) 1043int try_to_unuse(unsigned int type, bool frontswap,
1044 unsigned long pages_to_unuse)
1031{ 1045{
1032 struct swap_info_struct *si = swap_info[type]; 1046 struct swap_info_struct *si = swap_info[type];
1033 struct mm_struct *start_mm; 1047 struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
1060 * one pass through swap_map is enough, but not necessarily: 1074 * one pass through swap_map is enough, but not necessarily:
1061 * there are races when an instance of an entry might be missed. 1075 * there are races when an instance of an entry might be missed.
1062 */ 1076 */
1063 while ((i = find_next_to_unuse(si, i)) != 0) { 1077 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1064 if (signal_pending(current)) { 1078 if (signal_pending(current)) {
1065 retval = -EINTR; 1079 retval = -EINTR;
1066 break; 1080 break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
1227 * interactive performance. 1241 * interactive performance.
1228 */ 1242 */
1229 cond_resched(); 1243 cond_resched();
1244 if (frontswap && pages_to_unuse > 0) {
1245 if (!--pages_to_unuse)
1246 break;
1247 }
1230 } 1248 }
1231 1249
1232 mmput(start_mm); 1250 mmput(start_mm);
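
The two new parameters exist so frontswap can reclaim a bounded number of its pages without a full swapoff: frontswap = true restricts the scan to slots whose frontswap bit is set, and pages_to_unuse caps how many get pulled back in (0 meaning all of them). A hedged sketch of such a caller; the real one is frontswap_shrink() in mm/frontswap.c:

/* Sketch: bring at most 'nr' frontswap-resident pages of swap device
 * 'type' back into memory; nr == 0 drains every such page. */
static int unuse_frontswap_pages(unsigned int type, unsigned long nr)
{
        return try_to_unuse(type, true, nr);
}
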
@@ -1486,7 +1504,8 @@ bad_bmap:
1486} 1504}
1487 1505
1488static void enable_swap_info(struct swap_info_struct *p, int prio, 1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1489 unsigned char *swap_map) 1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1490{ 1509{
1491 int i, prev; 1510 int i, prev;
1492 1511
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1496 else 1515 else
1497 p->prio = --least_priority; 1516 p->prio = --least_priority;
1498 p->swap_map = swap_map; 1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1499 p->flags |= SWP_WRITEOK; 1519 p->flags |= SWP_WRITEOK;
1500 nr_swap_pages += p->pages; 1520 nr_swap_pages += p->pages;
1501 total_swap_pages += p->pages; 1521 total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1512 swap_list.head = swap_list.next = p->type; 1532 swap_list.head = swap_list.next = p->type;
1513 else 1533 else
1514 swap_info[prev]->next = p->type; 1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1515 spin_unlock(&swap_lock); 1536 spin_unlock(&swap_lock);
1516} 1537}
1517 1538
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 spin_unlock(&swap_lock); 1606 spin_unlock(&swap_lock);
1586 1607
1587 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1588 err = try_to_unuse(type); 1609 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1589 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1590 1611
1591 if (err) { 1612 if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1596 * sys_swapoff for this swap_info_struct at this point. 1617 * sys_swapoff for this swap_info_struct at this point.
1597 */ 1618 */
1598 /* re-insert swap space back into swap_list */ 1619 /* re-insert swap space back into swap_list */
1599 enable_swap_info(p, p->prio, p->swap_map); 1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1600 goto out_dput; 1621 goto out_dput;
1601 } 1622 }
1602 1623
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1622 swap_map = p->swap_map; 1643 swap_map = p->swap_map;
1623 p->swap_map = NULL; 1644 p->swap_map = NULL;
1624 p->flags = 0; 1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1625 spin_unlock(&swap_lock); 1647 spin_unlock(&swap_lock);
1626 mutex_unlock(&swapon_mutex); 1648 mutex_unlock(&swapon_mutex);
1627 vfree(swap_map); 1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1628 /* Destroy swap account information */ 1651 /* Destroy swap account information */
1629 swap_cgroup_swapoff(type); 1652 swap_cgroup_swapoff(type);
1630 1653
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1893 1916
1894 /* 1917 /*
1895 * Find out how many pages are allowed for a single swap 1918 * Find out how many pages are allowed for a single swap
1896 * device. There are three limiting factors: 1) the number 1919 * device. There are two limiting factors: 1) the number
1897 * of bits for the swap offset in the swp_entry_t type, and 1920 * of bits for the swap offset in the swp_entry_t type, and
1898 * 2) the number of bits in the swap pte as defined by the 1921 * 2) the number of bits in the swap pte as defined by the
1899 * the different architectures, and 3) the number of free bits 1922 * different architectures. In order to find the
1900 * in an exceptional radix_tree entry. In order to find the
1901 * largest possible bit mask, a swap entry with swap type 0 1923 * largest possible bit mask, a swap entry with swap type 0
1902 * and swap offset ~0UL is created, encoded to a swap pte, 1924 * and swap offset ~0UL is created, encoded to a swap pte,
1903 * decoded to a swp_entry_t again, and finally the swap 1925 * decoded to a swp_entry_t again, and finally the swap
1904 * offset is extracted. This will mask all the bits from 1926 * offset is extracted. This will mask all the bits from
1905 * the initial ~0UL mask that can't be encoded in either 1927 * the initial ~0UL mask that can't be encoded in either
1906 * the swp_entry_t or the architecture definition of a 1928 * the swp_entry_t or the architecture definition of a
1907 * swap pte. Then the same is done for a radix_tree entry. 1929 * swap pte.
1908 */ 1930 */
1909 maxpages = swp_offset(pte_to_swp_entry( 1931 maxpages = swp_offset(pte_to_swp_entry(
1910 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1932 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1911 maxpages = swp_offset(radix_to_swp_entry(
1912 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1913
1914 if (maxpages > swap_header->info.last_page) { 1933 if (maxpages > swap_header->info.last_page) {
1915 maxpages = swap_header->info.last_page + 1; 1934 maxpages = swap_header->info.last_page + 1;
1916 /* p->max is an unsigned int: don't overflow it */ 1935 /* p->max is an unsigned int: don't overflow it */
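
With the exceptional-radix-tree factor gone, only two bit widths remain, and the encode/decode round trip is simply a way of intersecting the two masks without hard-coding either width. Schematically, with invented widths of 58 offset bits in swp_entry_t and 52 in the architecture's swap pte:

unsigned long e   = ~0UL;                     /* swap offset ~0UL, type 0    */
unsigned long pte = e   & ((1UL << 52) - 1);  /* encode: pte keeps 52 bits   */
unsigned long off = pte & ((1UL << 58) - 1);  /* decode: entry keeps 58 bits */
unsigned long maxpages = off + 1;             /* == 1UL << 52, the minimum   */
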
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1988 sector_t span; 2007 sector_t span;
1989 unsigned long maxpages; 2008 unsigned long maxpages;
1990 unsigned char *swap_map = NULL; 2009 unsigned char *swap_map = NULL;
2010 unsigned long *frontswap_map = NULL;
1991 struct page *page = NULL; 2011 struct page *page = NULL;
1992 struct inode *inode = NULL; 2012 struct inode *inode = NULL;
1993 2013
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2071 error = nr_extents; 2091 error = nr_extents;
2072 goto bad_swap; 2092 goto bad_swap;
2073 } 2093 }
2094 /* frontswap enabled? set up bit-per-page map for frontswap */
2095 if (frontswap_enabled)
2096 frontswap_map = vzalloc(maxpages / sizeof(long));
2074 2097
2075 if (p->bdev) { 2098 if (p->bdev) {
2076 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2099 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
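
A sizing note on the new allocation: vzalloc() takes a byte count, and maxpages / sizeof(long) bytes equals one bit per page only when sizeof(long) == 8 and maxpages is a multiple of 8 (on a 32-bit build it allocates twice the bits needed). The exact, width-independent idiom for sizing a kernel bitmap is BITS_TO_LONGS(); a sketch of that alternative:

#include <linux/bitops.h>
#include <linux/vmalloc.h>

/* Sketch: enough zeroed longs to hold one bit for each of 'maxpages'. */
static unsigned long *alloc_frontswap_map(unsigned long maxpages)
{
        return vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
}
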
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2086 if (swap_flags & SWAP_FLAG_PREFER) 2109 if (swap_flags & SWAP_FLAG_PREFER)
2087 prio = 2110 prio =
2088 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2111 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2089 enable_swap_info(p, prio, swap_map); 2112 enable_swap_info(p, prio, swap_map, frontswap_map);
2090 2113
2091 printk(KERN_INFO "Adding %uk swap on %s. " 2114 printk(KERN_INFO "Adding %uk swap on %s. "
2092 "Priority:%d extents:%d across:%lluk %s%s\n", 2115 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2093 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2116 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2094 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2117 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2095 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2118 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2096 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2119 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2120 (frontswap_map) ? "FS" : "");
2097 2121
2098 mutex_unlock(&swapon_mutex); 2122 mutex_unlock(&swapon_mutex);
2099 atomic_inc(&proc_poll_event); 2123 atomic_inc(&proc_poll_event);
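
For reference, with the third %s in place a device carrying all three flags would log a line of the form (values invented for illustration):

  Adding 4194300k swap on /dev/sdb1. Priority:-1 extents:1 across:4194300k SSDFS
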
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..66e431060c05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2688,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2688 * them before going back to sleep. 2688 * them before going back to sleep.
2689 */ 2689 */
2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2691 schedule(); 2691
2692 if (!kthread_should_stop())
2693 schedule();
2694
2692 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2695 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2693 } else { 2696 } else {
2694 if (remaining) 2697 if (remaining)
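
The added check is the standard stop-aware sleep idiom: kthread_stop() sets the stop flag and then wakes the thread, so re-testing kthread_should_stop() after setting the sleep state and before schedule() guarantees a concurrent stop request cannot be slept through. The generic shape of the pattern, as a sketch:

#include <linux/kthread.h>
#include <linux/sched.h>

static int worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!kthread_should_stop())     /* re-check before sleeping */
                        schedule();
                __set_current_state(TASK_RUNNING);
                /* ... one unit of work per wakeup ... */
        }
        return 0;
}
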
@@ -2955,14 +2958,17 @@ int kswapd_run(int nid)
2955} 2958}
2956 2959
2957/* 2960/*
2958 * Called by memory hotplug when all memory in a node is offlined. 2961 * Called by memory hotplug when all memory in a node is offlined. Caller must
2962 * hold lock_memory_hotplug().
2959 */ 2963 */
2960void kswapd_stop(int nid) 2964void kswapd_stop(int nid)
2961{ 2965{
2962 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 2966 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2963 2967
2964 if (kswapd) 2968 if (kswapd) {
2965 kthread_stop(kswapd); 2969 kthread_stop(kswapd);
2970 NODE_DATA(nid)->kswapd = NULL;
2971 }
2966} 2972}
2967 2973
2968static int __init kswapd_init(void) 2974static int __init kswapd_init(void)
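
The NODE_DATA(nid)->kswapd = NULL added to kswapd_stop() matters on the next online: kswapd_run() treats a non-NULL per-node pointer as an already-running daemon, so a stale pointer would both suppress the restart and hand a freed task to any later kthread_stop(). A hedged sketch of the run-side guard this pairs with (error handling abbreviated):

static int kswapd_run_sketch(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);

        if (pgdat->kswapd)              /* a stale pointer short-circuits here */
                return 0;
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                pgdat->kswapd = NULL;   /* keep the invariant on failure */
                return -ENOMEM;
        }
        return 0;
}
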