aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile1
-rw-r--r--mm/frontswap.c314
-rw-r--r--mm/memblock.c68
-rw-r--r--mm/memcontrol.c6
-rw-r--r--mm/memory.c12
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page_cgroup.c4
-rw-r--r--mm/page_io.c12
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c1
-rw-r--r--mm/shmem.c57
-rw-r--r--mm/swapfile.c66
15 files changed, 511 insertions, 73 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
389 in a negligible performance hit. 389 in a negligible performance hit.
390 390
391 If unsure, say Y to enable cleancache 391 If unsure, say Y to enable cleancache
392
393config FRONTSWAP
394 bool "Enable frontswap to cache swap pages if tmem is present"
395 depends on SWAP
396 default n
397 help
398 Frontswap is so named because it can be thought of as the opposite
399 of a "backing" store for a swap device. The data is stored into
400 "transcendent memory", memory that is not directly accessible or
401 addressable by the kernel and is of unknown and possibly
402 time-varying size. When space in transcendent memory is available,
403 a significant swap I/O reduction may be achieved. When none is
404 available, all frontswap calls are reduced to a single pointer-
405 compare-against-NULL resulting in a negligible performance hit
406 and swap data is stored as normal on the matching swap device.
407
408 If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
29 29
30obj-$(CONFIG_BOUNCE) += bounce.o 30obj-$(CONFIG_BOUNCE) += bounce.o
31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
32obj-$(CONFIG_FRONTSWAP) += frontswap.o
32obj-$(CONFIG_HAS_DMA) += dmapool.o 33obj-$(CONFIG_HAS_DMA) += dmapool.o
33obj-$(CONFIG_HUGETLBFS) += hugetlb.o 34obj-$(CONFIG_HUGETLBFS) += hugetlb.o
34obj-$(CONFIG_NUMA) += mempolicy.o 35obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mm.h>
15#include <linux/mman.h>
16#include <linux/swap.h>
17#include <linux/swapops.h>
18#include <linux/proc_fs.h>
19#include <linux/security.h>
20#include <linux/capability.h>
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/frontswap.h>
25#include <linux/swapfile.h>
26
27/*
28 * frontswap_ops is set by frontswap_register_ops to contain the pointers
29 * to the frontswap "backend" implementation functions.
30 */
31static struct frontswap_ops frontswap_ops __read_mostly;
32
33/*
34 * This global enablement flag reduces overhead on systems where frontswap_ops
35 * has not been registered, so is preferred to the slower alternative: a
36 * function call that checks a non-global.
37 */
38bool frontswap_enabled __read_mostly;
39EXPORT_SYMBOL(frontswap_enabled);
40
41/*
42 * If enabled, frontswap_store will return failure even on success. As
43 * a result, the swap subsystem will always write the page to swap, in
44 * effect converting frontswap into a writethrough cache. In this mode,
45 * there is no direct reduction in swap writes, but a frontswap backend
46 * can unilaterally "reclaim" any pages in use with no data loss, thus
 47 * providing increased control over maximum memory usage due to frontswap.
48 */
49static bool frontswap_writethrough_enabled __read_mostly;
50
#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured).  These are informational only and are not
 * protected against concurrent increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void)
{
	frontswap_loads++;
}

static inline void inc_frontswap_succ_stores(void)
{
	frontswap_succ_stores++;
}

static inline void inc_frontswap_failed_stores(void)
{
	frontswap_failed_stores++;
}

static inline void inc_frontswap_invalidates(void)
{
	frontswap_invalidates++;
}
#else
/* no debugfs: counting would be unobservable, so these are no-ops */
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif
/*
 * Register operations for frontswap, returning the previous set of
 * operations.  Returning the old ops allows the caller to detect
 * multiple backend registrations and possible nesting of backends.
 *
 * NOTE(review): the struct copy and the setting of frontswap_enabled
 * carry no locking or memory barrier; presumably registration happens
 * once, early, before swap activity -- confirm against backend callers.
 */
struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
{
	struct frontswap_ops old = frontswap_ops;

	frontswap_ops = *ops;
	/* global fast-path flag: frontswap hooks become active from here on */
	frontswap_enabled = true;
	return old;
}
EXPORT_SYMBOL(frontswap_register_ops);
93
/*
 * Called when a swap device is swapon'd: give the backend (if one is
 * registered) a chance to initialize its per-device state.
 *
 * @type: index of the swap device in swap_info[].
 */
void __frontswap_init(unsigned type)
{
	struct swap_info_struct *sis = swap_info[type];

	BUG_ON(sis == NULL);
	/* no frontswap_map means frontswap was not set up for this device */
	if (sis->frontswap_map == NULL)
		return;
	/* only call into the backend once one has been registered */
	if (frontswap_enabled)
		(*frontswap_ops.init)(type);
}
EXPORT_SYMBOL(__frontswap_init);
117
118/*
119 * "Store" data from a page to frontswap and associate it with the page's
120 * swaptype and offset. Page must be locked and in the swap cache.
121 * If frontswap already contains a page with matching swaptype and
122 * offset, the frontswap implmentation may either overwrite the data and
123 * return success or invalidate the page from frontswap and return failure.
124 */
125int __frontswap_store(struct page *page)
126{
127 int ret = -1, dup = 0;
128 swp_entry_t entry = { .val = page_private(page), };
129 int type = swp_type(entry);
130 struct swap_info_struct *sis = swap_info[type];
131 pgoff_t offset = swp_offset(entry);
132
133 BUG_ON(!PageLocked(page));
134 BUG_ON(sis == NULL);
135 if (frontswap_test(sis, offset))
136 dup = 1;
137 ret = (*frontswap_ops.store)(type, offset, page);
138 if (ret == 0) {
139 frontswap_set(sis, offset);
140 inc_frontswap_succ_stores();
141 if (!dup)
142 atomic_inc(&sis->frontswap_pages);
143 } else if (dup) {
144 /*
145 failed dup always results in automatic invalidate of
146 the (older) page from frontswap
147 */
148 frontswap_clear(sis, offset);
149 atomic_dec(&sis->frontswap_pages);
150 inc_frontswap_failed_stores();
151 } else
152 inc_frontswap_failed_stores();
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Get" data from frontswap associated with swaptype and offset that were
162 * specified when the data was put to frontswap and use it to fill the
163 * specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = (*frontswap_ops.load)(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "get" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 (*frontswap_ops.invalidate_page)(type, offset);
194 atomic_dec(&sis->frontswap_pages);
195 frontswap_clear(sis, offset);
196 inc_frontswap_invalidates();
197 }
198}
199EXPORT_SYMBOL(__frontswap_invalidate_page);
200
/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
	struct swap_info_struct *sis = swap_info[type];

	BUG_ON(sis == NULL);
	/* nothing to do if frontswap was never set up for this device */
	if (sis->frontswap_map == NULL)
		return;
	(*frontswap_ops.invalidate_area)(type);
	atomic_set(&sis->frontswap_pages, 0);
	/*
	 * NOTE(review): sis->max / sizeof(long) as a byte count only clears
	 * the whole bitmap if the map was sized with the same expression at
	 * allocation (a bitmap of sis->max bits needs sis->max / 8 bytes)
	 * -- confirm against the frontswap_map allocation in swapfile.c.
	 */
	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
217
/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrink" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to attempt to
 * unuse enough frontswap pages to attempt to -- subject to memory
 * constraints -- reduce the number of pages in frontswap to the
 * number given in the parameter target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
	struct swap_info_struct *si = NULL;
	int si_frontswap_pages;
	unsigned long total_pages = 0, total_pages_to_unuse;
	unsigned long pages = 0, pages_to_unuse = 0;
	int type;
	bool locked = false;

	/*
	 * we don't want to hold swap_lock while doing a very
	 * lengthy try_to_unuse, but swap_list may change
	 * so restart scan from swap_list.head each time
	 */
	spin_lock(&swap_lock);
	locked = true;
	total_pages = 0;
	/* first pass: total frontswap pages across all swap devices */
	for (type = swap_list.head; type >= 0; type = si->next) {
		si = swap_info[type];
		total_pages += atomic_read(&si->frontswap_pages);
	}
	if (total_pages <= target_pages)
		goto out;
	total_pages_to_unuse = total_pages - target_pages;
	/*
	 * second pass: pick the first device whose pages we can afford
	 * (memory-accounting-wise) to bring back in.  pages_to_unuse == 0
	 * tells try_to_unuse to unuse all of that device's pages.
	 */
	for (type = swap_list.head; type >= 0; type = si->next) {
		si = swap_info[type];
		si_frontswap_pages = atomic_read(&si->frontswap_pages);
		if (total_pages_to_unuse < si_frontswap_pages)
			pages = pages_to_unuse = total_pages_to_unuse;
		else {
			pages = si_frontswap_pages;
			pages_to_unuse = 0; /* unuse all */
		}
		/* ensure there is enough RAM to fetch pages from frontswap */
		if (security_vm_enough_memory_mm(current->mm, pages))
			continue;
		vm_unacct_memory(pages);
		break;
	}
	/* type < 0 here means the scan found no usable device */
	if (type < 0)
		goto out;
	/* drop the lock before the potentially lengthy try_to_unuse */
	locked = false;
	spin_unlock(&swap_lock);
	try_to_unuse(type, true, pages_to_unuse);
out:
	if (locked)
		spin_unlock(&swap_lock);
	return;
}
EXPORT_SYMBOL(frontswap_shrink);
276
277/*
278 * Count and return the number of frontswap pages across all
279 * swap devices. This is exported so that backend drivers can
280 * determine current usage without reading debugfs.
281 */
282unsigned long frontswap_curr_pages(void)
283{
284 int type;
285 unsigned long totalpages = 0;
286 struct swap_info_struct *si = NULL;
287
288 spin_lock(&swap_lock);
289 for (type = swap_list.head; type >= 0; type = si->next) {
290 si = swap_info[type];
291 totalpages += atomic_read(&si->frontswap_pages);
292 }
293 spin_unlock(&swap_lock);
294 return totalpages;
295}
296EXPORT_SYMBOL(frontswap_curr_pages);
297
298static int __init init_frontswap(void)
299{
300#ifdef CONFIG_DEBUG_FS
301 struct dentry *root = debugfs_create_dir("frontswap", NULL);
302 if (root == NULL)
303 return -ENXIO;
304 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
305 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
306 debugfs_create_u64("failed_stores", S_IRUGO, root,
307 &frontswap_failed_stores);
308 debugfs_create_u64("invalidates", S_IRUGO,
309 root, &frontswap_invalidates);
310#endif
311 return 0;
312}
313
314module_init(init_frontswap);
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..d4382095f8bd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -184,7 +184,24 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
184 } 184 }
185} 185}
186 186
187static int __init_memblock memblock_double_array(struct memblock_type *type) 187/**
188 * memblock_double_array - double the size of the memblock regions array
189 * @type: memblock type of the regions array being doubled
190 * @new_area_start: starting address of memory range to avoid overlap with
191 * @new_area_size: size of memory range to avoid overlap with
192 *
193 * Double the size of the @type regions array. If memblock is being used to
194 * allocate memory for a new reserved regions array and there is a previously
195 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
196 * waiting to be reserved, ensure the memory used by the new array does
197 * not overlap.
198 *
199 * RETURNS:
200 * 0 on success, -1 on failure.
201 */
202static int __init_memblock memblock_double_array(struct memblock_type *type,
203 phys_addr_t new_area_start,
204 phys_addr_t new_area_size)
188{ 205{
189 struct memblock_region *new_array, *old_array; 206 struct memblock_region *new_array, *old_array;
190 phys_addr_t old_size, new_size, addr; 207 phys_addr_t old_size, new_size, addr;
@@ -222,7 +239,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
222 new_array = kmalloc(new_size, GFP_KERNEL); 239 new_array = kmalloc(new_size, GFP_KERNEL);
223 addr = new_array ? __pa(new_array) : 0; 240 addr = new_array ? __pa(new_array) : 0;
224 } else { 241 } else {
225 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 242 /* only exclude range when trying to double reserved.regions */
243 if (type != &memblock.reserved)
244 new_area_start = new_area_size = 0;
245
246 addr = memblock_find_in_range(new_area_start + new_area_size,
247 memblock.current_limit,
248 new_size, sizeof(phys_addr_t));
249 if (!addr && new_area_size)
250 addr = memblock_find_in_range(0,
251 min(new_area_start, memblock.current_limit),
252 new_size, sizeof(phys_addr_t));
253
226 new_array = addr ? __va(addr) : 0; 254 new_array = addr ? __va(addr) : 0;
227 } 255 }
228 if (!addr) { 256 if (!addr) {
@@ -399,7 +427,7 @@ repeat:
399 */ 427 */
400 if (!insert) { 428 if (!insert) {
401 while (type->cnt + nr_new > type->max) 429 while (type->cnt + nr_new > type->max)
402 if (memblock_double_array(type) < 0) 430 if (memblock_double_array(type, obase, size) < 0)
403 return -ENOMEM; 431 return -ENOMEM;
404 insert = true; 432 insert = true;
405 goto repeat; 433 goto repeat;
@@ -450,7 +478,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
450 478
451 /* we'll create at most two more regions */ 479 /* we'll create at most two more regions */
452 while (type->cnt + 2 > type->max) 480 while (type->cnt + 2 > type->max)
453 if (memblock_double_array(type) < 0) 481 if (memblock_double_array(type, base, size) < 0)
454 return -ENOMEM; 482 return -ENOMEM;
455 483
456 for (i = 0; i < type->cnt; i++) { 484 for (i = 0; i < type->cnt; i++) {
@@ -540,9 +568,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
540 * __next_free_mem_range - next function for for_each_free_mem_range() 568 * __next_free_mem_range - next function for for_each_free_mem_range()
541 * @idx: pointer to u64 loop variable 569 * @idx: pointer to u64 loop variable
542 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 570 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
543 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 571 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
544 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 572 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
545 * @p_nid: ptr to int for nid of the range, can be %NULL 573 * @out_nid: ptr to int for nid of the range, can be %NULL
546 * 574 *
547 * Find the first free area from *@idx which matches @nid, fill the out 575 * Find the first free area from *@idx which matches @nid, fill the out
548 * parameters, and update *@idx for the next iteration. The lower 32bit of 576 * parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -616,9 +644,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
616 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 644 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
617 * @idx: pointer to u64 loop variable 645 * @idx: pointer to u64 loop variable
618 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 646 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
619 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 647 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
620 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 648 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
621 * @p_nid: ptr to int for nid of the range, can be %NULL 649 * @out_nid: ptr to int for nid of the range, can be %NULL
622 * 650 *
623 * Reverse of __next_free_mem_range(). 651 * Reverse of __next_free_mem_range().
624 */ 652 */
@@ -867,6 +895,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
867 return memblock_search(&memblock.memory, addr) != -1; 895 return memblock_search(&memblock.memory, addr) != -1;
868} 896}
869 897
898/**
899 * memblock_is_region_memory - check if a region is a subset of memory
900 * @base: base of region to check
901 * @size: size of region to check
902 *
903 * Check if the region [@base, @base+@size) is a subset of a memory block.
904 *
905 * RETURNS:
906 * 0 if false, non-zero if true
907 */
870int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 908int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
871{ 909{
872 int idx = memblock_search(&memblock.memory, base); 910 int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +917,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
879 memblock.memory.regions[idx].size) >= end; 917 memblock.memory.regions[idx].size) >= end;
880} 918}
881 919
920/**
921 * memblock_is_region_reserved - check if a region intersects reserved memory
922 * @base: base of region to check
923 * @size: size of region to check
924 *
925 * Check if the region [@base, @base+@size) intersects a reserved memory block.
926 *
927 * RETURNS:
928 * 0 if false, non-zero if true
929 */
882int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 930int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
883{ 931{
884 memblock_cap_size(base, &size); 932 memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1148{ 1148{
1149 if (root_memcg == memcg) 1149 if (root_memcg == memcg)
1150 return true; 1150 return true;
1151 if (!root_memcg->use_hierarchy) 1151 if (!root_memcg->use_hierarchy || !memcg)
1152 return false; 1152 return false;
1153 return css_is_ancestor(&memcg->css, &root_memcg->css); 1153 return css_is_ancestor(&memcg->css, &root_memcg->css);
1154} 1154}
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1234 1234
1235/** 1235/**
1236 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1236 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1237 * @mem: the memory cgroup 1237 * @memcg: the memory cgroup
1238 * 1238 *
1239 * Returns the maximum amount of memory @mem can be charged with, in 1239 * Returns the maximum amount of memory @mem can be charged with, in
1240 * pages. 1240 * pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1508 1508
1509/** 1509/**
1510 * test_mem_cgroup_node_reclaimable 1510 * test_mem_cgroup_node_reclaimable
1511 * @mem: the target memcg 1511 * @memcg: the target memcg
1512 * @nid: the node ID to be checked. 1512 * @nid: the node ID to be checked.
1513 * @noswap : specify true here if the user wants flle only information. 1513 * @noswap : specify true here if the user wants flle only information.
1514 * 1514 *
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1225 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1227 if (next - addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228#ifdef CONFIG_DEBUG_VM
1229 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1230 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1231 __func__, addr, end,
1232 vma->vm_start,
1233 vma->vm_end);
1234 BUG();
1235 }
1236#endif
1229 split_huge_page_pmd(vma->vm_mm, pmd); 1237 split_huge_page_pmd(vma->vm_mm, pmd);
1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1238 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1231 goto next; 1239 goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
1366/** 1374/**
1367 * zap_page_range - remove user pages in a given range 1375 * zap_page_range - remove user pages in a given range
1368 * @vma: vm_area_struct holding the applicable pages 1376 * @vma: vm_area_struct holding the applicable pages
1369 * @address: starting address of pages to zap 1377 * @start: starting address of pages to zap
1370 * @size: number of bytes to zap 1378 * @size: number of bytes to zap
1371 * @details: details of nonlinear truncation or shared cache invalidation 1379 * @details: details of nonlinear truncation or shared cache invalidation
1372 * 1380 *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1177 if (!list_empty(&pagelist)) { 1177 if (!list_empty(&pagelist)) {
1178 nr_failed = migrate_pages(&pagelist, new_vma_page, 1178 nr_failed = migrate_pages(&pagelist, new_vma_page,
1179 (unsigned long)vma, 1179 (unsigned long)vma,
1180 false, true); 1180 false, MIGRATE_SYNC);
1181 if (nr_failed) 1181 if (nr_failed)
1182 putback_lru_pages(&pagelist); 1182 putback_lru_pages(&pagelist);
1183 } 1183 }
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1486 1486
1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1488 1488
1489 ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1489 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1490 1490
1491 if (file) 1491 if (file)
1492 fput(file); 1492 fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 unsigned long points; 186 long points;
187 long adj;
187 188
188 if (oom_unkillable_task(p, memcg, nodemask)) 189 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 190 return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
192 if (!p) 193 if (!p)
193 return 0; 194 return 0;
194 195
195 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 196 adj = p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) {
196 task_unlock(p); 198 task_unlock(p);
197 return 0; 199 return 0;
198 } 200 }
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
210 * implementation used by LSMs. 212 * implementation used by LSMs.
211 */ 213 */
212 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 214 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
213 points -= 30 * totalpages / 1000; 215 adj -= 30;
214 216
215 /* 217 /* Normalize to oom_score_adj units */
216 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 218 adj *= totalpages / 1000;
217 * either completely disable oom killing or always prefer a certain 219 points += adj;
218 * task.
219 */
220 points += p->signal->oom_score_adj * totalpages / 1000;
221 220
222 /* 221 /*
223 * Never return 0 for an eligible task regardless of the root bonus and 222 * Never return 0 for an eligible task regardless of the root bonus and
224 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). 223 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
225 */ 224 */
226 return points ? points : 1; 225 return points > 0 ? points : 1;
227} 226}
228 227
229/* 228/*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
366 365
367/** 366/**
368 * dump_tasks - dump current memory state of all system tasks 367 * dump_tasks - dump current memory state of all system tasks
369 * @mem: current's memory controller, if constrained 368 * @memcg: current's memory controller, if constrained
370 * @nodemask: nodemask passed to page allocator for mempolicy ooms 369 * @nodemask: nodemask passed to page allocator for mempolicy ooms
371 * 370 *
372 * Dumps the current memory state of all eligible tasks. Tasks not in the same 371 * Dumps the current memory state of all eligible tasks. Tasks not in the same
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
392 392
393/** 393/**
394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
395 * @end: swap entry to be cmpxchged 395 * @ent: swap entry to be cmpxchged
396 * @old: old id 396 * @old: old id
397 * @new: new id 397 * @new: new id
398 * 398 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
422/** 422/**
423 * swap_cgroup_record - record mem_cgroup for this swp_entry. 423 * swap_cgroup_record - record mem_cgroup for this swp_entry.
424 * @ent: swap entry to be recorded into 424 * @ent: swap entry to be recorded into
425 * @mem: mem_cgroup to be recorded 425 * @id: mem_cgroup to be recorded
426 * 426 *
427 * Returns old value at success, 0 at failure. 427 * Returns old value at success, 0 at failure.
428 * (Of course, old value can be 0.) 428 * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/frontswap.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
23static struct bio *get_swap_bio(gfp_t gfp_flags, 24static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 unlock_page(page); 99 unlock_page(page);
99 goto out; 100 goto out;
100 } 101 }
102 if (frontswap_store(page) == 0) {
103 set_page_writeback(page);
104 unlock_page(page);
105 end_page_writeback(page);
106 goto out;
107 }
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 109 if (bio == NULL) {
103 set_page_dirty(page); 110 set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
122 129
123 VM_BUG_ON(!PageLocked(page)); 130 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 131 VM_BUG_ON(PageUptodate(page));
132 if (frontswap_load(page) == 0) {
133 SetPageUptodate(page);
134 unlock_page(page);
135 goto out;
136 }
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 138 if (bio == NULL) {
127 unlock_page(page); 139 unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
162 162
163/** 163/**
164 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
165 * @mm: memory map to walk
166 * @addr: starting address 165 * @addr: starting address
167 * @end: ending address 166 * @end: ending address
168 * @walk: set of callbacks to invoke for each level of the tree 167 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
360 * @chunk: chunk to depopulate 360 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 361 * @off: offset to the area to depopulate
362 * @size: size of the area to depopulate in bytes 362 * @size: size of the area to depopulate in bytes
363 * @flush: whether to flush cache and tlb or not
364 * 363 *
365 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 364 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
366 * from @chunk. If @flush is true, vcache is flushed before unmapping 365 * from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..4ce02e0673db 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -683,10 +683,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
683 mutex_lock(&shmem_swaplist_mutex); 683 mutex_lock(&shmem_swaplist_mutex);
684 /* 684 /*
685 * We needed to drop mutex to make that restrictive page 685 * We needed to drop mutex to make that restrictive page
686 * allocation; but the inode might already be freed by now, 686 * allocation, but the inode might have been freed while we
687 * and we cannot refer to inode or mapping or info to check. 687 * dropped it: although a racing shmem_evict_inode() cannot
688 * However, we do hold page lock on the PageSwapCache page, 688 * complete without emptying the radix_tree, our page lock
689 * so can check if that still has our reference remaining. 689 * on this swapcache page is not enough to prevent that -
690 * free_swap_and_cache() of our swap entry will only
691 * trylock_page(), removing swap from radix_tree whatever.
692 *
693 * We must not proceed to shmem_add_to_page_cache() if the
694 * inode has been freed, but of course we cannot rely on
695 * inode or mapping or info to check that. However, we can
696 * safely check if our swap entry is still in use (and here
697 * it can't have got reused for another page): if it's still
698 * in use, then the inode cannot have been freed yet, and we
699 * can safely proceed (if it's no longer in use, that tells
700 * nothing about the inode, but we don't need to unuse swap).
690 */ 701 */
691 if (!page_swapcount(*pagep)) 702 if (!page_swapcount(*pagep))
692 error = -ENOENT; 703 error = -ENOENT;
@@ -730,9 +741,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
730 741
731 /* 742 /*
732 * There's a faint possibility that swap page was replaced before 743 * There's a faint possibility that swap page was replaced before
733 * caller locked it: it will come back later with the right page. 744 * caller locked it: caller will come back later with the right page.
734 */ 745 */
735 if (unlikely(!PageSwapCache(page))) 746 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
736 goto out; 747 goto out;
737 748
738 /* 749 /*
@@ -995,21 +1006,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
995 newpage = shmem_alloc_page(gfp, info, index); 1006 newpage = shmem_alloc_page(gfp, info, index);
996 if (!newpage) 1007 if (!newpage)
997 return -ENOMEM; 1008 return -ENOMEM;
998 VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
999 1009
1000 *pagep = newpage;
1001 page_cache_get(newpage); 1010 page_cache_get(newpage);
1002 copy_highpage(newpage, oldpage); 1011 copy_highpage(newpage, oldpage);
1012 flush_dcache_page(newpage);
1003 1013
1004 VM_BUG_ON(!PageLocked(oldpage));
1005 __set_page_locked(newpage); 1014 __set_page_locked(newpage);
1006 VM_BUG_ON(!PageUptodate(oldpage));
1007 SetPageUptodate(newpage); 1015 SetPageUptodate(newpage);
1008 VM_BUG_ON(!PageSwapBacked(oldpage));
1009 SetPageSwapBacked(newpage); 1016 SetPageSwapBacked(newpage);
1010 VM_BUG_ON(!swap_index);
1011 set_page_private(newpage, swap_index); 1017 set_page_private(newpage, swap_index);
1012 VM_BUG_ON(!PageSwapCache(oldpage));
1013 SetPageSwapCache(newpage); 1018 SetPageSwapCache(newpage);
1014 1019
1015 /* 1020 /*
@@ -1019,13 +1024,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1019 spin_lock_irq(&swap_mapping->tree_lock); 1024 spin_lock_irq(&swap_mapping->tree_lock);
1020 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1025 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1021 newpage); 1026 newpage);
1022 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1027 if (!error) {
1023 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1028 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1029 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1030 }
1024 spin_unlock_irq(&swap_mapping->tree_lock); 1031 spin_unlock_irq(&swap_mapping->tree_lock);
1025 BUG_ON(error);
1026 1032
1027 mem_cgroup_replace_page_cache(oldpage, newpage); 1033 if (unlikely(error)) {
1028 lru_cache_add_anon(newpage); 1034 /*
1035 * Is this possible? I think not, now that our callers check
1036 * both PageSwapCache and page_private after getting page lock;
1037 * but be defensive. Reverse old to newpage for clear and free.
1038 */
1039 oldpage = newpage;
1040 } else {
1041 mem_cgroup_replace_page_cache(oldpage, newpage);
1042 lru_cache_add_anon(newpage);
1043 *pagep = newpage;
1044 }
1029 1045
1030 ClearPageSwapCache(oldpage); 1046 ClearPageSwapCache(oldpage);
1031 set_page_private(oldpage, 0); 1047 set_page_private(oldpage, 0);
@@ -1033,7 +1049,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1033 unlock_page(oldpage); 1049 unlock_page(oldpage);
1034 page_cache_release(oldpage); 1050 page_cache_release(oldpage);
1035 page_cache_release(oldpage); 1051 page_cache_release(oldpage);
1036 return 0; 1052 return error;
1037} 1053}
1038 1054
1039/* 1055/*
@@ -1107,7 +1123,8 @@ repeat:
1107 1123
1108 /* We have to do this with page locked to prevent races */ 1124 /* We have to do this with page locked to prevent races */
1109 lock_page(page); 1125 lock_page(page);
1110 if (!PageSwapCache(page) || page->mapping) { 1126 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1127 page->mapping) {
1111 error = -EEXIST; /* try again */ 1128 error = -EEXIST; /* try again */
1112 goto failed; 1129 goto failed;
1113 } 1130 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
34 36
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 44static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 45static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 46
45static DEFINE_SPINLOCK(swap_lock); 47DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 48static unsigned int nr_swapfiles;
47long nr_swap_pages; 49long nr_swap_pages;
48long total_swap_pages; 50long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 55static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 56static const char Unused_offset[] = "Unused swap offset entry ";
55 57
56static struct swap_list_t swap_list = {-1, -1}; 58struct swap_list_t swap_list = {-1, -1};
57 59
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 60struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 61
60static DEFINE_MUTEX(swapon_mutex); 62static DEFINE_MUTEX(swapon_mutex);
61 63
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
559 if ((p->flags & SWP_BLKDEV) && 562 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify) 563 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
985} 988}
986 989
987/* 990/*
988 * Scan swap_map from current position to next entry still in use. 991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
992 * from current position to next entry still in use.
989 * Recycle to start on reaching the end, returning 0 when empty. 993 * Recycle to start on reaching the end, returning 0 when empty.
990 */ 994 */
991static unsigned int find_next_to_unuse(struct swap_info_struct *si, 995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
992 unsigned int prev) 996 unsigned int prev, bool frontswap)
993{ 997{
994 unsigned int max = si->max; 998 unsigned int max = si->max;
995 unsigned int i = prev; 999 unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1015 prev = 0; 1019 prev = 0;
1016 i = 1; 1020 i = 1;
1017 } 1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1018 count = si->swap_map[i]; 1028 count = si->swap_map[i];
1019 if (count && swap_count(count) != SWAP_MAP_BAD) 1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1020 break; 1030 break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1026 * We completely avoid races by reading each swap page in advance, 1036 * We completely avoid races by reading each swap page in advance,
1027 * and then search for the process using it. All the necessary 1037 * and then search for the process using it. All the necessary
1028 * page table adjustments can then be made atomically. 1038 * page table adjustments can then be made atomically.
1039 *
1040 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1041 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1029 */ 1042 */
1030static int try_to_unuse(unsigned int type) 1043int try_to_unuse(unsigned int type, bool frontswap,
1044 unsigned long pages_to_unuse)
1031{ 1045{
1032 struct swap_info_struct *si = swap_info[type]; 1046 struct swap_info_struct *si = swap_info[type];
1033 struct mm_struct *start_mm; 1047 struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
1060 * one pass through swap_map is enough, but not necessarily: 1074 * one pass through swap_map is enough, but not necessarily:
1061 * there are races when an instance of an entry might be missed. 1075 * there are races when an instance of an entry might be missed.
1062 */ 1076 */
1063 while ((i = find_next_to_unuse(si, i)) != 0) { 1077 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1064 if (signal_pending(current)) { 1078 if (signal_pending(current)) {
1065 retval = -EINTR; 1079 retval = -EINTR;
1066 break; 1080 break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
1227 * interactive performance. 1241 * interactive performance.
1228 */ 1242 */
1229 cond_resched(); 1243 cond_resched();
1244 if (frontswap && pages_to_unuse > 0) {
1245 if (!--pages_to_unuse)
1246 break;
1247 }
1230 } 1248 }
1231 1249
1232 mmput(start_mm); 1250 mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
1486} 1504}
1487 1505
1488static void enable_swap_info(struct swap_info_struct *p, int prio, 1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1489 unsigned char *swap_map) 1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1490{ 1509{
1491 int i, prev; 1510 int i, prev;
1492 1511
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1496 else 1515 else
1497 p->prio = --least_priority; 1516 p->prio = --least_priority;
1498 p->swap_map = swap_map; 1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1499 p->flags |= SWP_WRITEOK; 1519 p->flags |= SWP_WRITEOK;
1500 nr_swap_pages += p->pages; 1520 nr_swap_pages += p->pages;
1501 total_swap_pages += p->pages; 1521 total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1512 swap_list.head = swap_list.next = p->type; 1532 swap_list.head = swap_list.next = p->type;
1513 else 1533 else
1514 swap_info[prev]->next = p->type; 1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1515 spin_unlock(&swap_lock); 1536 spin_unlock(&swap_lock);
1516} 1537}
1517 1538
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 spin_unlock(&swap_lock); 1606 spin_unlock(&swap_lock);
1586 1607
1587 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1588 err = try_to_unuse(type); 1609 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1589 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1590 1611
1591 if (err) { 1612 if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1596 * sys_swapoff for this swap_info_struct at this point. 1617 * sys_swapoff for this swap_info_struct at this point.
1597 */ 1618 */
1598 /* re-insert swap space back into swap_list */ 1619 /* re-insert swap space back into swap_list */
1599 enable_swap_info(p, p->prio, p->swap_map); 1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1600 goto out_dput; 1621 goto out_dput;
1601 } 1622 }
1602 1623
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1622 swap_map = p->swap_map; 1643 swap_map = p->swap_map;
1623 p->swap_map = NULL; 1644 p->swap_map = NULL;
1624 p->flags = 0; 1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1625 spin_unlock(&swap_lock); 1647 spin_unlock(&swap_lock);
1626 mutex_unlock(&swapon_mutex); 1648 mutex_unlock(&swapon_mutex);
1627 vfree(swap_map); 1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1628 /* Destroy swap account informatin */ 1651 /* Destroy swap account informatin */
1629 swap_cgroup_swapoff(type); 1652 swap_cgroup_swapoff(type);
1630 1653
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1893 1916
1894 /* 1917 /*
1895 * Find out how many pages are allowed for a single swap 1918 * Find out how many pages are allowed for a single swap
1896 * device. There are three limiting factors: 1) the number 1919 * device. There are two limiting factors: 1) the number
1897 * of bits for the swap offset in the swp_entry_t type, and 1920 * of bits for the swap offset in the swp_entry_t type, and
1898 * 2) the number of bits in the swap pte as defined by the 1921 * 2) the number of bits in the swap pte as defined by the
1899 * the different architectures, and 3) the number of free bits 1922 * different architectures. In order to find the
1900 * in an exceptional radix_tree entry. In order to find the
1901 * largest possible bit mask, a swap entry with swap type 0 1923 * largest possible bit mask, a swap entry with swap type 0
1902 * and swap offset ~0UL is created, encoded to a swap pte, 1924 * and swap offset ~0UL is created, encoded to a swap pte,
1903 * decoded to a swp_entry_t again, and finally the swap 1925 * decoded to a swp_entry_t again, and finally the swap
1904 * offset is extracted. This will mask all the bits from 1926 * offset is extracted. This will mask all the bits from
1905 * the initial ~0UL mask that can't be encoded in either 1927 * the initial ~0UL mask that can't be encoded in either
1906 * the swp_entry_t or the architecture definition of a 1928 * the swp_entry_t or the architecture definition of a
1907 * swap pte. Then the same is done for a radix_tree entry. 1929 * swap pte.
1908 */ 1930 */
1909 maxpages = swp_offset(pte_to_swp_entry( 1931 maxpages = swp_offset(pte_to_swp_entry(
1910 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1932 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1911 maxpages = swp_offset(radix_to_swp_entry(
1912 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1913
1914 if (maxpages > swap_header->info.last_page) { 1933 if (maxpages > swap_header->info.last_page) {
1915 maxpages = swap_header->info.last_page + 1; 1934 maxpages = swap_header->info.last_page + 1;
1916 /* p->max is an unsigned int: don't overflow it */ 1935 /* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1988 sector_t span; 2007 sector_t span;
1989 unsigned long maxpages; 2008 unsigned long maxpages;
1990 unsigned char *swap_map = NULL; 2009 unsigned char *swap_map = NULL;
2010 unsigned long *frontswap_map = NULL;
1991 struct page *page = NULL; 2011 struct page *page = NULL;
1992 struct inode *inode = NULL; 2012 struct inode *inode = NULL;
1993 2013
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2071 error = nr_extents; 2091 error = nr_extents;
2072 goto bad_swap; 2092 goto bad_swap;
2073 } 2093 }
2094 /* frontswap enabled? set up bit-per-page map for frontswap */
2095 if (frontswap_enabled)
2096 frontswap_map = vzalloc(maxpages / sizeof(long));
2074 2097
2075 if (p->bdev) { 2098 if (p->bdev) {
2076 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2099 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2086 if (swap_flags & SWAP_FLAG_PREFER) 2109 if (swap_flags & SWAP_FLAG_PREFER)
2087 prio = 2110 prio =
2088 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2111 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2089 enable_swap_info(p, prio, swap_map); 2112 enable_swap_info(p, prio, swap_map, frontswap_map);
2090 2113
2091 printk(KERN_INFO "Adding %uk swap on %s. " 2114 printk(KERN_INFO "Adding %uk swap on %s. "
2092 "Priority:%d extents:%d across:%lluk %s%s\n", 2115 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2093 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2116 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2094 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2117 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2095 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2118 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2096 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2119 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2120 (frontswap_map) ? "FS" : "");
2097 2121
2098 mutex_unlock(&swapon_mutex); 2122 mutex_unlock(&swapon_mutex);
2099 atomic_inc(&proc_poll_event); 2123 atomic_inc(&proc_poll_event);