Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig        17
-rw-r--r--  mm/Makefile        1
-rw-r--r--  mm/frontswap.c   314
-rw-r--r--  mm/memblock.c     68
-rw-r--r--  mm/memcontrol.c    6
-rw-r--r--  mm/memory.c       12
-rw-r--r--  mm/mempolicy.c     2
-rw-r--r--  mm/nommu.c         2
-rw-r--r--  mm/oom_kill.c     21
-rw-r--r--  mm/page_cgroup.c   4
-rw-r--r--  mm/page_io.c      12
-rw-r--r--  mm/pagewalk.c      1
-rw-r--r--  mm/percpu-vm.c     1
-rw-r--r--  mm/shmem.c        57
-rw-r--r--  mm/swapfile.c     66
15 files changed, 511 insertions(+), 73 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
 	  in a negligible performance hit.
 
 	  If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+	bool "Enable frontswap to cache swap pages if tmem is present"
+	depends on SWAP
+	default n
+	help
+	  Frontswap is so named because it can be thought of as the opposite
+	  of a "backing" store for a swap device.  The data is stored into
+	  "transcendent memory", memory that is not directly accessible or
+	  addressable by the kernel and is of unknown and possibly
+	  time-varying size.  When space in transcendent memory is available,
+	  a significant swap I/O reduction may be achieved.  When none is
+	  available, all frontswap calls are reduced to a single pointer-
+	  compare-against-NULL resulting in a negligible performance hit
+	  and swap data is stored as normal on the matching swap device.
+
+	  If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success.  As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache.  In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increased control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured).  These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+	frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+	frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+	frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+	frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops old = frontswap_ops;
+
+	frontswap_ops = *ops;
+	frontswap_enabled = true;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+	frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	if (frontswap_enabled)
+		(*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset.  Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+	int ret = -1, dup = 0;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		dup = 1;
+	ret = (*frontswap_ops.store)(type, offset, page);
+	if (ret == 0) {
+		frontswap_set(sis, offset);
+		inc_frontswap_succ_stores();
+		if (!dup)
+			atomic_inc(&sis->frontswap_pages);
+	} else if (dup) {
+		/*
+		  failed dup always results in automatic invalidate of
+		  the (older) page from frontswap
+		 */
+		frontswap_clear(sis, offset);
+		atomic_dec(&sis->frontswap_pages);
+		inc_frontswap_failed_stores();
+	} else
+		inc_frontswap_failed_stores();
+	if (frontswap_writethrough_enabled)
+		/* report failure so swap also writes to swap device */
+		ret = -1;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data.  Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+	int ret = -1;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		ret = (*frontswap_ops.load)(type, offset, page);
+	if (ret == 0)
+		inc_frontswap_loads();
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset)) {
+		(*frontswap_ops.invalidate_page)(type, offset);
+		atomic_dec(&sis->frontswap_pages);
+		frontswap_clear(sis, offset);
+		inc_frontswap_invalidates();
+	}
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	(*frontswap_ops.invalidate_area)(type);
+	atomic_set(&sis->frontswap_pages, 0);
+	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	int si_frontswap_pages;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, pages_to_unuse = 0;
+	int type;
+	bool locked = false;
+
+	/*
+	 * we don't want to hold swap_lock while doing a very
+	 * lengthy try_to_unuse, but swap_list may change
+	 * so restart scan from swap_list.head each time
+	 */
+	spin_lock(&swap_lock);
+	locked = true;
+	total_pages = 0;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		total_pages += atomic_read(&si->frontswap_pages);
+	}
+	if (total_pages <= target_pages)
+		goto out;
+	total_pages_to_unuse = total_pages - target_pages;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		si_frontswap_pages = atomic_read(&si->frontswap_pages);
+		if (total_pages_to_unuse < si_frontswap_pages)
+			pages = pages_to_unuse = total_pages_to_unuse;
+		else {
+			pages = si_frontswap_pages;
+			pages_to_unuse = 0; /* unuse all */
+		}
+		/* ensure there is enough RAM to fetch pages from frontswap */
+		if (security_vm_enough_memory_mm(current->mm, pages))
+			continue;
+		vm_unacct_memory(pages);
+		break;
+	}
+	if (type < 0)
+		goto out;
+	locked = false;
+	spin_unlock(&swap_lock);
+	try_to_unuse(type, true, pages_to_unuse);
+out:
+	if (locked)
+		spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices.  This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		totalpages += atomic_read(&si->frontswap_pages);
+	}
+	spin_unlock(&swap_lock);
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *root = debugfs_create_dir("frontswap", NULL);
+	if (root == NULL)
+		return -ENXIO;
+	debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+	debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+	debugfs_create_u64("failed_stores", S_IRUGO, root,
+				&frontswap_failed_stores);
+	debugfs_create_u64("invalidates", S_IRUGO,
+				root, &frontswap_invalidates);
+#endif
+	return 0;
+}
+
+module_init(init_frontswap);
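For context, the mm/frontswap.c file above is the whole backend-facing API: a backend fills in a struct frontswap_ops with init/store/load/invalidate_page/invalidate_area callbacks and hands it to frontswap_register_ops(), which returns the previously registered ops by value. The sketch below is illustrative only and is not part of this commit; the "dummy_tmem" names are hypothetical, and its store callback simply declines every page (returns -1), so swap I/O still falls through to the real device.

/*
 * Illustrative only -- not part of this commit.  A hypothetical backend
 * module wires its callbacks into frontswap; a real backend would copy
 * page data into transcendent memory in .store and back out in .load.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/frontswap.h>

static void dummy_tmem_init(unsigned type)
{
	/* per-swap-device setup (bookkeeping allocation) would go here */
}

static int dummy_tmem_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* "not stored": the swap layer writes the page itself */
}

static int dummy_tmem_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* nothing was stored, so nothing can be loaded */
}

static void dummy_tmem_invalidate_page(unsigned type, pgoff_t offset)
{
}

static void dummy_tmem_invalidate_area(unsigned type)
{
}

static struct frontswap_ops dummy_tmem_ops = {
	.init			= dummy_tmem_init,
	.store			= dummy_tmem_store,
	.load			= dummy_tmem_load,
	.invalidate_page	= dummy_tmem_invalidate_page,
	.invalidate_area	= dummy_tmem_invalidate_area,
};

static int __init dummy_tmem_init_module(void)
{
	struct frontswap_ops old = frontswap_register_ops(&dummy_tmem_ops);

	/* a non-NULL old.init means another backend was already registered */
	if (old.init != NULL)
		pr_warn("dummy_tmem: replacing an existing frontswap backend\n");
	return 0;
}
module_init(dummy_tmem_init_module);

Once registered, frontswap_enabled becomes true and swap_writepage()/swap_readpage() (see the mm/page_io.c hunks later in this diff) start offering every swap page to the backend first.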
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..d4382095f8bd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -184,7 +184,24 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 	}
 }
 
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+						phys_addr_t new_area_start,
+						phys_addr_t new_area_size)
 {
 	struct memblock_region *new_array, *old_array;
 	phys_addr_t old_size, new_size, addr;
@@ -222,7 +239,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 		new_array = kmalloc(new_size, GFP_KERNEL);
 		addr = new_array ? __pa(new_array) : 0;
 	} else {
-		addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+		/* only exclude range when trying to double reserved.regions */
+		if (type != &memblock.reserved)
+			new_area_start = new_area_size = 0;
+
+		addr = memblock_find_in_range(new_area_start + new_area_size,
+						memblock.current_limit,
+						new_size, sizeof(phys_addr_t));
+		if (!addr && new_area_size)
+			addr = memblock_find_in_range(0,
+					min(new_area_start, memblock.current_limit),
+					new_size, sizeof(phys_addr_t));
+
 		new_array = addr ? __va(addr) : 0;
 	}
 	if (!addr) {
@@ -399,7 +427,7 @@ repeat:
 	 */
 	if (!insert) {
 		while (type->cnt + nr_new > type->max)
-			if (memblock_double_array(type) < 0)
+			if (memblock_double_array(type, obase, size) < 0)
 				return -ENOMEM;
 		insert = true;
 		goto repeat;
@@ -450,7 +478,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 
 	/* we'll create at most two more regions */
 	while (type->cnt + 2 > type->max)
-		if (memblock_double_array(type) < 0)
+		if (memblock_double_array(type, base, size) < 0)
 			return -ENOMEM;
 
 	for (i = 0; i < type->cnt; i++) {
@@ -540,9 +568,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Find the first free area from *@idx which matches @nid, fill the out
  * parameters, and update *@idx for the next iteration.  The lower 32bit of
@@ -616,9 +644,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
  * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Reverse of __next_free_mem_range().
  */
@@ -867,6 +895,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 	return memblock_search(&memblock.memory, addr) != -1;
 }
 
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
 	int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +917,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 		 memblock.memory.regions[idx].size) >= end;
 }
 
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
 	memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 {
 	if (root_memcg == memcg)
 		return true;
-	if (!root_memcg->use_hierarchy)
+	if (!root_memcg->use_hierarchy || !memcg)
 		return false;
 	return css_is_ancestor(&memcg->css, &root_memcg->css);
 }
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
  *
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 
 /**
  * test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
  * @nid: the node ID to be checked.
 * @noswap : specify true here if the user wants flle only information.
  *
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE) {
-				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+						__func__, addr, end,
+						vma->vm_start,
+						vma->vm_end);
+					BUG();
+				}
+#endif
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 				goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
 /**
  * zap_page_range - remove user pages in a given range
  * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
  * @size: number of bytes to zap
  * @details: details of nonlinear truncation or shared cache invalidation
  *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!list_empty(&pagelist)) {
 		nr_failed = migrate_pages(&pagelist, new_vma_page,
 						(unsigned long)vma,
-						false, true);
+						false, MIGRATE_SYNC);
 		if (nr_failed)
 			putback_lru_pages(&pagelist);
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
 	if (file)
 		fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
-	unsigned long points;
+	long points;
+	long adj;
 
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
-	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+	adj = p->signal->oom_score_adj;
+	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
 	}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30 * totalpages / 1000;
+		adj -= 30;
 
-	/*
-	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
-	 * either completely disable oom killing or always prefer a certain
-	 * task.
-	 */
-	points += p->signal->oom_score_adj * totalpages / 1000;
+	/* Normalize to oom_score_adj units */
+	adj *= totalpages / 1000;
+	points += adj;
 
 	/*
 	 * Never return 0 for an eligible task regardless of the root bonus and
 	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 	 */
-	return points ? points : 1;
+	return points > 0 ? points : 1;
 }
 
 /*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
  * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
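The oom_kill.c hunks above switch oom_badness() to signed arithmetic: oom_score_adj (and the root bonus) are accumulated in a signed adj, scaled into page units, and only then added to the task's memory footprint, so the old unsigned subtraction can no longer wrap around. A stand-alone illustration of the new scaling, with all numbers hypothetical and not taken from the commit:

/*
 * Illustrative only -- hypothetical numbers, userspace C, mirrors the
 * arithmetic of the rewritten oom_badness().
 */
#include <stdio.h>

int main(void)
{
	long totalpages = 1000000;	/* ~4GB of RAM in 4KB pages */
	long points = 200000;		/* task rss + swap + page tables, in pages */
	long adj = -500;		/* /proc/<pid>/oom_score_adj */

	adj *= totalpages / 1000;	/* -500 * 1000 = -500000 */
	points += adj;			/* 200000 - 500000 = -300000 */

	/* same floor as "return points > 0 ? points : 1;" in the patch */
	printf("badness = %ld\n", points > 0 ? points : 1);
	return 0;
}

With these numbers the negative oom_score_adj outweighs the task's usage, so oom_badness() returns the floor value 1 and the task is effectively deprioritized for killing.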
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 
 /**
  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
  * @old: old id
  * @new: new id
  *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 /**
  * swap_cgroup_record - record mem_cgroup for this swp_entry.
  * @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
  *
  * Returns old value at success, 0 at failure.
  * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/frontswap.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
+	if (frontswap_store(page) == 0) {
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageUptodate(page));
+	if (frontswap_load(page) == 0) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 
 /**
  * walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
  * @chunk: chunk to depopulate
  * @off: offset to the area to depopulate
  * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..4ce02e0673db 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -683,10 +683,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 		mutex_lock(&shmem_swaplist_mutex);
 		/*
 		 * We needed to drop mutex to make that restrictive page
-		 * allocation; but the inode might already be freed by now,
-		 * and we cannot refer to inode or mapping or info to check.
-		 * However, we do hold page lock on the PageSwapCache page,
-		 * so can check if that still has our reference remaining.
+		 * allocation, but the inode might have been freed while we
+		 * dropped it: although a racing shmem_evict_inode() cannot
+		 * complete without emptying the radix_tree, our page lock
+		 * on this swapcache page is not enough to prevent that -
+		 * free_swap_and_cache() of our swap entry will only
+		 * trylock_page(), removing swap from radix_tree whatever.
+		 *
+		 * We must not proceed to shmem_add_to_page_cache() if the
+		 * inode has been freed, but of course we cannot rely on
+		 * inode or mapping or info to check that.  However, we can
+		 * safely check if our swap entry is still in use (and here
+		 * it can't have got reused for another page): if it's still
+		 * in use, then the inode cannot have been freed yet, and we
+		 * can safely proceed (if it's no longer in use, that tells
+		 * nothing about the inode, but we don't need to unuse swap).
 		 */
 		if (!page_swapcount(*pagep))
 			error = -ENOENT;
@@ -730,9 +741,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 
 	/*
 	 * There's a faint possibility that swap page was replaced before
-	 * caller locked it: it will come back later with the right page.
+	 * caller locked it: caller will come back later with the right page.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
 		goto out;
 
 	/*
@@ -995,21 +1006,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	newpage = shmem_alloc_page(gfp, info, index);
 	if (!newpage)
 		return -ENOMEM;
-	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
 
-	*pagep = newpage;
 	page_cache_get(newpage);
 	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
 
-	VM_BUG_ON(!PageLocked(oldpage));
 	__set_page_locked(newpage);
-	VM_BUG_ON(!PageUptodate(oldpage));
 	SetPageUptodate(newpage);
-	VM_BUG_ON(!PageSwapBacked(oldpage));
 	SetPageSwapBacked(newpage);
-	VM_BUG_ON(!swap_index);
 	set_page_private(newpage, swap_index);
-	VM_BUG_ON(!PageSwapCache(oldpage));
 	SetPageSwapCache(newpage);
 
 	/*
@@ -1019,13 +1024,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	spin_lock_irq(&swap_mapping->tree_lock);
 	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
 								   newpage);
-	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
 	spin_unlock_irq(&swap_mapping->tree_lock);
-	BUG_ON(error);
 
-	mem_cgroup_replace_page_cache(oldpage, newpage);
-	lru_cache_add_anon(newpage);
+	if (unlikely(error)) {
+		/*
+		 * Is this possible?  I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive.  Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
 
 	ClearPageSwapCache(oldpage);
 	set_page_private(oldpage, 0);
@@ -1033,7 +1049,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	unlock_page(oldpage);
 	page_cache_release(oldpage);
 	page_cache_release(oldpage);
-	return 0;
+	return error;
 }
 
 /*
@@ -1107,7 +1123,8 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
-		if (!PageSwapCache(page) || page->mapping) {
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    page->mapping) {
 			error = -EEXIST;	/* try again */
 			goto failed;
 		}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
 #include <linux/memcontrol.h>
 #include <linux/poll.h>
 #include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 static void free_swap_count_continuations(struct swap_info_struct *);
 static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
 
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 			swap_list.next = p->type;
 		nr_swap_pages++;
 		p->inuse_pages--;
+		frontswap_invalidate_page(p->type, offset);
 		if ((p->flags & SWP_BLKDEV) &&
 				disk->fops->swap_slot_free_notify)
 			disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
 }
 
 /*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
 */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-					unsigned int prev)
+					unsigned int prev, bool frontswap)
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 			prev = 0;
 			i = 1;
 		}
+		if (frontswap) {
+			if (frontswap_test(si, i))
+				break;
+			else
+				continue;
+		}
 		count = si->swap_map[i];
 		if (count && swap_count(count) != SWAP_MAP_BAD)
 			break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it. All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+		 unsigned long pages_to_unuse)
 {
 	struct swap_info_struct *si = swap_info[type];
 	struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
 	 * one pass through swap_map is enough, but not necessarily:
 	 * there are races when an instance of an entry might be missed.
 	 */
-	while ((i = find_next_to_unuse(si, i)) != 0) {
+	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
 		if (signal_pending(current)) {
 			retval = -EINTR;
 			break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
 		 * interactive performance.
 		 */
 		cond_resched();
+		if (frontswap && pages_to_unuse > 0) {
+			if (!--pages_to_unuse)
+				break;
+		}
 	}
 
 	mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
-				unsigned char *swap_map)
+				unsigned char *swap_map,
+				unsigned long *frontswap_map)
 {
 	int i, prev;
 
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	else
 		p->prio = --least_priority;
 	p->swap_map = swap_map;
+	frontswap_map_set(p, frontswap_map);
 	p->flags |= SWP_WRITEOK;
 	nr_swap_pages += p->pages;
 	total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 		swap_list.head = swap_list.next = p->type;
 	else
 		swap_info[prev]->next = p->type;
+	frontswap_init(p->type);
 	spin_unlock(&swap_lock);
 }
 
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&swap_lock);
 
 	oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
-	err = try_to_unuse(type);
+	err = try_to_unuse(type, false, 0); /* force all pages to be unused */
 	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
 
 	if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		 * sys_swapoff for this swap_info_struct at this point.
 		 */
 		/* re-insert swap space back into swap_list */
-		enable_swap_info(p, p->prio, p->swap_map);
+		enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
 		goto out_dput;
 	}
 
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
+	frontswap_invalidate_area(type);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	vfree(frontswap_map_get(p));
 	/* Destroy swap account informatin */
 	swap_cgroup_swapoff(type);
 
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 
 	/*
 	 * Find out how many pages are allowed for a single swap
-	 * device. There are three limiting factors: 1) the number
+	 * device. There are two limiting factors: 1) the number
 	 * of bits for the swap offset in the swp_entry_t type, and
 	 * 2) the number of bits in the swap pte as defined by the
-	 * the different architectures, and 3) the number of free bits
-	 * in an exceptional radix_tree entry. In order to find the
+	 * different architectures. In order to find the
 	 * largest possible bit mask, a swap entry with swap type 0
 	 * and swap offset ~0UL is created, encoded to a swap pte,
 	 * decoded to a swp_entry_t again, and finally the swap
 	 * offset is extracted. This will mask all the bits from
 	 * the initial ~0UL mask that can't be encoded in either
 	 * the swp_entry_t or the architecture definition of a
-	 * swap pte. Then the same is done for a radix_tree entry.
+	 * swap pte.
 	 */
 	maxpages = swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL))));
-	maxpages = swp_offset(radix_to_swp_entry(
-			swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
-
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
 	if (maxpages > swap_header->info.last_page) {
 		maxpages = swap_header->info.last_page + 1;
 		/* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	sector_t span;
 	unsigned long maxpages;
 	unsigned char *swap_map = NULL;
+	unsigned long *frontswap_map = NULL;
 	struct page *page = NULL;
 	struct inode *inode = NULL;
 
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		error = nr_extents;
 		goto bad_swap;
 	}
+	/* frontswap enabled? set up bit-per-page map for frontswap */
+	if (frontswap_enabled)
+		frontswap_map = vzalloc(maxpages / sizeof(long));
 
 	if (p->bdev) {
 		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (swap_flags & SWAP_FLAG_PREFER)
 		prio =
 		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
-	enable_swap_info(p, prio, swap_map);
+	enable_swap_info(p, prio, swap_map, frontswap_map);
 
 	printk(KERN_INFO "Adding %uk swap on %s.  "
-			"Priority:%d extents:%d across:%lluk %s%s\n",
+			"Priority:%d extents:%d across:%lluk %s%s%s\n",
 		p->pages<<(PAGE_SHIFT-10), name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
 		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
-		(p->flags & SWP_DISCARDABLE) ? "D" : "");
+		(p->flags & SWP_DISCARDABLE) ? "D" : "",
+		(frontswap_map) ? "FS" : "");
 
 	mutex_unlock(&swapon_mutex);
 	atomic_inc(&proc_poll_event);