diff options
author | Dan Magenheimer <dan.magenheimer@oracle.com> | 2012-04-09 19:09:27 -0400 |
---|---|---|
committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-05-15 11:34:00 -0400 |
commit | 29f233cfffe7fbc6672938117ce7e4154a2f515f (patch) | |
tree | 480d4bc00ffc12cdd58447faecaf32df2b08043d /mm/frontswap.c | |
parent | 38b5faf4b178d5279b1fca5d7dadc68881342660 (diff) |
mm: frontswap: core frontswap functionality
This patch, 3 of 4, provides the core frontswap code that interfaces between
the hooks in the swap subsystem and a frontswap backend via frontswap_ops.
---
New file added: mm/frontswap.c
[v14: add support for writethrough, per suggestion by aarcange@redhat.com]
[v11: sjenning@linux.vnet.ibm.com: s/puts/failed_puts/]
[v10: sjenning@linux.vnet.ibm.com: fix debugfs calls on 32-bit]
[v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1]
[v9: akpm@linux-foundation.org: mark some statics __read_mostly]
[v9: akpm@linux-foundation.org: add clarifying comments]
[v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse]
[v9: error27@gmail.com: remove superfluous check for NULL]
[v8: rebase to 3.0-rc4]
[v8: kamezawa.hiroyu@jp.fujitsu.com: add comment to clarify find_next_to_unuse]
[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: use new static inlines, no-ops if not config'd]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: use vzalloc]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracle.com: various checks and code clarifications/comments]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[v12: Squashed s/flush/invalidate/ in]
[v15: A bit of cleanup and separate DEBUGFS]
Signed-off-by: Konrad Wilk <konrad.wilk@oracle.com>
Diffstat (limited to 'mm/frontswap.c')
-rw-r--r-- | mm/frontswap.c | 314 |
1 files changed, 314 insertions, 0 deletions
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..8c0a5f8683f0 --- /dev/null +++ b/mm/frontswap.c | |||
@@ -0,0 +1,314 @@ | |||
1 | /* | ||
2 | * Frontswap frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of frontswap. See | ||
6 | * Documentation/vm/frontswap.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/bitmap.h>
26 | |||
27 | /* | ||
28 | * frontswap_ops is filled in by frontswap_register_ops (copied from the | ||
29 | * caller's struct) and holds the frontswap "backend" implementation functions. | ||
30 | */ | ||
31 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
32 | |||
33 | /* | ||
34 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
35 | * has not been registered, so is preferred to the slower alternative: a | ||
36 | * function call that checks a non-global.  Set by frontswap_register_ops. | ||
37 | */ | ||
38 | bool frontswap_enabled __read_mostly; | ||
39 | EXPORT_SYMBOL(frontswap_enabled); | ||
40 | |||
41 | /* | ||
42 | * If enabled, __frontswap_put_page will return failure even on success. As | ||
43 | * a result, the swap subsystem will always write the page to swap, in | ||
44 | * effect converting frontswap into a writethrough cache. In this mode, | ||
45 | * there is no direct reduction in swap writes, but a frontswap backend | ||
46 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
47 | * providing increased control over maximum memory usage due to frontswap. | ||
48 | */ | ||
49 | static bool frontswap_writethrough_enabled __read_mostly; | ||
50 | |||
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters exported through /sys/kernel/debug/frontswap (if debugfs
 * is properly configured).  They are informational only, so the increments
 * below are plain, unprotected additions and may race.
 */
static u64 frontswap_gets;
static u64 frontswap_succ_puts;
static u64 frontswap_failed_puts;
static u64 frontswap_invalidates;

/* Account one "get". */
static inline void inc_frontswap_gets(void)
{
	frontswap_gets += 1;
}

/* Account one successful "put". */
static inline void inc_frontswap_succ_puts(void)
{
	frontswap_succ_puts += 1;
}

/* Account one failed "put". */
static inline void inc_frontswap_failed_puts(void)
{
	frontswap_failed_puts += 1;
}

/* Account one page invalidation. */
static inline void inc_frontswap_invalidates(void)
{
	frontswap_invalidates += 1;
}
#else
/* Without debugfs there are no counters; the helpers compile to nothing. */
static inline void inc_frontswap_gets(void)
{
}

static inline void inc_frontswap_succ_puts(void)
{
}

static inline void inc_frontswap_failed_puts(void)
{
}

static inline void inc_frontswap_invalidates(void)
{
}
#endif
80 | /* | ||
81 | * Register operations for frontswap, returning previous thus allowing | ||
82 | * detection of multiple backends and possible nesting. | ||
83 | */ | ||
84 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
85 | { | ||
86 | struct frontswap_ops old = frontswap_ops; | ||
87 | |||
88 | frontswap_ops = *ops; | ||
89 | frontswap_enabled = true; | ||
90 | return old; | ||
91 | } | ||
92 | EXPORT_SYMBOL(frontswap_register_ops); | ||
93 | |||
94 | /* | ||
95 | * Enable/disable frontswap writethrough (see above). | ||
96 | */ | ||
97 | void frontswap_writethrough(bool enable) | ||
98 | { | ||
99 | frontswap_writethrough_enabled = enable; | ||
100 | } | ||
101 | EXPORT_SYMBOL(frontswap_writethrough); | ||
102 | |||
103 | /* | ||
104 | * Called when a swap device is swapon'd. | ||
105 | */ | ||
106 | void __frontswap_init(unsigned type) | ||
107 | { | ||
108 | struct swap_info_struct *sis = swap_info[type]; | ||
109 | |||
110 | BUG_ON(sis == NULL); | ||
111 | if (sis->frontswap_map == NULL) | ||
112 | return; | ||
113 | if (frontswap_enabled) | ||
114 | (*frontswap_ops.init)(type); | ||
115 | } | ||
116 | EXPORT_SYMBOL(__frontswap_init); | ||
117 | |||
118 | /* | ||
119 | * "Put" data from a page to frontswap and associate it with the page's | ||
120 | * swaptype and offset. Page must be locked and in the swap cache. | ||
121 | * If frontswap already contains a page with matching swaptype and | ||
122 | * offset, the frontswap implmentation may either overwrite the data and | ||
123 | * return success or invalidate the page from frontswap and return failure. | ||
124 | */ | ||
125 | int __frontswap_put_page(struct page *page) | ||
126 | { | ||
127 | int ret = -1, dup = 0; | ||
128 | swp_entry_t entry = { .val = page_private(page), }; | ||
129 | int type = swp_type(entry); | ||
130 | struct swap_info_struct *sis = swap_info[type]; | ||
131 | pgoff_t offset = swp_offset(entry); | ||
132 | |||
133 | BUG_ON(!PageLocked(page)); | ||
134 | BUG_ON(sis == NULL); | ||
135 | if (frontswap_test(sis, offset)) | ||
136 | dup = 1; | ||
137 | ret = (*frontswap_ops.put_page)(type, offset, page); | ||
138 | if (ret == 0) { | ||
139 | frontswap_set(sis, offset); | ||
140 | inc_frontswap_succ_puts(); | ||
141 | if (!dup) | ||
142 | atomic_inc(&sis->frontswap_pages); | ||
143 | } else if (dup) { | ||
144 | /* | ||
145 | failed dup always results in automatic invalidate of | ||
146 | the (older) page from frontswap | ||
147 | */ | ||
148 | frontswap_clear(sis, offset); | ||
149 | atomic_dec(&sis->frontswap_pages); | ||
150 | inc_frontswap_failed_puts(); | ||
151 | } else | ||
152 | inc_frontswap_failed_puts(); | ||
153 | if (frontswap_writethrough_enabled) | ||
154 | /* report failure so swap also writes to swap device */ | ||
155 | ret = -1; | ||
156 | return ret; | ||
157 | } | ||
158 | EXPORT_SYMBOL(__frontswap_put_page); | ||
159 | |||
160 | /* | ||
161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
162 | * specified when the data was put to frontswap and use it to fill the | ||
163 | * specified page with data. Page must be locked and in the swap cache. | ||
164 | */ | ||
165 | int __frontswap_get_page(struct page *page) | ||
166 | { | ||
167 | int ret = -1; | ||
168 | swp_entry_t entry = { .val = page_private(page), }; | ||
169 | int type = swp_type(entry); | ||
170 | struct swap_info_struct *sis = swap_info[type]; | ||
171 | pgoff_t offset = swp_offset(entry); | ||
172 | |||
173 | BUG_ON(!PageLocked(page)); | ||
174 | BUG_ON(sis == NULL); | ||
175 | if (frontswap_test(sis, offset)) | ||
176 | ret = (*frontswap_ops.get_page)(type, offset, page); | ||
177 | if (ret == 0) | ||
178 | inc_frontswap_gets(); | ||
179 | return ret; | ||
180 | } | ||
181 | EXPORT_SYMBOL(__frontswap_get_page); | ||
182 | |||
183 | /* | ||
184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
185 | * and offset so that a subsequent "get" will fail. | ||
186 | */ | ||
187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
188 | { | ||
189 | struct swap_info_struct *sis = swap_info[type]; | ||
190 | |||
191 | BUG_ON(sis == NULL); | ||
192 | if (frontswap_test(sis, offset)) { | ||
193 | (*frontswap_ops.invalidate_page)(type, offset); | ||
194 | atomic_dec(&sis->frontswap_pages); | ||
195 | frontswap_clear(sis, offset); | ||
196 | inc_frontswap_invalidates(); | ||
197 | } | ||
198 | } | ||
199 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
200 | |||
201 | /* | ||
202 | * Invalidate all data from frontswap associated with all offsets for the | ||
203 | * specified swaptype. | ||
204 | */ | ||
205 | void __frontswap_invalidate_area(unsigned type) | ||
206 | { | ||
207 | struct swap_info_struct *sis = swap_info[type]; | ||
208 | |||
209 | BUG_ON(sis == NULL); | ||
210 | if (sis->frontswap_map == NULL) | ||
211 | return; | ||
212 | (*frontswap_ops.invalidate_area)(type); | ||
213 | atomic_set(&sis->frontswap_pages, 0); | ||
214 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
215 | } | ||
216 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
217 | |||
218 | /* | ||
219 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
220 | * under certain circumstances; "shrink" frontswap is essentially a | ||
221 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
222 | * unuse enough frontswap pages to attempt to -- subject to memory | ||
223 | * constraints -- reduce the number of pages in frontswap to the | ||
224 | * number given in the parameter target_pages. | ||
225 | */ | ||
226 | void frontswap_shrink(unsigned long target_pages) | ||
227 | { | ||
228 | struct swap_info_struct *si = NULL; | ||
229 | int si_frontswap_pages; | ||
230 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
231 | unsigned long pages = 0, pages_to_unuse = 0; | ||
232 | int type; | ||
233 | bool locked = false; | ||
234 | |||
235 | /* | ||
236 | * we don't want to hold swap_lock while doing a very | ||
237 | * lengthy try_to_unuse, but swap_list may change | ||
238 | * so restart scan from swap_list.head each time | ||
239 | */ | ||
240 | spin_lock(&swap_lock); | ||
241 | locked = true; | ||
242 | total_pages = 0; | ||
243 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
244 | si = swap_info[type]; | ||
245 | total_pages += atomic_read(&si->frontswap_pages); | ||
246 | } | ||
247 | if (total_pages <= target_pages) | ||
248 | goto out; | ||
249 | total_pages_to_unuse = total_pages - target_pages; | ||
250 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
251 | si = swap_info[type]; | ||
252 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
253 | if (total_pages_to_unuse < si_frontswap_pages) | ||
254 | pages = pages_to_unuse = total_pages_to_unuse; | ||
255 | else { | ||
256 | pages = si_frontswap_pages; | ||
257 | pages_to_unuse = 0; /* unuse all */ | ||
258 | } | ||
259 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
260 | if (security_vm_enough_memory_mm(current->mm, pages)) | ||
261 | continue; | ||
262 | vm_unacct_memory(pages); | ||
263 | break; | ||
264 | } | ||
265 | if (type < 0) | ||
266 | goto out; | ||
267 | locked = false; | ||
268 | spin_unlock(&swap_lock); | ||
269 | try_to_unuse(type, true, pages_to_unuse); | ||
270 | out: | ||
271 | if (locked) | ||
272 | spin_unlock(&swap_lock); | ||
273 | return; | ||
274 | } | ||
275 | EXPORT_SYMBOL(frontswap_shrink); | ||
276 | |||
277 | /* | ||
278 | * Count and return the number of frontswap pages across all | ||
279 | * swap devices. This is exported so that backend drivers can | ||
280 | * determine current usage without reading debugfs. | ||
281 | */ | ||
282 | unsigned long frontswap_curr_pages(void) | ||
283 | { | ||
284 | int type; | ||
285 | unsigned long totalpages = 0; | ||
286 | struct swap_info_struct *si = NULL; | ||
287 | |||
288 | spin_lock(&swap_lock); | ||
289 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
290 | si = swap_info[type]; | ||
291 | totalpages += atomic_read(&si->frontswap_pages); | ||
292 | } | ||
293 | spin_unlock(&swap_lock); | ||
294 | return totalpages; | ||
295 | } | ||
296 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
297 | |||
298 | static int __init init_frontswap(void) | ||
299 | { | ||
300 | #ifdef CONFIG_DEBUG_FS | ||
301 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
302 | if (root == NULL) | ||
303 | return -ENXIO; | ||
304 | debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets); | ||
305 | debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts); | ||
306 | debugfs_create_u64("failed_puts", S_IRUGO, root, | ||
307 | &frontswap_failed_puts); | ||
308 | debugfs_create_u64("invalidates", S_IRUGO, | ||
309 | root, &frontswap_invalidates); | ||
310 | #endif | ||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | module_init(init_frontswap); | ||