diff options
| author | Dan Magenheimer <dan.magenheimer@oracle.com> | 2012-04-09 19:09:27 -0400 |
|---|---|---|
| committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-05-15 11:34:00 -0400 |
| commit | 29f233cfffe7fbc6672938117ce7e4154a2f515f (patch) | |
| tree | 480d4bc00ffc12cdd58447faecaf32df2b08043d | |
| parent | 38b5faf4b178d5279b1fca5d7dadc68881342660 (diff) | |
mm: frontswap: core frontswap functionality
This patch, 3of4, provides the core frontswap code that interfaces between
the hooks in the swap subsystem and a frontswap backend via frontswap_ops.
---
New file added: mm/frontswap.c
[v14: add support for writethrough, per suggestion by aarcange@redhat.com]
[v11: sjenning@linux.vnet.ibm.com: s/puts/failed_puts/]
[v10: sjenning@linux.vnet.ibm.com: fix debugfs calls on 32-bit]
[v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1]
[v9: akpm@linux-foundation.org: mark some statics __read_mostly]
[v9: akpm@linux-foundation.org: add clarifying comments]
[v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse]
[v9: error27@gmail.com: remove superfluous check for NULL]
[v8: rebase to 3.0-rc4]
[v8: kamezawa.hiroyu@jp.fujitsu.com: add comment to clarify find_next_to_unuse]
[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: use new static inlines, no-ops if not config'd]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: use vzalloc]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracle.com: various checks and code clarifications/comments]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[v12: Squashed s/flush/invalidate/ in]
[v15: A bit of cleanup and separate DEBUGFS]
Signed-off-by: Konrad Wilk <konrad.wilk@oracle.com>
| -rw-r--r-- | mm/frontswap.c | 314 |
1 files changed, 314 insertions, 0 deletions
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..8c0a5f8683f0 --- /dev/null +++ b/mm/frontswap.c | |||
| @@ -0,0 +1,314 @@ | |||
| 1 | /* | ||
| 2 | * Frontswap frontend | ||
| 3 | * | ||
| 4 | * This code provides the generic "frontend" layer to call a matching | ||
| 5 | * "backend" driver implementation of frontswap. See | ||
| 6 | * Documentation/vm/frontswap.txt for more information. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
| 9 | * Author: Dan Magenheimer | ||
| 10 | * | ||
| 11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/mman.h> | ||
| 16 | #include <linux/swap.h> | ||
| 17 | #include <linux/swapops.h> | ||
| 18 | #include <linux/proc_fs.h> | ||
| 19 | #include <linux/security.h> | ||
| 20 | #include <linux/capability.h> | ||
| 21 | #include <linux/module.h> | ||
| 22 | #include <linux/uaccess.h> | ||
| 23 | #include <linux/debugfs.h> | ||
| 24 | #include <linux/frontswap.h> | ||
| 25 | #include <linux/swapfile.h> | ||
| 26 | |||
| 27 | /* | ||
| 28 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
| 29 | * to the frontswap "backend" implementation functions. | ||
| 30 | */ | ||
| 31 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
| 35 | * has not been registered, so is preferred to the slower alternative: a | ||
| 36 | * function call that checks a non-global. | ||
| 37 | */ | ||
| 38 | bool frontswap_enabled __read_mostly; | ||
| 39 | EXPORT_SYMBOL(frontswap_enabled); | ||
| 40 | |||
| 41 | /* | ||
| 42 | * If enabled, frontswap_put will return failure even on success. As | ||
| 43 | * a result, the swap subsystem will always write the page to swap, in | ||
| 44 | * effect converting frontswap into a writethrough cache. In this mode, | ||
| 45 | * there is no direct reduction in swap writes, but a frontswap backend | ||
| 46 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
| 47 | * providing increased control over maximum memory usage due to frontswap. | ||
| 48 | */ | ||
| 49 | static bool frontswap_writethrough_enabled __read_mostly; | ||
| 50 | |||
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters published under /sys/kernel/debug/frontswap (when debugfs
 * is properly configured).  They are purely informational, so increments
 * are deliberately left unprotected against races.
 */
static u64 frontswap_gets;
static u64 frontswap_succ_puts;
static u64 frontswap_failed_puts;
static u64 frontswap_invalidates;

/* Count one successful "get". */
static inline void inc_frontswap_gets(void)
{
	frontswap_gets++;
}

/* Count one successful "put". */
static inline void inc_frontswap_succ_puts(void)
{
	frontswap_succ_puts++;
}

/* Count one failed "put". */
static inline void inc_frontswap_failed_puts(void)
{
	frontswap_failed_puts++;
}

/* Count one page invalidation. */
static inline void inc_frontswap_invalidates(void)
{
	frontswap_invalidates++;
}
#else
/* No debugfs: there is nothing to record, so the helpers are no-ops. */
static inline void inc_frontswap_gets(void)
{
}

static inline void inc_frontswap_succ_puts(void)
{
}

static inline void inc_frontswap_failed_puts(void)
{
}

static inline void inc_frontswap_invalidates(void)
{
}
#endif
| 80 | /* | ||
| 81 | * Register operations for frontswap, returning previous thus allowing | ||
| 82 | * detection of multiple backends and possible nesting. | ||
| 83 | */ | ||
| 84 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
| 85 | { | ||
| 86 | struct frontswap_ops old = frontswap_ops; | ||
| 87 | |||
| 88 | frontswap_ops = *ops; | ||
| 89 | frontswap_enabled = true; | ||
| 90 | return old; | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL(frontswap_register_ops); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Enable/disable frontswap writethrough (see above). | ||
| 96 | */ | ||
| 97 | void frontswap_writethrough(bool enable) | ||
| 98 | { | ||
| 99 | frontswap_writethrough_enabled = enable; | ||
| 100 | } | ||
| 101 | EXPORT_SYMBOL(frontswap_writethrough); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Called when a swap device is swapon'd. | ||
| 105 | */ | ||
| 106 | void __frontswap_init(unsigned type) | ||
| 107 | { | ||
| 108 | struct swap_info_struct *sis = swap_info[type]; | ||
| 109 | |||
| 110 | BUG_ON(sis == NULL); | ||
| 111 | if (sis->frontswap_map == NULL) | ||
| 112 | return; | ||
| 113 | if (frontswap_enabled) | ||
| 114 | (*frontswap_ops.init)(type); | ||
| 115 | } | ||
| 116 | EXPORT_SYMBOL(__frontswap_init); | ||
| 117 | |||
| 118 | /* | ||
| 119 | * "Put" data from a page to frontswap and associate it with the page's | ||
| 120 | * swaptype and offset. Page must be locked and in the swap cache. | ||
| 121 | * If frontswap already contains a page with matching swaptype and | ||
| 122 | * offset, the frontswap implmentation may either overwrite the data and | ||
| 123 | * return success or invalidate the page from frontswap and return failure. | ||
| 124 | */ | ||
| 125 | int __frontswap_put_page(struct page *page) | ||
| 126 | { | ||
| 127 | int ret = -1, dup = 0; | ||
| 128 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 129 | int type = swp_type(entry); | ||
| 130 | struct swap_info_struct *sis = swap_info[type]; | ||
| 131 | pgoff_t offset = swp_offset(entry); | ||
| 132 | |||
| 133 | BUG_ON(!PageLocked(page)); | ||
| 134 | BUG_ON(sis == NULL); | ||
| 135 | if (frontswap_test(sis, offset)) | ||
| 136 | dup = 1; | ||
| 137 | ret = (*frontswap_ops.put_page)(type, offset, page); | ||
| 138 | if (ret == 0) { | ||
| 139 | frontswap_set(sis, offset); | ||
| 140 | inc_frontswap_succ_puts(); | ||
| 141 | if (!dup) | ||
| 142 | atomic_inc(&sis->frontswap_pages); | ||
| 143 | } else if (dup) { | ||
| 144 | /* | ||
| 145 | failed dup always results in automatic invalidate of | ||
| 146 | the (older) page from frontswap | ||
| 147 | */ | ||
| 148 | frontswap_clear(sis, offset); | ||
| 149 | atomic_dec(&sis->frontswap_pages); | ||
| 150 | inc_frontswap_failed_puts(); | ||
| 151 | } else | ||
| 152 | inc_frontswap_failed_puts(); | ||
| 153 | if (frontswap_writethrough_enabled) | ||
| 154 | /* report failure so swap also writes to swap device */ | ||
| 155 | ret = -1; | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | EXPORT_SYMBOL(__frontswap_put_page); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
| 162 | * specified when the data was put to frontswap and use it to fill the | ||
| 163 | * specified page with data. Page must be locked and in the swap cache. | ||
| 164 | */ | ||
| 165 | int __frontswap_get_page(struct page *page) | ||
| 166 | { | ||
| 167 | int ret = -1; | ||
| 168 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 169 | int type = swp_type(entry); | ||
| 170 | struct swap_info_struct *sis = swap_info[type]; | ||
| 171 | pgoff_t offset = swp_offset(entry); | ||
| 172 | |||
| 173 | BUG_ON(!PageLocked(page)); | ||
| 174 | BUG_ON(sis == NULL); | ||
| 175 | if (frontswap_test(sis, offset)) | ||
| 176 | ret = (*frontswap_ops.get_page)(type, offset, page); | ||
| 177 | if (ret == 0) | ||
| 178 | inc_frontswap_gets(); | ||
| 179 | return ret; | ||
| 180 | } | ||
| 181 | EXPORT_SYMBOL(__frontswap_get_page); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
| 185 | * and offset so that a subsequent "get" will fail. | ||
| 186 | */ | ||
| 187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
| 188 | { | ||
| 189 | struct swap_info_struct *sis = swap_info[type]; | ||
| 190 | |||
| 191 | BUG_ON(sis == NULL); | ||
| 192 | if (frontswap_test(sis, offset)) { | ||
| 193 | (*frontswap_ops.invalidate_page)(type, offset); | ||
| 194 | atomic_dec(&sis->frontswap_pages); | ||
| 195 | frontswap_clear(sis, offset); | ||
| 196 | inc_frontswap_invalidates(); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Invalidate all data from frontswap associated with all offsets for the | ||
| 203 | * specified swaptype. | ||
| 204 | */ | ||
| 205 | void __frontswap_invalidate_area(unsigned type) | ||
| 206 | { | ||
| 207 | struct swap_info_struct *sis = swap_info[type]; | ||
| 208 | |||
| 209 | BUG_ON(sis == NULL); | ||
| 210 | if (sis->frontswap_map == NULL) | ||
| 211 | return; | ||
| 212 | (*frontswap_ops.invalidate_area)(type); | ||
| 213 | atomic_set(&sis->frontswap_pages, 0); | ||
| 214 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
| 215 | } | ||
| 216 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
| 217 | |||
| 218 | /* | ||
| 219 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
| 220 | * under certain circumstances; "shrink" frontswap is essentially a | ||
| 221 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
| 222 | * unuse enough frontswap pages to attempt to -- subject to memory | ||
| 223 | * constraints -- reduce the number of pages in frontswap to the | ||
| 224 | * number given in the parameter target_pages. | ||
| 225 | */ | ||
| 226 | void frontswap_shrink(unsigned long target_pages) | ||
| 227 | { | ||
| 228 | struct swap_info_struct *si = NULL; | ||
| 229 | int si_frontswap_pages; | ||
| 230 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
| 231 | unsigned long pages = 0, pages_to_unuse = 0; | ||
| 232 | int type; | ||
| 233 | bool locked = false; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * we don't want to hold swap_lock while doing a very | ||
| 237 | * lengthy try_to_unuse, but swap_list may change | ||
| 238 | * so restart scan from swap_list.head each time | ||
| 239 | */ | ||
| 240 | spin_lock(&swap_lock); | ||
| 241 | locked = true; | ||
| 242 | total_pages = 0; | ||
| 243 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 244 | si = swap_info[type]; | ||
| 245 | total_pages += atomic_read(&si->frontswap_pages); | ||
| 246 | } | ||
| 247 | if (total_pages <= target_pages) | ||
| 248 | goto out; | ||
| 249 | total_pages_to_unuse = total_pages - target_pages; | ||
| 250 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 251 | si = swap_info[type]; | ||
| 252 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
| 253 | if (total_pages_to_unuse < si_frontswap_pages) | ||
| 254 | pages = pages_to_unuse = total_pages_to_unuse; | ||
| 255 | else { | ||
| 256 | pages = si_frontswap_pages; | ||
| 257 | pages_to_unuse = 0; /* unuse all */ | ||
| 258 | } | ||
| 259 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
| 260 | if (security_vm_enough_memory_mm(current->mm, pages)) | ||
| 261 | continue; | ||
| 262 | vm_unacct_memory(pages); | ||
| 263 | break; | ||
| 264 | } | ||
| 265 | if (type < 0) | ||
| 266 | goto out; | ||
| 267 | locked = false; | ||
| 268 | spin_unlock(&swap_lock); | ||
| 269 | try_to_unuse(type, true, pages_to_unuse); | ||
| 270 | out: | ||
| 271 | if (locked) | ||
| 272 | spin_unlock(&swap_lock); | ||
| 273 | return; | ||
| 274 | } | ||
| 275 | EXPORT_SYMBOL(frontswap_shrink); | ||
| 276 | |||
| 277 | /* | ||
| 278 | * Count and return the number of frontswap pages across all | ||
| 279 | * swap devices. This is exported so that backend drivers can | ||
| 280 | * determine current usage without reading debugfs. | ||
| 281 | */ | ||
| 282 | unsigned long frontswap_curr_pages(void) | ||
| 283 | { | ||
| 284 | int type; | ||
| 285 | unsigned long totalpages = 0; | ||
| 286 | struct swap_info_struct *si = NULL; | ||
| 287 | |||
| 288 | spin_lock(&swap_lock); | ||
| 289 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 290 | si = swap_info[type]; | ||
| 291 | totalpages += atomic_read(&si->frontswap_pages); | ||
| 292 | } | ||
| 293 | spin_unlock(&swap_lock); | ||
| 294 | return totalpages; | ||
| 295 | } | ||
| 296 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
| 297 | |||
| 298 | static int __init init_frontswap(void) | ||
| 299 | { | ||
| 300 | #ifdef CONFIG_DEBUG_FS | ||
| 301 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
| 302 | if (root == NULL) | ||
| 303 | return -ENXIO; | ||
| 304 | debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets); | ||
| 305 | debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts); | ||
| 306 | debugfs_create_u64("failed_puts", S_IRUGO, root, | ||
| 307 | &frontswap_failed_puts); | ||
| 308 | debugfs_create_u64("invalidates", S_IRUGO, | ||
| 309 | root, &frontswap_invalidates); | ||
| 310 | #endif | ||
| 311 | return 0; | ||
| 312 | } | ||
| 313 | |||
| 314 | module_init(init_frontswap); | ||
