author     Hugh Dickins <hugh.dickins@tiscali.co.uk>        2009-12-14 20:58:46 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-12-15 11:53:15 -0500
commit     570a335b8e22579e2a51a68136d2b1f907a20eec (patch)
tree       c5312383e948d2e7ac60c2fa410fee98e8b38a70 /mm/swapfile.c
parent     8d69aaee80c123b460918816cbfa2e83224c3646 (diff)
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).

swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)

We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.

Swapping of KSM pages has not yet been enabled; but it is implemented, and
makes it very easy for a user to overflow the maximum swap count: possible
with ordinary process pages, but unlikely, even when pid_max has been
raised from PID_MAX_DEFAULT.

This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
swap_map page, and this is used to hold the continuation counts for that
entry and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to a
continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
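[Editor's note] The "digit" scheme described above can be pictured with a small userspace toy model (illustration only, not code from this patch: the constant values and the toy_* names are made up for the example; the kernel's real limits live in include/linux/swap.h). A swap count is a low digit kept in swap_map plus higher digits kept in continuation pages, and duplicating past the low digit's maximum carries upward, like adding 1 to 999.

/* toy_count.c: userspace sketch of swap count continuation "digits" */
#include <stdio.h>

#define MAP_MAX    62   /* capacity of the low digit (one swap_map byte) */
#define CONT_MAX  127   /* capacity of each continuation digit           */
#define NDIGITS     3   /* continuation digits modelled here             */

struct toy_count {
        unsigned base;                  /* models swap_map[offset]       */
        unsigned cont[NDIGITS];         /* models the continuation pages */
};

/* One duplicate: carry through full digits, like adding 1 to 999. */
static int toy_dup(struct toy_count *c)
{
        if (c->base < MAP_MAX) {
                c->base++;
                return 0;
        }
        c->base = 0;                            /* low digit wraps ...      */
        for (int i = 0; i < NDIGITS; i++) {     /* ... carry into the next  */
                if (c->cont[i] < CONT_MAX) {
                        c->cont[i]++;
                        return 0;
                }
                c->cont[i] = 0;
        }
        return -1;      /* would need yet another continuation page */
}

static unsigned long toy_total(const struct toy_count *c)
{
        unsigned long total = c->base, radix = MAP_MAX + 1;

        for (int i = 0; i < NDIGITS; i++) {
                total += c->cont[i] * radix;
                radix *= CONT_MAX + 1;
        }
        return total;
}

int main(void)
{
        struct toy_count c = { 0 };

        for (int i = 0; i < 100000; i++)
                toy_dup(&c);
        printf("%lu = base %u + digits %u %u %u\n", toy_total(&c),
               c.base, c.cont[0], c.cont[1], c.cont[2]);
        return 0;
}

With these toy limits, 100000 references decompose as 19 + 51*63 + 12*63*128; freeing borrows back down through the digits the same way, which is what swap_count_continued() below implements over the list of continuation pages.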
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--   mm/swapfile.c   304
1 file changed, 250 insertions(+), 54 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0d7b9ed0c16..cc5e7ebf2d2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+                                 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
-        return ent & ~SWAP_HAS_CACHE;
+        return ent & ~SWAP_HAS_CACHE;   /* may include SWAP_HAS_CONT flag */
 }
 
 /* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
         if (usage == SWAP_HAS_CACHE) {
                 VM_BUG_ON(!has_cache);
                 has_cache = 0;
-        } else if (count < SWAP_MAP_MAX)
-                count--;
+        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+                if (count == COUNT_CONTINUED) {
+                        if (swap_count_continued(p, offset, count))
+                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
+                        else
+                                count = SWAP_MAP_MAX;
+                } else
+                        count--;
+        }
 
         if (!count)
                 mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
         swp_entry_t entry;
         unsigned int i = 0;
         int retval = 0;
-        int reset_overflow = 0;
         int shmem;
 
         /*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type)
          * together, child after parent. If we race with dup_mmap(), we
          * prefer to resolve parent before child, lest we miss entries
          * duplicated after we scanned child: using last mm would invert
-         * that. Though it's only a serious concern when an overflowed
-         * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+         * that.
          */
         start_mm = &init_mm;
         atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@ static int try_to_unuse(unsigned int type)
         }
 
         /*
-         * How could swap count reach 0x7ffe ?
-         * There's no way to repeat a swap page within an mm
-         * (except in shmem, where it's the shared object which takes
-         * the reference count)?
-         * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-         * short is too small....)
-         * If that's wrong, then we should worry more about
-         * exit_mmap() and do_munmap() cases described above:
-         * we might be resetting SWAP_MAP_MAX too early here.
-         *
-         * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-         * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-         * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-         * much easier to reach.  But the next patch will fix that.
-         *
-         * We know "Undead"s can happen, they're okay, so don't
-         * report them; but do report if we reset SWAP_MAP_MAX.
-         */
-        /* We might release the lock_page() in unuse_mm(). */
-        if (!PageSwapCache(page) || page_private(page) != entry.val)
-                goto retry;
-
-        if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-                spin_lock(&swap_lock);
-                *swap_map = SWAP_HAS_CACHE;
-                spin_unlock(&swap_lock);
-                reset_overflow = 1;
-        }
-
-        /*
          * If a reference remains (rare), we would like to leave
          * the page in the swap cache; but try_to_unmap could
          * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type)
          * mark page dirty so shrink_page_list will preserve it.
          */
         SetPageDirty(page);
-retry:
         unlock_page(page);
         page_cache_release(page);
 
@@ -1247,10 +1226,6 @@ retry:
         }
 
         mmput(start_mm);
-        if (reset_overflow) {
-                printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-                swap_overflow = 0;
-        }
         return retval;
 }
 
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         up_write(&swap_unplug_sem);
 
         destroy_swap_extents(p);
+        if (p->flags & SWP_CONTINUED)
+                free_swap_count_continuations(p);
+
         mutex_lock(&swapon_mutex);
         spin_lock(&swap_lock);
         drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 
         } else if (count || has_cache) {
 
-                if (count < SWAP_MAP_MAX - 1)
-                        count++;
-                else if (count <= SWAP_MAP_MAX) {
-                        if (swap_overflow++ < 5)
-                                printk(KERN_WARNING
-                                       "swap_dup: swap entry overflow\n");
-                        count = SWAP_MAP_MAX;
-                } else
+                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+                        count += usage;
+                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
                         err = -EINVAL;
+                else if (swap_count_continued(p, offset, count))
+                        count = COUNT_CONTINUED;
+                else
+                        err = -ENOMEM;
         } else
                 err = -ENOENT;                  /* unused swap entry */
 
@@ -2153,9 +2129,13 @@ bad_file:
 /*
  * increase reference count of swap entry by 1.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-        __swap_duplicate(entry, 1);
+        int err = 0;
+
+        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+                err = add_swap_count_continuation(entry, GFP_ATOMIC);
+        return err;
 }
 
 /*
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
         *offset = ++toff;
         return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+        struct swap_info_struct *si;
+        struct page *head;
+        struct page *page;
+        struct page *list_page;
+        pgoff_t offset;
+        unsigned char count;
+
+        /*
+         * When debugging, it's easier to use __GFP_ZERO here; but it's better
+         * for latency not to zero a page while GFP_ATOMIC and holding locks.
+         */
+        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+        si = swap_info_get(entry);
+        if (!si) {
+                /*
+                 * An acceptable race has occurred since the failing
+                 * __swap_duplicate(): the swap entry has been freed,
+                 * perhaps even the whole swap_map cleared for swapoff.
+                 */
+                goto outer;
+        }
+
+        offset = swp_offset(entry);
+        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+                /*
+                 * The higher the swap count, the more likely it is that tasks
+                 * will race to add swap count continuation: we need to avoid
+                 * over-provisioning.
+                 */
+                goto out;
+        }
+
+        if (!page) {
+                spin_unlock(&swap_lock);
+                return -ENOMEM;
+        }
+
+        /*
+         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+         * no architecture is using highmem pages for kernel pagetables: so it
+         * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+         */
+        head = vmalloc_to_page(si->swap_map + offset);
+        offset &= ~PAGE_MASK;
+
+        /*
+         * Page allocation does not initialize the page's lru field,
+         * but it does always reset its private field.
+         */
+        if (!page_private(head)) {
+                BUG_ON(count & COUNT_CONTINUED);
+                INIT_LIST_HEAD(&head->lru);
+                set_page_private(head, SWP_CONTINUED);
+                si->flags |= SWP_CONTINUED;
+        }
+
+        list_for_each_entry(list_page, &head->lru, lru) {
+                unsigned char *map;
+
+                /*
+                 * If the previous map said no continuation, but we've found
+                 * a continuation page, free our allocation and use this one.
+                 */
+                if (!(count & COUNT_CONTINUED))
+                        goto out;
+
+                map = kmap_atomic(list_page, KM_USER0) + offset;
+                count = *map;
+                kunmap_atomic(map, KM_USER0);
+
+                /*
+                 * If this continuation count now has some space in it,
+                 * free our allocation and use this one.
+                 */
+                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+                        goto out;
+        }
+
+        list_add_tail(&page->lru, &head->lru);
+        page = NULL;                    /* now it's attached, don't free it */
+out:
+        spin_unlock(&swap_lock);
+outer:
+        if (page)
+                __free_page(page);
+        return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+                                 pgoff_t offset, unsigned char count)
+{
+        struct page *head;
+        struct page *page;
+        unsigned char *map;
+
+        head = vmalloc_to_page(si->swap_map + offset);
+        if (page_private(head) != SWP_CONTINUED) {
+                BUG_ON(count & COUNT_CONTINUED);
+                return false;           /* need to add count continuation */
+        }
+
+        offset &= ~PAGE_MASK;
+        page = list_entry(head->lru.next, struct page, lru);
+        map = kmap_atomic(page, KM_USER0) + offset;
+
+        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
+                goto init_map;          /* jump over SWAP_CONT_MAX checks */
+
+        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+                /*
+                 * Think of how you add 1 to 999
+                 */
+                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        BUG_ON(page == head);
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                }
+                if (*map == SWAP_CONT_MAX) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        if (page == head)
+                                return false;   /* add count continuation */
+                        map = kmap_atomic(page, KM_USER0) + offset;
+init_map:               *map = 0;               /* we didn't zero the page */
+                }
+                *map += 1;
+                kunmap_atomic(map, KM_USER0);
+                page = list_entry(page->lru.prev, struct page, lru);
+                while (page != head) {
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                        *map = COUNT_CONTINUED;
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.prev, struct page, lru);
+                }
+                return true;                    /* incremented */
+
+        } else {                                /* decrementing */
+                /*
+                 * Think of how you subtract 1 from 1000
+                 */
+                BUG_ON(count != COUNT_CONTINUED);
+                while (*map == COUNT_CONTINUED) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        BUG_ON(page == head);
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                }
+                BUG_ON(*map == 0);
+                *map -= 1;
+                if (*map == 0)
+                        count = 0;
+                kunmap_atomic(map, KM_USER0);
+                page = list_entry(page->lru.prev, struct page, lru);
+                while (page != head) {
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                        *map = SWAP_CONT_MAX | count;
+                        count = COUNT_CONTINUED;
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.prev, struct page, lru);
+                }
+                return count == COUNT_CONTINUED;
+        }
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+        pgoff_t offset;
+
+        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+                struct page *head;
+                head = vmalloc_to_page(si->swap_map + offset);
+                if (page_private(head)) {
+                        struct list_head *this, *next;
+                        list_for_each_safe(this, next, &head->lru) {
+                                struct page *page;
+                                page = list_entry(this, struct page, lru);
+                                list_del(this);
+                                __free_page(page);
+                        }
+                }
+        }
+}
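
[Editor's note] The locking protocol in the comment above add_swap_count_continuation() implies a caller pattern roughly like the kernel-context sketch below (illustration only, not code from this patch: copy_swap_pte_locked() is a hypothetical helper name, and the swap_duplicate()/add_swap_count_continuation() declarations are assumed to come from the include/linux/swap.h half of this series).

#include <linux/swap.h>
#include <linux/spinlock.h>

/* Hypothetical caller: duplicate a swap entry found while holding a pte lock. */
static int copy_swap_pte_locked(swp_entry_t entry, spinlock_t *ptl)
{
        int err;

        spin_lock(ptl);
        err = swap_duplicate(entry);    /* retries with GFP_ATOMIC internally */
        spin_unlock(ptl);

        if (err == -ENOMEM) {
                /* Now that we may sleep, preallocate the continuation page. */
                err = add_swap_count_continuation(entry, GFP_KERNEL);
                /*
                 * On success the caller retakes the lock, revalidates the
                 * pte and calls swap_duplicate() again, which will now find
                 * room (or the entry has meanwhile been freed).
                 */
        }
        return err;
}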