Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c | 304
1 file changed, 250 insertions, 54 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0d7b9ed0c16..cc5e7ebf2d2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+				 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
-	return ent & ~SWAP_HAS_CACHE;
+	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
 /* returns 1 if swap entry is freed */
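The swap_count() tweak above is worth unpacking: each swap_map byte now packs a swap-cache flag, a continuation flag (SWAP_HAS_CONT, used as COUNT_CONTINUED in this file) and a bounded reference count. The helper strips only the cache bit, so a continued count is reported with its flag still set. A minimal sketch of that layout, assuming the include/linux/swap.h values this series is built around (they are not shown in this file's diff):

/* Sketch only: these constants mirror what include/linux/swap.h is assumed
 * to define for this series; they are not part of the diff above. */
#define SWAP_HAS_CACHE	0x40	/* entry also has a swap-cache reference */
#define COUNT_CONTINUED	0x80	/* count carries on into a continuation page */
#define SWAP_MAP_MAX	0x3e	/* largest count held in swap_map itself */

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may still include COUNT_CONTINUED */
}

Read that way, a byte of 0xbe means "SWAP_MAP_MAX references here, more in the continuation chain", while 0x41 is one reference plus a swap-cache hold.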
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
 		has_cache = 0;
-	} else if (count < SWAP_MAP_MAX)
-		count--;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
+	}
 
 	if (!count)
 		mem_cgroup_uncharge_swap(entry);
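The rewritten free path above treats the swap_map byte as the low digit of a larger number. When that digit is already zero but COUNT_CONTINUED is set, swap_count_continued() borrows from the continuation pages and the digit is reloaded as SWAP_MAP_MAX, keeping the flag only if the chain still holds more. A userspace model of just that branch; borrow_from_continuation() is a hypothetical stand-in for swap_count_continued():

/* Illustrative model of the decrement path, not kernel code. */
#define COUNT_CONTINUED	0x80
#define SWAP_MAP_MAX	0x3e

static unsigned char decrement_count(unsigned char count,
				     int (*borrow_from_continuation)(void))
{
	if (count == COUNT_CONTINUED) {		/* low digit already 0 */
		if (borrow_from_continuation())	/* chain still holds more */
			count = SWAP_MAP_MAX | COUNT_CONTINUED;
		else				/* chain drained: flag drops */
			count = SWAP_MAP_MAX;
	} else
		count--;
	return count;
}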
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
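Concretely, once a count has spilled into a continuation, page_swapcount() reports only the low digit with the COUNT_CONTINUED bit still set (0xbe with the values assumed above), whatever the true total is; 0 and 1 stay exact because the flag never accompanies a small count. Broadly, the callers in this file only ask "none, exactly one, or more than one", which the inexact large value still answers correctly. A hypothetical helper, purely to make that point:

/* Hypothetical illustration, not a kernel helper: the decisions callers
 * draw from page_swapcount() survive the inexact "continued" values. */
static int swap_count_class(int swapcount)
{
	if (swapcount == 0)
		return 0;	/* exact: no longer swapped out anywhere */
	if (swapcount == 1)
		return 1;	/* exact: single user, reuse is possible */
	return 2;		/* "many": may read as 0xbe whatever the total */
}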
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
 	int shmem;
 
 	/*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type)
 	 * together, child after parent. If we race with dup_mmap(), we
 	 * prefer to resolve parent before child, lest we miss entries
 	 * duplicated after we scanned child: using last mm would invert
-	 * that. Though it's only a serious concern when an overflowed
-	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 * that.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7ffe ?
-		 * There's no way to repeat a swap page within an mm
-		 * (except in shmem, where it's the shared object which takes
-		 * the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-		 * short is too small....)
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 *
-		 * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-		 * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-		 * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-		 * much easier to reach. But the next patch will fix that.
-		 *
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		/* We might release the lock_page() in unuse_mm(). */
-		if (!PageSwapCache(page) || page_private(page) != entry.val)
-			goto retry;
-
-		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = SWAP_HAS_CACHE;
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
 		 * If a reference remains (rare), we would like to leave
 		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
-retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1247,10 +1226,6 @@ retry:
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 
 	} else if (count || has_cache) {
 
-		if (count < SWAP_MAP_MAX - 1)
-			count++;
-		else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
-				       "swap_dup: swap entry overflow\n");
-			count = SWAP_MAP_MAX;
-		} else
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
 			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
 		err = -ENOENT;			/* unused swap entry */
 
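The rewritten branch above partitions the swap_map digit three ways, and the new -ENOMEM return is the hook for the continuation machinery: it tells the caller that the in-place digit is full and a continuation page must be allocated before the duplicate can be retried. A small classification sketch; SWAP_MAP_BAD is an assumed swap.h value, not shown in this diff:

/* Illustration of how the three tests partition the possible digits. */
#define COUNT_CONTINUED	0x80
#define SWAP_MAP_MAX	0x3e
#define SWAP_MAP_BAD	0x3f	/* assumed: marks an unusable slot */

static const char *classify(unsigned char count)
{
	unsigned char digit = count & ~COUNT_CONTINUED;

	if (digit < SWAP_MAP_MAX)
		return "room left: just add the new reference";
	if (digit > SWAP_MAP_MAX)
		return "special value (e.g. SWAP_MAP_BAD): -EINVAL";
	return "digit full: carry into a continuation page, or -ENOMEM";
}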
@@ -2153,9 +2129,13 @@ bad_file:
 /*
  * increase reference count of swap entry by 1.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, 1);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
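Because the continuation page inside swap_duplicate() is allocated with GFP_ATOMIC, the call can now fail, and its callers have to cope. That side of the contract lives in other files of this series (this page only shows mm/swapfile.c); a hedged sketch of the intended pattern, with the caller's lock handling reduced to comments and dup_swap_entry() purely hypothetical:

/* Hedged sketch, not the actual caller code from this series. */
static int dup_swap_entry(swp_entry_t entry)
{
	if (swap_duplicate(entry) < 0) {
		/* GFP_ATOMIC failed: drop the caller's spinlock here ... */
		if (add_swap_count_continuation(entry, GFP_KERNEL))
			return -ENOMEM;		/* genuinely out of memory */
		/* ... retake the lock; the retry below now has room */
		return swap_duplicate(entry);
	}
	return 0;
}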
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 	*offset = ++toff;
 	return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&swap_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel pagetables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page, KM_USER0) + offset;
+		count = *map;
+		kunmap_atomic(map, KM_USER0);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&swap_lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
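Taken together, the swap_map byte and the chain of continuation bytes behave like a little-endian multi-digit number: the map itself holds up to SWAP_MAP_MAX references, and each continuation page contributes one digit of up to SWAP_CONT_MAX, with COUNT_CONTINUED meaning "more digits follow". A userspace model of that reading, assuming the swap.h values used by this series (SWAP_MAP_MAX = 0x3e, SWAP_CONT_MAX = 0x7f) and with digits[] standing in for the per-offset bytes of the chained pages, lowest digit first:

/* Illustrative only: how the full reference count would be reconstructed. */
#define COUNT_CONTINUED	0x80
#define SWAP_MAP_MAX	0x3e	/* assumed swap.h value */
#define SWAP_CONT_MAX	0x7f	/* assumed swap.h value */

static unsigned long effective_count(unsigned char map,
				     const unsigned char *digits, int n)
{
	unsigned long total = map & ~COUNT_CONTINUED;	/* low digit */
	unsigned long unit = 1;
	int i;

	for (i = 0; i < n && ((i ? digits[i - 1] : map) & COUNT_CONTINUED); i++) {
		total += (unsigned long)(digits[i] & ~COUNT_CONTINUED) * unit;
		unit *= SWAP_CONT_MAX + 1;	/* next digit weighs 128x more */
	}
	return total;
}

With those values a single continuation page already covers 62 + 127 = 189 references per entry, and every further page in the chain multiplies the reach by 128.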
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;		/* need to add count continuation */
+	}
+
+	offset &= ~PAGE_MASK;
+	page = list_entry(head->lru.next, struct page, lru);
+	map = kmap_atomic(page, KM_USER0) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			if (page == head)
+				return false;	/* add count continuation */
+			map = kmap_atomic(page, KM_USER0) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return count == COUNT_CONTINUED;
+	}
+}
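The "add 1 to 999" comment above is literal: the increment loop walks toward more significant continuation digits until it finds one with room (or runs out and asks for a new page), then walks back resetting the lower digits; the decrement side is the matching borrow. Stripped of the kmap bookkeeping and the COUNT_CONTINUED flag maintenance, the arithmetic is just the following (illustrative only, digits[] lowest digit first):

/* Decimal-style model of the carry/borrow in swap_count_continued(). */
#define CONT_MAX 9	/* stands in for SWAP_CONT_MAX */

static int carry_add_one(unsigned char *digits, int n)
{
	int i = 0;

	while (i < n && digits[i] == CONT_MAX)	/* 999 -> 000, keep walking */
		digits[i++] = 0;
	if (i == n)
		return 0;	/* no room: caller must chain a new page */
	digits[i] += 1;		/* 999 + 1 -> 1000 */
	return 1;
}

static int borrow_sub_one(unsigned char *digits, int n)
{
	int i = 0;

	while (i < n && digits[i] == 0)		/* 1000 -> 0999, keep walking */
		digits[i++] = CONT_MAX;
	if (i == n)
		return 0;	/* chain was already zero */
	digits[i] -= 1;		/* 1000 - 1 -> 0999 */
	return 1;
}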
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct list_head *this, *next;
+			list_for_each_safe(this, next, &head->lru) {
+				struct page *page;
+				page = list_entry(this, struct page, lru);
+				list_del(this);
+				__free_page(page);
+			}
+		}
+	}
+}
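Finally, the PAGE_SIZE stride in the loop above mirrors how continuation data is addressed throughout the patch: one head page covers PAGE_SIZE consecutive swap_map bytes (one byte per swap entry), and an entry's continuation byte sits at the same offset inside every page chained off that head. A sketch of that convention; continuation_head() is a hypothetical helper, not part of the patch:

/* One lookup per PAGE_SIZE entries, one byte per entry in each chained page. */
static struct page *continuation_head(struct swap_info_struct *si,
				      pgoff_t offset, unsigned long *slot)
{
	*slot = offset & ~PAGE_MASK;	/* index within every chained page */
	return vmalloc_to_page(si->swap_map + offset);	/* head for this group */
}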