-rw-r--r--   include/linux/swap.h |  22
-rw-r--r--   mm/memory.c          |  19
-rw-r--r--   mm/rmap.c            |   6
-rw-r--r--   mm/swapfile.c        | 304
4 files changed, 287 insertions, 64 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f733deb10748..389e7bd92cca 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -145,15 +145,18 @@ enum {
 	SWP_DISCARDABLE = (1 << 2),	/* blkdev supports discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
+	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
 					/* add others here before... */
 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32
 
-#define SWAP_MAP_MAX	0x7e
-#define SWAP_MAP_BAD	0x7f
-#define SWAP_HAS_CACHE	0x80	/* There is a swap cache of entry. */
+#define SWAP_MAP_MAX	0x3e	/* Max duplication count, in first swap_map */
+#define SWAP_MAP_BAD	0x3f	/* Note pageblock is bad, in first swap_map */
+#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
+#define SWAP_CONT_MAX	0x7f	/* Max count, in each swap_map continuation */
+#define COUNT_CONTINUED	0x80	/* See swap_map continuation for full count */
 
 /*
  * The in-memory structure used to track swap areas.
@@ -311,9 +314,10 @@ extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
-extern void swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
+extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+extern int swap_duplicate(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free(swp_entry_t, struct page *page);
 extern int free_swap_and_cache(swp_entry_t);
@@ -385,8 +389,14 @@ static inline void show_swap_cache_info(void)
 #define free_swap_and_cache(swp)	is_migration_entry(swp)
 #define swapcache_prepare(swp)		is_migration_entry(swp)
 
-static inline void swap_duplicate(swp_entry_t swp)
+static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
+	return 0;
+}
+
+static inline int swap_duplicate(swp_entry_t swp)
+{
+	return 0;
 }
 
 static inline void swap_free(swp_entry_t swp)
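The redefined constants above pack three things into each swap_map byte: a 6-bit duplication count (0 through SWAP_MAP_MAX, with SWAP_MAP_BAD marking bad slots), the SWAP_HAS_CACHE flag, and the new COUNT_CONTINUED flag saying the rest of the count lives in a continuation page. Below is a minimal user-space sketch of that layout, not part of the patch, using only the constant values from this hunk; the decode() helper is invented for illustration, while in the kernel the masking is done by swap_count() and __swap_duplicate().

/* decode_swap_map.c - illustrative only; mirrors the constants above */
#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* max duplication count held in first swap_map */
#define SWAP_MAP_BAD	0x3f	/* bad pageblock marker */
#define SWAP_HAS_CACHE	0x40	/* entry has a swap cache page */
#define COUNT_CONTINUED	0x80	/* more count held in a continuation page */

static void decode(unsigned char ent)
{
	/* low 6 bits: count digit held directly in the first swap_map byte */
	unsigned char count = ent & ~(SWAP_HAS_CACHE | COUNT_CONTINUED);

	printf("0x%02x: count=%u cache=%d continued=%d%s\n",
	       ent, count,
	       !!(ent & SWAP_HAS_CACHE),
	       !!(ent & COUNT_CONTINUED),
	       count == SWAP_MAP_BAD ? " (bad slot)" : "");
}

int main(void)
{
	decode(0x01);				/* mapped once */
	decode(SWAP_HAS_CACHE | 0x02);		/* mapped twice, plus swap cache */
	decode(SWAP_MAP_MAX);			/* at the limit of the first byte */
	decode(COUNT_CONTINUED);		/* low digit 0, rest in continuation */
	return 0;
}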
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..543c446bf4ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
  * covered by this vma.
  */
 
-static inline void
+static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (!pte_file(pte)) {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0)
+				return entry.val;
+
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
+	swp_entry_t entry = (swp_entry_t){0};
 
 again:
 	rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+							vma, addr, rss);
+		if (entry.val)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -684,6 +691,12 @@ again:
 	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
+
+	if (entry.val) {
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+			return -ENOMEM;
+		progress = 0;
+	}
 	if (addr != end)
 		goto again;
 	return 0;
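The fork path above cannot block while it holds the page-table locks, so swap_duplicate() is allowed to fail there: copy_one_pte() hands the offending entry back, copy_pte_range() drops its locks, calls add_swap_count_continuation(entry, GFP_KERNEL), and jumps back to "again" to resume from where it stopped. A user-space sketch of that shape follows; it is not kernel code, and dup_nonblocking() and reserve_blocking() are invented stand-ins for swap_duplicate() and add_swap_count_continuation().

/* copy_retry.c - illustrative only: fail fast under the lock, do the
 * blocking work after dropping it, then restart the batch, mirroring
 * the copy_pte_range() change above.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 6

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static bool reserved[NSLOTS];

/* stand-in for swap_duplicate(): must not block, may need a continuation */
static bool dup_nonblocking(int slot)
{
	return slot != 3 || reserved[slot];	/* pretend slot 3 needs one */
}

/* stand-in for add_swap_count_continuation(entry, GFP_KERNEL) */
static bool reserve_blocking(int slot)
{
	printf("blocking reservation for slot %d\n", slot);
	reserved[slot] = true;
	return true;
}

static int copy_range(int start, int end)
{
	int failed;
again:
	failed = -1;
	pthread_mutex_lock(&table_lock);	/* page-table lock in the kernel */
	for (int i = start; i < end; i++) {
		if (!dup_nonblocking(i)) {
			failed = i;		/* remember where we stopped */
			break;
		}
		printf("copied slot %d\n", i);
	}
	pthread_mutex_unlock(&table_lock);

	if (failed >= 0) {
		if (!reserve_blocking(failed))
			return -1;		/* -ENOMEM in the kernel version */
		start = failed;			/* resume from the failed slot */
		goto again;
	}
	return 0;
}

int main(void)
{
	return copy_range(0, NSLOTS);
}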
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
 		 */
-		swap_duplicate(entry);
+		if (swap_duplicate(entry) < 0) {
+			set_pte_at(mm, address, pte, pteval);
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
 		if (list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
 			if (list_empty(&mm->mmlist))
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0d7b9ed0c16..cc5e7ebf2d2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+				 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
-	return ent & ~SWAP_HAS_CACHE;
+	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
 /* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
 		has_cache = 0;
-	} else if (count < SWAP_MAP_MAX)
-		count--;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
+	}
 
 	if (!count)
 		mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
 	int shmem;
 
 	/*
@@ -1034,8 +1045,7 @@
 	 * together, child after parent.  If we race with dup_mmap(), we
 	 * prefer to resolve parent before child, lest we miss entries
 	 * duplicated after we scanned child: using last mm would invert
-	 * that. Though it's only a serious concern when an overflowed
-	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 * that.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@
 		}
 
 		/*
-		 * How could swap count reach 0x7ffe ?
-		 * There's no way to repeat a swap page within an mm
-		 * (except in shmem, where it's the shared object which takes
-		 * the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-		 * short is too small....)
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 *
-		 * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-		 * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-		 * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-		 * much easier to reach.  But the next patch will fix that.
-		 *
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		/* We might release the lock_page() in unuse_mm(). */
-		if (!PageSwapCache(page) || page_private(page) != entry.val)
-			goto retry;
-
-		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = SWAP_HAS_CACHE;
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
 		 * If a reference remains (rare), we would like to leave
 		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
-retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1247,10 +1226,6 @@ retry:
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 
 	} else if (count || has_cache) {
 
-		if (count < SWAP_MAP_MAX - 1)
-			count++;
-		else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
-				       "swap_dup: swap entry overflow\n");
-			count = SWAP_MAP_MAX;
-		} else
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
 			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
 		err = -ENOENT;			/* unused swap entry */
 
@@ -2153,9 +2129,13 @@ bad_file:
 /*
  * increase reference count of swap entry by 1.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, 1);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
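For scale, and reading the scheme introduced below as a mixed-radix count (0..SWAP_MAP_MAX in the first swap_map byte, 0..SWAP_CONT_MAX in each continuation byte), one continuation page already lifts the per-entry limit from the old 0x7e = 126 references to 0x3e + 0x3f * 0x7f = 8063, and each further page multiplies the headroom by 128. A small sanity-check program for that arithmetic follows; it is not kernel code, the constants are copied from the patch, and the closed form is my reading of swap_count_continued() below.

/* cont_capacity.c - rough capacity check for the continuation scheme
 * (illustrative only; formula assumed, not taken from the patch).
 */
#include <stdio.h>

#define SWAP_MAP_MAX	0x3eUL		/* 62: max count in the first swap_map byte */
#define SWAP_CONT_MAX	0x7fUL		/* 127: max count per continuation byte */

int main(void)
{
	unsigned long radix = SWAP_CONT_MAX + 1;	/* 128 per continuation digit */
	unsigned long digits = 1;

	printf("no continuation:         %lu references\n", SWAP_MAP_MAX);
	for (int pages = 1; pages <= 3; pages++) {
		digits *= radix;	/* 128^pages */
		printf("%d continuation page(s): %lu references\n", pages,
		       SWAP_MAP_MAX + (SWAP_MAP_MAX + 1) * (digits - 1));
	}
	return 0;
}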
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		*offset = ++toff;
 	return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&swap_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel pagetables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page, KM_USER0) + offset;
+		count = *map;
+		kunmap_atomic(map, KM_USER0);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&swap_lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;		/* need to add count continuation */
+	}
+
+	offset &= ~PAGE_MASK;
+	page = list_entry(head->lru.next, struct page, lru);
+	map = kmap_atomic(page, KM_USER0) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			if (page == head)
+				return false;	/* add count continuation */
+			map = kmap_atomic(page, KM_USER0) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return count == COUNT_CONTINUED;
+	}
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct list_head *this, *next;
+			list_for_each_safe(this, next, &head->lru) {
+				struct page *page;
+				page = list_entry(this, struct page, lru);
+				list_del(this);
+				__free_page(page);
+			}
+		}
+	}
+}
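To see why the two comments above say "think of how you add 1 to 999" and "subtract 1 from 1000": the first swap_map byte is the low digit (radix SWAP_MAP_MAX + 1) and each continuation byte at the same page offset is a higher digit (radix SWAP_CONT_MAX + 1); incrementing through SWAP_MAP_MAX carries into the continuation, and decrementing from zero borrows back. Below is a user-space model of just that arithmetic, assuming the mixed-radix reading above; it is not kernel code, the flags, locking and kmaps are omitted, and the helper names are invented.

/* count_digits.c - model of the carry/borrow done by swap_count_continued().
 * digit[0] plays the first swap_map byte; higher digits play continuation
 * bytes at the same offset in successive continuation pages.
 */
#include <assert.h>
#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* 62 */
#define SWAP_CONT_MAX	0x7f	/* 127 */
#define NDIGITS		4

static unsigned char digit[NDIGITS];	/* zero-initialized: count of 0 */

static void increment(void)
{
	if (digit[0] < SWAP_MAP_MAX) {
		digit[0]++;
		return;
	}
	digit[0] = 0;			/* like count becoming COUNT_CONTINUED */
	for (int i = 1; i < NDIGITS; i++) {
		if (digit[i] < SWAP_CONT_MAX) {
			digit[i]++;	/* carry stops here */
			return;
		}
		digit[i] = 0;		/* think of how you add 1 to 999 */
	}
	assert(0);			/* would need yet another continuation page */
}

static void decrement(void)
{
	int i;

	if (digit[0] > 0) {
		digit[0]--;
		return;
	}
	for (i = 1; i < NDIGITS && digit[i] == 0; i++)
		;
	assert(i < NDIGITS);		/* caller never decrements below zero */
	digit[i]--;			/* think of how you subtract 1 from 1000 */
	while (--i >= 1)
		digit[i] = SWAP_CONT_MAX;
	digit[0] = SWAP_MAP_MAX;	/* first byte refills after the borrow */
}

static unsigned long value(void)
{
	unsigned long v = digit[0], scale = SWAP_MAP_MAX + 1;

	for (int i = 1; i < NDIGITS; i++) {
		v += digit[i] * scale;
		scale *= SWAP_CONT_MAX + 1;
	}
	return v;
}

int main(void)
{
	for (unsigned long i = 0; i < 20000; i++) {
		assert(value() == i);
		increment();
	}
	for (unsigned long i = 20000; i > 0; i--) {
		assert(value() == i);
		decrement();
	}
	assert(value() == 0);
	printf("carry/borrow round-trips for counts up to 20000\n");
	return 0;
}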