path: root/mm/swapfile.c
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c  284
1 files changed, 231 insertions, 53 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..d1ade1a48ee7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
+/* For reference count accounting in swap_map */
+/* enum for swap_map[] handling. internal use only */
+enum {
+	SWAP_MAP = 0,	/* ops for reference from swap users */
+	SWAP_CACHE,	/* ops for reference from swap cache */
+};
+
+static inline int swap_count(unsigned short ent)
+{
+	return ent & SWAP_COUNT_MASK;
+}
+
+static inline bool swap_has_cache(unsigned short ent)
+{
+	return !!(ent & SWAP_HAS_CACHE);
+}
+
+static inline unsigned short encode_swapmap(int count, bool has_cache)
+{
+	unsigned short ret = count;
+
+	if (has_cache)
+		return SWAP_HAS_CACHE | ret;
+	return ret;
+}
+
+/* returns 1 if swap entry is freed */
+static int
+__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+{
+	int type = si - swap_info;
+	swp_entry_t entry = swp_entry(type, offset);
+	struct page *page;
+	int ret = 0;
+
+	page = find_get_page(&swapper_space, entry.val);
+	if (!page)
+		return 0;
+	/*
+	 * This function is called from scan_swap_map() and it's called
+	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
+	 * We have to use trylock for avoiding deadlock. This is a special
+	 * case and you should use try_to_free_swap() with explicit lock_page()
+	 * in usual operations.
+	 */
+	if (trylock_page(page)) {
+		ret = try_to_free_swap(page);
+		unlock_page(page);
+	}
+	page_cache_release(page);
+	return ret;
+}
+
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
  * hold swap_lock while calling the unplug_fn. And swap_lock
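The helper block above defines the new swap_map[] layout: the low bits of each unsigned short slot hold the reference count, and a separate bit records whether the slot is also backed by a page in the swap cache. The following userspace sketch models that encoding; the SWAP_HAS_CACHE/SWAP_COUNT_MASK values are taken from the companion include/linux/swap.h change as I understand it (0x8000 and the low 15 bits) and should be read as illustrative assumptions, not kernel code.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE	0x8000				/* slot also has a swap-cache page */
#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)	/* low 15 bits: reference count */

static int swap_count(unsigned short ent)
{
	return ent & SWAP_COUNT_MASK;
}

static bool swap_has_cache(unsigned short ent)
{
	return !!(ent & SWAP_HAS_CACHE);
}

static unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ret = count;

	if (has_cache)
		return SWAP_HAS_CACHE | ret;
	return ret;
}

int main(void)
{
	/* cache-only slot, as scan_swap_map() now creates for vmscan */
	unsigned short cache_only = encode_swapmap(0, true);
	/* slot referenced by three swap ptes and still in the swap cache */
	unsigned short shared = encode_swapmap(3, true);

	assert(cache_only == SWAP_HAS_CACHE && swap_count(cache_only) == 0);
	assert(swap_count(shared) == 3 && swap_has_cache(shared));
	printf("cache_only=0x%04x shared=0x%04x\n", cache_only, shared);
	return 0;
}

With this split, a slot whose value equals SWAP_HAS_CACHE is referenced only by the swap cache, which is exactly the condition the new reclaim check in scan_swap_map() (next hunks) and __try_to_reclaim_swap() look for.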
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
-static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+static inline unsigned long scan_swap_map(struct swap_info_struct *si,
+					  int cache)
 {
 	unsigned long offset;
 	unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
 		goto no_page;
 	if (offset > si->highest_bit)
 		scan_base = offset = si->lowest_bit;
+
+	/* reuse swap entry of cache-only swap if not busy. */
+	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		int swap_was_freed;
+		spin_unlock(&swap_lock);
+		swap_was_freed = __try_to_reclaim_swap(si, offset);
+		spin_lock(&swap_lock);
+		/* entry was freed successfully, try to use this again */
+		if (swap_was_freed)
+			goto checks;
+		goto scan; /* check next one */
+	}
+
 	if (si->swap_map[offset])
 		goto scan;
 
@@ -285,7 +352,10 @@ checks:
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
 	}
-	si->swap_map[offset] = 1;
+	if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
+		si->swap_map[offset] = encode_swapmap(0, true);
+	else /* at suspend */
+		si->swap_map[offset] = encode_swapmap(1, false);
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
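These two branches are the only places a freshly scanned slot is marked in use, and they encode different ownership: the vmscan path (SWAP_CACHE) hands the slot to the swap cache with no map reference yet, while the suspend path (SWAP_MAP) takes a plain reference with no cache page. A minimal check of the resulting values, reusing the assumed constants from the sketch above:

#include <assert.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE	0x8000
#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

static unsigned short encode_swapmap(int count, bool has_cache)
{
	return has_cache ? (SWAP_HAS_CACHE | count) : count;
}

int main(void)
{
	/* cache == SWAP_CACHE: usual swap-out via vmscan.c */
	assert(encode_swapmap(0, true) == SWAP_HAS_CACHE);
	/* cache == SWAP_MAP: allocation for suspend/hibernation */
	assert(encode_swapmap(1, false) == 1);
	return 0;
}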
@@ -351,6 +421,10 @@ scan:
 			spin_lock(&swap_lock);
 			goto checks;
 		}
+		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+			spin_lock(&swap_lock);
+			goto checks;
+		}
 		if (unlikely(--latency_ration < 0)) {
 			cond_resched();
 			latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
 			spin_lock(&swap_lock);
 			goto checks;
 		}
+		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+			spin_lock(&swap_lock);
+			goto checks;
+		}
 		if (unlikely(--latency_ration < 0)) {
 			cond_resched();
 			latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
 			continue;
 
 		swap_list.next = next;
-		offset = scan_swap_map(si);
+		/* This is called for allocating swap entry for cache */
+		offset = scan_swap_map(si, SWAP_CACHE);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+/* The only caller of this function is now the suspend routine */
 swp_entry_t get_swap_page_of_type(int type)
 {
 	struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
 	si = swap_info + type;
 	if (si->flags & SWP_WRITEOK) {
 		nr_swap_pages--;
-		offset = scan_swap_map(si);
+		/* This is called for allocating swap entry, not cache */
+		offset = scan_swap_map(si, SWAP_MAP);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -471,26 +552,40 @@ out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
+static int swap_entry_free(struct swap_info_struct *p,
+			   swp_entry_t ent, int cache)
 {
 	unsigned long offset = swp_offset(ent);
-	int count = p->swap_map[offset];
-
-	if (count < SWAP_MAP_MAX) {
-		count--;
-		p->swap_map[offset] = count;
-		if (!count) {
-			if (offset < p->lowest_bit)
-				p->lowest_bit = offset;
-			if (offset > p->highest_bit)
-				p->highest_bit = offset;
-			if (p->prio > swap_info[swap_list.next].prio)
-				swap_list.next = p - swap_info;
-			nr_swap_pages++;
-			p->inuse_pages--;
-			mem_cgroup_uncharge_swap(ent);
+	int count = swap_count(p->swap_map[offset]);
+	bool has_cache;
+
+	has_cache = swap_has_cache(p->swap_map[offset]);
+
+	if (cache == SWAP_MAP) { /* dropping usage count of swap */
+		if (count < SWAP_MAP_MAX) {
+			count--;
+			p->swap_map[offset] = encode_swapmap(count, has_cache);
 		}
+	} else { /* dropping swap cache flag */
+		VM_BUG_ON(!has_cache);
+		p->swap_map[offset] = encode_swapmap(count, false);
+
+	}
+	/* return code. */
+	count = p->swap_map[offset];
+	/* free if no reference */
+	if (!count) {
+		if (offset < p->lowest_bit)
+			p->lowest_bit = offset;
+		if (offset > p->highest_bit)
+			p->highest_bit = offset;
+		if (p->prio > swap_info[swap_list.next].prio)
+			swap_list.next = p - swap_info;
+		nr_swap_pages++;
+		p->inuse_pages--;
 	}
+	if (!swap_count(count))
+		mem_cgroup_uncharge_swap(ent);
 	return count;
 }
 
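swap_entry_free() now has to know which kind of reference it is dropping, and the slot is released only once both the count and the cache bit are gone. A hedged userspace model of that logic follows (same assumed constants as in the earlier sketches; the real function additionally updates lowest_bit/highest_bit, nr_swap_pages, inuse_pages and the memcg swap accounting):

#include <assert.h>
#include <stdbool.h>

enum { SWAP_MAP = 0, SWAP_CACHE };

#define SWAP_HAS_CACHE	0x8000
#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

static int swap_count(unsigned short ent)	{ return ent & SWAP_COUNT_MASK; }
static bool swap_has_cache(unsigned short ent)	{ return !!(ent & SWAP_HAS_CACHE); }

static unsigned short encode_swapmap(int count, bool has_cache)
{
	return has_cache ? (SWAP_HAS_CACHE | count) : count;
}

/* Drop one reference of the given kind; return the remaining encoded value. */
static int entry_free(unsigned short *slot, int cache)
{
	int count = swap_count(*slot);
	bool has_cache = swap_has_cache(*slot);

	if (cache == SWAP_MAP) {	/* a swap pte / user reference goes away */
		if (count > 0)
			count--;
		*slot = encode_swapmap(count, has_cache);
	} else {			/* the swap-cache page goes away */
		*slot = encode_swapmap(count, false);
	}
	return *slot;			/* 0 means the slot is completely free */
}

int main(void)
{
	unsigned short slot = encode_swapmap(1, true);	/* one user + swap cache */

	assert(entry_free(&slot, SWAP_MAP) == SWAP_HAS_CACHE);	/* only the cache is left */
	assert(entry_free(&slot, SWAP_CACHE) == 0);		/* now fully free */
	return 0;
}

The first assert is also why free_swap_and_cache() (later hunk) now compares the return value against SWAP_HAS_CACHE: that value means no users remain and only the swap cache still pins the slot.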
@@ -504,9 +599,33 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, entry);
+		swap_entry_free(p, entry, SWAP_MAP);
+		spin_unlock(&swap_lock);
+	}
+}
+
+/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ */
+void swapcache_free(swp_entry_t entry, struct page *page)
+{
+	struct swap_info_struct *p;
+	int ret;
+
+	p = swap_info_get(entry);
+	if (p) {
+		ret = swap_entry_free(p, entry, SWAP_CACHE);
+		if (page) {
+			bool swapout;
+			if (ret)
+				swapout = true; /* the end of swap out */
+			else
+				swapout = false; /* no more swap users! */
+			mem_cgroup_uncharge_swapcache(page, entry, swapout);
+		}
 		spin_unlock(&swap_lock);
 	}
+	return;
 }
 
 /*
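Taken together, swap_duplicate()/swap_free() manage the count half of a slot while swapcache_prepare()/swapcache_free() manage the cache bit. The sketch below walks one slot through a plausible swap-out sequence to show how the encoded value is expected to evolve; mapping the steps to add_to_swap(), try_to_unmap() and swap_free() is my reading of the patch rather than something stated in this hunk, and the constants are the assumed values used above.

#include <assert.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE	0x8000
#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

static int swap_count(unsigned short ent) { return ent & SWAP_COUNT_MASK; }
static unsigned short encode_swapmap(int count, bool has_cache)
{
	return has_cache ? (SWAP_HAS_CACHE | count) : count;
}

int main(void)
{
	unsigned short slot;

	/* add_to_swap(): get_swap_page() hands out a cache-only slot */
	slot = encode_swapmap(0, true);
	assert(slot == SWAP_HAS_CACHE);
	/* try_to_unmap() installs a swap pte: swap_duplicate() */
	slot = encode_swapmap(swap_count(slot) + 1, true);
	assert(slot == (SWAP_HAS_CACHE | 1));
	/* page written out and dropped from the swap cache: swapcache_free() */
	slot = encode_swapmap(swap_count(slot), false);
	assert(slot == 1);
	/* the pte is finally zapped: swap_free() */
	slot = encode_swapmap(swap_count(slot) - 1, false);
	assert(slot == 0);	/* slot can be handed out again */
	return 0;
}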
@@ -521,8 +640,7 @@ static inline int page_swapcount(struct page *page)
 	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (p) {
-		/* Subtract the 1 for the swap cache itself */
-		count = p->swap_map[swp_offset(entry)] - 1;
+		count = swap_count(p->swap_map[swp_offset(entry)]);
 		spin_unlock(&swap_lock);
 	}
 	return count;
@@ -584,7 +702,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry) == 1) {
+		if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -891,7 +1009,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 			i = 1;
 		}
 		count = si->swap_map[i];
-		if (count && count != SWAP_MAP_BAD)
+		if (count && swap_count(count) != SWAP_MAP_BAD)
 			break;
 	}
 	return i;
@@ -995,13 +1113,13 @@ static int try_to_unuse(unsigned int type)
 		 */
 		shmem = 0;
 		swcount = *swap_map;
-		if (swcount > 1) {
+		if (swap_count(swcount)) {
 			if (start_mm == &init_mm)
 				shmem = shmem_unuse(entry, page);
 			else
 				retval = unuse_mm(start_mm, entry, page);
 		}
-		if (*swap_map > 1) {
+		if (swap_count(*swap_map)) {
 			int set_start_mm = (*swap_map >= swcount);
 			struct list_head *p = &start_mm->mmlist;
 			struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1129,7 @@ static int try_to_unuse(unsigned int type)
 			atomic_inc(&new_start_mm->mm_users);
 			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 && !retval && !shmem &&
+			while (swap_count(*swap_map) && !retval && !shmem &&
 			       (p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
 				if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1141,16 @@ static int try_to_unuse(unsigned int type)
 				cond_resched();
 
 				swcount = *swap_map;
-				if (swcount <= 1)
+				if (!swap_count(swcount)) /* any usage ? */
 					;
 				else if (mm == &init_mm) {
 					set_start_mm = 1;
 					shmem = shmem_unuse(entry, page);
 				} else
 					retval = unuse_mm(mm, entry, page);
-				if (set_start_mm && *swap_map < swcount) {
+
+				if (set_start_mm &&
+				    swap_count(*swap_map) < swcount) {
 					mmput(new_start_mm);
 					atomic_inc(&mm->mm_users);
 					new_start_mm = mm;
@@ -1057,21 +1177,25 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7fff when the maximum
-		 * pid is 0x7fff, and there's no way to repeat a swap
-		 * page within an mm (except in shmem, where it's the
-		 * shared object which takes the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
-		 *
+		 * How could swap count reach 0x7ffe?
+		 * There's no way to repeat a swap page within an mm
+		 * (except in shmem, where it's the shared object which takes
+		 * the reference count).
+		 * We believe SWAP_MAP_MAX cannot occur. (If it did, unsigned
+		 * short would be too small.)
 		 * If that's wrong, then we should worry more about
 		 * exit_mmap() and do_munmap() cases described above:
 		 * we might be resetting SWAP_MAP_MAX too early here.
 		 * We know "Undead"s can happen, they're okay, so don't
 		 * report them; but do report if we reset SWAP_MAP_MAX.
 		 */
-		if (*swap_map == SWAP_MAP_MAX) {
+		/* We might release the lock_page() in unuse_mm(). */
+		if (!PageSwapCache(page) || page_private(page) != entry.val)
+			goto retry;
+
+		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
 			spin_lock(&swap_lock);
-			*swap_map = 1;
+			*swap_map = encode_swapmap(0, true);
 			spin_unlock(&swap_lock);
 			reset_overflow = 1;
 		}
@@ -1089,7 +1213,8 @@ static int try_to_unuse(unsigned int type)
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
 		 */
-		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+		if (swap_count(*swap_map) &&
+		    PageDirty(page) && PageSwapCache(page)) {
 			struct writeback_control wbc = {
 				.sync_mode = WB_SYNC_NONE,
 			};
@@ -1116,6 +1241,7 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
+retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1942,15 +2068,23 @@ void si_swapinfo(struct sysinfo *val)
  *
  * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
  * "permanent", but will be reclaimed by the next swapoff.
+ * Returns error code in the following cases.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
  */
-int swap_duplicate(swp_entry_t entry)
+static int __swap_duplicate(swp_entry_t entry, bool cache)
 {
 	struct swap_info_struct * p;
 	unsigned long offset, type;
-	int result = 0;
+	int result = -EINVAL;
+	int count;
+	bool has_cache;
 
 	if (is_migration_entry(entry))
-		return 1;
+		return -EINVAL;
 
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
@@ -1959,17 +2093,40 @@ int swap_duplicate(swp_entry_t entry)
 	offset = swp_offset(entry);
 
 	spin_lock(&swap_lock);
-	if (offset < p->max && p->swap_map[offset]) {
-		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
-			p->swap_map[offset]++;
-			result = 1;
-		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+
+	if (unlikely(offset >= p->max))
+		goto unlock_out;
+
+	count = swap_count(p->swap_map[offset]);
+	has_cache = swap_has_cache(p->swap_map[offset]);
+
+	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+
+		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
+		if (!has_cache && count) {
+			p->swap_map[offset] = encode_swapmap(count, true);
+			result = 0;
+		} else if (has_cache) /* someone added cache */
+			result = -EEXIST;
+		else if (!count) /* no users */
+			result = -ENOENT;
+
+	} else if (count || has_cache) {
+		if (count < SWAP_MAP_MAX - 1) {
+			p->swap_map[offset] = encode_swapmap(count + 1,
+							     has_cache);
+			result = 0;
+		} else if (count <= SWAP_MAP_MAX) {
 			if (swap_overflow++ < 5)
-				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
-			p->swap_map[offset] = SWAP_MAP_MAX;
-			result = 1;
+				printk(KERN_WARNING
+				       "swap_dup: swap entry overflow\n");
+			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
+							     has_cache);
+			result = 0;
 		}
-	}
+	} else
+		result = -ENOENT;		/* unused swap entry */
+unlock_out:
 	spin_unlock(&swap_lock);
 out:
 	return result;
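__swap_duplicate() folds both reference types into a single decision table. The standalone model below reproduces its outcomes for the interesting cases; it deliberately omits the offset/type validation and the SWAP_MAP_MAX overflow clamp of the real function, and uses the same assumed constants as the earlier sketches.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

enum { SWAP_MAP = 0, SWAP_CACHE };

#define SWAP_HAS_CACHE	0x8000
#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)
#define SWAP_MAP_MAX	0x7ffe

static int swap_count(unsigned short ent)	{ return ent & SWAP_COUNT_MASK; }
static bool swap_has_cache(unsigned short ent)	{ return !!(ent & SWAP_HAS_CACHE); }

static unsigned short encode_swapmap(int count, bool has_cache)
{
	return has_cache ? (SWAP_HAS_CACHE | count) : count;
}

static int dup(unsigned short *slot, int cache)
{
	int count = swap_count(*slot);
	bool has_cache = swap_has_cache(*slot);

	if (cache == SWAP_CACHE) {
		if (!has_cache && count) {		/* entry in use, not yet cached */
			*slot = encode_swapmap(count, true);
			return 0;
		}
		return has_cache ? -EEXIST : -ENOENT;
	}
	if (!count && !has_cache)
		return -ENOENT;				/* unused swap entry */
	if (count < SWAP_MAP_MAX)
		*slot = encode_swapmap(count + 1, has_cache);
	return 0;
}

int main(void)
{
	unsigned short slot = encode_swapmap(1, false);	/* one user, not cached */

	assert(dup(&slot, SWAP_CACHE) == 0);		/* readahead may claim the cache bit */
	assert(dup(&slot, SWAP_CACHE) == -EEXIST);	/* but only once */
	assert(dup(&slot, SWAP_MAP) == 0 && swap_count(slot) == 2);

	slot = 0;					/* free slot */
	assert(dup(&slot, SWAP_CACHE) == -ENOENT);	/* cannot cache an unused entry */
	return 0;
}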
@@ -1978,6 +2135,27 @@ bad_file:
 	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
+/*
+ * increase reference count of swap entry by 1.
+ */
+void swap_duplicate(swp_entry_t entry)
+{
+	__swap_duplicate(entry, SWAP_MAP);
+}
+
+/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
+ * Called when allocating swap cache for an existing swap entry.
+ * This can return error codes. Returns 0 at success.
+ * -EEXIST means there is already a swap cache.
+ * Note: return code is different from swap_duplicate().
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+	return __swap_duplicate(entry, SWAP_CACHE);
+}
+
 
 struct swap_info_struct *
 get_swap_info_struct(unsigned type)
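swapcache_prepare() is intended for the swap-in/readahead side, called before a page is inserted into the swap cache; the error codes tell the caller whether to retry or give up. A rough sketch of that caller pattern is below. It is kernel-context pseudo-usage, not compilable on its own, and the real caller is the mm/swap_state.c change that accompanies this patch, so treat the control flow as an assumption about intent.

	for (;;) {
		int err = swapcache_prepare(entry);

		if (!err)
			break;		/* we own SWAP_HAS_CACHE; allocate and read the page */
		if (err == -EEXIST) {
			cond_resched();	/* a racing task is adding the page; wait and retry */
			continue;
		}
		/* -ENOENT/-EINVAL: the entry was freed or is invalid; nothing to read */
		return NULL;
	}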
@@ -2016,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		/* Don't read in free or bad pages */
 		if (!si->swap_map[toff])
 			break;
-		if (si->swap_map[toff] == SWAP_MAP_BAD)
+		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
 			break;
 	}
 	/* Count contiguous allocated slots below our target */
@@ -2024,7 +2202,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		/* Don't read in free or bad pages */
 		if (!si->swap_map[toff])
 			break;
-		if (si->swap_map[toff] == SWAP_MAP_BAD)
+		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
 			break;
 	}
 	spin_unlock(&swap_lock);