aboutsummaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c276
1 files changed, 223 insertions, 53 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..28faa01cf578 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
82/* returnes 1 if swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
95 * This function is called from scan_swap_map() and it's called
96 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
97 * We have to use trylock for avoiding deadlock. This is a special
98 * case and you should use try_to_free_swap() with explicit lock_page()
99 * in usual operations.
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 220#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 221#define LATENCY_LIMIT 256
169 222
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 223static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache)
171{ 225{
172 unsigned long offset; 226 unsigned long offset;
173 unsigned long scan_base; 227 unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
273 goto no_page; 327 goto no_page;
274 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330
331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed;
334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed)
339 goto checks;
340 goto scan; /* check next one */
341 }
342
276 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
277 goto scan; 344 goto scan;
278 345
@@ -285,7 +352,10 @@ checks:
285 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
286 si->highest_bit = 0; 353 si->highest_bit = 0;
287 } 354 }
288 si->swap_map[offset] = 1; 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
291 361
@@ -351,6 +421,10 @@ scan:
351 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
352 goto checks; 422 goto checks;
353 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock);
426 goto checks;
427 }
354 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 429 cond_resched();
356 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
362 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
363 goto checks; 437 goto checks;
364 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock);
441 goto checks;
442 }
365 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 444 cond_resched();
367 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
401 continue; 479 continue;
402 480
403 swap_list.next = next; 481 swap_list.next = next;
404 offset = scan_swap_map(si); 482 /* This is called for allocating swap entry for cache */
483 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 484 if (offset) {
406 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 486 return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
415 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
416} 495}
417 496
497/* The only caller of this function is now susupend routine */
418swp_entry_t get_swap_page_of_type(int type) 498swp_entry_t get_swap_page_of_type(int type)
419{ 499{
420 struct swap_info_struct *si; 500 struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 504 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 506 nr_swap_pages--;
427 offset = scan_swap_map(si); 507 /* This is called for allocating swap entry, not cache */
508 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 509 if (offset) {
429 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 511 return swp_entry(type, offset);
@@ -471,25 +552,38 @@ out:
471 return NULL; 552 return NULL;
472} 553}
473 554
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 555static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache)
475{ 557{
476 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 559 int count = swap_count(p->swap_map[offset]);
478 560 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 561
480 count--; 562 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 563
482 if (!count) { 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 565 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 566 count--;
485 if (offset > p->highest_bit) 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 568 }
569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false);
572
573 }
574 /* return code. */
575 count = p->swap_map[offset];
576 /* free if no reference */
577 if (!count) {
578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset;
580 if (offset > p->highest_bit)
581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info;
584 nr_swap_pages++;
585 p->inuse_pages--;
586 mem_cgroup_uncharge_swap(ent);
493 } 587 }
494 return count; 588 return count;
495} 589}
@@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry)
504 598
505 p = swap_info_get(entry); 599 p = swap_info_get(entry);
506 if (p) { 600 if (p) {
507 swap_entry_free(p, entry); 601 swap_entry_free(p, entry, SWAP_MAP);
602 spin_unlock(&swap_lock);
603 }
604}
605
606/*
607 * Called after dropping swapcache to decrease refcnt to swap entries.
608 */
609void swapcache_free(swp_entry_t entry, struct page *page)
610{
611 struct swap_info_struct *p;
612
613 if (page)
614 mem_cgroup_uncharge_swapcache(page, entry);
615 p = swap_info_get(entry);
616 if (p) {
617 swap_entry_free(p, entry, SWAP_CACHE);
508 spin_unlock(&swap_lock); 618 spin_unlock(&swap_lock);
509 } 619 }
620 return;
510} 621}
511 622
512/* 623/*
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 632 entry.val = page_private(page);
522 p = swap_info_get(entry); 633 p = swap_info_get(entry);
523 if (p) { 634 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 635 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 636 spin_unlock(&swap_lock);
527 } 637 }
528 return count; 638 return count;
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry)
584 694
585 p = swap_info_get(entry); 695 p = swap_info_get(entry);
586 if (p) { 696 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 697 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 698 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 699 if (page && !trylock_page(page)) {
590 page_cache_release(page); 700 page_cache_release(page);
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1001 i = 1;
892 } 1002 }
893 count = si->swap_map[i]; 1003 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1004 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1005 break;
896 } 1006 }
897 return i; 1007 return i;
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1105 */
996 shmem = 0; 1106 shmem = 0;
997 swcount = *swap_map; 1107 swcount = *swap_map;
998 if (swcount > 1) { 1108 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1109 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1110 shmem = shmem_unuse(entry, page);
1001 else 1111 else
1002 retval = unuse_mm(start_mm, entry, page); 1112 retval = unuse_mm(start_mm, entry, page);
1003 } 1113 }
1004 if (*swap_map > 1) { 1114 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1115 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1116 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1117 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1121 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1122 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1123 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1124 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1125 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1126 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1127 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1133 cond_resched();
1024 1134
1025 swcount = *swap_map; 1135 swcount = *swap_map;
1026 if (swcount <= 1) 1136 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1137 ;
1028 else if (mm == &init_mm) { 1138 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1139 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1140 shmem = shmem_unuse(entry, page);
1031 } else 1141 } else
1032 retval = unuse_mm(mm, entry, page); 1142 retval = unuse_mm(mm, entry, page);
1033 if (set_start_mm && *swap_map < swcount) { 1143
1144 if (set_start_mm &&
1145 swap_count(*swap_map) < swcount) {
1034 mmput(new_start_mm); 1146 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1147 atomic_inc(&mm->mm_users);
1036 new_start_mm = mm; 1148 new_start_mm = mm;
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1169 }
1058 1170
1059 /* 1171 /*
1060 * How could swap count reach 0x7fff when the maximum 1172 * How could swap count reach 0x7ffe ?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1173 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1174 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1175 * the reference count)?
1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1176 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1065 * 1177 * short is too small....)
1066 * If that's wrong, then we should worry more about 1178 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1179 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1180 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1181 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1182 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1183 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1184 /* We might release the lock_page() in unuse_mm(). */
1185 if (!PageSwapCache(page) || page_private(page) != entry.val)
1186 goto retry;
1187
1188 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1189 spin_lock(&swap_lock);
1074 *swap_map = 1; 1190 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1191 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1192 reset_overflow = 1;
1077 } 1193 }
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1205 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1206 * private" pages, but they are handled by tmpfs files.
1091 */ 1207 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1208 if (swap_count(*swap_map) &&
1209 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1210 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1211 .sync_mode = WB_SYNC_NONE,
1095 }; 1212 };
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1233 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1234 */
1118 SetPageDirty(page); 1235 SetPageDirty(page);
1236retry:
1119 unlock_page(page); 1237 unlock_page(page);
1120 page_cache_release(page); 1238 page_cache_release(page);
1121 1239
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2060 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2061 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2062 * "permanent", but will be reclaimed by the next swapoff.
2063 * Returns error code in following case.
2064 * - success -> 0
2065 * - swp_entry is invalid -> EINVAL
2066 * - swp_entry is migration entry -> EINVAL
2067 * - swap-cache reference is requested but there is already one. -> EEXIST
2068 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2069 */
1946int swap_duplicate(swp_entry_t entry) 2070static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2071{
1948 struct swap_info_struct * p; 2072 struct swap_info_struct * p;
1949 unsigned long offset, type; 2073 unsigned long offset, type;
1950 int result = 0; 2074 int result = -EINVAL;
2075 int count;
2076 bool has_cache;
1951 2077
1952 if (is_migration_entry(entry)) 2078 if (is_migration_entry(entry))
1953 return 1; 2079 return -EINVAL;
1954 2080
1955 type = swp_type(entry); 2081 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2082 if (type >= nr_swapfiles)
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2085 offset = swp_offset(entry);
1960 2086
1961 spin_lock(&swap_lock); 2087 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2088
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2089 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2090 goto unlock_out;
1965 result = 1; 2091
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2092 count = swap_count(p->swap_map[offset]);
2093 has_cache = swap_has_cache(p->swap_map[offset]);
2094
2095 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2096
2097 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2098 if (!has_cache && count) {
2099 p->swap_map[offset] = encode_swapmap(count, true);
2100 result = 0;
2101 } else if (has_cache) /* someone added cache */
2102 result = -EEXIST;
2103 else if (!count) /* no users */
2104 result = -ENOENT;
2105
2106 } else if (count || has_cache) {
2107 if (count < SWAP_MAP_MAX - 1) {
2108 p->swap_map[offset] = encode_swapmap(count + 1,
2109 has_cache);
2110 result = 0;
2111 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2112 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2113 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2114 "swap_dup: swap entry overflow\n");
1970 result = 1; 2115 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2116 has_cache);
2117 result = 0;
1971 } 2118 }
1972 } 2119 } else
2120 result = -ENOENT; /* unused swap entry */
2121unlock_out:
1973 spin_unlock(&swap_lock); 2122 spin_unlock(&swap_lock);
1974out: 2123out:
1975 return result; 2124 return result;
@@ -1978,6 +2127,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2127 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2128 goto out;
1980} 2129}
2130/*
2131 * increase reference count of swap entry by 1.
2132 */
2133void swap_duplicate(swp_entry_t entry)
2134{
2135 __swap_duplicate(entry, SWAP_MAP);
2136}
2137
2138/*
2139 * @entry: swap entry for which we allocate swap cache.
2140 *
2141 * Called when allocating swap cache for exising swap entry,
2142 * This can return error codes. Returns 0 at success.
2143 * -EBUSY means there is a swap cache.
2144 * Note: return code is different from swap_duplicate().
2145 */
2146int swapcache_prepare(swp_entry_t entry)
2147{
2148 return __swap_duplicate(entry, SWAP_CACHE);
2149}
2150
1981 2151
1982struct swap_info_struct * 2152struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2153get_swap_info_struct(unsigned type)
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2186 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2187 if (!si->swap_map[toff])
2018 break; 2188 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2189 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2190 break;
2021 } 2191 }
2022 /* Count contiguous allocated slots below our target */ 2192 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2194 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2195 if (!si->swap_map[toff])
2026 break; 2196 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2197 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2198 break;
2029 } 2199 }
2030 spin_unlock(&swap_lock); 2200 spin_unlock(&swap_lock);