author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-06-16 18:32:53 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-06-16 22:47:42 -0400
commit		355cfa73ddff2fb8fa14e93bd94a057cc022512e (patch)
tree		7ff70cd56d533070d50b06db6ba0086e8aab0d71 /mm
parent		cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 (diff)
mm: modify swap_map and add SWAP_HAS_CACHE flag
This is a part of the patches for fixing memcg's swap accounting leak. But, IMHO, it is not a bad patch even without memcg.

There are 2 kinds of references to swap:
 - reference from a swap entry
 - reference from the swap cache

Then,
 - if there is a swap cache && swap's refcnt is 1, there is only the swap cache.
   (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL

This counting logic has worked well for a long time. But considering that we cannot know whether there is a _real_ reference or not from swap_map[] alone, the current usage of the counter is not very good.

This patch adds a flag, SWAP_HAS_CACHE, and records whether a swap entry has a swap cache or not. This removes the "-1" magic used in swapfile.c and helps avoid unnecessary find_get_page() calls.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
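As a quick orientation before the diff: the new swap_map[] layout packs a usage count and a cache flag into one unsigned short. Below is a minimal userspace sketch of that encoding; the 0x8000 bit position is an assumption here, since the real SWAP_HAS_CACHE/SWAP_COUNT_MASK definitions live in include/linux/swap.h, which is outside this mm-limited view. encode_swapmap() mirrors the helper added in the diff.

	#include <assert.h>
	#include <stdbool.h>

	/* Assumed bit layout: the top bit of the 16-bit swap_map entry flags
	 * a swap-cache reference; the remaining bits hold the usage count. */
	#define SWAP_HAS_CACHE	0x8000
	#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

	static unsigned short encode_swapmap(int count, bool has_cache)
	{
		return has_cache ? (SWAP_HAS_CACHE | count) : count;
	}

	int main(void)
	{
		/* Freshly allocated for the swap cache: count 0, cache bit set */
		unsigned short ent = encode_swapmap(0, true);
		assert((ent & SWAP_COUNT_MASK) == 0 && (ent & SWAP_HAS_CACHE));

		/* One pte reference added (swap_duplicate): count 1, cache bit kept */
		ent = encode_swapmap((ent & SWAP_COUNT_MASK) + 1, ent & SWAP_HAS_CACHE);
		assert((ent & SWAP_COUNT_MASK) == 1 && (ent & SWAP_HAS_CACHE));
		return 0;
	}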
Diffstat (limited to 'mm')
-rw-r--r--	mm/swap_state.c	  5
-rw-r--r--	mm/swapfile.c	214
2 files changed, 163 insertions, 56 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 19bdf3017a9e..b9ca029673a5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/*
 		 * Swap entry may have been freed since our caller observed it.
 		 */
-		if (!swapcache_prepare(entry))
+		err = swapcache_prepare(entry);
+		if (err == -EEXIST)	/* seems racy */
+			continue;
+		if (err)		/* swp entry is obsolete ? */
 			break;
 
 		/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3187079903fd..0d7296971ad9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
+/* For reference count accounting in swap_map */
+/* enum for swap_map[] handling. internal use only */
+enum {
+	SWAP_MAP = 0,	/* ops for reference from swap users */
+	SWAP_CACHE,	/* ops for reference from swap cache */
+};
+
+static inline int swap_count(unsigned short ent)
+{
+	return ent & SWAP_COUNT_MASK;
+}
+
+static inline bool swap_has_cache(unsigned short ent)
+{
+	return !!(ent & SWAP_HAS_CACHE);
+}
+
+static inline unsigned short encode_swapmap(int count, bool has_cache)
+{
+	unsigned short ret = count;
+
+	if (has_cache)
+		return SWAP_HAS_CACHE | ret;
+	return ret;
+}
+
+
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
  * hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +194,8 @@ static int wait_for_discard(void *word)
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
-static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+static inline unsigned long scan_swap_map(struct swap_info_struct *si,
+					  int cache)
 {
 	unsigned long offset;
 	unsigned long scan_base;
@@ -285,7 +313,10 @@ checks:
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
 	}
-	si->swap_map[offset] = 1;
+	if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
+		si->swap_map[offset] = encode_swapmap(0, true);
+	else /* at suspend */
+		si->swap_map[offset] = encode_swapmap(1, false);
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
@@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void)
 			continue;
 
 		swap_list.next = next;
-		offset = scan_swap_map(si);
+		/* This is called for allocating swap entry for cache */
+		offset = scan_swap_map(si, SWAP_CACHE);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -415,6 +447,7 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+/* The only caller of this function is now the suspend routine */
 swp_entry_t get_swap_page_of_type(int type)
 {
 	struct swap_info_struct *si;
@@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type)
 	si = swap_info + type;
 	if (si->flags & SWP_WRITEOK) {
 		nr_swap_pages--;
-		offset = scan_swap_map(si);
+		/* This is called for allocating swap entry, not cache */
+		offset = scan_swap_map(si, SWAP_MAP);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -471,25 +505,38 @@ out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
+static int swap_entry_free(struct swap_info_struct *p,
+			   swp_entry_t ent, int cache)
 {
 	unsigned long offset = swp_offset(ent);
-	int count = p->swap_map[offset];
-
-	if (count < SWAP_MAP_MAX) {
-		count--;
-		p->swap_map[offset] = count;
-		if (!count) {
-			if (offset < p->lowest_bit)
-				p->lowest_bit = offset;
-			if (offset > p->highest_bit)
-				p->highest_bit = offset;
-			if (p->prio > swap_info[swap_list.next].prio)
-				swap_list.next = p - swap_info;
-			nr_swap_pages++;
-			p->inuse_pages--;
-			mem_cgroup_uncharge_swap(ent);
-		}
+	int count = swap_count(p->swap_map[offset]);
+	bool has_cache;
+
+	has_cache = swap_has_cache(p->swap_map[offset]);
+
+	if (cache == SWAP_MAP) { /* dropping usage count of swap */
+		if (count < SWAP_MAP_MAX) {
+			count--;
+			p->swap_map[offset] = encode_swapmap(count, has_cache);
+		}
+	} else { /* dropping swap cache flag */
+		VM_BUG_ON(!has_cache);
+		p->swap_map[offset] = encode_swapmap(count, false);
+
+	}
+	/* return code. */
+	count = p->swap_map[offset];
+	/* free if no reference */
+	if (!count) {
+		if (offset < p->lowest_bit)
+			p->lowest_bit = offset;
+		if (offset > p->highest_bit)
+			p->highest_bit = offset;
+		if (p->prio > swap_info[swap_list.next].prio)
+			swap_list.next = p - swap_info;
+		nr_swap_pages++;
+		p->inuse_pages--;
+		mem_cgroup_uncharge_swap(ent);
 	}
 	return count;
 }
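The effect of the reworked swap_entry_free() is that a slot is only released once both reference kinds are gone: the last pte drop leaves the cache bit pinning the entry, and only clearing the cache flag too brings the encoded value to zero. A small userspace simulation of the two drop paths (same assumed bit layout as the sketch above; drop() is illustrative, not kernel code):

	#include <assert.h>
	#include <stdbool.h>

	#define SWAP_HAS_CACHE	0x8000
	#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

	/* Mimics swap_entry_free(): drop one usage count (SWAP_MAP path) or
	 * the cache flag (SWAP_CACHE path); return the remaining encoding. */
	static unsigned short drop(unsigned short ent, bool cache_path)
	{
		int count = ent & SWAP_COUNT_MASK;
		bool has_cache = ent & SWAP_HAS_CACHE;

		if (!cache_path)
			count--;		/* swap_free() */
		else
			has_cache = false;	/* swapcache_free() */
		return (has_cache ? SWAP_HAS_CACHE : 0) | count;
	}

	int main(void)
	{
		unsigned short ent = SWAP_HAS_CACHE | 1; /* one pte + swap cache */

		ent = drop(ent, false);		/* last pte reference gone ... */
		assert(ent == SWAP_HAS_CACHE);	/* ... but the cache still pins it */
		ent = drop(ent, true);		/* cache reference dropped too */
		assert(ent == 0);		/* now the slot is really free */
		return 0;
	}

This is also why free_swap_and_cache() below now compares the return value against SWAP_HAS_CACHE instead of the old "-1 magic" value of 1.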
@@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, entry);
+		swap_entry_free(p, entry, SWAP_MAP);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry)
  */
 void swapcache_free(swp_entry_t entry, struct page *page)
 {
+	struct swap_info_struct *p;
+
 	if (page)
 		mem_cgroup_uncharge_swapcache(page, entry);
-	return swap_free(entry);
+	p = swap_info_get(entry);
+	if (p) {
+		swap_entry_free(p, entry, SWAP_CACHE);
+		spin_unlock(&swap_lock);
+	}
+	return;
 }
 
 /*
@@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page)
 	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (p) {
-		/* Subtract the 1 for the swap cache itself */
-		count = p->swap_map[swp_offset(entry)] - 1;
+		count = swap_count(p->swap_map[swp_offset(entry)]);
 		spin_unlock(&swap_lock);
 	}
 	return count;
@@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry) == 1) {
+		if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 			i = 1;
 		}
 		count = si->swap_map[i];
-		if (count && count != SWAP_MAP_BAD)
+		if (count && swap_count(count) != SWAP_MAP_BAD)
 			break;
 	}
 	return i;
@@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type)
 		 */
 		shmem = 0;
 		swcount = *swap_map;
-		if (swcount > 1) {
+		if (swap_count(swcount)) {
 			if (start_mm == &init_mm)
 				shmem = shmem_unuse(entry, page);
 			else
 				retval = unuse_mm(start_mm, entry, page);
 		}
-		if (*swap_map > 1) {
+		if (swap_count(*swap_map)) {
 			int set_start_mm = (*swap_map >= swcount);
 			struct list_head *p = &start_mm->mmlist;
 			struct mm_struct *new_start_mm = start_mm;
@@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
 			atomic_inc(&new_start_mm->mm_users);
 			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 && !retval && !shmem &&
+			while (swap_count(*swap_map) && !retval && !shmem &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
 				if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type)
 				cond_resched();
 
 				swcount = *swap_map;
-				if (swcount <= 1)
+				if (!swap_count(swcount)) /* any usage ? */
 					;
 				else if (mm == &init_mm) {
 					set_start_mm = 1;
 					shmem = shmem_unuse(entry, page);
 				} else
 					retval = unuse_mm(mm, entry, page);
-				if (set_start_mm && *swap_map < swcount) {
+
+				if (set_start_mm &&
+				    swap_count(*swap_map) < swcount) {
 					mmput(new_start_mm);
 					atomic_inc(&mm->mm_users);
 					new_start_mm = mm;
@@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7fff when the maximum
-		 * pid is 0x7fff, and there's no way to repeat a swap
-		 * page within an mm (except in shmem, where it's the
-		 * shared object which takes the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
-		 *
+		 * How could swap count reach 0x7ffe?
+		 * There's no way to repeat a swap page within an mm
+		 * (except in shmem, where it's the shared object which
+		 * takes the reference count).
+		 * We believe SWAP_MAP_MAX cannot occur (if it did,
+		 * unsigned short would be too small).
 		 * If that's wrong, then we should worry more about
 		 * exit_mmap() and do_munmap() cases described above:
 		 * we might be resetting SWAP_MAP_MAX too early here.
 		 * We know "Undead"s can happen, they're okay, so don't
 		 * report them; but do report if we reset SWAP_MAP_MAX.
 		 */
-		if (*swap_map == SWAP_MAP_MAX) {
+		/* We might release the lock_page() in unuse_mm(). */
+		if (!PageSwapCache(page) || page_private(page) != entry.val)
+			goto retry;
+
+		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
 			spin_lock(&swap_lock);
-			*swap_map = 1;
+			*swap_map = encode_swapmap(0, true);
 			spin_unlock(&swap_lock);
 			reset_overflow = 1;
 		}
@@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type)
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
 		 */
-		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+		if (swap_count(*swap_map) &&
+		     PageDirty(page) && PageSwapCache(page)) {
 			struct writeback_control wbc = {
 				.sync_mode = WB_SYNC_NONE,
 			};
@@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
+retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val)
  *
  * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
  * "permanent", but will be reclaimed by the next swapoff.
+ * Returns an error code in the following cases:
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is a migration entry -> EINVAL
+ * - a swap-cache reference is requested but there is already one -> EEXIST
+ * - a swap-cache reference is requested but the entry is not used -> ENOENT
  */
-int swap_duplicate(swp_entry_t entry)
+static int __swap_duplicate(swp_entry_t entry, bool cache)
 {
 	struct swap_info_struct * p;
 	unsigned long offset, type;
-	int result = 0;
+	int result = -EINVAL;
+	int count;
+	bool has_cache;
 
 	if (is_migration_entry(entry))
-		return 1;
+		return -EINVAL;
 
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
@@ -1969,17 +2038,40 @@ int swap_duplicate(swp_entry_t entry)
 	offset = swp_offset(entry);
 
 	spin_lock(&swap_lock);
-	if (offset < p->max && p->swap_map[offset]) {
-		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
-			p->swap_map[offset]++;
-			result = 1;
-		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+
+	if (unlikely(offset >= p->max))
+		goto unlock_out;
+
+	count = swap_count(p->swap_map[offset]);
+	has_cache = swap_has_cache(p->swap_map[offset]);
+
+	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+
+		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
+		if (!has_cache && count) {
+			p->swap_map[offset] = encode_swapmap(count, true);
+			result = 0;
+		} else if (has_cache) /* someone added cache */
+			result = -EEXIST;
+		else if (!count) /* no users */
+			result = -ENOENT;
+
+	} else if (count || has_cache) {
+		if (count < SWAP_MAP_MAX - 1) {
+			p->swap_map[offset] = encode_swapmap(count + 1,
+							     has_cache);
+			result = 0;
+		} else if (count <= SWAP_MAP_MAX) {
 			if (swap_overflow++ < 5)
-				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
-			p->swap_map[offset] = SWAP_MAP_MAX;
-			result = 1;
+				printk(KERN_WARNING
+				       "swap_dup: swap entry overflow\n");
+			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
+							     has_cache);
+			result = 0;
 		}
-	}
+	} else
+		result = -ENOENT; /* unused swap entry */
+unlock_out:
 	spin_unlock(&swap_lock);
 out:
 	return result;
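For how these return codes are consumed, compare the read_swap_cache_async() hunk at the top of this diff: 0 means the caller now owns the cache slot, -EEXIST means a racer already attached a cache (retry), and -ENOENT/-EINVAL mean the entry is gone. A userspace model of the SWAP_CACHE branch (cache_prepare() is a hypothetical stand-in for swapcache_prepare(); same assumed bit layout as above):

	#include <assert.h>
	#include <errno.h>
	#include <stdbool.h>

	#define SWAP_HAS_CACHE	0x8000
	#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE & 0xffff)

	/* Model of __swap_duplicate(entry, SWAP_CACHE): set SWAP_HAS_CACHE
	 * only if the entry is in use and not yet cached. */
	static int cache_prepare(unsigned short *ent)
	{
		int count = *ent & SWAP_COUNT_MASK;
		bool has_cache = *ent & SWAP_HAS_CACHE;

		if (!has_cache && count) {
			*ent |= SWAP_HAS_CACHE;
			return 0;
		}
		return has_cache ? -EEXIST : -ENOENT;
	}

	int main(void)
	{
		unsigned short used = 2, cached = SWAP_HAS_CACHE | 1, freed = 0;

		assert(cache_prepare(&used) == 0);		/* we own the cache slot */
		assert(cache_prepare(&cached) == -EEXIST);	/* racer beat us: retry */
		assert(cache_prepare(&freed) == -ENOENT);	/* entry vanished: give up */
		return 0;
	}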
@@ -1988,13 +2080,25 @@ bad_file:
 	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
+/*
+ * Increase reference count of swap entry by 1.
+ */
+void swap_duplicate(swp_entry_t entry)
+{
+	__swap_duplicate(entry, SWAP_MAP);
+}
 
 /*
+ * @entry: swap entry for which we allocate swap cache.
+ *
  * Called when allocating swap cache for an existing swap entry.
+ * This can return error codes. Returns 0 at success.
+ * -EEXIST means there is already a swap cache.
+ * Note: return code is different from swap_duplicate().
  */
 int swapcache_prepare(swp_entry_t entry)
 {
-	return swap_duplicate(entry);
+	return __swap_duplicate(entry, SWAP_CACHE);
 }
 
 
@@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		/* Don't read in free or bad pages */
 		if (!si->swap_map[toff])
 			break;
-		if (si->swap_map[toff] == SWAP_MAP_BAD)
+		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
 			break;
 	}
 	/* Count contiguous allocated slots below our target */
@@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		/* Don't read in free or bad pages */
 		if (!si->swap_map[toff])
 			break;
-		if (si->swap_map[toff] == SWAP_MAP_BAD)
+		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
 			break;
 	}
 	spin_unlock(&swap_lock);