diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-06-16 18:32:53 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-16 22:47:42 -0400 |
commit | 355cfa73ddff2fb8fa14e93bd94a057cc022512e (patch) | |
tree | 7ff70cd56d533070d50b06db6ba0086e8aab0d71 | |
parent | cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 (diff) |
mm: modify swap_map and add SWAP_HAS_CACHE flag
This is part of the patch series fixing memcg's swap accounting leak.
But, IMHO, it is not a bad patch even without memcg.
There are 2 kinds of references to swap.
- reference from swap entry
- reference from swap cache
Then,
- If there is swap cache && swap's refcnt is 1, there is only swap cache.
(*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL
This counting logic has worked well for a long time. But considering
that we cannot tell from swap_map[] whether there is a _real_ reference or not,
the current usage of the counter is not very good.
This patch adds a flag SWAP_HAS_CACHE and records whether a swap
entry has a cache or not. This will remove the -1 magic used in swapfile.c
and help to avoid unnecessary find_get_page() calls.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/swap.h | 14 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 214 |
3 files changed, 172 insertions, 61 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h index 259e96c150ef..fed5e8e1104b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -129,9 +129,10 @@ enum { | |||
129 | 129 | ||
130 | #define SWAP_CLUSTER_MAX 32 | 130 | #define SWAP_CLUSTER_MAX 32 |
131 | 131 | ||
132 | #define SWAP_MAP_MAX 0x7fff | 132 | #define SWAP_MAP_MAX 0x7ffe |
133 | #define SWAP_MAP_BAD 0x8000 | 133 | #define SWAP_MAP_BAD 0x7fff |
134 | 134 | #define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ | |
135 | #define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) | ||
135 | /* | 136 | /* |
136 | * The in-memory structure used to track swap areas. | 137 | * The in-memory structure used to track swap areas. |
137 | */ | 138 | */ |
@@ -281,7 +282,7 @@ extern long total_swap_pages; | |||
281 | extern void si_swapinfo(struct sysinfo *); | 282 | extern void si_swapinfo(struct sysinfo *); |
282 | extern swp_entry_t get_swap_page(void); | 283 | extern swp_entry_t get_swap_page(void); |
283 | extern swp_entry_t get_swap_page_of_type(int); | 284 | extern swp_entry_t get_swap_page_of_type(int); |
284 | extern int swap_duplicate(swp_entry_t); | 285 | extern void swap_duplicate(swp_entry_t); |
285 | extern int swapcache_prepare(swp_entry_t); | 286 | extern int swapcache_prepare(swp_entry_t); |
286 | extern int valid_swaphandles(swp_entry_t, unsigned long *); | 287 | extern int valid_swaphandles(swp_entry_t, unsigned long *); |
287 | extern void swap_free(swp_entry_t); | 288 | extern void swap_free(swp_entry_t); |
@@ -353,9 +354,12 @@ static inline void show_swap_cache_info(void) | |||
353 | } | 354 | } |
354 | 355 | ||
355 | #define free_swap_and_cache(swp) is_migration_entry(swp) | 356 | #define free_swap_and_cache(swp) is_migration_entry(swp) |
356 | #define swap_duplicate(swp) is_migration_entry(swp) | ||
357 | #define swapcache_prepare(swp) is_migration_entry(swp) | 357 | #define swapcache_prepare(swp) is_migration_entry(swp) |
358 | 358 | ||
359 | static inline void swap_duplicate(swp_entry_t swp) | ||
360 | { | ||
361 | } | ||
362 | |||
359 | static inline void swap_free(swp_entry_t swp) | 363 | static inline void swap_free(swp_entry_t swp) |
360 | { | 364 | { |
361 | } | 365 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 19bdf3017a9e..b9ca029673a5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
292 | /* | 292 | /* |
293 | * Swap entry may have been freed since our caller observed it. | 293 | * Swap entry may have been freed since our caller observed it. |
294 | */ | 294 | */ |
295 | if (!swapcache_prepare(entry)) | 295 | err = swapcache_prepare(entry); |
296 | if (err == -EEXIST) /* seems racy */ | ||
297 | continue; | ||
298 | if (err) /* swp entry is obsolete ? */ | ||
296 | break; | 299 | break; |
297 | 300 | ||
298 | /* | 301 | /* |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 3187079903fd..0d7296971ad9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | ||
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | |||
56 | /* | 83 | /* |
57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 84 | * We need this because the bdev->unplug_fn can sleep and we cannot |
58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 85 | * hold swap_lock while calling the unplug_fn. And swap_lock |
@@ -167,7 +194,8 @@ static int wait_for_discard(void *word) | |||
167 | #define SWAPFILE_CLUSTER 256 | 194 | #define SWAPFILE_CLUSTER 256 |
168 | #define LATENCY_LIMIT 256 | 195 | #define LATENCY_LIMIT 256 |
169 | 196 | ||
170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 197 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
198 | int cache) | ||
171 | { | 199 | { |
172 | unsigned long offset; | 200 | unsigned long offset; |
173 | unsigned long scan_base; | 201 | unsigned long scan_base; |
@@ -285,7 +313,10 @@ checks: | |||
285 | si->lowest_bit = si->max; | 313 | si->lowest_bit = si->max; |
286 | si->highest_bit = 0; | 314 | si->highest_bit = 0; |
287 | } | 315 | } |
288 | si->swap_map[offset] = 1; | 316 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
317 | si->swap_map[offset] = encode_swapmap(0, true); | ||
318 | else /* at suspend */ | ||
319 | si->swap_map[offset] = encode_swapmap(1, false); | ||
289 | si->cluster_next = offset + 1; | 320 | si->cluster_next = offset + 1; |
290 | si->flags -= SWP_SCANNING; | 321 | si->flags -= SWP_SCANNING; |
291 | 322 | ||
@@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void) | |||
401 | continue; | 432 | continue; |
402 | 433 | ||
403 | swap_list.next = next; | 434 | swap_list.next = next; |
404 | offset = scan_swap_map(si); | 435 | /* This is called for allocating swap entry for cache */ |
436 | offset = scan_swap_map(si, SWAP_CACHE); | ||
405 | if (offset) { | 437 | if (offset) { |
406 | spin_unlock(&swap_lock); | 438 | spin_unlock(&swap_lock); |
407 | return swp_entry(type, offset); | 439 | return swp_entry(type, offset); |
@@ -415,6 +447,7 @@ noswap: | |||
415 | return (swp_entry_t) {0}; | 447 | return (swp_entry_t) {0}; |
416 | } | 448 | } |
417 | 449 | ||
450 | /* The only caller of this function is now susupend routine */ | ||
418 | swp_entry_t get_swap_page_of_type(int type) | 451 | swp_entry_t get_swap_page_of_type(int type) |
419 | { | 452 | { |
420 | struct swap_info_struct *si; | 453 | struct swap_info_struct *si; |
@@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
424 | si = swap_info + type; | 457 | si = swap_info + type; |
425 | if (si->flags & SWP_WRITEOK) { | 458 | if (si->flags & SWP_WRITEOK) { |
426 | nr_swap_pages--; | 459 | nr_swap_pages--; |
427 | offset = scan_swap_map(si); | 460 | /* This is called for allocating swap entry, not cache */ |
461 | offset = scan_swap_map(si, SWAP_MAP); | ||
428 | if (offset) { | 462 | if (offset) { |
429 | spin_unlock(&swap_lock); | 463 | spin_unlock(&swap_lock); |
430 | return swp_entry(type, offset); | 464 | return swp_entry(type, offset); |
@@ -471,25 +505,38 @@ out: | |||
471 | return NULL; | 505 | return NULL; |
472 | } | 506 | } |
473 | 507 | ||
474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 508 | static int swap_entry_free(struct swap_info_struct *p, |
509 | swp_entry_t ent, int cache) | ||
475 | { | 510 | { |
476 | unsigned long offset = swp_offset(ent); | 511 | unsigned long offset = swp_offset(ent); |
477 | int count = p->swap_map[offset]; | 512 | int count = swap_count(p->swap_map[offset]); |
478 | 513 | bool has_cache; | |
479 | if (count < SWAP_MAP_MAX) { | 514 | |
480 | count--; | 515 | has_cache = swap_has_cache(p->swap_map[offset]); |
481 | p->swap_map[offset] = count; | 516 | |
482 | if (!count) { | 517 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
483 | if (offset < p->lowest_bit) | 518 | if (count < SWAP_MAP_MAX) { |
484 | p->lowest_bit = offset; | 519 | count--; |
485 | if (offset > p->highest_bit) | 520 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
486 | p->highest_bit = offset; | ||
487 | if (p->prio > swap_info[swap_list.next].prio) | ||
488 | swap_list.next = p - swap_info; | ||
489 | nr_swap_pages++; | ||
490 | p->inuse_pages--; | ||
491 | mem_cgroup_uncharge_swap(ent); | ||
492 | } | 521 | } |
522 | } else { /* dropping swap cache flag */ | ||
523 | VM_BUG_ON(!has_cache); | ||
524 | p->swap_map[offset] = encode_swapmap(count, false); | ||
525 | |||
526 | } | ||
527 | /* return code. */ | ||
528 | count = p->swap_map[offset]; | ||
529 | /* free if no reference */ | ||
530 | if (!count) { | ||
531 | if (offset < p->lowest_bit) | ||
532 | p->lowest_bit = offset; | ||
533 | if (offset > p->highest_bit) | ||
534 | p->highest_bit = offset; | ||
535 | if (p->prio > swap_info[swap_list.next].prio) | ||
536 | swap_list.next = p - swap_info; | ||
537 | nr_swap_pages++; | ||
538 | p->inuse_pages--; | ||
539 | mem_cgroup_uncharge_swap(ent); | ||
493 | } | 540 | } |
494 | return count; | 541 | return count; |
495 | } | 542 | } |
@@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry) | |||
504 | 551 | ||
505 | p = swap_info_get(entry); | 552 | p = swap_info_get(entry); |
506 | if (p) { | 553 | if (p) { |
507 | swap_entry_free(p, entry); | 554 | swap_entry_free(p, entry, SWAP_MAP); |
508 | spin_unlock(&swap_lock); | 555 | spin_unlock(&swap_lock); |
509 | } | 556 | } |
510 | } | 557 | } |
@@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry) | |||
514 | */ | 561 | */ |
515 | void swapcache_free(swp_entry_t entry, struct page *page) | 562 | void swapcache_free(swp_entry_t entry, struct page *page) |
516 | { | 563 | { |
564 | struct swap_info_struct *p; | ||
565 | |||
517 | if (page) | 566 | if (page) |
518 | mem_cgroup_uncharge_swapcache(page, entry); | 567 | mem_cgroup_uncharge_swapcache(page, entry); |
519 | return swap_free(entry); | 568 | p = swap_info_get(entry); |
569 | if (p) { | ||
570 | swap_entry_free(p, entry, SWAP_CACHE); | ||
571 | spin_unlock(&swap_lock); | ||
572 | } | ||
573 | return; | ||
520 | } | 574 | } |
521 | 575 | ||
522 | /* | 576 | /* |
@@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page) | |||
531 | entry.val = page_private(page); | 585 | entry.val = page_private(page); |
532 | p = swap_info_get(entry); | 586 | p = swap_info_get(entry); |
533 | if (p) { | 587 | if (p) { |
534 | /* Subtract the 1 for the swap cache itself */ | 588 | count = swap_count(p->swap_map[swp_offset(entry)]); |
535 | count = p->swap_map[swp_offset(entry)] - 1; | ||
536 | spin_unlock(&swap_lock); | 589 | spin_unlock(&swap_lock); |
537 | } | 590 | } |
538 | return count; | 591 | return count; |
@@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
594 | 647 | ||
595 | p = swap_info_get(entry); | 648 | p = swap_info_get(entry); |
596 | if (p) { | 649 | if (p) { |
597 | if (swap_entry_free(p, entry) == 1) { | 650 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
598 | page = find_get_page(&swapper_space, entry.val); | 651 | page = find_get_page(&swapper_space, entry.val); |
599 | if (page && !trylock_page(page)) { | 652 | if (page && !trylock_page(page)) { |
600 | page_cache_release(page); | 653 | page_cache_release(page); |
@@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
901 | i = 1; | 954 | i = 1; |
902 | } | 955 | } |
903 | count = si->swap_map[i]; | 956 | count = si->swap_map[i]; |
904 | if (count && count != SWAP_MAP_BAD) | 957 | if (count && swap_count(count) != SWAP_MAP_BAD) |
905 | break; | 958 | break; |
906 | } | 959 | } |
907 | return i; | 960 | return i; |
@@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type) | |||
1005 | */ | 1058 | */ |
1006 | shmem = 0; | 1059 | shmem = 0; |
1007 | swcount = *swap_map; | 1060 | swcount = *swap_map; |
1008 | if (swcount > 1) { | 1061 | if (swap_count(swcount)) { |
1009 | if (start_mm == &init_mm) | 1062 | if (start_mm == &init_mm) |
1010 | shmem = shmem_unuse(entry, page); | 1063 | shmem = shmem_unuse(entry, page); |
1011 | else | 1064 | else |
1012 | retval = unuse_mm(start_mm, entry, page); | 1065 | retval = unuse_mm(start_mm, entry, page); |
1013 | } | 1066 | } |
1014 | if (*swap_map > 1) { | 1067 | if (swap_count(*swap_map)) { |
1015 | int set_start_mm = (*swap_map >= swcount); | 1068 | int set_start_mm = (*swap_map >= swcount); |
1016 | struct list_head *p = &start_mm->mmlist; | 1069 | struct list_head *p = &start_mm->mmlist; |
1017 | struct mm_struct *new_start_mm = start_mm; | 1070 | struct mm_struct *new_start_mm = start_mm; |
@@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type) | |||
1021 | atomic_inc(&new_start_mm->mm_users); | 1074 | atomic_inc(&new_start_mm->mm_users); |
1022 | atomic_inc(&prev_mm->mm_users); | 1075 | atomic_inc(&prev_mm->mm_users); |
1023 | spin_lock(&mmlist_lock); | 1076 | spin_lock(&mmlist_lock); |
1024 | while (*swap_map > 1 && !retval && !shmem && | 1077 | while (swap_count(*swap_map) && !retval && !shmem && |
1025 | (p = p->next) != &start_mm->mmlist) { | 1078 | (p = p->next) != &start_mm->mmlist) { |
1026 | mm = list_entry(p, struct mm_struct, mmlist); | 1079 | mm = list_entry(p, struct mm_struct, mmlist); |
1027 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1080 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type) | |||
1033 | cond_resched(); | 1086 | cond_resched(); |
1034 | 1087 | ||
1035 | swcount = *swap_map; | 1088 | swcount = *swap_map; |
1036 | if (swcount <= 1) | 1089 | if (!swap_count(swcount)) /* any usage ? */ |
1037 | ; | 1090 | ; |
1038 | else if (mm == &init_mm) { | 1091 | else if (mm == &init_mm) { |
1039 | set_start_mm = 1; | 1092 | set_start_mm = 1; |
1040 | shmem = shmem_unuse(entry, page); | 1093 | shmem = shmem_unuse(entry, page); |
1041 | } else | 1094 | } else |
1042 | retval = unuse_mm(mm, entry, page); | 1095 | retval = unuse_mm(mm, entry, page); |
1043 | if (set_start_mm && *swap_map < swcount) { | 1096 | |
1097 | if (set_start_mm && | ||
1098 | swap_count(*swap_map) < swcount) { | ||
1044 | mmput(new_start_mm); | 1099 | mmput(new_start_mm); |
1045 | atomic_inc(&mm->mm_users); | 1100 | atomic_inc(&mm->mm_users); |
1046 | new_start_mm = mm; | 1101 | new_start_mm = mm; |
@@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type) | |||
1067 | } | 1122 | } |
1068 | 1123 | ||
1069 | /* | 1124 | /* |
1070 | * How could swap count reach 0x7fff when the maximum | 1125 | * How could swap count reach 0x7ffe ? |
1071 | * pid is 0x7fff, and there's no way to repeat a swap | 1126 | * There's no way to repeat a swap page within an mm |
1072 | * page within an mm (except in shmem, where it's the | 1127 | * (except in shmem, where it's the shared object which takes |
1073 | * shared object which takes the reference count)? | 1128 | * the reference count)? |
1074 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1129 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned |
1075 | * | 1130 | * short is too small....) |
1076 | * If that's wrong, then we should worry more about | 1131 | * If that's wrong, then we should worry more about |
1077 | * exit_mmap() and do_munmap() cases described above: | 1132 | * exit_mmap() and do_munmap() cases described above: |
1078 | * we might be resetting SWAP_MAP_MAX too early here. | 1133 | * we might be resetting SWAP_MAP_MAX too early here. |
1079 | * We know "Undead"s can happen, they're okay, so don't | 1134 | * We know "Undead"s can happen, they're okay, so don't |
1080 | * report them; but do report if we reset SWAP_MAP_MAX. | 1135 | * report them; but do report if we reset SWAP_MAP_MAX. |
1081 | */ | 1136 | */ |
1082 | if (*swap_map == SWAP_MAP_MAX) { | 1137 | /* We might release the lock_page() in unuse_mm(). */ |
1138 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1139 | goto retry; | ||
1140 | |||
1141 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1083 | spin_lock(&swap_lock); | 1142 | spin_lock(&swap_lock); |
1084 | *swap_map = 1; | 1143 | *swap_map = encode_swapmap(0, true); |
1085 | spin_unlock(&swap_lock); | 1144 | spin_unlock(&swap_lock); |
1086 | reset_overflow = 1; | 1145 | reset_overflow = 1; |
1087 | } | 1146 | } |
@@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type) | |||
1099 | * pages would be incorrect if swap supported "shared | 1158 | * pages would be incorrect if swap supported "shared |
1100 | * private" pages, but they are handled by tmpfs files. | 1159 | * private" pages, but they are handled by tmpfs files. |
1101 | */ | 1160 | */ |
1102 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1161 | if (swap_count(*swap_map) && |
1162 | PageDirty(page) && PageSwapCache(page)) { | ||
1103 | struct writeback_control wbc = { | 1163 | struct writeback_control wbc = { |
1104 | .sync_mode = WB_SYNC_NONE, | 1164 | .sync_mode = WB_SYNC_NONE, |
1105 | }; | 1165 | }; |
@@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type) | |||
1126 | * mark page dirty so shrink_page_list will preserve it. | 1186 | * mark page dirty so shrink_page_list will preserve it. |
1127 | */ | 1187 | */ |
1128 | SetPageDirty(page); | 1188 | SetPageDirty(page); |
1189 | retry: | ||
1129 | unlock_page(page); | 1190 | unlock_page(page); |
1130 | page_cache_release(page); | 1191 | page_cache_release(page); |
1131 | 1192 | ||
@@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val) | |||
1952 | * | 2013 | * |
1953 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2014 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
1954 | * "permanent", but will be reclaimed by the next swapoff. | 2015 | * "permanent", but will be reclaimed by the next swapoff. |
2016 | * Returns error code in following case. | ||
2017 | * - success -> 0 | ||
2018 | * - swp_entry is invalid -> EINVAL | ||
2019 | * - swp_entry is migration entry -> EINVAL | ||
2020 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
2021 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
1955 | */ | 2022 | */ |
1956 | int swap_duplicate(swp_entry_t entry) | 2023 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
1957 | { | 2024 | { |
1958 | struct swap_info_struct * p; | 2025 | struct swap_info_struct * p; |
1959 | unsigned long offset, type; | 2026 | unsigned long offset, type; |
1960 | int result = 0; | 2027 | int result = -EINVAL; |
2028 | int count; | ||
2029 | bool has_cache; | ||
1961 | 2030 | ||
1962 | if (is_migration_entry(entry)) | 2031 | if (is_migration_entry(entry)) |
1963 | return 1; | 2032 | return -EINVAL; |
1964 | 2033 | ||
1965 | type = swp_type(entry); | 2034 | type = swp_type(entry); |
1966 | if (type >= nr_swapfiles) | 2035 | if (type >= nr_swapfiles) |
@@ -1969,17 +2038,40 @@ int swap_duplicate(swp_entry_t entry) | |||
1969 | offset = swp_offset(entry); | 2038 | offset = swp_offset(entry); |
1970 | 2039 | ||
1971 | spin_lock(&swap_lock); | 2040 | spin_lock(&swap_lock); |
1972 | if (offset < p->max && p->swap_map[offset]) { | 2041 | |
1973 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2042 | if (unlikely(offset >= p->max)) |
1974 | p->swap_map[offset]++; | 2043 | goto unlock_out; |
1975 | result = 1; | 2044 | |
1976 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2045 | count = swap_count(p->swap_map[offset]); |
2046 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
2047 | |||
2048 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
2049 | |||
2050 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
2051 | if (!has_cache && count) { | ||
2052 | p->swap_map[offset] = encode_swapmap(count, true); | ||
2053 | result = 0; | ||
2054 | } else if (has_cache) /* someone added cache */ | ||
2055 | result = -EEXIST; | ||
2056 | else if (!count) /* no users */ | ||
2057 | result = -ENOENT; | ||
2058 | |||
2059 | } else if (count || has_cache) { | ||
2060 | if (count < SWAP_MAP_MAX - 1) { | ||
2061 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
2062 | has_cache); | ||
2063 | result = 0; | ||
2064 | } else if (count <= SWAP_MAP_MAX) { | ||
1977 | if (swap_overflow++ < 5) | 2065 | if (swap_overflow++ < 5) |
1978 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2066 | printk(KERN_WARNING |
1979 | p->swap_map[offset] = SWAP_MAP_MAX; | 2067 | "swap_dup: swap entry overflow\n"); |
1980 | result = 1; | 2068 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2069 | has_cache); | ||
2070 | result = 0; | ||
1981 | } | 2071 | } |
1982 | } | 2072 | } else |
2073 | result = -ENOENT; /* unused swap entry */ | ||
2074 | unlock_out: | ||
1983 | spin_unlock(&swap_lock); | 2075 | spin_unlock(&swap_lock); |
1984 | out: | 2076 | out: |
1985 | return result; | 2077 | return result; |
@@ -1988,13 +2080,25 @@ bad_file: | |||
1988 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2080 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
1989 | goto out; | 2081 | goto out; |
1990 | } | 2082 | } |
2083 | /* | ||
2084 | * increase reference count of swap entry by 1. | ||
2085 | */ | ||
2086 | void swap_duplicate(swp_entry_t entry) | ||
2087 | { | ||
2088 | __swap_duplicate(entry, SWAP_MAP); | ||
2089 | } | ||
1991 | 2090 | ||
1992 | /* | 2091 | /* |
2092 | * @entry: swap entry for which we allocate swap cache. | ||
2093 | * | ||
1993 | * Called when allocating swap cache for exising swap entry, | 2094 | * Called when allocating swap cache for exising swap entry, |
2095 | * This can return error codes. Returns 0 at success. | ||
2096 | * -EBUSY means there is a swap cache. | ||
2097 | * Note: return code is different from swap_duplicate(). | ||
1994 | */ | 2098 | */ |
1995 | int swapcache_prepare(swp_entry_t entry) | 2099 | int swapcache_prepare(swp_entry_t entry) |
1996 | { | 2100 | { |
1997 | return swap_duplicate(entry); | 2101 | return __swap_duplicate(entry, SWAP_CACHE); |
1998 | } | 2102 | } |
1999 | 2103 | ||
2000 | 2104 | ||
@@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2035 | /* Don't read in free or bad pages */ | 2139 | /* Don't read in free or bad pages */ |
2036 | if (!si->swap_map[toff]) | 2140 | if (!si->swap_map[toff]) |
2037 | break; | 2141 | break; |
2038 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2142 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2039 | break; | 2143 | break; |
2040 | } | 2144 | } |
2041 | /* Count contiguous allocated slots below our target */ | 2145 | /* Count contiguous allocated slots below our target */ |
@@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2043 | /* Don't read in free or bad pages */ | 2147 | /* Don't read in free or bad pages */ |
2044 | if (!si->swap_map[toff]) | 2148 | if (!si->swap_map[toff]) |
2045 | break; | 2149 | break; |
2046 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2150 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2047 | break; | 2151 | break; |
2048 | } | 2152 | } |
2049 | spin_unlock(&swap_lock); | 2153 | spin_unlock(&swap_lock); |