diff options
author | Shaohua Li <shli@kernel.org> | 2013-02-22 19:34:38 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 20:50:17 -0500 |
commit | ec8acf20afb8534ed511f6613dd2226b9e301010 (patch) | |
tree | a0d6779eeffa0f523a2799dbb619e0a34fd786d4 | |
parent | 33806f06da654092182410d974b6d3c5396ea3eb (diff) |
swap: add per-partition lock for swapfile
swap_lock is heavily contended when I test swap to 3 fast SSDs (it is even
slightly slower than swap to 2 such SSDs). The main contention comes
from swap_info_get(). This patch tries to fix the gap by adding a new
per-partition lock.
Global data like nr_swapfiles, total_swap_pages, least_priority and
swap_list are still protected by swap_lock.
nr_swap_pages is now an atomic, so it can be changed without swap_lock. In
theory, get_swap_page() may find no swap pages even though free swap pages
actually exist, but that does not sound like a big problem.
Access to partition-specific data (e.g. in scan_swap_map() and so on) is
protected only by swap_info_struct.lock.
Changing swap_info_struct.flags requires holding both swap_lock and
swap_info_struct.lock, because scan_swap_map() checks it. Reading the
flags is OK with either of the locks held.
If both swap_lock and swap_info_struct.lock must be held, we always take
the former first to avoid deadlock.
swap_entry_free() can change swap_list. To delete that code, we add a
new highest_priority_index. Whenever get_swap_page() is called, we
check it. If it's valid, we use it.
It's a pity get_swap_page() still takes swap_lock. But in practice,
swap_lock isn't heavily contended in my test with this patch (or I can
say there are other, much heavier bottlenecks like TLB flush). And
BTW, it looks like get_swap_page() doesn't really need the lock. We never
free swap_info[] and we check the SWAP_WRITEOK flag. The only risk without
the lock is that we could swap out to some low priority swap, but we can
quickly recover after several rounds of swap, so it does not sound like a
big deal to me. But I'd prefer to fix this if it's a real problem.
"swap: make each swap partition have one address_space" improved the
swapout speed from 1.7G/s to 2G/s. This patch further improves the
speed to 2.3G/s, so around 15% improvement. It's a multi-process test,
so TLB flush isn't the biggest bottleneck before the patches.
[arnd@arndb.de: fix it for nommu]
[hughd@google.com: add missing unlock]
[minchan@kernel.org: get rid of lockdep whinge on sys_swapon]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | arch/sparc/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/tile/mm/pgtable.c | 2 | ||||
-rw-r--r-- | include/linux/swap.h | 32 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/swap_state.c | 3 | ||||
-rw-r--r-- | mm/swapfile.c | 154 | ||||
-rw-r--r-- | mm/vmscan.c | 8 |
8 files changed, 145 insertions, 60 deletions
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index dde85ef1c56d..48e0c030e8f5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c | |||
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter) | |||
57 | printk("Mem-info:\n"); | 57 | printk("Mem-info:\n"); |
58 | show_free_areas(filter); | 58 | show_free_areas(filter); |
59 | printk("Free swap: %6ldkB\n", | 59 | printk("Free swap: %6ldkB\n", |
60 | nr_swap_pages << (PAGE_SHIFT-10)); | 60 | get_nr_swap_pages() << (PAGE_SHIFT-10)); |
61 | printk("%ld pages of RAM\n", totalram_pages); | 61 | printk("%ld pages of RAM\n", totalram_pages); |
62 | printk("%ld free pages\n", nr_free_pages()); | 62 | printk("%ld free pages\n", nr_free_pages()); |
63 | } | 63 | } |
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index de0de0c0e8a1..b3b4972c2451 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c | |||
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter) | |||
61 | global_page_state(NR_PAGETABLE), | 61 | global_page_state(NR_PAGETABLE), |
62 | global_page_state(NR_BOUNCE), | 62 | global_page_state(NR_BOUNCE), |
63 | global_page_state(NR_FILE_PAGES), | 63 | global_page_state(NR_FILE_PAGES), |
64 | nr_swap_pages); | 64 | get_nr_swap_pages()); |
65 | 65 | ||
66 | for_each_zone(zone) { | 66 | for_each_zone(zone) { |
67 | unsigned long flags, order, total = 0, largest_order = -1; | 67 | unsigned long flags, order, total = 0, largest_order = -1; |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 235c039892ee..a3e22d357e91 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -202,6 +202,18 @@ struct swap_info_struct { | |||
202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ | 202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ |
203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ | 203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ |
204 | #endif | 204 | #endif |
205 | spinlock_t lock; /* | ||
206 | * protect map scan related fields like | ||
207 | * swap_map, lowest_bit, highest_bit, | ||
208 | * inuse_pages, cluster_next, | ||
209 | * cluster_nr, lowest_alloc and | ||
210 | * highest_alloc. other fields are only | ||
211 | * changed at swapon/swapoff, so are | ||
212 | * protected by swap_lock. changing | ||
213 | * flags need hold this lock and | ||
214 | * swap_lock. If both locks need hold, | ||
215 | * hold swap_lock first. | ||
216 | */ | ||
205 | }; | 217 | }; |
206 | 218 | ||
207 | struct swap_list_t { | 219 | struct swap_list_t { |
@@ -209,9 +221,6 @@ struct swap_list_t { | |||
209 | int next; /* swapfile to be used next */ | 221 | int next; /* swapfile to be used next */ |
210 | }; | 222 | }; |
211 | 223 | ||
212 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
213 | #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) | ||
214 | |||
215 | /* linux/mm/page_alloc.c */ | 224 | /* linux/mm/page_alloc.c */ |
216 | extern unsigned long totalram_pages; | 225 | extern unsigned long totalram_pages; |
217 | extern unsigned long totalreserve_pages; | 226 | extern unsigned long totalreserve_pages; |
@@ -347,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, | |||
347 | struct vm_area_struct *vma, unsigned long addr); | 356 | struct vm_area_struct *vma, unsigned long addr); |
348 | 357 | ||
349 | /* linux/mm/swapfile.c */ | 358 | /* linux/mm/swapfile.c */ |
350 | extern long nr_swap_pages; | 359 | extern atomic_long_t nr_swap_pages; |
351 | extern long total_swap_pages; | 360 | extern long total_swap_pages; |
361 | |||
362 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
363 | static inline bool vm_swap_full(void) | ||
364 | { | ||
365 | return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; | ||
366 | } | ||
367 | |||
368 | static inline long get_nr_swap_pages(void) | ||
369 | { | ||
370 | return atomic_long_read(&nr_swap_pages); | ||
371 | } | ||
372 | |||
352 | extern void si_swapinfo(struct sysinfo *); | 373 | extern void si_swapinfo(struct sysinfo *); |
353 | extern swp_entry_t get_swap_page(void); | 374 | extern swp_entry_t get_swap_page(void); |
354 | extern swp_entry_t get_swap_page_of_type(int); | 375 | extern swp_entry_t get_swap_page_of_type(int); |
@@ -381,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
381 | 402 | ||
382 | #else /* CONFIG_SWAP */ | 403 | #else /* CONFIG_SWAP */ |
383 | 404 | ||
384 | #define nr_swap_pages 0L | 405 | #define get_nr_swap_pages() 0L |
385 | #define total_swap_pages 0L | 406 | #define total_swap_pages 0L |
386 | #define total_swapcache_pages() 0UL | 407 | #define total_swapcache_pages() 0UL |
408 | #define vm_swap_full() 0 | ||
387 | 409 | ||
388 | #define si_swapinfo(val) \ | 410 | #define si_swapinfo(val) \ |
389 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) | 411 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) |
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
144 | */ | 144 | */ |
145 | free -= global_page_state(NR_SHMEM); | 145 | free -= global_page_state(NR_SHMEM); |
146 | 146 | ||
147 | free += nr_swap_pages; | 147 | free += get_nr_swap_pages(); |
148 | 148 | ||
149 | /* | 149 | /* |
150 | * Any slabs which are created with the | 150 | * Any slabs which are created with the |
diff --git a/mm/nommu.c b/mm/nommu.c index 18c1b932e2c4..87854a55829d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1907,7 +1907,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1907 | */ | 1907 | */ |
1908 | free -= global_page_state(NR_SHMEM); | 1908 | free -= global_page_state(NR_SHMEM); |
1909 | 1909 | ||
1910 | free += nr_swap_pages; | 1910 | free += get_nr_swap_pages(); |
1911 | 1911 | ||
1912 | /* | 1912 | /* |
1913 | * Any slabs which are created with the | 1913 | * Any slabs which are created with the |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 8d6644c5d0cc..7efcf1525921 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -69,7 +69,8 @@ void show_swap_cache_info(void) | |||
69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | 69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
70 | swap_cache_info.add_total, swap_cache_info.del_total, | 70 | swap_cache_info.add_total, swap_cache_info.del_total, |
71 | swap_cache_info.find_success, swap_cache_info.find_total); | 71 | swap_cache_info.find_success, swap_cache_info.find_total); |
72 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 72 | printk("Free swap = %ldkB\n", |
73 | get_nr_swap_pages() << (PAGE_SHIFT - 10)); | ||
73 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 74 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
74 | } | 75 | } |
75 | 76 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e51864e6fe8b..9b51266413cd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); | |||
47 | 47 | ||
48 | DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
49 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
50 | long nr_swap_pages; | 50 | atomic_long_t nr_swap_pages; |
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | ||
51 | long total_swap_pages; | 52 | long total_swap_pages; |
52 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
53 | 55 | ||
54 | static const char Bad_file[] = "Bad swap file entry "; | 56 | static const char Bad_file[] = "Bad swap file entry "; |
55 | static const char Unused_file[] = "Unused swap file entry "; | 57 | static const char Unused_file[] = "Unused swap file entry "; |
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
223 | si->lowest_alloc = si->max; | 225 | si->lowest_alloc = si->max; |
224 | si->highest_alloc = 0; | 226 | si->highest_alloc = 0; |
225 | } | 227 | } |
226 | spin_unlock(&swap_lock); | 228 | spin_unlock(&si->lock); |
227 | 229 | ||
228 | /* | 230 | /* |
229 | * If seek is expensive, start searching for new cluster from | 231 | * If seek is expensive, start searching for new cluster from |
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
242 | if (si->swap_map[offset]) | 244 | if (si->swap_map[offset]) |
243 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 245 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
244 | else if (offset == last_in_cluster) { | 246 | else if (offset == last_in_cluster) { |
245 | spin_lock(&swap_lock); | 247 | spin_lock(&si->lock); |
246 | offset -= SWAPFILE_CLUSTER - 1; | 248 | offset -= SWAPFILE_CLUSTER - 1; |
247 | si->cluster_next = offset; | 249 | si->cluster_next = offset; |
248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
263 | if (si->swap_map[offset]) | 265 | if (si->swap_map[offset]) |
264 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 266 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
265 | else if (offset == last_in_cluster) { | 267 | else if (offset == last_in_cluster) { |
266 | spin_lock(&swap_lock); | 268 | spin_lock(&si->lock); |
267 | offset -= SWAPFILE_CLUSTER - 1; | 269 | offset -= SWAPFILE_CLUSTER - 1; |
268 | si->cluster_next = offset; | 270 | si->cluster_next = offset; |
269 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
277 | } | 279 | } |
278 | 280 | ||
279 | offset = scan_base; | 281 | offset = scan_base; |
280 | spin_lock(&swap_lock); | 282 | spin_lock(&si->lock); |
281 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
282 | si->lowest_alloc = 0; | 284 | si->lowest_alloc = 0; |
283 | } | 285 | } |
@@ -293,9 +295,9 @@ checks: | |||
293 | /* reuse swap entry of cache-only swap if not busy. */ | 295 | /* reuse swap entry of cache-only swap if not busy. */ |
294 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 296 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
295 | int swap_was_freed; | 297 | int swap_was_freed; |
296 | spin_unlock(&swap_lock); | 298 | spin_unlock(&si->lock); |
297 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 299 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
298 | spin_lock(&swap_lock); | 300 | spin_lock(&si->lock); |
299 | /* entry was freed successfully, try to use this again */ | 301 | /* entry was freed successfully, try to use this again */ |
300 | if (swap_was_freed) | 302 | if (swap_was_freed) |
301 | goto checks; | 303 | goto checks; |
@@ -335,13 +337,13 @@ checks: | |||
335 | si->lowest_alloc <= last_in_cluster) | 337 | si->lowest_alloc <= last_in_cluster) |
336 | last_in_cluster = si->lowest_alloc - 1; | 338 | last_in_cluster = si->lowest_alloc - 1; |
337 | si->flags |= SWP_DISCARDING; | 339 | si->flags |= SWP_DISCARDING; |
338 | spin_unlock(&swap_lock); | 340 | spin_unlock(&si->lock); |
339 | 341 | ||
340 | if (offset < last_in_cluster) | 342 | if (offset < last_in_cluster) |
341 | discard_swap_cluster(si, offset, | 343 | discard_swap_cluster(si, offset, |
342 | last_in_cluster - offset + 1); | 344 | last_in_cluster - offset + 1); |
343 | 345 | ||
344 | spin_lock(&swap_lock); | 346 | spin_lock(&si->lock); |
345 | si->lowest_alloc = 0; | 347 | si->lowest_alloc = 0; |
346 | si->flags &= ~SWP_DISCARDING; | 348 | si->flags &= ~SWP_DISCARDING; |
347 | 349 | ||
@@ -355,10 +357,10 @@ checks: | |||
355 | * could defer that delay until swap_writepage, | 357 | * could defer that delay until swap_writepage, |
356 | * but it's easier to keep this self-contained. | 358 | * but it's easier to keep this self-contained. |
357 | */ | 359 | */ |
358 | spin_unlock(&swap_lock); | 360 | spin_unlock(&si->lock); |
359 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
360 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 362 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
361 | spin_lock(&swap_lock); | 363 | spin_lock(&si->lock); |
362 | } else { | 364 | } else { |
363 | /* | 365 | /* |
364 | * Note pages allocated by racing tasks while | 366 | * Note pages allocated by racing tasks while |
@@ -374,14 +376,14 @@ checks: | |||
374 | return offset; | 376 | return offset; |
375 | 377 | ||
376 | scan: | 378 | scan: |
377 | spin_unlock(&swap_lock); | 379 | spin_unlock(&si->lock); |
378 | while (++offset <= si->highest_bit) { | 380 | while (++offset <= si->highest_bit) { |
379 | if (!si->swap_map[offset]) { | 381 | if (!si->swap_map[offset]) { |
380 | spin_lock(&swap_lock); | 382 | spin_lock(&si->lock); |
381 | goto checks; | 383 | goto checks; |
382 | } | 384 | } |
383 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 385 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
384 | spin_lock(&swap_lock); | 386 | spin_lock(&si->lock); |
385 | goto checks; | 387 | goto checks; |
386 | } | 388 | } |
387 | if (unlikely(--latency_ration < 0)) { | 389 | if (unlikely(--latency_ration < 0)) { |
@@ -392,11 +394,11 @@ scan: | |||
392 | offset = si->lowest_bit; | 394 | offset = si->lowest_bit; |
393 | while (++offset < scan_base) { | 395 | while (++offset < scan_base) { |
394 | if (!si->swap_map[offset]) { | 396 | if (!si->swap_map[offset]) { |
395 | spin_lock(&swap_lock); | 397 | spin_lock(&si->lock); |
396 | goto checks; | 398 | goto checks; |
397 | } | 399 | } |
398 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 400 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
399 | spin_lock(&swap_lock); | 401 | spin_lock(&si->lock); |
400 | goto checks; | 402 | goto checks; |
401 | } | 403 | } |
402 | if (unlikely(--latency_ration < 0)) { | 404 | if (unlikely(--latency_ration < 0)) { |
@@ -404,7 +406,7 @@ scan: | |||
404 | latency_ration = LATENCY_LIMIT; | 406 | latency_ration = LATENCY_LIMIT; |
405 | } | 407 | } |
406 | } | 408 | } |
407 | spin_lock(&swap_lock); | 409 | spin_lock(&si->lock); |
408 | 410 | ||
409 | no_page: | 411 | no_page: |
410 | si->flags -= SWP_SCANNING; | 412 | si->flags -= SWP_SCANNING; |
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) | |||
417 | pgoff_t offset; | 419 | pgoff_t offset; |
418 | int type, next; | 420 | int type, next; |
419 | int wrapped = 0; | 421 | int wrapped = 0; |
422 | int hp_index; | ||
420 | 423 | ||
421 | spin_lock(&swap_lock); | 424 | spin_lock(&swap_lock); |
422 | if (nr_swap_pages <= 0) | 425 | if (atomic_long_read(&nr_swap_pages) <= 0) |
423 | goto noswap; | 426 | goto noswap; |
424 | nr_swap_pages--; | 427 | atomic_long_dec(&nr_swap_pages); |
425 | 428 | ||
426 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 429 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
430 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
431 | /* | ||
432 | * highest_priority_index records current highest priority swap | ||
433 | * type which just frees swap entries. If its priority is | ||
434 | * higher than that of swap_list.next swap type, we use it. It | ||
435 | * isn't protected by swap_lock, so it can be an invalid value | ||
436 | * if the corresponding swap type is swapoff. We double check | ||
437 | * the flags here. It's even possible the swap type is swapoff | ||
438 | * and swapon again and its priority is changed. In such rare | ||
440 | * case, low priority swap type might be used, but eventually | ||
440 | * high priority swap will be used after several rounds of | ||
441 | * swap. | ||
442 | */ | ||
443 | if (hp_index != -1 && hp_index != type && | ||
444 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
445 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
446 | type = hp_index; | ||
447 | swap_list.next = type; | ||
448 | } | ||
449 | |||
427 | si = swap_info[type]; | 450 | si = swap_info[type]; |
428 | next = si->next; | 451 | next = si->next; |
429 | if (next < 0 || | 452 | if (next < 0 || |
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) | |||
432 | wrapped++; | 455 | wrapped++; |
433 | } | 456 | } |
434 | 457 | ||
435 | if (!si->highest_bit) | 458 | spin_lock(&si->lock); |
459 | if (!si->highest_bit) { | ||
460 | spin_unlock(&si->lock); | ||
436 | continue; | 461 | continue; |
437 | if (!(si->flags & SWP_WRITEOK)) | 462 | } |
463 | if (!(si->flags & SWP_WRITEOK)) { | ||
464 | spin_unlock(&si->lock); | ||
438 | continue; | 465 | continue; |
466 | } | ||
439 | 467 | ||
440 | swap_list.next = next; | 468 | swap_list.next = next; |
469 | |||
470 | spin_unlock(&swap_lock); | ||
441 | /* This is called for allocating swap entry for cache */ | 471 | /* This is called for allocating swap entry for cache */ |
442 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 472 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
443 | if (offset) { | 473 | spin_unlock(&si->lock); |
444 | spin_unlock(&swap_lock); | 474 | if (offset) |
445 | return swp_entry(type, offset); | 475 | return swp_entry(type, offset); |
446 | } | 476 | spin_lock(&swap_lock); |
447 | next = swap_list.next; | 477 | next = swap_list.next; |
448 | } | 478 | } |
449 | 479 | ||
450 | nr_swap_pages++; | 480 | atomic_long_inc(&nr_swap_pages); |
451 | noswap: | 481 | noswap: |
452 | spin_unlock(&swap_lock); | 482 | spin_unlock(&swap_lock); |
453 | return (swp_entry_t) {0}; | 483 | return (swp_entry_t) {0}; |
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) | |||
459 | struct swap_info_struct *si; | 489 | struct swap_info_struct *si; |
460 | pgoff_t offset; | 490 | pgoff_t offset; |
461 | 491 | ||
462 | spin_lock(&swap_lock); | ||
463 | si = swap_info[type]; | 492 | si = swap_info[type]; |
493 | spin_lock(&si->lock); | ||
464 | if (si && (si->flags & SWP_WRITEOK)) { | 494 | if (si && (si->flags & SWP_WRITEOK)) { |
465 | nr_swap_pages--; | 495 | atomic_long_dec(&nr_swap_pages); |
466 | /* This is called for allocating swap entry, not cache */ | 496 | /* This is called for allocating swap entry, not cache */ |
467 | offset = scan_swap_map(si, 1); | 497 | offset = scan_swap_map(si, 1); |
468 | if (offset) { | 498 | if (offset) { |
469 | spin_unlock(&swap_lock); | 499 | spin_unlock(&si->lock); |
470 | return swp_entry(type, offset); | 500 | return swp_entry(type, offset); |
471 | } | 501 | } |
472 | nr_swap_pages++; | 502 | atomic_long_inc(&nr_swap_pages); |
473 | } | 503 | } |
474 | spin_unlock(&swap_lock); | 504 | spin_unlock(&si->lock); |
475 | return (swp_entry_t) {0}; | 505 | return (swp_entry_t) {0}; |
476 | } | 506 | } |
477 | 507 | ||
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
493 | goto bad_offset; | 523 | goto bad_offset; |
494 | if (!p->swap_map[offset]) | 524 | if (!p->swap_map[offset]) |
495 | goto bad_free; | 525 | goto bad_free; |
496 | spin_lock(&swap_lock); | 526 | spin_lock(&p->lock); |
497 | return p; | 527 | return p; |
498 | 528 | ||
499 | bad_free: | 529 | bad_free: |
@@ -511,6 +541,27 @@ out: | |||
511 | return NULL; | 541 | return NULL; |
512 | } | 542 | } |
513 | 543 | ||
544 | /* | ||
545 | * This swap type frees swap entry, check if it is the highest priority swap | ||
546 | * type which just frees swap entry. get_swap_page() uses | ||
547 | * highest_priority_index to search highest priority swap type. The | ||
548 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
549 | * active, so we use atomic_cmpxchg. | ||
550 | */ | ||
551 | static void set_highest_priority_index(int type) | ||
552 | { | ||
553 | int old_hp_index, new_hp_index; | ||
554 | |||
555 | do { | ||
556 | old_hp_index = atomic_read(&highest_priority_index); | ||
557 | if (old_hp_index != -1 && | ||
558 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
559 | break; | ||
560 | new_hp_index = type; | ||
561 | } while (atomic_cmpxchg(&highest_priority_index, | ||
562 | old_hp_index, new_hp_index) != old_hp_index); | ||
563 | } | ||
564 | |||
514 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 565 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
515 | swp_entry_t entry, unsigned char usage) | 566 | swp_entry_t entry, unsigned char usage) |
516 | { | 567 | { |
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
553 | p->lowest_bit = offset; | 604 | p->lowest_bit = offset; |
554 | if (offset > p->highest_bit) | 605 | if (offset > p->highest_bit) |
555 | p->highest_bit = offset; | 606 | p->highest_bit = offset; |
556 | if (swap_list.next >= 0 && | 607 | set_highest_priority_index(p->type); |
557 | p->prio > swap_info[swap_list.next]->prio) | 608 | atomic_long_inc(&nr_swap_pages); |
558 | swap_list.next = p->type; | ||
559 | nr_swap_pages++; | ||
560 | p->inuse_pages--; | 609 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | 610 | frontswap_invalidate_page(p->type, offset); |
562 | if (p->flags & SWP_BLKDEV) { | 611 | if (p->flags & SWP_BLKDEV) { |
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) | |||
581 | p = swap_info_get(entry); | 630 | p = swap_info_get(entry); |
582 | if (p) { | 631 | if (p) { |
583 | swap_entry_free(p, entry, 1); | 632 | swap_entry_free(p, entry, 1); |
584 | spin_unlock(&swap_lock); | 633 | spin_unlock(&p->lock); |
585 | } | 634 | } |
586 | } | 635 | } |
587 | 636 | ||
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
598 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); | 647 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
599 | if (page) | 648 | if (page) |
600 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); | 649 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
601 | spin_unlock(&swap_lock); | 650 | spin_unlock(&p->lock); |
602 | } | 651 | } |
603 | } | 652 | } |
604 | 653 | ||
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page) | |||
617 | p = swap_info_get(entry); | 666 | p = swap_info_get(entry); |
618 | if (p) { | 667 | if (p) { |
619 | count = swap_count(p->swap_map[swp_offset(entry)]); | 668 | count = swap_count(p->swap_map[swp_offset(entry)]); |
620 | spin_unlock(&swap_lock); | 669 | spin_unlock(&p->lock); |
621 | } | 670 | } |
622 | return count; | 671 | return count; |
623 | } | 672 | } |
@@ -706,7 +755,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
706 | page = NULL; | 755 | page = NULL; |
707 | } | 756 | } |
708 | } | 757 | } |
709 | spin_unlock(&swap_lock); | 758 | spin_unlock(&p->lock); |
710 | } | 759 | } |
711 | if (page) { | 760 | if (page) { |
712 | /* | 761 | /* |
@@ -804,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) | |||
804 | if ((unsigned int)type < nr_swapfiles) { | 853 | if ((unsigned int)type < nr_swapfiles) { |
805 | struct swap_info_struct *sis = swap_info[type]; | 854 | struct swap_info_struct *sis = swap_info[type]; |
806 | 855 | ||
856 | spin_lock(&sis->lock); | ||
807 | if (sis->flags & SWP_WRITEOK) { | 857 | if (sis->flags & SWP_WRITEOK) { |
808 | n = sis->pages; | 858 | n = sis->pages; |
809 | if (free) | 859 | if (free) |
810 | n -= sis->inuse_pages; | 860 | n -= sis->inuse_pages; |
811 | } | 861 | } |
862 | spin_unlock(&sis->lock); | ||
812 | } | 863 | } |
813 | spin_unlock(&swap_lock); | 864 | spin_unlock(&swap_lock); |
814 | return n; | 865 | return n; |
@@ -1457,7 +1508,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1457 | p->swap_map = swap_map; | 1508 | p->swap_map = swap_map; |
1458 | frontswap_map_set(p, frontswap_map); | 1509 | frontswap_map_set(p, frontswap_map); |
1459 | p->flags |= SWP_WRITEOK; | 1510 | p->flags |= SWP_WRITEOK; |
1460 | nr_swap_pages += p->pages; | 1511 | atomic_long_add(p->pages, &nr_swap_pages); |
1461 | total_swap_pages += p->pages; | 1512 | total_swap_pages += p->pages; |
1462 | 1513 | ||
1463 | /* insert swap space into swap_list: */ | 1514 | /* insert swap space into swap_list: */ |
@@ -1479,15 +1530,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1479 | unsigned long *frontswap_map) | 1530 | unsigned long *frontswap_map) |
1480 | { | 1531 | { |
1481 | spin_lock(&swap_lock); | 1532 | spin_lock(&swap_lock); |
1533 | spin_lock(&p->lock); | ||
1482 | _enable_swap_info(p, prio, swap_map, frontswap_map); | 1534 | _enable_swap_info(p, prio, swap_map, frontswap_map); |
1483 | frontswap_init(p->type); | 1535 | frontswap_init(p->type); |
1536 | spin_unlock(&p->lock); | ||
1484 | spin_unlock(&swap_lock); | 1537 | spin_unlock(&swap_lock); |
1485 | } | 1538 | } |
1486 | 1539 | ||
1487 | static void reinsert_swap_info(struct swap_info_struct *p) | 1540 | static void reinsert_swap_info(struct swap_info_struct *p) |
1488 | { | 1541 | { |
1489 | spin_lock(&swap_lock); | 1542 | spin_lock(&swap_lock); |
1543 | spin_lock(&p->lock); | ||
1490 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1544 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1545 | spin_unlock(&p->lock); | ||
1491 | spin_unlock(&swap_lock); | 1546 | spin_unlock(&swap_lock); |
1492 | } | 1547 | } |
1493 | 1548 | ||
@@ -1547,14 +1602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1547 | /* just pick something that's safe... */ | 1602 | /* just pick something that's safe... */ |
1548 | swap_list.next = swap_list.head; | 1603 | swap_list.next = swap_list.head; |
1549 | } | 1604 | } |
1605 | spin_lock(&p->lock); | ||
1550 | if (p->prio < 0) { | 1606 | if (p->prio < 0) { |
1551 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1607 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1552 | swap_info[i]->prio = p->prio--; | 1608 | swap_info[i]->prio = p->prio--; |
1553 | least_priority++; | 1609 | least_priority++; |
1554 | } | 1610 | } |
1555 | nr_swap_pages -= p->pages; | 1611 | atomic_long_sub(p->pages, &nr_swap_pages); |
1556 | total_swap_pages -= p->pages; | 1612 | total_swap_pages -= p->pages; |
1557 | p->flags &= ~SWP_WRITEOK; | 1613 | p->flags &= ~SWP_WRITEOK; |
1614 | spin_unlock(&p->lock); | ||
1558 | spin_unlock(&swap_lock); | 1615 | spin_unlock(&swap_lock); |
1559 | 1616 | ||
1560 | set_current_oom_origin(); | 1617 | set_current_oom_origin(); |
@@ -1573,14 +1630,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1573 | 1630 | ||
1574 | mutex_lock(&swapon_mutex); | 1631 | mutex_lock(&swapon_mutex); |
1575 | spin_lock(&swap_lock); | 1632 | spin_lock(&swap_lock); |
1633 | spin_lock(&p->lock); | ||
1576 | drain_mmlist(); | 1634 | drain_mmlist(); |
1577 | 1635 | ||
1578 | /* wait for anyone still in scan_swap_map */ | 1636 | /* wait for anyone still in scan_swap_map */ |
1579 | p->highest_bit = 0; /* cuts scans short */ | 1637 | p->highest_bit = 0; /* cuts scans short */ |
1580 | while (p->flags >= SWP_SCANNING) { | 1638 | while (p->flags >= SWP_SCANNING) { |
1639 | spin_unlock(&p->lock); | ||
1581 | spin_unlock(&swap_lock); | 1640 | spin_unlock(&swap_lock); |
1582 | schedule_timeout_uninterruptible(1); | 1641 | schedule_timeout_uninterruptible(1); |
1583 | spin_lock(&swap_lock); | 1642 | spin_lock(&swap_lock); |
1643 | spin_lock(&p->lock); | ||
1584 | } | 1644 | } |
1585 | 1645 | ||
1586 | swap_file = p->swap_file; | 1646 | swap_file = p->swap_file; |
@@ -1590,6 +1650,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1590 | p->swap_map = NULL; | 1650 | p->swap_map = NULL; |
1591 | p->flags = 0; | 1651 | p->flags = 0; |
1592 | frontswap_invalidate_area(type); | 1652 | frontswap_invalidate_area(type); |
1653 | spin_unlock(&p->lock); | ||
1593 | spin_unlock(&swap_lock); | 1654 | spin_unlock(&swap_lock); |
1594 | mutex_unlock(&swapon_mutex); | 1655 | mutex_unlock(&swapon_mutex); |
1595 | vfree(swap_map); | 1656 | vfree(swap_map); |
@@ -1795,6 +1856,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
1795 | p->flags = SWP_USED; | 1856 | p->flags = SWP_USED; |
1796 | p->next = -1; | 1857 | p->next = -1; |
1797 | spin_unlock(&swap_lock); | 1858 | spin_unlock(&swap_lock); |
1859 | spin_lock_init(&p->lock); | ||
1798 | 1860 | ||
1799 | return p; | 1861 | return p; |
1800 | } | 1862 | } |
@@ -2117,7 +2179,7 @@ void si_swapinfo(struct sysinfo *val) | |||
2117 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) | 2179 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2118 | nr_to_be_unused += si->inuse_pages; | 2180 | nr_to_be_unused += si->inuse_pages; |
2119 | } | 2181 | } |
2120 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2182 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
2121 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2183 | val->totalswap = total_swap_pages + nr_to_be_unused; |
2122 | spin_unlock(&swap_lock); | 2184 | spin_unlock(&swap_lock); |
2123 | } | 2185 | } |
@@ -2150,7 +2212,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2150 | p = swap_info[type]; | 2212 | p = swap_info[type]; |
2151 | offset = swp_offset(entry); | 2213 | offset = swp_offset(entry); |
2152 | 2214 | ||
2153 | spin_lock(&swap_lock); | 2215 | spin_lock(&p->lock); |
2154 | if (unlikely(offset >= p->max)) | 2216 | if (unlikely(offset >= p->max)) |
2155 | goto unlock_out; | 2217 | goto unlock_out; |
2156 | 2218 | ||
@@ -2185,7 +2247,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2185 | p->swap_map[offset] = count | has_cache; | 2247 | p->swap_map[offset] = count | has_cache; |
2186 | 2248 | ||
2187 | unlock_out: | 2249 | unlock_out: |
2188 | spin_unlock(&swap_lock); | 2250 | spin_unlock(&p->lock); |
2189 | out: | 2251 | out: |
2190 | return err; | 2252 | return err; |
2191 | 2253 | ||
@@ -2310,7 +2372,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2310 | } | 2372 | } |
2311 | 2373 | ||
2312 | if (!page) { | 2374 | if (!page) { |
2313 | spin_unlock(&swap_lock); | 2375 | spin_unlock(&si->lock); |
2314 | return -ENOMEM; | 2376 | return -ENOMEM; |
2315 | } | 2377 | } |
2316 | 2378 | ||
@@ -2358,7 +2420,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2358 | list_add_tail(&page->lru, &head->lru); | 2420 | list_add_tail(&page->lru, &head->lru); |
2359 | page = NULL; /* now it's attached, don't free it */ | 2421 | page = NULL; /* now it's attached, don't free it */ |
2360 | out: | 2422 | out: |
2361 | spin_unlock(&swap_lock); | 2423 | spin_unlock(&si->lock); |
2362 | outer: | 2424 | outer: |
2363 | if (page) | 2425 | if (page) |
2364 | __free_page(page); | 2426 | __free_page(page); |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |||
index a68fa20269d9..b7d8015a6d54 100644 | |||
--- a/mm/vmscan.c | |||
+++ b/mm/vmscan.c | |||
@@ -1684,7 +1684,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1684 | force_scan = true; | 1684 | force_scan = true; |
1685 | 1685 | ||
1686 | /* If we have no swap space, do not bother scanning anon pages. */ | 1686 | /* If we have no swap space, do not bother scanning anon pages. */ |
1687 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1687 | if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { |
1688 | scan_balance = SCAN_FILE; | 1688 | scan_balance = SCAN_FILE; |
1689 | goto out; | 1689 | goto out; |
1690 | } | 1690 | } |
@@ -1933,7 +1933,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
1933 | */ | 1933 | */ |
1934 | pages_for_compaction = (2UL << sc->order); | 1934 | pages_for_compaction = (2UL << sc->order); |
1935 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); | 1935 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); |
1936 | if (nr_swap_pages > 0) | 1936 | if (get_nr_swap_pages() > 0) |
1937 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); | 1937 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); |
1938 | if (sc->nr_reclaimed < pages_for_compaction && | 1938 | if (sc->nr_reclaimed < pages_for_compaction && |
1939 | inactive_lru_pages > pages_for_compaction) | 1939 | inactive_lru_pages > pages_for_compaction) |
@@ -3085,7 +3085,7 @@ unsigned long global_reclaimable_pages(void) | |||
3085 | nr = global_page_state(NR_ACTIVE_FILE) + | 3085 | nr = global_page_state(NR_ACTIVE_FILE) + |
3086 | global_page_state(NR_INACTIVE_FILE); | 3086 | global_page_state(NR_INACTIVE_FILE); |
3087 | 3087 | ||
3088 | if (nr_swap_pages > 0) | 3088 | if (get_nr_swap_pages() > 0) |
3089 | nr += global_page_state(NR_ACTIVE_ANON) + | 3089 | nr += global_page_state(NR_ACTIVE_ANON) + |
3090 | global_page_state(NR_INACTIVE_ANON); | 3090 | global_page_state(NR_INACTIVE_ANON); |
3091 | 3091 | ||
@@ -3099,7 +3099,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
3099 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 3099 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
3100 | zone_page_state(zone, NR_INACTIVE_FILE); | 3100 | zone_page_state(zone, NR_INACTIVE_FILE); |
3101 | 3101 | ||
3102 | if (nr_swap_pages > 0) | 3102 | if (get_nr_swap_pages() > 0) |
3103 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 3103 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + |
3104 | zone_page_state(zone, NR_INACTIVE_ANON); | 3104 | zone_page_state(zone, NR_INACTIVE_ANON); |
3105 | 3105 | ||