author    Shaohua Li <shli@kernel.org>    2013-02-22 19:34:38 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-02-23 20:50:17 -0500
commit    ec8acf20afb8534ed511f6613dd2226b9e301010 (patch)
tree      a0d6779eeffa0f523a2799dbb619e0a34fd786d4
parent    33806f06da654092182410d974b6d3c5396ea3eb (diff)
swap: add per-partition lock for swapfile
swap_lock is heavily contended when I test swap to 3 fast SSDs (even slightly slower than swap to 2 such SSDs). The main contention comes from swap_info_get(). This patch tries to fix that by adding a new per-partition lock.

Global data like nr_swapfiles, total_swap_pages, least_priority and swap_list are still protected by swap_lock.

nr_swap_pages is an atomic now, so it can be changed without swap_lock. In theory it's possible that get_swap_page() finds no swap pages even though free swap pages exist, but that doesn't sound like a big problem.

Accessing partition-specific data (like scan_swap_map() and so on) is only protected by swap_info_struct.lock.

Changing swap_info_struct.flags requires holding both swap_lock and swap_info_struct.lock, because scan_swap_map() will check it. Reading the flags is OK with either of the locks held. If both swap_lock and swap_info_struct.lock must be held, we always take the former first to avoid deadlock.

swap_entry_free() can change swap_list. To delete that code, we add a new highest_priority_index. Whenever get_swap_page() is called, we check it; if it's valid, we use it.

It's a pity get_swap_page() still holds swap_lock. But in practice swap_lock isn't heavily contended in my test with this patch (or I can say there are other much heavier bottlenecks like TLB flush). And by the way, it looks like get_swap_page() doesn't really need the lock: we never free swap_info[] and we check the SWAP_WRITEOK flag. The only risk without the lock is that we could swap out to some low-priority swap device, but we can quickly recover after several rounds of swap, so it doesn't sound like a big deal to me. But I'd prefer to fix this if it's a real problem.

"swap: make each swap partition have one address_space" improved the swapout speed from 1.7G/s to 2G/s. This patch further improves the speed to 2.3G/s, so around a 15% improvement. It's a multi-process test, so TLB flush isn't the biggest bottleneck before the patches.

[arnd@arndb.de: fix it for nommu]
[hughd@google.com: add missing unlock]
[minchan@kernel.org: get rid of lockdep whinge on sys_swapon]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
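For illustration only (this sketch is not part of the patch): the lock-ordering rule described above, rendered as a minimal userspace C sketch with pthread mutexes standing in for swap_lock and swap_info_struct.lock. The struct and function names here are hypothetical; the point is simply that whenever both locks are needed, the global lock is taken first and released last.

    /* Illustrative sketch only -- simplified stand-ins, not kernel code. */
    #include <pthread.h>

    static pthread_mutex_t swap_lock = PTHREAD_MUTEX_INITIALIZER; /* global lock */

    struct swap_info {                     /* hypothetical per-partition state */
            pthread_mutex_t lock;          /* protects map-scan fields */
            unsigned int flags;
    };

    /* Changing flags needs both locks; reading them needs either one. */
    static void set_flags(struct swap_info *si, unsigned int bits)
    {
            pthread_mutex_lock(&swap_lock);    /* 1: global swap_lock first */
            pthread_mutex_lock(&si->lock);     /* 2: then the partition lock */
            si->flags |= bits;
            pthread_mutex_unlock(&si->lock);   /* release in reverse order */
            pthread_mutex_unlock(&swap_lock);
    }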
-rw-r--r--  arch/sparc/mm/init_32.c     2
-rw-r--r--  arch/tile/mm/pgtable.c      2
-rw-r--r--  include/linux/swap.h       32
-rw-r--r--  mm/mmap.c                   2
-rw-r--r--  mm/nommu.c                  2
-rw-r--r--  mm/swap_state.c             3
-rw-r--r--  mm/swapfile.c             154
-rw-r--r--  mm/vmscan.c                 8
8 files changed, 145 insertions, 60 deletions
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index dde85ef1c56d..48e0c030e8f5 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
         printk("Mem-info:\n");
         show_free_areas(filter);
         printk("Free swap: %6ldkB\n",
-               nr_swap_pages << (PAGE_SHIFT-10));
+               get_nr_swap_pages() << (PAGE_SHIFT-10));
         printk("%ld pages of RAM\n", totalram_pages);
         printk("%ld free pages\n", nr_free_pages());
 }
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index de0de0c0e8a1..b3b4972c2451 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
                global_page_state(NR_FILE_PAGES),
-               nr_swap_pages);
+               get_nr_swap_pages());
 
         for_each_zone(zone) {
                 unsigned long flags, order, total = 0, largest_order = -1;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 235c039892ee..a3e22d357e91 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -202,6 +202,18 @@ struct swap_info_struct {
         unsigned long *frontswap_map;  /* frontswap in-use, one bit per page */
         atomic_t frontswap_pages;      /* frontswap pages in-use counter */
 #endif
+        spinlock_t lock;               /*
+                                        * protect map scan related fields like
+                                        * swap_map, lowest_bit, highest_bit,
+                                        * inuse_pages, cluster_next,
+                                        * cluster_nr, lowest_alloc and
+                                        * highest_alloc. other fields are only
+                                        * changed at swapon/swapoff, so are
+                                        * protected by swap_lock. changing
+                                        * flags need hold this lock and
+                                        * swap_lock. If both locks need hold,
+                                        * hold swap_lock first.
+                                        */
 };
 
 struct swap_list_t {
@@ -209,9 +221,6 @@ struct swap_list_t {
         int next;      /* swapfile to be used next */
 };
 
-/* Swap 50% full? Release swapcache more aggressively.. */
-#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
@@ -347,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
                         struct vm_area_struct *vma, unsigned long addr);
 
 /* linux/mm/swapfile.c */
-extern long nr_swap_pages;
+extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+
+/* Swap 50% full? Release swapcache more aggressively.. */
+static inline bool vm_swap_full(void)
+{
+        return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
+}
+
+static inline long get_nr_swap_pages(void)
+{
+        return atomic_long_read(&nr_swap_pages);
+}
+
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -381,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 #else /* CONFIG_SWAP */
 
-#define nr_swap_pages                          0L
+#define get_nr_swap_pages()                    0L
 #define total_swap_pages                       0L
 #define total_swapcache_pages()                0UL
+#define vm_swap_full()                         0
 
 #define si_swapinfo(val) \
         do { (val)->freeswap = (val)->totalswap = 0; } while (0)
diff --git a/mm/mmap.c b/mm/mmap.c
index 44bb4d869884..28416f6b8dd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
          */
         free -= global_page_state(NR_SHMEM);
 
-        free += nr_swap_pages;
+        free += get_nr_swap_pages();
 
         /*
          * Any slabs which are created with the
diff --git a/mm/nommu.c b/mm/nommu.c
index 18c1b932e2c4..87854a55829d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1907,7 +1907,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
          */
         free -= global_page_state(NR_SHMEM);
 
-        free += nr_swap_pages;
+        free += get_nr_swap_pages();
 
         /*
          * Any slabs which are created with the
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d6644c5d0cc..7efcf1525921 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -69,7 +69,8 @@ void show_swap_cache_info(void)
69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
70 swap_cache_info.add_total, swap_cache_info.del_total, 70 swap_cache_info.add_total, swap_cache_info.del_total,
71 swap_cache_info.find_success, swap_cache_info.find_total); 71 swap_cache_info.find_success, swap_cache_info.find_total);
72 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 72 printk("Free swap = %ldkB\n",
73 get_nr_swap_pages() << (PAGE_SHIFT - 10));
73 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 74 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
74} 75}
75 76
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e51864e6fe8b..9b51266413cd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
-long nr_swap_pages;
+atomic_long_t nr_swap_pages;
+/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
 static int least_priority;
+static atomic_t highest_priority_index = ATOMIC_INIT(-1);
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                 si->lowest_alloc = si->max;
                 si->highest_alloc = 0;
         }
-        spin_unlock(&swap_lock);
+        spin_unlock(&si->lock);
 
         /*
          * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                         if (si->swap_map[offset])
                                 last_in_cluster = offset + SWAPFILE_CLUSTER;
                         else if (offset == last_in_cluster) {
-                                spin_lock(&swap_lock);
+                                spin_lock(&si->lock);
                                 offset -= SWAPFILE_CLUSTER - 1;
                                 si->cluster_next = offset;
                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                         if (si->swap_map[offset])
                                 last_in_cluster = offset + SWAPFILE_CLUSTER;
                         else if (offset == last_in_cluster) {
-                                spin_lock(&swap_lock);
+                                spin_lock(&si->lock);
                                 offset -= SWAPFILE_CLUSTER - 1;
                                 si->cluster_next = offset;
                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                 }
 
                 offset = scan_base;
-                spin_lock(&swap_lock);
+                spin_lock(&si->lock);
                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
                 si->lowest_alloc = 0;
         }
@@ -293,9 +295,9 @@ checks:
         /* reuse swap entry of cache-only swap if not busy. */
         if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                 int swap_was_freed;
-                spin_unlock(&swap_lock);
+                spin_unlock(&si->lock);
                 swap_was_freed = __try_to_reclaim_swap(si, offset);
-                spin_lock(&swap_lock);
+                spin_lock(&si->lock);
                 /* entry was freed successfully, try to use this again */
                 if (swap_was_freed)
                         goto checks;
@@ -335,13 +337,13 @@ checks:
                     si->lowest_alloc <= last_in_cluster)
                         last_in_cluster = si->lowest_alloc - 1;
                 si->flags |= SWP_DISCARDING;
-                spin_unlock(&swap_lock);
+                spin_unlock(&si->lock);
 
                 if (offset < last_in_cluster)
                         discard_swap_cluster(si, offset,
                                         last_in_cluster - offset + 1);
 
-                spin_lock(&swap_lock);
+                spin_lock(&si->lock);
                 si->lowest_alloc = 0;
                 si->flags &= ~SWP_DISCARDING;
 
@@ -355,10 +357,10 @@ checks:
                          * could defer that delay until swap_writepage,
                          * but it's easier to keep this self-contained.
                          */
-                        spin_unlock(&swap_lock);
+                        spin_unlock(&si->lock);
                         wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
                                 wait_for_discard, TASK_UNINTERRUPTIBLE);
-                        spin_lock(&swap_lock);
+                        spin_lock(&si->lock);
                 } else {
                         /*
                          * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
         return offset;
 
 scan:
-        spin_unlock(&swap_lock);
+        spin_unlock(&si->lock);
         while (++offset <= si->highest_bit) {
                 if (!si->swap_map[offset]) {
-                        spin_lock(&swap_lock);
+                        spin_lock(&si->lock);
                         goto checks;
                 }
                 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
-                        spin_lock(&swap_lock);
+                        spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
         offset = si->lowest_bit;
         while (++offset < scan_base) {
                 if (!si->swap_map[offset]) {
-                        spin_lock(&swap_lock);
+                        spin_lock(&si->lock);
                         goto checks;
                 }
                 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
-                        spin_lock(&swap_lock);
+                        spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
                         latency_ration = LATENCY_LIMIT;
                 }
         }
-        spin_lock(&swap_lock);
+        spin_lock(&si->lock);
 
 no_page:
         si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
         pgoff_t offset;
         int type, next;
         int wrapped = 0;
+        int hp_index;
 
         spin_lock(&swap_lock);
-        if (nr_swap_pages <= 0)
+        if (atomic_long_read(&nr_swap_pages) <= 0)
                 goto noswap;
-        nr_swap_pages--;
+        atomic_long_dec(&nr_swap_pages);
 
         for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+                hp_index = atomic_xchg(&highest_priority_index, -1);
+                /*
+                 * highest_priority_index records current highest priority swap
+                 * type which just frees swap entries. If its priority is
+                 * higher than that of swap_list.next swap type, we use it. It
+                 * isn't protected by swap_lock, so it can be an invalid value
+                 * if the corresponding swap type is swapoff. We double check
+                 * the flags here. It's even possible the swap type is swapoff
+                 * and swapon again and its priority is changed. In such rare
+                 * case, low prority swap type might be used, but eventually
+                 * high priority swap will be used after several rounds of
+                 * swap.
+                 */
+                if (hp_index != -1 && hp_index != type &&
+                    swap_info[type]->prio < swap_info[hp_index]->prio &&
+                    (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+                        type = hp_index;
+                        swap_list.next = type;
+                }
+
                 si = swap_info[type];
                 next = si->next;
                 if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
                         wrapped++;
                 }
 
-                if (!si->highest_bit)
+                spin_lock(&si->lock);
+                if (!si->highest_bit) {
+                        spin_unlock(&si->lock);
                         continue;
-                if (!(si->flags & SWP_WRITEOK))
+                }
+                if (!(si->flags & SWP_WRITEOK)) {
+                        spin_unlock(&si->lock);
                         continue;
+                }
 
                 swap_list.next = next;
+
+                spin_unlock(&swap_lock);
                 /* This is called for allocating swap entry for cache */
                 offset = scan_swap_map(si, SWAP_HAS_CACHE);
-                if (offset) {
-                        spin_unlock(&swap_lock);
-                        return swp_entry(type, offset);
-                }
+                spin_unlock(&si->lock);
+                if (offset)
+                        return swp_entry(type, offset);
+                spin_lock(&swap_lock);
                 next = swap_list.next;
         }
 
-        nr_swap_pages++;
+        atomic_long_inc(&nr_swap_pages);
 noswap:
         spin_unlock(&swap_lock);
         return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
         struct swap_info_struct *si;
         pgoff_t offset;
 
-        spin_lock(&swap_lock);
         si = swap_info[type];
+        spin_lock(&si->lock);
         if (si && (si->flags & SWP_WRITEOK)) {
-                nr_swap_pages--;
+                atomic_long_dec(&nr_swap_pages);
                 /* This is called for allocating swap entry, not cache */
                 offset = scan_swap_map(si, 1);
                 if (offset) {
-                        spin_unlock(&swap_lock);
+                        spin_unlock(&si->lock);
                         return swp_entry(type, offset);
                 }
-                nr_swap_pages++;
+                atomic_long_inc(&nr_swap_pages);
         }
-        spin_unlock(&swap_lock);
+        spin_unlock(&si->lock);
         return (swp_entry_t) {0};
 }
 
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
                 goto bad_offset;
         if (!p->swap_map[offset])
                 goto bad_free;
-        spin_lock(&swap_lock);
+        spin_lock(&p->lock);
         return p;
 
 bad_free:
@@ -511,6 +541,27 @@ out:
         return NULL;
 }
 
+/*
+ * This swap type frees swap entry, check if it is the highest priority swap
+ * type which just frees swap entry. get_swap_page() uses
+ * highest_priority_index to search highest priority swap type. The
+ * swap_info_struct.lock can't protect us if there are multiple swap types
+ * active, so we use atomic_cmpxchg.
+ */
+static void set_highest_priority_index(int type)
+{
+        int old_hp_index, new_hp_index;
+
+        do {
+                old_hp_index = atomic_read(&highest_priority_index);
+                if (old_hp_index != -1 &&
+                        swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+                        break;
+                new_hp_index = type;
+        } while (atomic_cmpxchg(&highest_priority_index,
+                old_hp_index, new_hp_index) != old_hp_index);
+}
+
 static unsigned char swap_entry_free(struct swap_info_struct *p,
                                      swp_entry_t entry, unsigned char usage)
 {
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
                 p->lowest_bit = offset;
         if (offset > p->highest_bit)
                 p->highest_bit = offset;
-        if (swap_list.next >= 0 &&
-            p->prio > swap_info[swap_list.next]->prio)
-                swap_list.next = p->type;
-        nr_swap_pages++;
+        set_highest_priority_index(p->type);
+        atomic_long_inc(&nr_swap_pages);
         p->inuse_pages--;
         frontswap_invalidate_page(p->type, offset);
         if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
         p = swap_info_get(entry);
         if (p) {
                 swap_entry_free(p, entry, 1);
-                spin_unlock(&swap_lock);
+                spin_unlock(&p->lock);
         }
 }
 
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
                 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
                 if (page)
                         mem_cgroup_uncharge_swapcache(page, entry, count != 0);
-                spin_unlock(&swap_lock);
+                spin_unlock(&p->lock);
         }
 }
 
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
         p = swap_info_get(entry);
         if (p) {
                 count = swap_count(p->swap_map[swp_offset(entry)]);
-                spin_unlock(&swap_lock);
+                spin_unlock(&p->lock);
         }
         return count;
 }
@@ -706,7 +755,7 @@ int free_swap_and_cache(swp_entry_t entry)
                                 page = NULL;
                         }
                 }
-                spin_unlock(&swap_lock);
+                spin_unlock(&p->lock);
         }
         if (page) {
                 /*
@@ -804,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
         if ((unsigned int)type < nr_swapfiles) {
                 struct swap_info_struct *sis = swap_info[type];
 
+                spin_lock(&sis->lock);
                 if (sis->flags & SWP_WRITEOK) {
                         n = sis->pages;
                         if (free)
                                 n -= sis->inuse_pages;
                 }
+                spin_unlock(&sis->lock);
         }
         spin_unlock(&swap_lock);
         return n;
@@ -1457,7 +1508,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
         p->swap_map = swap_map;
         frontswap_map_set(p, frontswap_map);
         p->flags |= SWP_WRITEOK;
-        nr_swap_pages += p->pages;
+        atomic_long_add(p->pages, &nr_swap_pages);
         total_swap_pages += p->pages;
 
         /* insert swap space into swap_list: */
@@ -1479,15 +1530,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
                                 unsigned long *frontswap_map)
 {
         spin_lock(&swap_lock);
+        spin_lock(&p->lock);
         _enable_swap_info(p, prio, swap_map, frontswap_map);
         frontswap_init(p->type);
+        spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
 }
 
 static void reinsert_swap_info(struct swap_info_struct *p)
 {
         spin_lock(&swap_lock);
+        spin_lock(&p->lock);
         _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+        spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
 }
 
@@ -1547,14 +1602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 /* just pick something that's safe... */
                 swap_list.next = swap_list.head;
         }
+        spin_lock(&p->lock);
         if (p->prio < 0) {
                 for (i = p->next; i >= 0; i = swap_info[i]->next)
                         swap_info[i]->prio = p->prio--;
                 least_priority++;
         }
-        nr_swap_pages -= p->pages;
+        atomic_long_sub(p->pages, &nr_swap_pages);
         total_swap_pages -= p->pages;
         p->flags &= ~SWP_WRITEOK;
+        spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
 
         set_current_oom_origin();
@@ -1573,14 +1630,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
         mutex_lock(&swapon_mutex);
         spin_lock(&swap_lock);
+        spin_lock(&p->lock);
         drain_mmlist();
 
         /* wait for anyone still in scan_swap_map */
         p->highest_bit = 0;            /* cuts scans short */
         while (p->flags >= SWP_SCANNING) {
+                spin_unlock(&p->lock);
                 spin_unlock(&swap_lock);
                 schedule_timeout_uninterruptible(1);
                 spin_lock(&swap_lock);
+                spin_lock(&p->lock);
         }
 
         swap_file = p->swap_file;
@@ -1590,6 +1650,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->swap_map = NULL;
         p->flags = 0;
         frontswap_invalidate_area(type);
+        spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
         mutex_unlock(&swapon_mutex);
         vfree(swap_map);
@@ -1795,6 +1856,7 @@ static struct swap_info_struct *alloc_swap_info(void)
         p->flags = SWP_USED;
         p->next = -1;
         spin_unlock(&swap_lock);
+        spin_lock_init(&p->lock);
 
         return p;
 }
@@ -2117,7 +2179,7 @@ void si_swapinfo(struct sysinfo *val)
                 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
                         nr_to_be_unused += si->inuse_pages;
         }
-        val->freeswap = nr_swap_pages + nr_to_be_unused;
+        val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
         val->totalswap = total_swap_pages + nr_to_be_unused;
         spin_unlock(&swap_lock);
 }
@@ -2150,7 +2212,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
         p = swap_info[type];
         offset = swp_offset(entry);
 
-        spin_lock(&swap_lock);
+        spin_lock(&p->lock);
         if (unlikely(offset >= p->max))
                 goto unlock_out;
 
@@ -2185,7 +2247,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
                 p->swap_map[offset] = count | has_cache;
 
 unlock_out:
-        spin_unlock(&swap_lock);
+        spin_unlock(&p->lock);
 out:
         return err;
 
@@ -2310,7 +2372,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
         }
 
         if (!page) {
-                spin_unlock(&swap_lock);
+                spin_unlock(&si->lock);
                 return -ENOMEM;
         }
 
@@ -2358,7 +2420,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
                 list_add_tail(&page->lru, &head->lru);
                 page = NULL;                   /* now it's attached, don't free it */
 out:
-        spin_unlock(&swap_lock);
+        spin_unlock(&si->lock);
 outer:
         if (page)
                 __free_page(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a68fa20269d9..b7d8015a6d54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1684,7 +1684,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                 force_scan = true;
 
         /* If we have no swap space, do not bother scanning anon pages. */
-        if (!sc->may_swap || (nr_swap_pages <= 0)) {
+        if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
                 scan_balance = SCAN_FILE;
                 goto out;
         }
@@ -1933,7 +1933,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
          */
         pages_for_compaction = (2UL << sc->order);
         inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
-        if (nr_swap_pages > 0)
+        if (get_nr_swap_pages() > 0)
                 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
         if (sc->nr_reclaimed < pages_for_compaction &&
                         inactive_lru_pages > pages_for_compaction)
@@ -3085,7 +3085,7 @@ unsigned long global_reclaimable_pages(void)
         nr = global_page_state(NR_ACTIVE_FILE) +
              global_page_state(NR_INACTIVE_FILE);
 
-        if (nr_swap_pages > 0)
+        if (get_nr_swap_pages() > 0)
                 nr += global_page_state(NR_ACTIVE_ANON) +
                       global_page_state(NR_INACTIVE_ANON);
 
@@ -3099,7 +3099,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
         nr = zone_page_state(zone, NR_ACTIVE_FILE) +
              zone_page_state(zone, NR_INACTIVE_FILE);
 
-        if (nr_swap_pages > 0)
+        if (get_nr_swap_pages() > 0)
                 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
                       zone_page_state(zone, NR_INACTIVE_ANON);
 