path: root/mm/swapfile.c
author	Huang, Ying <ying.huang@intel.com>	2017-02-22 18:45:22 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-22 19:41:30 -0500
commit	235b62176712b970c815923e36b9a9cc05d4d901 (patch)
tree	5e64033c7a4f2e47e8d66a16a993f6aa37b6e63b /mm/swapfile.c
parent	6a991fc72d1243b8da0c644d3147d3ec41a0b281 (diff)
mm/swap: add cluster lock
This patch reduces lock contention on swap_info_struct->lock by using a more fine-grained lock in swap_cluster_info for some swap operations. swap_info_struct->lock is heavily contended when multiple processes reclaim pages simultaneously, because there is only one lock per swap device and, in a common configuration, only one or a few swap devices in the system. The lock protects almost all swap-related operations.

In fact, many swap operations only access one element of the swap_info_struct->swap_map array, and there is no dependency between different elements of swap_info_struct->swap_map. So a fine-grained lock can be used to allow parallel access to different elements of swap_info_struct->swap_map.

In this patch, a spinlock is added to swap_cluster_info to protect the elements of swap_info_struct->swap_map within the swap cluster and the fields of swap_cluster_info. This greatly reduces lock contention for swap_info_struct->swap_map access.

Because of the added spinlock, the size of swap_cluster_info increases from 4 bytes to 8 bytes on both 64-bit and 32-bit systems. This uses an additional 4K of RAM for every 1G of swap space.

Because swap_cluster_info is much smaller than a cache line (8 vs 64 bytes on the x86_64 architecture), there may be false cache line sharing between the spinlocks in swap_cluster_info. To avoid false sharing in the first round of swap cluster allocation, the order of the swap clusters in the free clusters list is changed so that swap_cluster_info structures sharing the same cache line are placed as far apart as possible. After the first round of allocation, the order of the clusters in the free clusters list is expected to be random, so the false sharing should not be serious.

Compared with a previous implementation using bit_spin_lock, sequential swap-out throughput improved by about 3.2%. The test was done on a Xeon E5 v3 system. The swap device used was a RAM-simulated PMEM (persistent memory) device. To test sequential swapping out, the test case created 32 processes, which sequentially allocate and write to anonymous pages until the RAM and part of the swap device are used up.

[ying.huang@intel.com: v5]
  Link: http://lkml.kernel.org/r/878tqeuuic.fsf_-_@yhuang-dev.intel.com
[minchan@kernel.org: initialize spinlock for swap_cluster_info]
  Link: http://lkml.kernel.org/r/1486434945-29753-1-git-send-email-minchan@kernel.org
[hughd@google.com: annotate nested locking for cluster lock]
  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702161050540.21773@eggly.anvils
Link: http://lkml.kernel.org/r/dbb860bbd825b1aaba18988015e8963f263c3f0d.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
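
To make the locking scheme in the diff below easier to follow, here is a minimal userspace sketch of the idea: keep one spinlock per cluster of map entries and take only the lock that covers the offset being touched, instead of one device-wide lock. This is an illustrative analogue only, not kernel code: ENTRIES_PER_CLUSTER, struct cluster, struct swap_device and free_slot() are made-up stand-ins for SWAPFILE_CLUSTER, struct swap_cluster_info, struct swap_info_struct and the swap_map update paths, and pthread spinlocks stand in for kernel spinlocks.

/* Illustrative userspace analogue of per-cluster locking; not kernel code. */
#include <pthread.h>

#define ENTRIES_PER_CLUSTER 256		/* stand-in for SWAPFILE_CLUSTER */

struct cluster {
	pthread_spinlock_t lock;	/* protects the map entries of this cluster */
	unsigned int count;		/* allocated entries in this cluster */
};

struct swap_device {
	unsigned char *map;		/* one usage byte per swap slot */
	struct cluster *clusters;	/* one lock per ENTRIES_PER_CLUSTER slots */
	unsigned long nr_slots;
};

/* Analogue of lock_cluster(): take only the lock covering @offset. */
static struct cluster *lock_cluster(struct swap_device *dev, unsigned long offset)
{
	struct cluster *ci = &dev->clusters[offset / ENTRIES_PER_CLUSTER];

	pthread_spin_lock(&ci->lock);
	return ci;
}

static void unlock_cluster(struct cluster *ci)
{
	pthread_spin_unlock(&ci->lock);
}

/* Threads freeing slots that live in different clusters no longer contend. */
static void free_slot(struct swap_device *dev, unsigned long offset)
{
	struct cluster *ci = lock_cluster(dev, offset);

	dev->map[offset] = 0;
	ci->count--;
	unlock_cluster(ci);
}

Because each per-cluster lock is much smaller than a cache line, adjacent locks can share a line; the patch mitigates this for the first allocation round by interleaving the initial free-cluster order (see SWAP_CLUSTER_COLS in the diff).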
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	215
1 file changed, 173 insertions, 42 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2001ce427a1d..eb71b5d9430b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -257,6 +257,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 	info->data = 0;
 }
 
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+						     unsigned long offset)
+{
+	struct swap_cluster_info *ci;
+
+	ci = si->cluster_info;
+	if (ci) {
+		ci += offset / SWAPFILE_CLUSTER;
+		spin_lock(&ci->lock);
+	}
+	return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+	if (ci)
+		spin_unlock(&ci->lock);
+}
+
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+	struct swap_info_struct *si,
+	unsigned long offset)
+{
+	struct swap_cluster_info *ci;
+
+	ci = lock_cluster(si, offset);
+	if (!ci)
+		spin_lock(&si->lock);
+
+	return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+					       struct swap_cluster_info *ci)
+{
+	if (ci)
+		unlock_cluster(ci);
+	else
+		spin_unlock(&si->lock);
+}
+
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
 {
 	return cluster_is_null(&list->head);
@@ -281,9 +322,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
 		cluster_set_next_flag(&list->head, idx, 0);
 		cluster_set_next_flag(&list->tail, idx, 0);
 	} else {
+		struct swap_cluster_info *ci_tail;
 		unsigned int tail = cluster_next(&list->tail);
 
-		cluster_set_next(&ci[tail], idx);
+		/*
+		 * Nested cluster lock, but both cluster locks are
+		 * only acquired when we held swap_info_struct->lock
+		 */
+		ci_tail = ci + tail;
+		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+		cluster_set_next(ci_tail, idx);
+		unlock_cluster(ci_tail);
 		cluster_set_next_flag(&list->tail, idx, 0);
 	}
 }
@@ -328,7 +377,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 */
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
-	struct swap_cluster_info *info;
+	struct swap_cluster_info *info, *ci;
 	unsigned int idx;
 
 	info = si->cluster_info;
@@ -341,10 +390,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 				SWAPFILE_CLUSTER);
 
 		spin_lock(&si->lock);
-		cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+		cluster_set_flag(ci, CLUSTER_FLAG_FREE);
+		unlock_cluster(ci);
 		cluster_list_add_tail(&si->free_clusters, info, idx);
+		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
+		unlock_cluster(ci);
 	}
 }
 
@@ -447,8 +500,9 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	unsigned long *offset, unsigned long *scan_base)
 {
 	struct percpu_cluster *cluster;
+	struct swap_cluster_info *ci;
 	bool found_free;
-	unsigned long tmp;
+	unsigned long tmp, max;
 
 new_cluster:
 	cluster = this_cpu_ptr(si->percpu_cluster);
@@ -476,14 +530,21 @@ new_cluster:
 	 * check if there is still free entry in the cluster
 	 */
 	tmp = cluster->next;
-	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
-	       SWAPFILE_CLUSTER) {
+	max = min_t(unsigned long, si->max,
+		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+	if (tmp >= max) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	ci = lock_cluster(si, tmp);
+	while (tmp < max) {
 		if (!si->swap_map[tmp]) {
 			found_free = true;
 			break;
 		}
 		tmp++;
 	}
+	unlock_cluster(ci);
 	if (!found_free) {
 		cluster_set_null(&cluster->index);
 		goto new_cluster;
@@ -496,6 +557,7 @@ new_cluster:
 static unsigned long scan_swap_map(struct swap_info_struct *si,
 				   unsigned char usage)
 {
+	struct swap_cluster_info *ci;
 	unsigned long offset;
 	unsigned long scan_base;
 	unsigned long last_in_cluster = 0;
@@ -572,9 +634,11 @@ checks:
 	if (offset > si->highest_bit)
 		scan_base = offset = si->lowest_bit;
 
+	ci = lock_cluster(si, offset);
 	/* reuse swap entry of cache-only swap if not busy. */
 	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
+		unlock_cluster(ci);
 		spin_unlock(&si->lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
 		spin_lock(&si->lock);
@@ -584,8 +648,10 @@ checks:
 		goto scan; /* check next one */
 	}
 
-	if (si->swap_map[offset])
+	if (si->swap_map[offset]) {
+		unlock_cluster(ci);
 		goto scan;
+	}
 
 	if (offset == si->lowest_bit)
 		si->lowest_bit++;
@@ -601,6 +667,7 @@ checks:
 	}
 	si->swap_map[offset] = usage;
 	inc_cluster_info_page(si, si->cluster_info, offset);
+	unlock_cluster(ci);
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
@@ -731,7 +798,7 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 	unsigned long offset, type;
@@ -749,7 +816,6 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 		goto bad_offset;
 	if (!p->swap_map[offset])
 		goto bad_free;
-	spin_lock(&p->lock);
 	return p;
 
 bad_free:
@@ -767,14 +833,45 @@ out:
 	return NULL;
 }
 
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+
+	p = _swap_info_get(entry);
+	if (p)
+		spin_lock(&p->lock);
+	return p;
+}
+
 static unsigned char swap_entry_free(struct swap_info_struct *p,
-				     swp_entry_t entry, unsigned char usage)
+				     swp_entry_t entry, unsigned char usage,
+				     bool swap_info_locked)
 {
+	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
 	unsigned char count;
 	unsigned char has_cache;
+	bool lock_swap_info = false;
+
+	if (!swap_info_locked) {
+		count = p->swap_map[offset];
+		if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) {
+lock_swap_info:
+			swap_info_locked = true;
+			lock_swap_info = true;
+			spin_lock(&p->lock);
+		}
+	}
+
+	ci = lock_cluster(p, offset);
 
 	count = p->swap_map[offset];
+
+	if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) {
+		unlock_cluster(ci);
+		goto lock_swap_info;
+	}
+
 	has_cache = count & SWAP_HAS_CACHE;
 	count &= ~SWAP_HAS_CACHE;
 
@@ -800,10 +897,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	usage = count | has_cache;
 	p->swap_map[offset] = usage;
 
+	unlock_cluster(ci);
+
 	/* free if no reference */
 	if (!usage) {
+		VM_BUG_ON(!swap_info_locked);
 		mem_cgroup_uncharge_swap(entry);
+		ci = lock_cluster(p, offset);
 		dec_cluster_info_page(p, p->cluster_info, offset);
+		unlock_cluster(ci);
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit) {
@@ -829,6 +931,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 		}
 	}
 
+	if (lock_swap_info)
+		spin_unlock(&p->lock);
+
 	return usage;
 }
 
@@ -840,11 +945,9 @@ void swap_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 
-	p = swap_info_get(entry);
-	if (p) {
-		swap_entry_free(p, entry, 1);
-		spin_unlock(&p->lock);
-	}
+	p = _swap_info_get(entry);
+	if (p)
+		swap_entry_free(p, entry, 1, false);
 }
 
 /*
@@ -854,11 +957,9 @@ void swapcache_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 
-	p = swap_info_get(entry);
-	if (p) {
-		swap_entry_free(p, entry, SWAP_HAS_CACHE);
-		spin_unlock(&p->lock);
-	}
+	p = _swap_info_get(entry);
+	if (p)
+		swap_entry_free(p, entry, SWAP_HAS_CACHE, false);
 }
 
 /*
@@ -870,13 +971,17 @@ int page_swapcount(struct page *page)
 {
 	int count = 0;
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	swp_entry_t entry;
+	unsigned long offset;
 
 	entry.val = page_private(page);
-	p = swap_info_get(entry);
+	p = _swap_info_get(entry);
 	if (p) {
-		count = swap_count(p->swap_map[swp_offset(entry)]);
-		spin_unlock(&p->lock);
+		offset = swp_offset(entry);
+		ci = lock_cluster_or_swap_info(p, offset);
+		count = swap_count(p->swap_map[offset]);
+		unlock_cluster_or_swap_info(p, ci);
 	}
 	return count;
 }
@@ -889,22 +994,26 @@ int swp_swapcount(swp_entry_t entry)
 {
 	int count, tmp_count, n;
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	struct page *page;
 	pgoff_t offset;
 	unsigned char *map;
 
-	p = swap_info_get(entry);
+	p = _swap_info_get(entry);
 	if (!p)
 		return 0;
 
-	count = swap_count(p->swap_map[swp_offset(entry)]);
+	offset = swp_offset(entry);
+
+	ci = lock_cluster_or_swap_info(p, offset);
+
+	count = swap_count(p->swap_map[offset]);
 	if (!(count & COUNT_CONTINUED))
 		goto out;
 
 	count &= ~COUNT_CONTINUED;
 	n = SWAP_MAP_MAX + 1;
 
-	offset = swp_offset(entry);
 	page = vmalloc_to_page(p->swap_map + offset);
 	offset &= ~PAGE_MASK;
 	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -919,7 +1028,7 @@ int swp_swapcount(swp_entry_t entry)
 		n *= (SWAP_CONT_MAX + 1);
 	} while (tmp_count & COUNT_CONTINUED);
 out:
-	spin_unlock(&p->lock);
+	unlock_cluster_or_swap_info(p, ci);
 	return count;
 }
 
@@ -1017,7 +1126,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+		if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) {
 			page = find_get_page(swap_address_space(entry),
 					     swp_offset(entry));
 			if (page && !trylock_page(page)) {
@@ -2298,6 +2407,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	return maxpages;
 }
 
+#define SWAP_CLUSTER_COLS						\
+	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					union swap_header *swap_header,
 					unsigned char *swap_map,
@@ -2305,11 +2417,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					unsigned long maxpages,
 					sector_t *span)
 {
-	int i;
+	unsigned int j, k;
 	unsigned int nr_good_pages;
 	int nr_extents;
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-	unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+	unsigned long i, idx;
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
@@ -2357,15 +2470,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	if (!cluster_info)
 		return nr_extents;
 
-	for (i = 0; i < nr_clusters; i++) {
-		if (!cluster_count(&cluster_info[idx])) {
+
+	/* Reduce false cache line sharing between cluster_info */
+	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+		j = (k + col) % SWAP_CLUSTER_COLS;
+		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+			idx = i * SWAP_CLUSTER_COLS + j;
+			if (idx >= nr_clusters)
+				continue;
+			if (cluster_count(&cluster_info[idx]))
+				continue;
 			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
 			cluster_list_add_tail(&p->free_clusters, cluster_info,
 					      idx);
 		}
-		idx++;
-		if (idx == nr_clusters)
-			idx = 0;
 	}
 	return nr_extents;
 }
@@ -2468,6 +2586,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
 		int cpu;
+		unsigned long ci, nr_cluster;
 
 		p->flags |= SWP_SOLIDSTATE;
 		/*
@@ -2475,13 +2594,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		 * SSD
 		 */
 		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
-		cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
-			SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+		cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
 		if (!cluster_info) {
 			error = -ENOMEM;
 			goto bad_swap;
 		}
+
+		for (ci = 0; ci < nr_cluster; ci++)
+			spin_lock_init(&((cluster_info + ci)->lock));
+
 		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
 		if (!p->percpu_cluster) {
 			error = -ENOMEM;
@@ -2627,6 +2750,7 @@ void si_swapinfo(struct sysinfo *val)
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	unsigned long offset, type;
 	unsigned char count;
 	unsigned char has_cache;
@@ -2640,10 +2764,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 		goto bad_file;
 	p = swap_info[type];
 	offset = swp_offset(entry);
-
-	spin_lock(&p->lock);
 	if (unlikely(offset >= p->max))
-		goto unlock_out;
+		goto out;
+
+	ci = lock_cluster_or_swap_info(p, offset);
 
 	count = p->swap_map[offset];
 
@@ -2686,7 +2810,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 		p->swap_map[offset] = count | has_cache;
 
 unlock_out:
-	spin_unlock(&p->lock);
+	unlock_cluster_or_swap_info(p, ci);
 out:
 	return err;
 
@@ -2775,6 +2899,7 @@ EXPORT_SYMBOL_GPL(__page_file_index);
 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 {
 	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
 	struct page *head;
 	struct page *page;
 	struct page *list_page;
@@ -2798,6 +2923,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	offset = swp_offset(entry);
+
+	ci = lock_cluster(si, offset);
+
 	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
 
 	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
@@ -2810,6 +2938,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
+		unlock_cluster(ci);
 		spin_unlock(&si->lock);
 		return -ENOMEM;
 	}
@@ -2858,6 +2987,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
 out:
+	unlock_cluster(ci);
 	spin_unlock(&si->lock);
 outer:
 	if (page)
@@ -2871,7 +3001,8 @@ outer:
  * into, carry if so, or else fail until a new continuation page is allocated;
  * when the original swap_map count is decremented from 0 with continuation,
  * borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
  */
 static bool swap_count_continued(struct swap_info_struct *si,
 				 pgoff_t offset, unsigned char count)