summaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
authorAaron Lu <aaron.lu@intel.com>2017-09-06 19:24:57 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-06 20:27:30 -0400
commita2468cc9bfdff6139f59ca896671e5819ff5f94a (patch)
treebd1d326088483afddbbec76be18bc2558d74b6c4 /mm/swapfile.c
parentda99ecf117fce6570bd3989263d68ee0007e1249 (diff)
swap: choose swap device according to numa node
If the system has more than one swap device and swap device has the node information, we can make use of this information to decide which swap device to use in get_swap_pages() to get better performance. The current code uses a priority based list, swap_avail_list, to decide which swap device to use and if multiple swap devices share the same priority, they are used round robin. This patch changes the previous single global swap_avail_list into a per-numa-node list, i.e. for each numa node, it sees its own priority based list of available swap devices. Swap device's priority can be promoted on its matching node's swap_avail_list. The current swap device's priority is set as: user can set a >=0 value, or the system will pick one starting from -1 then downwards. The priority value in the swap_avail_list is the negated value of the swap device's priority, due to plist being sorted from low to high. The new policy doesn't change the semantics for priority >=0 cases, the previous starting from -1 then downwards now becomes starting from -2 then downwards and -1 is reserved as the promoted value. Take 4-node EX machine as an example, suppose 4 swap devices are available, each sit on a different node: swapA on node 0 swapB on node 1 swapC on node 2 swapD on node 3 After they are all swapped on in the sequence of ABCD. 
Current behaviour: their priorities will be: swapA: -1 swapB: -2 swapC: -3 swapD: -4 And their position in the global swap_avail_list will be: swapA -> swapB -> swapC -> swapD prio:1 prio:2 prio:3 prio:4 New behaviour: their priorities will be (note that -1 is skipped): swapA: -2 swapB: -3 swapC: -4 swapD: -5 And their positions in the 4 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ swapA -> swapB -> swapC -> swapD prio:1 prio:3 prio:4 prio:5 swap_avail_lists[1]: /* node 1's available swap device list */ swapB -> swapA -> swapC -> swapD prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ swapC -> swapA -> swapB -> swapD prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ swapD -> swapA -> swapB -> swapC prio:1 prio:2 prio:3 prio:4 To see the effect of the patch, a test that starts N processes, each mmap a region of anonymous memory and then continually write to it at random position to trigger both swap in and out is used. 
On a 2 node Skylake EP machine with 64GiB memory, two 170GB SSD drives are used as swap devices with each attached to a different node, the result is: runtime=30m/processes=32/total test size=128G/each process mmap region=4G kernel throughput vanilla 13306 auto-binding 15169 +14% runtime=30m/processes=64/total test size=128G/each process mmap region=2G kernel throughput vanilla 11885 auto-binding 14879 +25% [aaron.lu@intel.com: v2] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com [akpm@linux-foundation.org: use kmalloc_array()] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com Signed-off-by: Aaron Lu <aaron.lu@intel.com> Cc: "Chen, Tim C" <tim.c.chen@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c120
1 files changed, 94 insertions, 26 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8b3e08a547..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
60EXPORT_SYMBOL_GPL(nr_swap_pages); 60EXPORT_SYMBOL_GPL(nr_swap_pages);
61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
62long total_swap_pages; 62long total_swap_pages;
63static int least_priority; 63static int least_priority = -1;
64 64
65static const char Bad_file[] = "Bad swap file entry "; 65static const char Bad_file[] = "Bad swap file entry ";
66static const char Unused_file[] = "Unused swap file entry "; 66static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
85 * is held and the locking order requires swap_lock to be taken 85 * is held and the locking order requires swap_lock to be taken
86 * before any swap_info_struct->lock. 86 * before any swap_info_struct->lock.
87 */ 87 */
88static PLIST_HEAD(swap_avail_head); 88struct plist_head *swap_avail_heads;
89static DEFINE_SPINLOCK(swap_avail_lock); 89static DEFINE_SPINLOCK(swap_avail_lock);
90 90
91struct swap_info_struct *swap_info[MAX_SWAPFILES]; 91struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -592,6 +592,21 @@ new_cluster:
592 return found_free; 592 return found_free;
593} 593}
594 594
595static void __del_from_avail_list(struct swap_info_struct *p)
596{
597 int nid;
598
599 for_each_node(nid)
600 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
601}
602
603static void del_from_avail_list(struct swap_info_struct *p)
604{
605 spin_lock(&swap_avail_lock);
606 __del_from_avail_list(p);
607 spin_unlock(&swap_avail_lock);
608}
609
595static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 610static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
596 unsigned int nr_entries) 611 unsigned int nr_entries)
597{ 612{
@@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
605 if (si->inuse_pages == si->pages) { 620 if (si->inuse_pages == si->pages) {
606 si->lowest_bit = si->max; 621 si->lowest_bit = si->max;
607 si->highest_bit = 0; 622 si->highest_bit = 0;
608 spin_lock(&swap_avail_lock); 623 del_from_avail_list(si);
609 plist_del(&si->avail_list, &swap_avail_head);
610 spin_unlock(&swap_avail_lock);
611 } 624 }
612} 625}
613 626
627static void add_to_avail_list(struct swap_info_struct *p)
628{
629 int nid;
630
631 spin_lock(&swap_avail_lock);
632 for_each_node(nid) {
633 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
634 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
635 }
636 spin_unlock(&swap_avail_lock);
637}
638
614static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 639static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
615 unsigned int nr_entries) 640 unsigned int nr_entries)
616{ 641{
@@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
623 bool was_full = !si->highest_bit; 648 bool was_full = !si->highest_bit;
624 649
625 si->highest_bit = end; 650 si->highest_bit = end;
626 if (was_full && (si->flags & SWP_WRITEOK)) { 651 if (was_full && (si->flags & SWP_WRITEOK))
627 spin_lock(&swap_avail_lock); 652 add_to_avail_list(si);
628 WARN_ON(!plist_node_empty(&si->avail_list));
629 if (plist_node_empty(&si->avail_list))
630 plist_add(&si->avail_list, &swap_avail_head);
631 spin_unlock(&swap_avail_lock);
632 }
633 } 653 }
634 atomic_long_add(nr_entries, &nr_swap_pages); 654 atomic_long_add(nr_entries, &nr_swap_pages);
635 si->inuse_pages -= nr_entries; 655 si->inuse_pages -= nr_entries;
@@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
910 struct swap_info_struct *si, *next; 930 struct swap_info_struct *si, *next;
911 long avail_pgs; 931 long avail_pgs;
912 int n_ret = 0; 932 int n_ret = 0;
933 int node;
913 934
914 /* Only single cluster request supported */ 935 /* Only single cluster request supported */
915 WARN_ON_ONCE(n_goal > 1 && cluster); 936 WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
929 spin_lock(&swap_avail_lock); 950 spin_lock(&swap_avail_lock);
930 951
931start_over: 952start_over:
932 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 953 node = numa_node_id();
954 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
933 /* requeue si to after same-priority siblings */ 955 /* requeue si to after same-priority siblings */
934 plist_requeue(&si->avail_list, &swap_avail_head); 956 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
935 spin_unlock(&swap_avail_lock); 957 spin_unlock(&swap_avail_lock);
936 spin_lock(&si->lock); 958 spin_lock(&si->lock);
937 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 959 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
938 spin_lock(&swap_avail_lock); 960 spin_lock(&swap_avail_lock);
939 if (plist_node_empty(&si->avail_list)) { 961 if (plist_node_empty(&si->avail_lists[node])) {
940 spin_unlock(&si->lock); 962 spin_unlock(&si->lock);
941 goto nextsi; 963 goto nextsi;
942 } 964 }
@@ -946,7 +968,7 @@ start_over:
946 WARN(!(si->flags & SWP_WRITEOK), 968 WARN(!(si->flags & SWP_WRITEOK),
947 "swap_info %d in list but !SWP_WRITEOK\n", 969 "swap_info %d in list but !SWP_WRITEOK\n",
948 si->type); 970 si->type);
949 plist_del(&si->avail_list, &swap_avail_head); 971 __del_from_avail_list(si);
950 spin_unlock(&si->lock); 972 spin_unlock(&si->lock);
951 goto nextsi; 973 goto nextsi;
952 } 974 }
@@ -975,7 +997,7 @@ nextsi:
975 * swap_avail_head list then try it, otherwise start over 997 * swap_avail_head list then try it, otherwise start over
976 * if we have not gotten any slots. 998 * if we have not gotten any slots.
977 */ 999 */
978 if (plist_node_empty(&next->avail_list)) 1000 if (plist_node_empty(&next->avail_lists[node]))
979 goto start_over; 1001 goto start_over;
980 } 1002 }
981 1003
@@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2410 return generic_swapfile_activate(sis, swap_file, span); 2432 return generic_swapfile_activate(sis, swap_file, span);
2411} 2433}
2412 2434
2435static int swap_node(struct swap_info_struct *p)
2436{
2437 struct block_device *bdev;
2438
2439 if (p->bdev)
2440 bdev = p->bdev;
2441 else
2442 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2443
2444 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2445}
2446
2413static void _enable_swap_info(struct swap_info_struct *p, int prio, 2447static void _enable_swap_info(struct swap_info_struct *p, int prio,
2414 unsigned char *swap_map, 2448 unsigned char *swap_map,
2415 struct swap_cluster_info *cluster_info) 2449 struct swap_cluster_info *cluster_info)
2416{ 2450{
2451 int i;
2452
2417 if (prio >= 0) 2453 if (prio >= 0)
2418 p->prio = prio; 2454 p->prio = prio;
2419 else 2455 else
@@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2423 * low-to-high, while swap ordering is high-to-low 2459 * low-to-high, while swap ordering is high-to-low
2424 */ 2460 */
2425 p->list.prio = -p->prio; 2461 p->list.prio = -p->prio;
2426 p->avail_list.prio = -p->prio; 2462 for_each_node(i) {
2463 if (p->prio >= 0)
2464 p->avail_lists[i].prio = -p->prio;
2465 else {
2466 if (swap_node(p) == i)
2467 p->avail_lists[i].prio = 1;
2468 else
2469 p->avail_lists[i].prio = -p->prio;
2470 }
2471 }
2427 p->swap_map = swap_map; 2472 p->swap_map = swap_map;
2428 p->cluster_info = cluster_info; 2473 p->cluster_info = cluster_info;
2429 p->flags |= SWP_WRITEOK; 2474 p->flags |= SWP_WRITEOK;
@@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2442 * swap_info_struct. 2487 * swap_info_struct.
2443 */ 2488 */
2444 plist_add(&p->list, &swap_active_head); 2489 plist_add(&p->list, &swap_active_head);
2445 spin_lock(&swap_avail_lock); 2490 add_to_avail_list(p);
2446 plist_add(&p->avail_list, &swap_avail_head);
2447 spin_unlock(&swap_avail_lock);
2448} 2491}
2449 2492
2450static void enable_swap_info(struct swap_info_struct *p, int prio, 2493static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2529 spin_unlock(&swap_lock); 2572 spin_unlock(&swap_lock);
2530 goto out_dput; 2573 goto out_dput;
2531 } 2574 }
2532 spin_lock(&swap_avail_lock); 2575 del_from_avail_list(p);
2533 plist_del(&p->avail_list, &swap_avail_head);
2534 spin_unlock(&swap_avail_lock);
2535 spin_lock(&p->lock); 2576 spin_lock(&p->lock);
2536 if (p->prio < 0) { 2577 if (p->prio < 0) {
2537 struct swap_info_struct *si = p; 2578 struct swap_info_struct *si = p;
2579 int nid;
2538 2580
2539 plist_for_each_entry_continue(si, &swap_active_head, list) { 2581 plist_for_each_entry_continue(si, &swap_active_head, list) {
2540 si->prio++; 2582 si->prio++;
2541 si->list.prio--; 2583 si->list.prio--;
2542 si->avail_list.prio--; 2584 for_each_node(nid) {
2585 if (si->avail_lists[nid].prio != 1)
2586 si->avail_lists[nid].prio--;
2587 }
2543 } 2588 }
2544 least_priority++; 2589 least_priority++;
2545 } 2590 }
@@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
2783{ 2828{
2784 struct swap_info_struct *p; 2829 struct swap_info_struct *p;
2785 unsigned int type; 2830 unsigned int type;
2831 int i;
2786 2832
2787 p = kzalloc(sizeof(*p), GFP_KERNEL); 2833 p = kzalloc(sizeof(*p), GFP_KERNEL);
2788 if (!p) 2834 if (!p)
@@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
2818 } 2864 }
2819 INIT_LIST_HEAD(&p->first_swap_extent.list); 2865 INIT_LIST_HEAD(&p->first_swap_extent.list);
2820 plist_node_init(&p->list, 0); 2866 plist_node_init(&p->list, 0);
2821 plist_node_init(&p->avail_list, 0); 2867 for_each_node(i)
2868 plist_node_init(&p->avail_lists[i], 0);
2822 p->flags = SWP_USED; 2869 p->flags = SWP_USED;
2823 spin_unlock(&swap_lock); 2870 spin_unlock(&swap_lock);
2824 spin_lock_init(&p->lock); 2871 spin_lock_init(&p->lock);
@@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3060 if (!capable(CAP_SYS_ADMIN)) 3107 if (!capable(CAP_SYS_ADMIN))
3061 return -EPERM; 3108 return -EPERM;
3062 3109
3110 if (!swap_avail_heads)
3111 return -ENOMEM;
3112
3063 p = alloc_swap_info(); 3113 p = alloc_swap_info();
3064 if (IS_ERR(p)) 3114 if (IS_ERR(p))
3065 return PTR_ERR(p); 3115 return PTR_ERR(p);
@@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
3645 } 3695 }
3646 } 3696 }
3647} 3697}
3698
3699static int __init swapfile_init(void)
3700{
3701 int nid;
3702
3703 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3704 GFP_KERNEL);
3705 if (!swap_avail_heads) {
3706 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3707 return -ENOMEM;
3708 }
3709
3710 for_each_node(nid)
3711 plist_head_init(&swap_avail_heads[nid]);
3712
3713 return 0;
3714}
3715subsys_initcall(swapfile_init);