 Documentation/vm/swap_numa.txt |  69
 include/linux/swap.h           |   2
 mm/swapfile.c                  | 120
 3 files changed, 164 insertions(+), 27 deletions(-)
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt
new file mode 100644
index 000000000000..d5960c9124f5
--- /dev/null
+++ b/Documentation/vm/swap_numa.txt
@@ -0,0 +1,69 @@
+Automatically bind swap device to numa node
+-------------------------------------------
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information in get_swap_pages() to
+decide which swap device to use, for better performance.
+
+
+How to use this feature
+-----------------------
+
+Each swap device has a priority, which decides the order in which it is
+used. To make use of automatic binding, there is no need to manipulate the
+priority settings of the swap devices. E.g. on a 2 node machine, assume two
+swap devices, swapA attached to node 0 and swapB attached to node 1, are
+going to be swapped on. Simply swap them on by doing:
+# swapon /dev/swapA
+# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB,
+and node 1 will use them in the order of swapB then swapA. Note that the
+order in which they are swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going
+to be swapped on: swapA and swapB are attached to node 0, swapC is attached
+to node 1, swapD and swapE are attached to node 2 and swapF is attached to
+node 3. The way to swap them on is the same as above:
+# swapon /dev/swapA
+# swapon /dev/swapB
+# swapon /dev/swapC
+# swapon /dev/swapD
+# swapon /dev/swapE
+# swapon /dev/swapF
+
+Then node 0 will use them in the order of:
+swapA/swapB -> swapC -> swapD -> swapE -> swapF
+swapA and swapB will be used round robin before any other swap device.
+
+node 1 will use them in the order of:
+swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of:
+swapD/swapE -> swapA -> swapB -> swapC -> swapF
+Similarly, swapD and swapE will be used round robin before any other swap
+device.
+
+node 3 will use them in the order of:
+swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+----------------------
+
+The current code uses a priority-based list, swap_avail_list, to decide
+which swap device to use; if multiple swap devices share the same priority,
+they are used round robin. This change replaces the single global
+swap_avail_list with a per-numa-node list, i.e. each numa node sees its own
+priority-based list of available swap devices. A swap device's priority can
+be promoted on its matching node's swap_avail_list.
+
+A swap device's priority is set as follows: the user can set a value >= 0,
+or the system will pick one starting from -1 and counting downwards. The
+priority value in the swap_avail_list is the negated value of the swap
+device's priority, due to plist being sorted from low to high. The new
+policy doesn't change the semantics for priority >= 0; the previous
+"starting from -1 then downwards" now becomes "starting from -2 then
+downwards", and -1 is reserved as the promoted value. Thus swap devices
+attached to the same node are all promoted to priority -1 on that node's
+plist and will be used round robin before any other swap device.
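To make the promotion concrete, here is a minimal user-space model of the per-node plist priority calculation that _enable_swap_info() performs below (an illustrative sketch only, not kernel code; the device names and the 4 node / 6 device setup come from the example above, assuming auto-assigned device priorities -2 for swapA down to -7 for swapF):

#include <stdio.h>

#define NR_NODES 4
#define NR_DEVS  6

struct swap_dev {
	const char *name;
	int prio;	/* auto-assigned device priority: -2, -3, ... */
	int node;	/* node the device is attached to */
};

int main(void)
{
	/* swapped on in order swapA..swapF, no user-specified priority */
	struct swap_dev devs[NR_DEVS] = {
		{ "swapA", -2, 0 }, { "swapB", -3, 0 }, { "swapC", -4, 1 },
		{ "swapD", -5, 2 }, { "swapE", -6, 2 }, { "swapF", -7, 3 },
	};

	for (int node = 0; node < NR_NODES; node++) {
		printf("node %d plist prio:", node);
		for (int i = 0; i < NR_DEVS; i++) {
			/*
			 * Negated device priority, except that entries on
			 * the matching node are promoted to plist prio 1
			 * (i.e. device priority -1).
			 */
			int plist_prio = devs[i].node == node ?
					 1 : -devs[i].prio;
			printf("  %s=%d", devs[i].name, plist_prio);
		}
		printf("\n");
	}
	return 0;
}

Since plists are consumed low-to-high, node 0's row comes out as swapA=1 swapB=1 swapC=4 swapD=5 swapE=6 swapF=7: swapA and swapB tie first and are used round robin, then swapC through swapF follow, matching the order documented above.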
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 9c4ae6f14eea..8bf3487fb204 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -212,7 +212,7 @@ struct swap_info_struct {
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
-	struct plist_node avail_list;	/* entry in swap_avail_head */
+	struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
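The single avail_list plist_node becomes an array with one entry per node, so a device can sit at a different position in each node's list. The round robin behaviour among same-priority entries comes from plist_requeue(), which moves an entry behind its same-priority siblings each time it is picked. A small user-space model of that rotation (illustrative only; this is not the kernel plist API):

#include <stdio.h>

struct entry {
	const char *name;
	int prio;	/* plist priority, sorted low-to-high */
};

/* pick the head, then requeue it after entries of the same priority */
static void pick_and_requeue(struct entry *list, int n)
{
	struct entry head = list[0];
	int i = 1;

	printf("picked %s (prio %d)\n", head.name, head.prio);
	while (i < n && list[i].prio == head.prio) {
		list[i - 1] = list[i];	/* shift same-prio siblings up */
		i++;
	}
	list[i - 1] = head;		/* reinsert behind its siblings */
}

int main(void)
{
	/* two node-local devices promoted to prio 1, one remote at prio 4 */
	struct entry list[] = {
		{ "swapA", 1 }, { "swapB", 1 }, { "swapC", 4 },
	};

	for (int k = 0; k < 4; k++)
		pick_and_requeue(list, 3);	/* swapA, swapB, swapA, ... */
	return 0;
}

The two promoted entries alternate before the remote device at priority 4 is ever considered, which is exactly the swapA/swapB round robin described in the documentation.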
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8b3e08a547..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
 EXPORT_SYMBOL_GPL(nr_swap_pages);
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
 static DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -592,6 +592,21 @@ new_cluster:
 	return found_free;
 }
 
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	for_each_node(nid)
+		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+	spin_lock(&swap_avail_lock);
+	__del_from_avail_list(p);
+	spin_unlock(&swap_avail_lock);
+}
+
 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 			     unsigned int nr_entries)
 {
@@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 	if (si->inuse_pages == si->pages) {
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
-		spin_lock(&swap_avail_lock);
-		plist_del(&si->avail_list, &swap_avail_head);
-		spin_unlock(&swap_avail_lock);
+		del_from_avail_list(si);
 	}
 }
 
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	spin_lock(&swap_avail_lock);
+	for_each_node(nid) {
+		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+	}
+	spin_unlock(&swap_avail_lock);
+}
+
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 			    unsigned int nr_entries)
 {
@@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 		bool was_full = !si->highest_bit;
 
 		si->highest_bit = end;
-		if (was_full && (si->flags & SWP_WRITEOK)) {
-			spin_lock(&swap_avail_lock);
-			WARN_ON(!plist_node_empty(&si->avail_list));
-			if (plist_node_empty(&si->avail_list))
-				plist_add(&si->avail_list, &swap_avail_head);
-			spin_unlock(&swap_avail_lock);
-		}
+		if (was_full && (si->flags & SWP_WRITEOK))
+			add_to_avail_list(si);
 	}
 	atomic_long_add(nr_entries, &nr_swap_pages);
 	si->inuse_pages -= nr_entries;
@@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 	struct swap_info_struct *si, *next;
 	long avail_pgs;
 	int n_ret = 0;
+	int node;
 
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 	spin_lock(&swap_avail_lock);
 
 start_over:
-	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+	node = numa_node_id();
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
 		/* requeue si to after same-priority siblings */
-		plist_requeue(&si->avail_list, &swap_avail_head);
+		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		spin_lock(&si->lock);
 		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_lock(&swap_avail_lock);
-			if (plist_node_empty(&si->avail_list)) {
+			if (plist_node_empty(&si->avail_lists[node])) {
 				spin_unlock(&si->lock);
 				goto nextsi;
 			}
@@ -946,7 +968,7 @@ start_over:
 			WARN(!(si->flags & SWP_WRITEOK),
 			     "swap_info %d in list but !SWP_WRITEOK\n",
 			     si->type);
-			plist_del(&si->avail_list, &swap_avail_head);
+			__del_from_avail_list(si);
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
@@ -975,7 +997,7 @@ nextsi:
 		 * swap_avail_head list then try it, otherwise start over
 		 * if we have not gotten any slots.
 		 */
-		if (plist_node_empty(&next->avail_list))
+		if (plist_node_empty(&next->avail_lists[node]))
 			goto start_over;
 	}
 
@@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	return generic_swapfile_activate(sis, swap_file, span);
 }
 
+static int swap_node(struct swap_info_struct *p)
+{
+	struct block_device *bdev;
+
+	if (p->bdev)
+		bdev = p->bdev;
+	else
+		bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
 static void _enable_swap_info(struct swap_info_struct *p, int prio,
 				unsigned char *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
+	int i;
+
 	if (prio >= 0)
 		p->prio = prio;
 	else
@@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	 * low-to-high, while swap ordering is high-to-low
 	 */
 	p->list.prio = -p->prio;
-	p->avail_list.prio = -p->prio;
+	for_each_node(i) {
+		if (p->prio >= 0)
+			p->avail_lists[i].prio = -p->prio;
+		else {
+			if (swap_node(p) == i)
+				p->avail_lists[i].prio = 1;
+			else
+				p->avail_lists[i].prio = -p->prio;
+		}
+	}
 	p->swap_map = swap_map;
 	p->cluster_info = cluster_info;
 	p->flags |= SWP_WRITEOK;
@@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	 * swap_info_struct.
 	 */
 	plist_add(&p->list, &swap_active_head);
-	spin_lock(&swap_avail_lock);
-	plist_add(&p->avail_list, &swap_avail_head);
-	spin_unlock(&swap_avail_lock);
+	add_to_avail_list(p);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
-	spin_lock(&swap_avail_lock);
-	plist_del(&p->avail_list, &swap_avail_head);
-	spin_unlock(&swap_avail_lock);
+	del_from_avail_list(p);
 	spin_lock(&p->lock);
 	if (p->prio < 0) {
 		struct swap_info_struct *si = p;
+		int nid;
 
 		plist_for_each_entry_continue(si, &swap_active_head, list) {
 			si->prio++;
 			si->list.prio--;
-			si->avail_list.prio--;
+			for_each_node(nid) {
+				if (si->avail_lists[nid].prio != 1)
+					si->avail_lists[nid].prio--;
+			}
 		}
 		least_priority++;
 	}
@@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 {
 	struct swap_info_struct *p;
 	unsigned int type;
+	int i;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
@@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	INIT_LIST_HEAD(&p->first_swap_extent.list);
 	plist_node_init(&p->list, 0);
-	plist_node_init(&p->avail_list, 0);
+	for_each_node(i)
+		plist_node_init(&p->avail_lists[i], 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
@@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!swap_avail_heads)
+		return -ENOMEM;
+
 	p = alloc_swap_info();
 	if (IS_ERR(p))
 		return PTR_ERR(p);
@@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 		}
 	}
 }
+
+static int __init swapfile_init(void)
+{
+	int nid;
+
+	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+					 GFP_KERNEL);
+	if (!swap_avail_heads) {
+		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+		return -ENOMEM;
+	}
+
+	for_each_node(nid)
+		plist_head_init(&swap_avail_heads[nid]);
+
+	return 0;
+}
+subsys_initcall(swapfile_init);
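A side effect worth noting: with least_priority now initialised to -1, auto-assigned priorities begin at -2 rather than -1. For the two-device example from the documentation, the Priority column of /proc/swaps would then read something like (sizes illustrative):

Filename	Type		Size	Used	Priority
/dev/swapA	partition	8388604	0	-2
/dev/swapB	partition	8388604	0	-3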