diff options
-rw-r--r-- | Documentation/vm/swap_numa.txt | 69 | ||||
-rw-r--r-- | include/linux/swap.h | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 120 |
3 files changed, 164 insertions, 27 deletions
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt new file mode 100644 index 000000000000..d5960c9124f5 --- /dev/null +++ b/Documentation/vm/swap_numa.txt | |||
@@ -0,0 +1,69 @@ | |||
1 | Automatically bind swap device to numa node | ||
2 | ------------------------------------------- | ||
3 | |||
4 | If the system has more than one swap device and swap device has the node | ||
5 | information, we can make use of this information to decide which swap | ||
6 | device to use in get_swap_pages() to get better performance. | ||
7 | |||
8 | |||
9 | How to use this feature | ||
10 | ----------------------- | ||
11 | |||
12 | Swap device has priority and that decides the order of it to be used. To make | ||
13 | use of automatically binding, there is no need to manipulate priority settings | ||
14 | for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and | ||
15 | swapB, with swapA attached to node 0 and swapB attached to node 1, are going | ||
16 | to be swapped on. Simply swapping them on by doing: | ||
17 | # swapon /dev/swapA | ||
18 | # swapon /dev/swapB | ||
19 | |||
20 | Then node 0 will use the two swap devices in the order of swapA then swapB and | ||
21 | node 1 will use the two swap devices in the order of swapB then swapA. Note | ||
22 | that the order of them being swapped on doesn't matter. | ||
23 | |||
24 | A more complex example on a 4 node machine. Assume 6 swap devices are going to | ||
25 | be swapped on: swapA and swapB are attached to node 0, swapC is attached to | ||
26 | node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3. | ||
27 | The way to swap them on is the same as above: | ||
28 | # swapon /dev/swapA | ||
29 | # swapon /dev/swapB | ||
30 | # swapon /dev/swapC | ||
31 | # swapon /dev/swapD | ||
32 | # swapon /dev/swapE | ||
33 | # swapon /dev/swapF | ||
34 | |||
35 | Then node 0 will use them in the order of: | ||
36 | swapA/swapB -> swapC -> swapD -> swapE -> swapF | ||
37 | swapA and swapB will be used in a round robin mode before any other swap device. | ||
38 | |||
39 | node 1 will use them in the order of: | ||
40 | swapC -> swapA -> swapB -> swapD -> swapE -> swapF | ||
41 | |||
42 | node 2 will use them in the order of: | ||
43 | swapD/swapE -> swapA -> swapB -> swapC -> swapF | ||
44 | Similarly, swapD and swapE will be used in a round robin mode before any | ||
45 | other swap devices. | ||
46 | |||
47 | node 3 will use them in the order of: | ||
48 | swapF -> swapA -> swapB -> swapC -> swapD -> swapE | ||
49 | |||
50 | |||
51 | Implementation details | ||
52 | ---------------------- | ||
53 | |||
54 | The current code uses a priority based list, swap_avail_list, to decide | ||
55 | which swap device to use and if multiple swap devices share the same | ||
56 | priority, they are used round robin. This change here replaces the single | ||
57 | global swap_avail_list with a per-numa-node list, i.e. for each numa node, | ||
58 | it sees its own priority based list of available swap devices. Swap | ||
59 | device's priority can be promoted on its matching node's swap_avail_list. | ||
60 | |||
61 | The current swap device's priority is set as: user can set a >=0 value, | ||
62 | or the system will pick one starting from -1 then downwards. The priority | ||
63 | value in the swap_avail_list is the negated value of the swap device's | ||
64 | due to plist being sorted from low to high. The new policy doesn't change | ||
65 | the semantics for priority >=0 cases, the previous starting from -1 then | ||
66 | downwards now becomes starting from -2 then downwards and -1 is reserved | ||
67 | as the promoted value. So if multiple swap devices are attached to the same | ||
68 | node, they will all be promoted to priority -1 on that node's plist and will | ||
69 | be used round robin before any other swap devices. | ||
diff --git a/include/linux/swap.h b/include/linux/swap.h index 9c4ae6f14eea..8bf3487fb204 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -212,7 +212,7 @@ struct swap_info_struct { | |||
212 | unsigned long flags; /* SWP_USED etc: see above */ | 212 | unsigned long flags; /* SWP_USED etc: see above */ |
213 | signed short prio; /* swap priority of this type */ | 213 | signed short prio; /* swap priority of this type */ |
214 | struct plist_node list; /* entry in swap_active_head */ | 214 | struct plist_node list; /* entry in swap_active_head */ |
215 | struct plist_node avail_list; /* entry in swap_avail_head */ | 215 | struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ |
216 | signed char type; /* strange name for an index */ | 216 | signed char type; /* strange name for an index */ |
217 | unsigned int max; /* extent of the swap_map */ | 217 | unsigned int max; /* extent of the swap_map */ |
218 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | 218 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f8b3e08a547..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; | |||
60 | EXPORT_SYMBOL_GPL(nr_swap_pages); | 60 | EXPORT_SYMBOL_GPL(nr_swap_pages); |
61 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 61 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
62 | long total_swap_pages; | 62 | long total_swap_pages; |
63 | static int least_priority; | 63 | static int least_priority = -1; |
64 | 64 | ||
65 | static const char Bad_file[] = "Bad swap file entry "; | 65 | static const char Bad_file[] = "Bad swap file entry "; |
66 | static const char Unused_file[] = "Unused swap file entry "; | 66 | static const char Unused_file[] = "Unused swap file entry "; |
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); | |||
85 | * is held and the locking order requires swap_lock to be taken | 85 | * is held and the locking order requires swap_lock to be taken |
86 | * before any swap_info_struct->lock. | 86 | * before any swap_info_struct->lock. |
87 | */ | 87 | */ |
88 | static PLIST_HEAD(swap_avail_head); | 88 | struct plist_head *swap_avail_heads; |
89 | static DEFINE_SPINLOCK(swap_avail_lock); | 89 | static DEFINE_SPINLOCK(swap_avail_lock); |
90 | 90 | ||
91 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 91 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
@@ -592,6 +592,21 @@ new_cluster: | |||
592 | return found_free; | 592 | return found_free; |
593 | } | 593 | } |
594 | 594 | ||
595 | static void __del_from_avail_list(struct swap_info_struct *p) | ||
596 | { | ||
597 | int nid; | ||
598 | |||
599 | for_each_node(nid) | ||
600 | plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); | ||
601 | } | ||
602 | |||
603 | static void del_from_avail_list(struct swap_info_struct *p) | ||
604 | { | ||
605 | spin_lock(&swap_avail_lock); | ||
606 | __del_from_avail_list(p); | ||
607 | spin_unlock(&swap_avail_lock); | ||
608 | } | ||
609 | |||
595 | static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, | 610 | static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, |
596 | unsigned int nr_entries) | 611 | unsigned int nr_entries) |
597 | { | 612 | { |
@@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, | |||
605 | if (si->inuse_pages == si->pages) { | 620 | if (si->inuse_pages == si->pages) { |
606 | si->lowest_bit = si->max; | 621 | si->lowest_bit = si->max; |
607 | si->highest_bit = 0; | 622 | si->highest_bit = 0; |
608 | spin_lock(&swap_avail_lock); | 623 | del_from_avail_list(si); |
609 | plist_del(&si->avail_list, &swap_avail_head); | ||
610 | spin_unlock(&swap_avail_lock); | ||
611 | } | 624 | } |
612 | } | 625 | } |
613 | 626 | ||
627 | static void add_to_avail_list(struct swap_info_struct *p) | ||
628 | { | ||
629 | int nid; | ||
630 | |||
631 | spin_lock(&swap_avail_lock); | ||
632 | for_each_node(nid) { | ||
633 | WARN_ON(!plist_node_empty(&p->avail_lists[nid])); | ||
634 | plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); | ||
635 | } | ||
636 | spin_unlock(&swap_avail_lock); | ||
637 | } | ||
638 | |||
614 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, | 639 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, |
615 | unsigned int nr_entries) | 640 | unsigned int nr_entries) |
616 | { | 641 | { |
@@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, | |||
623 | bool was_full = !si->highest_bit; | 648 | bool was_full = !si->highest_bit; |
624 | 649 | ||
625 | si->highest_bit = end; | 650 | si->highest_bit = end; |
626 | if (was_full && (si->flags & SWP_WRITEOK)) { | 651 | if (was_full && (si->flags & SWP_WRITEOK)) |
627 | spin_lock(&swap_avail_lock); | 652 | add_to_avail_list(si); |
628 | WARN_ON(!plist_node_empty(&si->avail_list)); | ||
629 | if (plist_node_empty(&si->avail_list)) | ||
630 | plist_add(&si->avail_list, &swap_avail_head); | ||
631 | spin_unlock(&swap_avail_lock); | ||
632 | } | ||
633 | } | 653 | } |
634 | atomic_long_add(nr_entries, &nr_swap_pages); | 654 | atomic_long_add(nr_entries, &nr_swap_pages); |
635 | si->inuse_pages -= nr_entries; | 655 | si->inuse_pages -= nr_entries; |
@@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) | |||
910 | struct swap_info_struct *si, *next; | 930 | struct swap_info_struct *si, *next; |
911 | long avail_pgs; | 931 | long avail_pgs; |
912 | int n_ret = 0; | 932 | int n_ret = 0; |
933 | int node; | ||
913 | 934 | ||
914 | /* Only single cluster request supported */ | 935 | /* Only single cluster request supported */ |
915 | WARN_ON_ONCE(n_goal > 1 && cluster); | 936 | WARN_ON_ONCE(n_goal > 1 && cluster); |
@@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) | |||
929 | spin_lock(&swap_avail_lock); | 950 | spin_lock(&swap_avail_lock); |
930 | 951 | ||
931 | start_over: | 952 | start_over: |
932 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | 953 | node = numa_node_id(); |
954 | plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { | ||
933 | /* requeue si to after same-priority siblings */ | 955 | /* requeue si to after same-priority siblings */ |
934 | plist_requeue(&si->avail_list, &swap_avail_head); | 956 | plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); |
935 | spin_unlock(&swap_avail_lock); | 957 | spin_unlock(&swap_avail_lock); |
936 | spin_lock(&si->lock); | 958 | spin_lock(&si->lock); |
937 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { | 959 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
938 | spin_lock(&swap_avail_lock); | 960 | spin_lock(&swap_avail_lock); |
939 | if (plist_node_empty(&si->avail_list)) { | 961 | if (plist_node_empty(&si->avail_lists[node])) { |
940 | spin_unlock(&si->lock); | 962 | spin_unlock(&si->lock); |
941 | goto nextsi; | 963 | goto nextsi; |
942 | } | 964 | } |
@@ -946,7 +968,7 @@ start_over: | |||
946 | WARN(!(si->flags & SWP_WRITEOK), | 968 | WARN(!(si->flags & SWP_WRITEOK), |
947 | "swap_info %d in list but !SWP_WRITEOK\n", | 969 | "swap_info %d in list but !SWP_WRITEOK\n", |
948 | si->type); | 970 | si->type); |
949 | plist_del(&si->avail_list, &swap_avail_head); | 971 | __del_from_avail_list(si); |
950 | spin_unlock(&si->lock); | 972 | spin_unlock(&si->lock); |
951 | goto nextsi; | 973 | goto nextsi; |
952 | } | 974 | } |
@@ -975,7 +997,7 @@ nextsi: | |||
975 | * swap_avail_head list then try it, otherwise start over | 997 | * swap_avail_head list then try it, otherwise start over |
976 | * if we have not gotten any slots. | 998 | * if we have not gotten any slots. |
977 | */ | 999 | */ |
978 | if (plist_node_empty(&next->avail_list)) | 1000 | if (plist_node_empty(&next->avail_lists[node])) |
979 | goto start_over; | 1001 | goto start_over; |
980 | } | 1002 | } |
981 | 1003 | ||
@@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
2410 | return generic_swapfile_activate(sis, swap_file, span); | 2432 | return generic_swapfile_activate(sis, swap_file, span); |
2411 | } | 2433 | } |
2412 | 2434 | ||
2435 | static int swap_node(struct swap_info_struct *p) | ||
2436 | { | ||
2437 | struct block_device *bdev; | ||
2438 | |||
2439 | if (p->bdev) | ||
2440 | bdev = p->bdev; | ||
2441 | else | ||
2442 | bdev = p->swap_file->f_inode->i_sb->s_bdev; | ||
2443 | |||
2444 | return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; | ||
2445 | } | ||
2446 | |||
2413 | static void _enable_swap_info(struct swap_info_struct *p, int prio, | 2447 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
2414 | unsigned char *swap_map, | 2448 | unsigned char *swap_map, |
2415 | struct swap_cluster_info *cluster_info) | 2449 | struct swap_cluster_info *cluster_info) |
2416 | { | 2450 | { |
2451 | int i; | ||
2452 | |||
2417 | if (prio >= 0) | 2453 | if (prio >= 0) |
2418 | p->prio = prio; | 2454 | p->prio = prio; |
2419 | else | 2455 | else |
@@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
2423 | * low-to-high, while swap ordering is high-to-low | 2459 | * low-to-high, while swap ordering is high-to-low |
2424 | */ | 2460 | */ |
2425 | p->list.prio = -p->prio; | 2461 | p->list.prio = -p->prio; |
2426 | p->avail_list.prio = -p->prio; | 2462 | for_each_node(i) { |
2463 | if (p->prio >= 0) | ||
2464 | p->avail_lists[i].prio = -p->prio; | ||
2465 | else { | ||
2466 | if (swap_node(p) == i) | ||
2467 | p->avail_lists[i].prio = 1; | ||
2468 | else | ||
2469 | p->avail_lists[i].prio = -p->prio; | ||
2470 | } | ||
2471 | } | ||
2427 | p->swap_map = swap_map; | 2472 | p->swap_map = swap_map; |
2428 | p->cluster_info = cluster_info; | 2473 | p->cluster_info = cluster_info; |
2429 | p->flags |= SWP_WRITEOK; | 2474 | p->flags |= SWP_WRITEOK; |
@@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
2442 | * swap_info_struct. | 2487 | * swap_info_struct. |
2443 | */ | 2488 | */ |
2444 | plist_add(&p->list, &swap_active_head); | 2489 | plist_add(&p->list, &swap_active_head); |
2445 | spin_lock(&swap_avail_lock); | 2490 | add_to_avail_list(p); |
2446 | plist_add(&p->avail_list, &swap_avail_head); | ||
2447 | spin_unlock(&swap_avail_lock); | ||
2448 | } | 2491 | } |
2449 | 2492 | ||
2450 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 2493 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
2529 | spin_unlock(&swap_lock); | 2572 | spin_unlock(&swap_lock); |
2530 | goto out_dput; | 2573 | goto out_dput; |
2531 | } | 2574 | } |
2532 | spin_lock(&swap_avail_lock); | 2575 | del_from_avail_list(p); |
2533 | plist_del(&p->avail_list, &swap_avail_head); | ||
2534 | spin_unlock(&swap_avail_lock); | ||
2535 | spin_lock(&p->lock); | 2576 | spin_lock(&p->lock); |
2536 | if (p->prio < 0) { | 2577 | if (p->prio < 0) { |
2537 | struct swap_info_struct *si = p; | 2578 | struct swap_info_struct *si = p; |
2579 | int nid; | ||
2538 | 2580 | ||
2539 | plist_for_each_entry_continue(si, &swap_active_head, list) { | 2581 | plist_for_each_entry_continue(si, &swap_active_head, list) { |
2540 | si->prio++; | 2582 | si->prio++; |
2541 | si->list.prio--; | 2583 | si->list.prio--; |
2542 | si->avail_list.prio--; | 2584 | for_each_node(nid) { |
2585 | if (si->avail_lists[nid].prio != 1) | ||
2586 | si->avail_lists[nid].prio--; | ||
2587 | } | ||
2543 | } | 2588 | } |
2544 | least_priority++; | 2589 | least_priority++; |
2545 | } | 2590 | } |
@@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2783 | { | 2828 | { |
2784 | struct swap_info_struct *p; | 2829 | struct swap_info_struct *p; |
2785 | unsigned int type; | 2830 | unsigned int type; |
2831 | int i; | ||
2786 | 2832 | ||
2787 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 2833 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
2788 | if (!p) | 2834 | if (!p) |
@@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2818 | } | 2864 | } |
2819 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2865 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2820 | plist_node_init(&p->list, 0); | 2866 | plist_node_init(&p->list, 0); |
2821 | plist_node_init(&p->avail_list, 0); | 2867 | for_each_node(i) |
2868 | plist_node_init(&p->avail_lists[i], 0); | ||
2822 | p->flags = SWP_USED; | 2869 | p->flags = SWP_USED; |
2823 | spin_unlock(&swap_lock); | 2870 | spin_unlock(&swap_lock); |
2824 | spin_lock_init(&p->lock); | 2871 | spin_lock_init(&p->lock); |
@@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
3060 | if (!capable(CAP_SYS_ADMIN)) | 3107 | if (!capable(CAP_SYS_ADMIN)) |
3061 | return -EPERM; | 3108 | return -EPERM; |
3062 | 3109 | ||
3110 | if (!swap_avail_heads) | ||
3111 | return -ENOMEM; | ||
3112 | |||
3063 | p = alloc_swap_info(); | 3113 | p = alloc_swap_info(); |
3064 | if (IS_ERR(p)) | 3114 | if (IS_ERR(p)) |
3065 | return PTR_ERR(p); | 3115 | return PTR_ERR(p); |
@@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) | |||
3645 | } | 3695 | } |
3646 | } | 3696 | } |
3647 | } | 3697 | } |
3698 | |||
3699 | static int __init swapfile_init(void) | ||
3700 | { | ||
3701 | int nid; | ||
3702 | |||
3703 | swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), | ||
3704 | GFP_KERNEL); | ||
3705 | if (!swap_avail_heads) { | ||
3706 | pr_emerg("Not enough memory for swap heads, swap is disabled\n"); | ||
3707 | return -ENOMEM; | ||
3708 | } | ||
3709 | |||
3710 | for_each_node(nid) | ||
3711 | plist_head_init(&swap_avail_heads[nid]); | ||
3712 | |||
3713 | return 0; | ||
3714 | } | ||
3715 | subsys_initcall(swapfile_init); | ||