author		Petr Holasek <pholasek@redhat.com>	2013-02-22 19:35:00 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-23 20:50:19 -0500
commit		90bd6fd31c8097ee4ddcb74b7e08363134863de5 (patch)
tree		830b2f923a87a20aaa4ca74b58bcfd991231a889 /mm
parent		22b751c3d0376e86a377e3a0aa2ddbbe9d2eefc1 (diff)
ksm: allow trees per NUMA node
Here's a KSM series, based on mmotm 2013-01-23-17-04: starting with Petr's v7 "KSM: numa awareness sysfs knob"; then fixing the two issues we had with that, fully enabling KSM page migration on the way.

(A different kind of KSM/NUMA issue which I've certainly not begun to address here: when KSM pages are unmerged, there's usually no sense in preferring to allocate the new pages local to the caller's node.)

This patch:

Introduces a new sysfs boolean knob /sys/kernel/mm/ksm/merge_across_nodes which controls merging of pages across different NUMA nodes. When it is set to zero, only pages from the same node are merged; otherwise pages from all nodes can be merged together (the default behavior).

A typical use-case is a lot of KVM guests on a NUMA machine, where CPUs on more distant nodes would see a significant increase in access latency to a merged KSM page. A sysfs knob was chosen for greater flexibility, since some users may still prefer a higher amount of saved physical memory regardless of access latency.

Every NUMA node has its own stable & unstable tree, for faster searching and inserting. Changing the merge_across_nodes value is only possible when there are no KSM shared pages in the system.

I've tested this patch on NUMA machines with 2, 4 and 8 nodes and measured the speed of memory access inside KVM guests with memory pinned to one of the nodes, using this benchmark:

http://pholasek.fedorapeople.org/alloc_pg.c

Population standard deviations of access times, as a percentage of the average, were as follows:

merge_across_nodes=1
2 nodes 1.4%
4 nodes 1.6%
8 nodes 1.7%

merge_across_nodes=0
2 nodes 1%
4 nodes 0.32%
8 nodes 0.018%

RFC: https://lkml.org/lkml/2011/11/30/91
v1: https://lkml.org/lkml/2012/1/23/46
v2: https://lkml.org/lkml/2012/6/29/105
v3: https://lkml.org/lkml/2012/9/14/550
v4: https://lkml.org/lkml/2012/9/23/137
v5: https://lkml.org/lkml/2012/12/10/540
v6: https://lkml.org/lkml/2012/12/23/154
v7: https://lkml.org/lkml/2012/12/27/225

Hugh notes that this patch brings two problems, whose solution needs further support in mm/ksm.c, which follows in subsequent patches:

1) switching merge_across_nodes after running KSM is liable to oops on stale nodes still left over from the previous stable tree;

2) memory hotremove may migrate KSM pages, but there is no provision here for !merge_across_nodes to migrate nodes to the proper tree.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
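[Editor's illustration, not part of the patch: a minimal userspace sketch, assuming only the /sys/kernel/mm/ksm/merge_across_nodes path introduced above, of how the knob might be flipped from C. Per the description, the store is refused with EBUSY while any KSM pages are still shared, so pages would first have to be unmerged, e.g. by writing 2 to /sys/kernel/mm/ksm/run.]

	/* Hypothetical example program, not from this patch. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/mm/ksm/merge_across_nodes", O_WRONLY);

		if (fd < 0) {
			perror("open");	/* kernel without this patch, or !CONFIG_NUMA */
			return 1;
		}
		/* Expected to fail with EBUSY while KSM pages are still shared */
		if (write(fd, "0", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}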
Diffstat (limited to 'mm')
-rw-r--r--	mm/ksm.c	151
1 file changed, 132 insertions, 19 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index d3842b206f8a..1602cc9e3d73 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -36,6 +36,7 @@
 #include <linux/hashtable.h>
 #include <linux/freezer.h>
 #include <linux/oom.h>
+#include <linux/numa.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -139,6 +140,9 @@ struct rmap_item {
 	struct mm_struct *mm;
 	unsigned long address;		/* + low bits used for flags below */
 	unsigned int oldchecksum;	/* when unstable */
+#ifdef CONFIG_NUMA
+	unsigned int nid;
+#endif
 	union {
 		struct rb_node node;	/* when node of unstable tree */
 		struct {		/* when listed from stable tree */
@@ -153,8 +157,8 @@ struct rmap_item {
 #define STABLE_FLAG	0x200	/* is listed from the stable tree */
 
 /* The stable and unstable tree heads */
-static struct rb_root root_stable_tree = RB_ROOT;
-static struct rb_root root_unstable_tree = RB_ROOT;
+static struct rb_root root_unstable_tree[MAX_NUMNODES];
+static struct rb_root root_stable_tree[MAX_NUMNODES];
 
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -188,6 +192,9 @@ static unsigned int ksm_thread_pages_to_scan = 100;
 /* Milliseconds ksmd should sleep between batches */
 static unsigned int ksm_thread_sleep_millisecs = 20;
 
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+
 #define KSM_RUN_STOP	0
 #define KSM_RUN_MERGE	1
 #define KSM_RUN_UNMERGE	2
@@ -441,10 +448,25 @@ out: page = NULL;
 	return page;
 }
 
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+	if (ksm_merge_across_nodes)
+		return 0;
+	else
+		return pfn_to_nid(kpfn);
+}
+
 static void remove_node_from_stable_tree(struct stable_node *stable_node)
 {
 	struct rmap_item *rmap_item;
 	struct hlist_node *hlist;
+	int nid;
 
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		if (rmap_item->hlist.next)
@@ -456,7 +478,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 		cond_resched();
 	}
 
-	rb_erase(&stable_node->node, &root_stable_tree);
+	nid = get_kpfn_nid(stable_node->kpfn);
+
+	rb_erase(&stable_node->node, &root_stable_tree[nid]);
 	free_stable_node(stable_node);
 }
 
@@ -554,7 +578,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 		BUG_ON(age > 1);
 		if (!age)
-			rb_erase(&rmap_item->node, &root_unstable_tree);
+#ifdef CONFIG_NUMA
+			rb_erase(&rmap_item->node,
+					&root_unstable_tree[rmap_item->nid]);
+#else
+			rb_erase(&rmap_item->node, &root_unstable_tree[0]);
+#endif
 
 		ksm_pages_unshared--;
 		rmap_item->address &= PAGE_MASK;
@@ -990,8 +1019,9 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
  */
 static struct page *stable_tree_search(struct page *page)
 {
-	struct rb_node *node = root_stable_tree.rb_node;
+	struct rb_node *node;
 	struct stable_node *stable_node;
+	int nid;
 
 	stable_node = page_stable_node(page);
 	if (stable_node) {			/* ksm page forked */
@@ -999,6 +1029,9 @@ static struct page *stable_tree_search(struct page *page)
 		return page;
 	}
 
+	nid = get_kpfn_nid(page_to_pfn(page));
+	node = root_stable_tree[nid].rb_node;
+
 	while (node) {
 		struct page *tree_page;
 		int ret;
@@ -1033,10 +1066,16 @@ static struct page *stable_tree_search(struct page *page)
  */
 static struct stable_node *stable_tree_insert(struct page *kpage)
 {
-	struct rb_node **new = &root_stable_tree.rb_node;
+	int nid;
+	unsigned long kpfn;
+	struct rb_node **new;
 	struct rb_node *parent = NULL;
 	struct stable_node *stable_node;
 
+	kpfn = page_to_pfn(kpage);
+	nid = get_kpfn_nid(kpfn);
+	new = &root_stable_tree[nid].rb_node;
+
 	while (*new) {
 		struct page *tree_page;
 		int ret;
@@ -1070,11 +1109,11 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 		return NULL;
 
 	rb_link_node(&stable_node->node, parent, new);
-	rb_insert_color(&stable_node->node, &root_stable_tree);
+	rb_insert_color(&stable_node->node, &root_stable_tree[nid]);
 
 	INIT_HLIST_HEAD(&stable_node->hlist);
 
-	stable_node->kpfn = page_to_pfn(kpage);
+	stable_node->kpfn = kpfn;
 	set_page_stable_node(kpage, stable_node);
 
 	return stable_node;
@@ -1098,10 +1137,15 @@ static
 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 					      struct page *page,
 					      struct page **tree_pagep)
-
 {
-	struct rb_node **new = &root_unstable_tree.rb_node;
+	struct rb_node **new;
+	struct rb_root *root;
 	struct rb_node *parent = NULL;
+	int nid;
+
+	nid = get_kpfn_nid(page_to_pfn(page));
+	root = &root_unstable_tree[nid];
+	new = &root->rb_node;
 
 	while (*new) {
 		struct rmap_item *tree_rmap_item;
@@ -1122,6 +1166,18 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 			return NULL;
 		}
 
+		/*
+		 * If tree_page has been migrated to another NUMA node, it
+		 * will be flushed out and put into the right unstable tree
+		 * next time: only merge with it if merge_across_nodes.
+		 * Just notice, we don't have similar problem for PageKsm
+		 * because their migration is disabled now. (62b61f611e)
+		 */
+		if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) {
+			put_page(tree_page);
+			return NULL;
+		}
+
 		ret = memcmp_pages(page, tree_page);
 
 		parent = *new;
@@ -1139,8 +1195,11 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 
 	rmap_item->address |= UNSTABLE_FLAG;
 	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+#ifdef CONFIG_NUMA
+	rmap_item->nid = nid;
+#endif
 	rb_link_node(&rmap_item->node, parent, new);
-	rb_insert_color(&rmap_item->node, &root_unstable_tree);
+	rb_insert_color(&rmap_item->node, root);
 
 	ksm_pages_unshared++;
 	return NULL;
@@ -1154,6 +1213,13 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 static void stable_tree_append(struct rmap_item *rmap_item,
 			       struct stable_node *stable_node)
 {
+#ifdef CONFIG_NUMA
+	/*
+	 * Usually rmap_item->nid is already set correctly,
+	 * but it may be wrong after switching merge_across_nodes.
+	 */
+	rmap_item->nid = get_kpfn_nid(stable_node->kpfn);
+#endif
 	rmap_item->head = stable_node;
 	rmap_item->address |= STABLE_FLAG;
 	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1283,6 +1349,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 	struct mm_slot *slot;
 	struct vm_area_struct *vma;
 	struct rmap_item *rmap_item;
+	int nid;
 
 	if (list_empty(&ksm_mm_head.mm_list))
 		return NULL;
@@ -1301,7 +1368,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		 */
 		lru_add_drain_all();
 
-		root_unstable_tree = RB_ROOT;
+		for (nid = 0; nid < nr_node_ids; nid++)
+			root_unstable_tree[nid] = RB_ROOT;
 
 		spin_lock(&ksm_mmlist_lock);
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1770,15 +1838,19 @@ static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
 						 unsigned long end_pfn)
 {
 	struct rb_node *node;
+	int nid;
 
-	for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
-		struct stable_node *stable_node;
+	for (nid = 0; nid < nr_node_ids; nid++)
+		for (node = rb_first(&root_stable_tree[nid]); node;
+		     node = rb_next(node)) {
+			struct stable_node *stable_node;
+
+			stable_node = rb_entry(node, struct stable_node, node);
+			if (stable_node->kpfn >= start_pfn &&
+			    stable_node->kpfn < end_pfn)
+				return stable_node;
+		}
 
-		stable_node = rb_entry(node, struct stable_node, node);
-		if (stable_node->kpfn >= start_pfn &&
-		    stable_node->kpfn < end_pfn)
-			return stable_node;
-	}
 	return NULL;
 }
 
@@ -1925,6 +1997,40 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 KSM_ATTR(run);
 
+#ifdef CONFIG_NUMA
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long knob;
+
+	err = kstrtoul(buf, 10, &knob);
+	if (err)
+		return err;
+	if (knob > 1)
+		return -EINVAL;
+
+	mutex_lock(&ksm_thread_mutex);
+	if (ksm_merge_across_nodes != knob) {
+		if (ksm_pages_shared)
+			err = -EBUSY;
+		else
+			ksm_merge_across_nodes = knob;
+	}
+	mutex_unlock(&ksm_thread_mutex);
+
+	return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
 static ssize_t pages_shared_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
 {
@@ -1979,6 +2085,9 @@ static struct attribute *ksm_attrs[] = {
 	&pages_unshared_attr.attr,
 	&pages_volatile_attr.attr,
 	&full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+	&merge_across_nodes_attr.attr,
+#endif
 	NULL,
 };
 
@@ -1992,11 +2101,15 @@ static int __init ksm_init(void)
 {
 	struct task_struct *ksm_thread;
 	int err;
+	int nid;
 
 	err = ksm_slab_init();
 	if (err)
 		goto out;
 
+	for (nid = 0; nid < nr_node_ids; nid++)
+		root_stable_tree[nid] = RB_ROOT;
+
 	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
 	if (IS_ERR(ksm_thread)) {
 		printk(KERN_ERR "ksm: creating kthread failed\n");