author		Hugh Dickins <hughd@google.com>	2013-02-22 19:35:11 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-23 20:50:19 -0500
commit		4146d2d673e8d6abf9b30a5b5dd8cd95f29632eb
tree		3e0d66fbcbd1f9940b6ebfcb4d02fb0abba9960b /mm
parent		c8d6553b9580188a1324486173d79c0f8642e870
ksm: make !merge_across_nodes migration safe
The new KSM NUMA merge_across_nodes knob introduces a problem, when it's set
to non-default 0: if a KSM page is migrated to a different NUMA node, how do
we migrate its stable node to the right tree?  And what if that collides with
an existing stable node?

ksm_migrate_page() can do no more than it's already doing, updating
stable_node->kpfn: the stable tree itself cannot be manipulated without
holding ksm_thread_mutex.  So accept that a stable tree may temporarily
indicate a page belonging to the wrong NUMA node, leave updating until the
next pass of ksmd, just be careful not to merge other pages on to a
misplaced page.

Note nid of holding tree in stable_node, and recognize that it will not
always match nid of kpfn.

A misplaced KSM page is discovered, either when ksm_do_scan() next comes
around to one of its rmap_items (we now have to go to cmp_and_merge_page
even on pages in a stable tree), or when stable_tree_search() arrives at a
matching node for another page, and this node page is found misplaced.

In each case, move the misplaced stable_node to a list of migrate_nodes
(and use the address of migrate_nodes as magic by which to identify them):
we don't need them in a tree.  If stable_tree_search() finds no match for a
page, but it's currently exiled to this list, then slot its stable_node
right there into the tree, bringing all of its mappings with it; otherwise
they get migrated one by one to the original page of the colliding node.
stable_tree_search() is now modelled more like stable_tree_insert(), in
order to handle these insertions of migrated nodes.

remove_node_from_stable_tree(), remove_all_stable_nodes() and
ksm_check_stable_tree() have to handle the migrate_nodes list as well as
the stable tree itself.  Less obviously, we do need to prune the list of
stale entries from time to time (scan_get_next_rmap_item() does it once
each full scan): whereas stale nodes in the stable tree get naturally
pruned as searches try to brush past them, these migrate_nodes may get
forgotten and accumulate.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
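To make the overlay described above easier to see in isolation, here is a
minimal userspace sketch of the same idea.  The rb_node_stub type and the
list helpers are stand-ins assumed for illustration, not the kernel's; the
point being shown is only that .head overlays the rb_node's parent word, so
head == &migrate_nodes is the sentinel distinguishing a node parked on the
migrate list from one linked into a stable tree:

/*
 * Minimal userspace sketch (not the kernel code) of the union overlay:
 * a stable_node is either linked into an rbtree through .node, or parked
 * on the migrate_nodes list through .list.
 */
#include <assert.h>

struct rb_node_stub { struct rb_node_stub *parent, *left, *right; };
struct list_head { struct list_head *prev, *next; };

static struct list_head migrate_nodes = { &migrate_nodes, &migrate_nodes };

struct stable_node {
	union {
		struct rb_node_stub node;	/* when node of stable tree */
		struct {			/* when listed for migration */
			struct list_head *head;
			struct list_head list;
		};
	};
	unsigned long kpfn;
	int nid;
};

/* insert new_entry right after head, as the kernel's list_add() does */
static void list_add_stub(struct list_head *new_entry, struct list_head *head)
{
	new_entry->next = head->next;
	new_entry->prev = head;
	head->next->prev = new_entry;
	head->next = new_entry;
}

static int stable_node_is_migrating(const struct stable_node *s)
{
	return s->head == &migrate_nodes;
}

int main(void)
{
	struct stable_node s = { .kpfn = 0, .nid = 0 };

	/* exile the node, as the patch does for a misplaced KSM page */
	s.head = &migrate_nodes;
	list_add_stub(&s.list, s.head);
	assert(stable_node_is_migrating(&s));
	return 0;
}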
Diffstat (limited to 'mm')
-rw-r--r--	mm/ksm.c	164
1 file changed, 134 insertions, 30 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index df0529926703..91f07d0e9ce7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -122,13 +122,25 @@ struct ksm_scan {
 /**
  * struct stable_node - node of the stable rbtree
  * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
  * @hlist: hlist head of rmap_items using this ksm page
- * @kpfn: page frame number of this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
-	struct rb_node node;
+	union {
+		struct rb_node node;	/* when node of stable tree */
+		struct {		/* when listed for migration */
+			struct list_head *head;
+			struct list_head list;
+		};
+	};
 	struct hlist_head hlist;
 	unsigned long kpfn;
+#ifdef CONFIG_NUMA
+	int nid;
+#endif
 };
 
 /**
@@ -169,6 +181,9 @@ struct rmap_item {
 static struct rb_root root_unstable_tree[MAX_NUMNODES];
 static struct rb_root root_stable_tree[MAX_NUMNODES];
 
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
@@ -311,11 +326,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
 	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
 }
 
-static inline int in_stable_tree(struct rmap_item *rmap_item)
-{
-	return rmap_item->address & STABLE_FLAG;
-}
-
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -476,7 +486,6 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 {
 	struct rmap_item *rmap_item;
 	struct hlist_node *hlist;
-	int nid;
 
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		if (rmap_item->hlist.next)
@@ -488,8 +497,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 		cond_resched();
 	}
 
-	nid = get_kpfn_nid(stable_node->kpfn);
-	rb_erase(&stable_node->node, &root_stable_tree[nid]);
+	if (stable_node->head == &migrate_nodes)
+		list_del(&stable_node->list);
+	else
+		rb_erase(&stable_node->node,
+			 &root_stable_tree[NUMA(stable_node->nid)]);
 	free_stable_node(stable_node);
 }
 
@@ -712,6 +724,7 @@ static int remove_stable_node(struct stable_node *stable_node)
 static int remove_all_stable_nodes(void)
 {
 	struct stable_node *stable_node;
+	struct list_head *this, *next;
 	int nid;
 	int err = 0;
 
@@ -726,6 +739,12 @@ static int remove_all_stable_nodes(void)
 			cond_resched();
 		}
 	}
+	list_for_each_safe(this, next, &migrate_nodes) {
+		stable_node = list_entry(this, struct stable_node, list);
+		if (remove_stable_node(stable_node))
+			err = -EBUSY;
+		cond_resched();
+	}
 	return err;
 }
 
@@ -1113,25 +1132,30 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
  */
 static struct page *stable_tree_search(struct page *page)
 {
-	struct rb_node *node;
-	struct stable_node *stable_node;
 	int nid;
+	struct rb_node **new;
+	struct rb_node *parent;
+	struct stable_node *stable_node;
+	struct stable_node *page_node;
 
-	stable_node = page_stable_node(page);
-	if (stable_node) {		/* ksm page forked */
+	page_node = page_stable_node(page);
+	if (page_node && page_node->head != &migrate_nodes) {
+		/* ksm page forked */
 		get_page(page);
 		return page;
 	}
 
 	nid = get_kpfn_nid(page_to_pfn(page));
-	node = root_stable_tree[nid].rb_node;
+again:
+	new = &root_stable_tree[nid].rb_node;
+	parent = NULL;
 
-	while (node) {
+	while (*new) {
 		struct page *tree_page;
 		int ret;
 
 		cond_resched();
-		stable_node = rb_entry(node, struct stable_node, node);
+		stable_node = rb_entry(*new, struct stable_node, node);
 		tree_page = get_ksm_page(stable_node, false);
 		if (!tree_page)
 			return NULL;
@@ -1139,10 +1163,11 @@ static struct page *stable_tree_search(struct page *page)
 		ret = memcmp_pages(page, tree_page);
 		put_page(tree_page);
 
+		parent = *new;
 		if (ret < 0)
-			node = node->rb_left;
+			new = &parent->rb_left;
 		else if (ret > 0)
-			node = node->rb_right;
+			new = &parent->rb_right;
 		else {
 			/*
 			 * Lock and unlock the stable_node's page (which
@@ -1152,13 +1177,49 @@ static struct page *stable_tree_search(struct page *page)
 			 * than kpage, but that involves more changes.
 			 */
 			tree_page = get_ksm_page(stable_node, true);
-			if (tree_page)
+			if (tree_page) {
 				unlock_page(tree_page);
-			return tree_page;
+				if (get_kpfn_nid(stable_node->kpfn) !=
+						NUMA(stable_node->nid)) {
+					put_page(tree_page);
+					goto replace;
+				}
+				return tree_page;
+			}
+			/*
+			 * There is now a place for page_node, but the tree may
+			 * have been rebalanced, so re-evaluate parent and new.
+			 */
+			if (page_node)
+				goto again;
+			return NULL;
 		}
 	}
 
-	return NULL;
+	if (!page_node)
+		return NULL;
+
+	list_del(&page_node->list);
+	DO_NUMA(page_node->nid = nid);
+	rb_link_node(&page_node->node, parent, new);
+	rb_insert_color(&page_node->node, &root_stable_tree[nid]);
+	get_page(page);
+	return page;
+
+replace:
+	if (page_node) {
+		list_del(&page_node->list);
+		DO_NUMA(page_node->nid = nid);
+		rb_replace_node(&stable_node->node,
+				&page_node->node, &root_stable_tree[nid]);
+		get_page(page);
+	} else {
+		rb_erase(&stable_node->node, &root_stable_tree[nid]);
+		page = NULL;
+	}
+	stable_node->head = &migrate_nodes;
+	list_add(&stable_node->list, stable_node->head);
+	return page;
 }
 
 /*
@@ -1215,6 +1276,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 	INIT_HLIST_HEAD(&stable_node->hlist);
 	stable_node->kpfn = kpfn;
 	set_page_stable_node(kpage, stable_node);
+	DO_NUMA(stable_node->nid = nid);
 	rb_link_node(&stable_node->node, parent, new);
 	rb_insert_color(&stable_node->node, &root_stable_tree[nid]);
 
@@ -1311,11 +1373,6 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 static void stable_tree_append(struct rmap_item *rmap_item,
 			       struct stable_node *stable_node)
 {
-	/*
-	 * Usually rmap_item->nid is already set correctly,
-	 * but it may be wrong after switching merge_across_nodes.
-	 */
-	DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn));
 	rmap_item->head = stable_node;
 	rmap_item->address |= STABLE_FLAG;
 	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1344,10 +1401,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	unsigned int checksum;
 	int err;
 
-	remove_rmap_item_from_tree(rmap_item);
+	stable_node = page_stable_node(page);
+	if (stable_node) {
+		if (stable_node->head != &migrate_nodes &&
+		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+			rb_erase(&stable_node->node,
+				 &root_stable_tree[NUMA(stable_node->nid)]);
+			stable_node->head = &migrate_nodes;
+			list_add(&stable_node->list, stable_node->head);
+		}
+		if (stable_node->head != &migrate_nodes &&
+		    rmap_item->head == stable_node)
+			return;
+	}
 
 	/* We first start with searching the page inside the stable tree */
 	kpage = stable_tree_search(page);
+	if (kpage == page && rmap_item->head == stable_node) {
+		put_page(kpage);
+		return;
+	}
+
+	remove_rmap_item_from_tree(rmap_item);
+
 	if (kpage) {
 		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
 		if (!err) {
@@ -1464,6 +1540,27 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		 */
 		lru_add_drain_all();
 
+		/*
+		 * Whereas stale stable_nodes on the stable_tree itself
+		 * get pruned in the regular course of stable_tree_search(),
+		 * those moved out to the migrate_nodes list can accumulate:
+		 * so prune them once before each full scan.
+		 */
+		if (!ksm_merge_across_nodes) {
+			struct stable_node *stable_node;
+			struct list_head *this, *next;
+			struct page *page;
+
+			list_for_each_safe(this, next, &migrate_nodes) {
+				stable_node = list_entry(this,
+						struct stable_node, list);
+				page = get_ksm_page(stable_node, false);
+				if (page)
+					put_page(page);
+				cond_resched();
+			}
+		}
+
 		for (nid = 0; nid < nr_node_ids; nid++)
 			root_unstable_tree[nid] = RB_ROOT;
 
@@ -1586,8 +1683,7 @@ static void ksm_do_scan(unsigned int scan_npages)
 		rmap_item = scan_get_next_rmap_item(&page);
 		if (!rmap_item)
 			return;
-		if (!PageKsm(page) || !in_stable_tree(rmap_item))
-			cmp_and_merge_page(page, rmap_item);
+		cmp_and_merge_page(page, rmap_item);
 		put_page(page);
 	}
 }
@@ -1964,6 +2060,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
 	struct stable_node *stable_node;
+	struct list_head *this, *next;
 	struct rb_node *node;
 	int nid;
 
@@ -1984,6 +2081,13 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
 			cond_resched();
 		}
 	}
+	list_for_each_safe(this, next, &migrate_nodes) {
+		stable_node = list_entry(this, struct stable_node, list);
+		if (stable_node->kpfn >= start_pfn &&
+		    stable_node->kpfn < end_pfn)
+			remove_node_from_stable_tree(stable_node);
+		cond_resched();
+	}
 }
 
 static int ksm_memory_callback(struct notifier_block *self,
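
A closing note on the pruning walk added to scan_get_next_rmap_item(): it
relies on get_ksm_page() unlinking and freeing a stale stable_node as a
side effect of the probe, which is why the _safe list iterator is needed.
Below is a minimal userspace sketch of that idiom under the same
assumptions as the earlier sketch; probe() is a hypothetical stand-in for
get_ksm_page(), not a kernel function:

/*
 * Sketch of the pruning idiom (userspace stand-ins, not kernel code):
 * the walk must use the _safe form because the probe may unlink and
 * free the entry it is handed.
 */
#include <stdlib.h>

struct list_head { struct list_head *prev, *next; };

struct entry {
	struct list_head list;		/* must stay first for the cast below */
	int stale;
};

static void list_del_stub(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

/* hypothetical probe: unlinks and frees a stale entry, the way
 * get_ksm_page() drops a stable_node whose page has gone away */
static void probe(struct entry *e)
{
	if (e->stale) {
		list_del_stub(&e->list);
		free(e);
	}
}

static void prune(struct list_head *head)
{
	struct list_head *this, *next;

	/* open-coded list_for_each_safe(): save next before probing, so
	 * the walk survives probe() unlinking the current entry */
	for (this = head->next, next = this->next; this != head;
	     this = next, next = this->next)
		probe((struct entry *)this);	/* list is first member */
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct entry *e = calloc(1, sizeof(*e));

	e->stale = 1;
	e->list.prev = &head;		/* link the single entry into the list */
	e->list.next = &head;
	head.next = head.prev = &e->list;

	prune(&head);			/* the stale entry is unlinked and freed */
	return head.next == &head ? 0 : 1;
}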