diff options
author | Hugh Dickins <hughd@google.com> | 2013-02-22 19:35:11 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 20:50:19 -0500 |
commit | 4146d2d673e8d6abf9b30a5b5dd8cd95f29632eb (patch) | |
tree | 3e0d66fbcbd1f9940b6ebfcb4d02fb0abba9960b /mm | |
parent | c8d6553b9580188a1324486173d79c0f8642e870 (diff) |
ksm: make !merge_across_nodes migration safe
The new KSM NUMA merge_across_nodes knob introduces a problem, when it's
set to non-default 0: if a KSM page is migrated to a different NUMA node,
how do we migrate its stable node to the right tree? And what if that
collides with an existing stable node?
ksm_migrate_page() can do no more than it's already doing, updating
stable_node->kpfn: the stable tree itself cannot be manipulated without
holding ksm_thread_mutex. So accept that a stable tree may temporarily
indicate a page belonging to the wrong NUMA node, leave updating until the
next pass of ksmd, just be careful not to merge other pages on to a
misplaced page. Note nid of holding tree in stable_node, and recognize
that it will not always match nid of kpfn.
A misplaced KSM page is discovered, either when ksm_do_scan() next comes
around to one of its rmap_items (we now have to go to cmp_and_merge_page
even on pages in a stable tree), or when stable_tree_search() arrives at a
matching node for another page, and this node page is found misplaced.
In each case, move the misplaced stable_node to a list of migrate_nodes
(and use the address of migrate_nodes as magic by which to identify them):
we don't need them in a tree. If stable_tree_search() finds no match for
a page, but it's currently exiled to this list, then slot its stable_node
right there into the tree, bringing all of its mappings with it; otherwise
they get migrated one by one to the original page of the colliding node.
stable_tree_search() is now modelled more like stable_tree_insert(), in
order to handle these insertions of migrated nodes.
remove_node_from_stable_tree(), remove_all_stable_nodes() and
ksm_check_stable_tree() have to handle the migrate_nodes list as well as
the stable tree itself. Less obviously, we do need to prune the list of
stale entries from time to time (scan_get_next_rmap_item() does it once
each full scan): whereas stale nodes in the stable tree get naturally
pruned as searches try to brush past them, these migrate_nodes may get
forgotten and accumulate.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/ksm.c | 164 |
1 file changed, 134 insertions, 30 deletions
@@ -122,13 +122,25 @@ struct ksm_scan { | |||
122 | /** | 122 | /** |
123 | * struct stable_node - node of the stable rbtree | 123 | * struct stable_node - node of the stable rbtree |
124 | * @node: rb node of this ksm page in the stable tree | 124 | * @node: rb node of this ksm page in the stable tree |
125 | * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | ||
126 | * @list: linked into migrate_nodes, pending placement in the proper node tree | ||
125 | * @hlist: hlist head of rmap_items using this ksm page | 127 | * @hlist: hlist head of rmap_items using this ksm page |
126 | * @kpfn: page frame number of this ksm page | 128 | * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) |
129 | * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | ||
127 | */ | 130 | */ |
128 | struct stable_node { | 131 | struct stable_node { |
129 | struct rb_node node; | 132 | union { |
133 | struct rb_node node; /* when node of stable tree */ | ||
134 | struct { /* when listed for migration */ | ||
135 | struct list_head *head; | ||
136 | struct list_head list; | ||
137 | }; | ||
138 | }; | ||
130 | struct hlist_head hlist; | 139 | struct hlist_head hlist; |
131 | unsigned long kpfn; | 140 | unsigned long kpfn; |
141 | #ifdef CONFIG_NUMA | ||
142 | int nid; | ||
143 | #endif | ||
132 | }; | 144 | }; |
133 | 145 | ||
134 | /** | 146 | /** |
@@ -169,6 +181,9 @@ struct rmap_item { | |||
169 | static struct rb_root root_unstable_tree[MAX_NUMNODES]; | 181 | static struct rb_root root_unstable_tree[MAX_NUMNODES]; |
170 | static struct rb_root root_stable_tree[MAX_NUMNODES]; | 182 | static struct rb_root root_stable_tree[MAX_NUMNODES]; |
171 | 183 | ||
184 | /* Recently migrated nodes of stable tree, pending proper placement */ | ||
185 | static LIST_HEAD(migrate_nodes); | ||
186 | |||
172 | #define MM_SLOTS_HASH_BITS 10 | 187 | #define MM_SLOTS_HASH_BITS 10 |
173 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | 188 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
174 | 189 | ||
@@ -311,11 +326,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
311 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); | 326 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
312 | } | 327 | } |
313 | 328 | ||
314 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
315 | { | ||
316 | return rmap_item->address & STABLE_FLAG; | ||
317 | } | ||
318 | |||
319 | /* | 329 | /* |
320 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 330 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
321 | * page tables after it has passed through ksm_exit() - which, if necessary, | 331 | * page tables after it has passed through ksm_exit() - which, if necessary, |
@@ -476,7 +486,6 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
476 | { | 486 | { |
477 | struct rmap_item *rmap_item; | 487 | struct rmap_item *rmap_item; |
478 | struct hlist_node *hlist; | 488 | struct hlist_node *hlist; |
479 | int nid; | ||
480 | 489 | ||
481 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 490 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
482 | if (rmap_item->hlist.next) | 491 | if (rmap_item->hlist.next) |
@@ -488,8 +497,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
488 | cond_resched(); | 497 | cond_resched(); |
489 | } | 498 | } |
490 | 499 | ||
491 | nid = get_kpfn_nid(stable_node->kpfn); | 500 | if (stable_node->head == &migrate_nodes) |
492 | rb_erase(&stable_node->node, &root_stable_tree[nid]); | 501 | list_del(&stable_node->list); |
502 | else | ||
503 | rb_erase(&stable_node->node, | ||
504 | &root_stable_tree[NUMA(stable_node->nid)]); | ||
493 | free_stable_node(stable_node); | 505 | free_stable_node(stable_node); |
494 | } | 506 | } |
495 | 507 | ||
@@ -712,6 +724,7 @@ static int remove_stable_node(struct stable_node *stable_node) | |||
712 | static int remove_all_stable_nodes(void) | 724 | static int remove_all_stable_nodes(void) |
713 | { | 725 | { |
714 | struct stable_node *stable_node; | 726 | struct stable_node *stable_node; |
727 | struct list_head *this, *next; | ||
715 | int nid; | 728 | int nid; |
716 | int err = 0; | 729 | int err = 0; |
717 | 730 | ||
@@ -726,6 +739,12 @@ static int remove_all_stable_nodes(void) | |||
726 | cond_resched(); | 739 | cond_resched(); |
727 | } | 740 | } |
728 | } | 741 | } |
742 | list_for_each_safe(this, next, &migrate_nodes) { | ||
743 | stable_node = list_entry(this, struct stable_node, list); | ||
744 | if (remove_stable_node(stable_node)) | ||
745 | err = -EBUSY; | ||
746 | cond_resched(); | ||
747 | } | ||
729 | return err; | 748 | return err; |
730 | } | 749 | } |
731 | 750 | ||
@@ -1113,25 +1132,30 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
1113 | */ | 1132 | */ |
1114 | static struct page *stable_tree_search(struct page *page) | 1133 | static struct page *stable_tree_search(struct page *page) |
1115 | { | 1134 | { |
1116 | struct rb_node *node; | ||
1117 | struct stable_node *stable_node; | ||
1118 | int nid; | 1135 | int nid; |
1136 | struct rb_node **new; | ||
1137 | struct rb_node *parent; | ||
1138 | struct stable_node *stable_node; | ||
1139 | struct stable_node *page_node; | ||
1119 | 1140 | ||
1120 | stable_node = page_stable_node(page); | 1141 | page_node = page_stable_node(page); |
1121 | if (stable_node) { /* ksm page forked */ | 1142 | if (page_node && page_node->head != &migrate_nodes) { |
1143 | /* ksm page forked */ | ||
1122 | get_page(page); | 1144 | get_page(page); |
1123 | return page; | 1145 | return page; |
1124 | } | 1146 | } |
1125 | 1147 | ||
1126 | nid = get_kpfn_nid(page_to_pfn(page)); | 1148 | nid = get_kpfn_nid(page_to_pfn(page)); |
1127 | node = root_stable_tree[nid].rb_node; | 1149 | again: |
1150 | new = &root_stable_tree[nid].rb_node; | ||
1151 | parent = NULL; | ||
1128 | 1152 | ||
1129 | while (node) { | 1153 | while (*new) { |
1130 | struct page *tree_page; | 1154 | struct page *tree_page; |
1131 | int ret; | 1155 | int ret; |
1132 | 1156 | ||
1133 | cond_resched(); | 1157 | cond_resched(); |
1134 | stable_node = rb_entry(node, struct stable_node, node); | 1158 | stable_node = rb_entry(*new, struct stable_node, node); |
1135 | tree_page = get_ksm_page(stable_node, false); | 1159 | tree_page = get_ksm_page(stable_node, false); |
1136 | if (!tree_page) | 1160 | if (!tree_page) |
1137 | return NULL; | 1161 | return NULL; |
@@ -1139,10 +1163,11 @@ static struct page *stable_tree_search(struct page *page) | |||
1139 | ret = memcmp_pages(page, tree_page); | 1163 | ret = memcmp_pages(page, tree_page); |
1140 | put_page(tree_page); | 1164 | put_page(tree_page); |
1141 | 1165 | ||
1166 | parent = *new; | ||
1142 | if (ret < 0) | 1167 | if (ret < 0) |
1143 | node = node->rb_left; | 1168 | new = &parent->rb_left; |
1144 | else if (ret > 0) | 1169 | else if (ret > 0) |
1145 | node = node->rb_right; | 1170 | new = &parent->rb_right; |
1146 | else { | 1171 | else { |
1147 | /* | 1172 | /* |
1148 | * Lock and unlock the stable_node's page (which | 1173 | * Lock and unlock the stable_node's page (which |
@@ -1152,13 +1177,49 @@ static struct page *stable_tree_search(struct page *page) | |||
1152 | * than kpage, but that involves more changes. | 1177 | * than kpage, but that involves more changes. |
1153 | */ | 1178 | */ |
1154 | tree_page = get_ksm_page(stable_node, true); | 1179 | tree_page = get_ksm_page(stable_node, true); |
1155 | if (tree_page) | 1180 | if (tree_page) { |
1156 | unlock_page(tree_page); | 1181 | unlock_page(tree_page); |
1157 | return tree_page; | 1182 | if (get_kpfn_nid(stable_node->kpfn) != |
1183 | NUMA(stable_node->nid)) { | ||
1184 | put_page(tree_page); | ||
1185 | goto replace; | ||
1186 | } | ||
1187 | return tree_page; | ||
1188 | } | ||
1189 | /* | ||
1190 | * There is now a place for page_node, but the tree may | ||
1191 | * have been rebalanced, so re-evaluate parent and new. | ||
1192 | */ | ||
1193 | if (page_node) | ||
1194 | goto again; | ||
1195 | return NULL; | ||
1158 | } | 1196 | } |
1159 | } | 1197 | } |
1160 | 1198 | ||
1161 | return NULL; | 1199 | if (!page_node) |
1200 | return NULL; | ||
1201 | |||
1202 | list_del(&page_node->list); | ||
1203 | DO_NUMA(page_node->nid = nid); | ||
1204 | rb_link_node(&page_node->node, parent, new); | ||
1205 | rb_insert_color(&page_node->node, &root_stable_tree[nid]); | ||
1206 | get_page(page); | ||
1207 | return page; | ||
1208 | |||
1209 | replace: | ||
1210 | if (page_node) { | ||
1211 | list_del(&page_node->list); | ||
1212 | DO_NUMA(page_node->nid = nid); | ||
1213 | rb_replace_node(&stable_node->node, | ||
1214 | &page_node->node, &root_stable_tree[nid]); | ||
1215 | get_page(page); | ||
1216 | } else { | ||
1217 | rb_erase(&stable_node->node, &root_stable_tree[nid]); | ||
1218 | page = NULL; | ||
1219 | } | ||
1220 | stable_node->head = &migrate_nodes; | ||
1221 | list_add(&stable_node->list, stable_node->head); | ||
1222 | return page; | ||
1162 | } | 1223 | } |
1163 | 1224 | ||
1164 | /* | 1225 | /* |
@@ -1215,6 +1276,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
1215 | INIT_HLIST_HEAD(&stable_node->hlist); | 1276 | INIT_HLIST_HEAD(&stable_node->hlist); |
1216 | stable_node->kpfn = kpfn; | 1277 | stable_node->kpfn = kpfn; |
1217 | set_page_stable_node(kpage, stable_node); | 1278 | set_page_stable_node(kpage, stable_node); |
1279 | DO_NUMA(stable_node->nid = nid); | ||
1218 | rb_link_node(&stable_node->node, parent, new); | 1280 | rb_link_node(&stable_node->node, parent, new); |
1219 | rb_insert_color(&stable_node->node, &root_stable_tree[nid]); | 1281 | rb_insert_color(&stable_node->node, &root_stable_tree[nid]); |
1220 | 1282 | ||
@@ -1311,11 +1373,6 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1311 | static void stable_tree_append(struct rmap_item *rmap_item, | 1373 | static void stable_tree_append(struct rmap_item *rmap_item, |
1312 | struct stable_node *stable_node) | 1374 | struct stable_node *stable_node) |
1313 | { | 1375 | { |
1314 | /* | ||
1315 | * Usually rmap_item->nid is already set correctly, | ||
1316 | * but it may be wrong after switching merge_across_nodes. | ||
1317 | */ | ||
1318 | DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn)); | ||
1319 | rmap_item->head = stable_node; | 1376 | rmap_item->head = stable_node; |
1320 | rmap_item->address |= STABLE_FLAG; | 1377 | rmap_item->address |= STABLE_FLAG; |
1321 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | 1378 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); |
@@ -1344,10 +1401,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1344 | unsigned int checksum; | 1401 | unsigned int checksum; |
1345 | int err; | 1402 | int err; |
1346 | 1403 | ||
1347 | remove_rmap_item_from_tree(rmap_item); | 1404 | stable_node = page_stable_node(page); |
1405 | if (stable_node) { | ||
1406 | if (stable_node->head != &migrate_nodes && | ||
1407 | get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | ||
1408 | rb_erase(&stable_node->node, | ||
1409 | &root_stable_tree[NUMA(stable_node->nid)]); | ||
1410 | stable_node->head = &migrate_nodes; | ||
1411 | list_add(&stable_node->list, stable_node->head); | ||
1412 | } | ||
1413 | if (stable_node->head != &migrate_nodes && | ||
1414 | rmap_item->head == stable_node) | ||
1415 | return; | ||
1416 | } | ||
1348 | 1417 | ||
1349 | /* We first start with searching the page inside the stable tree */ | 1418 | /* We first start with searching the page inside the stable tree */ |
1350 | kpage = stable_tree_search(page); | 1419 | kpage = stable_tree_search(page); |
1420 | if (kpage == page && rmap_item->head == stable_node) { | ||
1421 | put_page(kpage); | ||
1422 | return; | ||
1423 | } | ||
1424 | |||
1425 | remove_rmap_item_from_tree(rmap_item); | ||
1426 | |||
1351 | if (kpage) { | 1427 | if (kpage) { |
1352 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 1428 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1353 | if (!err) { | 1429 | if (!err) { |
@@ -1464,6 +1540,27 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1464 | */ | 1540 | */ |
1465 | lru_add_drain_all(); | 1541 | lru_add_drain_all(); |
1466 | 1542 | ||
1543 | /* | ||
1544 | * Whereas stale stable_nodes on the stable_tree itself | ||
1545 | * get pruned in the regular course of stable_tree_search(), | ||
1546 | * those moved out to the migrate_nodes list can accumulate: | ||
1547 | * so prune them once before each full scan. | ||
1548 | */ | ||
1549 | if (!ksm_merge_across_nodes) { | ||
1550 | struct stable_node *stable_node; | ||
1551 | struct list_head *this, *next; | ||
1552 | struct page *page; | ||
1553 | |||
1554 | list_for_each_safe(this, next, &migrate_nodes) { | ||
1555 | stable_node = list_entry(this, | ||
1556 | struct stable_node, list); | ||
1557 | page = get_ksm_page(stable_node, false); | ||
1558 | if (page) | ||
1559 | put_page(page); | ||
1560 | cond_resched(); | ||
1561 | } | ||
1562 | } | ||
1563 | |||
1467 | for (nid = 0; nid < nr_node_ids; nid++) | 1564 | for (nid = 0; nid < nr_node_ids; nid++) |
1468 | root_unstable_tree[nid] = RB_ROOT; | 1565 | root_unstable_tree[nid] = RB_ROOT; |
1469 | 1566 | ||
@@ -1586,8 +1683,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1586 | rmap_item = scan_get_next_rmap_item(&page); | 1683 | rmap_item = scan_get_next_rmap_item(&page); |
1587 | if (!rmap_item) | 1684 | if (!rmap_item) |
1588 | return; | 1685 | return; |
1589 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1686 | cmp_and_merge_page(page, rmap_item); |
1590 | cmp_and_merge_page(page, rmap_item); | ||
1591 | put_page(page); | 1687 | put_page(page); |
1592 | } | 1688 | } |
1593 | } | 1689 | } |
@@ -1964,6 +2060,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn, | |||
1964 | unsigned long end_pfn) | 2060 | unsigned long end_pfn) |
1965 | { | 2061 | { |
1966 | struct stable_node *stable_node; | 2062 | struct stable_node *stable_node; |
2063 | struct list_head *this, *next; | ||
1967 | struct rb_node *node; | 2064 | struct rb_node *node; |
1968 | int nid; | 2065 | int nid; |
1969 | 2066 | ||
@@ -1984,6 +2081,13 @@ static void ksm_check_stable_tree(unsigned long start_pfn, | |||
1984 | cond_resched(); | 2081 | cond_resched(); |
1985 | } | 2082 | } |
1986 | } | 2083 | } |
2084 | list_for_each_safe(this, next, &migrate_nodes) { | ||
2085 | stable_node = list_entry(this, struct stable_node, list); | ||
2086 | if (stable_node->kpfn >= start_pfn && | ||
2087 | stable_node->kpfn < end_pfn) | ||
2088 | remove_node_from_stable_tree(stable_node); | ||
2089 | cond_resched(); | ||
2090 | } | ||
1987 | } | 2091 | } |
1988 | 2092 | ||
1989 | static int ksm_memory_callback(struct notifier_block *self, | 2093 | static int ksm_memory_callback(struct notifier_block *self, |