Diffstat (limited to 'mm/ksm.c')
-rw-r--r-- | mm/ksm.c | 953 |
1 files changed, 611 insertions, 342 deletions
@@ -29,11 +29,13 @@ | |||
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
32 | #include <linux/memory.h> | ||
32 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
35 | 36 | ||
36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
38 | #include "internal.h" | ||
37 | 39 | ||
38 | /* | 40 | /* |
39 | * A few notes about the KSM scanning process, | 41 | * A few notes about the KSM scanning process, |
@@ -79,13 +81,13 @@ | |||
79 | * struct mm_slot - ksm information per mm that is being scanned | 81 | * struct mm_slot - ksm information per mm that is being scanned |
80 | * @link: link to the mm_slots hash list | 82 | * @link: link to the mm_slots hash list |
81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | 83 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head |
82 | * @rmap_list: head for this mm_slot's list of rmap_items | 84 | * @rmap_list: head for this mm_slot's singly-linked list of rmap_items |
83 | * @mm: the mm that this information is valid for | 85 | * @mm: the mm that this information is valid for |
84 | */ | 86 | */ |
85 | struct mm_slot { | 87 | struct mm_slot { |
86 | struct hlist_node link; | 88 | struct hlist_node link; |
87 | struct list_head mm_list; | 89 | struct list_head mm_list; |
88 | struct list_head rmap_list; | 90 | struct rmap_item *rmap_list; |
89 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
90 | }; | 92 | }; |
91 | 93 | ||
@@ -93,7 +95,7 @@ struct mm_slot { | |||
93 | * struct ksm_scan - cursor for scanning | 95 | * struct ksm_scan - cursor for scanning |
94 | * @mm_slot: the current mm_slot we are scanning | 96 | * @mm_slot: the current mm_slot we are scanning |
95 | * @address: the next address inside that to be scanned | 97 | * @address: the next address inside that to be scanned |
96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | 98 | * @rmap_list: link to the next rmap to be scanned in the rmap_list |
97 | * @seqnr: count of completed full scans (needed when removing unstable node) | 99 | * @seqnr: count of completed full scans (needed when removing unstable node) |
98 | * | 100 | * |
99 | * There is only the one ksm_scan instance of this cursor structure. | 101 | * There is only the one ksm_scan instance of this cursor structure. |
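The rmap_list is now a bare singly-linked list threaded through rmap_item->rmap_list, and the scan cursor becomes a pointer-to-pointer (struct rmap_item **), so an item can be appended or unlinked without a prev pointer or a list_head. A minimal userspace sketch of that cursor pattern (illustrative names, not the kernel API; roughly the shape of get_next_rmap_item() later in this patch):

#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;		/* plays the role of rmap_item->rmap_list */
	unsigned long addr;
};

/*
 * Walk via a pointer-to-pointer cursor: *cur is the next candidate.
 * A matching item is returned; stale items in front of the sought
 * address are unlinked and freed without needing a prev pointer.
 */
static struct item *get_or_add(struct item **cur, unsigned long addr)
{
	struct item *it;

	while (*cur) {
		it = *cur;
		if (it->addr == addr)
			return it;
		if (it->addr > addr)
			break;
		*cur = it->next;	/* unlink the stale item */
		free(it);
	}

	it = malloc(sizeof(*it));
	if (it) {
		it->addr = addr;
		it->next = *cur;	/* insert at the cursor position */
		*cur = it;
	}
	return it;
}

int main(void)
{
	struct item *head = NULL;
	struct item **cursor = &head;

	for (unsigned long addr = 0; addr < 4; addr++) {
		struct item *it = get_or_add(cursor, addr);
		if (!it)
			break;
		cursor = &it->next;	/* advance, like ksm_scan.rmap_list */
	}
	for (struct item *it = head; it; it = it->next)
		printf("%lu\n", it->addr);
	return 0;
}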
@@ -101,37 +103,51 @@ struct mm_slot { | |||
101 | struct ksm_scan { | 103 | struct ksm_scan { |
102 | struct mm_slot *mm_slot; | 104 | struct mm_slot *mm_slot; |
103 | unsigned long address; | 105 | unsigned long address; |
104 | struct rmap_item *rmap_item; | 106 | struct rmap_item **rmap_list; |
105 | unsigned long seqnr; | 107 | unsigned long seqnr; |
106 | }; | 108 | }; |
107 | 109 | ||
108 | /** | 110 | /** |
111 | * struct stable_node - node of the stable rbtree | ||
112 | * @node: rb node of this ksm page in the stable tree | ||
113 | * @hlist: hlist head of rmap_items using this ksm page | ||
114 | * @kpfn: page frame number of this ksm page | ||
115 | */ | ||
116 | struct stable_node { | ||
117 | struct rb_node node; | ||
118 | struct hlist_head hlist; | ||
119 | unsigned long kpfn; | ||
120 | }; | ||
121 | |||
122 | /** | ||
109 | * struct rmap_item - reverse mapping item for virtual addresses | 123 | * struct rmap_item - reverse mapping item for virtual addresses |
110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | 124 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
125 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | ||
111 | * @mm: the memory structure this rmap_item is pointing into | 126 | * @mm: the memory structure this rmap_item is pointing into |
112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 127 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
113 | * @oldchecksum: previous checksum of the page at that virtual address | 128 | * @oldchecksum: previous checksum of the page at that virtual address |
114 | * @node: rb_node of this rmap_item in either unstable or stable tree | 129 | * @node: rb node of this rmap_item in the unstable tree |
115 | * @next: next rmap_item hanging off the same node of the stable tree | 130 | * @head: pointer to stable_node heading this list in the stable tree |
116 | * @prev: previous rmap_item hanging off the same node of the stable tree | 131 | * @hlist: link into hlist of rmap_items hanging off that stable_node |
117 | */ | 132 | */ |
118 | struct rmap_item { | 133 | struct rmap_item { |
119 | struct list_head link; | 134 | struct rmap_item *rmap_list; |
135 | struct anon_vma *anon_vma; /* when stable */ | ||
120 | struct mm_struct *mm; | 136 | struct mm_struct *mm; |
121 | unsigned long address; /* + low bits used for flags below */ | 137 | unsigned long address; /* + low bits used for flags below */ |
138 | unsigned int oldchecksum; /* when unstable */ | ||
122 | union { | 139 | union { |
123 | unsigned int oldchecksum; /* when unstable */ | 140 | struct rb_node node; /* when node of unstable tree */ |
124 | struct rmap_item *next; /* when stable */ | 141 | struct { /* when listed from stable tree */ |
125 | }; | 142 | struct stable_node *head; |
126 | union { | 143 | struct hlist_node hlist; |
127 | struct rb_node node; /* when tree node */ | 144 | }; |
128 | struct rmap_item *prev; /* in stable list */ | ||
129 | }; | 145 | }; |
130 | }; | 146 | }; |
131 | 147 | ||
132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | 148 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ |
133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | 149 | #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ |
134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | 150 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
135 | 151 | ||
136 | /* The stable and unstable tree heads */ | 152 | /* The stable and unstable tree heads */ |
137 | static struct rb_root root_stable_tree = RB_ROOT; | 153 | static struct rb_root root_stable_tree = RB_ROOT; |
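Since the tracked virtual addresses are page aligned, the low bits of rmap_item->address are free to carry state: the unstable-tree scan seqnr plus the UNSTABLE_FLAG/STABLE_FLAG bits defined above, cleared again with address &= PAGE_MASK when the item leaves a tree. A small standalone sketch of that encoding (userspace, assumes 4K pages):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

#define SEQNR_MASK	0x0ff	/* scan sequence number, low 8 bits */
#define UNSTABLE_FLAG	0x100
#define STABLE_FLAG	0x200

int main(void)
{
	unsigned long address = 0x7f0000aa0000UL;	/* page aligned */

	/* tag as an unstable-tree node created on scan 5 */
	address |= UNSTABLE_FLAG | (5 & SEQNR_MASK);

	assert(address & UNSTABLE_FLAG);
	assert(!(address & STABLE_FLAG));
	printf("vaddr %#lx, seqnr %lu\n",
	       address & PAGE_MASK, address & SEQNR_MASK);

	/* dropping the item from the tree clears the low bits again */
	address &= PAGE_MASK;
	assert(!(address & (UNSTABLE_FLAG | STABLE_FLAG | SEQNR_MASK)));
	return 0;
}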
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { | |||
148 | }; | 164 | }; |
149 | 165 | ||
150 | static struct kmem_cache *rmap_item_cache; | 166 | static struct kmem_cache *rmap_item_cache; |
167 | static struct kmem_cache *stable_node_cache; | ||
151 | static struct kmem_cache *mm_slot_cache; | 168 | static struct kmem_cache *mm_slot_cache; |
152 | 169 | ||
153 | /* The number of nodes in the stable tree */ | 170 | /* The number of nodes in the stable tree */ |
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; | |||
162 | /* The number of rmap_items in use: to calculate pages_volatile */ | 179 | /* The number of rmap_items in use: to calculate pages_volatile */ |
163 | static unsigned long ksm_rmap_items; | 180 | static unsigned long ksm_rmap_items; |
164 | 181 | ||
165 | /* Limit on the number of unswappable pages used */ | ||
166 | static unsigned long ksm_max_kernel_pages; | ||
167 | |||
168 | /* Number of pages ksmd should scan in one batch */ | 182 | /* Number of pages ksmd should scan in one batch */ |
169 | static unsigned int ksm_thread_pages_to_scan = 100; | 183 | static unsigned int ksm_thread_pages_to_scan = 100; |
170 | 184 | ||
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) | |||
190 | if (!rmap_item_cache) | 204 | if (!rmap_item_cache) |
191 | goto out; | 205 | goto out; |
192 | 206 | ||
207 | stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); | ||
208 | if (!stable_node_cache) | ||
209 | goto out_free1; | ||
210 | |||
193 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | 211 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); |
194 | if (!mm_slot_cache) | 212 | if (!mm_slot_cache) |
195 | goto out_free; | 213 | goto out_free2; |
196 | 214 | ||
197 | return 0; | 215 | return 0; |
198 | 216 | ||
199 | out_free: | 217 | out_free2: |
218 | kmem_cache_destroy(stable_node_cache); | ||
219 | out_free1: | ||
200 | kmem_cache_destroy(rmap_item_cache); | 220 | kmem_cache_destroy(rmap_item_cache); |
201 | out: | 221 | out: |
202 | return -ENOMEM; | 222 | return -ENOMEM; |
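The error path above is the usual layered unwind, extended for the new cache: each successfully created cache gets its own label, and a later failure jumps to the label that tears down only what already exists, in reverse order. A hedged userspace sketch of the same shape (malloc standing in for kmem_cache creation):

#include <stdlib.h>

struct caches { void *a, *b, *c; };

static int caches_init(struct caches *k)
{
	k->a = malloc(64);
	if (!k->a)
		goto out;
	k->b = malloc(64);
	if (!k->b)
		goto out_free1;
	k->c = malloc(64);
	if (!k->c)
		goto out_free2;
	return 0;

out_free2:
	free(k->b);	/* undo only what was already set up */
out_free1:
	free(k->a);
out:
	return -1;
}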
@@ -205,6 +225,7 @@ out: | |||
205 | static void __init ksm_slab_free(void) | 225 | static void __init ksm_slab_free(void) |
206 | { | 226 | { |
207 | kmem_cache_destroy(mm_slot_cache); | 227 | kmem_cache_destroy(mm_slot_cache); |
228 | kmem_cache_destroy(stable_node_cache); | ||
208 | kmem_cache_destroy(rmap_item_cache); | 229 | kmem_cache_destroy(rmap_item_cache); |
209 | mm_slot_cache = NULL; | 230 | mm_slot_cache = NULL; |
210 | } | 231 | } |
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) | |||
226 | kmem_cache_free(rmap_item_cache, rmap_item); | 247 | kmem_cache_free(rmap_item_cache, rmap_item); |
227 | } | 248 | } |
228 | 249 | ||
250 | static inline struct stable_node *alloc_stable_node(void) | ||
251 | { | ||
252 | return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); | ||
253 | } | ||
254 | |||
255 | static inline void free_stable_node(struct stable_node *stable_node) | ||
256 | { | ||
257 | kmem_cache_free(stable_node_cache, stable_node); | ||
258 | } | ||
259 | |||
229 | static inline struct mm_slot *alloc_mm_slot(void) | 260 | static inline struct mm_slot *alloc_mm_slot(void) |
230 | { | 261 | { |
231 | if (!mm_slot_cache) /* initialization failed */ | 262 | if (!mm_slot_cache) /* initialization failed */ |
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
275 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 306 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
276 | % MM_SLOTS_HASH_HEADS]; | 307 | % MM_SLOTS_HASH_HEADS]; |
277 | mm_slot->mm = mm; | 308 | mm_slot->mm = mm; |
278 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
279 | hlist_add_head(&mm_slot->link, bucket); | 309 | hlist_add_head(&mm_slot->link, bucket); |
280 | } | 310 | } |
281 | 311 | ||
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
284 | return rmap_item->address & STABLE_FLAG; | 314 | return rmap_item->address & STABLE_FLAG; |
285 | } | 315 | } |
286 | 316 | ||
317 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
318 | struct anon_vma *anon_vma) | ||
319 | { | ||
320 | rmap_item->anon_vma = anon_vma; | ||
321 | atomic_inc(&anon_vma->ksm_refcount); | ||
322 | } | ||
323 | |||
324 | static void drop_anon_vma(struct rmap_item *rmap_item) | ||
325 | { | ||
326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
327 | |||
328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | ||
329 | int empty = list_empty(&anon_vma->head); | ||
330 | spin_unlock(&anon_vma->lock); | ||
331 | if (empty) | ||
332 | anon_vma_free(anon_vma); | ||
333 | } | ||
334 | } | ||
335 | |||
287 | /* | 336 | /* |
288 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 337 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
289 | * page tables after it has passed through ksm_exit() - which, if necessary, | 338 | * page tables after it has passed through ksm_exit() - which, if necessary, |
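hold_anon_vma()/drop_anon_vma() pin the anon_vma so it outlives its vmas while a stable-tree rmap_item points into it; the dropper frees it only when the ksm_refcount has hit zero and the vma list is already empty. A simplified userspace model of that pattern (fake_anon_vma, hold() and drop() are illustrative names; the kernel's atomic_dec_and_lock() acquires the spinlock only on the final decrement):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct fake_anon_vma {
	atomic_int ksm_refcount;
	pthread_mutex_t lock;
	int nr_vmas;		/* stands in for list_empty(&anon_vma->head) */
};

static void hold(struct fake_anon_vma *av)
{
	atomic_fetch_add(&av->ksm_refcount, 1);
}

static void drop(struct fake_anon_vma *av)
{
	bool empty;

	if (atomic_fetch_sub(&av->ksm_refcount, 1) != 1)
		return;				/* not the last reference */
	pthread_mutex_lock(&av->lock);
	empty = (av->nr_vmas == 0);
	pthread_mutex_unlock(&av->lock);
	if (empty)
		free(av);			/* nothing else can reach it */
}

int main(void)
{
	struct fake_anon_vma *av = calloc(1, sizeof(*av));

	if (!av)
		return 1;
	pthread_mutex_init(&av->lock, NULL);
	hold(av);
	hold(av);
	drop(av);
	drop(av);	/* refcount hit zero and nr_vmas == 0: av is freed */
	return 0;
}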
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
356 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 405 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
357 | } | 406 | } |
358 | 407 | ||
359 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 408 | static void break_cow(struct rmap_item *rmap_item) |
360 | { | 409 | { |
410 | struct mm_struct *mm = rmap_item->mm; | ||
411 | unsigned long addr = rmap_item->address; | ||
361 | struct vm_area_struct *vma; | 412 | struct vm_area_struct *vma; |
362 | 413 | ||
414 | /* | ||
415 | * It is not an accident that whenever we want to break COW | ||
416 | * to undo, we also need to drop a reference to the anon_vma. | ||
417 | */ | ||
418 | drop_anon_vma(rmap_item); | ||
419 | |||
363 | down_read(&mm->mmap_sem); | 420 | down_read(&mm->mmap_sem); |
364 | if (ksm_test_exit(mm)) | 421 | if (ksm_test_exit(mm)) |
365 | goto out; | 422 | goto out; |
@@ -403,21 +460,77 @@ out: page = NULL; | |||
403 | return page; | 460 | return page; |
404 | } | 461 | } |
405 | 462 | ||
463 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | ||
464 | { | ||
465 | struct rmap_item *rmap_item; | ||
466 | struct hlist_node *hlist; | ||
467 | |||
468 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
469 | if (rmap_item->hlist.next) | ||
470 | ksm_pages_sharing--; | ||
471 | else | ||
472 | ksm_pages_shared--; | ||
473 | drop_anon_vma(rmap_item); | ||
474 | rmap_item->address &= PAGE_MASK; | ||
475 | cond_resched(); | ||
476 | } | ||
477 | |||
478 | rb_erase(&stable_node->node, &root_stable_tree); | ||
479 | free_stable_node(stable_node); | ||
480 | } | ||
481 | |||
406 | /* | 482 | /* |
407 | * get_ksm_page: checks if the page at the virtual address in rmap_item | 483 | * get_ksm_page: checks if the page indicated by the stable node |
408 | * is still PageKsm, in which case we can trust the content of the page, | 484 | * is still its ksm page, despite having held no reference to it. |
409 | * and it returns the gotten page; but NULL if the page has been zapped. | 485 | * In which case we can trust the content of the page, and it |
486 | * returns the gotten page; but if the page has now been zapped, | ||
487 | * remove the stale node from the stable tree and return NULL. | ||
488 | * | ||
489 | * You would expect the stable_node to hold a reference to the ksm page. | ||
490 | * But if it increments the page's count, swapping out has to wait for | ||
491 | * ksmd to come around again before it can free the page, which may take | ||
492 | * seconds or even minutes: much too unresponsive. So instead we use a | ||
493 | * "keyhole reference": access to the ksm page from the stable node peeps | ||
494 | * out through its keyhole to see if that page still holds the right key, | ||
495 | * pointing back to this stable node. This relies on freeing a PageAnon | ||
496 | * page to reset its page->mapping to NULL, and relies on no other use of | ||
497 | * a page to put something that might look like our key in page->mapping. | ||
498 | * | ||
499 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
500 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
501 | * interesting for assuming that no other use of the struct page could ever | ||
502 | * put our expected_mapping into page->mapping (or a field of the union which | ||
503 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
504 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
505 | * | ||
506 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
507 | * then page the next, if the page is in between page_freeze_refs() and | ||
508 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
509 | * is on its way to being freed; but it is an anomaly to bear in mind. | ||
410 | */ | 510 | */ |
411 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | 511 | static struct page *get_ksm_page(struct stable_node *stable_node) |
412 | { | 512 | { |
413 | struct page *page; | 513 | struct page *page; |
414 | 514 | void *expected_mapping; | |
415 | page = get_mergeable_page(rmap_item); | 515 | |
416 | if (page && !PageKsm(page)) { | 516 | page = pfn_to_page(stable_node->kpfn); |
517 | expected_mapping = (void *)stable_node + | ||
518 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
519 | rcu_read_lock(); | ||
520 | if (page->mapping != expected_mapping) | ||
521 | goto stale; | ||
522 | if (!get_page_unless_zero(page)) | ||
523 | goto stale; | ||
524 | if (page->mapping != expected_mapping) { | ||
417 | put_page(page); | 525 | put_page(page); |
418 | page = NULL; | 526 | goto stale; |
419 | } | 527 | } |
528 | rcu_read_unlock(); | ||
420 | return page; | 529 | return page; |
530 | stale: | ||
531 | rcu_read_unlock(); | ||
532 | remove_node_from_stable_tree(stable_node); | ||
533 | return NULL; | ||
421 | } | 534 | } |
422 | 535 | ||
423 | /* | 536 | /* |
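The "keyhole reference" described in the comment boils down to: peek at the back-pointer, pin the object only if its count is not already zero, then recheck the back-pointer in case it was freed and recycled in between. A userspace sketch of that validate-pin-revalidate sequence (plain C11 atomics; the kernel additionally holds rcu_read_lock() so a freed page cannot be fully recycled during the peek, and page->mapping plays the role of the back-pointer):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct object {
	atomic_int refcount;
	void *owner;			/* back-pointer, like page->mapping */
};

static bool get_unless_zero(struct object *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old != 0)
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return true;
	return false;			/* already zero: object being freed */
}

static struct object *lookup(struct object *obj, void *expected_owner)
{
	if (obj->owner != expected_owner)
		return NULL;		/* stale: already points elsewhere */
	if (!get_unless_zero(obj))
		return NULL;		/* count already zero: being freed */
	if (obj->owner != expected_owner) {
		/* lost the race: freed and reused between the two checks */
		atomic_fetch_sub(&obj->refcount, 1);
		return NULL;
	}
	return obj;			/* pinned: drop the ref when done */
}

int main(void)
{
	struct object key = { .owner = &key };

	atomic_init(&key.refcount, 1);
	return lookup(&key, &key) == &key ? 0 : 1;
}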
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) | |||
426 | */ | 539 | */ |
427 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | 540 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) |
428 | { | 541 | { |
429 | if (in_stable_tree(rmap_item)) { | 542 | if (rmap_item->address & STABLE_FLAG) { |
430 | struct rmap_item *next_item = rmap_item->next; | 543 | struct stable_node *stable_node; |
431 | 544 | struct page *page; | |
432 | if (rmap_item->address & NODE_FLAG) { | ||
433 | if (next_item) { | ||
434 | rb_replace_node(&rmap_item->node, | ||
435 | &next_item->node, | ||
436 | &root_stable_tree); | ||
437 | next_item->address |= NODE_FLAG; | ||
438 | ksm_pages_sharing--; | ||
439 | } else { | ||
440 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
441 | ksm_pages_shared--; | ||
442 | } | ||
443 | } else { | ||
444 | struct rmap_item *prev_item = rmap_item->prev; | ||
445 | 545 | ||
446 | BUG_ON(prev_item->next != rmap_item); | 546 | stable_node = rmap_item->head; |
447 | prev_item->next = next_item; | 547 | page = get_ksm_page(stable_node); |
448 | if (next_item) { | 548 | if (!page) |
449 | BUG_ON(next_item->prev != rmap_item); | 549 | goto out; |
450 | next_item->prev = rmap_item->prev; | 550 | |
451 | } | 551 | lock_page(page); |
552 | hlist_del(&rmap_item->hlist); | ||
553 | unlock_page(page); | ||
554 | put_page(page); | ||
555 | |||
556 | if (stable_node->hlist.first) | ||
452 | ksm_pages_sharing--; | 557 | ksm_pages_sharing--; |
453 | } | 558 | else |
559 | ksm_pages_shared--; | ||
454 | 560 | ||
455 | rmap_item->next = NULL; | 561 | drop_anon_vma(rmap_item); |
562 | rmap_item->address &= PAGE_MASK; | ||
456 | 563 | ||
457 | } else if (rmap_item->address & NODE_FLAG) { | 564 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
458 | unsigned char age; | 565 | unsigned char age; |
459 | /* | 566 | /* |
460 | * Usually ksmd can and must skip the rb_erase, because | 567 | * Usually ksmd can and must skip the rb_erase, because |
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
467 | BUG_ON(age > 1); | 574 | BUG_ON(age > 1); |
468 | if (!age) | 575 | if (!age) |
469 | rb_erase(&rmap_item->node, &root_unstable_tree); | 576 | rb_erase(&rmap_item->node, &root_unstable_tree); |
577 | |||
470 | ksm_pages_unshared--; | 578 | ksm_pages_unshared--; |
579 | rmap_item->address &= PAGE_MASK; | ||
471 | } | 580 | } |
472 | 581 | out: | |
473 | rmap_item->address &= PAGE_MASK; | ||
474 | |||
475 | cond_resched(); /* we're called from many long loops */ | 582 | cond_resched(); /* we're called from many long loops */ |
476 | } | 583 | } |
477 | 584 | ||
478 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | 585 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, |
479 | struct list_head *cur) | 586 | struct rmap_item **rmap_list) |
480 | { | 587 | { |
481 | struct rmap_item *rmap_item; | 588 | while (*rmap_list) { |
482 | 589 | struct rmap_item *rmap_item = *rmap_list; | |
483 | while (cur != &mm_slot->rmap_list) { | 590 | *rmap_list = rmap_item->rmap_list; |
484 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
485 | cur = cur->next; | ||
486 | remove_rmap_item_from_tree(rmap_item); | 591 | remove_rmap_item_from_tree(rmap_item); |
487 | list_del(&rmap_item->link); | ||
488 | free_rmap_item(rmap_item); | 592 | free_rmap_item(rmap_item); |
489 | } | 593 | } |
490 | } | 594 | } |
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
550 | goto error; | 654 | goto error; |
551 | } | 655 | } |
552 | 656 | ||
553 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 657 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
554 | 658 | ||
555 | spin_lock(&ksm_mmlist_lock); | 659 | spin_lock(&ksm_mmlist_lock); |
556 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 660 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
@@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
646 | * Check that no O_DIRECT or similar I/O is in progress on the | 750 | * Check that no O_DIRECT or similar I/O is in progress on the |
647 | * page | 751 | * page |
648 | */ | 752 | */ |
649 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
650 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at_notify(mm, addr, ptep, entry); |
651 | goto out_unlock; | 755 | goto out_unlock; |
652 | } | 756 | } |
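The expected count drops from mapcount + 2 + swapped to mapcount + 1 + swapped because, with this patch, try_to_merge_one_page() no longer takes its own extra get_page() on the page before write-protecting it; the only legitimate holders left are the ptes mapping the page, the scanner's reference, and possibly the swap cache, so any surplus (O_DIRECT and the like) aborts the merge. A tiny worked sketch of that bookkeeping (userspace, illustrative names):

#include <stdbool.h>
#include <stdio.h>

/* Every known holder of the page is accounted for; anything extra
 * means some other reference is in flight and we must not merge now. */
static bool page_quiescent(int mapcount, int swapcache, int page_count)
{
	int expected = mapcount		/* one reference per pte mapping it */
		     + 1		/* the scanner's own get_page() */
		     + swapcache;	/* 1 if it sits in the swap cache */

	return page_count == expected;
}

int main(void)
{
	printf("%d\n", page_quiescent(1, 0, 2));	/* 1: safe to merge */
	printf("%d\n", page_quiescent(1, 0, 3));	/* 0: extra pin, skip */
	return 0;
}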
@@ -664,15 +768,15 @@ out: | |||
664 | 768 | ||
665 | /** | 769 | /** |
666 | * replace_page - replace page in vma by new ksm page | 770 | * replace_page - replace page in vma by new ksm page |
667 | * @vma: vma that holds the pte pointing to oldpage | 771 | * @vma: vma that holds the pte pointing to page |
668 | * @oldpage: the page we are replacing by newpage | 772 | * @page: the page we are replacing by kpage |
669 | * @newpage: the ksm page we replace oldpage by | 773 | * @kpage: the ksm page we replace page by |
670 | * @orig_pte: the original value of the pte | 774 | * @orig_pte: the original value of the pte |
671 | * | 775 | * |
672 | * Returns 0 on success, -EFAULT on failure. | 776 | * Returns 0 on success, -EFAULT on failure. |
673 | */ | 777 | */ |
674 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | 778 | static int replace_page(struct vm_area_struct *vma, struct page *page, |
675 | struct page *newpage, pte_t orig_pte) | 779 | struct page *kpage, pte_t orig_pte) |
676 | { | 780 | { |
677 | struct mm_struct *mm = vma->vm_mm; | 781 | struct mm_struct *mm = vma->vm_mm; |
678 | pgd_t *pgd; | 782 | pgd_t *pgd; |
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
681 | pte_t *ptep; | 785 | pte_t *ptep; |
682 | spinlock_t *ptl; | 786 | spinlock_t *ptl; |
683 | unsigned long addr; | 787 | unsigned long addr; |
684 | pgprot_t prot; | ||
685 | int err = -EFAULT; | 788 | int err = -EFAULT; |
686 | 789 | ||
687 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | 790 | addr = page_address_in_vma(page, vma); |
688 | |||
689 | addr = page_address_in_vma(oldpage, vma); | ||
690 | if (addr == -EFAULT) | 791 | if (addr == -EFAULT) |
691 | goto out; | 792 | goto out; |
692 | 793 | ||
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
708 | goto out; | 809 | goto out; |
709 | } | 810 | } |
710 | 811 | ||
711 | get_page(newpage); | 812 | get_page(kpage); |
712 | page_add_ksm_rmap(newpage); | 813 | page_add_anon_rmap(kpage, vma, addr); |
713 | 814 | ||
714 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 815 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
715 | ptep_clear_flush(vma, addr, ptep); | 816 | ptep_clear_flush(vma, addr, ptep); |
716 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
717 | 818 | ||
718 | page_remove_rmap(oldpage); | 819 | page_remove_rmap(page); |
719 | put_page(oldpage); | 820 | put_page(page); |
720 | 821 | ||
721 | pte_unmap_unlock(ptep, ptl); | 822 | pte_unmap_unlock(ptep, ptl); |
722 | err = 0; | 823 | err = 0; |
@@ -726,32 +827,27 @@ out: | |||
726 | 827 | ||
727 | /* | 828 | /* |
728 | * try_to_merge_one_page - take two pages and merge them into one | 829 | * try_to_merge_one_page - take two pages and merge them into one |
729 | * @vma: the vma that hold the pte pointing into oldpage | 830 | * @vma: the vma that holds the pte pointing to page |
730 | * @oldpage: the page that we want to replace with newpage | 831 | * @page: the PageAnon page that we want to replace with kpage |
731 | * @newpage: the page that we want to map instead of oldpage | 832 | * @kpage: the PageKsm page that we want to map instead of page, |
732 | * | 833 | * or NULL the first time when we want to use page as kpage. |
733 | * Note: | ||
734 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
735 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
736 | * | 834 | * |
737 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 835 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
738 | */ | 836 | */ |
739 | static int try_to_merge_one_page(struct vm_area_struct *vma, | 837 | static int try_to_merge_one_page(struct vm_area_struct *vma, |
740 | struct page *oldpage, | 838 | struct page *page, struct page *kpage) |
741 | struct page *newpage) | ||
742 | { | 839 | { |
743 | pte_t orig_pte = __pte(0); | 840 | pte_t orig_pte = __pte(0); |
744 | int err = -EFAULT; | 841 | int err = -EFAULT; |
745 | 842 | ||
843 | if (page == kpage) /* ksm page forked */ | ||
844 | return 0; | ||
845 | |||
746 | if (!(vma->vm_flags & VM_MERGEABLE)) | 846 | if (!(vma->vm_flags & VM_MERGEABLE)) |
747 | goto out; | 847 | goto out; |
748 | 848 | if (!PageAnon(page)) | |
749 | if (!PageAnon(oldpage)) | ||
750 | goto out; | 849 | goto out; |
751 | 850 | ||
752 | get_page(newpage); | ||
753 | get_page(oldpage); | ||
754 | |||
755 | /* | 851 | /* |
756 | * We need the page lock to read a stable PageSwapCache in | 852 | * We need the page lock to read a stable PageSwapCache in |
757 | * write_protect_page(). We use trylock_page() instead of | 853 | * write_protect_page(). We use trylock_page() instead of |
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
759 | * prefer to continue scanning and merging different pages, | 855 | * prefer to continue scanning and merging different pages, |
760 | * then come back to this page when it is unlocked. | 856 | * then come back to this page when it is unlocked. |
761 | */ | 857 | */ |
762 | if (!trylock_page(oldpage)) | 858 | if (!trylock_page(page)) |
763 | goto out_putpage; | 859 | goto out; |
764 | /* | 860 | /* |
765 | * If this anonymous page is mapped only here, its pte may need | 861 | * If this anonymous page is mapped only here, its pte may need |
766 | * to be write-protected. If it's mapped elsewhere, all of its | 862 | * to be write-protected. If it's mapped elsewhere, all of its |
767 | * ptes are necessarily already write-protected. But in either | 863 | * ptes are necessarily already write-protected. But in either |
768 | * case, we need to lock and check page_count is not raised. | 864 | * case, we need to lock and check page_count is not raised. |
769 | */ | 865 | */ |
770 | if (write_protect_page(vma, oldpage, &orig_pte)) { | 866 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
771 | unlock_page(oldpage); | 867 | if (!kpage) { |
772 | goto out_putpage; | 868 | /* |
869 | * While we hold page lock, upgrade page from | ||
870 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
871 | * stable_tree_insert() will update stable_node. | ||
872 | */ | ||
873 | set_page_stable_node(page, NULL); | ||
874 | mark_page_accessed(page); | ||
875 | err = 0; | ||
876 | } else if (pages_identical(page, kpage)) | ||
877 | err = replace_page(vma, page, kpage, orig_pte); | ||
773 | } | 878 | } |
774 | unlock_page(oldpage); | ||
775 | 879 | ||
776 | if (pages_identical(oldpage, newpage)) | 880 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
777 | err = replace_page(vma, oldpage, newpage, orig_pte); | 881 | munlock_vma_page(page); |
882 | if (!PageMlocked(kpage)) { | ||
883 | unlock_page(page); | ||
884 | lock_page(kpage); | ||
885 | mlock_vma_page(kpage); | ||
886 | page = kpage; /* for final unlock */ | ||
887 | } | ||
888 | } | ||
778 | 889 | ||
779 | out_putpage: | 890 | unlock_page(page); |
780 | put_page(oldpage); | ||
781 | put_page(newpage); | ||
782 | out: | 891 | out: |
783 | return err; | 892 | return err; |
784 | } | 893 | } |
@@ -786,26 +895,31 @@ out: | |||
786 | /* | 895 | /* |
787 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | 896 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, |
788 | * but no new kernel page is allocated: kpage must already be a ksm page. | 897 | * but no new kernel page is allocated: kpage must already be a ksm page. |
898 | * | ||
899 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
789 | */ | 900 | */ |
790 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | 901 | static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, |
791 | unsigned long addr1, | 902 | struct page *page, struct page *kpage) |
792 | struct page *page1, | ||
793 | struct page *kpage) | ||
794 | { | 903 | { |
904 | struct mm_struct *mm = rmap_item->mm; | ||
795 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
796 | int err = -EFAULT; | 906 | int err = -EFAULT; |
797 | 907 | ||
798 | down_read(&mm1->mmap_sem); | 908 | down_read(&mm->mmap_sem); |
799 | if (ksm_test_exit(mm1)) | 909 | if (ksm_test_exit(mm)) |
910 | goto out; | ||
911 | vma = find_vma(mm, rmap_item->address); | ||
912 | if (!vma || vma->vm_start > rmap_item->address) | ||
800 | goto out; | 913 | goto out; |
801 | 914 | ||
802 | vma = find_vma(mm1, addr1); | 915 | err = try_to_merge_one_page(vma, page, kpage); |
803 | if (!vma || vma->vm_start > addr1) | 916 | if (err) |
804 | goto out; | 917 | goto out; |
805 | 918 | ||
806 | err = try_to_merge_one_page(vma, page1, kpage); | 919 | /* Must get reference to anon_vma while still holding mmap_sem */ |
920 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
807 | out: | 921 | out: |
808 | up_read(&mm1->mmap_sem); | 922 | up_read(&mm->mmap_sem); |
809 | return err; | 923 | return err; |
810 | } | 924 | } |
811 | 925 | ||
@@ -813,109 +927,73 @@ out: | |||
813 | * try_to_merge_two_pages - take two identical pages and prepare them | 927 | * try_to_merge_two_pages - take two identical pages and prepare them |
814 | * to be merged into one page. | 928 | * to be merged into one page. |
815 | * | 929 | * |
816 | * This function returns 0 if we successfully mapped two identical pages | 930 | * This function returns the kpage if we successfully merged two identical |
817 | * into one page, -EFAULT otherwise. | 931 | * pages into one ksm page, NULL otherwise. |
818 | * | 932 | * |
819 | * Note that this function allocates a new kernel page: if one of the pages | 933 | * Note that this function upgrades page to ksm page: if one of the pages |
820 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 934 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
821 | */ | 935 | */ |
822 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | 936 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
823 | struct page *page1, struct mm_struct *mm2, | 937 | struct page *page, |
824 | unsigned long addr2, struct page *page2) | 938 | struct rmap_item *tree_rmap_item, |
939 | struct page *tree_page) | ||
825 | { | 940 | { |
826 | struct vm_area_struct *vma; | 941 | int err; |
827 | struct page *kpage; | ||
828 | int err = -EFAULT; | ||
829 | |||
830 | /* | ||
831 | * The number of nodes in the stable tree | ||
832 | * is the number of kernel pages that we hold. | ||
833 | */ | ||
834 | if (ksm_max_kernel_pages && | ||
835 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
836 | return err; | ||
837 | |||
838 | kpage = alloc_page(GFP_HIGHUSER); | ||
839 | if (!kpage) | ||
840 | return err; | ||
841 | |||
842 | down_read(&mm1->mmap_sem); | ||
843 | if (ksm_test_exit(mm1)) { | ||
844 | up_read(&mm1->mmap_sem); | ||
845 | goto out; | ||
846 | } | ||
847 | vma = find_vma(mm1, addr1); | ||
848 | if (!vma || vma->vm_start > addr1) { | ||
849 | up_read(&mm1->mmap_sem); | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | copy_user_highpage(kpage, page1, addr1, vma); | ||
854 | err = try_to_merge_one_page(vma, page1, kpage); | ||
855 | up_read(&mm1->mmap_sem); | ||
856 | 942 | ||
943 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); | ||
857 | if (!err) { | 944 | if (!err) { |
858 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | 945 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
946 | tree_page, page); | ||
859 | /* | 947 | /* |
860 | * If that fails, we have a ksm page with only one pte | 948 | * If that fails, we have a ksm page with only one pte |
861 | * pointing to it: so break it. | 949 | * pointing to it: so break it. |
862 | */ | 950 | */ |
863 | if (err) | 951 | if (err) |
864 | break_cow(mm1, addr1); | 952 | break_cow(rmap_item); |
865 | } | 953 | } |
866 | out: | 954 | return err ? NULL : page; |
867 | put_page(kpage); | ||
868 | return err; | ||
869 | } | 955 | } |
870 | 956 | ||
871 | /* | 957 | /* |
872 | * stable_tree_search - search page inside the stable tree | 958 | * stable_tree_search - search for page inside the stable tree |
873 | * @page: the page that we are searching identical pages to. | ||
874 | * @page2: pointer into identical page that we are holding inside the stable | ||
875 | * tree that we have found. | ||
876 | * @rmap_item: the reverse mapping item | ||
877 | * | 959 | * |
878 | * This function checks if there is a page inside the stable tree | 960 | * This function checks if there is a page inside the stable tree |
879 | * with identical content to the page that we are scanning right now. | 961 | * with identical content to the page that we are scanning right now. |
880 | * | 962 | * |
881 | * This function return rmap_item pointer to the identical item if found, | 963 | * This function returns the stable tree node of identical content if found, |
882 | * NULL otherwise. | 964 | * NULL otherwise. |
883 | */ | 965 | */ |
884 | static struct rmap_item *stable_tree_search(struct page *page, | 966 | static struct page *stable_tree_search(struct page *page) |
885 | struct page **page2, | ||
886 | struct rmap_item *rmap_item) | ||
887 | { | 967 | { |
888 | struct rb_node *node = root_stable_tree.rb_node; | 968 | struct rb_node *node = root_stable_tree.rb_node; |
969 | struct stable_node *stable_node; | ||
970 | |||
971 | stable_node = page_stable_node(page); | ||
972 | if (stable_node) { /* ksm page forked */ | ||
973 | get_page(page); | ||
974 | return page; | ||
975 | } | ||
889 | 976 | ||
890 | while (node) { | 977 | while (node) { |
891 | struct rmap_item *tree_rmap_item, *next_rmap_item; | 978 | struct page *tree_page; |
892 | int ret; | 979 | int ret; |
893 | 980 | ||
894 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | 981 | cond_resched(); |
895 | while (tree_rmap_item) { | 982 | stable_node = rb_entry(node, struct stable_node, node); |
896 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 983 | tree_page = get_ksm_page(stable_node); |
897 | cond_resched(); | 984 | if (!tree_page) |
898 | page2[0] = get_ksm_page(tree_rmap_item); | ||
899 | if (page2[0]) | ||
900 | break; | ||
901 | next_rmap_item = tree_rmap_item->next; | ||
902 | remove_rmap_item_from_tree(tree_rmap_item); | ||
903 | tree_rmap_item = next_rmap_item; | ||
904 | } | ||
905 | if (!tree_rmap_item) | ||
906 | return NULL; | 985 | return NULL; |
907 | 986 | ||
908 | ret = memcmp_pages(page, page2[0]); | 987 | ret = memcmp_pages(page, tree_page); |
909 | 988 | ||
910 | if (ret < 0) { | 989 | if (ret < 0) { |
911 | put_page(page2[0]); | 990 | put_page(tree_page); |
912 | node = node->rb_left; | 991 | node = node->rb_left; |
913 | } else if (ret > 0) { | 992 | } else if (ret > 0) { |
914 | put_page(page2[0]); | 993 | put_page(tree_page); |
915 | node = node->rb_right; | 994 | node = node->rb_right; |
916 | } else { | 995 | } else |
917 | return tree_rmap_item; | 996 | return tree_page; |
918 | } | ||
919 | } | 997 | } |
920 | 998 | ||
921 | return NULL; | 999 | return NULL; |
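Both trees remain ordinary rbtrees whose sort key is simply memcmp() of the page contents, so stable_tree_search() is a textbook binary descent: compare the candidate page against the node's page and go left or right. A standalone sketch of a content-keyed search (plain unbalanced tree for brevity; the kernel uses lib/rbtree.c):

#include <string.h>
#include <stddef.h>

#define PAGE_BYTES 4096

struct node {
	struct node *left, *right;
	unsigned char content[PAGE_BYTES];	/* stands in for the ksm page */
};

/* memcmp() decides the direction; equality means identical content. */
static struct node *content_search(struct node *root, const unsigned char *page)
{
	while (root) {
		int ret = memcmp(page, root->content, PAGE_BYTES);

		if (ret < 0)
			root = root->left;
		else if (ret > 0)
			root = root->right;
		else
			return root;
	}
	return NULL;
}

int main(void)
{
	static struct node a, b, c;

	memset(b.content, 0x40, PAGE_BYTES);	/* root */
	memset(a.content, 0x20, PAGE_BYTES);	/* left child */
	memset(c.content, 0x60, PAGE_BYTES);	/* right child */
	b.left = &a;
	b.right = &c;
	return content_search(&b, c.content) == &c ? 0 : 1;
}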
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, | |||
925 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1003 | * stable_tree_insert - insert rmap_item pointing to new ksm page |
926 | * into the stable tree. | 1004 | * into the stable tree. |
927 | * | 1005 | * |
928 | * @page: the page that we are searching identical page to inside the stable | 1006 | * This function returns the stable tree node just allocated on success, |
929 | * tree. | 1007 | * NULL otherwise. |
930 | * @rmap_item: pointer to the reverse mapping item. | ||
931 | * | ||
932 | * This function returns rmap_item if success, NULL otherwise. | ||
933 | */ | 1008 | */ |
934 | static struct rmap_item *stable_tree_insert(struct page *page, | 1009 | static struct stable_node *stable_tree_insert(struct page *kpage) |
935 | struct rmap_item *rmap_item) | ||
936 | { | 1010 | { |
937 | struct rb_node **new = &root_stable_tree.rb_node; | 1011 | struct rb_node **new = &root_stable_tree.rb_node; |
938 | struct rb_node *parent = NULL; | 1012 | struct rb_node *parent = NULL; |
1013 | struct stable_node *stable_node; | ||
939 | 1014 | ||
940 | while (*new) { | 1015 | while (*new) { |
941 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
942 | struct page *tree_page; | 1016 | struct page *tree_page; |
943 | int ret; | 1017 | int ret; |
944 | 1018 | ||
945 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1019 | cond_resched(); |
946 | while (tree_rmap_item) { | 1020 | stable_node = rb_entry(*new, struct stable_node, node); |
947 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 1021 | tree_page = get_ksm_page(stable_node); |
948 | cond_resched(); | 1022 | if (!tree_page) |
949 | tree_page = get_ksm_page(tree_rmap_item); | ||
950 | if (tree_page) | ||
951 | break; | ||
952 | next_rmap_item = tree_rmap_item->next; | ||
953 | remove_rmap_item_from_tree(tree_rmap_item); | ||
954 | tree_rmap_item = next_rmap_item; | ||
955 | } | ||
956 | if (!tree_rmap_item) | ||
957 | return NULL; | 1023 | return NULL; |
958 | 1024 | ||
959 | ret = memcmp_pages(page, tree_page); | 1025 | ret = memcmp_pages(kpage, tree_page); |
960 | put_page(tree_page); | 1026 | put_page(tree_page); |
961 | 1027 | ||
962 | parent = *new; | 1028 | parent = *new; |
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
974 | } | 1040 | } |
975 | } | 1041 | } |
976 | 1042 | ||
977 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | 1043 | stable_node = alloc_stable_node(); |
978 | rmap_item->next = NULL; | 1044 | if (!stable_node) |
979 | rb_link_node(&rmap_item->node, parent, new); | 1045 | return NULL; |
980 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
981 | 1046 | ||
982 | ksm_pages_shared++; | 1047 | rb_link_node(&stable_node->node, parent, new); |
983 | return rmap_item; | 1048 | rb_insert_color(&stable_node->node, &root_stable_tree); |
1049 | |||
1050 | INIT_HLIST_HEAD(&stable_node->hlist); | ||
1051 | |||
1052 | stable_node->kpfn = page_to_pfn(kpage); | ||
1053 | set_page_stable_node(kpage, stable_node); | ||
1054 | |||
1055 | return stable_node; | ||
984 | } | 1056 | } |
985 | 1057 | ||
986 | /* | 1058 | /* |
987 | * unstable_tree_search_insert - search and insert items into the unstable tree. | 1059 | * unstable_tree_search_insert - search for identical page, |
988 | * | 1060 | * else insert rmap_item into the unstable tree. |
989 | * @page: the page that we are going to search for identical page or to insert | ||
990 | * into the unstable tree | ||
991 | * @page2: pointer into identical page that was found inside the unstable tree | ||
992 | * @rmap_item: the reverse mapping item of page | ||
993 | * | 1061 | * |
994 | * This function searches for a page in the unstable tree identical to the | 1062 | * This function searches for a page in the unstable tree identical to the |
995 | * page currently being scanned; and if no identical page is found in the | 1063 | * page currently being scanned; and if no identical page is found in the |
@@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
1001 | * This function does both searching and inserting, because they share | 1069 | * This function does both searching and inserting, because they share |
1002 | * the same walking algorithm in an rbtree. | 1070 | * the same walking algorithm in an rbtree. |
1003 | */ | 1071 | */ |
1004 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | 1072 | static |
1005 | struct page **page2, | 1073 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1006 | struct rmap_item *rmap_item) | 1074 | struct page *page, |
1075 | struct page **tree_pagep) | ||
1076 | |||
1007 | { | 1077 | { |
1008 | struct rb_node **new = &root_unstable_tree.rb_node; | 1078 | struct rb_node **new = &root_unstable_tree.rb_node; |
1009 | struct rb_node *parent = NULL; | 1079 | struct rb_node *parent = NULL; |
1010 | 1080 | ||
1011 | while (*new) { | 1081 | while (*new) { |
1012 | struct rmap_item *tree_rmap_item; | 1082 | struct rmap_item *tree_rmap_item; |
1083 | struct page *tree_page; | ||
1013 | int ret; | 1084 | int ret; |
1014 | 1085 | ||
1015 | cond_resched(); | 1086 | cond_resched(); |
1016 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1017 | page2[0] = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
1018 | if (!page2[0]) | 1089 | if (!tree_page) |
1019 | return NULL; | 1090 | return NULL; |
1020 | 1091 | ||
1021 | /* | 1092 | /* |
1022 | * Don't substitute an unswappable ksm page | 1093 | * Don't substitute a ksm page for a forked page. |
1023 | * just for one good swappable forked page. | ||
1024 | */ | 1094 | */ |
1025 | if (page == page2[0]) { | 1095 | if (page == tree_page) { |
1026 | put_page(page2[0]); | 1096 | put_page(tree_page); |
1027 | return NULL; | 1097 | return NULL; |
1028 | } | 1098 | } |
1029 | 1099 | ||
1030 | ret = memcmp_pages(page, page2[0]); | 1100 | ret = memcmp_pages(page, tree_page); |
1031 | 1101 | ||
1032 | parent = *new; | 1102 | parent = *new; |
1033 | if (ret < 0) { | 1103 | if (ret < 0) { |
1034 | put_page(page2[0]); | 1104 | put_page(tree_page); |
1035 | new = &parent->rb_left; | 1105 | new = &parent->rb_left; |
1036 | } else if (ret > 0) { | 1106 | } else if (ret > 0) { |
1037 | put_page(page2[0]); | 1107 | put_page(tree_page); |
1038 | new = &parent->rb_right; | 1108 | new = &parent->rb_right; |
1039 | } else { | 1109 | } else { |
1110 | *tree_pagep = tree_page; | ||
1040 | return tree_rmap_item; | 1111 | return tree_rmap_item; |
1041 | } | 1112 | } |
1042 | } | 1113 | } |
1043 | 1114 | ||
1044 | rmap_item->address |= NODE_FLAG; | 1115 | rmap_item->address |= UNSTABLE_FLAG; |
1045 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1116 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1046 | rb_link_node(&rmap_item->node, parent, new); | 1117 | rb_link_node(&rmap_item->node, parent, new); |
1047 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1118 | rb_insert_color(&rmap_item->node, &root_unstable_tree); |
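unstable_tree_search_insert() does the lookup and the insertion in one descent: the struct rb_node ** cursor always addresses the child slot to follow next, so when the walk falls off the tree it is already pointing at the empty link where the new node belongs, ready for rb_link_node()/rb_insert_color(). A sketch of that one-walk pattern with integer keys (no rebalancing; illustrative names):

#include <stdlib.h>

struct tnode {
	struct tnode *left, *right;
	int key;
};

/* Returns the matching node if one exists, else links a new node into
 * the empty slot the search ended at and returns NULL, mirroring the
 * found-vs-inserted convention of unstable_tree_search_insert(). */
static struct tnode *search_or_insert(struct tnode **root, int key)
{
	struct tnode **link = root;
	struct tnode *new;

	while (*link) {
		struct tnode *cur = *link;

		if (key < cur->key)
			link = &cur->left;
		else if (key > cur->key)
			link = &cur->right;
		else
			return cur;	/* identical key already present */
	}

	new = calloc(1, sizeof(*new));
	if (new) {
		new->key = key;
		*link = new;		/* attach at the slot found, like rb_link_node() */
	}
	return NULL;			/* nothing identical: node inserted */
}

int main(void)
{
	struct tnode *root = NULL;

	search_or_insert(&root, 2);
	search_or_insert(&root, 1);
	search_or_insert(&root, 3);
	return search_or_insert(&root, 3) ? 0 : 1;	/* 3 is found now */
}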
@@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, | |||
1056 | * the same ksm page. | 1127 | * the same ksm page. |
1057 | */ | 1128 | */ |
1058 | static void stable_tree_append(struct rmap_item *rmap_item, | 1129 | static void stable_tree_append(struct rmap_item *rmap_item, |
1059 | struct rmap_item *tree_rmap_item) | 1130 | struct stable_node *stable_node) |
1060 | { | 1131 | { |
1061 | rmap_item->next = tree_rmap_item->next; | 1132 | rmap_item->head = stable_node; |
1062 | rmap_item->prev = tree_rmap_item; | ||
1063 | |||
1064 | if (tree_rmap_item->next) | ||
1065 | tree_rmap_item->next->prev = rmap_item; | ||
1066 | |||
1067 | tree_rmap_item->next = rmap_item; | ||
1068 | rmap_item->address |= STABLE_FLAG; | 1133 | rmap_item->address |= STABLE_FLAG; |
1134 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | ||
1069 | 1135 | ||
1070 | ksm_pages_sharing++; | 1136 | if (rmap_item->hlist.next) |
1137 | ksm_pages_sharing++; | ||
1138 | else | ||
1139 | ksm_pages_shared++; | ||
1071 | } | 1140 | } |
1072 | 1141 | ||
1073 | /* | 1142 | /* |
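The new accounting keys off the position on the stable node's hlist: the first rmap_item attached represents the ksm page itself (pages_shared), and every later one is an extra pte pointing at it (pages_sharing); hlist_add_head() puts the new item in front, so a non-NULL ->next means it was not the first. A tiny userspace sketch of that bookkeeping:

#include <assert.h>
#include <stddef.h>

struct mapping {
	struct mapping *next;
};

static unsigned long pages_shared, pages_sharing;

static void append(struct mapping **head, struct mapping *m)
{
	m->next = *head;		/* add at the head, like hlist_add_head() */
	*head = m;
	if (m->next)
		pages_sharing++;	/* not the first user of this page */
	else
		pages_shared++;		/* first user: the ksm page itself */
}

int main(void)
{
	struct mapping *head = NULL;
	struct mapping a = { 0 }, b = { 0 }, c = { 0 };

	append(&head, &a);
	append(&head, &b);
	append(&head, &c);
	assert(pages_shared == 1 && pages_sharing == 2);
	return 0;
}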
@@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, | |||
1081 | */ | 1150 | */ |
1082 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | 1151 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) |
1083 | { | 1152 | { |
1084 | struct page *page2[1]; | ||
1085 | struct rmap_item *tree_rmap_item; | 1153 | struct rmap_item *tree_rmap_item; |
1154 | struct page *tree_page = NULL; | ||
1155 | struct stable_node *stable_node; | ||
1156 | struct page *kpage; | ||
1086 | unsigned int checksum; | 1157 | unsigned int checksum; |
1087 | int err; | 1158 | int err; |
1088 | 1159 | ||
1089 | if (in_stable_tree(rmap_item)) | 1160 | remove_rmap_item_from_tree(rmap_item); |
1090 | remove_rmap_item_from_tree(rmap_item); | ||
1091 | 1161 | ||
1092 | /* We first start with searching the page inside the stable tree */ | 1162 | /* We first start with searching the page inside the stable tree */ |
1093 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | 1163 | kpage = stable_tree_search(page); |
1094 | if (tree_rmap_item) { | 1164 | if (kpage) { |
1095 | if (page == page2[0]) /* forked */ | 1165 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1096 | err = 0; | ||
1097 | else | ||
1098 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
1099 | rmap_item->address, | ||
1100 | page, page2[0]); | ||
1101 | put_page(page2[0]); | ||
1102 | |||
1103 | if (!err) { | 1166 | if (!err) { |
1104 | /* | 1167 | /* |
1105 | * The page was successfully merged: | 1168 | * The page was successfully merged: |
1106 | * add its rmap_item to the stable tree. | 1169 | * add its rmap_item to the stable tree. |
1107 | */ | 1170 | */ |
1108 | stable_tree_append(rmap_item, tree_rmap_item); | 1171 | lock_page(kpage); |
1172 | stable_tree_append(rmap_item, page_stable_node(kpage)); | ||
1173 | unlock_page(kpage); | ||
1109 | } | 1174 | } |
1175 | put_page(kpage); | ||
1110 | return; | 1176 | return; |
1111 | } | 1177 | } |
1112 | 1178 | ||
1113 | /* | 1179 | /* |
1114 | * A ksm page might have got here by fork, but its other | 1180 | * If the hash value of the page has changed from the last time |
1115 | * references have already been removed from the stable tree. | 1181 | * we calculated it, this page is changing frequently: therefore we |
1116 | * Or it might be left over from a break_ksm which failed | 1182 | * don't want to insert it in the unstable tree, and we don't want |
1117 | * when the mem_cgroup had reached its limit: try again now. | 1183 | * to waste our time searching for something identical to it there. |
1118 | */ | ||
1119 | if (PageKsm(page)) | ||
1120 | break_cow(rmap_item->mm, rmap_item->address); | ||
1121 | |||
1122 | /* | ||
1123 | * In case the hash value of the page was changed from the last time we | ||
1124 | * have calculated it, this page to be changed frequely, therefore we | ||
1125 | * don't want to insert it to the unstable tree, and we don't want to | ||
1126 | * waste our time to search if there is something identical to it there. | ||
1127 | */ | 1184 | */ |
1128 | checksum = calc_checksum(page); | 1185 | checksum = calc_checksum(page); |
1129 | if (rmap_item->oldchecksum != checksum) { | 1186 | if (rmap_item->oldchecksum != checksum) { |
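The checksum acts as a volatility filter: a page is only allowed into the unstable tree once its contents have stayed the same across two successive scans. A userspace sketch of the idea (KSM's calc_checksum() is a jhash of the page; FNV-1a stands in for it here):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t checksum(const unsigned char *buf, size_t len)
{
	uint32_t h = 2166136261u;		/* FNV-1a */

	while (len--)
		h = (h ^ *buf++) * 16777619u;
	return h;
}

static bool worth_merging(const unsigned char *page, size_t len,
			  uint32_t *oldchecksum)
{
	uint32_t sum = checksum(page, len);

	if (sum != *oldchecksum) {
		*oldchecksum = sum;	/* remember it, retry next scan */
		return false;		/* changed recently: too volatile */
	}
	return true;			/* stable since the last scan */
}

int main(void)
{
	unsigned char page[4096];
	uint32_t old = 0;

	memset(page, 0, sizeof(page));
	worth_merging(page, sizeof(page), &old);	/* first sight */
	return worth_merging(page, sizeof(page), &old) ? 0 : 1;
}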
@@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1131 | return; | 1188 | return; |
1132 | } | 1189 | } |
1133 | 1190 | ||
1134 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | 1191 | tree_rmap_item = |
1192 | unstable_tree_search_insert(rmap_item, page, &tree_page); | ||
1135 | if (tree_rmap_item) { | 1193 | if (tree_rmap_item) { |
1136 | err = try_to_merge_two_pages(rmap_item->mm, | 1194 | kpage = try_to_merge_two_pages(rmap_item, page, |
1137 | rmap_item->address, page, | 1195 | tree_rmap_item, tree_page); |
1138 | tree_rmap_item->mm, | 1196 | put_page(tree_page); |
1139 | tree_rmap_item->address, page2[0]); | ||
1140 | /* | 1197 | /* |
1141 | * As soon as we merge this page, we want to remove the | 1198 | * As soon as we merge this page, we want to remove the |
1142 | * rmap_item of the page we have merged with from the unstable | 1199 | * rmap_item of the page we have merged with from the unstable |
1143 | * tree, and insert it instead as new node in the stable tree. | 1200 | * tree, and insert it instead as new node in the stable tree. |
1144 | */ | 1201 | */ |
1145 | if (!err) { | 1202 | if (kpage) { |
1146 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | 1203 | remove_rmap_item_from_tree(tree_rmap_item); |
1147 | tree_rmap_item->address &= ~NODE_FLAG; | 1204 | |
1148 | ksm_pages_unshared--; | 1205 | lock_page(kpage); |
1206 | stable_node = stable_tree_insert(kpage); | ||
1207 | if (stable_node) { | ||
1208 | stable_tree_append(tree_rmap_item, stable_node); | ||
1209 | stable_tree_append(rmap_item, stable_node); | ||
1210 | } | ||
1211 | unlock_page(kpage); | ||
1149 | 1212 | ||
1150 | /* | 1213 | /* |
1151 | * If we fail to insert the page into the stable tree, | 1214 | * If we fail to insert the page into the stable tree, |
@@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1153 | * to a ksm page left outside the stable tree, | 1216 | * to a ksm page left outside the stable tree, |
1154 | * in which case we need to break_cow on both. | 1217 | * in which case we need to break_cow on both. |
1155 | */ | 1218 | */ |
1156 | if (stable_tree_insert(page2[0], tree_rmap_item)) | 1219 | if (!stable_node) { |
1157 | stable_tree_append(rmap_item, tree_rmap_item); | 1220 | break_cow(tree_rmap_item); |
1158 | else { | 1221 | break_cow(rmap_item); |
1159 | break_cow(tree_rmap_item->mm, | ||
1160 | tree_rmap_item->address); | ||
1161 | break_cow(rmap_item->mm, rmap_item->address); | ||
1162 | } | 1222 | } |
1163 | } | 1223 | } |
1164 | |||
1165 | put_page(page2[0]); | ||
1166 | } | 1224 | } |
1167 | } | 1225 | } |
1168 | 1226 | ||
1169 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | 1227 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, |
1170 | struct list_head *cur, | 1228 | struct rmap_item **rmap_list, |
1171 | unsigned long addr) | 1229 | unsigned long addr) |
1172 | { | 1230 | { |
1173 | struct rmap_item *rmap_item; | 1231 | struct rmap_item *rmap_item; |
1174 | 1232 | ||
1175 | while (cur != &mm_slot->rmap_list) { | 1233 | while (*rmap_list) { |
1176 | rmap_item = list_entry(cur, struct rmap_item, link); | 1234 | rmap_item = *rmap_list; |
1177 | if ((rmap_item->address & PAGE_MASK) == addr) { | 1235 | if ((rmap_item->address & PAGE_MASK) == addr) |
1178 | if (!in_stable_tree(rmap_item)) | ||
1179 | remove_rmap_item_from_tree(rmap_item); | ||
1180 | return rmap_item; | 1236 | return rmap_item; |
1181 | } | ||
1182 | if (rmap_item->address > addr) | 1237 | if (rmap_item->address > addr) |
1183 | break; | 1238 | break; |
1184 | cur = cur->next; | 1239 | *rmap_list = rmap_item->rmap_list; |
1185 | remove_rmap_item_from_tree(rmap_item); | 1240 | remove_rmap_item_from_tree(rmap_item); |
1186 | list_del(&rmap_item->link); | ||
1187 | free_rmap_item(rmap_item); | 1241 | free_rmap_item(rmap_item); |
1188 | } | 1242 | } |
1189 | 1243 | ||
@@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | |||
1192 | /* It has already been zeroed */ | 1246 | /* It has already been zeroed */ |
1193 | rmap_item->mm = mm_slot->mm; | 1247 | rmap_item->mm = mm_slot->mm; |
1194 | rmap_item->address = addr; | 1248 | rmap_item->address = addr; |
1195 | list_add_tail(&rmap_item->link, cur); | 1249 | rmap_item->rmap_list = *rmap_list; |
1250 | *rmap_list = rmap_item; | ||
1196 | } | 1251 | } |
1197 | return rmap_item; | 1252 | return rmap_item; |
1198 | } | 1253 | } |
@@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1217 | spin_unlock(&ksm_mmlist_lock); | 1272 | spin_unlock(&ksm_mmlist_lock); |
1218 | next_mm: | 1273 | next_mm: |
1219 | ksm_scan.address = 0; | 1274 | ksm_scan.address = 0; |
1220 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1275 | ksm_scan.rmap_list = &slot->rmap_list; |
1221 | struct rmap_item, link); | ||
1222 | } | 1276 | } |
1223 | 1277 | ||
1224 | mm = slot->mm; | 1278 | mm = slot->mm; |
@@ -1244,10 +1298,10 @@ next_mm: | |||
1244 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
1245 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
1246 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
1247 | ksm_scan.rmap_item->link.next, | 1301 | ksm_scan.rmap_list, ksm_scan.address); |
1248 | ksm_scan.address); | ||
1249 | if (rmap_item) { | 1302 | if (rmap_item) { |
1250 | ksm_scan.rmap_item = rmap_item; | 1303 | ksm_scan.rmap_list = |
1304 | &rmap_item->rmap_list; | ||
1251 | ksm_scan.address += PAGE_SIZE; | 1305 | ksm_scan.address += PAGE_SIZE; |
1252 | } else | 1306 | } else |
1253 | put_page(*page); | 1307 | put_page(*page); |
@@ -1263,14 +1317,13 @@ next_mm: | |||
1263 | 1317 | ||
1264 | if (ksm_test_exit(mm)) { | 1318 | if (ksm_test_exit(mm)) { |
1265 | ksm_scan.address = 0; | 1319 | ksm_scan.address = 0; |
1266 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1320 | ksm_scan.rmap_list = &slot->rmap_list; |
1267 | struct rmap_item, link); | ||
1268 | } | 1321 | } |
1269 | /* | 1322 | /* |
1270 | * Nuke all the rmap_items that are above this current rmap: | 1323 | * Nuke all the rmap_items that are above this current rmap: |
1271 | * because there were no VM_MERGEABLE vmas with such addresses. | 1324 | * because there were no VM_MERGEABLE vmas with such addresses. |
1272 | */ | 1325 | */ |
1273 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | 1326 | remove_trailing_rmap_items(slot, ksm_scan.rmap_list); |
1274 | 1327 | ||
1275 | spin_lock(&ksm_mmlist_lock); | 1328 | spin_lock(&ksm_mmlist_lock); |
1276 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | 1329 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, |
@@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1323 | return; | 1376 | return; |
1324 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1377 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) |
1325 | cmp_and_merge_page(page, rmap_item); | 1378 | cmp_and_merge_page(page, rmap_item); |
1326 | else if (page_mapcount(page) == 1) { | ||
1327 | /* | ||
1328 | * Replace now-unshared ksm page by ordinary page. | ||
1329 | */ | ||
1330 | break_cow(rmap_item->mm, rmap_item->address); | ||
1331 | remove_rmap_item_from_tree(rmap_item); | ||
1332 | rmap_item->oldchecksum = calc_checksum(page); | ||
1333 | } | ||
1334 | put_page(page); | 1379 | put_page(page); |
1335 | } | 1380 | } |
1336 | } | 1381 | } |
@@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1375 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1420 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1376 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1377 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | |
1378 | VM_MIXEDMAP | VM_SAO)) | 1423 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) |
1379 | return 0; /* just ignore the advice */ | 1424 | return 0; /* just ignore the advice */ |
1380 | 1425 | ||
1381 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1426 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
@@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1452 | spin_lock(&ksm_mmlist_lock); | 1497 | spin_lock(&ksm_mmlist_lock); |
1453 | mm_slot = get_mm_slot(mm); | 1498 | mm_slot = get_mm_slot(mm); |
1454 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1499 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1455 | if (list_empty(&mm_slot->rmap_list)) { | 1500 | if (!mm_slot->rmap_list) { |
1456 | hlist_del(&mm_slot->link); | 1501 | hlist_del(&mm_slot->link); |
1457 | list_del(&mm_slot->mm_list); | 1502 | list_del(&mm_slot->mm_list); |
1458 | easy_to_free = 1; | 1503 | easy_to_free = 1; |
@@ -1473,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm) | |||
1473 | } | 1518 | } |
1474 | } | 1519 | } |
1475 | 1520 | ||
1521 | struct page *ksm_does_need_to_copy(struct page *page, | ||
1522 | struct vm_area_struct *vma, unsigned long address) | ||
1523 | { | ||
1524 | struct page *new_page; | ||
1525 | |||
1526 | unlock_page(page); /* any racers will COW it, not modify it */ | ||
1527 | |||
1528 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1529 | if (new_page) { | ||
1530 | copy_user_highpage(new_page, page, address, vma); | ||
1531 | |||
1532 | SetPageDirty(new_page); | ||
1533 | __SetPageUptodate(new_page); | ||
1534 | SetPageSwapBacked(new_page); | ||
1535 | __set_page_locked(new_page); | ||
1536 | |||
1537 | if (page_evictable(new_page, vma)) | ||
1538 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1539 | else | ||
1540 | add_page_to_unevictable_list(new_page); | ||
1541 | } | ||
1542 | |||
1543 | page_cache_release(page); | ||
1544 | return new_page; | ||
1545 | } | ||
1546 | |||
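ksm_does_need_to_copy() above is intended for the swap-in path: a KSM page found in the swap cache is shared across mms, so a faulting vma that cannot simply reuse it gets a fresh anonymous copy. The sketch below shows how a caller might use it; swapin_map_page() and its locals are invented here purely for illustration and do not appear anywhere in this diff.

/*
 * Hypothetical caller, for illustration only: substitute a private copy
 * for a locked KSM page before mapping it into the faulting vma.  The
 * function name and locals are invented; they are not part of mm/ksm.c.
 */
static int swapin_map_page(struct vm_area_struct *vma, unsigned long address,
			   struct page **pagep)
{
	struct page *page = *pagep;	/* locked page from the swap cache */

	if (PageKsm(page)) {
		page = ksm_does_need_to_copy(page, vma, address);
		if (!page)
			return -ENOMEM;	/* original was already unlocked and released */
	}
	*pagep = page;			/* the copy comes back locked, dirty and swap-backed */
	return 0;
}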
1547 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | ||
1548 | unsigned long *vm_flags) | ||
1549 | { | ||
1550 | struct stable_node *stable_node; | ||
1551 | struct rmap_item *rmap_item; | ||
1552 | struct hlist_node *hlist; | ||
1553 | unsigned int mapcount = page_mapcount(page); | ||
1554 | int referenced = 0; | ||
1555 | int search_new_forks = 0; | ||
1556 | |||
1557 | VM_BUG_ON(!PageKsm(page)); | ||
1558 | VM_BUG_ON(!PageLocked(page)); | ||
1559 | |||
1560 | stable_node = page_stable_node(page); | ||
1561 | if (!stable_node) | ||
1562 | return 0; | ||
1563 | again: | ||
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1566 | struct vm_area_struct *vma; | ||
1567 | |||
1568 | spin_lock(&anon_vma->lock); | ||
1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1570 | if (rmap_item->address < vma->vm_start || | ||
1571 | rmap_item->address >= vma->vm_end) | ||
1572 | continue; | ||
1573 | /* | ||
1574 | * Initially we examine only the vma which covers this | ||
1575 | * rmap_item; but later, if there is still work to do, | ||
1576 | * we examine covering vmas in other mms: in case they | ||
1577 | * were forked from the original since ksmd passed. | ||
1578 | */ | ||
1579 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1580 | continue; | ||
1581 | |||
1582 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
1583 | continue; | ||
1584 | |||
1585 | referenced += page_referenced_one(page, vma, | ||
1586 | rmap_item->address, &mapcount, vm_flags); | ||
1587 | if (!search_new_forks || !mapcount) | ||
1588 | break; | ||
1589 | } | ||
1590 | spin_unlock(&anon_vma->lock); | ||
1591 | if (!mapcount) | ||
1592 | goto out; | ||
1593 | } | ||
1594 | if (!search_new_forks++) | ||
1595 | goto again; | ||
1596 | out: | ||
1597 | return referenced; | ||
1598 | } | ||
1599 | |||
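The (rmap_item->mm == vma->vm_mm) == search_new_forks test above, repeated in try_to_unmap_ksm() and rmap_walk_ksm() below, implements a two-pass walk: pass 0 visits only the vma of the mm recorded in each rmap_item, pass 1 visits only covering vmas in other mms, i.e. forks made since ksmd last passed. A small standalone sketch of that filter follows; plain ints stand in for the mm pointers and none of these names exist in ksm.c.

/*
 * Standalone illustration of the two-pass search_new_forks filter used by
 * the KSM rmap walkers.  "owner" plays the part of rmap_item->mm and the
 * array the vmas on the anon_vma list; all names here are invented.
 */
#include <stdio.h>

int main(void)
{
	int owner = 1;			/* mm recorded in the rmap_item */
	int vma_mm[] = { 1, 2, 3 };	/* mms owning the covering vmas */
	int search_new_forks, i;

	for (search_new_forks = 0; search_new_forks < 2; search_new_forks++)
		for (i = 0; i < 3; i++) {
			if ((vma_mm[i] == owner) == search_new_forks)
				continue;	/* this vma belongs to the other pass */
			printf("pass %d visits the vma of mm %d\n",
			       search_new_forks, vma_mm[i]);
		}
	return 0;
}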
1600 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1601 | { | ||
1602 | struct stable_node *stable_node; | ||
1603 | struct hlist_node *hlist; | ||
1604 | struct rmap_item *rmap_item; | ||
1605 | int ret = SWAP_AGAIN; | ||
1606 | int search_new_forks = 0; | ||
1607 | |||
1608 | VM_BUG_ON(!PageKsm(page)); | ||
1609 | VM_BUG_ON(!PageLocked(page)); | ||
1610 | |||
1611 | stable_node = page_stable_node(page); | ||
1612 | if (!stable_node) | ||
1613 | return SWAP_FAIL; | ||
1614 | again: | ||
1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1617 | struct vm_area_struct *vma; | ||
1618 | |||
1619 | spin_lock(&anon_vma->lock); | ||
1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1621 | if (rmap_item->address < vma->vm_start || | ||
1622 | rmap_item->address >= vma->vm_end) | ||
1623 | continue; | ||
1624 | /* | ||
1625 | * Initially we examine only the vma which covers this | ||
1626 | * rmap_item; but later, if there is still work to do, | ||
1627 | * we examine covering vmas in other mms: in case they | ||
1628 | * were forked from the original since ksmd passed. | ||
1629 | */ | ||
1630 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1631 | continue; | ||
1632 | |||
1633 | ret = try_to_unmap_one(page, vma, | ||
1634 | rmap_item->address, flags); | ||
1635 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | ||
1636 | spin_unlock(&anon_vma->lock); | ||
1637 | goto out; | ||
1638 | } | ||
1639 | } | ||
1640 | spin_unlock(&anon_vma->lock); | ||
1641 | } | ||
1642 | if (!search_new_forks++) | ||
1643 | goto again; | ||
1644 | out: | ||
1645 | return ret; | ||
1646 | } | ||
1647 | |||
1648 | #ifdef CONFIG_MIGRATION | ||
1649 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
1650 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1651 | { | ||
1652 | struct stable_node *stable_node; | ||
1653 | struct hlist_node *hlist; | ||
1654 | struct rmap_item *rmap_item; | ||
1655 | int ret = SWAP_AGAIN; | ||
1656 | int search_new_forks = 0; | ||
1657 | |||
1658 | VM_BUG_ON(!PageKsm(page)); | ||
1659 | VM_BUG_ON(!PageLocked(page)); | ||
1660 | |||
1661 | stable_node = page_stable_node(page); | ||
1662 | if (!stable_node) | ||
1663 | return ret; | ||
1664 | again: | ||
1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1667 | struct vm_area_struct *vma; | ||
1668 | |||
1669 | spin_lock(&anon_vma->lock); | ||
1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1671 | if (rmap_item->address < vma->vm_start || | ||
1672 | rmap_item->address >= vma->vm_end) | ||
1673 | continue; | ||
1674 | /* | ||
1675 | * Initially we examine only the vma which covers this | ||
1676 | * rmap_item; but later, if there is still work to do, | ||
1677 | * we examine covering vmas in other mms: in case they | ||
1678 | * were forked from the original since ksmd passed. | ||
1679 | */ | ||
1680 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1681 | continue; | ||
1682 | |||
1683 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
1684 | if (ret != SWAP_AGAIN) { | ||
1685 | spin_unlock(&anon_vma->lock); | ||
1686 | goto out; | ||
1687 | } | ||
1688 | } | ||
1689 | spin_unlock(&anon_vma->lock); | ||
1690 | } | ||
1691 | if (!search_new_forks++) | ||
1692 | goto again; | ||
1693 | out: | ||
1694 | return ret; | ||
1695 | } | ||
1696 | |||
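rmap_walk_ksm() above drives an arbitrary rmap_one callback over every vma that still maps a KSM page; page migration relies on such walks to fix up ptes. The contract is simply: return SWAP_AGAIN to keep walking, anything else to stop early. The callback below is invented here only to show that interface and is not taken from the kernel.

/*
 * Hypothetical rmap_one callback, for illustration of the interface that
 * rmap_walk_ksm() expects: count the vmas covering the page and continue.
 */
static int count_covering_vmas(struct page *page, struct vm_area_struct *vma,
			       unsigned long address, void *arg)
{
	(*(int *)arg)++;
	return SWAP_AGAIN;	/* SWAP_AGAIN means "keep walking" */
}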
1697 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | ||
1698 | { | ||
1699 | struct stable_node *stable_node; | ||
1700 | |||
1701 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1702 | VM_BUG_ON(!PageLocked(newpage)); | ||
1703 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | ||
1704 | |||
1705 | stable_node = page_stable_node(newpage); | ||
1706 | if (stable_node) { | ||
1707 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | ||
1708 | stable_node->kpfn = page_to_pfn(newpage); | ||
1709 | } | ||
1710 | } | ||
1711 | #endif /* CONFIG_MIGRATION */ | ||
1712 | |||
1713 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1714 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | ||
1715 | unsigned long end_pfn) | ||
1716 | { | ||
1717 | struct rb_node *node; | ||
1718 | |||
1719 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | ||
1720 | struct stable_node *stable_node; | ||
1721 | |||
1722 | stable_node = rb_entry(node, struct stable_node, node); | ||
1723 | if (stable_node->kpfn >= start_pfn && | ||
1724 | stable_node->kpfn < end_pfn) | ||
1725 | return stable_node; | ||
1726 | } | ||
1727 | return NULL; | ||
1728 | } | ||
1729 | |||
1730 | static int ksm_memory_callback(struct notifier_block *self, | ||
1731 | unsigned long action, void *arg) | ||
1732 | { | ||
1733 | struct memory_notify *mn = arg; | ||
1734 | struct stable_node *stable_node; | ||
1735 | |||
1736 | switch (action) { | ||
1737 | case MEM_GOING_OFFLINE: | ||
1738 | /* | ||
1739 | * Keep it very simple for now: just lock out ksmd and | ||
1740 | * MADV_UNMERGEABLE while any memory is going offline. | ||
1741 | */ | ||
1742 | mutex_lock(&ksm_thread_mutex); | ||
1743 | break; | ||
1744 | |||
1745 | case MEM_OFFLINE: | ||
1746 | /* | ||
1747 | * Most of the work is done by page migration; but there might | ||
1748 | * be a few stable_nodes left over, still pointing to struct | ||
1749 | * pages which have been offlined: prune those from the tree. | ||
1750 | */ | ||
1751 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | ||
1752 | mn->start_pfn + mn->nr_pages)) != NULL) | ||
1753 | remove_node_from_stable_tree(stable_node); | ||
1754 | /* fallthrough */ | ||
1755 | |||
1756 | case MEM_CANCEL_OFFLINE: | ||
1757 | mutex_unlock(&ksm_thread_mutex); | ||
1758 | break; | ||
1759 | } | ||
1760 | return NOTIFY_OK; | ||
1761 | } | ||
1762 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1763 | |||
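Note that MEM_GOING_OFFLINE takes ksm_thread_mutex and only MEM_OFFLINE or MEM_CANCEL_OFFLINE releases it, so ksmd and MADV_UNMERGEABLE stay parked for the whole offline attempt. The callback is registered by the hotplug_memory_notifier() call added to ksm_init() below; a rough equivalent of that registration, spelled out with an explicit notifier_block, is sketched here as an illustration rather than the macro's exact expansion, and the helper name is invented.

/*
 * Illustration only: roughly what hotplug_memory_notifier(ksm_memory_callback,
 * 100) arranges.  ksm_register_memory_notifier() is an invented name.
 */
static struct notifier_block ksm_memory_nb = {
	.notifier_call	= ksm_memory_callback,
	.priority	= 100,	/* high priority: see the comment in ksm_init() below */
};

static int __init ksm_register_memory_notifier(void)
{
	return register_memory_notifier(&ksm_memory_nb);
}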
1476 | #ifdef CONFIG_SYSFS | 1764 | #ifdef CONFIG_SYSFS |
1477 | /* | 1765 | /* |
1478 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | 1766 | * This all compiles without CONFIG_SYSFS, but is a waste of space. |
@@ -1551,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1551 | /* | 1839 | /* |
1552 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | 1840 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. |
1553 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | 1841 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, |
1554 | * breaking COW to free the unswappable pages_shared (but leaves | 1842 | * breaking COW to free the pages_shared (but leaves mm_slots |
1555 | * mm_slots on the list for when ksmd may be set running again). | 1843 | * on the list for when ksmd may be set running again). |
1556 | */ | 1844 | */ |
1557 | 1845 | ||
1558 | mutex_lock(&ksm_thread_mutex); | 1846 | mutex_lock(&ksm_thread_mutex); |
@@ -1577,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1577 | } | 1865 | } |
1578 | KSM_ATTR(run); | 1866 | KSM_ATTR(run); |
1579 | 1867 | ||
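The run knob handled by run_store() above is the file exported at /sys/kernel/mm/ksm/run. A minimal userspace illustration of driving it follows, assuming sysfs is mounted at /sys; writing "2" (KSM_RUN_UNMERGE) stops ksmd and breaks COW on every merged page, while writing "1" resumes merging.

/*
 * Userspace illustration only: toggle ksmd through the sysfs run file
 * controlled by run_store() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/mm/ksm/run", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/kernel/mm/ksm/run");
		return 1;
	}
	if (write(fd, "2", 1) != 1)	/* 2 == KSM_RUN_UNMERGE: unmerge everything */
		perror("write");
	close(fd);
	return 0;
}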
1580 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
1581 | struct kobj_attribute *attr, | ||
1582 | const char *buf, size_t count) | ||
1583 | { | ||
1584 | int err; | ||
1585 | unsigned long nr_pages; | ||
1586 | |||
1587 | err = strict_strtoul(buf, 10, &nr_pages); | ||
1588 | if (err) | ||
1589 | return -EINVAL; | ||
1590 | |||
1591 | ksm_max_kernel_pages = nr_pages; | ||
1592 | |||
1593 | return count; | ||
1594 | } | ||
1595 | |||
1596 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
1597 | struct kobj_attribute *attr, char *buf) | ||
1598 | { | ||
1599 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
1600 | } | ||
1601 | KSM_ATTR(max_kernel_pages); | ||
1602 | |||
1603 | static ssize_t pages_shared_show(struct kobject *kobj, | 1868 | static ssize_t pages_shared_show(struct kobject *kobj, |
1604 | struct kobj_attribute *attr, char *buf) | 1869 | struct kobj_attribute *attr, char *buf) |
1605 | { | 1870 | { |
@@ -1649,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { | |||
1649 | &sleep_millisecs_attr.attr, | 1914 | &sleep_millisecs_attr.attr, |
1650 | &pages_to_scan_attr.attr, | 1915 | &pages_to_scan_attr.attr, |
1651 | &run_attr.attr, | 1916 | &run_attr.attr, |
1652 | &max_kernel_pages_attr.attr, | ||
1653 | &pages_shared_attr.attr, | 1917 | &pages_shared_attr.attr, |
1654 | &pages_sharing_attr.attr, | 1918 | &pages_sharing_attr.attr, |
1655 | &pages_unshared_attr.attr, | 1919 | &pages_unshared_attr.attr, |
@@ -1669,8 +1933,6 @@ static int __init ksm_init(void) | |||
1669 | struct task_struct *ksm_thread; | 1933 | struct task_struct *ksm_thread; |
1670 | int err; | 1934 | int err; |
1671 | 1935 | ||
1672 | ksm_max_kernel_pages = totalram_pages / 4; | ||
1673 | |||
1674 | err = ksm_slab_init(); | 1936 | err = ksm_slab_init(); |
1675 | if (err) | 1937 | if (err) |
1676 | goto out; | 1938 | goto out; |
@@ -1698,6 +1960,13 @@ static int __init ksm_init(void) | |||
1698 | 1960 | ||
1699 | #endif /* CONFIG_SYSFS */ | 1961 | #endif /* CONFIG_SYSFS */ |
1700 | 1962 | ||
1963 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1964 | /* | ||
1965 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
1966 | * later callbacks could only be taking locks which nest within that. | ||
1967 | */ | ||
1968 | hotplug_memory_notifier(ksm_memory_callback, 100); | ||
1969 | #endif | ||
1701 | return 0; | 1970 | return 0; |
1702 | 1971 | ||
1703 | out_free2: | 1972 | out_free2: |