aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIzik Eidus <ieidus@redhat.com>2009-09-21 20:01:51 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-22 10:17:31 -0400
commit828502d30073036a486d96b1fe051e0f08b6df83 (patch)
tree61b728cbeb88c1a2c522307dff6264e8d0b1d8f1
parent451ea25da71590361c71bf3044c55b870a887d53 (diff)
ksm: add mmu_notifier set_pte_at_notify()
KSM is a Linux driver that allows dynamically sharing identical memory pages between one or more processes. Unlike traditional page sharing, which is set up when the memory is allocated, ksm does it dynamically after the memory has been created. Memory is periodically scanned; identical pages are identified and merged. The sharing is done in a way that is transparent to the processes that use it. Ksm is highly important for hypervisors (kvm), where in production environments there might be many copies of the same data in host memory. This kind of data can be: similar kernels, libraries, cache, and so on. Even though ksm was written for kvm, any userspace application that wants to use it to share its data can try it. Ksm may be useful for any application that might have similar (page-aligned) data structures in memory: ksm will find this data and merge it into one copy, and even if it is changed and therefore copied on write, ksm will merge it again as soon as it becomes identical again. Another reason to consider using ksm is that it might considerably simplify the userspace code of an application that wants to use shared private data: instead of the application managing a shared area, ksm will do this for the application, and even writes to this data will be allowed without any synchronization by the application. Ksm was designed to be a loadable module that doesn't change the VM code of Linux. This patch: The set_pte_at_notify() macro allows setting a pte in the shadow page table directly, instead of flushing the shadow page table entry and then getting vmexit to set it. It uses a new change_pte() callback to do so. set_pte_at_notify() is an optimization for kvm, and other users of mmu_notifiers, for COW pages. It is useful for kvm when ksm is used, because it allows kvm not to have to receive vmexit and only then map the ksm page into the shadow page table, but instead map it directly at the same time as Linux maps the page into the host page table. 
Users of mmu_notifiers who don't implement the new mmu_notifier_change_pte() callback will just receive the mmu_notifier_invalidate_page() callback. Signed-off-by: Izik Eidus <ieidus@redhat.com> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/mmu_notifier.h34
-rw-r--r--mm/memory.c9
-rw-r--r--mm/mmu_notifier.c20
3 files changed, 61 insertions, 2 deletions
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b77486d152cd..4e02ee2b071e 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,15 @@ struct mmu_notifier_ops {
62 unsigned long address); 62 unsigned long address);
63 63
64 /* 64 /*
65 * change_pte is called in cases that pte mapping to page is changed:
66 * for example, when ksm remaps pte to point to a new shared page.
67 */
68 void (*change_pte)(struct mmu_notifier *mn,
69 struct mm_struct *mm,
70 unsigned long address,
71 pte_t pte);
72
73 /*
65 * Before this is invoked any secondary MMU is still ok to 74 * Before this is invoked any secondary MMU is still ok to
66 * read/write to the page previously pointed to by the Linux 75 * read/write to the page previously pointed to by the Linux
67 * pte because the page hasn't been freed yet and it won't be 76 * pte because the page hasn't been freed yet and it won't be
@@ -154,6 +163,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
154extern void __mmu_notifier_release(struct mm_struct *mm); 163extern void __mmu_notifier_release(struct mm_struct *mm);
155extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 164extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
156 unsigned long address); 165 unsigned long address);
166extern void __mmu_notifier_change_pte(struct mm_struct *mm,
167 unsigned long address, pte_t pte);
157extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, 168extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
158 unsigned long address); 169 unsigned long address);
159extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 170extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -175,6 +186,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
175 return 0; 186 return 0;
176} 187}
177 188
189static inline void mmu_notifier_change_pte(struct mm_struct *mm,
190 unsigned long address, pte_t pte)
191{
192 if (mm_has_notifiers(mm))
193 __mmu_notifier_change_pte(mm, address, pte);
194}
195
178static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, 196static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
179 unsigned long address) 197 unsigned long address)
180{ 198{
@@ -236,6 +254,16 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
236 __young; \ 254 __young; \
237}) 255})
238 256
257#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
258({ \
259 struct mm_struct *___mm = __mm; \
260 unsigned long ___address = __address; \
261 pte_t ___pte = __pte; \
262 \
263 set_pte_at(___mm, ___address, __ptep, ___pte); \
264 mmu_notifier_change_pte(___mm, ___address, ___pte); \
265})
266
239#else /* CONFIG_MMU_NOTIFIER */ 267#else /* CONFIG_MMU_NOTIFIER */
240 268
241static inline void mmu_notifier_release(struct mm_struct *mm) 269static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -248,6 +276,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
248 return 0; 276 return 0;
249} 277}
250 278
279static inline void mmu_notifier_change_pte(struct mm_struct *mm,
280 unsigned long address, pte_t pte)
281{
282}
283
251static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, 284static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
252 unsigned long address) 285 unsigned long address)
253{ 286{
@@ -273,6 +306,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
273 306
274#define ptep_clear_flush_young_notify ptep_clear_flush_young 307#define ptep_clear_flush_young_notify ptep_clear_flush_young
275#define ptep_clear_flush_notify ptep_clear_flush 308#define ptep_clear_flush_notify ptep_clear_flush
309#define set_pte_at_notify set_pte_at
276 310
277#endif /* CONFIG_MMU_NOTIFIER */ 311#endif /* CONFIG_MMU_NOTIFIER */
278 312
diff --git a/mm/memory.c b/mm/memory.c
index e8f63d9961ea..368561f32009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2115,9 +2115,14 @@ gotten:
2115 * seen in the presence of one thread doing SMC and another 2115 * seen in the presence of one thread doing SMC and another
2116 * thread doing COW. 2116 * thread doing COW.
2117 */ 2117 */
2118 ptep_clear_flush_notify(vma, address, page_table); 2118 ptep_clear_flush(vma, address, page_table);
2119 page_add_new_anon_rmap(new_page, vma, address); 2119 page_add_new_anon_rmap(new_page, vma, address);
2120 set_pte_at(mm, address, page_table, entry); 2120 /*
2121 * We call the notify macro here because, when using secondary
2122 * mmu page tables (such as kvm shadow page tables), we want the
2123 * new page to be mapped directly into the secondary page table.
2124 */
2125 set_pte_at_notify(mm, address, page_table, entry);
2121 update_mmu_cache(vma, address, entry); 2126 update_mmu_cache(vma, address, entry);
2122 if (old_page) { 2127 if (old_page) {
2123 /* 2128 /*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{