diff options
author | Jeremy Fitzhardinge <jeremy@xensource.com> | 2007-07-17 21:37:06 -0400 |
---|---|---|
committer | Jeremy Fitzhardinge <jeremy@goop.org> | 2007-07-18 11:47:44 -0400 |
commit | f87e4cac4f4e940b328d3deb5b53e642e3881f43 (patch) | |
tree | 7409f86561e5f97459378abd2ae21e9a5c82bfea /arch/i386/xen/mmu.c | |
parent | ab55028886dd1dd54585f22bf19a00eb23869340 (diff) |
xen: SMP guest support
This is a fairly straightforward Xen implementation of smp_ops.
Xen has its own IPI mechanisms, and has no dependency on any
APIC-based IPI. The smp_ops hooks and the flush_tlb_others pv_op
allow a Xen guest to avoid all APIC code in arch/i386 (the only apic
operation is a single apic_read for the apic version number).
One subtle point which needs to be addressed is unpinning pagetables
when another cpu may have a lazy tlb reference to the pagetable. Xen
will not allow an in-use pagetable to be unpinned, so we must find any
other cpus with a reference to the pagetable and get them to shoot
down their references.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <ak@suse.de>
Diffstat (limited to 'arch/i386/xen/mmu.c')
-rw-r--r-- | arch/i386/xen/mmu.c | 69 |
1 file changed, 52 insertions, 17 deletions
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index 53501ce2d15c..bc49ef846203 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c | |||
@@ -391,8 +391,12 @@ void xen_pgd_pin(pgd_t *pgd) | |||
391 | 391 | ||
392 | xen_mc_batch(); | 392 | xen_mc_batch(); |
393 | 393 | ||
394 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) | 394 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { |
395 | /* re-enable interrupts for kmap_flush_unused */ | ||
396 | xen_mc_issue(0); | ||
395 | kmap_flush_unused(); | 397 | kmap_flush_unused(); |
398 | xen_mc_batch(); | ||
399 | } | ||
396 | 400 | ||
397 | mcs = __xen_mc_entry(sizeof(*op)); | 401 | mcs = __xen_mc_entry(sizeof(*op)); |
398 | op = mcs.args; | 402 | op = mcs.args; |
@@ -474,27 +478,58 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | |||
474 | spin_unlock(&mm->page_table_lock); | 478 | spin_unlock(&mm->page_table_lock); |
475 | } | 479 | } |
476 | 480 | ||
477 | void xen_exit_mmap(struct mm_struct *mm) | ||
478 | { | ||
479 | struct task_struct *tsk = current; | ||
480 | |||
481 | task_lock(tsk); | ||
482 | 481 | ||
483 | /* | 482 | #ifdef CONFIG_SMP |
484 | * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() | 483 | /* Another cpu may still have their %cr3 pointing at the pagetable, so |
485 | * *much* faster this way, as no tlb flushes means bigger wrpt batches. | 484 | we need to repoint it somewhere else before we can unpin it. */ |
486 | */ | 485 | static void drop_other_mm_ref(void *info) |
487 | if (tsk->active_mm == mm) { | 486 | { |
488 | tsk->active_mm = &init_mm; | 487 | struct mm_struct *mm = info; |
489 | atomic_inc(&init_mm.mm_count); | ||
490 | 488 | ||
491 | switch_mm(mm, &init_mm, tsk); | 489 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) |
490 | leave_mm(smp_processor_id()); | ||
491 | } | ||
492 | 492 | ||
493 | atomic_dec(&mm->mm_count); | 493 | static void drop_mm_ref(struct mm_struct *mm) |
494 | BUG_ON(atomic_read(&mm->mm_count) == 0); | 494 | { |
495 | if (current->active_mm == mm) { | ||
496 | if (current->mm == mm) | ||
497 | load_cr3(swapper_pg_dir); | ||
498 | else | ||
499 | leave_mm(smp_processor_id()); | ||
495 | } | 500 | } |
496 | 501 | ||
497 | task_unlock(tsk); | 502 | if (!cpus_empty(mm->cpu_vm_mask)) |
503 | xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, | ||
504 | mm, 1); | ||
505 | } | ||
506 | #else | ||
507 | static void drop_mm_ref(struct mm_struct *mm) | ||
508 | { | ||
509 | if (current->active_mm == mm) | ||
510 | load_cr3(swapper_pg_dir); | ||
511 | } | ||
512 | #endif | ||
513 | |||
514 | /* | ||
515 | * While a process runs, Xen pins its pagetables, which means that the | ||
516 | * hypervisor forces it to be read-only, and it controls all updates | ||
517 | * to it. This means that all pagetable updates have to go via the | ||
518 | * hypervisor, which is moderately expensive. | ||
519 | * | ||
520 | * Since we're pulling the pagetable down, we switch to use init_mm, | ||
521 | * unpin old process pagetable and mark it all read-write, which | ||
522 | * allows further operations on it to be simple memory accesses. | ||
523 | * | ||
524 | * The only subtle point is that another CPU may be still using the | ||
525 | * pagetable because of lazy tlb flushing. This means we need to | ||
526 | * switch all CPUs off this pagetable before we can unpin it. | ||
527 | */ | ||
528 | void xen_exit_mmap(struct mm_struct *mm) | ||
529 | { | ||
530 | get_cpu(); /* make sure we don't move around */ | ||
531 | drop_mm_ref(mm); | ||
532 | put_cpu(); | ||
498 | 533 | ||
499 | xen_pgd_unpin(mm->pgd); | 534 | xen_pgd_unpin(mm->pgd); |
500 | } | 535 | } |