aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/cpu/common.c11
-rw-r--r--arch/x86/kernel/ldt.c9
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c27
-rw-r--r--arch/x86/kernel/process_32.c39
-rw-r--r--arch/x86/kernel/process_64.c22
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/tlb_32.c8
-rw-r--r--arch/x86/mm/fault.c14
-rw-r--r--arch/x86/xen/Kconfig10
-rw-r--r--arch/x86/xen/Makefile12
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c189
-rw-r--r--arch/x86/xen/irq.c143
-rw-r--r--arch/x86/xen/mmu.c270
-rw-r--r--arch/x86/xen/multicalls.c115
-rw-r--r--arch/x86/xen/smp.c245
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-asm_32.S2
-rw-r--r--arch/x86/xen/xen-asm_64.S2
-rw-r--r--arch/x86/xen/xen-ops.h8
25 files changed, 1354 insertions, 469 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651fc8ec5..d679cb2c79b4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif 14endif
15 15
16# 16#
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
90obj-$(CONFIG_KVM_GUEST) += kvm.o 90obj-$(CONFIG_KVM_GUEST) += kvm.o
91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
94 94
95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e456bd955bb..9983bc3f5d18 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -728,14 +728,3 @@ void __cpuinit cpu_init(void)
728 mxcsr_feature_mask_init(); 728 mxcsr_feature_mask_init();
729} 729}
730 730
731#ifdef CONFIG_HOTPLUG_CPU
732void __cpuinit cpu_uninit(void)
733{
734 int cpu = raw_smp_processor_id();
735 cpu_clear(cpu, cpu_initialized);
736
737 /* lazy TLB state */
738 per_cpu(cpu_tlbstate, cpu).state = 0;
739 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
740}
741#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b68e21f06f4f..6e388412a854 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -51,6 +51,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 52 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 53
54 paravirt_alloc_ldt(newldt, mincount);
55
54#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 57 /* CHECKME: Do we really need this ? */
56 wmb(); 58 wmb();
@@ -73,6 +75,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
73#endif 75#endif
74 } 76 }
75 if (oldsize) { 77 if (oldsize) {
78 paravirt_free_ldt(oldldt, oldsize);
76 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 79 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
77 vfree(oldldt); 80 vfree(oldldt);
78 else 81 else
@@ -84,10 +87,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
84static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 87static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
85{ 88{
86 int err = alloc_ldt(new, old->size, 0); 89 int err = alloc_ldt(new, old->size, 0);
90 int i;
87 91
88 if (err < 0) 92 if (err < 0)
89 return err; 93 return err;
90 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 94
95 for(i = 0; i < old->size; i++)
96 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
91 return 0; 97 return 0;
92} 98}
93 99
@@ -124,6 +130,7 @@ void destroy_context(struct mm_struct *mm)
124 if (mm == current->active_mm) 130 if (mm == current->active_mm)
125 clear_LDT(); 131 clear_LDT();
126#endif 132#endif
133 paravirt_free_ldt(mm->context.ldt, mm->context.size);
127 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 134 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
128 vfree(mm->context.ldt); 135 vfree(mm->context.ldt);
129 else 136 else
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 300da17e61cb..7faea1817d05 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -348,6 +337,10 @@ struct pv_cpu_ops pv_cpu_ops = {
348 .write_ldt_entry = native_write_ldt_entry, 337 .write_ldt_entry = native_write_ldt_entry,
349 .write_gdt_entry = native_write_gdt_entry, 338 .write_gdt_entry = native_write_gdt_entry,
350 .write_idt_entry = native_write_idt_entry, 339 .write_idt_entry = native_write_idt_entry,
340
341 .alloc_ldt = paravirt_nop,
342 .free_ldt = paravirt_nop,
343
351 .load_sp0 = native_load_sp0, 344 .load_sp0 = native_load_sp0,
352 345
353#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 346#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -461,18 +454,6 @@ struct pv_mmu_ops pv_mmu_ops = {
461 .set_fixmap = native_set_fixmap, 454 .set_fixmap = native_set_fixmap,
462}; 455};
463 456
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL(pv_lock_ops);
475
476EXPORT_SYMBOL_GPL(pv_time_ops); 457EXPORT_SYMBOL_GPL(pv_time_ops);
477EXPORT_SYMBOL (pv_cpu_ops); 458EXPORT_SYMBOL (pv_cpu_ops);
478EXPORT_SYMBOL (pv_mmu_ops); 459EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4b3cfdf54216..b76b38ff962b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -72,47 +72,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
72 return ((unsigned long *)tsk->thread.sp)[3]; 72 return ((unsigned long *)tsk->thread.sp)[3];
73} 73}
74 74
75#ifdef CONFIG_HOTPLUG_CPU 75#ifndef CONFIG_SMP
76#include <asm/nmi.h>
77
78static void cpu_exit_clear(void)
79{
80 int cpu = raw_smp_processor_id();
81
82 idle_task_exit();
83
84 cpu_uninit();
85 irq_ctx_exit(cpu);
86
87 cpu_clear(cpu, cpu_callout_map);
88 cpu_clear(cpu, cpu_callin_map);
89
90 numa_remove_cpu(cpu);
91 c1e_remove_cpu(cpu);
92}
93
94/* We don't actually take CPU down, just spin without interrupts. */
95static inline void play_dead(void)
96{
97 /* This must be done before dead CPU ack */
98 cpu_exit_clear();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 /*
104 * With physical CPU hotplug, we should halt the cpu
105 */
106 local_irq_disable();
107 /* mask all interrupts, flush any and all caches, and halt */
108 wbinvd_halt();
109}
110#else
111static inline void play_dead(void) 76static inline void play_dead(void)
112{ 77{
113 BUG(); 78 BUG();
114} 79}
115#endif /* CONFIG_HOTPLUG_CPU */ 80#endif
116 81
117/* 82/*
118 * The idle thread. There's no useful work to be 83 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e12e0e4dd256..ec27afa43d7e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -85,30 +85,12 @@ void exit_idle(void)
85 __exit_idle(); 85 __exit_idle();
86} 86}
87 87
88#ifdef CONFIG_HOTPLUG_CPU 88#ifndef CONFIG_SMP
89DECLARE_PER_CPU(int, cpu_state);
90
91#include <asm/nmi.h>
92/* We halt the CPU with physical CPU hotplug */
93static inline void play_dead(void)
94{
95 idle_task_exit();
96 c1e_remove_cpu(raw_smp_processor_id());
97
98 mb();
99 /* Ack it */
100 __get_cpu_var(cpu_state) = CPU_DEAD;
101
102 local_irq_disable();
103 /* mask all interrupts, flush any and all caches, and halt */
104 wbinvd_halt();
105}
106#else
107static inline void play_dead(void) 89static inline void play_dead(void)
108{ 90{
109 BUG(); 91 BUG();
110} 92}
111#endif /* CONFIG_HOTPLUG_CPU */ 93#endif
112 94
113/* 95/*
114 * The idle thread. There's no useful work to be 96 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7985c5b3f916..06f1407d5542 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -1346,25 +1347,9 @@ static void __ref remove_cpu_from_maps(int cpu)
1346 numa_remove_cpu(cpu); 1347 numa_remove_cpu(cpu);
1347} 1348}
1348 1349
1349int __cpu_disable(void) 1350void cpu_disable_common(void)
1350{ 1351{
1351 int cpu = smp_processor_id(); 1352 int cpu = smp_processor_id();
1352
1353 /*
1354 * Perhaps use cpufreq to drop frequency, but that could go
1355 * into generic code.
1356 *
1357 * We won't take down the boot processor on i386 due to some
1358 * interrupts only being able to be serviced by the BSP.
1359 * Especially so if we're not using an IOAPIC -zwane
1360 */
1361 if (cpu == 0)
1362 return -EBUSY;
1363
1364 if (nmi_watchdog == NMI_LOCAL_APIC)
1365 stop_apic_nmi_watchdog(NULL);
1366 clear_local_APIC();
1367
1368 /* 1353 /*
1369 * HACK: 1354 * HACK:
1370 * Allow any queued timer interrupts to get serviced 1355 * Allow any queued timer interrupts to get serviced
@@ -1382,10 +1367,32 @@ int __cpu_disable(void)
1382 remove_cpu_from_maps(cpu); 1367 remove_cpu_from_maps(cpu);
1383 unlock_vector_lock(); 1368 unlock_vector_lock();
1384 fixup_irqs(cpu_online_map); 1369 fixup_irqs(cpu_online_map);
1370}
1371
1372int native_cpu_disable(void)
1373{
1374 int cpu = smp_processor_id();
1375
1376 /*
1377 * Perhaps use cpufreq to drop frequency, but that could go
1378 * into generic code.
1379 *
1380 * We won't take down the boot processor on i386 due to some
1381 * interrupts only being able to be serviced by the BSP.
1382 * Especially so if we're not using an IOAPIC -zwane
1383 */
1384 if (cpu == 0)
1385 return -EBUSY;
1386
1387 if (nmi_watchdog == NMI_LOCAL_APIC)
1388 stop_apic_nmi_watchdog(NULL);
1389 clear_local_APIC();
1390
1391 cpu_disable_common();
1385 return 0; 1392 return 0;
1386} 1393}
1387 1394
1388void __cpu_die(unsigned int cpu) 1395void native_cpu_die(unsigned int cpu)
1389{ 1396{
1390 /* We don't do anything here: idle task is faking death itself. */ 1397 /* We don't do anything here: idle task is faking death itself. */
1391 unsigned int i; 1398 unsigned int i;
@@ -1402,15 +1409,45 @@ void __cpu_die(unsigned int cpu)
1402 } 1409 }
1403 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1410 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1404} 1411}
1412
1413void play_dead_common(void)
1414{
1415 idle_task_exit();
1416 reset_lazy_tlbstate();
1417 irq_ctx_exit(raw_smp_processor_id());
1418 c1e_remove_cpu(raw_smp_processor_id());
1419
1420 mb();
1421 /* Ack it */
1422 __get_cpu_var(cpu_state) = CPU_DEAD;
1423
1424 /*
1425 * With physical CPU hotplug, we should halt the cpu
1426 */
1427 local_irq_disable();
1428}
1429
1430void native_play_dead(void)
1431{
1432 play_dead_common();
1433 wbinvd_halt();
1434}
1435
1405#else /* ... !CONFIG_HOTPLUG_CPU */ 1436#else /* ... !CONFIG_HOTPLUG_CPU */
1406int __cpu_disable(void) 1437int native_cpu_disable(void)
1407{ 1438{
1408 return -ENOSYS; 1439 return -ENOSYS;
1409} 1440}
1410 1441
1411void __cpu_die(unsigned int cpu) 1442void native_cpu_die(unsigned int cpu)
1412{ 1443{
1413 /* We said "no" in __cpu_disable */ 1444 /* We said "no" in __cpu_disable */
1414 BUG(); 1445 BUG();
1415} 1446}
1447
1448void native_play_dead(void)
1449{
1450 BUG();
1451}
1452
1416#endif 1453#endif
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b42..356ed2dec3a6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -915,15 +915,15 @@ LIST_HEAD(pgd_list);
915 915
916void vmalloc_sync_all(void) 916void vmalloc_sync_all(void)
917{ 917{
918#ifdef CONFIG_X86_32
919 unsigned long start = VMALLOC_START & PGDIR_MASK;
920 unsigned long address; 918 unsigned long address;
921 919
920#ifdef CONFIG_X86_32
922 if (SHARED_KERNEL_PMD) 921 if (SHARED_KERNEL_PMD)
923 return; 922 return;
924 923
925 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 924 for (address = VMALLOC_START & PMD_MASK;
926 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 925 address >= TASK_SIZE && address < FIXADDR_TOP;
926 address += PMD_SIZE) {
927 unsigned long flags; 927 unsigned long flags;
928 struct page *page; 928 struct page *page;
929 929
@@ -936,10 +936,8 @@ void vmalloc_sync_all(void)
936 spin_unlock_irqrestore(&pgd_lock, flags); 936 spin_unlock_irqrestore(&pgd_lock, flags);
937 } 937 }
938#else /* CONFIG_X86_64 */ 938#else /* CONFIG_X86_64 */
939 unsigned long start = VMALLOC_START & PGDIR_MASK; 939 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
940 unsigned long address; 940 address += PGDIR_SIZE) {
941
942 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
943 const pgd_t *pgd_ref = pgd_offset_k(address); 941 const pgd_t *pgd_ref = pgd_offset_k(address);
944 unsigned long flags; 942 unsigned long flags;
945 struct page *page; 943 struct page *page;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f470..d3e68465ace9 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -27,4 +27,12 @@ config XEN_MAX_DOMAIN_MEMORY
27config XEN_SAVE_RESTORE 27config XEN_SAVE_RESTORE
28 bool 28 bool
29 depends on PM 29 depends on PM
30 default y \ No newline at end of file 30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead. \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed2..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
2 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 10
4obj-$(CONFIG_SMP) += smp.o 11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a4e201b47f64..8ca2f88bde1e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,7 +30,6 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
36#include <xen/hvc-console.h> 35#include <xen/hvc-console.h>
@@ -57,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 56DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59 58
59enum xen_domain_type xen_domain_type = XEN_NATIVE;
60EXPORT_SYMBOL_GPL(xen_domain_type);
61
60/* 62/*
61 * Identity map, in addition to plain kernel map. This needs to be 63 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest. 64 * large enough to allocate page table pages to allocate the rest.
@@ -226,103 +228,68 @@ static unsigned long xen_get_debugreg(int reg)
226 return HYPERVISOR_get_debugreg(reg); 228 return HYPERVISOR_get_debugreg(reg);
227} 229}
228 230
229static unsigned long xen_save_fl(void) 231static void xen_leave_lazy(void)
230{ 232{
231 struct vcpu_info *vcpu; 233 paravirt_leave_lazy(paravirt_get_lazy_mode());
232 unsigned long flags; 234 xen_mc_flush();
233
234 vcpu = x86_read_percpu(xen_vcpu);
235
236 /* flag has opposite sense of mask */
237 flags = !vcpu->evtchn_upcall_mask;
238
239 /* convert to IF type flag
240 -0 -> 0x00000000
241 -1 -> 0xffffffff
242 */
243 return (-flags) & X86_EFLAGS_IF;
244} 235}
245 236
246static void xen_restore_fl(unsigned long flags) 237static unsigned long xen_store_tr(void)
247{ 238{
248 struct vcpu_info *vcpu; 239 return 0;
249
250 /* convert from IF type flag */
251 flags = !(flags & X86_EFLAGS_IF);
252
253 /* There's a one instruction preempt window here. We need to
254 make sure we're don't switch CPUs between getting the vcpu
255 pointer and updating the mask. */
256 preempt_disable();
257 vcpu = x86_read_percpu(xen_vcpu);
258 vcpu->evtchn_upcall_mask = flags;
259 preempt_enable_no_resched();
260
261 /* Doesn't matter if we get preempted here, because any
262 pending event will get dealt with anyway. */
263
264 if (flags == 0) {
265 preempt_check_resched();
266 barrier(); /* unmask then check (avoid races) */
267 if (unlikely(vcpu->evtchn_upcall_pending))
268 force_evtchn_callback();
269 }
270} 240}
271 241
272static void xen_irq_disable(void) 242/*
243 * Set the page permissions for a particular virtual address. If the
244 * address is a vmalloc mapping (or other non-linear mapping), then
245 * find the linear mapping of the page and also set its protections to
246 * match.
247 */
248static void set_aliased_prot(void *v, pgprot_t prot)
273{ 249{
274 /* There's a one instruction preempt window here. We need to 250 int level;
275 make sure we're don't switch CPUs between getting the vcpu 251 pte_t *ptep;
276 pointer and updating the mask. */ 252 pte_t pte;
277 preempt_disable(); 253 unsigned long pfn;
278 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 254 struct page *page;
279 preempt_enable_no_resched();
280}
281 255
282static void xen_irq_enable(void) 256 ptep = lookup_address((unsigned long)v, &level);
283{ 257 BUG_ON(ptep == NULL);
284 struct vcpu_info *vcpu;
285 258
286 /* We don't need to worry about being preempted here, since 259 pfn = pte_pfn(*ptep);
287 either a) interrupts are disabled, so no preemption, or b) 260 page = pfn_to_page(pfn);
288 the caller is confused and is trying to re-enable interrupts
289 on an indeterminate processor. */
290 261
291 vcpu = x86_read_percpu(xen_vcpu); 262 pte = pfn_pte(pfn, prot);
292 vcpu->evtchn_upcall_mask = 0;
293 263
294 /* Doesn't matter if we get preempted here, because any 264 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
295 pending event will get dealt with anyway. */ 265 BUG();
296 266
297 barrier(); /* unmask then check (avoid races) */ 267 if (!PageHighMem(page)) {
298 if (unlikely(vcpu->evtchn_upcall_pending)) 268 void *av = __va(PFN_PHYS(pfn));
299 force_evtchn_callback();
300}
301 269
302static void xen_safe_halt(void) 270 if (av != v)
303{ 271 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
304 /* Blocking includes an implicit local_irq_enable(). */ 272 BUG();
305 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 273 } else
306 BUG(); 274 kmap_flush_unused();
307} 275}
308 276
309static void xen_halt(void) 277static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
310{ 278{
311 if (irqs_disabled()) 279 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
312 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 280 int i;
313 else
314 xen_safe_halt();
315}
316 281
317static void xen_leave_lazy(void) 282 for(i = 0; i < entries; i += entries_per_page)
318{ 283 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
319 paravirt_leave_lazy(paravirt_get_lazy_mode());
320 xen_mc_flush();
321} 284}
322 285
323static unsigned long xen_store_tr(void) 286static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
324{ 287{
325 return 0; 288 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
289 int i;
290
291 for(i = 0; i < entries; i += entries_per_page)
292 set_aliased_prot(ldt + i, PAGE_KERNEL);
326} 293}
327 294
328static void xen_set_ldt(const void *addr, unsigned entries) 295static void xen_set_ldt(const void *addr, unsigned entries)
@@ -425,8 +392,7 @@ static void xen_load_gs_index(unsigned int idx)
425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 392static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
426 const void *ptr) 393 const void *ptr)
427{ 394{
428 unsigned long lp = (unsigned long)&dt[entrynum]; 395 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
429 xmaddr_t mach_lp = virt_to_machine(lp);
430 u64 entry = *(u64 *)ptr; 396 u64 entry = *(u64 *)ptr;
431 397
432 preempt_disable(); 398 preempt_disable();
@@ -559,7 +525,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
559} 525}
560 526
561static void xen_load_sp0(struct tss_struct *tss, 527static void xen_load_sp0(struct tss_struct *tss,
562 struct thread_struct *thread) 528 struct thread_struct *thread)
563{ 529{
564 struct multicall_space mcs = xen_mc_entry(0); 530 struct multicall_space mcs = xen_mc_entry(0);
565 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 531 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -803,6 +769,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
803 ret = -EFAULT; 769 ret = -EFAULT;
804 break; 770 break;
805#endif 771#endif
772
773 case MSR_STAR:
774 case MSR_CSTAR:
775 case MSR_LSTAR:
776 case MSR_SYSCALL_MASK:
777 case MSR_IA32_SYSENTER_CS:
778 case MSR_IA32_SYSENTER_ESP:
779 case MSR_IA32_SYSENTER_EIP:
780 /* Fast syscall setup is all done in hypercalls, so
781 these are all ignored. Stub them out here to stop
782 Xen console noise. */
783 break;
784
806 default: 785 default:
807 ret = native_write_msr_safe(msr, low, high); 786 ret = native_write_msr_safe(msr, low, high);
808 } 787 }
@@ -846,8 +825,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
846 SetPagePinned(page); 825 SetPagePinned(page);
847 826
848 if (!PageHighMem(page)) { 827 if (!PageHighMem(page)) {
849 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 828 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
850 if (level == PT_PTE) 829 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
851 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 830 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
852 } else 831 } else
853 /* make sure there are no stray mappings of 832 /* make sure there are no stray mappings of
@@ -915,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
915 894
916 if (PagePinned(page)) { 895 if (PagePinned(page)) {
917 if (!PageHighMem(page)) { 896 if (!PageHighMem(page)) {
918 if (level == PT_PTE) 897 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
919 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 898 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
920 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 899 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
921 } 900 }
@@ -1220,6 +1199,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1220 .load_gs_index = xen_load_gs_index, 1199 .load_gs_index = xen_load_gs_index,
1221#endif 1200#endif
1222 1201
1202 .alloc_ldt = xen_alloc_ldt,
1203 .free_ldt = xen_free_ldt,
1204
1223 .store_gdt = native_store_gdt, 1205 .store_gdt = native_store_gdt,
1224 .store_idt = native_store_idt, 1206 .store_idt = native_store_idt,
1225 .store_tr = xen_store_tr, 1207 .store_tr = xen_store_tr,
@@ -1241,36 +1223,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1241 }, 1223 },
1242}; 1224};
1243 1225
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1262 .init_IRQ = __xen_init_IRQ,
1263 .save_fl = xen_save_fl,
1264 .restore_fl = xen_restore_fl,
1265 .irq_disable = xen_irq_disable,
1266 .irq_enable = xen_irq_enable,
1267 .safe_halt = xen_safe_halt,
1268 .halt = xen_halt,
1269#ifdef CONFIG_X86_64
1270 .adjust_exception_frame = xen_adjust_exception_frame,
1271#endif
1272};
1273
1274static const struct pv_apic_ops xen_apic_ops __initdata = { 1226static const struct pv_apic_ops xen_apic_ops __initdata = {
1275#ifdef CONFIG_X86_LOCAL_APIC 1227#ifdef CONFIG_X86_LOCAL_APIC
1276 .apic_write = xen_apic_write, 1228 .apic_write = xen_apic_write,
@@ -1664,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void)
1664 if (!xen_start_info) 1616 if (!xen_start_info)
1665 return; 1617 return;
1666 1618
1619 xen_domain_type = XEN_PV_DOMAIN;
1620
1667 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1621 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1668 1622
1669 xen_setup_features(); 1623 xen_setup_features();
@@ -1673,10 +1627,11 @@ asmlinkage void __init xen_start_kernel(void)
1673 pv_init_ops = xen_init_ops; 1627 pv_init_ops = xen_init_ops;
1674 pv_time_ops = xen_time_ops; 1628 pv_time_ops = xen_time_ops;
1675 pv_cpu_ops = xen_cpu_ops; 1629 pv_cpu_ops = xen_cpu_ops;
1676 pv_irq_ops = xen_irq_ops;
1677 pv_apic_ops = xen_apic_ops; 1630 pv_apic_ops = xen_apic_ops;
1678 pv_mmu_ops = xen_mmu_ops; 1631 pv_mmu_ops = xen_mmu_ops;
1679 1632
1633 xen_init_irq_ops();
1634
1680 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1635 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1681 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 1636 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1682 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 1637 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1700,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void)
1700 1655
1701 /* Prevent unwanted bits from being set in PTEs. */ 1656 /* Prevent unwanted bits from being set in PTEs. */
1702 __supported_pte_mask &= ~_PAGE_GLOBAL; 1657 __supported_pte_mask &= ~_PAGE_GLOBAL;
1703 if (!is_initial_xendomain()) 1658 if (!xen_initial_domain())
1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1659 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1705 1660
1706 /* Don't do the full vcpu_info placement stuff until we have a 1661 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1735,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void)
1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1690 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1691 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1737 1692
1738 if (!is_initial_xendomain()) { 1693 if (!xen_initial_domain()) {
1739 add_preferred_console("xenboot", 0, NULL); 1694 add_preferred_console("xenboot", 0, NULL);
1740 add_preferred_console("tty", 0, NULL); 1695 add_preferred_console("tty", 0, NULL);
1741 add_preferred_console("hvc", 0, NULL); 1696 add_preferred_console("hvc", 0, NULL);
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we're don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we're don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..64e58681767e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,8 +651,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
559 unsigned long limit) 655 unsigned long limit)
560{ 656{
561 int flush = 0; 657 int flush = 0;
562 unsigned hole_low, hole_high; 658 unsigned hole_low, hole_high;
@@ -590,8 +686,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 686 pmdidx_limit = 0;
591#endif 687#endif
592 688
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 689 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 690 pud_t *pud;
597 691
@@ -637,16 +731,22 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
637 } 731 }
638 } 732 }
639 } 733 }
734
640out: 735out:
736 /* Do the top level last, so that the callbacks can use it as
737 a cue to do final things like tlb flushes. */
738 flush |= (*func)(virt_to_page(pgd), PT_PGD);
641 739
642 return flush; 740 return flush;
643} 741}
644 742
645static spinlock_t *lock_pte(struct page *page) 743/* If we're using split pte locks, then take the page's lock and
744 return a pointer to it. Otherwise return NULL. */
745static spinlock_t *xen_pte_lock(struct page *page)
646{ 746{
647 spinlock_t *ptl = NULL; 747 spinlock_t *ptl = NULL;
648 748
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 749#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 750 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 751 spin_lock(ptl);
652#endif 752#endif
@@ -654,7 +754,7 @@ static spinlock_t *lock_pte(struct page *page)
654 return ptl; 754 return ptl;
655} 755}
656 756
657static void do_unlock(void *v) 757static void xen_pte_unlock(void *v)
658{ 758{
659 spinlock_t *ptl = v; 759 spinlock_t *ptl = v;
660 spin_unlock(ptl); 760 spin_unlock(ptl);
@@ -672,7 +772,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 772 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 773}
674 774
675static int pin_page(struct page *page, enum pt_level level) 775static int xen_pin_page(struct page *page, enum pt_level level)
676{ 776{
677 unsigned pgfl = TestSetPagePinned(page); 777 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 778 int flush;
@@ -691,21 +791,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 791
692 flush = 0; 792 flush = 0;
693 793
794 /*
795 * We need to hold the pagetable lock between the time
796 * we make the pagetable RO and when we actually pin
797 * it. If we don't, then other users may come in and
798 * attempt to update the pagetable by writing it,
799 * which will fail because the memory is RO but not
800 * pinned, so Xen won't do the trap'n'emulate.
801 *
802 * If we're using split pte locks, we can't hold the
803 * entire pagetable's worth of locks during the
804 * traverse, because we may wrap the preempt count (8
805 * bits). The solution is to mark RO and pin each PTE
806 * page while holding the lock. This means the number
807 * of locks we end up holding is never more than a
808 * batch size (~32 entries, at present).
809 *
810 * If we're not using split pte locks, we needn't pin
811 * the PTE pages independently, because we're
812 * protected by the overall pagetable lock.
813 */
694 ptl = NULL; 814 ptl = NULL;
695 if (level == PT_PTE) 815 if (level == PT_PTE)
696 ptl = lock_pte(page); 816 ptl = xen_pte_lock(page);
697 817
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 818 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 819 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 820 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 821
702 if (level == PT_PTE) 822 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 823 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 824
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 825 /* Queue a deferred unlock for when this batch
707 is completed. */ 826 is completed. */
708 xen_mc_callback(do_unlock, ptl); 827 xen_mc_callback(xen_pte_unlock, ptl);
709 } 828 }
710 } 829 }
711 830
@@ -719,7 +838,7 @@ void xen_pgd_pin(pgd_t *pgd)
719{ 838{
720 xen_mc_batch(); 839 xen_mc_batch();
721 840
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 841 if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 842 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 843 xen_mc_issue(0);
725 kmap_flush_unused(); 844 kmap_flush_unused();
@@ -733,14 +852,14 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 852 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 853
735 if (user_pgd) { 854 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 855 xen_pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 857 }
739 } 858 }
740#else /* CONFIG_X86_32 */ 859#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 860#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 861 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 862 xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif 863#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 864 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 865#endif /* CONFIG_X86_64 */
@@ -775,7 +894,7 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 894 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 895 * the book-keeping now.
777 */ 896 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 897static __init int xen_mark_pinned(struct page *page, enum pt_level level)
779{ 898{
780 SetPagePinned(page); 899 SetPagePinned(page);
781 return 0; 900 return 0;
@@ -783,10 +902,10 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 902
784void __init xen_mark_init_mm_pinned(void) 903void __init xen_mark_init_mm_pinned(void)
785{ 904{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 905 xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP);
787} 906}
788 907
789static int unpin_page(struct page *page, enum pt_level level) 908static int xen_unpin_page(struct page *page, enum pt_level level)
790{ 909{
791 unsigned pgfl = TestClearPagePinned(page); 910 unsigned pgfl = TestClearPagePinned(page);
792 911
@@ -796,10 +915,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 915 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 916 struct multicall_space mcs;
798 917
918 /*
919 * Do the converse to pin_page. If we're using split
920 * pte locks, we must be holding the lock for while
921 * the pte page is unpinned but still RO to prevent
922 * concurrent updates from seeing it in this
923 * partially-pinned state.
924 */
799 if (level == PT_PTE) { 925 if (level == PT_PTE) {
800 ptl = lock_pte(page); 926 ptl = xen_pte_lock(page);
801 927
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 928 if (ptl)
929 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 930 }
804 931
805 mcs = __xen_mc_entry(0); 932 mcs = __xen_mc_entry(0);
@@ -810,7 +937,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 937
811 if (ptl) { 938 if (ptl) {
812 /* unlock when batch completed */ 939 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 940 xen_mc_callback(xen_pte_unlock, ptl);
814 } 941 }
815 } 942 }
816 943
@@ -830,17 +957,17 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 957
831 if (user_pgd) { 958 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 959 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 960 xen_unpin_page(virt_to_page(user_pgd), PT_PGD);
834 } 961 }
835 } 962 }
836#endif 963#endif
837 964
838#ifdef CONFIG_X86_PAE 965#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 966 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 967 xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif 968#endif
842 969
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 970 xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT);
844 971
845 xen_mc_issue(0); 972 xen_mc_issue(0);
846} 973}
@@ -907,7 +1034,7 @@ static void drop_other_mm_ref(void *info)
907 } 1034 }
908} 1035}
909 1036
910static void drop_mm_ref(struct mm_struct *mm) 1037static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1038{
912 cpumask_t mask; 1039 cpumask_t mask;
913 unsigned cpu; 1040 unsigned cpu;
@@ -937,7 +1064,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1064 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1065}
939#else 1066#else
940static void drop_mm_ref(struct mm_struct *mm) 1067static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1068{
942 if (current->active_mm == mm) 1069 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1070 load_cr3(swapper_pg_dir);
@@ -961,14 +1088,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1088void xen_exit_mmap(struct mm_struct *mm)
962{ 1089{
963 get_cpu(); /* make sure we don't move around */ 1090 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1091 xen_drop_mm_ref(mm);
965 put_cpu(); 1092 put_cpu();
966 1093
967 spin_lock(&mm->page_table_lock); 1094 spin_lock(&mm->page_table_lock);
968 1095
969 /* pgd may not be pinned in the error exit path of execve */ 1096 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1097 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1098 xen_pgd_unpin(mm->pgd);
972 1099
973 spin_unlock(&mm->page_table_lock); 1100 spin_unlock(&mm->page_table_lock);
974} 1101}
1102
1103#ifdef CONFIG_XEN_DEBUG_FS
1104
1105static struct dentry *d_mmu_debug;
1106
1107static int __init xen_mmu_debugfs(void)
1108{
1109 struct dentry *d_xen = xen_init_debugfs();
1110
1111 if (d_xen == NULL)
1112 return -ENOMEM;
1113
1114 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1115
1116 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1117
1118 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1119 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1120 &mmu_stats.pgd_update_pinned);
1121 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1122 &mmu_stats.pgd_update_pinned);
1123
1124 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1125 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1126 &mmu_stats.pud_update_pinned);
1127 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1128 &mmu_stats.pud_update_pinned);
1129
1130 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1131 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1132 &mmu_stats.pmd_update_pinned);
1133 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1134 &mmu_stats.pmd_update_pinned);
1135
1136 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1137// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1138// &mmu_stats.pte_update_pinned);
1139 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1140 &mmu_stats.pte_update_pinned);
1141
1142 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1143 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1144 &mmu_stats.mmu_update_extended);
1145 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1146 mmu_stats.mmu_update_histo, 20);
1147
1148 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1149 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1150 &mmu_stats.set_pte_at_batched);
1151 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1152 &mmu_stats.set_pte_at_current);
1153 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1154 &mmu_stats.set_pte_at_kernel);
1155
1156 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1157 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1158 &mmu_stats.prot_commit_batched);
1159
1160 return 0;
1161}
1162fs_initcall(xen_mmu_debugfs);
1163
1164#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c9776..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1d..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
92 cpu_idle(); 93 cpu_idle();
93} 94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb3..3b9bda46487a 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -122,7 +122,7 @@ check_events:
122 push %r9 122 push %r9
123 push %r10 123 push %r10
124 push %r11 124 push %r11
125 call force_evtchn_callback 125 call xen_force_evtchn_callback
126 pop %r11 126 pop %r11
127 pop %r10 127 pop %r10
128 pop %r9 128 pop %r9
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2e..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}