aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/cpu/common.c11
-rw-r--r--arch/x86/kernel/ldt.c9
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c27
-rw-r--r--arch/x86/kernel/process_32.c39
-rw-r--r--arch/x86/kernel/process_64.c22
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/tlb_32.c8
-rw-r--r--arch/x86/mm/fault.c14
-rw-r--r--arch/x86/xen/Kconfig12
-rw-r--r--arch/x86/xen/Makefile12
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c252
-rw-r--r--arch/x86/xen/irq.c143
-rw-r--r--arch/x86/xen/mmu.c314
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c115
-rw-r--r--arch/x86/xen/smp.c245
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-asm_32.S2
-rw-r--r--arch/x86/xen/xen-asm_64.S22
-rw-r--r--arch/x86/xen/xen-ops.h8
-rw-r--r--drivers/block/xen-blkfront.c2
-rw-r--r--drivers/char/hvc_xen.c6
-rw-r--r--drivers/input/xen-kbdfront.c4
-rw-r--r--drivers/net/xen-netfront.c6
-rw-r--r--drivers/video/xen-fbfront.c4
-rw-r--r--drivers/xen/Makefile1
-rw-r--r--drivers/xen/balloon.c175
-rw-r--r--drivers/xen/cpu_hotplug.c90
-rw-r--r--drivers/xen/events.c40
-rw-r--r--drivers/xen/grant-table.c2
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c8
-rw-r--r--include/asm-x86/desc.h15
-rw-r--r--include/asm-x86/paravirt.h20
-rw-r--r--include/asm-x86/smp.h34
-rw-r--r--include/asm-x86/spinlock.h67
-rw-r--r--include/asm-x86/tlbflush.h10
-rw-r--r--include/asm-x86/xen/hypervisor.h14
-rw-r--r--include/linux/kernel.h2
-rw-r--r--include/linux/mm.h6
-rw-r--r--include/linux/mm_types.h10
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/xen/balloon.h61
-rw-r--r--include/xen/events.h2
-rw-r--r--lib/cmdline.c2
50 files changed, 1696 insertions, 846 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index c9be69fedb70..7b655b5bb9ab 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif 14endif
15 15
16# 16#
@@ -90,7 +90,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
91obj-$(CONFIG_KVM_GUEST) += kvm.o 91obj-$(CONFIG_KVM_GUEST) += kvm.o
92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
95 95
96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7581b62df184..fb789dd9e691 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,16 +1121,5 @@ void __cpuinit cpu_init(void)
1121 xsave_init(); 1121 xsave_init();
1122} 1122}
1123 1123
1124#ifdef CONFIG_HOTPLUG_CPU
1125void __cpuinit cpu_uninit(void)
1126{
1127 int cpu = raw_smp_processor_id();
1128 cpu_clear(cpu, cpu_initialized);
1129
1130 /* lazy TLB state */
1131 per_cpu(cpu_tlbstate, cpu).state = 0;
1132 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1133}
1134#endif
1135 1124
1136#endif 1125#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0ed5f939b905..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -52,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
53 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
54 54
55 paravirt_alloc_ldt(newldt, mincount);
56
55#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
56 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
57 wmb(); 59 wmb();
@@ -74,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
74#endif 76#endif
75 } 77 }
76 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
77 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
78 vfree(oldldt); 81 vfree(oldldt);
79 else 82 else
@@ -85,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
85static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
86{ 89{
87 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
88 92
89 if (err < 0) 93 if (err < 0)
90 return err; 94 return err;
91 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
92 return 0; 98 return 0;
93} 99}
94 100
@@ -125,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
125 if (mm == current->active_mm) 131 if (mm == current->active_mm)
126 clear_LDT(); 132 clear_LDT();
127#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
128 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
129 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
130 else 137 else
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
/*
 * Default spin_lock_flags hook: take the lock with the plain raw spinlock
 * primitive.  The caller's saved interrupt flags are not used.
 */
static void default_spin_lock_flags(struct raw_spinlock *lock,
				    unsigned long flags)
{
	/* 'flags' is deliberately ignored by this implementation. */
	__raw_spin_lock(lock);
}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6b0bb73998dd..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -349,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
349 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
350 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
351 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
352 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
353 346
354#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -460,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
460 .set_fixmap = native_set_fixmap, 453 .set_fixmap = native_set_fixmap,
461}; 454};
462 455
463struct pv_lock_ops pv_lock_ops = {
464#ifdef CONFIG_SMP
465 .spin_is_locked = __ticket_spin_is_locked,
466 .spin_is_contended = __ticket_spin_is_contended,
467
468 .spin_lock = __ticket_spin_lock,
469 .spin_trylock = __ticket_spin_trylock,
470 .spin_unlock = __ticket_spin_unlock,
471#endif
472};
473EXPORT_SYMBOL(pv_lock_ops);
474
475EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
476EXPORT_SYMBOL (pv_cpu_ops); 457EXPORT_SYMBOL (pv_cpu_ops);
477EXPORT_SYMBOL (pv_mmu_ops); 458EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 205188db9626..922c14058f97 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
76 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
77} 77}
78 78
79#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
80#include <asm/nmi.h>
81
82static void cpu_exit_clear(void)
83{
84 int cpu = raw_smp_processor_id();
85
86 idle_task_exit();
87
88 cpu_uninit();
89 irq_ctx_exit(cpu);
90
91 cpu_clear(cpu, cpu_callout_map);
92 cpu_clear(cpu, cpu_callin_map);
93
94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
96}
97
98/* We don't actually take CPU down, just spin without interrupts. */
99static inline void play_dead(void)
100{
101 /* This must be done before dead CPU ack */
102 cpu_exit_clear();
103 mb();
104 /* Ack it */
105 __get_cpu_var(cpu_state) = CPU_DEAD;
106
107 /*
108 * With physical CPU hotplug, we should halt the cpu
109 */
110 local_irq_disable();
111 /* mask all interrupts, flush any and all caches, and halt */
112 wbinvd_halt();
113}
114#else
115static inline void play_dead(void) 80static inline void play_dead(void)
116{ 81{
117 BUG(); 82 BUG();
118} 83}
119#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
120 85
121/* 86/*
122 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6b508ea7110..ca80394ef5b8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -86,30 +86,12 @@ void exit_idle(void)
86 __exit_idle(); 86 __exit_idle();
87} 87}
88 88
89#ifdef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_SMP
90DECLARE_PER_CPU(int, cpu_state);
91
92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void)
95{
96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 local_irq_disable();
104 /* mask all interrupts, flush any and all caches, and halt */
105 wbinvd_halt();
106}
107#else
108static inline void play_dead(void) 90static inline void play_dead(void)
109{ 91{
110 BUG(); 92 BUG();
111} 93}
112#endif /* CONFIG_HOTPLUG_CPU */ 94#endif
113 95
114/* 96/*
115 * The idle thread. There's no useful work to be 97 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9056f7e272c0..76b6f50978f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -1344,25 +1345,9 @@ static void __ref remove_cpu_from_maps(int cpu)
1344 numa_remove_cpu(cpu); 1345 numa_remove_cpu(cpu);
1345} 1346}
1346 1347
1347int __cpu_disable(void) 1348void cpu_disable_common(void)
1348{ 1349{
1349 int cpu = smp_processor_id(); 1350 int cpu = smp_processor_id();
1350
1351 /*
1352 * Perhaps use cpufreq to drop frequency, but that could go
1353 * into generic code.
1354 *
1355 * We won't take down the boot processor on i386 due to some
1356 * interrupts only being able to be serviced by the BSP.
1357 * Especially so if we're not using an IOAPIC -zwane
1358 */
1359 if (cpu == 0)
1360 return -EBUSY;
1361
1362 if (nmi_watchdog == NMI_LOCAL_APIC)
1363 stop_apic_nmi_watchdog(NULL);
1364 clear_local_APIC();
1365
1366 /* 1351 /*
1367 * HACK: 1352 * HACK:
1368 * Allow any queued timer interrupts to get serviced 1353 * Allow any queued timer interrupts to get serviced
@@ -1380,10 +1365,32 @@ int __cpu_disable(void)
1380 remove_cpu_from_maps(cpu); 1365 remove_cpu_from_maps(cpu);
1381 unlock_vector_lock(); 1366 unlock_vector_lock();
1382 fixup_irqs(cpu_online_map); 1367 fixup_irqs(cpu_online_map);
1368}
1369
1370int native_cpu_disable(void)
1371{
1372 int cpu = smp_processor_id();
1373
1374 /*
1375 * Perhaps use cpufreq to drop frequency, but that could go
1376 * into generic code.
1377 *
1378 * We won't take down the boot processor on i386 due to some
1379 * interrupts only being able to be serviced by the BSP.
1380 * Especially so if we're not using an IOAPIC -zwane
1381 */
1382 if (cpu == 0)
1383 return -EBUSY;
1384
1385 if (nmi_watchdog == NMI_LOCAL_APIC)
1386 stop_apic_nmi_watchdog(NULL);
1387 clear_local_APIC();
1388
1389 cpu_disable_common();
1383 return 0; 1390 return 0;
1384} 1391}
1385 1392
1386void __cpu_die(unsigned int cpu) 1393void native_cpu_die(unsigned int cpu)
1387{ 1394{
1388 /* We don't do anything here: idle task is faking death itself. */ 1395 /* We don't do anything here: idle task is faking death itself. */
1389 unsigned int i; 1396 unsigned int i;
@@ -1400,15 +1407,45 @@ void __cpu_die(unsigned int cpu)
1400 } 1407 }
1401 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1408 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1402} 1409}
1410
1411void play_dead_common(void)
1412{
1413 idle_task_exit();
1414 reset_lazy_tlbstate();
1415 irq_ctx_exit(raw_smp_processor_id());
1416 c1e_remove_cpu(raw_smp_processor_id());
1417
1418 mb();
1419 /* Ack it */
1420 __get_cpu_var(cpu_state) = CPU_DEAD;
1421
1422 /*
1423 * With physical CPU hotplug, we should halt the cpu
1424 */
1425 local_irq_disable();
1426}
1427
1428void native_play_dead(void)
1429{
1430 play_dead_common();
1431 wbinvd_halt();
1432}
1433
1403#else /* ... !CONFIG_HOTPLUG_CPU */ 1434#else /* ... !CONFIG_HOTPLUG_CPU */
1404int __cpu_disable(void) 1435int native_cpu_disable(void)
1405{ 1436{
1406 return -ENOSYS; 1437 return -ENOSYS;
1407} 1438}
1408 1439
1409void __cpu_die(unsigned int cpu) 1440void native_cpu_die(unsigned int cpu)
1410{ 1441{
1411 /* We said "no" in __cpu_disable */ 1442 /* We said "no" in __cpu_disable */
1412 BUG(); 1443 BUG();
1413} 1444}
1445
1446void native_play_dead(void)
1447{
1448 BUG();
1449}
1450
1414#endif 1451#endif
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8f92cac4e6db..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -914,15 +914,15 @@ LIST_HEAD(pgd_list);
914 914
915void vmalloc_sync_all(void) 915void vmalloc_sync_all(void)
916{ 916{
917#ifdef CONFIG_X86_32
918 unsigned long start = VMALLOC_START & PGDIR_MASK;
919 unsigned long address; 917 unsigned long address;
920 918
919#ifdef CONFIG_X86_32
921 if (SHARED_KERNEL_PMD) 920 if (SHARED_KERNEL_PMD)
922 return; 921 return;
923 922
924 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 923 for (address = VMALLOC_START & PMD_MASK;
925 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 924 address >= TASK_SIZE && address < FIXADDR_TOP;
925 address += PMD_SIZE) {
926 unsigned long flags; 926 unsigned long flags;
927 struct page *page; 927 struct page *page;
928 928
@@ -935,10 +935,8 @@ void vmalloc_sync_all(void)
935 spin_unlock_irqrestore(&pgd_lock, flags); 935 spin_unlock_irqrestore(&pgd_lock, flags);
936 } 936 }
937#else /* CONFIG_X86_64 */ 937#else /* CONFIG_X86_64 */
938 unsigned long start = VMALLOC_START & PGDIR_MASK; 938 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
939 unsigned long address; 939 address += PGDIR_SIZE) {
940
941 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
942 const pgd_t *pgd_ref = pgd_offset_k(address); 940 const pgd_t *pgd_ref = pgd_offset_k(address);
943 unsigned long flags; 941 unsigned long flags;
944 struct page *page; 942 struct page *page;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f470..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
26 26
27config XEN_SAVE_RESTORE 27config XEN_SAVE_RESTORE
28 bool 28 bool
29 depends on PM 29 depends on XEN && PM
30 default y \ No newline at end of file 30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed2..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
2 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 10
4obj-$(CONFIG_SMP) += smp.o 11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
/*
 * Describes one array exported through debugfs.  An instance is handed to
 * debugfs_create_file() as the file's private data and read back from
 * inode->i_private by the read handler.
 */
struct array_data {
	void *array;		/* first element; formatted as u32 values */
	unsigned elements;	/* number of entries in 'array' */
};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
/*
 * Render 'array_size' values from 'array' as text.  Each element is
 * formatted with 'fmt' (expected to hold a single conversion for one u32),
 * elements are separated by one space and the last element is followed by
 * '\n'.
 *
 * 'buf' may be NULL (with 'bufsize' 0) to only compute the space required.
 * When 'buf' is given but too small, writing stops at the first element that
 * does not fit together with its separator and the terminating NUL; nothing
 * is ever written beyond 'bufsize' bytes, and whatever was written stays
 * NUL-terminated as long as 'bufsize' is non-zero.
 *
 * Returns the number of bytes needed for the complete text, including the
 * terminating NUL, independent of how much was actually written.
 */
static size_t format_array(char *buf, size_t bufsize, const char *fmt,
			   u32 *array, unsigned array_size)
{
	size_t ret = 0;
	unsigned i;

	for (i = 0; i < array_size; i++) {
		size_t len;
		int n;

		n = snprintf(buf, bufsize, fmt, array[i]);
		if (n < 0)
			n = 0;	/* treat a formatting error as empty output */
		len = (size_t)n + 1;	/* element plus ' ' or '\n' */
		ret += len;

		if (!buf)
			continue;	/* sizing pass only */

		if (len >= bufsize) {
			/*
			 * Not enough room for this element, its separator
			 * and the final NUL.  snprintf() has already
			 * NUL-terminated what fit (if bufsize > 0), so keep
			 * counting but stop writing.
			 */
			buf = NULL;
			bufsize = 0;
			continue;
		}

		buf += len;
		bufsize -= len;
		buf[-1] = (i == array_size - 1) ? '\n' : ' ';
	}

	ret++;	/* \0 */
	if (buf && bufsize)
		*buf = '\0';

	return ret;
}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a27d562a9744..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,7 +30,6 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
36#include <xen/hvc-console.h> 35#include <xen/hvc-console.h>
@@ -58,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
58DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
59DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
60 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
61/* 63/*
62 * Identity map, in addition to plain kernel map. This needs to be 64 * Identity map, in addition to plain kernel map. This needs to be
63 * large enough to allocate page table pages to allocate the rest. 65 * large enough to allocate page table pages to allocate the rest.
@@ -111,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
111 * 113 *
112 * 0: not available, 1: available 114 * 0: not available, 1: available
113 */ 115 */
114static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
123
115 124
116static void xen_vcpu_setup(int cpu) 125static void xen_vcpu_setup(int cpu)
117{ 126{
@@ -227,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
227 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
228} 237}
229 238
230static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
231{ 240{
232 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
233 unsigned long flags; 242 xen_mc_flush();
234
235 vcpu = x86_read_percpu(xen_vcpu);
236
237 /* flag has opposite sense of mask */
238 flags = !vcpu->evtchn_upcall_mask;
239
240 /* convert to IF type flag
241 -0 -> 0x00000000
242 -1 -> 0xffffffff
243 */
244 return (-flags) & X86_EFLAGS_IF;
245} 243}
246 244
247static void xen_restore_fl(unsigned long flags) 245static unsigned long xen_store_tr(void)
248{ 246{
249 struct vcpu_info *vcpu; 247 return 0;
250
251 /* convert from IF type flag */
252 flags = !(flags & X86_EFLAGS_IF);
253
254 /* There's a one instruction preempt window here. We need to
255 make sure we're don't switch CPUs between getting the vcpu
256 pointer and updating the mask. */
257 preempt_disable();
258 vcpu = x86_read_percpu(xen_vcpu);
259 vcpu->evtchn_upcall_mask = flags;
260 preempt_enable_no_resched();
261
262 /* Doesn't matter if we get preempted here, because any
263 pending event will get dealt with anyway. */
264
265 if (flags == 0) {
266 preempt_check_resched();
267 barrier(); /* unmask then check (avoid races) */
268 if (unlikely(vcpu->evtchn_upcall_pending))
269 force_evtchn_callback();
270 }
271} 248}
272 249
273static void xen_irq_disable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
274{ 257{
275 /* There's a one instruction preempt window here. We need to 258 int level;
276 make sure we're don't switch CPUs between getting the vcpu 259 pte_t *ptep;
277 pointer and updating the mask. */ 260 pte_t pte;
278 preempt_disable(); 261 unsigned long pfn;
279 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 262 struct page *page;
280 preempt_enable_no_resched();
281}
282 263
283static void xen_irq_enable(void) 264 ptep = lookup_address((unsigned long)v, &level);
284{ 265 BUG_ON(ptep == NULL);
285 struct vcpu_info *vcpu;
286 266
287 /* We don't need to worry about being preempted here, since 267 pfn = pte_pfn(*ptep);
288 either a) interrupts are disabled, so no preemption, or b) 268 page = pfn_to_page(pfn);
289 the caller is confused and is trying to re-enable interrupts
290 on an indeterminate processor. */
291 269
292 vcpu = x86_read_percpu(xen_vcpu); 270 pte = pfn_pte(pfn, prot);
293 vcpu->evtchn_upcall_mask = 0;
294 271
295 /* Doesn't matter if we get preempted here, because any 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
296 pending event will get dealt with anyway. */ 273 BUG();
297 274
298 barrier(); /* unmask then check (avoid races) */ 275 if (!PageHighMem(page)) {
299 if (unlikely(vcpu->evtchn_upcall_pending)) 276 void *av = __va(PFN_PHYS(pfn));
300 force_evtchn_callback();
301}
302 277
303static void xen_safe_halt(void) 278 if (av != v)
304{ 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
305 /* Blocking includes an implicit local_irq_enable(). */ 280 BUG();
306 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 281 } else
307 BUG(); 282 kmap_flush_unused();
308} 283}
309 284
310static void xen_halt(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
311{ 286{
312 if (irqs_disabled()) 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
313 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 288 int i;
314 else
315 xen_safe_halt();
316}
317 289
318static void xen_leave_lazy(void) 290 for(i = 0; i < entries; i += entries_per_page)
319{ 291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
320 paravirt_leave_lazy(paravirt_get_lazy_mode());
321 xen_mc_flush();
322} 292}
323 293
324static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
325{ 295{
326 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
327} 301}
328 302
329static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -426,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
426static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
427 const void *ptr) 401 const void *ptr)
428{ 402{
429 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
430 xmaddr_t mach_lp = virt_to_machine(lp);
431 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
432 405
433 preempt_disable(); 406 preempt_disable();
@@ -560,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
560} 533}
561 534
562static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
563 struct thread_struct *thread) 536 struct thread_struct *thread)
564{ 537{
565 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
566 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -835,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
835 ret = -EFAULT; 808 ret = -EFAULT;
836 break; 809 break;
837#endif 810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
838 default: 824 default:
839 ret = native_write_msr_safe(msr, low, high); 825 ret = native_write_msr_safe(msr, low, high);
840 } 826 }
@@ -878,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
878 SetPagePinned(page); 864 SetPagePinned(page);
879 865
880 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
881 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
882 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
883 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
884 } else 870 } else
885 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -947,7 +933,7 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level)
947 933
948 if (PagePinned(page)) { 934 if (PagePinned(page)) {
949 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
950 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
951 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
952 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
953 } 939 }
@@ -994,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
994} 980}
995#endif 981#endif
996 982
983#ifdef CONFIG_X86_32
997static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
998{ 985{
999 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -1012,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1012 999
1013 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
1014} 1001}
1002#endif
1015 1003
1016static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
1017{ 1005{
@@ -1078,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void)
1078 1066
1079 /* xen_vcpu_setup managed to place the vcpu_info within the 1067 /* xen_vcpu_setup managed to place the vcpu_info within the
1080 percpu area for all cpus, so make use of it */ 1068 percpu area for all cpus, so make use of it */
1081#ifdef CONFIG_X86_32
1082 if (have_vcpu_info_placement) { 1069 if (have_vcpu_info_placement) {
1083 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1070 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1084 1071
@@ -1088,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void)
1088 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1075 pv_irq_ops.irq_enable = xen_irq_enable_direct;
1089 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1076 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1090 } 1077 }
1091#endif
1092} 1078}
1093 1079
1094static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1080static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1109,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1109 goto patch_site 1095 goto patch_site
1110 1096
1111 switch (type) { 1097 switch (type) {
1112#ifdef CONFIG_X86_32
1113 SITE(pv_irq_ops, irq_enable); 1098 SITE(pv_irq_ops, irq_enable);
1114 SITE(pv_irq_ops, irq_disable); 1099 SITE(pv_irq_ops, irq_disable);
1115 SITE(pv_irq_ops, save_fl); 1100 SITE(pv_irq_ops, save_fl);
1116 SITE(pv_irq_ops, restore_fl); 1101 SITE(pv_irq_ops, restore_fl);
1117#endif /* CONFIG_X86_32 */
1118#undef SITE 1102#undef SITE
1119 1103
1120 patch_site: 1104 patch_site:
@@ -1252,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1252 .load_gs_index = xen_load_gs_index, 1236 .load_gs_index = xen_load_gs_index,
1253#endif 1237#endif
1254 1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1241
1255 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1256 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
1257 .store_tr = xen_store_tr, 1244 .store_tr = xen_store_tr,
@@ -1273,36 +1260,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1273 }, 1260 },
1274}; 1261};
1275 1262
1276static void __init __xen_init_IRQ(void)
1277{
1278#ifdef CONFIG_X86_64
1279 int i;
1280
1281 /* Create identity vector->irq map */
1282 for(i = 0; i < NR_VECTORS; i++) {
1283 int cpu;
1284
1285 for_each_possible_cpu(cpu)
1286 per_cpu(vector_irq, cpu)[i] = i;
1287 }
1288#endif /* CONFIG_X86_64 */
1289
1290 xen_init_IRQ();
1291}
1292
1293static const struct pv_irq_ops xen_irq_ops __initdata = {
1294 .init_IRQ = __xen_init_IRQ,
1295 .save_fl = xen_save_fl,
1296 .restore_fl = xen_restore_fl,
1297 .irq_disable = xen_irq_disable,
1298 .irq_enable = xen_irq_enable,
1299 .safe_halt = xen_safe_halt,
1300 .halt = xen_halt,
1301#ifdef CONFIG_X86_64
1302 .adjust_exception_frame = xen_adjust_exception_frame,
1303#endif
1304};
1305
1306static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1307#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1308 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
@@ -1443,7 +1400,7 @@ static void __init xen_reserve_top(void)
1443 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1444 top = pp.virt_start; 1401 top = pp.virt_start;
1445 1402
1446 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1447#endif /* CONFIG_X86_32 */ 1404#endif /* CONFIG_X86_32 */
1448} 1405}
1449 1406
@@ -1477,48 +1434,11 @@ static void *m2v(phys_addr_t maddr)
1477 return __ka(m2p(maddr)); 1434 return __ka(m2p(maddr));
1478} 1435}
1479 1436
1480#ifdef CONFIG_X86_64
1481static void walk(pgd_t *pgd, unsigned long addr)
1482{
1483 unsigned l4idx = pgd_index(addr);
1484 unsigned l3idx = pud_index(addr);
1485 unsigned l2idx = pmd_index(addr);
1486 unsigned l1idx = pte_index(addr);
1487 pgd_t l4;
1488 pud_t l3;
1489 pmd_t l2;
1490 pte_t l1;
1491
1492 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1493 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1494
1495 l4 = pgd[l4idx];
1496 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1497 xen_raw_printk(" %016lx\n", pgd_val(l4));
1498
1499 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1500 xen_raw_printk(" l3: %016lx\n", l3.pud);
1501 xen_raw_printk(" %016lx\n", pud_val(l3));
1502
1503 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1504 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1505 xen_raw_printk(" %016lx\n", pmd_val(l2));
1506
1507 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1508 xen_raw_printk(" l1: %016lx\n", l1.pte);
1509 xen_raw_printk(" %016lx\n", pte_val(l1));
1510}
1511#endif
1512
1513static void set_page_prot(void *addr, pgprot_t prot) 1437static void set_page_prot(void *addr, pgprot_t prot)
1514{ 1438{
1515 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1516 pte_t pte = pfn_pte(pfn, prot); 1440 pte_t pte = pfn_pte(pfn, prot);
1517 1441
1518 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1519 addr, pfn, get_phys_to_machine(pfn),
1520 pgprot_val(prot), pte.pte);
1521
1522 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) 1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1523 BUG(); 1443 BUG();
1524} 1444}
@@ -1694,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void)
1694 if (!xen_start_info) 1614 if (!xen_start_info)
1695 return; 1615 return;
1696 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1697 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1698 1620
1699 xen_setup_features(); 1621 xen_setup_features();
@@ -1703,10 +1625,11 @@ asmlinkage void __init xen_start_kernel(void)
1703 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1704 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1705 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1706 pv_irq_ops = xen_irq_ops;
1707 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1708 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1709 1630
1631 xen_init_irq_ops();
1632
1710#ifdef CONFIG_X86_LOCAL_APIC 1633#ifdef CONFIG_X86_LOCAL_APIC
1711 /* 1634 /*
1712 * set up the basic apic ops. 1635 * set up the basic apic ops.
@@ -1737,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void)
1737 1660
1738 /* Prevent unwanted bits from being set in PTEs. */ 1661 /* Prevent unwanted bits from being set in PTEs. */
1739 __supported_pte_mask &= ~_PAGE_GLOBAL; 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1740 if (!is_initial_xendomain()) 1663 if (!xen_initial_domain())
1741 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1742 1665
1743 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1772,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void)
1772 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1773 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1774 1697
1775 if (!is_initial_xendomain()) { 1698 if (!xen_initial_domain()) {
1776 add_preferred_console("xenboot", 0, NULL); 1699 add_preferred_console("xenboot", 0, NULL);
1777 add_preferred_console("tty", 0, NULL); 1700 add_preferred_console("tty", 0, NULL);
1778 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
@@ -1780,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void)
1780 1703
1781 xen_raw_console_write("about to get started...\n"); 1704 xen_raw_console_write("about to get started...\n");
1782 1705
1783#if 0
1784 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1785 &boot_params, __pa_symbol(&boot_params),
1786 __va(__pa_symbol(&boot_params)));
1787
1788 walk(pgd, &boot_params);
1789 walk(pgd, __va(__pa(&boot_params)));
1790#endif
1791
1792 /* Start the world */ 1706 /* Start the world */
1793#ifdef CONFIG_X86_32 1707#ifdef CONFIG_X86_32
1794 i386_start_kernel(); 1708 i386_start_kernel();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we're don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we're don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 857
735 if (user_pgd) { 858 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 861 }
739 } 862 }
740#else /* CONFIG_X86_32 */ 863#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 864#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 865 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
744#endif 868#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 870#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 871 xen_mc_issue(0);
748} 872}
749 873
874static void xen_pgd_pin(struct mm_struct *mm)
875{
876 __xen_pgd_pin(mm, mm->pgd);
877}
878
750/* 879/*
751 * On save, we need to pin all pagetables to make sure they get their 880 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 882 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
755 */ 888 */
756void xen_mm_pin_all(void) 889void xen_mm_pin_all(void)
757{ 890{
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
762 895
763 list_for_each_entry(page, &pgd_list, lru) { 896 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 897 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 899 SetPageSavePinned(page);
767 } 900 }
768 } 901 }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 908 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 909 * the book-keeping now.
777 */ 910 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
779{ 913{
780 SetPagePinned(page); 914 SetPagePinned(page);
781 return 0; 915 return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 917
784void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
785{ 919{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 921}
788 922
789static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
790{ 925{
791 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
792 927
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 932 struct multicall_space mcs;
798 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
799 if (level == PT_PTE) { 941 if (level == PT_PTE) {
800 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
801 943
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 946 }
804 947
805 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 953
811 if (ptl) { 954 if (ptl) {
812 /* unlock when batch completed */ 955 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
814 } 957 }
815 } 958 }
816 959
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 961}
819 962
820/* Release a pagetables pages back as normal RW */ 963/* Release a pagetables pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 965{
823 xen_mc_batch(); 966 xen_mc_batch();
824 967
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 973
831 if (user_pgd) { 974 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 977 }
835 } 978 }
836#endif 979#endif
837 980
838#ifdef CONFIG_X86_PAE 981#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 982 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
841#endif 985#endif
842 986
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 988
845 xen_mc_issue(0); 989 xen_mc_issue(0);
846} 990}
847 991
/* Convenience wrapper: run __xen_pgd_unpin() on the mm's own pgd (mm->pgd). */
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
848/* 997/*
849 * On resume, undo any pinning done at save, so that the rest of the 998 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 999 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1008 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1009 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1010 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1012 ClearPageSavePinned(page);
864 } 1013 }
865 } 1014 }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1020{
872 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
875} 1024}
876 1025
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1027{
879 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
882} 1031}
883 1032
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
907 } 1056 }
908} 1057}
909 1058
910static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1060{
912 cpumask_t mask; 1061 cpumask_t mask;
913 unsigned cpu; 1062 unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1087}
939#else 1088#else
940static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1090{
942 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
962{ 1111{
963 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
965 put_cpu(); 1114 put_cpu();
966 1115
967 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
968 1117
969 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1119 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
972 1121
973 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
974} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_pinned);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_pinned);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_pinned);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_pinned);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e3..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
19void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
20 20
21void xen_pgd_pin(pgd_t *pgd);
22//void xen_pgd_unpin(pgd_t *pgd);
23
24pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c9776..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
/*
 * Why a multicall batch was flushed before the caller asked for it.
 * The values double as indices into a per-reason counter array, so
 * they must stay dense and start at zero.
 */
enum flush_reasons {
	FL_SLOTS = 0,		/* 0: ran out of multicall entry slots */
	FL_ARGS,		/* 1: ran out of argument space */
	FL_CALLBACKS,		/* 2: ran out of callback slots */

	FL_N_REASONS		/* number of reasons above */
};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
/* Count one early flush of the multicall buffer for reason @idx. */
106static void mc_stats_flush(enum flush_reasons idx)
107{
/* honour a pending reset request before counting */
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
/* No-op stand-in used when CONFIG_XEN_DEBUG_FS is not set. */
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
/* No-op stand-in used when CONFIG_XEN_DEBUG_FS is not set. */
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1d..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
/* Run the common cpu_bringup() sequence, then enter cpu_idle(). */
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
93 cpu_idle();
94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
/*
 * Take the calling vcpu down with VCPUOP_down.  Once that hypercall
 * returns, cpu_bringup() is run again on this cpu.
 */
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
/* !CONFIG_HOTPLUG_CPU variant: disabling a cpu is not supported. */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
/* !CONFIG_HOTPLUG_CPU variant: must never be called. */
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
/* !CONFIG_HOTPLUG_CPU variant: must never be called. */
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
/* Timestamp for the start of a measured interval: the current xen_clocksource_read() value. */
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
/* !CONFIG_XEN_DEBUG_FS variant: no timing is done, so the start value is a constant 0. */
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
/* No-op stand-in used when CONFIG_XEN_DEBUG_FS is not set. */
107static inline void spin_time_accum_total(u64 start)
108{
109}
/* No-op stand-in used when CONFIG_XEN_DEBUG_FS is not set. */
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
/* No-op stand-in used when CONFIG_XEN_DEBUG_FS is not set. */
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
/*
 * Xen's view of a spinlock: a byte lock plus a count of cpus that
 * have announced themselves as waiters.
 */
struct xen_spinlock {
	/* 0 -> free; 1 -> locked */
	unsigned char lock;
	/* count of waiting cpus */
	unsigned short spinners;
};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
/*
 * Slow path of lock acquisition: block on this cpu's kicker irq
 * instead of spinning.
 *
 * @lock:       the lock being acquired
 * @irq_enable: if true, interrupts are enabled while waiting; the
 *              saved flags are restored before returning
 *
 * Returns non-zero if the lock was picked up here by the trylock, and
 * 0 otherwise (kicker irq not set up, or woken by the irq), in which
 * case the caller has to try to take the lock again.
 */
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
/* remember the current irq state; it is restored at "out" */
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
/* only reached after a real kick, not on the trylock pickup path */
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
/*
 * Common lock acquisition, shared by xen_spin_lock() and
 * xen_spin_lock_flags().
 *
 * Each round first busy-waits: the asm exchanges 1 into the lock byte
 * and finishes if the old value was 0; otherwise it pauses and
 * re-reads the byte, retrying the exchange as soon as it reads 0, and
 * gives up after TIMEOUT pause iterations.  If the lock was not
 * obtained, the slow path is entered (skipped when TIMEOUT is ~0) and
 * the round is repeated unless the slow path reports success.
 *
 * @irq_enable is passed through to xen_spin_lock_slow().
 */
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
/* %0 = lock byte, %1 = oldval (preloaded with 1), %2 = timeout */
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
/* oldval == 0 means the exchange above took the lock */
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
/* Take the lock; the slow path is never asked to enable interrupts. */
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
/*
 * Take the lock; the slow path may enable interrupts while waiting if
 * @flags says they were not disabled.
 */
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
/*
 * Handler registered for the spinlock kicker irq.  It is not expected
 * to run (the irq is disabled right after binding), hence the BUG().
 */
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb3..05794c566e87 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */ 26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000 27#define XEN_EFLAGS_NMI 0x80000000
28 28
29#if 0 29#if 1
30#include <asm/percpu.h> 30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
31 38
32/* 39/*
33 Enable events. This clears the event mask and tests the pending 40 Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
35 events, then enter the hypervisor to get them handled. 42 events, then enter the hypervisor to get them handled.
36 */ 43 */
37ENTRY(xen_irq_enable_direct) 44ENTRY(xen_irq_enable_direct)
45 BUG
46
38 /* Unmask events */ 47 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40 49
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
58 non-zero. 67 non-zero.
59 */ 68 */
60ENTRY(xen_irq_disable_direct) 69ENTRY(xen_irq_disable_direct)
70 BUG
71
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct) 73ENDPATCH(xen_irq_disable_direct)
63 ret 74 ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
74 Xen and x86 use opposite senses (mask vs enable). 85 Xen and x86 use opposite senses (mask vs enable).
75 */ 86 */
76ENTRY(xen_save_fl_direct) 87ENTRY(xen_save_fl_direct)
88 BUG
89
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah 91 setz %ah
79 addb %ah,%ah 92 addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
91 if so. 104 if so.
92 */ 105 */
93ENTRY(xen_restore_fl_direct) 106ENTRY(xen_restore_fl_direct)
107 BUG
108
94 testb $X86_EFLAGS_IF>>8, %ah 109 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with 111 /* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
122 push %r9 137 push %r9
123 push %r10 138 push %r10
124 push %r11 139 push %r11
125 call force_evtchn_callback 140 call xen_force_evtchn_callback
126 pop %r11 141 pop %r11
127 pop %r10 142 pop %r10
128 pop %r9 143 pop %r9
@@ -133,7 +148,6 @@ check_events:
133 pop %rcx 148 pop %rcx
134 pop %rax 149 pop %rax
135 ret 150 ret
136#endif
137 151
138ENTRY(xen_adjust_exception_frame) 152ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx 153 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2e..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index bff602ccccf3..1a50ae70f716 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1066,7 +1066,7 @@ static struct xenbus_driver blkfront = {
1066 1066
1067static int __init xlblk_init(void) 1067static int __init xlblk_init(void)
1068{ 1068{
1069 if (!is_running_on_xen()) 1069 if (!xen_domain())
1070 return -ENODEV; 1070 return -ENODEV;
1071 1071
1072 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { 1072 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
index 6b70aa66a587..538ceea5e7df 100644
--- a/drivers/char/hvc_xen.c
+++ b/drivers/char/hvc_xen.c
@@ -108,8 +108,8 @@ static int __init xen_init(void)
108{ 108{
109 struct hvc_struct *hp; 109 struct hvc_struct *hp;
110 110
111 if (!is_running_on_xen() || 111 if (!xen_pv_domain() ||
112 is_initial_xendomain() || 112 xen_initial_domain() ||
113 !xen_start_info->console.domU.evtchn) 113 !xen_start_info->console.domU.evtchn)
114 return -ENODEV; 114 return -ENODEV;
115 115
@@ -142,7 +142,7 @@ static void __exit xen_fini(void)
142 142
143static int xen_cons_init(void) 143static int xen_cons_init(void)
144{ 144{
145 if (!is_running_on_xen()) 145 if (!xen_pv_domain())
146 return 0; 146 return 0;
147 147
148 hvc_instantiate(HVC_COOKIE, 0, &hvc_ops); 148 hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
index 9ce3b3baf3a2..3ab6362f043c 100644
--- a/drivers/input/xen-kbdfront.c
+++ b/drivers/input/xen-kbdfront.c
@@ -335,11 +335,11 @@ static struct xenbus_driver xenkbd = {
335 335
336static int __init xenkbd_init(void) 336static int __init xenkbd_init(void)
337{ 337{
338 if (!is_running_on_xen()) 338 if (!xen_domain())
339 return -ENODEV; 339 return -ENODEV;
340 340
341 /* Nothing to do if running in dom0. */ 341 /* Nothing to do if running in dom0. */
342 if (is_initial_xendomain()) 342 if (xen_initial_domain())
343 return -ENODEV; 343 return -ENODEV;
344 344
345 return xenbus_register_frontend(&xenkbd); 345 return xenbus_register_frontend(&xenkbd);
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index c749bdba214c..3c3dd403f5dd 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1794,10 +1794,10 @@ static struct xenbus_driver netfront = {
1794 1794
1795static int __init netif_init(void) 1795static int __init netif_init(void)
1796{ 1796{
1797 if (!is_running_on_xen()) 1797 if (!xen_domain())
1798 return -ENODEV; 1798 return -ENODEV;
1799 1799
1800 if (is_initial_xendomain()) 1800 if (xen_initial_domain())
1801 return 0; 1801 return 0;
1802 1802
1803 printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n"); 1803 printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
@@ -1809,7 +1809,7 @@ module_init(netif_init);
1809 1809
1810static void __exit netif_exit(void) 1810static void __exit netif_exit(void)
1811{ 1811{
1812 if (is_initial_xendomain()) 1812 if (xen_initial_domain())
1813 return; 1813 return;
1814 1814
1815 xenbus_unregister_driver(&netfront); 1815 xenbus_unregister_driver(&netfront);
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index 47ed39b52f9c..a463b3dd837b 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -680,11 +680,11 @@ static struct xenbus_driver xenfb = {
680 680
681static int __init xenfb_init(void) 681static int __init xenfb_init(void)
682{ 682{
683 if (!is_running_on_xen()) 683 if (!xen_domain())
684 return -ENODEV; 684 return -ENODEV;
685 685
686 /* Nothing to do if running in dom0. */ 686 /* Nothing to do if running in dom0. */
687 if (is_initial_xendomain()) 687 if (xen_initial_domain())
688 return -ENODEV; 688 return -ENODEV;
689 689
690 return xenbus_register_frontend(&xenfb); 690 return xenbus_register_frontend(&xenfb);
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 363286c54290..d2a8fdf0e191 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,4 +1,5 @@
1obj-y += grant-table.o features.o events.o manage.o 1obj-y += grant-table.o features.o events.o manage.o
2obj-y += xenbus/ 2obj-y += xenbus/
3obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
3obj-$(CONFIG_XEN_XENCOMM) += xencomm.o 4obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
4obj-$(CONFIG_XEN_BALLOON) += balloon.o 5obj-$(CONFIG_XEN_BALLOON) += balloon.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2e15da5459cf..8c83abc73400 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -53,7 +53,6 @@
53#include <asm/tlb.h> 53#include <asm/tlb.h>
54 54
55#include <xen/interface/memory.h> 55#include <xen/interface/memory.h>
56#include <xen/balloon.h>
57#include <xen/xenbus.h> 56#include <xen/xenbus.h>
58#include <xen/features.h> 57#include <xen/features.h>
59#include <xen/page.h> 58#include <xen/page.h>
@@ -226,9 +225,8 @@ static int increase_reservation(unsigned long nr_pages)
226 } 225 }
227 226
228 set_xen_guest_handle(reservation.extent_start, frame_list); 227 set_xen_guest_handle(reservation.extent_start, frame_list);
229 reservation.nr_extents = nr_pages; 228 reservation.nr_extents = nr_pages;
230 rc = HYPERVISOR_memory_op( 229 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
231 XENMEM_populate_physmap, &reservation);
232 if (rc < nr_pages) { 230 if (rc < nr_pages) {
233 if (rc > 0) { 231 if (rc > 0) {
234 int ret; 232 int ret;
@@ -236,7 +234,7 @@ static int increase_reservation(unsigned long nr_pages)
236 /* We hit the Xen hard limit: reprobe. */ 234 /* We hit the Xen hard limit: reprobe. */
237 reservation.nr_extents = rc; 235 reservation.nr_extents = rc;
238 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, 236 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
239 &reservation); 237 &reservation);
240 BUG_ON(ret != rc); 238 BUG_ON(ret != rc);
241 } 239 }
242 if (rc >= 0) 240 if (rc >= 0)
@@ -420,7 +418,7 @@ static int __init balloon_init(void)
420 unsigned long pfn; 418 unsigned long pfn;
421 struct page *page; 419 struct page *page;
422 420
423 if (!is_running_on_xen()) 421 if (!xen_pv_domain())
424 return -ENODEV; 422 return -ENODEV;
425 423
426 pr_info("xen_balloon: Initialising balloon driver.\n"); 424 pr_info("xen_balloon: Initialising balloon driver.\n");
@@ -464,136 +462,13 @@ static void balloon_exit(void)
464 462
465module_exit(balloon_exit); 463module_exit(balloon_exit);
466 464
467static void balloon_update_driver_allowance(long delta) 465#define BALLOON_SHOW(name, format, args...) \
468{ 466 static ssize_t show_##name(struct sys_device *dev, \
469 unsigned long flags; 467 struct sysdev_attribute *attr, \
470 468 char *buf) \
471 spin_lock_irqsave(&balloon_lock, flags); 469 { \
472 balloon_stats.driver_pages += delta; 470 return sprintf(buf, format, ##args); \
473 spin_unlock_irqrestore(&balloon_lock, flags); 471 } \
474}
475
476static int dealloc_pte_fn(
477 pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
478{
479 unsigned long mfn = pte_mfn(*pte);
480 int ret;
481 struct xen_memory_reservation reservation = {
482 .nr_extents = 1,
483 .extent_order = 0,
484 .domid = DOMID_SELF
485 };
486 set_xen_guest_handle(reservation.extent_start, &mfn);
487 set_pte_at(&init_mm, addr, pte, __pte_ma(0ull));
488 set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
489 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
490 BUG_ON(ret != 1);
491 return 0;
492}
493
494static struct page **alloc_empty_pages_and_pagevec(int nr_pages)
495{
496 unsigned long vaddr, flags;
497 struct page *page, **pagevec;
498 int i, ret;
499
500 pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
501 if (pagevec == NULL)
502 return NULL;
503
504 for (i = 0; i < nr_pages; i++) {
505 page = pagevec[i] = alloc_page(GFP_KERNEL);
506 if (page == NULL)
507 goto err;
508
509 vaddr = (unsigned long)page_address(page);
510
511 scrub_page(page);
512
513 spin_lock_irqsave(&balloon_lock, flags);
514
515 if (xen_feature(XENFEAT_auto_translated_physmap)) {
516 unsigned long gmfn = page_to_pfn(page);
517 struct xen_memory_reservation reservation = {
518 .nr_extents = 1,
519 .extent_order = 0,
520 .domid = DOMID_SELF
521 };
522 set_xen_guest_handle(reservation.extent_start, &gmfn);
523 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
524 &reservation);
525 if (ret == 1)
526 ret = 0; /* success */
527 } else {
528 ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
529 dealloc_pte_fn, NULL);
530 }
531
532 if (ret != 0) {
533 spin_unlock_irqrestore(&balloon_lock, flags);
534 __free_page(page);
535 goto err;
536 }
537
538 totalram_pages = --balloon_stats.current_pages;
539
540 spin_unlock_irqrestore(&balloon_lock, flags);
541 }
542
543 out:
544 schedule_work(&balloon_worker);
545 flush_tlb_all();
546 return pagevec;
547
548 err:
549 spin_lock_irqsave(&balloon_lock, flags);
550 while (--i >= 0)
551 balloon_append(pagevec[i]);
552 spin_unlock_irqrestore(&balloon_lock, flags);
553 kfree(pagevec);
554 pagevec = NULL;
555 goto out;
556}
557
558static void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
559{
560 unsigned long flags;
561 int i;
562
563 if (pagevec == NULL)
564 return;
565
566 spin_lock_irqsave(&balloon_lock, flags);
567 for (i = 0; i < nr_pages; i++) {
568 BUG_ON(page_count(pagevec[i]) != 1);
569 balloon_append(pagevec[i]);
570 }
571 spin_unlock_irqrestore(&balloon_lock, flags);
572
573 kfree(pagevec);
574
575 schedule_work(&balloon_worker);
576}
577
578static void balloon_release_driver_page(struct page *page)
579{
580 unsigned long flags;
581
582 spin_lock_irqsave(&balloon_lock, flags);
583 balloon_append(page);
584 balloon_stats.driver_pages--;
585 spin_unlock_irqrestore(&balloon_lock, flags);
586
587 schedule_work(&balloon_worker);
588}
589
590
591#define BALLOON_SHOW(name, format, args...) \
592 static ssize_t show_##name(struct sys_device *dev, \
593 char *buf) \
594 { \
595 return sprintf(buf, format, ##args); \
596 } \
597 static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) 472 static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
598 473
599BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); 474BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
@@ -604,7 +479,8 @@ BALLOON_SHOW(hard_limit_kb,
604 (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0); 479 (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
605BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); 480BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
606 481
607static ssize_t show_target_kb(struct sys_device *dev, char *buf) 482static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
483 char *buf)
608{ 484{
609 return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); 485 return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
610} 486}
@@ -614,19 +490,14 @@ static ssize_t store_target_kb(struct sys_device *dev,
614 const char *buf, 490 const char *buf,
615 size_t count) 491 size_t count)
616{ 492{
617 char memstring[64], *endchar; 493 char *endchar;
618 unsigned long long target_bytes; 494 unsigned long long target_bytes;
619 495
620 if (!capable(CAP_SYS_ADMIN)) 496 if (!capable(CAP_SYS_ADMIN))
621 return -EPERM; 497 return -EPERM;
622 498
623 if (count <= 1) 499 target_bytes = memparse(buf, &endchar);
624 return -EBADMSG; /* runt */
625 if (count > sizeof(memstring))
626 return -EFBIG; /* too long */
627 strcpy(memstring, buf);
628 500
629 target_bytes = memparse(memstring, &endchar);
630 balloon_set_new_target(target_bytes >> PAGE_SHIFT); 501 balloon_set_new_target(target_bytes >> PAGE_SHIFT);
631 502
632 return count; 503 return count;
@@ -694,20 +565,4 @@ static int register_balloon(struct sys_device *sysdev)
694 return error; 565 return error;
695} 566}
696 567
697static void unregister_balloon(struct sys_device *sysdev)
698{
699 int i;
700
701 sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
702 for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
703 sysdev_remove_file(sysdev, balloon_attrs[i]);
704 sysdev_unregister(sysdev);
705 sysdev_class_unregister(&balloon_sysdev_class);
706}
707
708static void balloon_sysfs_exit(void)
709{
710 unregister_balloon(&balloon_sysdev);
711}
712
713MODULE_LICENSE("GPL"); 568MODULE_LICENSE("GPL");
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
new file mode 100644
index 000000000000..565280ec1c6a
--- /dev/null
+++ b/drivers/xen/cpu_hotplug.c
@@ -0,0 +1,90 @@
1#include <linux/notifier.h>
2
3#include <xen/xenbus.h>
4
5#include <asm-x86/xen/hypervisor.h>
6#include <asm/cpu.h>
7
8static void enable_hotplug_cpu(int cpu)
9{
10 if (!cpu_present(cpu))
11 arch_register_cpu(cpu);
12
13 cpu_set(cpu, cpu_present_map);
14}
15
16static void disable_hotplug_cpu(int cpu)
17{
18 if (cpu_present(cpu))
19 arch_unregister_cpu(cpu);
20
21 cpu_clear(cpu, cpu_present_map);
22}
23
24static void vcpu_hotplug(unsigned int cpu)
25{
26 int err;
27 char dir[32], state[32];
28
29 if (!cpu_possible(cpu))
30 return;
31
32 sprintf(dir, "cpu/%u", cpu);
33 err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
34 if (err != 1) {
35 printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
36 return;
37 }
38
39 if (strcmp(state, "online") == 0) {
40 enable_hotplug_cpu(cpu);
41 } else if (strcmp(state, "offline") == 0) {
42 (void)cpu_down(cpu);
43 disable_hotplug_cpu(cpu);
44 } else {
45 printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
46 state, cpu);
47 }
48}
49
50static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
51 const char **vec, unsigned int len)
52{
53 unsigned int cpu;
54 char *cpustr;
55 const char *node = vec[XS_WATCH_PATH];
56
57 cpustr = strstr(node, "cpu/");
58 if (cpustr != NULL) {
59 sscanf(cpustr, "cpu/%u", &cpu);
60 vcpu_hotplug(cpu);
61 }
62}
63
64static int setup_cpu_watcher(struct notifier_block *notifier,
65 unsigned long event, void *data)
66{
67 static struct xenbus_watch cpu_watch = {
68 .node = "cpu",
69 .callback = handle_vcpu_hotplug_event};
70
71 (void)register_xenbus_watch(&cpu_watch);
72
73 return NOTIFY_DONE;
74}
75
76static int __init setup_vcpu_hotplug_event(void)
77{
78 static struct notifier_block xsn_cpu = {
79 .notifier_call = setup_cpu_watcher };
80
81 if (!xen_pv_domain())
82 return -ENODEV;
83
84 register_xenstore_notifier(&xsn_cpu);
85
86 return 0;
87}
88
89arch_initcall(setup_vcpu_hotplug_event);
90
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 0e0c28574af8..c3290bc186a0 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -84,17 +84,6 @@ static int irq_bindcount[NR_IRQS];
84/* Xen will never allocate port zero for any purpose. */ 84/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0) 85#define VALID_EVTCHN(chn) ((chn) != 0)
86 86
87/*
88 * Force a proper event-channel callback from Xen after clearing the
89 * callback mask. We do this in a very simple manner, by making a call
90 * down into Xen. The pending flag will be checked by Xen on return.
91 */
92void force_evtchn_callback(void)
93{
94 (void)HYPERVISOR_xen_version(0, NULL);
95}
96EXPORT_SYMBOL_GPL(force_evtchn_callback);
97
98static struct irq_chip xen_dynamic_chip; 87static struct irq_chip xen_dynamic_chip;
99 88
100/* Constructor for packed IRQ information. */ 89/* Constructor for packed IRQ information. */
@@ -175,6 +164,12 @@ static inline void set_evtchn(int port)
175 sync_set_bit(port, &s->evtchn_pending[0]); 164 sync_set_bit(port, &s->evtchn_pending[0]);
176} 165}
177 166
167static inline int test_evtchn(int port)
168{
169 struct shared_info *s = HYPERVISOR_shared_info;
170 return sync_test_bit(port, &s->evtchn_pending[0]);
171}
172
178 173
179/** 174/**
180 * notify_remote_via_irq - send event to remote end of event channel via irq 175 * notify_remote_via_irq - send event to remote end of event channel via irq
@@ -365,6 +360,10 @@ static void unbind_from_irq(unsigned int irq)
365 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) 360 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
366 [index_from_irq(irq)] = -1; 361 [index_from_irq(irq)] = -1;
367 break; 362 break;
363 case IRQT_IPI:
364 per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
365 [index_from_irq(irq)] = -1;
366 break;
368 default: 367 default:
369 break; 368 break;
370 } 369 }
@@ -743,6 +742,25 @@ void xen_clear_irq_pending(int irq)
743 clear_evtchn(evtchn); 742 clear_evtchn(evtchn);
744} 743}
745 744
/*
 * Mark the event channel behind @irq as pending.  Does nothing when the
 * irq has no valid event channel.
 */
void xen_set_irq_pending(int irq)
{
	int port = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(port))
		return;

	set_evtchn(port);
}
752
753bool xen_test_irq_pending(int irq)
754{
755 int evtchn = evtchn_from_irq(irq);
756 bool ret = false;
757
758 if (VALID_EVTCHN(evtchn))
759 ret = test_evtchn(evtchn);
760
761 return ret;
762}
763
746/* Poll waiting for an irq to become pending. In the usual case, the 764/* Poll waiting for an irq to become pending. In the usual case, the
747 irq will be disabled so it won't deliver an interrupt. */ 765 irq will be disabled so it won't deliver an interrupt. */
748void xen_poll_irq(int irq) 766void xen_poll_irq(int irq)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e9e11168616a..06592b9da83c 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -508,7 +508,7 @@ static int __devinit gnttab_init(void)
508 unsigned int max_nr_glist_frames, nr_glist_frames; 508 unsigned int max_nr_glist_frames, nr_glist_frames;
509 unsigned int nr_init_grefs; 509 unsigned int nr_init_grefs;
510 510
511 if (!is_running_on_xen()) 511 if (!xen_domain())
512 return -ENODEV; 512 return -ENODEV;
513 513
514 nr_grant_frames = 1; 514 nr_grant_frames = 1;
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 57ceb5346b74..7f24a98a446f 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -814,7 +814,7 @@ static int __init xenbus_probe_init(void)
814 DPRINTK(""); 814 DPRINTK("");
815 815
816 err = -ENODEV; 816 err = -ENODEV;
817 if (!is_running_on_xen()) 817 if (!xen_domain())
818 goto out_error; 818 goto out_error;
819 819
820 /* Register ourselves with the kernel bus subsystem */ 820 /* Register ourselves with the kernel bus subsystem */
@@ -829,7 +829,7 @@ static int __init xenbus_probe_init(void)
829 /* 829 /*
830 * Domain0 doesn't have a store_evtchn or store_mfn yet. 830 * Domain0 doesn't have a store_evtchn or store_mfn yet.
831 */ 831 */
832 if (is_initial_xendomain()) { 832 if (xen_initial_domain()) {
833 /* dom0 not yet supported */ 833 /* dom0 not yet supported */
834 } else { 834 } else {
835 xenstored_ready = 1; 835 xenstored_ready = 1;
@@ -846,7 +846,7 @@ static int __init xenbus_probe_init(void)
846 goto out_unreg_back; 846 goto out_unreg_back;
847 } 847 }
848 848
849 if (!is_initial_xendomain()) 849 if (!xen_initial_domain())
850 xenbus_probe(NULL); 850 xenbus_probe(NULL);
851 851
852 return 0; 852 return 0;
@@ -937,7 +937,7 @@ static void wait_for_devices(struct xenbus_driver *xendrv)
937 unsigned long timeout = jiffies + 10*HZ; 937 unsigned long timeout = jiffies + 10*HZ;
938 struct device_driver *drv = xendrv ? &xendrv->driver : NULL; 938 struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
939 939
940 if (!ready_to_wait_for_devices || !is_running_on_xen()) 940 if (!ready_to_wait_for_devices || !xen_domain())
941 return; 941 return;
942 942
943 while (exists_disconnected_device(drv)) { 943 while (exists_disconnected_device(drv)) {
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index b73fea54def2..ebc307817e98 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -24,6 +24,11 @@ static inline void fill_ldt(struct desc_struct *desc,
24 desc->d = info->seg_32bit; 24 desc->d = info->seg_32bit;
25 desc->g = info->limit_in_pages; 25 desc->g = info->limit_in_pages;
26 desc->base2 = (info->base_addr & 0xff000000) >> 24; 26 desc->base2 = (info->base_addr & 0xff000000) >> 24;
27 /*
28 * Don't allow setting of the lm bit. It is useless anyway
29 * because 64bit system calls require __USER_CS:
30 */
31 desc->l = 0;
27} 32}
28 33
29extern struct desc_ptr idt_descr; 34extern struct desc_ptr idt_descr;
@@ -97,7 +102,15 @@ static inline int desc_empty(const void *ptr)
97 native_write_gdt_entry(dt, entry, desc, type) 102 native_write_gdt_entry(dt, entry, desc, type)
98#define write_idt_entry(dt, entry, g) \ 103#define write_idt_entry(dt, entry, g) \
99 native_write_idt_entry(dt, entry, g) 104 native_write_idt_entry(dt, entry, g)
100#endif 105
/*
 * Empty stub: does nothing with @ldt / @entries.
 * NOTE(review): presumably the variant used when CONFIG_PARAVIRT is
 * disabled - confirm against the conditional that opens above.
 */
106static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
107{
108}
109
/*
 * Empty stub: does nothing with @ldt / @entries.
 * NOTE(review): presumably the variant used when CONFIG_PARAVIRT is
 * disabled - confirm against the conditional that opens above.
 */
110static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
111{
112}
113#endif /* CONFIG_PARAVIRT */
101 114
102static inline void native_write_idt_entry(gate_desc *idt, int entry, 115static inline void native_write_idt_entry(gate_desc *idt, int entry,
103 const gate_desc *gate) 116 const gate_desc *gate)
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index d7d358a43996..8d6ae2f760d0 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -124,6 +124,9 @@ struct pv_cpu_ops {
124 int entrynum, const void *desc, int size); 124 int entrynum, const void *desc, int size);
125 void (*write_idt_entry)(gate_desc *, 125 void (*write_idt_entry)(gate_desc *,
126 int entrynum, const gate_desc *gate); 126 int entrynum, const gate_desc *gate);
127 void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
128 void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
129
127 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); 130 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
128 131
129 void (*set_iopl_mask)(unsigned mask); 132 void (*set_iopl_mask)(unsigned mask);
@@ -325,6 +328,7 @@ struct pv_lock_ops {
325 int (*spin_is_locked)(struct raw_spinlock *lock); 328 int (*spin_is_locked)(struct raw_spinlock *lock);
326 int (*spin_is_contended)(struct raw_spinlock *lock); 329 int (*spin_is_contended)(struct raw_spinlock *lock);
327 void (*spin_lock)(struct raw_spinlock *lock); 330 void (*spin_lock)(struct raw_spinlock *lock);
331 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
328 int (*spin_trylock)(struct raw_spinlock *lock); 332 int (*spin_trylock)(struct raw_spinlock *lock);
329 void (*spin_unlock)(struct raw_spinlock *lock); 333 void (*spin_unlock)(struct raw_spinlock *lock);
330}; 334};
@@ -830,6 +834,16 @@ do { \
830 (aux) = __aux; \ 834 (aux) = __aux; \
831} while (0) 835} while (0)
832 836
/* Invoke the pv_cpu_ops.alloc_ldt hook with @ldt and @entries. */
837static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
838{
839 PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
840}
841
/* Invoke the pv_cpu_ops.free_ldt hook with @ldt and @entries. */
842static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
843{
844 PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
845}
846
833static inline void load_TR_desc(void) 847static inline void load_TR_desc(void)
834{ 848{
835 PVOP_VCALL0(pv_cpu_ops.load_tr_desc); 849 PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
@@ -1394,6 +1408,12 @@ static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
1394 PVOP_VCALL1(pv_lock_ops.spin_lock, lock); 1408 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
1395} 1409}
1396 1410
/* Invoke the pv_lock_ops.spin_lock_flags hook with @lock and @flags. */
1411static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
1412 unsigned long flags)
1413{
1414 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
1415}
1416
1397static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) 1417static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
1398{ 1418{
1399 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); 1419 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 29324c103341..6df2615f9138 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -50,12 +50,16 @@ extern struct {
50struct smp_ops { 50struct smp_ops {
51 void (*smp_prepare_boot_cpu)(void); 51 void (*smp_prepare_boot_cpu)(void);
52 void (*smp_prepare_cpus)(unsigned max_cpus); 52 void (*smp_prepare_cpus)(unsigned max_cpus);
53 int (*cpu_up)(unsigned cpu);
54 void (*smp_cpus_done)(unsigned max_cpus); 53 void (*smp_cpus_done)(unsigned max_cpus);
55 54
56 void (*smp_send_stop)(void); 55 void (*smp_send_stop)(void);
57 void (*smp_send_reschedule)(int cpu); 56 void (*smp_send_reschedule)(int cpu);
58 57
58 int (*cpu_up)(unsigned cpu);
59 int (*cpu_disable)(void);
60 void (*cpu_die)(unsigned int cpu);
61 void (*play_dead)(void);
62
59 void (*send_call_func_ipi)(cpumask_t mask); 63 void (*send_call_func_ipi)(cpumask_t mask);
60 void (*send_call_func_single_ipi)(int cpu); 64 void (*send_call_func_single_ipi)(int cpu);
61}; 65};
@@ -94,6 +98,21 @@ static inline int __cpu_up(unsigned int cpu)
94 return smp_ops.cpu_up(cpu); 98 return smp_ops.cpu_up(cpu);
95} 99}
96 100
/* Dispatch to the smp_ops.cpu_disable hook and return its result. */
101static inline int __cpu_disable(void)
102{
103 return smp_ops.cpu_disable();
104}
105
/* Dispatch to the smp_ops.cpu_die hook for @cpu. */
106static inline void __cpu_die(unsigned int cpu)
107{
108 smp_ops.cpu_die(cpu);
109}
110
/* Dispatch to the smp_ops.play_dead hook. */
111static inline void play_dead(void)
112{
113 smp_ops.play_dead();
114}
115
97static inline void smp_send_reschedule(int cpu) 116static inline void smp_send_reschedule(int cpu)
98{ 117{
99 smp_ops.smp_send_reschedule(cpu); 118 smp_ops.smp_send_reschedule(cpu);
@@ -109,16 +128,19 @@ static inline void arch_send_call_function_ipi(cpumask_t mask)
109 smp_ops.send_call_func_ipi(mask); 128 smp_ops.send_call_func_ipi(mask);
110} 129}
111 130
131void cpu_disable_common(void);
112void native_smp_prepare_boot_cpu(void); 132void native_smp_prepare_boot_cpu(void);
113void native_smp_prepare_cpus(unsigned int max_cpus); 133void native_smp_prepare_cpus(unsigned int max_cpus);
114void native_smp_cpus_done(unsigned int max_cpus); 134void native_smp_cpus_done(unsigned int max_cpus);
115int native_cpu_up(unsigned int cpunum); 135int native_cpu_up(unsigned int cpunum);
136int native_cpu_disable(void);
137void native_cpu_die(unsigned int cpu);
138void native_play_dead(void);
139void play_dead_common(void);
140
116void native_send_call_func_ipi(cpumask_t mask); 141void native_send_call_func_ipi(cpumask_t mask);
117void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
118 143
119extern int __cpu_disable(void);
120extern void __cpu_die(unsigned int cpu);
121
122void smp_store_cpu_info(int id); 144void smp_store_cpu_info(int id);
123#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 145#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
124 146
@@ -205,9 +227,5 @@ static inline int hard_smp_processor_id(void)
205 227
206#endif /* CONFIG_X86_LOCAL_APIC */ 228#endif /* CONFIG_X86_LOCAL_APIC */
207 229
208#ifdef CONFIG_HOTPLUG_CPU
209extern void cpu_uninit(void);
210#endif
211
212#endif /* __ASSEMBLY__ */ 230#endif /* __ASSEMBLY__ */
213#endif /* ASM_X86__SMP_H */ 231#endif /* ASM_X86__SMP_H */
diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index 93adae338ac6..157ff7fab97a 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -21,8 +21,10 @@
21 21
22#ifdef CONFIG_X86_32 22#ifdef CONFIG_X86_32
23# define LOCK_PTR_REG "a" 23# define LOCK_PTR_REG "a"
24# define REG_PTR_MODE "k"
24#else 25#else
25# define LOCK_PTR_REG "D" 26# define LOCK_PTR_REG "D"
27# define REG_PTR_MODE "q"
26#endif 28#endif
27 29
28#if defined(CONFIG_X86_32) && \ 30#if defined(CONFIG_X86_32) && \
@@ -54,19 +56,7 @@
54 * much between them in performance though, especially as locks are out of line. 56 * much between them in performance though, especially as locks are out of line.
55 */ 57 */
56#if (NR_CPUS < 256) 58#if (NR_CPUS < 256)
57static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) 59#define TICKET_SHIFT 8
58{
59 int tmp = ACCESS_ONCE(lock->slock);
60
61 return (((tmp >> 8) & 0xff) != (tmp & 0xff));
62}
63
64static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
65{
66 int tmp = ACCESS_ONCE(lock->slock);
67
68 return (((tmp >> 8) - tmp) & 0xff) > 1;
69}
70 60
71static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 61static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
72{ 62{
@@ -89,19 +79,17 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
89 79
90static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
91{ 81{
92 int tmp; 82 int tmp, new;
93 short new;
94 83
95 asm volatile("movw %2,%w0\n\t" 84 asm volatile("movzwl %2, %0\n\t"
96 "cmpb %h0,%b0\n\t" 85 "cmpb %h0,%b0\n\t"
86 "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
97 "jne 1f\n\t" 87 "jne 1f\n\t"
98 "movw %w0,%w1\n\t"
99 "incb %h1\n\t"
100 LOCK_PREFIX "cmpxchgw %w1,%2\n\t" 88 LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
101 "1:" 89 "1:"
102 "sete %b1\n\t" 90 "sete %b1\n\t"
103 "movzbl %b1,%0\n\t" 91 "movzbl %b1,%0\n\t"
104 : "=&a" (tmp), "=Q" (new), "+m" (lock->slock) 92 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
105 : 93 :
106 : "memory", "cc"); 94 : "memory", "cc");
107 95
@@ -116,19 +104,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
116 : "memory", "cc"); 104 : "memory", "cc");
117} 105}
118#else 106#else
119static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) 107#define TICKET_SHIFT 16
120{
121 int tmp = ACCESS_ONCE(lock->slock);
122
123 return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
124}
125
126static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
127{
128 int tmp = ACCESS_ONCE(lock->slock);
129
130 return (((tmp >> 16) - tmp) & 0xffff) > 1;
131}
132 108
133static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 109static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
134{ 110{
@@ -146,7 +122,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
146 /* don't need lfence here, because loads are in-order */ 122 /* don't need lfence here, because loads are in-order */
147 "jmp 1b\n" 123 "jmp 1b\n"
148 "2:" 124 "2:"
149 : "+Q" (inc), "+m" (lock->slock), "=r" (tmp) 125 : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
150 : 126 :
151 : "memory", "cc"); 127 : "memory", "cc");
152} 128}
@@ -160,13 +136,13 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
160 "movl %0,%1\n\t" 136 "movl %0,%1\n\t"
161 "roll $16, %0\n\t" 137 "roll $16, %0\n\t"
162 "cmpl %0,%1\n\t" 138 "cmpl %0,%1\n\t"
139 "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
163 "jne 1f\n\t" 140 "jne 1f\n\t"
164 "addl $0x00010000, %1\n\t"
165 LOCK_PREFIX "cmpxchgl %1,%2\n\t" 141 LOCK_PREFIX "cmpxchgl %1,%2\n\t"
166 "1:" 142 "1:"
167 "sete %b1\n\t" 143 "sete %b1\n\t"
168 "movzbl %b1,%0\n\t" 144 "movzbl %b1,%0\n\t"
169 : "=&a" (tmp), "=r" (new), "+m" (lock->slock) 145 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
170 : 146 :
171 : "memory", "cc"); 147 : "memory", "cc");
172 148
@@ -182,7 +158,19 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
182} 158}
183#endif 159#endif
184 160
185#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) 161static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
162{
163 int tmp = ACCESS_ONCE(lock->slock);
164
165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
166}
167
168static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
169{
170 int tmp = ACCESS_ONCE(lock->slock);
171
172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
173}
186 174
187#ifdef CONFIG_PARAVIRT 175#ifdef CONFIG_PARAVIRT
188/* 176/*
@@ -272,6 +260,13 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
272{ 260{
273 __ticket_spin_unlock(lock); 261 __ticket_spin_unlock(lock);
274} 262}
263
264static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
265 unsigned long flags)
266{
267 __raw_spin_lock(lock);
268}
269
275#endif /* CONFIG_PARAVIRT */ 270#endif /* CONFIG_PARAVIRT */
276 271
277static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) 272static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
diff --git a/include/asm-x86/tlbflush.h b/include/asm-x86/tlbflush.h
index ef68b76dc3c5..3cdd08b5bdb7 100644
--- a/include/asm-x86/tlbflush.h
+++ b/include/asm-x86/tlbflush.h
@@ -119,6 +119,10 @@ static inline void native_flush_tlb_others(const cpumask_t *cpumask,
119{ 119{
120} 120}
121 121
122static inline void reset_lazy_tlbstate(void)
123{
124}
125
122#else /* SMP */ 126#else /* SMP */
123 127
124#include <asm/smp.h> 128#include <asm/smp.h>
@@ -151,6 +155,12 @@ struct tlb_state {
151 char __cacheline_padding[L1_CACHE_BYTES-8]; 155 char __cacheline_padding[L1_CACHE_BYTES-8];
152}; 156};
153DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); 157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void)
162{
163}
154#endif 164#endif
155 165
156#endif /* SMP */ 166#endif /* SMP */
diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h
index 0ef3a88b869d..445a24759560 100644
--- a/include/asm-x86/xen/hypervisor.h
+++ b/include/asm-x86/xen/hypervisor.h
@@ -54,7 +54,6 @@
54/* arch/i386/kernel/setup.c */ 54/* arch/i386/kernel/setup.c */
55extern struct shared_info *HYPERVISOR_shared_info; 55extern struct shared_info *HYPERVISOR_shared_info;
56extern struct start_info *xen_start_info; 56extern struct start_info *xen_start_info;
57#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
58 57
59/* arch/i386/mach-xen/evtchn.c */ 58/* arch/i386/mach-xen/evtchn.c */
60/* Force a proper event-channel callback from Xen. */ 59/* Force a proper event-channel callback from Xen. */
@@ -67,6 +66,17 @@ u64 jiffies_to_st(unsigned long jiffies);
67#define MULTI_UVMFLAGS_INDEX 3 66#define MULTI_UVMFLAGS_INDEX 3
68#define MULTI_UVMDOMID_INDEX 4 67#define MULTI_UVMDOMID_INDEX 4
69 68
70#define is_running_on_xen() (xen_start_info ? 1 : 0) 69enum xen_domain_type {
70 XEN_NATIVE,
71 XEN_PV_DOMAIN,
72 XEN_HVM_DOMAIN,
73};
74
75extern enum xen_domain_type xen_domain_type;
76
77#define xen_domain() (xen_domain_type != XEN_NATIVE)
78#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
79#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
80#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
71 81
72#endif /* ASM_X86__XEN__HYPERVISOR_H */ 82#endif /* ASM_X86__XEN__HYPERVISOR_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6d..75d81f157d2e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -182,7 +182,7 @@ extern int vsscanf(const char *, const char *, va_list)
182 182
183extern int get_option(char **str, int *pint); 183extern int get_option(char **str, int *pint);
184extern char *get_options(const char *str, int nints, int *ints); 184extern char *get_options(const char *str, int nints, int *ints);
185extern unsigned long long memparse(char *ptr, char **retptr); 185extern unsigned long long memparse(const char *ptr, char **retptr);
186 186
187extern int core_kernel_text(unsigned long addr); 187extern int core_kernel_text(unsigned long addr);
188extern int __kernel_text_address(unsigned long addr); 188extern int __kernel_text_address(unsigned long addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72a15dc26bbf..4194bf8e4f6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -919,7 +919,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
919} 919}
920#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ 920#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
921 921
922#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 922#if USE_SPLIT_PTLOCKS
923/* 923/*
924 * We tuck a spinlock to guard each pagetable page into its struct page, 924 * We tuck a spinlock to guard each pagetable page into its struct page,
925 * at page->private, with BUILD_BUG_ON to make sure that this will not 925 * at page->private, with BUILD_BUG_ON to make sure that this will not
@@ -932,14 +932,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
932} while (0) 932} while (0)
933#define pte_lock_deinit(page) ((page)->mapping = NULL) 933#define pte_lock_deinit(page) ((page)->mapping = NULL)
934#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) 934#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
935#else 935#else /* !USE_SPLIT_PTLOCKS */
936/* 936/*
937 * We use mm->page_table_lock to guard all pagetable pages of the mm. 937 * We use mm->page_table_lock to guard all pagetable pages of the mm.
938 */ 938 */
939#define pte_lock_init(page) do {} while (0) 939#define pte_lock_init(page) do {} while (0)
940#define pte_lock_deinit(page) do {} while (0) 940#define pte_lock_deinit(page) do {} while (0)
941#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) 941#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
942#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 942#endif /* USE_SPLIT_PTLOCKS */
943 943
944static inline void pgtable_page_ctor(struct page *page) 944static inline void pgtable_page_ctor(struct page *page)
945{ 945{
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bf334138c7c1..9d49fa36bbef 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -21,11 +21,13 @@
21 21
22struct address_space; 22struct address_space;
23 23
24#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 24#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
25
26#if USE_SPLIT_PTLOCKS
25typedef atomic_long_t mm_counter_t; 27typedef atomic_long_t mm_counter_t;
26#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 28#else /* !USE_SPLIT_PTLOCKS */
27typedef unsigned long mm_counter_t; 29typedef unsigned long mm_counter_t;
28#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 30#endif /* !USE_SPLIT_PTLOCKS */
29 31
30/* 32/*
31 * Each physical page in the system has a struct page associated with 33 * Each physical page in the system has a struct page associated with
@@ -65,7 +67,7 @@ struct page {
65 * see PAGE_MAPPING_ANON below. 67 * see PAGE_MAPPING_ANON below.
66 */ 68 */
67 }; 69 };
68#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 70#if USE_SPLIT_PTLOCKS
69 spinlock_t ptl; 71 spinlock_t ptl;
70#endif 72#endif
71 struct kmem_cache *slab; /* SLUB: Pointer to slab */ 73 struct kmem_cache *slab; /* SLUB: Pointer to slab */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d0819ee442a..c226c7b82946 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -352,7 +352,7 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
352extern void arch_unmap_area(struct mm_struct *, unsigned long); 352extern void arch_unmap_area(struct mm_struct *, unsigned long);
353extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 353extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
354 354
355#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 355#if USE_SPLIT_PTLOCKS
356/* 356/*
357 * The mm counters are not protected by its page_table_lock, 357 * The mm counters are not protected by its page_table_lock,
358 * so must be incremented atomically. 358 * so must be incremented atomically.
@@ -363,7 +363,7 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
363#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) 363#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
364#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) 364#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
365 365
366#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 366#else /* !USE_SPLIT_PTLOCKS */
367/* 367/*
368 * The mm counters are protected by its page_table_lock, 368 * The mm counters are protected by its page_table_lock,
369 * so can be incremented directly. 369 * so can be incremented directly.
@@ -374,7 +374,7 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
374#define inc_mm_counter(mm, member) (mm)->_##member++ 374#define inc_mm_counter(mm, member) (mm)->_##member++
375#define dec_mm_counter(mm, member) (mm)->_##member-- 375#define dec_mm_counter(mm, member) (mm)->_##member--
376 376
377#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 377#endif /* !USE_SPLIT_PTLOCKS */
378 378
379#define get_mm_rss(mm) \ 379#define get_mm_rss(mm) \
380 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) 380 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
deleted file mode 100644
index fe43b0f3c86a..000000000000
--- a/include/xen/balloon.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/******************************************************************************
2 * balloon.h
3 *
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
5 *
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#ifndef __XEN_BALLOON_H__
35#define __XEN_BALLOON_H__
36
37#include <linux/spinlock.h>
38
39#if 0
40/*
41 * Inform the balloon driver that it should allow some slop for device-driver
42 * memory activities.
43 */
44void balloon_update_driver_allowance(long delta);
45
46/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
47struct page **alloc_empty_pages_and_pagevec(int nr_pages);
48void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
49
50void balloon_release_driver_page(struct page *page);
51
52/*
53 * Prevent the balloon driver from changing the memory reservation during
54 * a driver critical region.
55 */
56extern spinlock_t balloon_lock;
57#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
58#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
59#endif
60
61#endif /* __XEN_BALLOON_H__ */
diff --git a/include/xen/events.h b/include/xen/events.h
index 4680ff3fbc91..0d5f1adc0363 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -46,6 +46,8 @@ extern void xen_irq_resume(void);
46 46
47/* Clear an irq's pending state, in preparation for polling on it */ 47/* Clear an irq's pending state, in preparation for polling on it */
48void xen_clear_irq_pending(int irq); 48void xen_clear_irq_pending(int irq);
49void xen_set_irq_pending(int irq);
50bool xen_test_irq_pending(int irq);
49 51
50/* Poll waiting for an irq to become pending. In the usual case, the 52/* Poll waiting for an irq to become pending. In the usual case, the
51 irq will be disabled so it won't deliver an interrupt. */ 53 irq will be disabled so it won't deliver an interrupt. */
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 5ba8a942a478..f5f3ad8b62ff 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -126,7 +126,7 @@ char *get_options(const char *str, int nints, int *ints)
126 * megabyte, or one gigabyte, respectively. 126 * megabyte, or one gigabyte, respectively.
127 */ 127 */
128 128
129unsigned long long memparse(char *ptr, char **retptr) 129unsigned long long memparse(const char *ptr, char **retptr)
130{ 130{
131 char *endptr; /* local pointer to end of parsed string */ 131 char *endptr; /* local pointer to end of parsed string */
132 132