Diffstat (limited to 'arch/i386/xen')
-rw-r--r--  arch/i386/xen/Kconfig          11
-rw-r--r--  arch/i386/xen/Makefile          4
-rw-r--r--  arch/i386/xen/enlighten.c    1144
-rw-r--r--  arch/i386/xen/events.c        590
-rw-r--r--  arch/i386/xen/features.c       29
-rw-r--r--  arch/i386/xen/manage.c        143
-rw-r--r--  arch/i386/xen/mmu.c           564
-rw-r--r--  arch/i386/xen/mmu.h            60
-rw-r--r--  arch/i386/xen/multicalls.c     90
-rw-r--r--  arch/i386/xen/multicalls.h     45
-rw-r--r--  arch/i386/xen/setup.c          96
-rw-r--r--  arch/i386/xen/smp.c           404
-rw-r--r--  arch/i386/xen/time.c          590
-rw-r--r--  arch/i386/xen/xen-asm.S       291
-rw-r--r--  arch/i386/xen/xen-head.S       36
-rw-r--r--  arch/i386/xen/xen-ops.h        71
16 files changed, 4168 insertions, 0 deletions
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
1#
2# This Kconfig describes xen options
3#
4
5config XEN
6 bool "Enable support for Xen hypervisor"
7 depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
8 help
9 This is the Linux Xen port. Enabling this will allow the
10 kernel to boot in a paravirtualized environment under the
11 Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
1obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
2 events.o time.o manage.o xen-asm.o
3
4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..9a8c1181c001
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
53EXPORT_SYMBOL_GPL(hypercall_page);
54
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3);
60
61struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info);
63
64static /* __initdata */ struct shared_info dummy_shared_info;
65
66/*
67 * Point at some empty memory to start with. We map the real shared_info
68 * page as soon as fixmap is up and running.
69 */
70struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
71
72/*
73 * Flag to determine whether vcpu info placement is available on all
74 * VCPUs. We assume it is to start with, and then set it to zero on
75 * the first failure. This is because it can succeed on some VCPUs
76 * and not others, since it can involve hypervisor memory allocation,
77 * or because the guest failed to guarantee all the appropriate
78 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
79 *
80 * Note that any particular CPU may be using a placed vcpu structure,
81 * but we can only optimise if they all are.
82 *
83 * 0: not available, 1: available
84 */
85static int have_vcpu_info_placement = 1;
86
87static void __init xen_vcpu_setup(int cpu)
88{
89 struct vcpu_register_vcpu_info info;
90 int err;
91 struct vcpu_info *vcpup;
92
93 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
94
95 if (!have_vcpu_info_placement)
96 return; /* already tested, not available */
97
98 vcpup = &per_cpu(xen_vcpu_info, cpu);
99
100 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup);
102
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset);
105
106 /* Check to see if the hypervisor will put the vcpu_info
107 structure where we want it, which allows direct access via
108 a percpu-variable. */
109 err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
110
111 if (err) {
112 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
113 have_vcpu_info_placement = 0;
114 } else {
115 /* This cpu is using the registered vcpu info, even if
116 later ones fail to. */
117 per_cpu(xen_vcpu, cpu) = vcpup;
118
119 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
120 cpu, vcpup);
121 }
122}
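/*
 * Illustrative sketch (not part of the patch): what vcpu_info placement
 * buys us.  Without placement we must chase the per-cpu xen_vcpu pointer;
 * with placement the same fields are ordinary per-cpu data and can be read
 * with a single direct access, which is what the *_direct variants wired up
 * by xen_setup_vcpu_info_placement() below rely on.  The helper names here
 * are hypothetical.
 */
static inline u8 example_upcall_mask_indirect(void)
{
	/* always valid: one extra dereference through the xen_vcpu pointer */
	return x86_read_percpu(xen_vcpu)->evtchn_upcall_mask;
}

static inline u8 example_upcall_mask_direct(void)
{
	/* only valid when vcpu_info placement succeeded for this cpu */
	return x86_read_percpu(xen_vcpu_info.evtchn_upcall_mask);
}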
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
131static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
132 unsigned int *ecx, unsigned int *edx)
133{
134 unsigned maskedx = ~0;
135
136 /*
137 * Mask out inconvenient features, to try and disable as many
138 * unsupported kernel subsystems as possible.
139 */
140 if (*eax == 1)
141 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
142 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
143 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
144
145 asm(XEN_EMULATE_PREFIX "cpuid"
146 : "=a" (*eax),
147 "=b" (*ebx),
148 "=c" (*ecx),
149 "=d" (*edx)
150 : "0" (*eax), "2" (*ecx));
151 *edx &= maskedx;
152}
153
154static void xen_set_debugreg(int reg, unsigned long val)
155{
156 HYPERVISOR_set_debugreg(reg, val);
157}
158
159static unsigned long xen_get_debugreg(int reg)
160{
161 return HYPERVISOR_get_debugreg(reg);
162}
163
164static unsigned long xen_save_fl(void)
165{
166 struct vcpu_info *vcpu;
167 unsigned long flags;
168
169 vcpu = x86_read_percpu(xen_vcpu);
170
171 /* flag has opposite sense of mask */
172 flags = !vcpu->evtchn_upcall_mask;
173
174 /* convert to IF type flag
175 -0 -> 0x00000000
176 -1 -> 0xffffffff
177 */
178 return (-flags) & X86_EFLAGS_IF;
179}
180
181static void xen_restore_fl(unsigned long flags)
182{
183 struct vcpu_info *vcpu;
184
185 /* convert from IF type flag */
186 flags = !(flags & X86_EFLAGS_IF);
187
188 /* There's a one instruction preempt window here. We need to
189 make sure we don't switch CPUs between getting the vcpu
190 pointer and updating the mask. */
191 preempt_disable();
192 vcpu = x86_read_percpu(xen_vcpu);
193 vcpu->evtchn_upcall_mask = flags;
194 preempt_enable_no_resched();
195
196 /* Doesn't matter if we get preempted here, because any
197 pending event will get dealt with anyway. */
198
199 if (flags == 0) {
200 preempt_check_resched();
201 barrier(); /* unmask then check (avoid races) */
202 if (unlikely(vcpu->evtchn_upcall_pending))
203 force_evtchn_callback();
204 }
205}
206
207static void xen_irq_disable(void)
208{
209 /* There's a one instruction preempt window here. We need to
210 make sure we don't switch CPUs between getting the vcpu
211 pointer and updating the mask. */
212 preempt_disable();
213 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
214 preempt_enable_no_resched();
215}
216
217static void xen_irq_enable(void)
218{
219 struct vcpu_info *vcpu;
220
221 /* There's a one instruction preempt window here. We need to
222 make sure we don't switch CPUs between getting the vcpu
223 pointer and updating the mask. */
224 preempt_disable();
225 vcpu = x86_read_percpu(xen_vcpu);
226 vcpu->evtchn_upcall_mask = 0;
227 preempt_enable_no_resched();
228
229 /* Doesn't matter if we get preempted here, because any
230 pending event will get dealt with anyway. */
231
232 barrier(); /* unmask then check (avoid races) */
233 if (unlikely(vcpu->evtchn_upcall_pending))
234 force_evtchn_callback();
235}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
253{
254 BUG_ON(preemptible());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275}
276
277static unsigned long xen_store_tr(void)
278{
279 return 0;
280}
281
282static void xen_set_ldt(const void *addr, unsigned entries)
283{
284 unsigned long linear_addr = (unsigned long)addr;
285 struct mmuext_op *op;
286 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
287
288 op = mcs.args;
289 op->cmd = MMUEXT_SET_LDT;
290 if (linear_addr) {
291 /* ldt may be vmalloced, use arbitrary_virt_to_machine */
292 xmaddr_t maddr;
293 maddr = arbitrary_virt_to_machine((unsigned long)addr);
294 linear_addr = (unsigned long)maddr.maddr;
295 }
296 op->arg1.linear_addr = linear_addr;
297 op->arg2.nr_ents = entries;
298
299 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
300
301 xen_mc_issue(PARAVIRT_LAZY_CPU);
302}
303
304static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
305{
306 unsigned long *frames;
307 unsigned long va = dtr->address;
308 unsigned int size = dtr->size + 1;
309 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
310 int f;
311 struct multicall_space mcs;
312
313 /* A GDT can be up to 64k in size, which corresponds to 8192
314 8-byte entries, or 16 4k pages. */
315
316 BUG_ON(size > 65536);
317 BUG_ON(va & ~PAGE_MASK);
318
319 mcs = xen_mc_entry(sizeof(*frames) * pages);
320 frames = mcs.args;
321
322 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
323 frames[f] = virt_to_mfn(va);
324 make_lowmem_page_readonly((void *)va);
325 }
326
327 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
328
329 xen_mc_issue(PARAVIRT_LAZY_CPU);
330}
331
332static void load_TLS_descriptor(struct thread_struct *t,
333 unsigned int cpu, unsigned int i)
334{
335 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
336 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
337 struct multicall_space mc = __xen_mc_entry(0);
338
339 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
340}
341
342static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
343{
344 xen_mc_batch();
345
346 load_TLS_descriptor(t, cpu, 0);
347 load_TLS_descriptor(t, cpu, 1);
348 load_TLS_descriptor(t, cpu, 2);
349
350 xen_mc_issue(PARAVIRT_LAZY_CPU);
351
352 /*
353 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
354 * it means we're in a context switch, and %gs has just been
355 * saved. This means we can zero it out to prevent faults on
356 * exit from the hypervisor if the next process has no %gs.
357 * Either way, it has been saved, and the new value will get
358 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls.
360 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0);
363}
364
365static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
366 u32 low, u32 high)
367{
368 unsigned long lp = (unsigned long)&dt[entrynum];
369 xmaddr_t mach_lp = virt_to_machine(lp);
370 u64 entry = (u64)high << 32 | low;
371
372 preempt_disable();
373
374 xen_mc_flush();
375 if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
376 BUG();
377
378 preempt_enable();
379}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
403/* Locations of each CPU's IDT */
404static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
405
406/* Set an IDT entry. If the entry is part of the current IDT, then
407 also update Xen. */
408static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
409 u32 low, u32 high)
410{
411 unsigned long p = (unsigned long)&dt[entrynum];
412 unsigned long start, end;
413
414 preempt_disable();
415
416 start = __get_cpu_var(idt_desc).address;
417 end = start + __get_cpu_var(idt_desc).size + 1;
418
419 xen_mc_flush();
420
421 write_dt_entry(dt, entrynum, low, high);
422
423 if (p >= start && (p + 8) <= end) {
424 struct trap_info info[2];
425
426 info[1].address = 0;
427
428 if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
429 if (HYPERVISOR_set_trap_table(info))
430 BUG();
431 }
432
433 preempt_enable();
434}
435
436static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
437 struct trap_info *traps)
438{
439 unsigned in, out, count;
440
441 count = (desc->size+1) / 8;
442 BUG_ON(count > 256);
443
444 for (in = out = 0; in < count; in++) {
445 const u32 *entry = (u32 *)(desc->address + in * 8);
446
447 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
448 out++;
449 }
450 traps[out].address = 0;
451}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
460/* Load a new IDT into Xen. In principle this can be per-CPU, so we
461 hold a spinlock to protect the static traps[] array (static because
462 it avoids allocation, and saves stack space). */
463static void xen_load_idt(const struct Xgt_desc_struct *desc)
464{
465 static DEFINE_SPINLOCK(lock);
466 static struct trap_info traps[257];
467
468 spin_lock(&lock);
469
470 __get_cpu_var(idt_desc) = *desc;
471
472 xen_convert_trap_info(desc, traps);
473
474 xen_mc_flush();
475 if (HYPERVISOR_set_trap_table(traps))
476 BUG();
477
478 spin_unlock(&lock);
479}
480
481/* Write a GDT descriptor entry. Ignore LDT descriptors, since
482 they're handled differently. */
483static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
484 u32 low, u32 high)
485{
486 preempt_disable();
487
488 switch ((high >> 8) & 0xff) {
489 case DESCTYPE_LDT:
490 case DESCTYPE_TSS:
491 /* ignore */
492 break;
493
494 default: {
495 xmaddr_t maddr = virt_to_machine(&dt[entry]);
496 u64 desc = (u64)high << 32 | low;
497
498 xen_mc_flush();
499 if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
500 BUG();
501 }
502
503 }
504
505 preempt_enable();
506}
507
508static void xen_load_esp0(struct tss_struct *tss,
509 struct thread_struct *thread)
510{
511 struct multicall_space mcs = xen_mc_entry(0);
512 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
513 xen_mc_issue(PARAVIRT_LAZY_CPU);
514}
515
516static void xen_set_iopl_mask(unsigned mask)
517{
518 struct physdev_set_iopl set_iopl;
519
520 /* Force the change at ring 0. */
521 set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
522 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
523}
524
525static void xen_io_delay(void)
526{
527}
528
529#ifdef CONFIG_X86_LOCAL_APIC
530static unsigned long xen_apic_read(unsigned long reg)
531{
532 return 0;
533}
534
535static void xen_apic_write(unsigned long reg, unsigned long val)
536{
537 /* Warn to see if there are any stray references */
538 WARN_ON(1);
539}
540#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
567static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
568 unsigned long va)
569{
570 struct {
571 struct mmuext_op op;
572 cpumask_t mask;
573 } *args;
574 cpumask_t cpumask = *cpus;
575 struct multicall_space mcs;
576
577 /*
578 * A couple of (to be removed) sanity checks:
579 *
580 * - current CPU must not be in mask
581 * - mask must exist :)
582 */
583 BUG_ON(cpus_empty(cpumask));
584 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
585 BUG_ON(!mm);
586
587 /* If a CPU which we ran on has gone down, OK. */
588 cpus_and(cpumask, cpumask, cpu_online_map);
589 if (cpus_empty(cpumask))
590 return;
591
592 mcs = xen_mc_entry(sizeof(*args));
593 args = mcs.args;
594 args->mask = cpumask;
595 args->op.arg2.vcpumask = &args->mask;
596
597 if (va == TLB_FLUSH_ALL) {
598 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
599 } else {
600 args->op.cmd = MMUEXT_INVLPG_MULTI;
601 args->op.arg1.linear_addr = va;
602 }
603
604 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
605
606 xen_mc_issue(PARAVIRT_LAZY_MMU);
607}
608
609static void xen_write_cr2(unsigned long cr2)
610{
611 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
612}
613
614static unsigned long xen_read_cr2(void)
615{
616 return x86_read_percpu(xen_vcpu)->arch.cr2;
617}
618
619static unsigned long xen_read_cr2_direct(void)
620{
621 return x86_read_percpu(xen_vcpu_info.arch.cr2);
622}
623
624static void xen_write_cr4(unsigned long cr4)
625{
626 /* never allow TSC to be disabled */
627 native_write_cr4(cr4 & ~X86_CR4_TSD);
628}
629
630static unsigned long xen_read_cr3(void)
631{
632 return x86_read_percpu(xen_cr3);
633}
634
635static void xen_write_cr3(unsigned long cr3)
636{
637 BUG_ON(preemptible());
638
639 if (cr3 == x86_read_percpu(xen_cr3)) {
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644
645 x86_write_percpu(xen_cr3, cr3);
646
647
648 {
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
658
659 xen_mc_issue(PARAVIRT_LAZY_CPU);
660 }
661}
662
663/* Early in boot, while setting up the initial pagetable, assume
664 everything is pinned. */
665static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666{
667 BUG_ON(mem_map); /* should only be used early */
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669}
670
671/* This needs to make sure the new pte page is pinned iff it's being
672 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
674{
675 struct page *page = pfn_to_page(pfn);
676
677 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page);
679
680 if (!PageHighMem(page))
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else
683 /* make sure there are no stray mappings of
684 this page */
685 kmap_flush_unused();
686 }
687}
688
689/* This should never happen until we're OK to use struct page */
690static void xen_release_pt(u32 pfn)
691{
692 struct page *page = pfn_to_page(pfn);
693
694 if (PagePinned(page)) {
695 if (!PageHighMem(page))
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
697 }
698}
699
700#ifdef CONFIG_HIGHPTE
701static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
702{
703 pgprot_t prot = PAGE_KERNEL;
704
705 if (PagePinned(page))
706 prot = PAGE_KERNEL_RO;
707
708 if (0 && PageHighMem(page))
709 printk("mapping highpte %lx type %d prot %s\n",
710 page_to_pfn(page), type,
711 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
712
713 return kmap_atomic_prot(page, type, prot);
714}
715#endif
716
717static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
718{
719 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
720 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
721 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
722 pte_val_ma(pte));
723
724 return pte;
725}
726
727/* Init-time set_pte while constructing initial pagetables, which
728 doesn't allow RO pagetable pages to be remapped RW */
729static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
730{
731 pte = mask_rw_pte(ptep, pte);
732
733 xen_set_pte(ptep, pte);
734}
735
736static __init void xen_pagetable_setup_start(pgd_t *base)
737{
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739
740 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init;
742
743 init_mm.pgd = base;
744 /*
745 * copy top-level of Xen-supplied pagetable into place. For
746 * !PAE we can use this as-is, but for PAE it is a stand-in
747 * while we copy the pmd pages.
748 */
749 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
750
751 if (PTRS_PER_PMD > 1) {
752 int i;
753 /*
754 * For PAE, need to allocate new pmds, rather than
755 * share Xen's, since Xen doesn't like pmd's being
756 * shared between address spaces.
757 */
758 for (i = 0; i < PTRS_PER_PGD; i++) {
759 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
760 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
761
762 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
763 PAGE_SIZE);
764
765 make_lowmem_page_readonly(pmd);
766
767 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
768 } else
769 pgd_clear(&base[i]);
770 }
771 }
772
773 /* make sure zero_page is mapped RO so we can use it in pagetables */
774 make_lowmem_page_readonly(empty_zero_page);
775 make_lowmem_page_readonly(base);
776 /*
777 * Switch to new pagetable. This is done before
778 * pagetable_init has done anything so that the new pages
779 * added to the table can be prepared properly for Xen.
780 */
781 xen_write_cr3(__pa(base));
782}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815 op.cmd = MMUEXT_PIN_L2_TABLE; /* non-PAE pgd is a two-level table */
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
823/* This is called once we have the cpu_possible_map */
824void __init xen_setup_vcpu_info_placement(void)
825{
826 int cpu;
827
828 for_each_possible_cpu(cpu)
829 xen_vcpu_setup(cpu);
830
831 /* xen_vcpu_setup managed to place the vcpu_info within the
832 percpu area for all cpus, so make use of it */
833 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835
836 paravirt_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct;
842 }
843}
844
845static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
846{
847 char *start, *end, *reloc;
848 unsigned ret;
849
850 start = end = reloc = NULL;
851
852#define SITE(x) \
853 case PARAVIRT_PATCH(x): \
854 if (have_vcpu_info_placement) { \
855 start = (char *)xen_##x##_direct; \
856 end = xen_##x##_direct_end; \
857 reloc = xen_##x##_direct_reloc; \
858 } \
859 goto patch_site
860
861 switch (type) {
862 SITE(irq_enable);
863 SITE(irq_disable);
864 SITE(save_fl);
865 SITE(restore_fl);
866#undef SITE
867
868 patch_site:
869 if (start == NULL || (end-start) > len)
870 goto default_patch;
871
872 ret = paravirt_patch_insns(insns, len, start, end);
873
874 /* Note: because reloc is assigned from something that
875 appears to be an array, gcc assumes it's non-null,
876 but doesn't know its relationship with start and
877 end. */
878 if (reloc > start && reloc < end) {
879 int reloc_off = reloc - start;
880 long *relocp = (long *)(insns + reloc_off);
881 long delta = start - (char *)insns;
882
883 *relocp += delta;
884 }
885 break;
886
887 default_patch:
888 default:
889 ret = paravirt_patch_default(type, clobbers, insns, len);
890 break;
891 }
892
893 return ret;
894}
895
896static const struct paravirt_ops xen_paravirt_ops __initdata = {
897 .paravirt_enabled = 1,
898 .shared_kernel_pmd = 0,
899
900 .name = "Xen",
901 .banner = xen_banner,
902
903 .patch = xen_patch,
904
905 .memory_setup = xen_memory_setup,
906 .arch_setup = xen_arch_setup,
907 .init_IRQ = xen_init_IRQ,
908 .post_allocator_init = xen_mark_init_mm_pinned,
909
910 .time_init = xen_time_init,
911 .set_wallclock = xen_set_wallclock,
912 .get_wallclock = xen_get_wallclock,
913 .get_cpu_khz = xen_cpu_khz,
914 .sched_clock = xen_sched_clock,
915
916 .cpuid = xen_cpuid,
917
918 .set_debugreg = xen_set_debugreg,
919 .get_debugreg = xen_get_debugreg,
920
921 .clts = native_clts,
922
923 .read_cr0 = native_read_cr0,
924 .write_cr0 = native_write_cr0,
925
926 .read_cr2 = xen_read_cr2,
927 .write_cr2 = xen_write_cr2,
928
929 .read_cr3 = xen_read_cr3,
930 .write_cr3 = xen_write_cr3,
931
932 .read_cr4 = native_read_cr4,
933 .read_cr4_safe = native_read_cr4_safe,
934 .write_cr4 = xen_write_cr4,
935
936 .save_fl = xen_save_fl,
937 .restore_fl = xen_restore_fl,
938 .irq_disable = xen_irq_disable,
939 .irq_enable = xen_irq_enable,
940 .safe_halt = xen_safe_halt,
941 .halt = xen_halt,
942 .wbinvd = native_wbinvd,
943
944 .read_msr = native_read_msr_safe,
945 .write_msr = native_write_msr_safe,
946 .read_tsc = native_read_tsc,
947 .read_pmc = native_read_pmc,
948
949 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
950 .irq_enable_sysexit = NULL, /* never called */
951
952 .load_tr_desc = paravirt_nop,
953 .set_ldt = xen_set_ldt,
954 .load_gdt = xen_load_gdt,
955 .load_idt = xen_load_idt,
956 .load_tls = xen_load_tls,
957
958 .store_gdt = native_store_gdt,
959 .store_idt = native_store_idt,
960 .store_tr = xen_store_tr,
961
962 .write_ldt_entry = xen_write_ldt_entry,
963 .write_gdt_entry = xen_write_gdt_entry,
964 .write_idt_entry = xen_write_idt_entry,
965 .load_esp0 = xen_load_esp0,
966
967 .set_iopl_mask = xen_set_iopl_mask,
968 .io_delay = xen_io_delay,
969
970#ifdef CONFIG_X86_LOCAL_APIC
971 .apic_write = xen_apic_write,
972 .apic_write_atomic = xen_apic_write,
973 .apic_read = xen_apic_read,
974 .setup_boot_clock = paravirt_nop,
975 .setup_secondary_clock = paravirt_nop,
976 .startup_ipi_hook = paravirt_nop,
977#endif
978
979 .flush_tlb_user = xen_flush_tlb,
980 .flush_tlb_kernel = xen_flush_tlb,
981 .flush_tlb_single = xen_flush_tlb_single,
982 .flush_tlb_others = xen_flush_tlb_others,
983
984 .pte_update = paravirt_nop,
985 .pte_update_defer = paravirt_nop,
986
987 .pagetable_setup_start = xen_pagetable_setup_start,
988 .pagetable_setup_done = xen_pagetable_setup_done,
989
990 .alloc_pt = xen_alloc_pt_init,
991 .release_pt = xen_release_pt,
992 .alloc_pd = paravirt_nop,
993 .alloc_pd_clone = paravirt_nop,
994 .release_pd = paravirt_nop,
995
996#ifdef CONFIG_HIGHPTE
997 .kmap_atomic_pte = xen_kmap_atomic_pte,
998#endif
999
1000 .set_pte = NULL, /* see xen_pagetable_setup_* */
1001 .set_pte_at = xen_set_pte_at,
1002 .set_pmd = xen_set_pmd,
1003
1004 .pte_val = xen_pte_val,
1005 .pgd_val = xen_pgd_val,
1006
1007 .make_pte = xen_make_pte,
1008 .make_pgd = xen_make_pgd,
1009
1010#ifdef CONFIG_X86_PAE
1011 .set_pte_atomic = xen_set_pte_atomic,
1012 .set_pte_present = xen_set_pte_at,
1013 .set_pud = xen_set_pud,
1014 .pte_clear = xen_pte_clear,
1015 .pmd_clear = xen_pmd_clear,
1016
1017 .make_pmd = xen_make_pmd,
1018 .pmd_val = xen_pmd_val,
1019#endif /* PAE */
1020
1021 .activate_mm = xen_activate_mm,
1022 .dup_mmap = xen_dup_mmap,
1023 .exit_mmap = xen_exit_mmap,
1024
1025 .set_lazy_mode = xen_set_lazy_mode,
1026};
1027
1028#ifdef CONFIG_SMP
1029static const struct smp_ops xen_smp_ops __initdata = {
1030 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1031 .smp_prepare_cpus = xen_smp_prepare_cpus,
1032 .cpu_up = xen_cpu_up,
1033 .smp_cpus_done = xen_smp_cpus_done,
1034
1035 .smp_send_stop = xen_smp_send_stop,
1036 .smp_send_reschedule = xen_smp_send_reschedule,
1037 .smp_call_function_mask = xen_smp_call_function_mask,
1038};
1039#endif /* CONFIG_SMP */
1040
1041static void xen_reboot(int reason)
1042{
1043#ifdef CONFIG_SMP
1044 smp_send_stop();
1045#endif
1046
1047 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
1048 BUG();
1049}
1050
1051static void xen_restart(char *msg)
1052{
1053 xen_reboot(SHUTDOWN_reboot);
1054}
1055
1056static void xen_emergency_restart(void)
1057{
1058 xen_reboot(SHUTDOWN_reboot);
1059}
1060
1061static void xen_machine_halt(void)
1062{
1063 xen_reboot(SHUTDOWN_poweroff);
1064}
1065
1066static void xen_crash_shutdown(struct pt_regs *regs)
1067{
1068 xen_reboot(SHUTDOWN_crash);
1069}
1070
1071static const struct machine_ops __initdata xen_machine_ops = {
1072 .restart = xen_restart,
1073 .halt = xen_machine_halt,
1074 .power_off = xen_machine_halt,
1075 .shutdown = xen_machine_halt,
1076 .crash_shutdown = xen_crash_shutdown,
1077 .emergency_restart = xen_emergency_restart,
1078};
1079
1080
1081/* First C function to be called on Xen boot */
1082asmlinkage void __init xen_start_kernel(void)
1083{
1084 pgd_t *pgd;
1085
1086 if (!xen_start_info)
1087 return;
1088
1089 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1090
1091 /* Install Xen paravirt ops */
1092 paravirt_ops = xen_paravirt_ops;
1093 machine_ops = xen_machine_ops;
1094
1095#ifdef CONFIG_SMP
1096 smp_ops = xen_smp_ops;
1097#endif
1098
1099 xen_setup_features();
1100
1101 /* Get mfn list */
1102 if (!xen_feature(XENFEAT_auto_translated_physmap))
1103 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
1104
1105 pgd = (pgd_t *)xen_start_info->pt_base;
1106
1107 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1108
1109 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1110
1111 /* keep using Xen gdt for now; no urgent need to change it */
1112
1113 x86_write_percpu(xen_cr3, __pa(pgd));
1114
1115#ifdef CONFIG_SMP
1116 /* Don't do the full vcpu_info placement stuff until we have a
1117 possible map. */
1118 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1119#else
1120 /* May as well do it now, since there's no good time to call
1121 it later on UP. */
1122 xen_setup_vcpu_info_placement();
1123#endif
1124
1125 paravirt_ops.kernel_rpl = 1;
1126 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1127 paravirt_ops.kernel_rpl = 0;
1128
1129 /* set the limit of our address space */
1130 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
1131
1132 /* set up basic CPUID stuff */
1133 cpu_detect(&new_cpu_data);
1134 new_cpu_data.hard_math = 1;
1135 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1136
1137 /* Poke various useful things into boot_params */
1138 LOADER_TYPE = (9 << 4) | 0;
1139 INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
1140 INITRD_SIZE = xen_start_info->mod_len;
1141
1142 /* Start the world */
1143 start_kernel();
1144}
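
The batching idiom used throughout enlighten.c, recapped as a minimal hedged sketch (the function below is hypothetical and not part of the patch): xen_mc_entry() reserves argument space for one hypercall in the per-cpu multicall batch, the MULTI_* wrapper fills it in, and xen_mc_issue() either flushes the batch immediately or, if the matching lazy mode is active, leaves it to be flushed when the lazy section ends.

static void example_flush_all_vcpu_tlbs(void)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_ALL;		/* flush the TLB on every vcpu */

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* deferred while in lazy MMU mode, issued immediately otherwise */
	xen_mc_issue(PARAVIRT_LAZY_MMU);
}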
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 000000000000..8904acc20f8c
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,590 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34
35#include <xen/events.h>
36#include <xen/interface/xen.h>
37#include <xen/interface/event_channel.h>
38
39#include "xen-ops.h"
40
41/*
42 * This lock protects updates to the following mapping and reference-count
43 * arrays. The lock does not need to be acquired to read the mapping tables.
44 */
45static DEFINE_SPINLOCK(irq_mapping_update_lock);
46
47/* IRQ <-> VIRQ mapping. */
48static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
49
50/* IRQ <-> IPI mapping */
51static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
52
53/* Packed IRQ information: binding type, sub-type index, and event channel. */
54struct packed_irq
55{
56 unsigned short evtchn;
57 unsigned char index;
58 unsigned char type;
59};
60
61static struct packed_irq irq_info[NR_IRQS];
62
63/* Binding types. */
64enum {
65 IRQT_UNBOUND,
66 IRQT_PIRQ,
67 IRQT_VIRQ,
68 IRQT_IPI,
69 IRQT_EVTCHN
70};
71
72/* Convenient shorthand for packed representation of an unbound IRQ. */
73#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
74
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1
77};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
79static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80
81/* Reference counts for bindings to IRQs. */
82static int irq_bindcount[NR_IRQS];
83
84/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0)
86
87/*
88 * Force a proper event-channel callback from Xen after clearing the
89 * callback mask. We do this in a very simple manner, by making a call
90 * down into Xen. The pending flag will be checked by Xen on return.
91 */
92void force_evtchn_callback(void)
93{
94 (void)HYPERVISOR_xen_version(0, NULL);
95}
96EXPORT_SYMBOL_GPL(force_evtchn_callback);
97
98static struct irq_chip xen_dynamic_chip;
99
100/* Constructor for packed IRQ information. */
101static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
102{
103 return (struct packed_irq) { evtchn, index, type };
104}
105
106/*
107 * Accessors for packed IRQ information.
108 */
109static inline unsigned int evtchn_from_irq(int irq)
110{
111 return irq_info[irq].evtchn;
112}
113
114static inline unsigned int index_from_irq(int irq)
115{
116 return irq_info[irq].index;
117}
118
119static inline unsigned int type_from_irq(int irq)
120{
121 return irq_info[irq].type;
122}
123
124static inline unsigned long active_evtchns(unsigned int cpu,
125 struct shared_info *sh,
126 unsigned int idx)
127{
128 return (sh->evtchn_pending[idx] &
129 cpu_evtchn_mask[cpu][idx] &
130 ~sh->evtchn_mask[idx]);
131}
132
133static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
134{
135 int irq = evtchn_to_irq[chn];
136
137 BUG_ON(irq == -1);
138#ifdef CONFIG_SMP
139 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
140#endif
141
142 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
143 __set_bit(chn, cpu_evtchn_mask[cpu]);
144
145 cpu_evtchn[chn] = cpu;
146}
147
148static void init_evtchn_cpu_bindings(void)
149{
150#ifdef CONFIG_SMP
151 int i;
152 /* By default all event channels notify CPU#0. */
153 for (i = 0; i < NR_IRQS; i++)
154 irq_desc[i].affinity = cpumask_of_cpu(0);
155#endif
156
157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
158 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
159}
160
161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
162{
163 return cpu_evtchn[evtchn];
164}
165
166static inline void clear_evtchn(int port)
167{
168 struct shared_info *s = HYPERVISOR_shared_info;
169 sync_clear_bit(port, &s->evtchn_pending[0]);
170}
171
172static inline void set_evtchn(int port)
173{
174 struct shared_info *s = HYPERVISOR_shared_info;
175 sync_set_bit(port, &s->evtchn_pending[0]);
176}
177
178
179/**
180 * notify_remote_via_irq - send event to remote end of event channel via irq
181 * @irq: irq of event channel to send event to
182 *
183 * Unlike notify_remote_via_evtchn(), this is safe to use across
184 * save/restore. Notifications on a broken connection are silently
185 * dropped.
186 */
187void notify_remote_via_irq(int irq)
188{
189 int evtchn = evtchn_from_irq(irq);
190
191 if (VALID_EVTCHN(evtchn))
192 notify_remote_via_evtchn(evtchn);
193}
194EXPORT_SYMBOL_GPL(notify_remote_via_irq);
195
196static void mask_evtchn(int port)
197{
198 struct shared_info *s = HYPERVISOR_shared_info;
199 sync_set_bit(port, &s->evtchn_mask[0]);
200}
201
202static void unmask_evtchn(int port)
203{
204 struct shared_info *s = HYPERVISOR_shared_info;
205 unsigned int cpu = get_cpu();
206
207 BUG_ON(!irqs_disabled());
208
209 /* Slow path (hypercall) if this is a non-local port. */
210 if (unlikely(cpu != cpu_from_evtchn(port))) {
211 struct evtchn_unmask unmask = { .port = port };
212 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
213 } else {
214 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
215
216 sync_clear_bit(port, &s->evtchn_mask[0]);
217
218 /*
219 * The following is basically the equivalent of
220 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
221 * the interrupt edge' if the channel is masked.
222 */
223 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
224 !sync_test_and_set_bit(port / BITS_PER_LONG,
225 &vcpu_info->evtchn_pending_sel))
226 vcpu_info->evtchn_upcall_pending = 1;
227 }
228
229 put_cpu();
230}
231
232static int find_unbound_irq(void)
233{
234 int irq;
235
236 /* Only allocate from dynirq range */
237 for (irq = 0; irq < NR_IRQS; irq++)
238 if (irq_bindcount[irq] == 0)
239 break;
240
241 if (irq == NR_IRQS)
242 panic("No available IRQ to bind to: increase NR_IRQS!\n");
243
244 return irq;
245}
246
247int bind_evtchn_to_irq(unsigned int evtchn)
248{
249 int irq;
250
251 spin_lock(&irq_mapping_update_lock);
252
253 irq = evtchn_to_irq[evtchn];
254
255 if (irq == -1) {
256 irq = find_unbound_irq();
257
258 dynamic_irq_init(irq);
259 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
260 handle_level_irq, "event");
261
262 evtchn_to_irq[evtchn] = irq;
263 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
264 }
265
266 irq_bindcount[irq]++;
267
268 spin_unlock(&irq_mapping_update_lock);
269
270 return irq;
271}
272EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
273
274static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
275{
276 struct evtchn_bind_ipi bind_ipi;
277 int evtchn, irq;
278
279 spin_lock(&irq_mapping_update_lock);
280
281 irq = per_cpu(ipi_to_irq, cpu)[ipi];
282 if (irq == -1) {
283 irq = find_unbound_irq();
284 if (irq < 0)
285 goto out;
286
287 dynamic_irq_init(irq);
288 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
289 handle_level_irq, "ipi");
290
291 bind_ipi.vcpu = cpu;
292 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
293 &bind_ipi) != 0)
294 BUG();
295 evtchn = bind_ipi.port;
296
297 evtchn_to_irq[evtchn] = irq;
298 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
299
300 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
301
302 bind_evtchn_to_cpu(evtchn, cpu);
303 }
304
305 irq_bindcount[irq]++;
306
307 out:
308 spin_unlock(&irq_mapping_update_lock);
309 return irq;
310}
311
312
313static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
314{
315 struct evtchn_bind_virq bind_virq;
316 int evtchn, irq;
317
318 spin_lock(&irq_mapping_update_lock);
319
320 irq = per_cpu(virq_to_irq, cpu)[virq];
321
322 if (irq == -1) {
323 bind_virq.virq = virq;
324 bind_virq.vcpu = cpu;
325 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
326 &bind_virq) != 0)
327 BUG();
328 evtchn = bind_virq.port;
329
330 irq = find_unbound_irq();
331
332 dynamic_irq_init(irq);
333 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
334 handle_level_irq, "virq");
335
336 evtchn_to_irq[evtchn] = irq;
337 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
338
339 per_cpu(virq_to_irq, cpu)[virq] = irq;
340
341 bind_evtchn_to_cpu(evtchn, cpu);
342 }
343
344 irq_bindcount[irq]++;
345
346 spin_unlock(&irq_mapping_update_lock);
347
348 return irq;
349}
350
351static void unbind_from_irq(unsigned int irq)
352{
353 struct evtchn_close close;
354 int evtchn = evtchn_from_irq(irq);
355
356 spin_lock(&irq_mapping_update_lock);
357
358 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
359 close.port = evtchn;
360 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
361 BUG();
362
363 switch (type_from_irq(irq)) {
364 case IRQT_VIRQ:
365 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
366 [index_from_irq(irq)] = -1;
367 break;
368 default:
369 break;
370 }
371
372 /* Closed ports are implicitly re-bound to VCPU0. */
373 bind_evtchn_to_cpu(evtchn, 0);
374
375 evtchn_to_irq[evtchn] = -1;
376 irq_info[irq] = IRQ_UNBOUND;
377
378 dynamic_irq_init(irq);
379 }
380
381 spin_unlock(&irq_mapping_update_lock);
382}
383
384int bind_evtchn_to_irqhandler(unsigned int evtchn,
385 irqreturn_t (*handler)(int, void *),
386 unsigned long irqflags,
387 const char *devname, void *dev_id)
388{
389 unsigned int irq;
390 int retval;
391
392 irq = bind_evtchn_to_irq(evtchn);
393 retval = request_irq(irq, handler, irqflags, devname, dev_id);
394 if (retval != 0) {
395 unbind_from_irq(irq);
396 return retval;
397 }
398
399 return irq;
400}
401EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
402
403int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
404 irqreturn_t (*handler)(int, void *),
405 unsigned long irqflags, const char *devname, void *dev_id)
406{
407 unsigned int irq;
408 int retval;
409
410 irq = bind_virq_to_irq(virq, cpu);
411 retval = request_irq(irq, handler, irqflags, devname, dev_id);
412 if (retval != 0) {
413 unbind_from_irq(irq);
414 return retval;
415 }
416
417 return irq;
418}
419EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
420
421int bind_ipi_to_irqhandler(enum ipi_vector ipi,
422 unsigned int cpu,
423 irq_handler_t handler,
424 unsigned long irqflags,
425 const char *devname,
426 void *dev_id)
427{
428 int irq, retval;
429
430 irq = bind_ipi_to_irq(ipi, cpu);
431 if (irq < 0)
432 return irq;
433
434 retval = request_irq(irq, handler, irqflags, devname, dev_id);
435 if (retval != 0) {
436 unbind_from_irq(irq);
437 return retval;
438 }
439
440 return irq;
441}
442
443void unbind_from_irqhandler(unsigned int irq, void *dev_id)
444{
445 free_irq(irq, dev_id);
446 unbind_from_irq(irq);
447}
448EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
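/*
 * Illustrative usage sketch (not part of the patch): how a consumer binds a
 * per-cpu VIRQ to a handler and tears it down again; the timer code added
 * elsewhere in this series uses the API in roughly this way.  The handler,
 * the saved-irq variable and the VIRQ_DEBUG choice are illustrative only.
 */
static int example_irq = -1;

static irqreturn_t example_virq_interrupt(int irq, void *dev_id)
{
	/* handle the event; the event channel itself is acked by the chip */
	return IRQ_HANDLED;
}

static void example_bind_virq(unsigned int cpu)
{
	example_irq = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
					      example_virq_interrupt,
					      0, "example", NULL);
	BUG_ON(example_irq < 0);
}

static void example_unbind_virq(void)
{
	if (example_irq >= 0)
		unbind_from_irqhandler(example_irq, NULL);
}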
449
450void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
451{
452 int irq = per_cpu(ipi_to_irq, cpu)[vector];
453 BUG_ON(irq < 0);
454 notify_remote_via_irq(irq);
455}
456
457
458/*
459 * Search the CPU's pending event bitmasks. For each one found, map
460 * the event number to an irq, and feed it into do_IRQ() for
461 * handling.
462 *
463 * Xen uses a two-level bitmap to speed searching. The first level is
464 * a bitset of words which contain pending event bits. The second
465 * level is a bitset of pending events themselves.
466 */
467fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
468{
469 int cpu = get_cpu();
470 struct shared_info *s = HYPERVISOR_shared_info;
471 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
472 unsigned long pending_words;
473
474 vcpu_info->evtchn_upcall_pending = 0;
475
476 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
477 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
478 while (pending_words != 0) {
479 unsigned long pending_bits;
480 int word_idx = __ffs(pending_words);
481 pending_words &= ~(1UL << word_idx);
482
483 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
484 int bit_idx = __ffs(pending_bits);
485 int port = (word_idx * BITS_PER_LONG) + bit_idx;
486 int irq = evtchn_to_irq[port];
487
488 if (irq != -1) {
489 regs->orig_eax = ~irq;
490 do_IRQ(regs);
491 }
492 }
493 }
494
495 put_cpu();
496}
497
498/* Rebind an evtchn so that it gets delivered to a specific cpu */
499static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
500{
501 struct evtchn_bind_vcpu bind_vcpu;
502 int evtchn = evtchn_from_irq(irq);
503
504 if (!VALID_EVTCHN(evtchn))
505 return;
506
507 /* Send future instances of this interrupt to other vcpu. */
508 bind_vcpu.port = evtchn;
509 bind_vcpu.vcpu = tcpu;
510
511 /*
512 * If this fails, it usually just indicates that we're dealing with a
513 * virq or IPI channel, which don't actually need to be rebound. Ignore
514 * it, but don't do the xenlinux-level rebind in that case.
515 */
516 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
517 bind_evtchn_to_cpu(evtchn, tcpu);
518}
519
520
521static void set_affinity_irq(unsigned irq, cpumask_t dest)
522{
523 unsigned tcpu = first_cpu(dest);
524 rebind_irq_to_cpu(irq, tcpu);
525}
526
527static void enable_dynirq(unsigned int irq)
528{
529 int evtchn = evtchn_from_irq(irq);
530
531 if (VALID_EVTCHN(evtchn))
532 unmask_evtchn(evtchn);
533}
534
535static void disable_dynirq(unsigned int irq)
536{
537 int evtchn = evtchn_from_irq(irq);
538
539 if (VALID_EVTCHN(evtchn))
540 mask_evtchn(evtchn);
541}
542
543static void ack_dynirq(unsigned int irq)
544{
545 int evtchn = evtchn_from_irq(irq);
546
547 move_native_irq(irq);
548
549 if (VALID_EVTCHN(evtchn))
550 clear_evtchn(evtchn);
551}
552
553static int retrigger_dynirq(unsigned int irq)
554{
555 int evtchn = evtchn_from_irq(irq);
556 int ret = 0;
557
558 if (VALID_EVTCHN(evtchn)) {
559 set_evtchn(evtchn);
560 ret = 1;
561 }
562
563 return ret;
564}
565
566static struct irq_chip xen_dynamic_chip __read_mostly = {
567 .name = "xen-dyn",
568 .mask = disable_dynirq,
569 .unmask = enable_dynirq,
570 .ack = ack_dynirq,
571 .set_affinity = set_affinity_irq,
572 .retrigger = retrigger_dynirq,
573};
574
575void __init xen_init_IRQ(void)
576{
577 int i;
578
579 init_evtchn_cpu_bindings();
580
581 /* No event channels are 'live' right now. */
582 for (i = 0; i < NR_EVENT_CHANNELS; i++)
583 mask_evtchn(i);
584
585 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
586 for (i = 0; i < NR_IRQS; i++)
587 irq_bindcount[i] = 0;
588
589 irq_ctx_init(smp_processor_id());
590}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
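
A minimal usage sketch, not part of the patch: once xen_setup_features() has populated xen_features[], callers test individual flags through the xen_feature() accessor declared in <xen/features.h>, as enlighten.c does for XENFEAT_auto_translated_physmap and XENFEAT_supervisor_mode_kernel. The function name below is hypothetical.

#include <linux/kernel.h>
#include <xen/features.h>

static void example_report_features(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		printk(KERN_INFO "Xen translates pfn<->mfn for this guest\n");

	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		printk(KERN_INFO "kernel runs in true ring 0\n");
}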
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle external requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..4ae038aa6c24
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63 unsigned offset = address & ~PAGE_MASK; /* offset within the page */
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given physical page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval)
156{
157 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs;
160 mcs = xen_mc_entry(0);
161
162 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163 xen_mc_issue(PARAVIRT_LAZY_MMU);
164 return;
165 } else
166 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167 return;
168 }
169 xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194	smp_wmb();		/* high must land before low, which carries the present bit */
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below the passed limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367				if ((pmd_limit-1) < (addr-1)) {	/* addr > pmd_limit, wrap-safe */
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416 struct multicall_space mcs;
417 struct mmuext_op *op;
418
419 xen_mc_batch();
420
421 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422 /* re-enable interrupts for kmap_flush_unused */
423 xen_mc_issue(0);
424 kmap_flush_unused();
425 xen_mc_batch();
426 }
427
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434 op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439 xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489	pgd_walk(pgd, unpin_page, TASK_SIZE);	/* queued after the unpin op, so the frames are unpinned before being remapped RW */
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another cpu may still have its %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetables, which means that the
543 * hypervisor forces it to be read-only, and it controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to use init_mm,
548 * unpin the old process's pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may be still using the
552 * pagetable because of lazy tlb flushing. This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
557 get_cpu(); /* make sure we don't move around */
558 drop_mm_ref(mm);
559 put_cpu();
560
561 spin_lock(&mm->page_table_lock);
562 xen_pgd_unpin(mm->pgd);
563 spin_unlock(&mm->page_table_lock);
564}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
2#define _XEN_MMU_H
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
17
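/*
 * A worked illustration of the packing, assuming 4k pages: for a pagetable
 * whose machine frame number is 0x00180000 (a frame at machine address
 * 0x180000000, above 4GB):
 *
 *   xen_pfn_to_cr3(0x00180000)
 *       = (0x00180000 << 12) | (0x00180000 >> 20)    (32-bit arithmetic)
 *       = 0x80000000 | 0x1
 *       = 0x80000001
 *
 *   xen_cr3_to_pfn(0x80000001)
 *       = (0x80000001 >> 12) | (0x80000001 << 20)    (32-bit arithmetic)
 *       = 0x00080000 | 0x00100000
 *       = 0x00180000
 *
 * so the frame bits that overflow a 32-bit cr3 round-trip through its low
 * 12 bits.
 */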
18
19void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for any data which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
29#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31
32struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH];
34 u64 args[MC_ARGS];
35 unsigned mcidx, argidx;
36};
37
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
39DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
41void xen_mc_flush(void)
42{
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0;
45 unsigned long flags;
46
47 BUG_ON(preemptible());
48
49 /* Disable interrupts in case someone comes in and queues
50 something in the middle */
51 local_irq_save(flags);
52
53 if (b->mcidx) {
54 int i;
55
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG();
58 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0)
60 ret++;
61 b->mcidx = 0;
62 b->argidx = 0;
63 } else
64 BUG_ON(b->argidx != 0);
65
66 local_irq_restore(flags);
67
68 BUG_ON(ret);
69}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
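/*
 * Illustrative sketch only (kept out of the build with #if 0): the typical
 * way these helpers are used, mirroring xen_set_pmd() in mmu.c.  The helper
 * name below is made up for illustration; everything else it calls already
 * exists in this patch (xen_mc_entry/xen_mc_issue are in multicalls.h).
 */
#if 0
static void example_queue_mmu_update(pmd_t *ptr, pmd_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	/* reserve a multicall slot plus argument space; irqs stay disabled
	   until the batch is issued */
	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pmd_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	/* flush now unless we are inside a lazy-MMU region */
	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
#endif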
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H
3
4#include "xen-ops.h"
5
6/* Multicalls */
7struct multicall_space
8{
9 struct multicall_entry *mc;
10 void *args;
11};
12
13/* Allocate room for a multicall and its args */
14struct multicall_space __xen_mc_entry(size_t args);
15
16DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
17
18/* Call to start a batch of multiple __xen_mc_entry()s. Must be
19 paired with xen_mc_issue() */
20static inline void xen_mc_batch(void)
21{
22 /* need to disable interrupts until this entry is complete */
23 local_irq_save(__get_cpu_var(xen_mc_irq_flags));
24}
25
26static inline struct multicall_space xen_mc_entry(size_t args)
27{
28 xen_mc_batch();
29 return __xen_mc_entry(args);
30}
31
32/* Flush all pending multicalls */
33void xen_mc_flush(void);
34
35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode)
37{
38 if ((xen_get_lazy_mode() & mode) == 0)
39 xen_mc_flush();
40
41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43}
44
45#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 000000000000..2fe6eac510f0
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,96 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22
23/* These are code, but not functions. Defined in entry.S */
24extern const char xen_hypervisor_callback[];
25extern const char xen_failsafe_callback[];
26
27unsigned long *phys_to_machine_mapping;
28EXPORT_SYMBOL(phys_to_machine_mapping);
29
30/**
31 * xen_memory_setup - Hook for machine specific memory setup.
32 **/
33
34char * __init xen_memory_setup(void)
35{
36 unsigned long max_pfn = xen_start_info->nr_pages;
37
38 e820.nr_map = 0;
39 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
40
41 return "Xen";
42}
43
44static void xen_idle(void)
45{
46 local_irq_disable();
47
48 if (need_resched())
49 local_irq_enable();
50 else {
51 current_thread_info()->status &= ~TS_POLLING;
52 smp_mb__after_clear_bit();
53 safe_halt();
54 current_thread_info()->status |= TS_POLLING;
55 }
56}
57
58void __init xen_arch_setup(void)
59{
60 struct physdev_set_iopl set_iopl;
61 int rc;
62
63 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
64 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
65
66 if (!xen_feature(XENFEAT_auto_translated_physmap))
67 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
68
69 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
70 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
71
72 set_iopl.iopl = 1;
73 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
74 if (rc != 0)
75 printk(KERN_INFO "physdev_op failed %d\n", rc);
76
77#ifdef CONFIG_ACPI
78 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
79 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
80 disable_acpi();
81 }
82#endif
83
84 memcpy(boot_command_line, xen_start_info->cmd_line,
85 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
86 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
87
88 pm_idle = xen_idle;
89
90#ifdef CONFIG_SMP
91 /* fill cpus_possible with all available cpus */
92 xen_fill_possible_map();
93#endif
94
95 paravirt_disable_iospace();
96}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
38static cpumask_t cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq);
40static DEFINE_PER_CPU(int, callfunc_irq);
41
42/*
43 * Structure and data for smp_call_function(). This is designed to minimise
44 * static memory requirements. It also looks cleaner.
45 */
46static DEFINE_SPINLOCK(call_lock);
47
48struct call_data_struct {
49 void (*func) (void *info);
50 void *info;
51 atomic_t started;
52 atomic_t finished;
53 int wait;
54};
55
56static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
57
58static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
65static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
66{
67 return IRQ_HANDLED;
68}
69
70static __cpuinit void cpu_bringup_and_idle(void)
71{
72 int cpu = smp_processor_id();
73
74 cpu_init();
75
76 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE;
78
79 xen_setup_cpu_clockevents();
80
81 /* We can take interrupts now: we're officially "up". */
82 local_irq_enable();
83
84 wmb(); /* make sure everything is out */
85 cpu_idle();
86}
87
88static int xen_smp_intr_init(unsigned int cpu)
89{
90 int rc;
91 const char *resched_name, *callfunc_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
97 cpu,
98 xen_reschedule_interrupt,
99 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
100 resched_name,
101 NULL);
102 if (rc < 0)
103 goto fail;
104 per_cpu(resched_irq, cpu) = rc;
105
106 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
107 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
108 cpu,
109 xen_call_function_interrupt,
110 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
111 callfunc_name,
112 NULL);
113 if (rc < 0)
114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc;
116
117 return 0;
118
119 fail:
120 if (per_cpu(resched_irq, cpu) >= 0)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
124 return rc;
125}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
138void __init xen_smp_prepare_boot_cpu(void)
139{
140 int cpu;
141
142 BUG_ON(smp_processor_id() != 0);
143 native_smp_prepare_boot_cpu();
144
145 /* We've switched to the "real" per-cpu gdt, so make sure the
146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) {
150 cpus_clear(cpu_sibling_map[cpu]);
151 cpus_clear(cpu_core_map[cpu]);
152 }
153
154 xen_setup_vcpu_info_placement();
155}
156
157void __init xen_smp_prepare_cpus(unsigned int max_cpus)
158{
159 unsigned cpu;
160
161 for (cpu = 0; cpu < NR_CPUS; cpu++) {
162 cpus_clear(cpu_sibling_map[cpu]);
163 cpus_clear(cpu_core_map[cpu]);
164 }
165
166 smp_store_cpu_info(0);
167 set_cpu_sibling_map(0);
168
169 if (xen_smp_intr_init(0))
170 BUG();
171
172 cpu_initialized_map = cpumask_of_cpu(0);
173
174 /* Restrict the possible_map according to max_cpus. */
175 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
176 for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
177 continue;
178 cpu_clear(cpu, cpu_possible_map);
179 }
180
181 for_each_possible_cpu (cpu) {
182 struct task_struct *idle;
183
184 if (cpu == 0)
185 continue;
186
187 idle = fork_idle(cpu);
188 if (IS_ERR(idle))
189 panic("failed fork for CPU %d", cpu);
190
191 cpu_set(cpu, cpu_present_map);
192 }
193
194 //init_xenbus_allowed_cpumask();
195}
196
197static __cpuinit int
198cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
199{
200 struct vcpu_guest_context *ctxt;
201 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
202
203 if (cpu_test_and_set(cpu, cpu_initialized_map))
204 return 0;
205
206 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
207 if (ctxt == NULL)
208 return -ENOMEM;
209
210 ctxt->flags = VGCF_IN_KERNEL;
211 ctxt->user_regs.ds = __USER_DS;
212 ctxt->user_regs.es = __USER_DS;
213 ctxt->user_regs.fs = __KERNEL_PERCPU;
214 ctxt->user_regs.gs = 0;
215 ctxt->user_regs.ss = __KERNEL_DS;
216 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
217 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
218
219 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
220
221 xen_copy_trap_info(ctxt->trap_ctxt);
222
223 ctxt->ldt_ents = 0;
224
225 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
226 make_lowmem_page_readonly(gdt->gdt);
227
228 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
229 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
230
231 ctxt->user_regs.cs = __KERNEL_CS;
232 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
233
234 ctxt->kernel_ss = __KERNEL_DS;
235 ctxt->kernel_sp = idle->thread.esp0;
236
237 ctxt->event_callback_cs = __KERNEL_CS;
238 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
239 ctxt->failsafe_callback_cs = __KERNEL_CS;
240 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
241
242 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
243 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
244
245 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
246 BUG();
247
248 kfree(ctxt);
249 return 0;
250}
251
252int __cpuinit xen_cpu_up(unsigned int cpu)
253{
254 struct task_struct *idle = idle_task(cpu);
255 int rc;
256
257#if 0
258 rc = cpu_up_check(cpu);
259 if (rc)
260 return rc;
261#endif
262
263 init_gdt(cpu);
264 per_cpu(current_task, cpu) = idle;
265 irq_ctx_init(cpu);
266 xen_setup_timer(cpu);
267
268 /* make sure interrupts start blocked */
269 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
270
271 rc = cpu_initialize_context(cpu, idle);
272 if (rc)
273 return rc;
274
275 if (num_online_cpus() == 1)
276 alternatives_smp_switch(1);
277
278 rc = xen_smp_intr_init(cpu);
279 if (rc)
280 return rc;
281
282 smp_store_cpu_info(cpu);
283 set_cpu_sibling_map(cpu);
284 /* This must be done before setting cpu_online_map */
285 wmb();
286
287 cpu_set(cpu, cpu_online_map);
288
289 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
290 BUG_ON(rc);
291
292 return 0;
293}
294
295void xen_smp_cpus_done(unsigned int max_cpus)
296{
297}
298
299static void stop_self(void *v)
300{
301 int cpu = smp_processor_id();
302
303 /* make sure we're not pinning something down */
304 load_cr3(swapper_pg_dir);
305 /* should set up a minimal gdt */
306
307 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
308 BUG();
309}
310
311void xen_smp_send_stop(void)
312{
313 smp_call_function(stop_self, NULL, 0, 0);
314}
315
316void xen_smp_send_reschedule(int cpu)
317{
318 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
319}
320
321
322static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
323{
324 unsigned cpu;
325
326 cpus_and(mask, mask, cpu_online_map);
327
328 for_each_cpu_mask(cpu, mask)
329 xen_send_IPI_one(cpu, vector);
330}
331
332static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
333{
334 void (*func) (void *info) = call_data->func;
335 void *info = call_data->info;
336 int wait = call_data->wait;
337
338 /*
339 * Notify initiating CPU that I've grabbed the data and am
340 * about to execute the function
341 */
342 mb();
343 atomic_inc(&call_data->started);
344 /*
345 * At this point the info structure may be out of scope unless wait==1
346 */
347 irq_enter();
348 (*func)(info);
349 irq_exit();
350
351 if (wait) {
352 mb(); /* commit everything before setting finished */
353 atomic_inc(&call_data->finished);
354 }
355
356 return IRQ_HANDLED;
357}
358
359int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
360 void *info, int wait)
361{
362 struct call_data_struct data;
363 int cpus;
364
365 /* Holding any lock stops cpus from going down. */
366 spin_lock(&call_lock);
367
368 cpu_clear(smp_processor_id(), mask);
369
370 cpus = cpus_weight(mask);
371 if (!cpus) {
372 spin_unlock(&call_lock);
373 return 0;
374 }
375
376 /* Can deadlock when called with interrupts disabled */
377 WARN_ON(irqs_disabled());
378
379 data.func = func;
380 data.info = info;
381 atomic_set(&data.started, 0);
382 data.wait = wait;
383 if (wait)
384 atomic_set(&data.finished, 0);
385
386 call_data = &data;
387 mb(); /* write everything before IPI */
388
389 /* Send a message to other CPUs and wait for them to respond */
390 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
391
392 /* Make sure other vcpus get a chance to run.
393 XXX too severe? Maybe we should check the other CPU's states? */
394 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
395
396 /* Wait for response */
397 while (atomic_read(&data.started) != cpus ||
398 (wait && atomic_read(&data.finished) != cpus))
399 cpu_relax();
400
401 spin_unlock(&call_lock);
402
403 return 0;
404}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 000000000000..51fdabf1fd4d
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,590 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
25#define XEN_SHIFT 22
26
27/* Xen may fire a timer up to this many ns early */
28#define TIMER_SLOP 100000
29#define NS_PER_TICK (1000000000LL / HZ)
30
31static cycle_t xen_clocksource_read(void);
32
33/* These are periodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46
47/* snapshots of runstate info */
48static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49
50/* unused ns of stolen and blocked time */
51static DEFINE_PER_CPU(u64, residual_stolen);
52static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return a consistent snapshot of a 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
86static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87{
88 u64 state_time;
89 struct vcpu_runstate_info *state;
90
91 BUG_ON(preemptible());
92
93 state = &__get_cpu_var(runstate);
94
95 /*
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
99 */
100 do {
101 state_time = get64(&state->state_entry_time);
102 barrier();
103 *res = *state;
104 barrier();
105 } while (get64(&state->state_entry_time) != state_time);
106}
107
108static void setup_runstate_info(int cpu)
109{
110 struct vcpu_register_runstate_memory_area area;
111
112 area.addr.v = &per_cpu(runstate, cpu);
113
114 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 cpu, &area))
116 BUG();
117}
118
119static void do_stolen_accounting(void)
120{
121 struct vcpu_runstate_info state;
122 struct vcpu_runstate_info *snap;
123 s64 blocked, runnable, offline, stolen;
124 cputime_t ticks;
125
126 get_runstate_snapshot(&state);
127
128 WARN_ON(state.state != RUNSTATE_running);
129
130 snap = &__get_cpu_var(runstate_snapshot);
131
132 /* work out how much time the VCPU has not been runn*ing* */
133 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136
137 *snap = state;
138
139 /* Add the appropriate number of ticks of stolen time,
140 including any left-overs from last time. Passing NULL to
141 account_steal_time accounts the time as stolen. */
142 stolen = runnable + offline + __get_cpu_var(residual_stolen);
143
144 if (stolen < 0)
145 stolen = 0;
146
147 ticks = 0;
148 while (stolen >= NS_PER_TICK) {
149 ticks++;
150 stolen -= NS_PER_TICK;
151 }
152 __get_cpu_var(residual_stolen) = stolen;
153 account_steal_time(NULL, ticks);
154
155 /* Add the appropriate number of ticks of blocked time,
156 including any left-overs from last time. Passing idle to
157 account_steal_time accounts the time as idle/wait. */
158 blocked += __get_cpu_var(residual_blocked);
159
160 if (blocked < 0)
161 blocked = 0;
162
163 ticks = 0;
164 while (blocked >= NS_PER_TICK) {
165 ticks++;
166 blocked -= NS_PER_TICK;
167 }
168 __get_cpu_var(residual_blocked) = blocked;
169 account_steal_time(idle_task(smp_processor_id()), ticks);
170}
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
177unsigned long long xen_sched_clock(void)
178{
179 struct vcpu_runstate_info state;
180 cycle_t now;
181 u64 ret;
182 s64 offset;
183
184 /*
185 * Ideally sched_clock should be called on a per-cpu basis
186 * anyway, so preempt should already be disabled, but that's
187 * not current practice at the moment.
188 */
189 preempt_disable();
190
191 now = xen_clocksource_read();
192
193 get_runstate_snapshot(&state);
194
195 WARN_ON(state.state != RUNSTATE_running);
196
197 offset = now - state.state_entry_time;
198 if (offset < 0)
199 offset = 0;
200
201 ret = state.time[RUNSTATE_blocked] +
202 state.time[RUNSTATE_running] +
203 offset;
204
205 preempt_enable();
206
207 return ret;
208}
209
210
211/* Get the CPU speed from Xen */
212unsigned long xen_cpu_khz(void)
213{
214 u64 cpu_khz = 1000000ULL << 32;
215 const struct vcpu_time_info *info =
216 &HYPERVISOR_shared_info->vcpu_info[0].time;
217
218 do_div(cpu_khz, info->tsc_to_system_mul);
219 if (info->tsc_shift < 0)
220 cpu_khz <<= -info->tsc_shift;
221 else
222 cpu_khz >>= info->tsc_shift;
223
224 return cpu_khz;
225}
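/*
 * Where the arithmetic comes from: Xen reports system time (ns) as roughly
 *
 *     ns = ((tsc << tsc_shift) * tsc_to_system_mul) >> 32
 *
 * so one TSC tick is (tsc_to_system_mul * 2^tsc_shift) / 2^32 ns, and the
 * TSC frequency in kHz works out to
 *
 *     10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift)
 *
 * which is what the do_div() plus the shift adjustment above compute.
 */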
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
231static unsigned get_time_values_from_xen(void)
232{
233 struct vcpu_time_info *src;
234 struct shadow_time_info *dst;
235
236 /* src is shared memory with the hypervisor, so we need to
237 make sure we get a consistent snapshot, even in the face of
238 being preempted. */
239 src = &__get_cpu_var(xen_vcpu)->time;
240 dst = &__get_cpu_var(shadow_time);
241
242 do {
243 dst->version = src->version;
244 rmb(); /* fetch version before data */
245 dst->tsc_timestamp = src->tsc_timestamp;
246 dst->system_timestamp = src->system_time;
247 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
248 dst->tsc_shift = src->tsc_shift;
249 rmb(); /* test version after fetching data */
250 } while ((src->version & 1) | (dst->version ^ src->version));
251
252 return dst->version;
253}
254
255/*
256 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
259static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260{
261 u64 product;
262#ifdef __i386__
263 u32 tmp1, tmp2;
264#endif
265
266 if (shift < 0)
267 delta >>= -shift;
268 else
269 delta <<= shift;
270
271#ifdef __i386__
272 __asm__ (
273 "mul %5 ; "
274 "mov %4,%%eax ; "
275 "mov %%edx,%4 ; "
276 "mul %5 ; "
277 "xor %5,%5 ; "
278 "add %4,%%eax ; "
279 "adc %5,%%edx ; "
280 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282#elif __x86_64__
283 __asm__ (
284 "mul %%rdx ; shrd $32,%%rdx,%%rax"
285 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286#else
287#error implement me!
288#endif
289
290 return product;
291}
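/*
 * Illustrative sketch only (kept out of the build with #if 0): a portable
 * rendering of what the inline assembly above computes, assuming a compiler
 * that provides __int128.
 */
#if 0
static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32 multiply; return the product shifted down by 32 bits */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}
#endif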
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
301static cycle_t xen_clocksource_read(void)
302{
303 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 cycle_t ret;
305 unsigned version;
306
307 do {
308 version = get_time_values_from_xen();
309 barrier();
310 ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 barrier();
312 } while (version != __get_cpu_var(xen_vcpu)->time.version);
313
314 put_cpu_var(shadow_time);
315
316 return ret;
317}
318
319static void xen_read_wallclock(struct timespec *ts)
320{
321 const struct shared_info *s = HYPERVISOR_shared_info;
322 u32 version;
323 u64 delta;
324 struct timespec now;
325
326 /* get wallclock at system boot */
327 do {
328 version = s->wc_version;
329 rmb(); /* fetch version before time */
330 now.tv_sec = s->wc_sec;
331 now.tv_nsec = s->wc_nsec;
332 rmb(); /* fetch time before checking version */
333 } while ((s->wc_version & 1) | (version ^ s->wc_version));
334
335 delta = xen_clocksource_read(); /* time since system boot */
336 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337
338 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 now.tv_sec = delta;
340
341 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342}
343
344unsigned long xen_get_wallclock(void)
345{
346 struct timespec ts;
347
348 xen_read_wallclock(&ts);
349
350 return ts.tv_sec;
351}
352
353int xen_set_wallclock(unsigned long now)
354{
355 /* do nothing for domU */
356 return -1;
357}
358
359static struct clocksource xen_clocksource __read_mostly = {
360 .name = "xen",
361 .rating = 400,
362 .read = xen_clocksource_read,
363 .mask = ~0,
364 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
365 .shift = XEN_SHIFT,
366 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
367};
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376 single-shot timer with nanosecond resolution. However, a 100Hz tick,
377 delivered while the vcpu is running, shares the same event channel.
378 We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396 apply that to the kernel's absolute timeout. Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
400static s64 get_abs_timeout(unsigned long delta)
401{
402 return xen_clocksource_read() + delta;
403}
404
405static void xen_timerop_set_mode(enum clock_event_mode mode,
406 struct clock_event_device *evt)
407{
408 switch (mode) {
409 case CLOCK_EVT_MODE_PERIODIC:
410 /* unsupported */
411 WARN_ON(1);
412 break;
413
414 case CLOCK_EVT_MODE_ONESHOT:
415 break;
416
417 case CLOCK_EVT_MODE_UNUSED:
418 case CLOCK_EVT_MODE_SHUTDOWN:
419 HYPERVISOR_set_timer_op(0); /* cancel timeout */
420 break;
421 }
422}
423
424static int xen_timerop_set_next_event(unsigned long delta,
425 struct clock_event_device *evt)
426{
427 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
428
429 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
430 BUG();
431
432 /* We may have missed the deadline, but there's no real way of
433 knowing for sure. If the event was in the past, then we'll
434 get an immediate interrupt. */
435
436 return 0;
437}
438
439static const struct clock_event_device xen_timerop_clockevent = {
440 .name = "xen",
441 .features = CLOCK_EVT_FEAT_ONESHOT,
442
443 .max_delta_ns = 0xffffffff,
444 .min_delta_ns = TIMER_SLOP,
445
446 .mult = 1,
447 .shift = 0,
448 .rating = 500,
449
450 .set_mode = xen_timerop_set_mode,
451 .set_next_event = xen_timerop_set_next_event,
452};
453
454
455
456static void xen_vcpuop_set_mode(enum clock_event_mode mode,
457 struct clock_event_device *evt)
458{
459 int cpu = smp_processor_id();
460
461 switch (mode) {
462 case CLOCK_EVT_MODE_PERIODIC:
463 WARN_ON(1); /* unsupported */
464 break;
465
466 case CLOCK_EVT_MODE_ONESHOT:
467 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
468 BUG();
469 break;
470
471 case CLOCK_EVT_MODE_UNUSED:
472 case CLOCK_EVT_MODE_SHUTDOWN:
473 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
474 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
475 BUG();
476 break;
477 }
478}
479
480static int xen_vcpuop_set_next_event(unsigned long delta,
481 struct clock_event_device *evt)
482{
483 int cpu = smp_processor_id();
484 struct vcpu_set_singleshot_timer single;
485 int ret;
486
487 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
488
489 single.timeout_abs_ns = get_abs_timeout(delta);
490 single.flags = VCPU_SSHOTTMR_future;
491
492 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
493
494 BUG_ON(ret != 0 && ret != -ETIME);
495
496 return ret;
497}
498
499static const struct clock_event_device xen_vcpuop_clockevent = {
500 .name = "xen",
501 .features = CLOCK_EVT_FEAT_ONESHOT,
502
503 .max_delta_ns = 0xffffffff,
504 .min_delta_ns = TIMER_SLOP,
505
506 .mult = 1,
507 .shift = 0,
508 .rating = 500,
509
510 .set_mode = xen_vcpuop_set_mode,
511 .set_next_event = xen_vcpuop_set_next_event,
512};
513
514static const struct clock_event_device *xen_clockevent =
515 &xen_timerop_clockevent;
516static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
517
518static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
519{
520 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
521 irqreturn_t ret;
522
523 ret = IRQ_NONE;
524 if (evt->event_handler) {
525 evt->event_handler(evt);
526 ret = IRQ_HANDLED;
527 }
528
529 do_stolen_accounting();
530
531 return ret;
532}
533
534void xen_setup_timer(int cpu)
535{
536 const char *name;
537 struct clock_event_device *evt;
538 int irq;
539
540 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
541
542 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
543 if (!name)
544 name = "<timer kasprintf failed>";
545
546 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
547 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
548 name, NULL);
549
550 evt = &per_cpu(xen_clock_events, cpu);
551 memcpy(evt, xen_clockevent, sizeof(*evt));
552
553 evt->cpumask = cpumask_of_cpu(cpu);
554 evt->irq = irq;
555
556 setup_runstate_info(cpu);
557}
558
559void xen_setup_cpu_clockevents(void)
560{
561 BUG_ON(preemptible());
562
563 clockevents_register_device(&__get_cpu_var(xen_clock_events));
564}
565
566__init void xen_time_init(void)
567{
568 int cpu = smp_processor_id();
569
570 get_time_values_from_xen();
571
572 clocksource_register(&xen_clocksource);
573
574 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
575 /* Successfully turned off 100Hz tick, so we have the
576 vcpuop-based timer interface */
577 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
578 xen_clockevent = &xen_vcpuop_clockevent;
579 }
580
581 /* Set initial system time with full resolution */
582 xen_read_wallclock(&xtime);
583 set_normalized_timespec(&wall_to_monotonic,
584 -xtime.tv_sec, -xtime.tv_nsec);
585
586 tsc_disable = 0;
587
588 xen_setup_timer(cpu);
589 xen_setup_cpu_clockevents();
590}
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
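/*
 * How these are used (best-effort reading): the code between ENTRY(x) and
 * ENDPATCH(x) is the chunk that paravirt patching may copy inline into a
 * call site; x_end marks its end, and x_reloc records the one spot inside
 * it (typically the operand of a relative call) that must be fixed up after
 * copying.  A reloc of 0 means the chunk needs no fixup.
 */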
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with a single 'and' operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36	/* Clear mask and test pending: this one word covers both the pending byte (low) and the mask byte (high) */
37 andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
38 /* Preempt here doesn't matter because that will deal with
39 any pending interrupts. The pending check may end up being
40 run on the wrong CPU, but that doesn't hurt. */
41 jz 1f
422: call check_events
431:
44ENDPATCH(xen_irq_enable_direct)
45 ret
46 ENDPROC(xen_irq_enable_direct)
47 RELOC(xen_irq_enable_direct, 2b+1)
48
49
50/*
51 Disabling events is simply a matter of making the event mask
52 non-zero.
53 */
54ENTRY(xen_irq_disable_direct)
55 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
56ENDPATCH(xen_irq_disable_direct)
57 ret
58 ENDPROC(xen_irq_disable_direct)
59 RELOC(xen_irq_disable_direct, 0)
60
61/*
62 (xen_)save_fl is used to get the current interrupt enable status.
63 Callers expect the status to be in X86_EFLAGS_IF, and other bits
64 may be set in the return value. We take advantage of this by
65 making sure that X86_EFLAGS_IF has the right value (and other bits
66 in that byte are 0), but other bits in the return value are
67 undefined. We need to toggle the state of the bit, because
68 Xen and x86 use opposite senses (mask vs enable).
69 */
70ENTRY(xen_save_fl_direct)
71 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
72 setz %ah
73 addb %ah,%ah
74ENDPATCH(xen_save_fl_direct)
75 ret
76 ENDPROC(xen_save_fl_direct)
77 RELOC(xen_save_fl_direct, 0)
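/*
 * The setz/addb pair builds the return value arithmetically: X86_EFLAGS_IF
 * is 0x200, i.e. bit 1 of %ah.  If the mask byte is zero (events enabled),
 * setz puts 1 in %ah and addb doubles it to 2, yielding 0x200 in the
 * result; otherwise %ah stays 0 and IF reads as clear.  The rest of %eax is
 * undefined, as noted above.
 */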
78
79
80/*
81 In principle the caller should be passing us a value returned
82 from xen_save_fl_direct, but for robustness' sake we test only
83 the X86_EFLAGS_IF flag rather than the whole byte. After
84 setting the interrupt mask state, it checks for unmasked
85 pending events and enters the hypervisor to get them delivered
86 if so.
87 */
88ENTRY(xen_restore_fl_direct)
89 testb $X86_EFLAGS_IF>>8, %ah
90 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
91 /* Preempt here doesn't matter because that will deal with
92 any pending interrupts. The pending check may end up being
93 run on the wrong CPU, but that doesn't hurt. */
94
95 /* check for unmasked and pending */
96 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
97 jz 1f
982: call check_events
991:
100ENDPATCH(xen_restore_fl_direct)
101 ret
102 ENDPROC(xen_restore_fl_direct)
103 RELOC(xen_restore_fl_direct, 2b+1)
104
105/*
106 This is run where a normal iret would be run, with the same stack setup:
107 8: eflags
108 4: cs
109 esp-> 0: eip
110
111 This attempts to make sure that any pending events are dealt
112 with on return to usermode, but there is a small window in
113 which an event can happen just before entering usermode. If
114 the nested interrupt ends up setting one of the TIF_WORK_MASK
115 pending work flags, they will not be tested again before
116 returning to usermode. This means that a process can end up
117 with pending work, which will be unprocessed until the process
118 enters and leaves the kernel again, which could be an
119 unbounded amount of time. This means that a pending signal or
120 reschedule event could be indefinitely delayed.
121
122 The fix is to notice a nested interrupt in the critical
123 window, and if one occurs, then fold the nested interrupt into
124 the current interrupt stack frame, and re-process it
125 iteratively rather than recursively. This means that it will
126 exit via the normal path, and all pending work will be dealt
127 with appropriately.
128
129 Because the nested interrupt handler needs to deal with the
130 current stack state in whatever form it's in, we keep things
131 simple by only using a single register which is pushed/popped
132 on the stack.
133
134 Non-direct iret could be done in the same way, but it would
135 require an annoying amount of code duplication. We'll assume
136 that direct mode will be the common case once the hypervisor
137 support becomes commonplace.
138 */
139ENTRY(xen_iret_direct)
140 /* test eflags for special cases */
141 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
142 jnz hyper_iret
143
144 push %eax
145 ESP_OFFSET=4 # bytes pushed onto stack
146
147 /* Store vcpu_info pointer for easy access. Do it this
148 way to avoid having to reload %fs */
149#ifdef CONFIG_SMP
150 GET_THREAD_INFO(%eax)
151 movl TI_cpu(%eax),%eax
152 movl __per_cpu_offset(,%eax,4),%eax
153 lea per_cpu__xen_vcpu_info(%eax),%eax
154#else
155 movl $per_cpu__xen_vcpu_info, %eax
156#endif
157
158 /* check IF state we're restoring */
159 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
160
161 /* Maybe enable events. Once this happens we could get a
162 recursive event, so the critical region starts immediately
163 afterwards. However, if that happens we don't end up
164 resuming the code, so we don't have to be worried about
165 being preempted to another CPU. */
166 setz XEN_vcpu_info_mask(%eax)
167xen_iret_start_crit:
168
169 /* check for unmasked and pending */
170 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
171
172 /* If there's something pending, mask events again so we
173 can jump back into xen_hypervisor_callback */
174 sete XEN_vcpu_info_mask(%eax)
175
176 popl %eax
177
178 /* From this point on the registers are restored and the stack
179 updated, so we don't need to worry about it if we're preempted */
180iret_restore_end:
181
182 /* Jump to hypervisor_callback after fixing up the stack.
183 Events are masked, so jumping out of the critical
184 region is OK. */
185 je xen_hypervisor_callback
186
187 iret
188xen_iret_end_crit:
189
190hyper_iret:
191	/* put this out of line since it's very rarely used */
192 jmp hypercall_page + __HYPERVISOR_iret * 32
193
194 .globl xen_iret_start_crit, xen_iret_end_crit
195
196/*
197 This is called by xen_hypervisor_callback in entry.S when it sees
198 that the EIP at the time of interrupt was between xen_iret_start_crit
199 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
200 a more refined determination of what to do.
201
202 The stack format at this point is:
203 ----------------
204 ss : (ss/esp may be present if we came from usermode)
205 esp :
206 eflags } outer exception info
207 cs }
208 eip }
209 ---------------- <- edi (copy dest)
210 eax : outer eax if it hasn't been restored
211 ----------------
212 eflags } nested exception info
213 cs } (no ss/esp because we're nested
214 eip } from the same ring)
215 orig_eax }<- esi (copy src)
216 - - - - - - - -
217 fs }
218 es }
219 ds } SAVE_ALL state
220 eax }
221 : :
222 ebx }
223 ----------------
224 return addr <- esp
225 ----------------
226
227 In order to deliver the nested exception properly, we need to shift
228 everything from the return addr up to the error code so it
229 sits just under the outer exception info. This means that when we
230 handle the exception, we do it in the context of the outer exception
231 rather than starting a new one.
232
233 The only caveat is that if the outer eax hasn't been
234 restored yet (ie, it's still on stack), we need to insert
235 its value into the SAVE_ALL state before going on, since
236 it's usermode state which we eventually need to restore.
237 */
238ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /*
242 Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely?
246 Definitely. Easy to avoid? Yes. The Intel documents
247 explicitly say that the reported EIP for a bad jump is the
248 jump instruction itself, not the destination, but some virtual
249 environments get this wrong.
250 */
251 movl PT_CS+4(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx
254 je 2f
255
256 lea PT_ORIG_EAX+4(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi
258
259 /* If eip is before iret_restore_end then stack
260 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax
262 jae 1f
263
264 movl 0+4(%edi),%eax /* copy EAX */
265 movl %eax, PT_EAX+4(%esp)
266
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268
269 /* set up the copy */
2701: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
272 rep movsl
273 cld
274
275 lea 4(%edi),%esp /* point esp to new frame */
2762: ret
277
278
279/*
280 Force an event check by making a hypercall,
281 but preserve regs before making the call.
282 */
283check_events:
284 push %eax
285 push %ecx
286 push %edx
287 call force_evtchn_callback
288 pop %edx
289 pop %ecx
290 pop %eax
291 ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 000000000000..2998d55a0017
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
1/* Xen-specific pieces of head.S, intended to be included in the right
2 place in head.S */
3
4#ifdef CONFIG_XEN
5
6#include <linux/elfnote.h>
7#include <asm/boot.h>
8#include <xen/interface/elfnote.h>
9
10ENTRY(startup_xen)
11 movl %esi,xen_start_info
12 cld
13 movl $(init_thread_union+THREAD_SIZE),%esp
14 jmp xen_start_kernel
15
16.pushsection ".bss.page_aligned"
17 .align PAGE_SIZE_asm
18ENTRY(hypercall_page)
19 .skip 0x1000
20.popsection
21
22 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
23 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
24 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
25 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
26 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
27 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
28 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
29#ifdef CONFIG_X86_PAE
30 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
31#else
32 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
33#endif
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
35
36#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
1#ifndef XEN_OPS_H
2#define XEN_OPS_H
3
4#include <linux/init.h>
5
6/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[];
8extern const char xen_failsafe_callback[];
9
10void xen_copy_trap_info(struct trap_info *traps);
11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3);
14
15extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info;
17
18char * __init xen_memory_setup(void);
19void __init xen_arch_setup(void);
20void __init xen_init_IRQ(void);
21
22void xen_setup_timer(int cpu);
23void xen_setup_cpu_clockevents(void);
24unsigned long xen_cpu_khz(void);
25void __init xen_time_init(void);
26unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void);
29
30void xen_mark_init_mm_pinned(void);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33
34static inline unsigned xen_get_lazy_mode(void)
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46
47void xen_smp_send_stop(void);
48void xen_smp_send_reschedule(int cpu);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
55 void *info, int wait);
56
57
58/* Declare an asm function, along with symbols needed to make it
59 inlineable */
60#define DECL_ASM(ret, name, ...) \
61 ret name(__VA_ARGS__); \
62 extern char name##_end[]; \
63 extern char name##_reloc[] \
64
65DECL_ASM(void, xen_irq_enable_direct, void);
66DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69
70void xen_iret_direct(void);
71#endif /* XEN_OPS_H */